diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17869 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998424353196986, + "eval_steps": 500, + "global_step": 11898, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025210348848202185, + "grad_norm": 144.7147216796875, + "learning_rate": 4.201680672268907e-09, + "logits/chosen": -1.244873046875, + "logits/rejected": -1.282617211341858, + "logps/chosen": -287.54376220703125, + "logps/rejected": -257.1000061035156, + "loss": 0.7034, + "rewards/accuracies": 0.20937499403953552, + "rewards/chosen": -0.011601448059082031, + "rewards/margins": -0.019524574279785156, + "rewards/rejected": 0.007927512750029564, + "step": 10 + }, + { + "epoch": 0.005042069769640437, + "grad_norm": 90.3583984375, + "learning_rate": 8.403361344537815e-09, + "logits/chosen": NaN, + "logits/rejected": -1.2949707508087158, + "logps/chosen": -279.6000061035156, + "logps/rejected": -271.91876220703125, + "loss": 0.6987, + "rewards/accuracies": 0.3531250059604645, + "rewards/chosen": 0.0007650375482626259, + "rewards/margins": -0.00536766042932868, + "rewards/rejected": 0.0061325072310864925, + "step": 20 + }, + { + "epoch": 0.007563104654460656, + "grad_norm": 100.33983612060547, + "learning_rate": 1.2605042016806723e-08, + "logits/chosen": -1.305029273033142, + "logits/rejected": -1.313232421875, + "logps/chosen": -283.7906188964844, + "logps/rejected": -255.85000610351562, + "loss": 0.6953, + "rewards/accuracies": 0.32499998807907104, + "rewards/chosen": 0.003363323165103793, + "rewards/margins": -0.0035388946998864412, + "rewards/rejected": 0.006898784544318914, + "step": 30 + }, + { + "epoch": 0.010084139539280874, + "grad_norm": 97.06249237060547, + "learning_rate": 1.680672268907563e-08, + "logits/chosen": 
-1.3307616710662842, + "logits/rejected": -1.3382079601287842, + "logps/chosen": -290.76873779296875, + "logps/rejected": -244.93124389648438, + "loss": 0.6978, + "rewards/accuracies": 0.34062498807907104, + "rewards/chosen": 0.0071510314010083675, + "rewards/margins": 0.00169200892560184, + "rewards/rejected": 0.0054794312454760075, + "step": 40 + }, + { + "epoch": 0.012605174424101093, + "grad_norm": 90.40794372558594, + "learning_rate": 2.1008403361344538e-08, + "logits/chosen": -1.2814452648162842, + "logits/rejected": -1.3154296875, + "logps/chosen": -283.1499938964844, + "logps/rejected": -257.88592529296875, + "loss": 0.6925, + "rewards/accuracies": 0.33125001192092896, + "rewards/chosen": 0.0049995421431958675, + "rewards/margins": 0.009321212768554688, + "rewards/rejected": -0.004360771272331476, + "step": 50 + }, + { + "epoch": 0.015126209308921312, + "grad_norm": 118.56312561035156, + "learning_rate": 2.5210084033613446e-08, + "logits/chosen": -1.2708008289337158, + "logits/rejected": -1.3049805164337158, + "logps/chosen": -279.1875, + "logps/rejected": -260.3125, + "loss": 0.6926, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.009726142510771751, + "rewards/margins": 0.004282951354980469, + "rewards/rejected": 0.005441856570541859, + "step": 60 + }, + { + "epoch": 0.01764724419374153, + "grad_norm": 89.31433868408203, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -1.2632324695587158, + "logits/rejected": -1.30419921875, + "logps/chosen": -292.8812561035156, + "logps/rejected": -268.70001220703125, + "loss": 0.6935, + "rewards/accuracies": 0.28437501192092896, + "rewards/chosen": 0.012535477057099342, + "rewards/margins": 0.0027713775634765625, + "rewards/rejected": 0.009746169671416283, + "step": 70 + }, + { + "epoch": 0.020168279078561748, + "grad_norm": 107.96665954589844, + "learning_rate": 3.361344537815126e-08, + "logits/chosen": -1.313720703125, + "logits/rejected": -1.3276855945587158, + "logps/chosen": -295.0874938964844, 
+ "logps/rejected": -256.734375, + "loss": 0.6941, + "rewards/accuracies": 0.3187499940395355, + "rewards/chosen": 0.01777505874633789, + "rewards/margins": -0.0024406432639807463, + "rewards/rejected": 0.020194053649902344, + "step": 80 + }, + { + "epoch": 0.022689313963381967, + "grad_norm": 84.0765380859375, + "learning_rate": 3.7815126050420164e-08, + "logits/chosen": -1.294189453125, + "logits/rejected": -1.278295874595642, + "logps/chosen": -321.5874938964844, + "logps/rejected": -261.8656311035156, + "loss": 0.6892, + "rewards/accuracies": 0.328125, + "rewards/chosen": 0.03657836839556694, + "rewards/margins": 0.01087188720703125, + "rewards/rejected": 0.025716591626405716, + "step": 90 + }, + { + "epoch": 0.025210348848202186, + "grad_norm": 102.03905487060547, + "learning_rate": 4.2016806722689076e-08, + "logits/chosen": -1.2692382335662842, + "logits/rejected": -1.3091552257537842, + "logps/chosen": -291.4156188964844, + "logps/rejected": -268.7562561035156, + "loss": 0.6936, + "rewards/accuracies": 0.34687501192092896, + "rewards/chosen": 0.04565773159265518, + "rewards/margins": -0.0016721725696697831, + "rewards/rejected": 0.04732322692871094, + "step": 100 + }, + { + "epoch": 0.027731383733022405, + "grad_norm": 90.58088684082031, + "learning_rate": 4.621848739495798e-08, + "logits/chosen": -1.277929663658142, + "logits/rejected": -1.276123046875, + "logps/chosen": -299.125, + "logps/rejected": -264.0, + "loss": 0.6902, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": 0.0779266357421875, + "rewards/margins": 0.013805675320327282, + "rewards/rejected": 0.06411953270435333, + "step": 110 + }, + { + "epoch": 0.030252418617842624, + "grad_norm": 87.24566650390625, + "learning_rate": 5.042016806722689e-08, + "logits/chosen": -1.280419945716858, + "logits/rejected": -1.312744140625, + "logps/chosen": -298.75, + "logps/rejected": -271.76873779296875, + "loss": 0.6794, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": 
0.12237701565027237, + "rewards/margins": 0.030515288934111595, + "rewards/rejected": 0.09185028076171875, + "step": 120 + }, + { + "epoch": 0.03277345350266284, + "grad_norm": 75.01036071777344, + "learning_rate": 5.46218487394958e-08, + "logits/chosen": -1.2948729991912842, + "logits/rejected": -1.3290526866912842, + "logps/chosen": -276.0625, + "logps/rejected": -255.8562469482422, + "loss": 0.692, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": 0.15773162245750427, + "rewards/margins": 0.005900573916733265, + "rewards/rejected": 0.1518508940935135, + "step": 130 + }, + { + "epoch": 0.03529448838748306, + "grad_norm": 97.48009490966797, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -1.262451171875, + "logits/rejected": -1.261206030845642, + "logps/chosen": -281.99688720703125, + "logps/rejected": -259.36248779296875, + "loss": 0.6887, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": 0.21947021782398224, + "rewards/margins": 0.013390731997787952, + "rewards/rejected": 0.20619507133960724, + "step": 140 + }, + { + "epoch": 0.03781552327230328, + "grad_norm": 72.7547836303711, + "learning_rate": 6.302521008403361e-08, + "logits/chosen": -1.2507812976837158, + "logits/rejected": -1.2765624523162842, + "logps/chosen": -271.92498779296875, + "logps/rejected": -284.1656188964844, + "loss": 0.6835, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.2730773985385895, + "rewards/margins": 0.02921295166015625, + "rewards/rejected": 0.243865966796875, + "step": 150 + }, + { + "epoch": 0.040336558157123496, + "grad_norm": 83.03253173828125, + "learning_rate": 6.722689075630252e-08, + "logits/chosen": -1.237402319908142, + "logits/rejected": -1.281835913658142, + "logps/chosen": -276.99688720703125, + "logps/rejected": -250.66250610351562, + "loss": 0.6773, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": 0.30729371309280396, + "rewards/margins": 0.03227996826171875, + "rewards/rejected": 0.2750183045864105, + 
"step": 160 + }, + { + "epoch": 0.04285759304194372, + "grad_norm": 92.10990142822266, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": -1.273535132408142, + "logits/rejected": -1.3026854991912842, + "logps/chosen": -293.109375, + "logps/rejected": -249.16250610351562, + "loss": 0.6798, + "rewards/accuracies": 0.4468750059604645, + "rewards/chosen": 0.37623292207717896, + "rewards/margins": 0.03588562086224556, + "rewards/rejected": 0.3403076231479645, + "step": 170 + }, + { + "epoch": 0.045378627926763934, + "grad_norm": 99.00688934326172, + "learning_rate": 7.563025210084033e-08, + "logits/chosen": -1.336669921875, + "logits/rejected": -1.320703148841858, + "logps/chosen": -293.23126220703125, + "logps/rejected": -271.29376220703125, + "loss": 0.6813, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": 0.43438720703125, + "rewards/margins": 0.04239501804113388, + "rewards/rejected": 0.391998291015625, + "step": 180 + }, + { + "epoch": 0.04789966281158416, + "grad_norm": 78.80610656738281, + "learning_rate": 7.983193277310923e-08, + "logits/chosen": -1.259521484375, + "logits/rejected": -1.280517578125, + "logps/chosen": -286.9375, + "logps/rejected": -270.3999938964844, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5096435546875, + "rewards/margins": 0.06426696479320526, + "rewards/rejected": 0.4456543028354645, + "step": 190 + }, + { + "epoch": 0.05042069769640437, + "grad_norm": 79.90438842773438, + "learning_rate": 8.403361344537815e-08, + "logits/chosen": -1.298828125, + "logits/rejected": -1.296905517578125, + "logps/chosen": -262.9437561035156, + "logps/rejected": -240.984375, + "loss": 0.6611, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.5528808832168579, + "rewards/margins": 0.07709045708179474, + "rewards/rejected": 0.4760375916957855, + "step": 200 + }, + { + "epoch": 0.052941732581224595, + "grad_norm": 80.46623992919922, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": 
-1.301489233970642, + "logits/rejected": -1.3430664539337158, + "logps/chosen": -271.40625, + "logps/rejected": -233.71249389648438, + "loss": 0.6695, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.56744384765625, + "rewards/margins": 0.06519164890050888, + "rewards/rejected": 0.502685546875, + "step": 210 + }, + { + "epoch": 0.05546276746604481, + "grad_norm": 93.72669982910156, + "learning_rate": 9.243697478991596e-08, + "logits/chosen": -1.339013695716858, + "logits/rejected": -1.3197753429412842, + "logps/chosen": -284.09375, + "logps/rejected": -246.71875, + "loss": 0.6632, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.5970458984375, + "rewards/margins": 0.07635955512523651, + "rewards/rejected": 0.520922839641571, + "step": 220 + }, + { + "epoch": 0.05798380235086503, + "grad_norm": 84.46882629394531, + "learning_rate": 9.663865546218488e-08, + "logits/chosen": -1.284082055091858, + "logits/rejected": -1.331884741783142, + "logps/chosen": -286.3062438964844, + "logps/rejected": -239.81875610351562, + "loss": 0.675, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6225341558456421, + "rewards/margins": 0.055267333984375, + "rewards/rejected": 0.5671020746231079, + "step": 230 + }, + { + "epoch": 0.06050483723568525, + "grad_norm": 86.55229949951172, + "learning_rate": 1.0084033613445378e-07, + "logits/chosen": -1.298730492591858, + "logits/rejected": -1.3097655773162842, + "logps/chosen": -268.171875, + "logps/rejected": -257.51251220703125, + "loss": 0.6622, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.620678722858429, + "rewards/margins": 0.08840332180261612, + "rewards/rejected": 0.5323852300643921, + "step": 240 + }, + { + "epoch": 0.06302587212050546, + "grad_norm": 83.27806854248047, + "learning_rate": 1.0504201680672269e-07, + "logits/chosen": -1.290429711341858, + "logits/rejected": -1.319921851158142, + "logps/chosen": -277.7250061035156, + "logps/rejected": -250.27188110351562, + "loss": 
0.6593, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": 0.630175769329071, + "rewards/margins": 0.09326171875, + "rewards/rejected": 0.53704833984375, + "step": 250 + }, + { + "epoch": 0.06554690700532569, + "grad_norm": 79.56951141357422, + "learning_rate": 1.092436974789916e-07, + "logits/chosen": -1.2774658203125, + "logits/rejected": -1.353417992591858, + "logps/chosen": -281.546875, + "logps/rejected": -254.2062530517578, + "loss": 0.6541, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": 0.6353759765625, + "rewards/margins": 0.10556793212890625, + "rewards/rejected": 0.5298095941543579, + "step": 260 + }, + { + "epoch": 0.06806794189014591, + "grad_norm": 76.68507385253906, + "learning_rate": 1.134453781512605e-07, + "logits/chosen": -1.2549316883087158, + "logits/rejected": -1.302832007408142, + "logps/chosen": -248.0625, + "logps/rejected": -250.64999389648438, + "loss": 0.6652, + "rewards/accuracies": 0.546875, + "rewards/chosen": 0.633593738079071, + "rewards/margins": 0.08696289360523224, + "rewards/rejected": 0.546875, + "step": 270 + }, + { + "epoch": 0.07058897677496612, + "grad_norm": 75.71800994873047, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -1.3131835460662842, + "logits/rejected": -1.277856469154358, + "logps/chosen": -288.16876220703125, + "logps/rejected": -275.109375, + "loss": 0.6514, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.575854480266571, + "rewards/margins": 0.11370544135570526, + "rewards/rejected": 0.4620361328125, + "step": 280 + }, + { + "epoch": 0.07311001165978634, + "grad_norm": 83.98974609375, + "learning_rate": 1.2184873949579832e-07, + "logits/chosen": -1.3290283679962158, + "logits/rejected": -1.357177734375, + "logps/chosen": -315.25, + "logps/rejected": -264.66876220703125, + "loss": 0.6568, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.5980224609375, + "rewards/margins": 0.11147765815258026, + "rewards/rejected": 0.4864868223667145, 
+ "step": 290 + }, + { + "epoch": 0.07563104654460656, + "grad_norm": 78.99283599853516, + "learning_rate": 1.2605042016806723e-07, + "logits/chosen": -1.321191430091858, + "logits/rejected": -1.306396484375, + "logps/chosen": -313.17498779296875, + "logps/rejected": -297.0874938964844, + "loss": 0.6544, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.594555675983429, + "rewards/margins": 0.121393583714962, + "rewards/rejected": 0.47313231229782104, + "step": 300 + }, + { + "epoch": 0.07815208142942678, + "grad_norm": 76.23942565917969, + "learning_rate": 1.3025210084033613e-07, + "logits/chosen": -1.273828148841858, + "logits/rejected": -1.316625952720642, + "logps/chosen": -291.34375, + "logps/rejected": -279.51251220703125, + "loss": 0.6524, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": 0.513500988483429, + "rewards/margins": 0.13037414848804474, + "rewards/rejected": 0.3830398619174957, + "step": 310 + }, + { + "epoch": 0.08067311631424699, + "grad_norm": 85.54627990722656, + "learning_rate": 1.3445378151260504e-07, + "logits/chosen": NaN, + "logits/rejected": -1.305908203125, + "logps/chosen": -273.25, + "logps/rejected": -257.0062561035156, + "loss": 0.6564, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.4950103759765625, + "rewards/margins": 0.10860347747802734, + "rewards/rejected": 0.3865097165107727, + "step": 320 + }, + { + "epoch": 0.08319415119906722, + "grad_norm": 79.61593627929688, + "learning_rate": 1.3865546218487394e-07, + "logits/chosen": -1.2723388671875, + "logits/rejected": -1.293066382408142, + "logps/chosen": -268.15313720703125, + "logps/rejected": -250.85000610351562, + "loss": 0.641, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.4624877870082855, + "rewards/margins": 0.1453094482421875, + "rewards/rejected": 0.3171554505825043, + "step": 330 + }, + { + "epoch": 0.08571518608388744, + "grad_norm": 87.76576232910156, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": 
-1.2678954601287842, + "logits/rejected": -1.346899390220642, + "logps/chosen": -310.12969970703125, + "logps/rejected": -265.8218688964844, + "loss": 0.6213, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": 0.42416077852249146, + "rewards/margins": 0.19175872206687927, + "rewards/rejected": 0.2326004058122635, + "step": 340 + }, + { + "epoch": 0.08823622096870766, + "grad_norm": 94.80514526367188, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -1.2676270008087158, + "logits/rejected": -1.311425805091858, + "logps/chosen": -270.75, + "logps/rejected": -248.9375, + "loss": 0.6295, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.36808472871780396, + "rewards/margins": 0.174671933054924, + "rewards/rejected": 0.19347229599952698, + "step": 350 + }, + { + "epoch": 0.09075725585352787, + "grad_norm": 77.4742431640625, + "learning_rate": 1.5126050420168066e-07, + "logits/chosen": -1.245568871498108, + "logits/rejected": -1.2759521007537842, + "logps/chosen": -265.8125, + "logps/rejected": -257.7562561035156, + "loss": 0.6333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.266213983297348, + "rewards/margins": 0.18226012587547302, + "rewards/rejected": 0.08396835625171661, + "step": 360 + }, + { + "epoch": 0.09327829073834809, + "grad_norm": 73.68795013427734, + "learning_rate": 1.554621848739496e-07, + "logits/chosen": -1.2880859375, + "logits/rejected": -1.327734351158142, + "logps/chosen": -273.13751220703125, + "logps/rejected": -243.6125030517578, + "loss": 0.6406, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2410484254360199, + "rewards/margins": 0.16602936387062073, + "rewards/rejected": 0.07509537041187286, + "step": 370 + }, + { + "epoch": 0.09579932562316831, + "grad_norm": 85.77863311767578, + "learning_rate": 1.5966386554621847e-07, + "logits/chosen": NaN, + "logits/rejected": -1.3193359375, + "logps/chosen": -289.1656188964844, + "logps/rejected": -253.24374389648438, + "loss": 0.6078, 
+ "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.19881896674633026, + "rewards/margins": 0.25654298067092896, + "rewards/rejected": -0.057816315442323685, + "step": 380 + }, + { + "epoch": 0.09832036050798854, + "grad_norm": 69.80066680908203, + "learning_rate": 1.638655462184874e-07, + "logits/chosen": -1.274169921875, + "logits/rejected": -1.2903320789337158, + "logps/chosen": -294.56561279296875, + "logps/rejected": -272.609375, + "loss": 0.6149, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.19277039170265198, + "rewards/margins": 0.2486732453107834, + "rewards/rejected": -0.055816650390625, + "step": 390 + }, + { + "epoch": 0.10084139539280874, + "grad_norm": 73.84134674072266, + "learning_rate": 1.680672268907563e-07, + "logits/chosen": -1.2617676258087158, + "logits/rejected": -1.277001976966858, + "logps/chosen": -289.4125061035156, + "logps/rejected": -285.171875, + "loss": 0.6195, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.2807784974575043, + "rewards/margins": 0.23712310194969177, + "rewards/rejected": 0.04382782056927681, + "step": 400 + }, + { + "epoch": 0.10336243027762897, + "grad_norm": 90.36155700683594, + "learning_rate": 1.722689075630252e-07, + "logits/chosen": -1.302832007408142, + "logits/rejected": -1.361230492591858, + "logps/chosen": -284.65625, + "logps/rejected": -287.73748779296875, + "loss": 0.6147, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": 0.2798774838447571, + "rewards/margins": 0.2889160215854645, + "rewards/rejected": -0.009383773431181908, + "step": 410 + }, + { + "epoch": 0.10588346516244919, + "grad_norm": 116.59656524658203, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -1.2851073741912842, + "logits/rejected": -1.30224609375, + "logps/chosen": -279.1875, + "logps/rejected": -269.29998779296875, + "loss": 0.5927, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.10270003974437714, + "rewards/margins": 0.3286422789096832, + 
"rewards/rejected": -0.22588805854320526, + "step": 420 + }, + { + "epoch": 0.1084045000472694, + "grad_norm": 74.44758605957031, + "learning_rate": 1.8067226890756302e-07, + "logits/chosen": -1.3424072265625, + "logits/rejected": -1.3671875, + "logps/chosen": -297.32501220703125, + "logps/rejected": -257.3062438964844, + "loss": 0.5745, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09719619899988174, + "rewards/margins": 0.38787537813186646, + "rewards/rejected": -0.290719598531723, + "step": 430 + }, + { + "epoch": 0.11092553493208962, + "grad_norm": 73.85236358642578, + "learning_rate": 1.8487394957983192e-07, + "logits/chosen": -1.315185546875, + "logits/rejected": -1.316796898841858, + "logps/chosen": -278.0375061035156, + "logps/rejected": -264.6937561035156, + "loss": 0.6022, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": 0.14359435439109802, + "rewards/margins": 0.3448944091796875, + "rewards/rejected": -0.20127257704734802, + "step": 440 + }, + { + "epoch": 0.11344656981690984, + "grad_norm": 84.43276977539062, + "learning_rate": 1.8907563025210083e-07, + "logits/chosen": -1.2849853038787842, + "logits/rejected": -1.307861328125, + "logps/chosen": -276.375, + "logps/rejected": -273.6312561035156, + "loss": 0.5859, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.3189895749092102, + "rewards/margins": 0.39819639921188354, + "rewards/rejected": -0.079370878636837, + "step": 450 + }, + { + "epoch": 0.11596760470173006, + "grad_norm": 90.26596069335938, + "learning_rate": 1.9327731092436976e-07, + "logits/chosen": -1.2935791015625, + "logits/rejected": -1.325781226158142, + "logps/chosen": -317.5679626464844, + "logps/rejected": -264.5874938964844, + "loss": 0.6054, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.3327392637729645, + "rewards/margins": 0.360220342874527, + "rewards/rejected": -0.02756500244140625, + "step": 460 + }, + { + "epoch": 0.11848863958655027, + "grad_norm": 73.2596435546875, + "learning_rate": 
1.9747899159663864e-07, + "logits/chosen": -1.2628905773162842, + "logits/rejected": -1.300537109375, + "logps/chosen": -299.64373779296875, + "logps/rejected": -285.57501220703125, + "loss": 0.5807, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.02835845947265625, + "rewards/margins": 0.433358758687973, + "rewards/rejected": -0.4616455137729645, + "step": 470 + }, + { + "epoch": 0.1210096744713705, + "grad_norm": 84.38887023925781, + "learning_rate": 2.0168067226890757e-07, + "logits/chosen": -1.215356469154358, + "logits/rejected": -1.287939429283142, + "logps/chosen": -274.875, + "logps/rejected": -266.01251220703125, + "loss": 0.5676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.106053926050663, + "rewards/margins": 0.49801331758499146, + "rewards/rejected": -0.6037139892578125, + "step": 480 + }, + { + "epoch": 0.12353070935619072, + "grad_norm": 89.26333618164062, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -1.2712891101837158, + "logits/rejected": -1.285058617591858, + "logps/chosen": -298.1312561035156, + "logps/rejected": -268.21875, + "loss": 0.5684, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.3065994381904602, + "rewards/margins": 0.5032730102539062, + "rewards/rejected": -0.19690552353858948, + "step": 490 + }, + { + "epoch": 0.12605174424101093, + "grad_norm": 86.56298065185547, + "learning_rate": 2.1008403361344538e-07, + "logits/chosen": -1.278344750404358, + "logits/rejected": -1.300927758216858, + "logps/chosen": -279.90625, + "logps/rejected": -256.640625, + "loss": 0.5294, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": 0.02459106408059597, + "rewards/margins": 0.6083038449287415, + "rewards/rejected": -0.583740234375, + "step": 500 + }, + { + "epoch": 0.12857277912583115, + "grad_norm": 78.020263671875, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -1.270605444908142, + "logits/rejected": -1.335058569908142, + "logps/chosen": 
-265.26873779296875, + "logps/rejected": -261.3812561035156, + "loss": 0.5688, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5009399652481079, + "rewards/margins": 0.5434356927871704, + "rewards/rejected": -1.0441162586212158, + "step": 510 + }, + { + "epoch": 0.13109381401065137, + "grad_norm": 68.09835052490234, + "learning_rate": 2.184873949579832e-07, + "logits/chosen": -1.298486351966858, + "logits/rejected": -1.310644507408142, + "logps/chosen": -286.26873779296875, + "logps/rejected": -265.04376220703125, + "loss": 0.5984, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.0276031494140625, + "rewards/margins": 0.4960693418979645, + "rewards/rejected": -0.5234416723251343, + "step": 520 + }, + { + "epoch": 0.1336148488954716, + "grad_norm": 94.15994262695312, + "learning_rate": 2.226890756302521e-07, + "logits/chosen": -1.2277343273162842, + "logits/rejected": -1.3334472179412842, + "logps/chosen": -266.94061279296875, + "logps/rejected": -250.85000610351562, + "loss": 0.5546, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.03800354152917862, + "rewards/margins": 0.599743664264679, + "rewards/rejected": -0.561663806438446, + "step": 530 + }, + { + "epoch": 0.13613588378029182, + "grad_norm": 89.99702453613281, + "learning_rate": 2.26890756302521e-07, + "logits/chosen": -1.329443335533142, + "logits/rejected": -1.3420898914337158, + "logps/chosen": -295.42498779296875, + "logps/rejected": -259.51251220703125, + "loss": 0.5859, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07108917087316513, + "rewards/margins": 0.5303710699081421, + "rewards/rejected": -0.6014175415039062, + "step": 540 + }, + { + "epoch": 0.13865691866511204, + "grad_norm": 105.61231994628906, + "learning_rate": 2.3109243697478993e-07, + "logits/chosen": -1.3101074695587158, + "logits/rejected": -1.319091796875, + "logps/chosen": -294.79998779296875, + "logps/rejected": -272.45623779296875, + "loss": 0.5617, + 
"rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.03331298753619194, + "rewards/margins": 0.595355212688446, + "rewards/rejected": -0.6286590695381165, + "step": 550 + }, + { + "epoch": 0.14117795354993223, + "grad_norm": 72.71495819091797, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -1.247802734375, + "logits/rejected": -1.289453148841858, + "logps/chosen": -254.91250610351562, + "logps/rejected": -248.2375030517578, + "loss": 0.5539, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": 0.1997833251953125, + "rewards/margins": 0.6131622195243835, + "rewards/rejected": -0.4132888913154602, + "step": 560 + }, + { + "epoch": 0.14369898843475246, + "grad_norm": 83.22612762451172, + "learning_rate": 2.394957983193277e-07, + "logits/chosen": -1.2476074695587158, + "logits/rejected": -1.243749976158142, + "logps/chosen": -267.5249938964844, + "logps/rejected": -274.3374938964844, + "loss": 0.5608, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.33015137910842896, + "rewards/margins": 0.5673843622207642, + "rewards/rejected": -0.8973663449287415, + "step": 570 + }, + { + "epoch": 0.14622002331957268, + "grad_norm": 68.80695343017578, + "learning_rate": 2.4369747899159664e-07, + "logits/chosen": -1.2406494617462158, + "logits/rejected": -1.292321801185608, + "logps/chosen": -266.38751220703125, + "logps/rejected": -275.38751220703125, + "loss": 0.5623, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2216869294643402, + "rewards/margins": 0.6139160394668579, + "rewards/rejected": -0.835650622844696, + "step": 580 + }, + { + "epoch": 0.1487410582043929, + "grad_norm": 72.53914642333984, + "learning_rate": 2.478991596638655e-07, + "logits/chosen": -1.29248046875, + "logits/rejected": -1.28662109375, + "logps/chosen": -295.5249938964844, + "logps/rejected": -256.65313720703125, + "loss": 0.5886, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.18014831840991974, + 
"rewards/margins": 0.593457043170929, + "rewards/rejected": -0.773388683795929, + "step": 590 + }, + { + "epoch": 0.15126209308921312, + "grad_norm": 90.98090362548828, + "learning_rate": 2.5210084033613445e-07, + "logits/chosen": -1.281396508216858, + "logits/rejected": -1.3123047351837158, + "logps/chosen": -299.625, + "logps/rejected": -278.0093688964844, + "loss": 0.5836, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.263723760843277, + "rewards/margins": 0.5306030511856079, + "rewards/rejected": -0.7945464849472046, + "step": 600 + }, + { + "epoch": 0.15378312797403335, + "grad_norm": 72.77758026123047, + "learning_rate": 2.5630252100840333e-07, + "logits/chosen": -1.2614014148712158, + "logits/rejected": -1.3181641101837158, + "logps/chosen": -275.26251220703125, + "logps/rejected": -263.3031311035156, + "loss": 0.5572, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.4960670471191406, + "rewards/margins": 0.58111572265625, + "rewards/rejected": -1.077172875404358, + "step": 610 + }, + { + "epoch": 0.15630416285885357, + "grad_norm": 106.92414855957031, + "learning_rate": 2.6050420168067226e-07, + "logits/chosen": -1.290771484375, + "logits/rejected": -1.342529296875, + "logps/chosen": -283.70001220703125, + "logps/rejected": -257.7593688964844, + "loss": 0.5548, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.2074592560529709, + "rewards/margins": 0.618426501750946, + "rewards/rejected": -0.8258941769599915, + "step": 620 + }, + { + "epoch": 0.1588251977436738, + "grad_norm": 79.6308364868164, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -1.259765625, + "logits/rejected": NaN, + "logps/chosen": -298.61248779296875, + "logps/rejected": -279.70623779296875, + "loss": 0.5645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.35757750272750854, + "rewards/margins": 0.611968994140625, + "rewards/rejected": -0.9693847894668579, + "step": 630 + }, + { + "epoch": 0.16134623262849399, + 
"grad_norm": 101.44197082519531, + "learning_rate": 2.689075630252101e-07, + "logits/chosen": -1.28271484375, + "logits/rejected": -1.2960937023162842, + "logps/chosen": -279.96875, + "logps/rejected": -270.5093688964844, + "loss": 0.5506, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.2633987367153168, + "rewards/margins": 0.649487316608429, + "rewards/rejected": -0.9127746820449829, + "step": 640 + }, + { + "epoch": 0.1638672675133142, + "grad_norm": 76.77125549316406, + "learning_rate": 2.7310924369747895e-07, + "logits/chosen": -1.3201172351837158, + "logits/rejected": -1.388916015625, + "logps/chosen": -290.2124938964844, + "logps/rejected": -272.3843688964844, + "loss": 0.5504, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12568894028663635, + "rewards/margins": 0.6548095941543579, + "rewards/rejected": -0.5296264886856079, + "step": 650 + }, + { + "epoch": 0.16638830239813443, + "grad_norm": 97.28453826904297, + "learning_rate": 2.773109243697479e-07, + "logits/chosen": -1.2907226085662842, + "logits/rejected": -1.312597632408142, + "logps/chosen": -280.70623779296875, + "logps/rejected": -284.04998779296875, + "loss": 0.5809, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07011871039867401, + "rewards/margins": 0.598468005657196, + "rewards/rejected": -0.5285194516181946, + "step": 660 + }, + { + "epoch": 0.16890933728295465, + "grad_norm": 91.88009643554688, + "learning_rate": 2.815126050420168e-07, + "logits/chosen": -1.2091796398162842, + "logits/rejected": NaN, + "logps/chosen": -298.46875, + "logps/rejected": -269.84375, + "loss": 0.5684, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.16863402724266052, + "rewards/margins": 0.6591461300849915, + "rewards/rejected": -0.8283462524414062, + "step": 670 + }, + { + "epoch": 0.17143037216777487, + "grad_norm": 74.19715118408203, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": -1.261328101158142, + "logits/rejected": -1.2793457508087158, + 
"logps/chosen": -289.64373779296875, + "logps/rejected": -281.83123779296875, + "loss": 0.5778, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.475006103515625, + "rewards/margins": 0.564221203327179, + "rewards/rejected": -1.0394470691680908, + "step": 680 + }, + { + "epoch": 0.1739514070525951, + "grad_norm": 75.1677017211914, + "learning_rate": 2.899159663865546e-07, + "logits/chosen": -1.225683569908142, + "logits/rejected": -1.315527319908142, + "logps/chosen": -306.6343688964844, + "logps/rejected": -273.3687438964844, + "loss": 0.5072, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.19892120361328125, + "rewards/margins": 0.7209564447402954, + "rewards/rejected": -0.919659435749054, + "step": 690 + }, + { + "epoch": 0.17647244193741532, + "grad_norm": 97.35240936279297, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -1.290869116783142, + "logits/rejected": -1.293237328529358, + "logps/chosen": -264.83123779296875, + "logps/rejected": -257.33123779296875, + "loss": 0.6068, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08776245266199112, + "rewards/margins": 0.542065441608429, + "rewards/rejected": -0.6296356320381165, + "step": 700 + }, + { + "epoch": 0.17899347682223551, + "grad_norm": 73.25398254394531, + "learning_rate": 2.9831932773109244e-07, + "logits/chosen": -1.2614257335662842, + "logits/rejected": -1.300537109375, + "logps/chosen": -289.8531188964844, + "logps/rejected": -278.890625, + "loss": 0.5682, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.28628236055374146, + "rewards/margins": 0.6059295535087585, + "rewards/rejected": -0.8918091058731079, + "step": 710 + }, + { + "epoch": 0.18151451170705574, + "grad_norm": 82.12859344482422, + "learning_rate": 3.025210084033613e-07, + "logits/chosen": -1.349365234375, + "logits/rejected": -1.3427734375, + "logps/chosen": -290.1937561035156, + "logps/rejected": -276.265625, + "loss": 0.5392, + "rewards/accuracies": 
0.731249988079071, + "rewards/chosen": 0.135517880320549, + "rewards/margins": 0.6761139035224915, + "rewards/rejected": -0.5405243039131165, + "step": 720 + }, + { + "epoch": 0.18403554659187596, + "grad_norm": 65.37681579589844, + "learning_rate": 3.0672268907563024e-07, + "logits/chosen": -1.2498047351837158, + "logits/rejected": -1.31591796875, + "logps/chosen": -278.0843811035156, + "logps/rejected": -281.8125, + "loss": 0.5853, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.43583983182907104, + "rewards/margins": 0.5022827386856079, + "rewards/rejected": -0.06654968112707138, + "step": 730 + }, + { + "epoch": 0.18655658147669618, + "grad_norm": 101.68387603759766, + "learning_rate": 3.109243697478992e-07, + "logits/chosen": -1.2600586414337158, + "logits/rejected": -1.3029296398162842, + "logps/chosen": -294.7250061035156, + "logps/rejected": -284.4437561035156, + "loss": 0.5604, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02090301550924778, + "rewards/margins": 0.6147094964981079, + "rewards/rejected": -0.5933437347412109, + "step": 740 + }, + { + "epoch": 0.1890776163615164, + "grad_norm": 67.09628295898438, + "learning_rate": 3.1512605042016805e-07, + "logits/chosen": -1.283447265625, + "logits/rejected": -1.2777099609375, + "logps/chosen": -276.29376220703125, + "logps/rejected": -269.3812561035156, + "loss": 0.5529, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.22530308365821838, + "rewards/margins": 0.774169921875, + "rewards/rejected": -0.9990142583847046, + "step": 750 + }, + { + "epoch": 0.19159865124633663, + "grad_norm": 80.1001205444336, + "learning_rate": 3.1932773109243693e-07, + "logits/chosen": -1.291162133216858, + "logits/rejected": -1.2841796875, + "logps/chosen": -289.6000061035156, + "logps/rejected": -261.45623779296875, + "loss": 0.5307, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.24888916313648224, + "rewards/margins": 0.6986938714981079, + 
"rewards/rejected": -0.44995421171188354, + "step": 760 + }, + { + "epoch": 0.19411968613115685, + "grad_norm": 56.933860778808594, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -1.3205077648162842, + "logits/rejected": -1.314550757408142, + "logps/chosen": -293.04998779296875, + "logps/rejected": -254.9718780517578, + "loss": 0.5417, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2917343080043793, + "rewards/margins": 0.7984863519668579, + "rewards/rejected": -1.0905120372772217, + "step": 770 + }, + { + "epoch": 0.19664072101597707, + "grad_norm": 95.23239135742188, + "learning_rate": 3.277310924369748e-07, + "logits/chosen": -1.2874023914337158, + "logits/rejected": -1.283959984779358, + "logps/chosen": -288.79376220703125, + "logps/rejected": -278.95623779296875, + "loss": 0.6097, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.217681884765625, + "rewards/margins": 0.5911788940429688, + "rewards/rejected": -0.8089507818222046, + "step": 780 + }, + { + "epoch": 0.19916175590079727, + "grad_norm": 92.63106536865234, + "learning_rate": 3.319327731092437e-07, + "logits/chosen": -1.2746093273162842, + "logits/rejected": -1.323339819908142, + "logps/chosen": -303.8687438964844, + "logps/rejected": -289.54376220703125, + "loss": 0.5632, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18847045302391052, + "rewards/margins": 0.6622680425643921, + "rewards/rejected": -0.8506530523300171, + "step": 790 + }, + { + "epoch": 0.2016827907856175, + "grad_norm": 80.16415405273438, + "learning_rate": 3.361344537815126e-07, + "logits/chosen": -1.289941430091858, + "logits/rejected": -1.2956054210662842, + "logps/chosen": -277.3187561035156, + "logps/rejected": -269.70001220703125, + "loss": 0.5811, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3729003965854645, + "rewards/margins": 0.63128662109375, + "rewards/rejected": -1.0042724609375, + "step": 800 + }, + { + "epoch": 0.2042038256704377, + 
"grad_norm": 61.04458999633789, + "learning_rate": 3.403361344537815e-07, + "logits/chosen": -1.283203125, + "logits/rejected": -1.317968726158142, + "logps/chosen": -294.71875, + "logps/rejected": -259.11248779296875, + "loss": 0.5079, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.07389144599437714, + "rewards/margins": 0.750195324420929, + "rewards/rejected": -0.824145495891571, + "step": 810 + }, + { + "epoch": 0.20672486055525793, + "grad_norm": 73.20579528808594, + "learning_rate": 3.445378151260504e-07, + "logits/chosen": -1.2428710460662842, + "logits/rejected": -1.3032715320587158, + "logps/chosen": -293.8374938964844, + "logps/rejected": -287.2562561035156, + "loss": 0.5371, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.15455932915210724, + "rewards/margins": 0.719097912311554, + "rewards/rejected": -0.5641471743583679, + "step": 820 + }, + { + "epoch": 0.20924589544007816, + "grad_norm": 119.09114074707031, + "learning_rate": 3.487394957983193e-07, + "logits/chosen": -1.2250244617462158, + "logits/rejected": -1.2574951648712158, + "logps/chosen": -293.98126220703125, + "logps/rejected": -274.2875061035156, + "loss": 0.5384, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.280783087015152, + "rewards/margins": 0.8061981201171875, + "rewards/rejected": -1.0869262218475342, + "step": 830 + }, + { + "epoch": 0.21176693032489838, + "grad_norm": 78.48435974121094, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -1.2794921398162842, + "logits/rejected": -1.2690918445587158, + "logps/chosen": -274.99688720703125, + "logps/rejected": -266.2250061035156, + "loss": 0.5478, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.04948578029870987, + "rewards/margins": 0.789715588092804, + "rewards/rejected": -0.8386299014091492, + "step": 840 + }, + { + "epoch": 0.2142879652097186, + "grad_norm": 80.56698608398438, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -1.275170922279358, + 
"logits/rejected": -1.351904273033142, + "logps/chosen": -287.8187561035156, + "logps/rejected": -246.4187469482422, + "loss": 0.5376, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": 0.43180543184280396, + "rewards/margins": 0.683911144733429, + "rewards/rejected": -0.2513900697231293, + "step": 850 + }, + { + "epoch": 0.2168090000945388, + "grad_norm": 83.84162139892578, + "learning_rate": 3.6134453781512604e-07, + "logits/chosen": -1.247167944908142, + "logits/rejected": -1.2339355945587158, + "logps/chosen": -279.45001220703125, + "logps/rejected": -255.4562530517578, + "loss": 0.5869, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.07878265529870987, + "rewards/margins": 0.677410900592804, + "rewards/rejected": -0.5982635617256165, + "step": 860 + }, + { + "epoch": 0.21933003497935902, + "grad_norm": 71.6579818725586, + "learning_rate": 3.655462184873949e-07, + "logits/chosen": -1.259765625, + "logits/rejected": -1.245581030845642, + "logps/chosen": -287.58123779296875, + "logps/rejected": -255.28750610351562, + "loss": 0.5896, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": 0.03903808444738388, + "rewards/margins": 0.5729767084121704, + "rewards/rejected": -0.5338500738143921, + "step": 870 + }, + { + "epoch": 0.22185106986417924, + "grad_norm": 75.61174774169922, + "learning_rate": 3.6974789915966385e-07, + "logits/chosen": -1.290307641029358, + "logits/rejected": -1.34326171875, + "logps/chosen": -319.64373779296875, + "logps/rejected": -292.51251220703125, + "loss": 0.5762, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.2825942933559418, + "rewards/margins": 0.6470428705215454, + "rewards/rejected": -0.9297851324081421, + "step": 880 + }, + { + "epoch": 0.22437210474899946, + "grad_norm": 74.02259826660156, + "learning_rate": 3.739495798319328e-07, + "logits/chosen": -1.2446715831756592, + "logits/rejected": -1.341577172279358, + "logps/chosen": -306.1875, + "logps/rejected": 
-257.70623779296875, + "loss": 0.5545, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5517059564590454, + "rewards/margins": 0.668322741985321, + "rewards/rejected": -1.219885230064392, + "step": 890 + }, + { + "epoch": 0.22689313963381968, + "grad_norm": 79.9630126953125, + "learning_rate": 3.7815126050420166e-07, + "logits/chosen": -1.255957007408142, + "logits/rejected": -1.2941405773162842, + "logps/chosen": -301.47186279296875, + "logps/rejected": -265.76251220703125, + "loss": 0.5639, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -1.028466820716858, + "rewards/margins": 0.7416824102401733, + "rewards/rejected": -1.7698242664337158, + "step": 900 + }, + { + "epoch": 0.2294141745186399, + "grad_norm": 77.22490692138672, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -1.249658226966858, + "logits/rejected": -1.2742187976837158, + "logps/chosen": -279.21875, + "logps/rejected": -259.8125, + "loss": 0.585, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -1.0482285022735596, + "rewards/margins": 0.6260131597518921, + "rewards/rejected": -1.674279808998108, + "step": 910 + }, + { + "epoch": 0.23193520940346013, + "grad_norm": 68.81954956054688, + "learning_rate": 3.865546218487395e-07, + "logits/chosen": -1.274072289466858, + "logits/rejected": -1.309667944908142, + "logps/chosen": -276.625, + "logps/rejected": -259.3656311035156, + "loss": 0.5303, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.28308868408203125, + "rewards/margins": 0.7613174319267273, + "rewards/rejected": -1.044244408607483, + "step": 920 + }, + { + "epoch": 0.23445624428828035, + "grad_norm": 68.08296203613281, + "learning_rate": 3.907563025210084e-07, + "logits/chosen": -1.2756836414337158, + "logits/rejected": -1.3571288585662842, + "logps/chosen": -301.5, + "logps/rejected": -261.21875, + "loss": 0.5026, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": 0.04568786546587944, + "rewards/margins": 
0.817272961139679, + "rewards/rejected": -0.7716888189315796, + "step": 930 + }, + { + "epoch": 0.23697727917310055, + "grad_norm": 102.95132446289062, + "learning_rate": 3.949579831932773e-07, + "logits/chosen": -1.247900366783142, + "logits/rejected": -1.289941430091858, + "logps/chosen": -299.3374938964844, + "logps/rejected": -267.14373779296875, + "loss": 0.5384, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.35980987548828125, + "rewards/margins": 0.7607055902481079, + "rewards/rejected": -1.120202660560608, + "step": 940 + }, + { + "epoch": 0.23949831405792077, + "grad_norm": 96.6849365234375, + "learning_rate": 3.991596638655462e-07, + "logits/chosen": -1.2032959461212158, + "logits/rejected": -1.236962914466858, + "logps/chosen": -278.64373779296875, + "logps/rejected": -262.89373779296875, + "loss": 0.5637, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.323556512594223, + "rewards/margins": 0.7710052728652954, + "rewards/rejected": -1.095068335533142, + "step": 950 + }, + { + "epoch": 0.242019348942741, + "grad_norm": 61.88032150268555, + "learning_rate": 4.0336134453781514e-07, + "logits/chosen": -1.254980444908142, + "logits/rejected": -1.2530028820037842, + "logps/chosen": -294.7437438964844, + "logps/rejected": -257.9437561035156, + "loss": 0.5456, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.07322387397289276, + "rewards/margins": 0.7105438113212585, + "rewards/rejected": -0.7835647463798523, + "step": 960 + }, + { + "epoch": 0.24454038382756121, + "grad_norm": 76.08362579345703, + "learning_rate": 4.07563025210084e-07, + "logits/chosen": -1.26416015625, + "logits/rejected": -1.3228271007537842, + "logps/chosen": -286.6273498535156, + "logps/rejected": -273.36248779296875, + "loss": 0.5411, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": 0.09781646728515625, + "rewards/margins": 0.781726062297821, + "rewards/rejected": -0.6835899353027344, + "step": 970 + }, + { + "epoch": 
0.24706141871238144, + "grad_norm": 79.66098022460938, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -1.285864233970642, + "logits/rejected": -1.3110840320587158, + "logps/chosen": -290.21875, + "logps/rejected": -267.29376220703125, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.03505401685833931, + "rewards/margins": 0.8145843744277954, + "rewards/rejected": -0.8496490716934204, + "step": 980 + }, + { + "epoch": 0.24958245359720166, + "grad_norm": 88.81765747070312, + "learning_rate": 4.159663865546218e-07, + "logits/chosen": -1.2433593273162842, + "logits/rejected": -1.279296875, + "logps/chosen": -270.4750061035156, + "logps/rejected": -264.48748779296875, + "loss": 0.5104, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.21263274550437927, + "rewards/margins": 0.974438488483429, + "rewards/rejected": -1.1868622303009033, + "step": 990 + }, + { + "epoch": 0.25210348848202185, + "grad_norm": 94.12862396240234, + "learning_rate": 4.2016806722689076e-07, + "logits/chosen": -1.2711670398712158, + "logits/rejected": -1.3352539539337158, + "logps/chosen": -300.98126220703125, + "logps/rejected": -301.36248779296875, + "loss": 0.5281, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.40046995878219604, + "rewards/margins": 0.9420257806777954, + "rewards/rejected": -1.3425171375274658, + "step": 1000 + }, + { + "epoch": 0.2546245233668421, + "grad_norm": 76.39344024658203, + "learning_rate": 4.2436974789915964e-07, + "logits/chosen": -1.215795874595642, + "logits/rejected": -1.247656226158142, + "logps/chosen": -274.0, + "logps/rejected": -274.6499938964844, + "loss": 0.5548, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": 0.12611083686351776, + "rewards/margins": 0.807659924030304, + "rewards/rejected": -0.6822448968887329, + "step": 1010 + }, + { + "epoch": 0.2571455582516623, + "grad_norm": 62.967628479003906, + "learning_rate": 4.285714285714285e-07, + 
"logits/chosen": -1.2568359375, + "logits/rejected": -1.3361084461212158, + "logps/chosen": -298.0718688964844, + "logps/rejected": -263.27813720703125, + "loss": 0.5717, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.04311523586511612, + "rewards/margins": 0.7250305414199829, + "rewards/rejected": -0.682232677936554, + "step": 1020 + }, + { + "epoch": 0.25966659313648255, + "grad_norm": 72.51446533203125, + "learning_rate": 4.327731092436975e-07, + "logits/chosen": -1.292822241783142, + "logits/rejected": -1.268945336341858, + "logps/chosen": -294.0249938964844, + "logps/rejected": -285.58123779296875, + "loss": 0.5328, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.2802795469760895, + "rewards/margins": 0.7537078857421875, + "rewards/rejected": -0.47330933809280396, + "step": 1030 + }, + { + "epoch": 0.26218762802130274, + "grad_norm": 110.47750091552734, + "learning_rate": 4.369747899159664e-07, + "logits/chosen": NaN, + "logits/rejected": NaN, + "logps/chosen": -270.46875, + "logps/rejected": -256.91876220703125, + "loss": 0.5586, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.0010284424060955644, + "rewards/margins": 0.735827624797821, + "rewards/rejected": -0.737213134765625, + "step": 1040 + }, + { + "epoch": 0.26470866290612294, + "grad_norm": 99.0147476196289, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -1.225927710533142, + "logits/rejected": -1.264062523841858, + "logps/chosen": -284.73126220703125, + "logps/rejected": -259.83123779296875, + "loss": 0.5227, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.999798595905304, + "rewards/margins": 0.890423595905304, + "rewards/rejected": -1.889714002609253, + "step": 1050 + }, + { + "epoch": 0.2672296977909432, + "grad_norm": 82.93985748291016, + "learning_rate": 4.453781512605042e-07, + "logits/chosen": -1.226342797279358, + "logits/rejected": -1.2687499523162842, + "logps/chosen": -262.21875, + "logps/rejected": -264.4437561035156, + 
"loss": 0.5703, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.681231677532196, + "rewards/margins": 0.7460876703262329, + "rewards/rejected": -1.4273468255996704, + "step": 1060 + }, + { + "epoch": 0.2697507326757634, + "grad_norm": 88.91585540771484, + "learning_rate": 4.495798319327731e-07, + "logits/chosen": -1.2632324695587158, + "logits/rejected": -1.2573730945587158, + "logps/chosen": -303.3687438964844, + "logps/rejected": -292.6625061035156, + "loss": 0.5432, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.758679211139679, + "rewards/margins": 0.8320068120956421, + "rewards/rejected": -1.590661644935608, + "step": 1070 + }, + { + "epoch": 0.27227176756058363, + "grad_norm": 76.33614349365234, + "learning_rate": 4.53781512605042e-07, + "logits/chosen": -1.189550757408142, + "logits/rejected": NaN, + "logps/chosen": -317.40313720703125, + "logps/rejected": -271.9312438964844, + "loss": 0.582, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5956023931503296, + "rewards/margins": 0.6631225347518921, + "rewards/rejected": -1.2587738037109375, + "step": 1080 + }, + { + "epoch": 0.2747928024454038, + "grad_norm": 83.96426391601562, + "learning_rate": 4.579831932773109e-07, + "logits/chosen": -1.24169921875, + "logits/rejected": -1.2775390148162842, + "logps/chosen": -276.65625, + "logps/rejected": -270.703125, + "loss": 0.5503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7082275152206421, + "rewards/margins": 0.714599609375, + "rewards/rejected": -1.4234130382537842, + "step": 1090 + }, + { + "epoch": 0.2773138373302241, + "grad_norm": 65.58870697021484, + "learning_rate": 4.6218487394957986e-07, + "logits/chosen": -1.245947241783142, + "logits/rejected": -1.291894555091858, + "logps/chosen": -319.10626220703125, + "logps/rejected": -290.3062438964844, + "loss": 0.5692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.990826427936554, + "rewards/margins": 
0.6439148187637329, + "rewards/rejected": -1.6346435546875, + "step": 1100 + }, + { + "epoch": 0.2798348722150443, + "grad_norm": 69.3920669555664, + "learning_rate": 4.6638655462184874e-07, + "logits/chosen": -1.219946265220642, + "logits/rejected": -1.2772216796875, + "logps/chosen": -292.7906188964844, + "logps/rejected": -255.46875, + "loss": 0.5544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.80059814453125, + "rewards/margins": 0.734448254108429, + "rewards/rejected": -1.534936547279358, + "step": 1110 + }, + { + "epoch": 0.28235590709986447, + "grad_norm": 61.433929443359375, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -1.2393310070037842, + "logits/rejected": -1.2877686023712158, + "logps/chosen": -286.4937438964844, + "logps/rejected": -263.48126220703125, + "loss": 0.5409, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4710235595703125, + "rewards/margins": 0.8074401617050171, + "rewards/rejected": -1.278076171875, + "step": 1120 + }, + { + "epoch": 0.2848769419846847, + "grad_norm": 77.4461898803711, + "learning_rate": 4.747899159663865e-07, + "logits/chosen": -1.238745093345642, + "logits/rejected": -1.228674292564392, + "logps/chosen": -318.1499938964844, + "logps/rejected": -300.8687438964844, + "loss": 0.5577, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3881072998046875, + "rewards/margins": 0.819897472858429, + "rewards/rejected": -1.2079254388809204, + "step": 1130 + }, + { + "epoch": 0.2873979768695049, + "grad_norm": 60.04735565185547, + "learning_rate": 4.789915966386554e-07, + "logits/chosen": -1.271484375, + "logits/rejected": -1.2472107410430908, + "logps/chosen": -286.38751220703125, + "logps/rejected": -267.48748779296875, + "loss": 0.5506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02253875695168972, + "rewards/margins": 0.913525402545929, + "rewards/rejected": -0.9367294311523438, + "step": 1140 + }, + { + "epoch": 0.28991901175432516, + "grad_norm": 
73.21792602539062, + "learning_rate": 4.831932773109244e-07, + "logits/chosen": -1.256591796875, + "logits/rejected": -1.2744140625, + "logps/chosen": -247.07186889648438, + "logps/rejected": -249.0281219482422, + "loss": 0.5595, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13423919677734375, + "rewards/margins": 0.7934204339981079, + "rewards/rejected": -0.9277893304824829, + "step": 1150 + }, + { + "epoch": 0.29244004663914536, + "grad_norm": 87.73804473876953, + "learning_rate": 4.873949579831933e-07, + "logits/chosen": -1.2108154296875, + "logits/rejected": -1.281152367591858, + "logps/chosen": -293.52813720703125, + "logps/rejected": -270.58123779296875, + "loss": 0.5719, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6134201288223267, + "rewards/margins": 0.809191882610321, + "rewards/rejected": -1.4230468273162842, + "step": 1160 + }, + { + "epoch": 0.2949610815239656, + "grad_norm": 72.05789947509766, + "learning_rate": 4.915966386554621e-07, + "logits/chosen": -1.179834008216858, + "logits/rejected": -1.2007324695587158, + "logps/chosen": -309.3062438964844, + "logps/rejected": -277.39373779296875, + "loss": 0.5744, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -1.015649437904358, + "rewards/margins": 0.764697253704071, + "rewards/rejected": -1.779718041419983, + "step": 1170 + }, + { + "epoch": 0.2974821164087858, + "grad_norm": 72.16474914550781, + "learning_rate": 4.95798319327731e-07, + "logits/chosen": -1.264892578125, + "logits/rejected": -1.2868163585662842, + "logps/chosen": -291.6312561035156, + "logps/rejected": -277.23748779296875, + "loss": 0.5899, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.49605637788772583, + "rewards/margins": 0.6390106081962585, + "rewards/rejected": -1.1349884271621704, + "step": 1180 + }, + { + "epoch": 0.30000315129360605, + "grad_norm": 89.52666473388672, + "learning_rate": 5e-07, + "logits/chosen": -1.2664062976837158, + "logits/rejected": 
-1.28076171875, + "logps/chosen": -311.8187561035156, + "logps/rejected": -287.5625, + "loss": 0.4778, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": 0.05623779445886612, + "rewards/margins": 1.046759009361267, + "rewards/rejected": -0.9906204342842102, + "step": 1190 + }, + { + "epoch": 0.30252418617842625, + "grad_norm": 80.31555938720703, + "learning_rate": 4.999989240484344e-07, + "logits/chosen": -1.2333495616912842, + "logits/rejected": -1.224755883216858, + "logps/chosen": -294.96875, + "logps/rejected": -292.45623779296875, + "loss": 0.5438, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5280731320381165, + "rewards/margins": 0.948699951171875, + "rewards/rejected": -1.477178931236267, + "step": 1200 + }, + { + "epoch": 0.30504522106324644, + "grad_norm": 76.8759765625, + "learning_rate": 4.999956962029988e-07, + "logits/chosen": -1.238378882408142, + "logits/rejected": -1.2504394054412842, + "logps/chosen": -295.5874938964844, + "logps/rejected": -296.97186279296875, + "loss": 0.5663, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9184936285018921, + "rewards/margins": 0.7978515625, + "rewards/rejected": -1.716162085533142, + "step": 1210 + }, + { + "epoch": 0.3075662559480667, + "grad_norm": 73.19841003417969, + "learning_rate": 4.999903164914773e-07, + "logits/chosen": -1.2347412109375, + "logits/rejected": -1.262109398841858, + "logps/chosen": -265.07501220703125, + "logps/rejected": -253.00936889648438, + "loss": 0.4706, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.09563598781824112, + "rewards/margins": 1.0638549327850342, + "rewards/rejected": -1.1590576171875, + "step": 1220 + }, + { + "epoch": 0.3100872908328869, + "grad_norm": 76.41851806640625, + "learning_rate": 4.999827849601764e-07, + "logits/chosen": -1.217748999595642, + "logits/rejected": -1.246728539466858, + "logps/chosen": -298.8812561035156, + "logps/rejected": -296.73126220703125, + "loss": 0.5161, + 
"rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.3525436520576477, + "rewards/margins": 0.919293224811554, + "rewards/rejected": -1.2719390392303467, + "step": 1230 + }, + { + "epoch": 0.31260832571770714, + "grad_norm": 72.04425048828125, + "learning_rate": 4.999731016739247e-07, + "logits/chosen": -1.201635718345642, + "logits/rejected": -1.237548828125, + "logps/chosen": -263.76873779296875, + "logps/rejected": -271.63751220703125, + "loss": 0.5377, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.5170654058456421, + "rewards/margins": 0.9412124752998352, + "rewards/rejected": -1.4580810070037842, + "step": 1240 + }, + { + "epoch": 0.31512936060252733, + "grad_norm": 81.7342300415039, + "learning_rate": 4.99961266716072e-07, + "logits/chosen": -1.230615258216858, + "logits/rejected": -1.2393066883087158, + "logps/chosen": -270.1812438964844, + "logps/rejected": -244.5124969482422, + "loss": 0.5097, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.07908324897289276, + "rewards/margins": 1.05462646484375, + "rewards/rejected": -1.1331360340118408, + "step": 1250 + }, + { + "epoch": 0.3176503954873476, + "grad_norm": 77.84716796875, + "learning_rate": 4.999472801884891e-07, + "logits/chosen": -1.26123046875, + "logits/rejected": -1.281103491783142, + "logps/chosen": -287.46875, + "logps/rejected": -265.29376220703125, + "loss": 0.5473, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27579039335250854, + "rewards/margins": 1.008734107017517, + "rewards/rejected": -1.283843994140625, + "step": 1260 + }, + { + "epoch": 0.3201714303721678, + "grad_norm": 79.17774200439453, + "learning_rate": 4.999311422115667e-07, + "logits/chosen": -1.2472655773162842, + "logits/rejected": -1.268945336341858, + "logps/chosen": -291.84063720703125, + "logps/rejected": -275.8374938964844, + "loss": 0.5277, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.46207427978515625, + "rewards/margins": 
1.02911376953125, + "rewards/rejected": -1.4900543689727783, + "step": 1270 + }, + { + "epoch": 0.32269246525698797, + "grad_norm": 73.71128845214844, + "learning_rate": 4.99912852924214e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.2743408679962158, + "logps/chosen": -275.86248779296875, + "logps/rejected": -257.4312438964844, + "loss": 0.5621, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.861004650592804, + "rewards/margins": 0.884442150592804, + "rewards/rejected": -1.7460525035858154, + "step": 1280 + }, + { + "epoch": 0.3252135001418082, + "grad_norm": 72.18524169921875, + "learning_rate": 4.998924124838582e-07, + "logits/chosen": -1.2352294921875, + "logits/rejected": -1.1897461414337158, + "logps/chosen": -286.125, + "logps/rejected": -284.8687438964844, + "loss": 0.5683, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.728625476360321, + "rewards/margins": 0.8816589117050171, + "rewards/rejected": -1.610467553138733, + "step": 1290 + }, + { + "epoch": 0.3277345350266284, + "grad_norm": 87.51490783691406, + "learning_rate": 4.99869821066443e-07, + "logits/chosen": -1.231103539466858, + "logits/rejected": -1.284033179283142, + "logps/chosen": -290.26873779296875, + "logps/rejected": -262.1000061035156, + "loss": 0.5934, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.03660278394818306, + "rewards/margins": 0.6874328851699829, + "rewards/rejected": -0.7236099243164062, + "step": 1300 + }, + { + "epoch": 0.33025556991144867, + "grad_norm": 90.1872329711914, + "learning_rate": 4.998450788664262e-07, + "logits/chosen": -1.2363770008087158, + "logits/rejected": -1.287353515625, + "logps/chosen": -289.62188720703125, + "logps/rejected": -279.73126220703125, + "loss": 0.5287, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.158843994140625, + "rewards/margins": 0.8626343011856079, + "rewards/rejected": -0.7040694952011108, + "step": 1310 + }, + { + "epoch": 0.33277660479626886, + 
"grad_norm": 114.13217163085938, + "learning_rate": 4.998181860967792e-07, + "logits/chosen": -1.1602051258087158, + "logits/rejected": -1.217187523841858, + "logps/chosen": -284.8125, + "logps/rejected": -261.71563720703125, + "loss": 0.5191, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2588607668876648, + "rewards/margins": 0.930255115032196, + "rewards/rejected": -1.189550757408142, + "step": 1320 + }, + { + "epoch": 0.3352976396810891, + "grad_norm": 71.58808898925781, + "learning_rate": 4.997891429889845e-07, + "logits/chosen": -1.2212402820587158, + "logits/rejected": -1.2065918445587158, + "logps/chosen": -291.2562561035156, + "logps/rejected": -277.01251220703125, + "loss": 0.572, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.5848525762557983, + "rewards/margins": 0.8794952630996704, + "rewards/rejected": -1.464563012123108, + "step": 1330 + }, + { + "epoch": 0.3378186745659093, + "grad_norm": 63.032630920410156, + "learning_rate": 4.997579497930341e-07, + "logits/chosen": -1.193566918373108, + "logits/rejected": -1.198278784751892, + "logps/chosen": -313.0531311035156, + "logps/rejected": -278.10626220703125, + "loss": 0.528, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.6841018795967102, + "rewards/margins": 1.0489501953125, + "rewards/rejected": -1.733422875404358, + "step": 1340 + }, + { + "epoch": 0.3403397094507295, + "grad_norm": 78.56951904296875, + "learning_rate": 4.997246067774266e-07, + "logits/chosen": -1.218603491783142, + "logits/rejected": -1.226709008216858, + "logps/chosen": -285.6812438964844, + "logps/rejected": -266.7875061035156, + "loss": 0.5302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16157226264476776, + "rewards/margins": 1.0615661144256592, + "rewards/rejected": -1.223168969154358, + "step": 1350 + }, + { + "epoch": 0.34286074433554975, + "grad_norm": 76.92902374267578, + "learning_rate": 4.99689114229166e-07, + "logits/chosen": 
-1.2064940929412842, + "logits/rejected": -1.243896484375, + "logps/chosen": -282.04376220703125, + "logps/rejected": -265.60626220703125, + "loss": 0.4936, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.0020385743118822575, + "rewards/margins": 1.1804535388946533, + "rewards/rejected": -1.181549072265625, + "step": 1360 + }, + { + "epoch": 0.34538177922036994, + "grad_norm": 56.34629440307617, + "learning_rate": 4.996514724537585e-07, + "logits/chosen": -1.178930640220642, + "logits/rejected": -1.2376220226287842, + "logps/chosen": -306.2124938964844, + "logps/rejected": -279.7749938964844, + "loss": 0.5027, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7499847412109375, + "rewards/margins": 1.110345482826233, + "rewards/rejected": -1.86083984375, + "step": 1370 + }, + { + "epoch": 0.3479028141051902, + "grad_norm": 60.09955596923828, + "learning_rate": 4.996116817752096e-07, + "logits/chosen": -1.16754150390625, + "logits/rejected": -1.201391577720642, + "logps/chosen": -301.88751220703125, + "logps/rejected": -297.83123779296875, + "loss": 0.5366, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8794525265693665, + "rewards/margins": 1.064611792564392, + "rewards/rejected": -1.9426391124725342, + "step": 1380 + }, + { + "epoch": 0.3504238489900104, + "grad_norm": 63.94362258911133, + "learning_rate": 4.995697425360223e-07, + "logits/chosen": -1.216455101966858, + "logits/rejected": -1.1755859851837158, + "logps/chosen": -284.35626220703125, + "logps/rejected": -255.9187469482422, + "loss": 0.5926, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7036636471748352, + "rewards/margins": 0.81256103515625, + "rewards/rejected": -1.5166473388671875, + "step": 1390 + }, + { + "epoch": 0.35294488387483064, + "grad_norm": 109.69884490966797, + "learning_rate": 4.995256550971933e-07, + "logits/chosen": -1.262841820716858, + "logits/rejected": -1.2425537109375, + "logps/chosen": -299.10626220703125, + 
"logps/rejected": -281.3687438964844, + "loss": 0.5305, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.377114862203598, + "rewards/margins": 0.9460830688476562, + "rewards/rejected": -1.323400855064392, + "step": 1400 + }, + { + "epoch": 0.35546591875965083, + "grad_norm": 80.09408569335938, + "learning_rate": 4.9947941983821e-07, + "logits/chosen": -1.225195288658142, + "logits/rejected": -1.2828369140625, + "logps/chosen": -275.29998779296875, + "logps/rejected": -276.2562561035156, + "loss": 0.5974, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.24356994032859802, + "rewards/margins": 0.79180908203125, + "rewards/rejected": -1.0354797840118408, + "step": 1410 + }, + { + "epoch": 0.35798695364447103, + "grad_norm": 60.25597381591797, + "learning_rate": 4.994310371570477e-07, + "logits/chosen": -1.2249755859375, + "logits/rejected": -1.2291991710662842, + "logps/chosen": -277.26873779296875, + "logps/rejected": -263.29376220703125, + "loss": 0.5328, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": 0.07745361328125, + "rewards/margins": 0.9789673089981079, + "rewards/rejected": -0.901257336139679, + "step": 1420 + }, + { + "epoch": 0.3605079885292913, + "grad_norm": 60.347469329833984, + "learning_rate": 4.993805074701659e-07, + "logits/chosen": -1.164282202720642, + "logits/rejected": -1.2513916492462158, + "logps/chosen": -299.3656311035156, + "logps/rejected": -283.31561279296875, + "loss": 0.5581, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.4916442930698395, + "rewards/margins": 0.9429168701171875, + "rewards/rejected": -1.4341919422149658, + "step": 1430 + }, + { + "epoch": 0.3630290234141115, + "grad_norm": 81.96858978271484, + "learning_rate": 4.993278312125045e-07, + "logits/chosen": -1.2378418445587158, + "logits/rejected": -1.256103515625, + "logps/chosen": -295.25, + "logps/rejected": -272.72186279296875, + "loss": 0.569, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 
-1.0679504871368408, + "rewards/margins": 0.987762451171875, + "rewards/rejected": -2.055981397628784, + "step": 1440 + }, + { + "epoch": 0.3655500582989317, + "grad_norm": 76.29911804199219, + "learning_rate": 4.992730088374802e-07, + "logits/chosen": -1.215185523033142, + "logits/rejected": -1.2369873523712158, + "logps/chosen": -307.68438720703125, + "logps/rejected": -293.7250061035156, + "loss": 0.5266, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.8915435671806335, + "rewards/margins": 1.090203881263733, + "rewards/rejected": -1.9816772937774658, + "step": 1450 + }, + { + "epoch": 0.3680710931837519, + "grad_norm": 70.62122344970703, + "learning_rate": 4.992160408169828e-07, + "logits/chosen": -1.286474585533142, + "logits/rejected": -1.270532250404358, + "logps/chosen": -287.875, + "logps/rejected": -276.76251220703125, + "loss": 0.5476, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.19958190619945526, + "rewards/margins": 0.9004150629043579, + "rewards/rejected": -1.1003234386444092, + "step": 1460 + }, + { + "epoch": 0.37059212806857217, + "grad_norm": 74.41858673095703, + "learning_rate": 4.991569276413711e-07, + "logits/chosen": -1.216650366783142, + "logits/rejected": -1.264550805091858, + "logps/chosen": -312.46875, + "logps/rejected": -291.67498779296875, + "loss": 0.5291, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3129440248012543, + "rewards/margins": 1.09747314453125, + "rewards/rejected": -1.4101440906524658, + "step": 1470 + }, + { + "epoch": 0.37311316295339236, + "grad_norm": 81.14917755126953, + "learning_rate": 4.990956698194681e-07, + "logits/chosen": -1.228118896484375, + "logits/rejected": -1.2777588367462158, + "logps/chosen": -272.15625, + "logps/rejected": -272.5062561035156, + "loss": 0.5202, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8779250979423523, + "rewards/margins": 1.0378296375274658, + "rewards/rejected": -1.9163818359375, + "step": 1480 + 
}, + { + "epoch": 0.3756341978382126, + "grad_norm": 87.38745880126953, + "learning_rate": 4.990322678785578e-07, + "logits/chosen": -1.238916039466858, + "logits/rejected": -1.2570068836212158, + "logps/chosen": -297.91876220703125, + "logps/rejected": -295.28125, + "loss": 0.4919, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -1.1091797351837158, + "rewards/margins": 1.247705101966858, + "rewards/rejected": -2.355029344558716, + "step": 1490 + }, + { + "epoch": 0.3781552327230328, + "grad_norm": 74.2760009765625, + "learning_rate": 4.989667223643792e-07, + "logits/chosen": -1.2134277820587158, + "logits/rejected": -1.200659155845642, + "logps/chosen": -301.6499938964844, + "logps/rejected": -299.79998779296875, + "loss": 0.5682, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -1.2309494018554688, + "rewards/margins": 1.127984642982483, + "rewards/rejected": -2.3590087890625, + "step": 1500 + }, + { + "epoch": 0.380676267607853, + "grad_norm": 70.29530334472656, + "learning_rate": 4.988990338411229e-07, + "logits/chosen": -1.2376220226287842, + "logits/rejected": -1.272760033607483, + "logps/chosen": -320.3062438964844, + "logps/rejected": -278.5874938964844, + "loss": 0.5067, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.02947998046875, + "rewards/margins": 1.2171509265899658, + "rewards/rejected": -2.24700927734375, + "step": 1510 + }, + { + "epoch": 0.38319730249267325, + "grad_norm": 44.722530364990234, + "learning_rate": 4.988292028914254e-07, + "logits/chosen": -1.245141625404358, + "logits/rejected": -1.253173828125, + "logps/chosen": -284.7718811035156, + "logps/rejected": -275.0375061035156, + "loss": 0.5143, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5649795532226562, + "rewards/margins": 1.0688445568084717, + "rewards/rejected": -1.633752465248108, + "step": 1520 + }, + { + "epoch": 0.38571833737749345, + "grad_norm": 86.90478515625, + "learning_rate": 4.987572301163644e-07, + "logits/chosen": 
-1.238378882408142, + "logits/rejected": -1.230688452720642, + "logps/chosen": -303.03125, + "logps/rejected": -274.4312438964844, + "loss": 0.5151, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.41166990995407104, + "rewards/margins": 1.1161895990371704, + "rewards/rejected": -1.527093529701233, + "step": 1530 + }, + { + "epoch": 0.3882393722623137, + "grad_norm": 78.6090087890625, + "learning_rate": 4.986831161354537e-07, + "logits/chosen": -1.23388671875, + "logits/rejected": -1.258203148841858, + "logps/chosen": -283.61248779296875, + "logps/rejected": -279.29376220703125, + "loss": 0.5023, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.6829506158828735, + "rewards/margins": 1.1151854991912842, + "rewards/rejected": -1.79736328125, + "step": 1540 + }, + { + "epoch": 0.3907604071471339, + "grad_norm": 111.22064971923828, + "learning_rate": 4.986068615866377e-07, + "logits/chosen": -1.1684691905975342, + "logits/rejected": -1.2122070789337158, + "logps/chosen": -292.04998779296875, + "logps/rejected": -285.5687561035156, + "loss": 0.5618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.413818359375, + "rewards/margins": 1.0403320789337158, + "rewards/rejected": -2.4546875953674316, + "step": 1550 + }, + { + "epoch": 0.39328144203195414, + "grad_norm": 106.2069320678711, + "learning_rate": 4.985284671262863e-07, + "logits/chosen": -1.1885986328125, + "logits/rejected": -1.222631812095642, + "logps/chosen": -300.015625, + "logps/rejected": -273.89373779296875, + "loss": 0.5136, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3954284191131592, + "rewards/margins": 1.0596771240234375, + "rewards/rejected": -2.454785108566284, + "step": 1560 + }, + { + "epoch": 0.39580247691677434, + "grad_norm": 56.032257080078125, + "learning_rate": 4.984479334291882e-07, + "logits/chosen": -1.2393615245819092, + "logits/rejected": -1.23095703125, + "logps/chosen": -289.7250061035156, + "logps/rejected": 
-294.17498779296875, + "loss": 0.5326, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7645019292831421, + "rewards/margins": 1.0052673816680908, + "rewards/rejected": -1.7699096202850342, + "step": 1570 + }, + { + "epoch": 0.39832351180159453, + "grad_norm": 79.32884216308594, + "learning_rate": 4.983652611885465e-07, + "logits/chosen": -1.1991698741912842, + "logits/rejected": -1.258935570716858, + "logps/chosen": -261.4125061035156, + "logps/rejected": -287.96875, + "loss": 0.4822, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.258187860250473, + "rewards/margins": 1.1524276733398438, + "rewards/rejected": -1.4100220203399658, + "step": 1580 + }, + { + "epoch": 0.4008445466864148, + "grad_norm": 103.19246673583984, + "learning_rate": 4.982804511159718e-07, + "logits/chosen": -1.2248046398162842, + "logits/rejected": -1.2166839838027954, + "logps/chosen": -308.65936279296875, + "logps/rejected": -285.4375, + "loss": 0.5429, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.41632384061813354, + "rewards/margins": 0.9964599609375, + "rewards/rejected": -1.4127594232559204, + "step": 1590 + }, + { + "epoch": 0.403365581571235, + "grad_norm": 67.15472412109375, + "learning_rate": 4.981935039414763e-07, + "logits/chosen": -1.2517821788787842, + "logits/rejected": -1.2402832508087158, + "logps/chosen": -296.7562561035156, + "logps/rejected": -255.85000610351562, + "loss": 0.5903, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.444192498922348, + "rewards/margins": 1.0130493640899658, + "rewards/rejected": -1.456445336341858, + "step": 1600 + }, + { + "epoch": 0.4058866164560552, + "grad_norm": 90.23637390136719, + "learning_rate": 4.981044204134676e-07, + "logits/chosen": -1.244873046875, + "logits/rejected": -1.239648461341858, + "logps/chosen": -288.7250061035156, + "logps/rejected": -286.16876220703125, + "loss": 0.5393, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.34358978271484375, + 
"rewards/margins": 1.0264098644256592, + "rewards/rejected": -1.37060546875, + "step": 1610 + }, + { + "epoch": 0.4084076513408754, + "grad_norm": 74.42823791503906, + "learning_rate": 4.980132012987421e-07, + "logits/chosen": NaN, + "logits/rejected": -1.278662085533142, + "logps/chosen": -295.98126220703125, + "logps/rejected": -286.5874938964844, + "loss": 0.5761, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.25356751680374146, + "rewards/margins": 0.9522186517715454, + "rewards/rejected": -1.2055633068084717, + "step": 1620 + }, + { + "epoch": 0.41092868622569567, + "grad_norm": 73.99746704101562, + "learning_rate": 4.979198473824788e-07, + "logits/chosen": -1.266332983970642, + "logits/rejected": -1.230126976966858, + "logps/chosen": -276.3187561035156, + "logps/rejected": -283.76873779296875, + "loss": 0.5857, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.18967895209789276, + "rewards/margins": 0.88836669921875, + "rewards/rejected": -1.0779266357421875, + "step": 1630 + }, + { + "epoch": 0.41344972111051587, + "grad_norm": 46.616146087646484, + "learning_rate": 4.97824359468232e-07, + "logits/chosen": -1.288915991783142, + "logits/rejected": -1.2846190929412842, + "logps/chosen": -292.8687438964844, + "logps/rejected": -275.16876220703125, + "loss": 0.518, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2936088442802429, + "rewards/margins": 1.007684350013733, + "rewards/rejected": -1.30059814453125, + "step": 1640 + }, + { + "epoch": 0.41597075599533606, + "grad_norm": 87.4283218383789, + "learning_rate": 4.977267383779244e-07, + "logits/chosen": -1.268652319908142, + "logits/rejected": -1.2604491710662842, + "logps/chosen": -313.86248779296875, + "logps/rejected": -284.7875061035156, + "loss": 0.6047, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.5855468511581421, + "rewards/margins": 0.8833038210868835, + "rewards/rejected": -1.4690628051757812, + "step": 1650 + }, + { + 
"epoch": 0.4184917908801563, + "grad_norm": 105.78712463378906, + "learning_rate": 4.976269849518408e-07, + "logits/chosen": -1.226586937904358, + "logits/rejected": -1.279638648033142, + "logps/chosen": -289.37811279296875, + "logps/rejected": -290.04998779296875, + "loss": 0.579, + "rewards/accuracies": 0.6781250238418579, + "rewards/chosen": -0.7635498046875, + "rewards/margins": 0.906237781047821, + "rewards/rejected": -1.6697266101837158, + "step": 1660 + }, + { + "epoch": 0.4210128257649765, + "grad_norm": 64.9921646118164, + "learning_rate": 4.9752510004862e-07, + "logits/chosen": -1.2200195789337158, + "logits/rejected": -1.2061767578125, + "logps/chosen": -281.3812561035156, + "logps/rejected": -284.1812438964844, + "loss": 0.5522, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7096710205078125, + "rewards/margins": 1.0792877674102783, + "rewards/rejected": -1.788580298423767, + "step": 1670 + }, + { + "epoch": 0.42353386064979676, + "grad_norm": 68.02913665771484, + "learning_rate": 4.974210845452476e-07, + "logits/chosen": -1.2034912109375, + "logits/rejected": -1.201416015625, + "logps/chosen": -300.07501220703125, + "logps/rejected": -315.1937561035156, + "loss": 0.5504, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8024185299873352, + "rewards/margins": 1.2061767578125, + "rewards/rejected": -2.0079345703125, + "step": 1680 + }, + { + "epoch": 0.42605489553461695, + "grad_norm": 64.83089447021484, + "learning_rate": 4.97314939337049e-07, + "logits/chosen": -1.249243140220642, + "logits/rejected": -1.2323729991912842, + "logps/chosen": -312.64373779296875, + "logps/rejected": -290.66876220703125, + "loss": 0.5003, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8062804937362671, + "rewards/margins": 1.150183081626892, + "rewards/rejected": -1.9544677734375, + "step": 1690 + }, + { + "epoch": 0.4285759304194372, + "grad_norm": 52.674869537353516, + "learning_rate": 4.972066653376808e-07, + 
"logits/chosen": -1.138159155845642, + "logits/rejected": -1.2062499523162842, + "logps/chosen": -281.0062561035156, + "logps/rejected": -274.3374938964844, + "loss": 0.4777, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5297576785087585, + "rewards/margins": 1.428741455078125, + "rewards/rejected": -1.957452416419983, + "step": 1700 + }, + { + "epoch": 0.4310969653042574, + "grad_norm": 79.29434967041016, + "learning_rate": 4.970962634791238e-07, + "logits/chosen": -1.2255127429962158, + "logits/rejected": -1.250732421875, + "logps/chosen": -300.34686279296875, + "logps/rejected": -268.4125061035156, + "loss": 0.5453, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44197386503219604, + "rewards/margins": 1.0496704578399658, + "rewards/rejected": -1.4912230968475342, + "step": 1710 + }, + { + "epoch": 0.4336180001890776, + "grad_norm": 81.51036071777344, + "learning_rate": 4.969837347116744e-07, + "logits/chosen": -1.215917944908142, + "logits/rejected": -1.224023461341858, + "logps/chosen": -304.67498779296875, + "logps/rejected": -292.2250061035156, + "loss": 0.5661, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.29062652587890625, + "rewards/margins": 0.96728515625, + "rewards/rejected": -1.257940649986267, + "step": 1720 + }, + { + "epoch": 0.43613903507389784, + "grad_norm": 101.07459259033203, + "learning_rate": 4.968690800039365e-07, + "logits/chosen": -1.227880835533142, + "logits/rejected": NaN, + "logps/chosen": -295.9437561035156, + "logps/rejected": -275.73748779296875, + "loss": 0.5391, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3242965638637543, + "rewards/margins": 1.051721215248108, + "rewards/rejected": -1.3746246099472046, + "step": 1730 + }, + { + "epoch": 0.43866006995871804, + "grad_norm": 56.40563201904297, + "learning_rate": 4.967523003428134e-07, + "logits/chosen": -1.218603491783142, + "logits/rejected": -1.171240210533142, + "logps/chosen": 
-299.00311279296875, + "logps/rejected": -297.45001220703125, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2166931629180908, + "rewards/margins": 0.8678528070449829, + "rewards/rejected": -2.084667921066284, + "step": 1740 + }, + { + "epoch": 0.4411811048435383, + "grad_norm": 66.51710510253906, + "learning_rate": 4.966333967334992e-07, + "logits/chosen": -1.186132788658142, + "logits/rejected": -1.247412085533142, + "logps/chosen": -304.3687438964844, + "logps/rejected": -270.8187561035156, + "loss": 0.5463, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0644409656524658, + "rewards/margins": 1.007904052734375, + "rewards/rejected": -2.071972608566284, + "step": 1750 + }, + { + "epoch": 0.4437021397283585, + "grad_norm": 62.56184005737305, + "learning_rate": 4.965123701994703e-07, + "logits/chosen": -1.208642601966858, + "logits/rejected": -1.2099120616912842, + "logps/chosen": -276.78125, + "logps/rejected": -277.39373779296875, + "loss": 0.5387, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0719420909881592, + "rewards/margins": 1.0028259754180908, + "rewards/rejected": -2.0741944313049316, + "step": 1760 + }, + { + "epoch": 0.44622317461317873, + "grad_norm": 58.99638366699219, + "learning_rate": 4.963892217824761e-07, + "logits/chosen": -1.20947265625, + "logits/rejected": -1.2544434070587158, + "logps/chosen": -276.07501220703125, + "logps/rejected": -262.7875061035156, + "loss": 0.5892, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6793349981307983, + "rewards/margins": 1.070892333984375, + "rewards/rejected": -1.750097632408142, + "step": 1770 + }, + { + "epoch": 0.4487442094979989, + "grad_norm": 63.794219970703125, + "learning_rate": 4.962639525425303e-07, + "logits/chosen": -1.280517578125, + "logits/rejected": -1.318701148033142, + "logps/chosen": -290.01873779296875, + "logps/rejected": -275.1000061035156, + "loss": 0.5096, + "rewards/accuracies": 0.746874988079071, + 
"rewards/chosen": -0.1992340087890625, + "rewards/margins": 1.1635620594024658, + "rewards/rejected": -1.3624451160430908, + "step": 1780 + }, + { + "epoch": 0.4512652443828191, + "grad_norm": 55.53218078613281, + "learning_rate": 4.961365635579021e-07, + "logits/chosen": -1.268701195716858, + "logits/rejected": -1.291357398033142, + "logps/chosen": -277.58123779296875, + "logps/rejected": -247.2624969482422, + "loss": 0.5004, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.1181182861328125, + "rewards/margins": 1.1982421875, + "rewards/rejected": -1.316192626953125, + "step": 1790 + }, + { + "epoch": 0.45378627926763937, + "grad_norm": 97.4455337524414, + "learning_rate": 4.960070559251066e-07, + "logits/chosen": -1.2337646484375, + "logits/rejected": -1.273535132408142, + "logps/chosen": -307.07501220703125, + "logps/rejected": -273.15936279296875, + "loss": 0.5169, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.477792352437973, + "rewards/margins": 1.31329345703125, + "rewards/rejected": -1.7910888195037842, + "step": 1800 + }, + { + "epoch": 0.45630731415245956, + "grad_norm": 64.75396728515625, + "learning_rate": 4.958754307588952e-07, + "logits/chosen": -1.198339819908142, + "logits/rejected": -1.2329590320587158, + "logps/chosen": -289.68438720703125, + "logps/rejected": -287.51251220703125, + "loss": 0.5812, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0152862071990967, + "rewards/margins": 1.1847991943359375, + "rewards/rejected": -2.2004973888397217, + "step": 1810 + }, + { + "epoch": 0.4588283490372798, + "grad_norm": 89.44642639160156, + "learning_rate": 4.957416891922463e-07, + "logits/chosen": -1.2565796375274658, + "logits/rejected": -1.2080566883087158, + "logps/chosen": -303.11248779296875, + "logps/rejected": -298.90625, + "loss": 0.5223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.950427234172821, + "rewards/margins": 1.1783263683319092, + "rewards/rejected": 
-2.129077196121216, + "step": 1820 + }, + { + "epoch": 0.4613493839221, + "grad_norm": 64.8090591430664, + "learning_rate": 4.956058323763555e-07, + "logits/chosen": -1.2447998523712158, + "logits/rejected": -1.2228515148162842, + "logps/chosen": -297.65625, + "logps/rejected": -268.0874938964844, + "loss": 0.5152, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.6448333859443665, + "rewards/margins": 1.299646019935608, + "rewards/rejected": -1.944091796875, + "step": 1830 + }, + { + "epoch": 0.46387041880692026, + "grad_norm": 140.39163208007812, + "learning_rate": 4.954678614806258e-07, + "logits/chosen": -1.247900366783142, + "logits/rejected": -1.224389672279358, + "logps/chosen": -298.84375, + "logps/rejected": -293.09375, + "loss": 0.5528, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.3558692932128906, + "rewards/margins": 1.054406762123108, + "rewards/rejected": -1.4098694324493408, + "step": 1840 + }, + { + "epoch": 0.46639145369174045, + "grad_norm": 56.89118194580078, + "learning_rate": 4.953277776926571e-07, + "logits/chosen": -1.257055640220642, + "logits/rejected": -1.268774390220642, + "logps/chosen": -290.3687438964844, + "logps/rejected": -271.0843811035156, + "loss": 0.5025, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.17799682915210724, + "rewards/margins": 1.232853651046753, + "rewards/rejected": -1.4112396240234375, + "step": 1850 + }, + { + "epoch": 0.4689124885765607, + "grad_norm": 65.44432830810547, + "learning_rate": 4.951855822182363e-07, + "logits/chosen": -1.2215576171875, + "logits/rejected": -1.235107421875, + "logps/chosen": -307.48126220703125, + "logps/rejected": -281.35626220703125, + "loss": 0.5334, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.6034393310546875, + "rewards/margins": 1.090521216392517, + "rewards/rejected": -1.6938965320587158, + "step": 1860 + }, + { + "epoch": 0.4714335234613809, + "grad_norm": 85.89576721191406, + "learning_rate": 
4.95041276281327e-07, + "logits/chosen": -1.171777367591858, + "logits/rejected": -1.188232421875, + "logps/chosen": -285.703125, + "logps/rejected": -254.36874389648438, + "loss": 0.5769, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.7682098150253296, + "rewards/margins": 1.019464135169983, + "rewards/rejected": -1.7874023914337158, + "step": 1870 + }, + { + "epoch": 0.4739545583462011, + "grad_norm": 83.72393798828125, + "learning_rate": 4.948948611240588e-07, + "logits/chosen": -1.2034423351287842, + "logits/rejected": -1.2545654773712158, + "logps/chosen": -285.41876220703125, + "logps/rejected": -269.15936279296875, + "loss": 0.6519, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.9195785522460938, + "rewards/margins": 0.881665050983429, + "rewards/rejected": -1.8018066883087158, + "step": 1880 + }, + { + "epoch": 0.47647559323102134, + "grad_norm": 70.85868072509766, + "learning_rate": 4.947463380067166e-07, + "logits/chosen": -1.190771460533142, + "logits/rejected": -1.188134789466858, + "logps/chosen": -270.0531311035156, + "logps/rejected": -266.96875, + "loss": 0.5712, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.18328857421875, + "rewards/margins": 0.9207916259765625, + "rewards/rejected": -2.1041016578674316, + "step": 1890 + }, + { + "epoch": 0.47899662811584154, + "grad_norm": 44.25507354736328, + "learning_rate": 4.945957082077298e-07, + "logits/chosen": -1.2071533203125, + "logits/rejected": -1.2270019054412842, + "logps/chosen": -290.39373779296875, + "logps/rejected": -291.3999938964844, + "loss": 0.5092, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0802001953125, + "rewards/margins": 1.0626647472381592, + "rewards/rejected": -2.142773389816284, + "step": 1900 + }, + { + "epoch": 0.4815176630006618, + "grad_norm": 75.29605865478516, + "learning_rate": 4.944429730236617e-07, + "logits/chosen": -1.2137939929962158, + "logits/rejected": -1.208673119544983, + "logps/chosen": 
-310.04998779296875, + "logps/rejected": -284.546875, + "loss": 0.5739, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.103143334388733, + "rewards/margins": 1.0565338134765625, + "rewards/rejected": -2.1590819358825684, + "step": 1910 + }, + { + "epoch": 0.484038697885482, + "grad_norm": 92.45514678955078, + "learning_rate": 4.942881337691971e-07, + "logits/chosen": -1.1910889148712158, + "logits/rejected": -1.1811034679412842, + "logps/chosen": -286.01251220703125, + "logps/rejected": -278.5249938964844, + "loss": 0.5611, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7924102544784546, + "rewards/margins": 1.12811279296875, + "rewards/rejected": -1.920019507408142, + "step": 1920 + }, + { + "epoch": 0.48655973277030223, + "grad_norm": 45.24339294433594, + "learning_rate": 4.941311917771324e-07, + "logits/chosen": -1.1636962890625, + "logits/rejected": -1.1869628429412842, + "logps/chosen": -313.07501220703125, + "logps/rejected": -286.3531188964844, + "loss": 0.5601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8625229001045227, + "rewards/margins": 1.1022552251815796, + "rewards/rejected": -1.965612769126892, + "step": 1930 + }, + { + "epoch": 0.48908076765512243, + "grad_norm": 64.42066192626953, + "learning_rate": 4.939721483983639e-07, + "logits/chosen": -1.19384765625, + "logits/rejected": -1.223486304283142, + "logps/chosen": -288.98748779296875, + "logps/rejected": -267.63751220703125, + "loss": 0.6278, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.7465095520019531, + "rewards/margins": 1.087432861328125, + "rewards/rejected": -1.833532691001892, + "step": 1940 + }, + { + "epoch": 0.4916018025399426, + "grad_norm": 73.966552734375, + "learning_rate": 4.938110050018747e-07, + "logits/chosen": -1.1904785633087158, + "logits/rejected": -1.214257836341858, + "logps/chosen": -281.84063720703125, + "logps/rejected": -277.6812438964844, + "loss": 0.62, + "rewards/accuracies": 0.671875, + "rewards/chosen": 
-0.7041015625, + "rewards/margins": 1.0345275402069092, + "rewards/rejected": -1.7391235828399658, + "step": 1950 + }, + { + "epoch": 0.4941228374247629, + "grad_norm": 94.01651000976562, + "learning_rate": 4.936477629747253e-07, + "logits/chosen": -1.164306640625, + "logits/rejected": -1.1973876953125, + "logps/chosen": -317.2875061035156, + "logps/rejected": -308.91876220703125, + "loss": 0.5443, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.9269149899482727, + "rewards/margins": 0.9577087163925171, + "rewards/rejected": -1.884362816810608, + "step": 1960 + }, + { + "epoch": 0.49664387230958307, + "grad_norm": 63.18036651611328, + "learning_rate": 4.934824237220395e-07, + "logits/chosen": -1.193017601966858, + "logits/rejected": -1.193090796470642, + "logps/chosen": -307.71875, + "logps/rejected": -277.96875, + "loss": 0.578, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0204544067382812, + "rewards/margins": 1.105505347251892, + "rewards/rejected": -2.1258301734924316, + "step": 1970 + }, + { + "epoch": 0.4991649071944033, + "grad_norm": 66.69244384765625, + "learning_rate": 4.933149886669936e-07, + "logits/chosen": -1.252294898033142, + "logits/rejected": -1.254492163658142, + "logps/chosen": -281.015625, + "logps/rejected": -281.17498779296875, + "loss": 0.5839, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.6375198364257812, + "rewards/margins": 0.9779907464981079, + "rewards/rejected": -1.6150085926055908, + "step": 1980 + }, + { + "epoch": 0.5016859420792236, + "grad_norm": 67.1665267944336, + "learning_rate": 4.931454592508037e-07, + "logits/chosen": -1.1150634288787842, + "logits/rejected": -1.1283690929412842, + "logps/chosen": -270.01873779296875, + "logps/rejected": -251.30624389648438, + "loss": 0.5874, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.4391418397426605, + "rewards/margins": 1.0847870111465454, + "rewards/rejected": -1.524133324623108, + "step": 1990 + }, + { 
+ "epoch": 0.5042069769640437, + "grad_norm": 68.26512145996094, + "learning_rate": 4.929738369327133e-07, + "logits/chosen": -1.143884301185608, + "logits/rejected": -1.1930663585662842, + "logps/chosen": -267.05938720703125, + "logps/rejected": -284.03125, + "loss": 0.5463, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.475616455078125, + "rewards/margins": 1.23541259765625, + "rewards/rejected": -1.7103195190429688, + "step": 2000 + }, + { + "epoch": 0.506728011848864, + "grad_norm": 82.5437240600586, + "learning_rate": 4.928001231899809e-07, + "logits/chosen": -1.1070556640625, + "logits/rejected": -1.1834716796875, + "logps/chosen": -269.8843688964844, + "logps/rejected": -276.53125, + "loss": 0.4932, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5311965942382812, + "rewards/margins": 1.4447052478790283, + "rewards/rejected": -1.9757812023162842, + "step": 2010 + }, + { + "epoch": 0.5092490467336842, + "grad_norm": 84.4751968383789, + "learning_rate": 4.926243195178669e-07, + "logits/chosen": -1.182763695716858, + "logits/rejected": -1.1971557140350342, + "logps/chosen": -253.47500610351562, + "logps/rejected": -257.90625, + "loss": 0.5344, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2667526304721832, + "rewards/margins": 1.116084337234497, + "rewards/rejected": -1.382940649986267, + "step": 2020 + }, + { + "epoch": 0.5117700816185043, + "grad_norm": 56.32522201538086, + "learning_rate": 4.924464274296214e-07, + "logits/chosen": -1.174340844154358, + "logits/rejected": -1.2169067859649658, + "logps/chosen": -287.35626220703125, + "logps/rejected": -264.3812561035156, + "loss": 0.5841, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.005999756045639515, + "rewards/margins": 0.9309631586074829, + "rewards/rejected": -0.9245926141738892, + "step": 2030 + }, + { + "epoch": 0.5142911165033246, + "grad_norm": 86.52482604980469, + "learning_rate": 4.922664484564704e-07, + "logits/chosen": 
-1.199121117591858, + "logits/rejected": -1.259497046470642, + "logps/chosen": -293.8187561035156, + "logps/rejected": -286.4437561035156, + "loss": 0.5224, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.08463134616613388, + "rewards/margins": 1.15631103515625, + "rewards/rejected": -1.241546630859375, + "step": 2040 + }, + { + "epoch": 0.5168121513881448, + "grad_norm": 91.60623931884766, + "learning_rate": 4.920843841476032e-07, + "logits/chosen": -1.1902587413787842, + "logits/rejected": NaN, + "logps/chosen": -292.3031311035156, + "logps/rejected": -273.16876220703125, + "loss": 0.5612, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.268798828125, + "rewards/margins": 1.181677222251892, + "rewards/rejected": -1.450585961341858, + "step": 2050 + }, + { + "epoch": 0.5193331862729651, + "grad_norm": 86.1439208984375, + "learning_rate": 4.91900236070159e-07, + "logits/chosen": -1.124169945716858, + "logits/rejected": -1.155847191810608, + "logps/chosen": -288.82501220703125, + "logps/rejected": -275.0375061035156, + "loss": 0.5803, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7064574956893921, + "rewards/margins": 1.1885802745819092, + "rewards/rejected": -1.894866943359375, + "step": 2060 + }, + { + "epoch": 0.5218542211577852, + "grad_norm": 73.70453643798828, + "learning_rate": 4.917140058092128e-07, + "logits/chosen": -1.213134765625, + "logits/rejected": -1.210473656654358, + "logps/chosen": -274.95623779296875, + "logps/rejected": -275.3062438964844, + "loss": 0.5484, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.441751092672348, + "rewards/margins": 1.181280493736267, + "rewards/rejected": -1.6232421398162842, + "step": 2070 + }, + { + "epoch": 0.5243752560426055, + "grad_norm": 74.48878479003906, + "learning_rate": 4.915256949677628e-07, + "logits/chosen": -1.193359375, + "logits/rejected": -1.2157714366912842, + "logps/chosen": -277.3062438964844, + "logps/rejected": 
-269.32501220703125, + "loss": 0.5753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.715441882610321, + "rewards/margins": 0.9764038324356079, + "rewards/rejected": -1.6917724609375, + "step": 2080 + }, + { + "epoch": 0.5268962909274257, + "grad_norm": 81.1043472290039, + "learning_rate": 4.913353051667155e-07, + "logits/chosen": -1.1866455078125, + "logits/rejected": -1.180078148841858, + "logps/chosen": -309.45001220703125, + "logps/rejected": -277.92498779296875, + "loss": 0.5364, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.877227783203125, + "rewards/margins": 1.07843017578125, + "rewards/rejected": -1.9558594226837158, + "step": 2090 + }, + { + "epoch": 0.5294173258122459, + "grad_norm": 91.78522491455078, + "learning_rate": 4.911428380448727e-07, + "logits/chosen": -1.1493651866912842, + "logits/rejected": -1.1673095226287842, + "logps/chosen": -292.4375, + "logps/rejected": -261.45623779296875, + "loss": 0.5596, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.981091320514679, + "rewards/margins": 1.201904296875, + "rewards/rejected": -2.1824951171875, + "step": 2100 + }, + { + "epoch": 0.5319383606970661, + "grad_norm": 72.78822326660156, + "learning_rate": 4.909482952589169e-07, + "logits/chosen": -1.1352112293243408, + "logits/rejected": -1.1470215320587158, + "logps/chosen": -292.5687561035156, + "logps/rejected": -300.8374938964844, + "loss": 0.5316, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.216394066810608, + "rewards/margins": 1.23919677734375, + "rewards/rejected": -2.455810546875, + "step": 2110 + }, + { + "epoch": 0.5344593955818864, + "grad_norm": 68.66783142089844, + "learning_rate": 4.907516784833968e-07, + "logits/chosen": -1.201318383216858, + "logits/rejected": -1.208349585533142, + "logps/chosen": -319.29998779296875, + "logps/rejected": -301.4750061035156, + "loss": 0.544, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.8149169683456421, + "rewards/margins": 
1.1826903820037842, + "rewards/rejected": -1.9975707530975342, + "step": 2120 + }, + { + "epoch": 0.5369804304667066, + "grad_norm": 70.89950561523438, + "learning_rate": 4.905529894107136e-07, + "logits/chosen": -1.2000732421875, + "logits/rejected": -1.1967284679412842, + "logps/chosen": -317.9750061035156, + "logps/rejected": -269.3374938964844, + "loss": 0.5125, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -0.8348633050918579, + "rewards/margins": 1.355010986328125, + "rewards/rejected": -2.1889891624450684, + "step": 2130 + }, + { + "epoch": 0.5395014653515268, + "grad_norm": 76.0267105102539, + "learning_rate": 4.903522297511058e-07, + "logits/chosen": -1.2312500476837158, + "logits/rejected": -1.2475097179412842, + "logps/chosen": -298.65936279296875, + "logps/rejected": -294.0625, + "loss": 0.4639, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -0.6670074462890625, + "rewards/margins": 1.417443871498108, + "rewards/rejected": -2.0840697288513184, + "step": 2140 + }, + { + "epoch": 0.542022500236347, + "grad_norm": 85.77741241455078, + "learning_rate": 4.901494012326346e-07, + "logits/chosen": -1.179589867591858, + "logits/rejected": -1.2236816883087158, + "logps/chosen": -282.1968688964844, + "logps/rejected": -281.1312561035156, + "loss": 0.5782, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.866058349609375, + "rewards/margins": 1.250817894935608, + "rewards/rejected": -2.116564989089966, + "step": 2150 + }, + { + "epoch": 0.5445435351211673, + "grad_norm": 74.22309112548828, + "learning_rate": 4.899445056011695e-07, + "logits/chosen": -1.203039526939392, + "logits/rejected": -1.198950171470642, + "logps/chosen": -294.85626220703125, + "logps/rejected": -282.2437438964844, + "loss": 0.515, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5551101565361023, + "rewards/margins": 1.4315674304962158, + "rewards/rejected": -1.987248182296753, + "step": 2160 + }, + { + "epoch": 
0.5470645700059874, + "grad_norm": 83.23567199707031, + "learning_rate": 4.897375446203727e-07, + "logits/chosen": -1.156347632408142, + "logits/rejected": -1.172631859779358, + "logps/chosen": -282.48748779296875, + "logps/rejected": -289.40313720703125, + "loss": 0.5962, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.651623547077179, + "rewards/margins": 1.131292700767517, + "rewards/rejected": -1.781622290611267, + "step": 2170 + }, + { + "epoch": 0.5495856048908077, + "grad_norm": 62.51706314086914, + "learning_rate": 4.89528520071684e-07, + "logits/chosen": -1.22314453125, + "logits/rejected": -1.261450171470642, + "logps/chosen": -276.70623779296875, + "logps/rejected": -281.86248779296875, + "loss": 0.5031, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.3576812744140625, + "rewards/margins": 1.385986328125, + "rewards/rejected": -1.7438232898712158, + "step": 2180 + }, + { + "epoch": 0.5521066397756279, + "grad_norm": 70.63616180419922, + "learning_rate": 4.893174337543058e-07, + "logits/chosen": -1.2021973133087158, + "logits/rejected": -1.212744116783142, + "logps/chosen": -266.765625, + "logps/rejected": -272.07501220703125, + "loss": 0.5897, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.4884323179721832, + "rewards/margins": 1.024169921875, + "rewards/rejected": -1.5122253894805908, + "step": 2190 + }, + { + "epoch": 0.5546276746604482, + "grad_norm": 68.28787994384766, + "learning_rate": 4.891042874851873e-07, + "logits/chosen": -1.2497680187225342, + "logits/rejected": -1.2576904296875, + "logps/chosen": -297.79998779296875, + "logps/rejected": -272.76873779296875, + "loss": 0.5681, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.14769592881202698, + "rewards/margins": 1.087792992591858, + "rewards/rejected": -1.235931396484375, + "step": 2200 + }, + { + "epoch": 0.5571487095452683, + "grad_norm": 88.721435546875, + "learning_rate": 4.888890830990091e-07, + "logits/chosen": 
-1.171240210533142, + "logits/rejected": -1.20465087890625, + "logps/chosen": -295.2749938964844, + "logps/rejected": -279.8374938964844, + "loss": 0.5215, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.351248174905777, + "rewards/margins": 1.323974609375, + "rewards/rejected": -1.676110863685608, + "step": 2210 + }, + { + "epoch": 0.5596697444300885, + "grad_norm": 78.94097137451172, + "learning_rate": 4.88671822448167e-07, + "logits/chosen": -1.20654296875, + "logits/rejected": -1.1768066883087158, + "logps/chosen": -306.9375, + "logps/rejected": -268.90625, + "loss": 0.5687, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5443404912948608, + "rewards/margins": 1.320776343345642, + "rewards/rejected": -1.8653564453125, + "step": 2220 + }, + { + "epoch": 0.5621907793149088, + "grad_norm": 58.73942565917969, + "learning_rate": 4.884525074027566e-07, + "logits/chosen": -1.165771484375, + "logits/rejected": -1.180419921875, + "logps/chosen": -296.7749938964844, + "logps/rejected": -300.0062561035156, + "loss": 0.5771, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.616558849811554, + "rewards/margins": 1.1680572032928467, + "rewards/rejected": -1.7853882312774658, + "step": 2230 + }, + { + "epoch": 0.5647118141997289, + "grad_norm": 70.50154876708984, + "learning_rate": 4.882311398505568e-07, + "logits/chosen": -1.226660132408142, + "logits/rejected": -1.211279273033142, + "logps/chosen": -300.0687561035156, + "logps/rejected": -255.1875, + "loss": 0.5165, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -0.6135452389717102, + "rewards/margins": 1.326025366783142, + "rewards/rejected": -1.9389464855194092, + "step": 2240 + }, + { + "epoch": 0.5672328490845492, + "grad_norm": 79.97781372070312, + "learning_rate": 4.880077216970139e-07, + "logits/chosen": -1.245263695716858, + "logits/rejected": -1.2196533679962158, + "logps/chosen": -289.67498779296875, + "logps/rejected": -293.3125, + "loss": 0.5038, + 
"rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7392807006835938, + "rewards/margins": 1.505157470703125, + "rewards/rejected": -2.2438080310821533, + "step": 2250 + }, + { + "epoch": 0.5697538839693694, + "grad_norm": 70.58702850341797, + "learning_rate": 4.877822548652244e-07, + "logits/chosen": -1.199804663658142, + "logits/rejected": -1.2206542491912842, + "logps/chosen": -311.0687561035156, + "logps/rejected": -302.6187438964844, + "loss": 0.5111, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.6523681879043579, + "rewards/margins": 1.5244140625, + "rewards/rejected": -2.17681884765625, + "step": 2260 + }, + { + "epoch": 0.5722749188541897, + "grad_norm": 73.1260986328125, + "learning_rate": 4.875547412959198e-07, + "logits/chosen": -1.2357909679412842, + "logits/rejected": -1.248437523841858, + "logps/chosen": -304.5625, + "logps/rejected": -303.49688720703125, + "loss": 0.5175, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.703564465045929, + "rewards/margins": 1.4135863780975342, + "rewards/rejected": -2.1169190406799316, + "step": 2270 + }, + { + "epoch": 0.5747959537390098, + "grad_norm": 77.04090881347656, + "learning_rate": 4.873251829474485e-07, + "logits/chosen": -1.2665283679962158, + "logits/rejected": -1.200292944908142, + "logps/chosen": -313.34375, + "logps/rejected": -279.61248779296875, + "loss": 0.5583, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.1821548491716385, + "rewards/margins": 1.315148949623108, + "rewards/rejected": -1.4973633289337158, + "step": 2280 + }, + { + "epoch": 0.5773169886238301, + "grad_norm": 63.97391128540039, + "learning_rate": 4.870935817957599e-07, + "logits/chosen": -1.2111327648162842, + "logits/rejected": -1.1955077648162842, + "logps/chosen": -266.8500061035156, + "logps/rejected": -255.46249389648438, + "loss": 0.572, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24982604384422302, + "rewards/margins": 1.177056908607483, + 
"rewards/rejected": -1.4273192882537842, + "step": 2290 + }, + { + "epoch": 0.5798380235086503, + "grad_norm": 84.7044448852539, + "learning_rate": 4.868599398343871e-07, + "logits/chosen": -1.1641356945037842, + "logits/rejected": -1.198388695716858, + "logps/chosen": -268.82501220703125, + "logps/rejected": -250.08749389648438, + "loss": 0.5223, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.32326966524124146, + "rewards/margins": 1.1341736316680908, + "rewards/rejected": -1.4576812982559204, + "step": 2300 + }, + { + "epoch": 0.5823590583934706, + "grad_norm": 93.58554077148438, + "learning_rate": 4.866242590744294e-07, + "logits/chosen": -1.206445336341858, + "logits/rejected": NaN, + "logps/chosen": -300.45001220703125, + "logps/rejected": -315.16876220703125, + "loss": 0.5786, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.4164672791957855, + "rewards/margins": 1.204016089439392, + "rewards/rejected": -1.620294213294983, + "step": 2310 + }, + { + "epoch": 0.5848800932782907, + "grad_norm": 90.5208969116211, + "learning_rate": 4.863865415445356e-07, + "logits/chosen": -1.2510986328125, + "logits/rejected": -1.243798851966858, + "logps/chosen": -282.48748779296875, + "logps/rejected": -271.71875, + "loss": 0.507, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.37880247831344604, + "rewards/margins": 1.333251953125, + "rewards/rejected": -1.71160888671875, + "step": 2320 + }, + { + "epoch": 0.587401128163111, + "grad_norm": 57.87455749511719, + "learning_rate": 4.861467892908859e-07, + "logits/chosen": -1.249536156654358, + "logits/rejected": -1.220556616783142, + "logps/chosen": -293.70623779296875, + "logps/rejected": -272.26873779296875, + "loss": 0.5443, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7286742925643921, + "rewards/margins": 1.315006971359253, + "rewards/rejected": -2.044512987136841, + "step": 2330 + }, + { + "epoch": 0.5899221630479312, + "grad_norm": 56.66450500488281, + 
"learning_rate": 4.85905004377175e-07, + "logits/chosen": -1.192236304283142, + "logits/rejected": -1.2483398914337158, + "logps/chosen": -310.5375061035156, + "logps/rejected": -304.9375, + "loss": 0.5306, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9629257321357727, + "rewards/margins": 1.29730224609375, + "rewards/rejected": -2.260237216949463, + "step": 2340 + }, + { + "epoch": 0.5924431979327514, + "grad_norm": 99.49959564208984, + "learning_rate": 4.856611888845937e-07, + "logits/chosen": -1.2095215320587158, + "logits/rejected": -1.2102539539337158, + "logps/chosen": -307.4937438964844, + "logps/rejected": -315.46875, + "loss": 0.5489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8058074712753296, + "rewards/margins": 1.5135376453399658, + "rewards/rejected": -2.3196136951446533, + "step": 2350 + }, + { + "epoch": 0.5949642328175716, + "grad_norm": 68.80374145507812, + "learning_rate": 4.854153449118112e-07, + "logits/chosen": -1.1959960460662842, + "logits/rejected": -1.1903808116912842, + "logps/chosen": -289.48748779296875, + "logps/rejected": -258.50311279296875, + "loss": 0.579, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.03845367580652237, + "rewards/margins": 1.2675507068634033, + "rewards/rejected": -1.3045227527618408, + "step": 2360 + }, + { + "epoch": 0.5974852677023919, + "grad_norm": 95.79170227050781, + "learning_rate": 4.851674745749571e-07, + "logits/chosen": -1.15673828125, + "logits/rejected": -1.094934105873108, + "logps/chosen": -293.453125, + "logps/rejected": -257.1000061035156, + "loss": 0.561, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.86993408203125, + "rewards/margins": 1.3911011219024658, + "rewards/rejected": -2.261944532394409, + "step": 2370 + }, + { + "epoch": 0.6000063025872121, + "grad_norm": 75.12720489501953, + "learning_rate": 4.849175800076034e-07, + "logits/chosen": -1.131585717201233, + "logits/rejected": -1.112768530845642, + 
"logps/chosen": -320.23748779296875, + "logps/rejected": -314.5625, + "loss": 0.502, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -1.776953101158142, + "rewards/margins": 1.454748511314392, + "rewards/rejected": -3.2318358421325684, + "step": 2380 + }, + { + "epoch": 0.6025273374720322, + "grad_norm": 86.88487243652344, + "learning_rate": 4.846656633607458e-07, + "logits/chosen": -1.111425757408142, + "logits/rejected": -1.0926392078399658, + "logps/chosen": -314.4750061035156, + "logps/rejected": -323.64373779296875, + "loss": 0.6269, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -2.241943359375, + "rewards/margins": 0.9921509027481079, + "rewards/rejected": -3.2347655296325684, + "step": 2390 + }, + { + "epoch": 0.6050483723568525, + "grad_norm": 69.06053161621094, + "learning_rate": 4.844117268027848e-07, + "logits/chosen": -1.138671875, + "logits/rejected": -1.102941870689392, + "logps/chosen": -309.96875, + "logps/rejected": -274.91876220703125, + "loss": 0.5555, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.883856177330017, + "rewards/margins": 1.180932641029358, + "rewards/rejected": -3.063671827316284, + "step": 2400 + }, + { + "epoch": 0.6075694072416727, + "grad_norm": 69.51000213623047, + "learning_rate": 4.841557725195083e-07, + "logits/chosen": -1.155615210533142, + "logits/rejected": -1.1832275390625, + "logps/chosen": -296.39373779296875, + "logps/rejected": -289.2437438964844, + "loss": 0.6145, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.435693383216858, + "rewards/margins": 1.1839478015899658, + "rewards/rejected": -2.620678663253784, + "step": 2410 + }, + { + "epoch": 0.6100904421264929, + "grad_norm": 65.10906982421875, + "learning_rate": 4.838978027140713e-07, + "logits/chosen": -1.150396704673767, + "logits/rejected": -1.200341820716858, + "logps/chosen": -299.8343811035156, + "logps/rejected": -277.08123779296875, + "loss": 0.533, + "rewards/accuracies": 0.731249988079071, + 
"rewards/chosen": -1.0263763666152954, + "rewards/margins": 1.4067351818084717, + "rewards/rejected": -2.4328370094299316, + "step": 2420 + }, + { + "epoch": 0.6126114770113131, + "grad_norm": 84.71194458007812, + "learning_rate": 4.836378196069781e-07, + "logits/chosen": -1.1405670642852783, + "logits/rejected": -1.1593017578125, + "logps/chosen": -278.7749938964844, + "logps/rejected": -252.38125610351562, + "loss": 0.6179, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.8090149164199829, + "rewards/margins": 1.1510009765625, + "rewards/rejected": -1.9599120616912842, + "step": 2430 + }, + { + "epoch": 0.6151325118961334, + "grad_norm": 65.30789947509766, + "learning_rate": 4.833758254360625e-07, + "logits/chosen": -1.197998046875, + "logits/rejected": -1.17822265625, + "logps/chosen": -281.12188720703125, + "logps/rejected": -260.6937561035156, + "loss": 0.5126, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.529400646686554, + "rewards/margins": 1.24432373046875, + "rewards/rejected": -1.772973656654358, + "step": 2440 + }, + { + "epoch": 0.6176535467809536, + "grad_norm": 47.99665832519531, + "learning_rate": 4.831118224564688e-07, + "logits/chosen": -1.1317627429962158, + "logits/rejected": -1.1888916492462158, + "logps/chosen": -293.66876220703125, + "logps/rejected": -297.16876220703125, + "loss": 0.4998, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.425079345703125, + "rewards/margins": 1.356597900390625, + "rewards/rejected": -1.7805907726287842, + "step": 2450 + }, + { + "epoch": 0.6201745816657738, + "grad_norm": 61.13064956665039, + "learning_rate": 4.828458129406322e-07, + "logits/chosen": -1.1535766124725342, + "logits/rejected": -1.12652587890625, + "logps/chosen": -286.98126220703125, + "logps/rejected": -270.76251220703125, + "loss": 0.4807, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7186523675918579, + "rewards/margins": 1.408850073814392, + "rewards/rejected": -2.1275877952575684, + 
"step": 2460 + }, + { + "epoch": 0.622695616550594, + "grad_norm": 79.37671661376953, + "learning_rate": 4.825777991782599e-07, + "logits/chosen": -1.1739501953125, + "logits/rejected": -1.191259741783142, + "logps/chosen": -300.09375, + "logps/rejected": -290.0062561035156, + "loss": 0.5922, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8119446039199829, + "rewards/margins": 1.4115784168243408, + "rewards/rejected": -2.2237305641174316, + "step": 2470 + }, + { + "epoch": 0.6252166514354143, + "grad_norm": 78.25971221923828, + "learning_rate": 4.823077834763102e-07, + "logits/chosen": -1.094567894935608, + "logits/rejected": -1.180395483970642, + "logps/chosen": -297.1000061035156, + "logps/rejected": -280.2124938964844, + "loss": 0.5548, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -1.138482689857483, + "rewards/margins": 1.4807860851287842, + "rewards/rejected": -2.618457078933716, + "step": 2480 + }, + { + "epoch": 0.6277376863202344, + "grad_norm": 53.465755462646484, + "learning_rate": 4.820357681589738e-07, + "logits/chosen": -1.214599609375, + "logits/rejected": -1.178979516029358, + "logps/chosen": -294.3812561035156, + "logps/rejected": -275.8187561035156, + "loss": 0.5115, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -1.1684691905975342, + "rewards/margins": 1.486230492591858, + "rewards/rejected": -2.6543211936950684, + "step": 2490 + }, + { + "epoch": 0.6302587212050547, + "grad_norm": 85.9769515991211, + "learning_rate": 4.817617555676531e-07, + "logits/chosen": -1.1224853992462158, + "logits/rejected": -1.14208984375, + "logps/chosen": -289.125, + "logps/rejected": -272.9437561035156, + "loss": 0.5296, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.059788465499878, + "rewards/margins": 1.2673156261444092, + "rewards/rejected": -2.327471971511841, + "step": 2500 + }, + { + "epoch": 0.6327797560898749, + "grad_norm": 65.66055297851562, + "learning_rate": 4.814857480609423e-07, + 
"logits/chosen": -1.2054870128631592, + "logits/rejected": -1.235107421875, + "logps/chosen": -293.28436279296875, + "logps/rejected": -288.66876220703125, + "loss": 0.5634, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9910598993301392, + "rewards/margins": 1.3326294422149658, + "rewards/rejected": -2.324328660964966, + "step": 2510 + }, + { + "epoch": 0.6353007909746952, + "grad_norm": 83.93264770507812, + "learning_rate": 4.812077480146071e-07, + "logits/chosen": -1.1200683116912842, + "logits/rejected": -1.1492187976837158, + "logps/chosen": -292.90625, + "logps/rejected": -282.76251220703125, + "loss": 0.5442, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0575134754180908, + "rewards/margins": 1.38275146484375, + "rewards/rejected": -2.4418272972106934, + "step": 2520 + }, + { + "epoch": 0.6378218258595153, + "grad_norm": 100.24918365478516, + "learning_rate": 4.809277578215642e-07, + "logits/chosen": -1.1622314453125, + "logits/rejected": -1.1765868663787842, + "logps/chosen": -282.8062438964844, + "logps/rejected": -269.70001220703125, + "loss": 0.5422, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.6311782598495483, + "rewards/margins": 1.385986328125, + "rewards/rejected": -2.017224073410034, + "step": 2530 + }, + { + "epoch": 0.6403428607443356, + "grad_norm": 67.45328521728516, + "learning_rate": 4.806457798918605e-07, + "logits/chosen": -1.241113305091858, + "logits/rejected": -1.204003930091858, + "logps/chosen": -312.625, + "logps/rejected": -297.92498779296875, + "loss": 0.5716, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.5490020513534546, + "rewards/margins": 1.1793212890625, + "rewards/rejected": -1.7283859252929688, + "step": 2540 + }, + { + "epoch": 0.6428638956291558, + "grad_norm": 85.26969146728516, + "learning_rate": 4.80361816652653e-07, + "logits/chosen": -1.196069359779358, + "logits/rejected": -1.23370361328125, + "logps/chosen": -307.26251220703125, + 
"logps/rejected": -271.75, + "loss": 0.5261, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.793475329875946, + "rewards/margins": 1.27081298828125, + "rewards/rejected": -2.0640501976013184, + "step": 2550 + }, + { + "epoch": 0.6453849305139759, + "grad_norm": 61.15713882446289, + "learning_rate": 4.800758705481872e-07, + "logits/chosen": -1.1762206554412842, + "logits/rejected": -1.200537085533142, + "logps/chosen": -298.8374938964844, + "logps/rejected": -280.8125, + "loss": 0.4726, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.532562255859375, + "rewards/margins": 1.32684326171875, + "rewards/rejected": -1.859521508216858, + "step": 2560 + }, + { + "epoch": 0.6479059653987962, + "grad_norm": 71.99295806884766, + "learning_rate": 4.797879440397764e-07, + "logits/chosen": -1.144287109375, + "logits/rejected": -1.174462914466858, + "logps/chosen": -260.73748779296875, + "logps/rejected": -258.9437561035156, + "loss": 0.603, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8031395077705383, + "rewards/margins": 1.1868774890899658, + "rewards/rejected": -1.9891357421875, + "step": 2570 + }, + { + "epoch": 0.6504270002836164, + "grad_norm": 72.66377258300781, + "learning_rate": 4.794980396057802e-07, + "logits/chosen": -1.137353539466858, + "logits/rejected": -1.152807593345642, + "logps/chosen": -289.0062561035156, + "logps/rejected": -292.0375061035156, + "loss": 0.5609, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7954727411270142, + "rewards/margins": 1.278875708580017, + "rewards/rejected": -2.07403564453125, + "step": 2580 + }, + { + "epoch": 0.6529480351684367, + "grad_norm": 69.01895141601562, + "learning_rate": 4.792061597415838e-07, + "logits/chosen": -1.2290527820587158, + "logits/rejected": -1.218847632408142, + "logps/chosen": -284.45623779296875, + "logps/rejected": -278.07501220703125, + "loss": 0.5333, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.824688732624054, 
+ "rewards/margins": 1.3822143077850342, + "rewards/rejected": -2.2069945335388184, + "step": 2590 + }, + { + "epoch": 0.6554690700532568, + "grad_norm": 68.94561004638672, + "learning_rate": 4.78912306959576e-07, + "logits/chosen": -1.154443383216858, + "logits/rejected": -1.1708190441131592, + "logps/chosen": -316.5687561035156, + "logps/rejected": -292.3062438964844, + "loss": 0.4711, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5568755865097046, + "rewards/margins": 1.523004174232483, + "rewards/rejected": -2.0794615745544434, + "step": 2600 + }, + { + "epoch": 0.6579901049380771, + "grad_norm": 50.01373291015625, + "learning_rate": 4.786164837891277e-07, + "logits/chosen": -1.108984351158142, + "logits/rejected": -1.1860840320587158, + "logps/chosen": -277.65625, + "logps/rejected": -298.6937561035156, + "loss": 0.5241, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.6713012456893921, + "rewards/margins": 1.461004614830017, + "rewards/rejected": -2.133190870285034, + "step": 2610 + }, + { + "epoch": 0.6605111398228973, + "grad_norm": 76.2265396118164, + "learning_rate": 4.7831869277657e-07, + "logits/chosen": -1.1467773914337158, + "logits/rejected": -1.172705054283142, + "logps/chosen": -280.00311279296875, + "logps/rejected": -277.2437438964844, + "loss": 0.5286, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8196624517440796, + "rewards/margins": 1.400231957435608, + "rewards/rejected": -2.2198243141174316, + "step": 2620 + }, + { + "epoch": 0.6630321747077175, + "grad_norm": 90.98784637451172, + "learning_rate": 4.780189364851726e-07, + "logits/chosen": -1.1557128429412842, + "logits/rejected": -1.1880614757537842, + "logps/chosen": -281.21875, + "logps/rejected": -282.53125, + "loss": 0.5537, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.435556024312973, + "rewards/margins": 1.472467064857483, + "rewards/rejected": -1.908056616783142, + "step": 2630 + }, + { + "epoch": 0.6655532095925377, + 
"grad_norm": 110.27445983886719, + "learning_rate": 4.777172174951216e-07, + "logits/chosen": -1.1761963367462158, + "logits/rejected": -1.188818335533142, + "logps/chosen": -312.390625, + "logps/rejected": -299.3374938964844, + "loss": 0.5696, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.5568572878837585, + "rewards/margins": 1.3341461420059204, + "rewards/rejected": -1.8907806873321533, + "step": 2640 + }, + { + "epoch": 0.668074244477358, + "grad_norm": 68.85711669921875, + "learning_rate": 4.77413538403497e-07, + "logits/chosen": -1.197778344154358, + "logits/rejected": -1.183837890625, + "logps/chosen": -303.17498779296875, + "logps/rejected": -293.57501220703125, + "loss": 0.4299, + "rewards/accuracies": 0.7906249761581421, + "rewards/chosen": -0.8754485845565796, + "rewards/margins": 1.8027832508087158, + "rewards/rejected": -2.6781952381134033, + "step": 2650 + }, + { + "epoch": 0.6705952793621782, + "grad_norm": 90.8910903930664, + "learning_rate": 4.771079018242509e-07, + "logits/chosen": NaN, + "logits/rejected": -1.073815941810608, + "logps/chosen": -308.98748779296875, + "logps/rejected": -316.8687438964844, + "loss": 0.6116, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -2.0481934547424316, + "rewards/margins": 1.4583740234375, + "rewards/rejected": -3.5071778297424316, + "step": 2660 + }, + { + "epoch": 0.6731163142469984, + "grad_norm": 88.4048843383789, + "learning_rate": 4.7680031038818445e-07, + "logits/chosen": -1.0806763172149658, + "logits/rejected": -1.0692627429962158, + "logps/chosen": -308.1625061035156, + "logps/rejected": -303.8062438964844, + "loss": 0.5529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5895018577575684, + "rewards/margins": 1.290319800376892, + "rewards/rejected": -3.880664110183716, + "step": 2670 + }, + { + "epoch": 0.6756373491318186, + "grad_norm": 60.40487289428711, + "learning_rate": 4.7649076674292564e-07, + "logits/chosen": -1.06890869140625, + 
"logits/rejected": -1.095422387123108, + "logps/chosen": -294.2437438964844, + "logps/rejected": -289.6187438964844, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.401989698410034, + "rewards/margins": 1.3101806640625, + "rewards/rejected": -3.7132811546325684, + "step": 2680 + }, + { + "epoch": 0.6781583840166389, + "grad_norm": 102.43937683105469, + "learning_rate": 4.761792735529061e-07, + "logits/chosen": -1.0726318359375, + "logits/rejected": -1.087988257408142, + "logps/chosen": -291.21563720703125, + "logps/rejected": -302.2875061035156, + "loss": 0.5948, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4117188453674316, + "rewards/margins": 1.0986328125, + "rewards/rejected": -3.509765625, + "step": 2690 + }, + { + "epoch": 0.680679418901459, + "grad_norm": 190.5872802734375, + "learning_rate": 4.7586583349933864e-07, + "logits/chosen": -1.0610473155975342, + "logits/rejected": -1.0805175304412842, + "logps/chosen": -322.51251220703125, + "logps/rejected": -307.36248779296875, + "loss": 0.617, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -2.056689500808716, + "rewards/margins": 1.2536132335662842, + "rewards/rejected": -3.310351610183716, + "step": 2700 + }, + { + "epoch": 0.6832004537862792, + "grad_norm": 94.1674575805664, + "learning_rate": 4.755504492801937e-07, + "logits/chosen": -1.0991699695587158, + "logits/rejected": -1.0919067859649658, + "logps/chosen": -307.98748779296875, + "logps/rejected": -278.57501220703125, + "loss": 0.5943, + "rewards/accuracies": 0.703125, + "rewards/chosen": -2.4027342796325684, + "rewards/margins": 1.0565063953399658, + "rewards/rejected": -3.4593749046325684, + "step": 2710 + }, + { + "epoch": 0.6857214886710995, + "grad_norm": 69.85820770263672, + "learning_rate": 4.7523312361017654e-07, + "logits/chosen": -1.136254906654358, + "logits/rejected": -1.1059448719024658, + "logps/chosen": -298.8687438964844, + "logps/rejected": -284.8500061035156, + "loss": 
0.4529, + "rewards/accuracies": 0.765625, + "rewards/chosen": -2.005786180496216, + "rewards/margins": 1.614477515220642, + "rewards/rejected": -3.6192383766174316, + "step": 2720 + }, + { + "epoch": 0.6882425235559197, + "grad_norm": 82.56523132324219, + "learning_rate": 4.7491385922070347e-07, + "logits/chosen": NaN, + "logits/rejected": -1.112707495689392, + "logps/chosen": -306.09375, + "logps/rejected": -314.23126220703125, + "loss": 0.5461, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7273375988006592, + "rewards/margins": 1.6251709461212158, + "rewards/rejected": -3.3524413108825684, + "step": 2730 + }, + { + "epoch": 0.6907635584407399, + "grad_norm": 70.45012664794922, + "learning_rate": 4.7459265885987865e-07, + "logits/chosen": -1.096398949623108, + "logits/rejected": -1.0847504138946533, + "logps/chosen": -303.64373779296875, + "logps/rejected": -281.29376220703125, + "loss": 0.5543, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6130187511444092, + "rewards/margins": 1.3455810546875, + "rewards/rejected": -2.958935499191284, + "step": 2740 + }, + { + "epoch": 0.6932845933255601, + "grad_norm": 63.99974060058594, + "learning_rate": 4.7426952529247047e-07, + "logits/chosen": -1.188989281654358, + "logits/rejected": -1.1235840320587158, + "logps/chosen": -311.48748779296875, + "logps/rejected": -293.61248779296875, + "loss": 0.6665, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.8017578125, + "rewards/margins": 1.1265137195587158, + "rewards/rejected": -2.9278321266174316, + "step": 2750 + }, + { + "epoch": 0.6958056282103804, + "grad_norm": 61.86286163330078, + "learning_rate": 4.739444612998872e-07, + "logits/chosen": -1.136474609375, + "logits/rejected": -1.1675536632537842, + "logps/chosen": -294.0375061035156, + "logps/rejected": -263.76873779296875, + "loss": 0.4585, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -1.348413109779358, + "rewards/margins": 1.5325438976287842, 
+ "rewards/rejected": -2.8815369606018066, + "step": 2760 + }, + { + "epoch": 0.6983266630952005, + "grad_norm": 53.799129486083984, + "learning_rate": 4.7361746968015396e-07, + "logits/chosen": -1.1840667724609375, + "logits/rejected": -1.2018311023712158, + "logps/chosen": -322.61248779296875, + "logps/rejected": -308.8187561035156, + "loss": 0.5971, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -1.2908141613006592, + "rewards/margins": 1.435052514076233, + "rewards/rejected": -2.7259521484375, + "step": 2770 + }, + { + "epoch": 0.7008476979800208, + "grad_norm": 65.56761169433594, + "learning_rate": 4.732885532478879e-07, + "logits/chosen": -1.1656982898712158, + "logits/rejected": -1.1639893054962158, + "logps/chosen": -291.3968811035156, + "logps/rejected": -264.57501220703125, + "loss": 0.5467, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7727325558662415, + "rewards/margins": 1.4450256824493408, + "rewards/rejected": -2.216906785964966, + "step": 2780 + }, + { + "epoch": 0.703368732864841, + "grad_norm": 85.63340759277344, + "learning_rate": 4.729577148342742e-07, + "logits/chosen": -1.209924340248108, + "logits/rejected": -1.228369116783142, + "logps/chosen": -303.625, + "logps/rejected": -281.60626220703125, + "loss": 0.5424, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.653521716594696, + "rewards/margins": 1.5137939453125, + "rewards/rejected": -2.167309522628784, + "step": 2790 + }, + { + "epoch": 0.7058897677496613, + "grad_norm": 54.452789306640625, + "learning_rate": 4.7262495728704156e-07, + "logits/chosen": -1.145410180091858, + "logits/rejected": -1.1492431163787842, + "logps/chosen": -307.11248779296875, + "logps/rejected": -294.8500061035156, + "loss": 0.5321, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.905059814453125, + "rewards/margins": 1.4968140125274658, + "rewards/rejected": -2.402099609375, + "step": 2800 + }, + { + "epoch": 0.7084108026344814, + "grad_norm": 61.9515266418457, 
+ "learning_rate": 4.7229028347043826e-07, + "logits/chosen": -1.1328125, + "logits/rejected": -1.1371581554412842, + "logps/chosen": -280.28436279296875, + "logps/rejected": -276.81561279296875, + "loss": 0.554, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7785491943359375, + "rewards/margins": 1.3621094226837158, + "rewards/rejected": -2.1416258811950684, + "step": 2810 + }, + { + "epoch": 0.7109318375193017, + "grad_norm": 59.2089729309082, + "learning_rate": 4.719536962652067e-07, + "logits/chosen": -1.1936523914337158, + "logits/rejected": -1.1558837890625, + "logps/chosen": -294.48126220703125, + "logps/rejected": -291.6156311035156, + "loss": 0.5687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8442443609237671, + "rewards/margins": 1.181768774986267, + "rewards/rejected": -2.026538133621216, + "step": 2820 + }, + { + "epoch": 0.7134528724041219, + "grad_norm": 72.19743347167969, + "learning_rate": 4.7161519856855915e-07, + "logits/chosen": -1.1260986328125, + "logits/rejected": -1.1375243663787842, + "logps/chosen": -291.0874938964844, + "logps/rejected": -273.3187561035156, + "loss": 0.5357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.684588611125946, + "rewards/margins": 1.328607201576233, + "rewards/rejected": -2.0126099586486816, + "step": 2830 + }, + { + "epoch": 0.7159739072889421, + "grad_norm": 58.94084548950195, + "learning_rate": 4.7127479329415266e-07, + "logits/chosen": -1.1718933582305908, + "logits/rejected": -1.22802734375, + "logps/chosen": -294.58123779296875, + "logps/rejected": -277.1812438964844, + "loss": 0.5834, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6978393793106079, + "rewards/margins": 1.2098877429962158, + "rewards/rejected": -1.9071533679962158, + "step": 2840 + }, + { + "epoch": 0.7184949421737623, + "grad_norm": 64.77632904052734, + "learning_rate": 4.709324833720639e-07, + "logits/chosen": -1.173608422279358, + "logits/rejected": -1.18798828125, + 
"logps/chosen": -305.28125, + "logps/rejected": -273.5, + "loss": 0.5544, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7196075320243835, + "rewards/margins": 1.408563256263733, + "rewards/rejected": -2.128100633621216, + "step": 2850 + }, + { + "epoch": 0.7210159770585826, + "grad_norm": 81.05403900146484, + "learning_rate": 4.7058827174876406e-07, + "logits/chosen": -1.155676245689392, + "logits/rejected": -1.110107421875, + "logps/chosen": -269.25311279296875, + "logps/rejected": -267.3187561035156, + "loss": 0.5214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6199615597724915, + "rewards/margins": 1.470703125, + "rewards/rejected": -2.0902466773986816, + "step": 2860 + }, + { + "epoch": 0.7235370119434028, + "grad_norm": 108.76309204101562, + "learning_rate": 4.7024216138709333e-07, + "logits/chosen": -1.146337866783142, + "logits/rejected": -1.080541968345642, + "logps/chosen": -286.04998779296875, + "logps/rejected": -279.23748779296875, + "loss": 0.5623, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.6038573980331421, + "rewards/margins": 1.650976538658142, + "rewards/rejected": -2.253796339035034, + "step": 2870 + }, + { + "epoch": 0.726058046828223, + "grad_norm": 73.72408294677734, + "learning_rate": 4.6989415526623566e-07, + "logits/chosen": NaN, + "logits/rejected": -1.15380859375, + "logps/chosen": -270.7593688964844, + "logps/rejected": -282.75, + "loss": 0.6657, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.44719237089157104, + "rewards/margins": 1.3208129405975342, + "rewards/rejected": -1.767968773841858, + "step": 2880 + }, + { + "epoch": 0.7285790817130432, + "grad_norm": 66.9427719116211, + "learning_rate": 4.69544256381693e-07, + "logits/chosen": NaN, + "logits/rejected": -1.100317358970642, + "logps/chosen": -277.33123779296875, + "logps/rejected": -260.64373779296875, + "loss": 0.5389, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.080474853515625, + 
"rewards/margins": 1.3751342296600342, + "rewards/rejected": -1.4554367065429688, + "step": 2890 + }, + { + "epoch": 0.7311001165978634, + "grad_norm": 85.16573333740234, + "learning_rate": 4.691924677452592e-07, + "logits/chosen": -1.1538574695587158, + "logits/rejected": -1.186254858970642, + "logps/chosen": -316.6812438964844, + "logps/rejected": -291.23126220703125, + "loss": 0.5846, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5623626708984375, + "rewards/margins": 1.3126099109649658, + "rewards/rejected": -1.874786376953125, + "step": 2900 + }, + { + "epoch": 0.7336211514826836, + "grad_norm": 80.74299621582031, + "learning_rate": 4.688387923849947e-07, + "logits/chosen": -1.1162598133087158, + "logits/rejected": -1.0867096185684204, + "logps/chosen": -331.0843811035156, + "logps/rejected": -290.32501220703125, + "loss": 0.6125, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -1.106848120689392, + "rewards/margins": 1.276983618736267, + "rewards/rejected": -2.3839659690856934, + "step": 2910 + }, + { + "epoch": 0.7361421863675038, + "grad_norm": 72.91273498535156, + "learning_rate": 4.684832333451998e-07, + "logits/chosen": -1.050482153892517, + "logits/rejected": -1.0858886241912842, + "logps/chosen": -295.3062438964844, + "logps/rejected": -309.61248779296875, + "loss": 0.5397, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.507714867591858, + "rewards/margins": 1.477258324623108, + "rewards/rejected": -2.985400438308716, + "step": 2920 + }, + { + "epoch": 0.7386632212523241, + "grad_norm": 65.06329345703125, + "learning_rate": 4.68125793686389e-07, + "logits/chosen": -1.129608154296875, + "logits/rejected": -1.1381103992462158, + "logps/chosen": -307.91876220703125, + "logps/rejected": -297.45001220703125, + "loss": 0.5723, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.5120971202850342, + "rewards/margins": 1.3308227062225342, + "rewards/rejected": -2.8423829078674316, + "step": 2930 + }, + 
{ + "epoch": 0.7411842561371443, + "grad_norm": 99.28950500488281, + "learning_rate": 4.677664764852644e-07, + "logits/chosen": -1.070556640625, + "logits/rejected": -1.10809326171875, + "logps/chosen": -283.8374938964844, + "logps/rejected": -283.53125, + "loss": 0.5436, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.4549328088760376, + "rewards/margins": 1.3447662591934204, + "rewards/rejected": -2.7989745140075684, + "step": 2940 + }, + { + "epoch": 0.7437052910219645, + "grad_norm": 64.50562286376953, + "learning_rate": 4.6740528483468926e-07, + "logits/chosen": -1.1115601062774658, + "logits/rejected": -1.1370360851287842, + "logps/chosen": -287.9468688964844, + "logps/rejected": -279.6625061035156, + "loss": 0.5576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3379623889923096, + "rewards/margins": 1.367529273033142, + "rewards/rejected": -2.7056641578674316, + "step": 2950 + }, + { + "epoch": 0.7462263259067847, + "grad_norm": 50.57285690307617, + "learning_rate": 4.670422218436613e-07, + "logits/chosen": -1.146215796470642, + "logits/rejected": -1.088281273841858, + "logps/chosen": -295.2718811035156, + "logps/rejected": -280.7250061035156, + "loss": 0.5895, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.332891821861267, + "rewards/margins": 1.275415062904358, + "rewards/rejected": -2.609619140625, + "step": 2960 + }, + { + "epoch": 0.748747360791605, + "grad_norm": 93.81817626953125, + "learning_rate": 4.6667729063728616e-07, + "logits/chosen": -1.117163062095642, + "logits/rejected": -1.115881323814392, + "logps/chosen": -295.3687438964844, + "logps/rejected": -283.03125, + "loss": 0.517, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -1.1867492198944092, + "rewards/margins": 1.4429442882537842, + "rewards/rejected": -2.629931688308716, + "step": 2970 + }, + { + "epoch": 0.7512683956764252, + "grad_norm": 74.55547332763672, + "learning_rate": 4.663104943567502e-07, + "logits/chosen": 
-1.1306641101837158, + "logits/rejected": -1.1548340320587158, + "logps/chosen": -279.8999938964844, + "logps/rejected": -277.0562438964844, + "loss": 0.595, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0513458251953125, + "rewards/margins": 1.223480224609375, + "rewards/rejected": -2.2755126953125, + "step": 2980 + }, + { + "epoch": 0.7537894305612454, + "grad_norm": 66.2352294921875, + "learning_rate": 4.659418361592936e-07, + "logits/chosen": -1.07928466796875, + "logits/rejected": NaN, + "logps/chosen": -285.15625, + "logps/rejected": -283.14373779296875, + "loss": 0.593, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7939056158065796, + "rewards/margins": 1.283117651939392, + "rewards/rejected": -2.076037645339966, + "step": 2990 + }, + { + "epoch": 0.7563104654460656, + "grad_norm": 100.5522689819336, + "learning_rate": 4.655713192181835e-07, + "logits/chosen": -1.186669945716858, + "logits/rejected": -1.159265160560608, + "logps/chosen": -327.4125061035156, + "logps/rejected": -274.95001220703125, + "loss": 0.5666, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.5676056146621704, + "rewards/margins": 1.240380883216858, + "rewards/rejected": -1.8081023693084717, + "step": 3000 + }, + { + "epoch": 0.7588315003308859, + "grad_norm": 51.96790313720703, + "learning_rate": 4.651989467226859e-07, + "logits/chosen": -1.166284203529358, + "logits/rejected": -1.1627686023712158, + "logps/chosen": -301.23748779296875, + "logps/rejected": -268.55938720703125, + "loss": 0.5571, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.42068177461624146, + "rewards/margins": 1.1670043468475342, + "rewards/rejected": -1.5872313976287842, + "step": 3010 + }, + { + "epoch": 0.761352535215706, + "grad_norm": 78.86668395996094, + "learning_rate": 4.648247218780391e-07, + "logits/chosen": -1.1724426746368408, + "logits/rejected": -1.1681029796600342, + "logps/chosen": -279.98126220703125, + "logps/rejected": 
-264.46875, + "loss": 0.6394, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.5438163876533508, + "rewards/margins": 1.0772705078125, + "rewards/rejected": -1.621557593345642, + "step": 3020 + }, + { + "epoch": 0.7638735701005263, + "grad_norm": 48.36394500732422, + "learning_rate": 4.644486479054256e-07, + "logits/chosen": -1.1013062000274658, + "logits/rejected": -1.158178687095642, + "logps/chosen": -308.86248779296875, + "logps/rejected": -294.0562438964844, + "loss": 0.5717, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.8183044195175171, + "rewards/margins": 1.3197815418243408, + "rewards/rejected": -2.139080762863159, + "step": 3030 + }, + { + "epoch": 0.7663946049853465, + "grad_norm": 57.000423431396484, + "learning_rate": 4.640707280419444e-07, + "logits/chosen": -1.135351538658142, + "logits/rejected": -1.151525855064392, + "logps/chosen": -291.95001220703125, + "logps/rejected": -270.45623779296875, + "loss": 0.5595, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.9432922601699829, + "rewards/margins": 1.415521264076233, + "rewards/rejected": -2.3579344749450684, + "step": 3040 + }, + { + "epoch": 0.7689156398701668, + "grad_norm": 50.99508285522461, + "learning_rate": 4.636909655405832e-07, + "logits/chosen": -1.132177710533142, + "logits/rejected": -1.151269555091858, + "logps/chosen": -300.16876220703125, + "logps/rejected": -289.1937561035156, + "loss": 0.5131, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.838409423828125, + "rewards/margins": 1.471771240234375, + "rewards/rejected": -2.3096680641174316, + "step": 3050 + }, + { + "epoch": 0.7714366747549869, + "grad_norm": 125.89732360839844, + "learning_rate": 4.633093636701904e-07, + "logits/chosen": -1.0341064929962158, + "logits/rejected": -1.1121826171875, + "logps/chosen": -272.1875, + "logps/rejected": -264.9750061035156, + "loss": 0.5571, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": 
-0.6455932855606079, + "rewards/margins": 1.581274390220642, + "rewards/rejected": -2.2275543212890625, + "step": 3060 + }, + { + "epoch": 0.7739577096398071, + "grad_norm": 59.311336517333984, + "learning_rate": 4.629259257154472e-07, + "logits/chosen": -1.117163062095642, + "logits/rejected": -1.1147582530975342, + "logps/chosen": -267.73748779296875, + "logps/rejected": -266.01873779296875, + "loss": 0.5544, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.3879333436489105, + "rewards/margins": 1.393408179283142, + "rewards/rejected": -1.7818222045898438, + "step": 3070 + }, + { + "epoch": 0.7764787445246274, + "grad_norm": 75.37936401367188, + "learning_rate": 4.625406549768389e-07, + "logits/chosen": -1.1622803211212158, + "logits/rejected": -1.1376464366912842, + "logps/chosen": -282.03125, + "logps/rejected": -297.0375061035156, + "loss": 0.6313, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.18592986464500427, + "rewards/margins": 1.1645171642303467, + "rewards/rejected": -1.3501007556915283, + "step": 3080 + }, + { + "epoch": 0.7789997794094475, + "grad_norm": 45.26335906982422, + "learning_rate": 4.621535547706267e-07, + "logits/chosen": -1.115576148033142, + "logits/rejected": -1.152734398841858, + "logps/chosen": -297.2875061035156, + "logps/rejected": -256.875, + "loss": 0.6164, + "rewards/accuracies": 0.684374988079071, + "rewards/chosen": -0.223388671875, + "rewards/margins": 1.0387756824493408, + "rewards/rejected": -1.2618286609649658, + "step": 3090 + }, + { + "epoch": 0.7815208142942678, + "grad_norm": 62.349674224853516, + "learning_rate": 4.6176462842881914e-07, + "logits/chosen": -1.164794921875, + "logits/rejected": -1.13763427734375, + "logps/chosen": -290.48748779296875, + "logps/rejected": -307.4375, + "loss": 0.5724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.137237548828125, + "rewards/margins": 1.046820044517517, + "rewards/rejected": -1.1840629577636719, + "step": 3100 + }, + { + "epoch": 
0.784041849179088, + "grad_norm": 53.608009338378906, + "learning_rate": 4.6137387929914355e-07, + "logits/chosen": -1.122167944908142, + "logits/rejected": -1.119543433189392, + "logps/chosen": -302.51873779296875, + "logps/rejected": -283.5062561035156, + "loss": 0.4835, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.22745056450366974, + "rewards/margins": 1.39703369140625, + "rewards/rejected": -1.6243774890899658, + "step": 3110 + }, + { + "epoch": 0.7865628840639083, + "grad_norm": 55.97255325317383, + "learning_rate": 4.60981310745017e-07, + "logits/chosen": -1.1291015148162842, + "logits/rejected": -1.1198852062225342, + "logps/chosen": -301.4312438964844, + "logps/rejected": -289.1625061035156, + "loss": 0.5309, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.9412506222724915, + "rewards/margins": 1.3587372303009033, + "rewards/rejected": -2.30029296875, + "step": 3120 + }, + { + "epoch": 0.7890839189487284, + "grad_norm": 47.36808776855469, + "learning_rate": 4.6058692614551755e-07, + "logits/chosen": -1.126550316810608, + "logits/rejected": -1.1423828601837158, + "logps/chosen": -311.375, + "logps/rejected": -301.09375, + "loss": 0.5324, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.9432128667831421, + "rewards/margins": 1.5833008289337158, + "rewards/rejected": -2.525921583175659, + "step": 3130 + }, + { + "epoch": 0.7916049538335487, + "grad_norm": 46.63396072387695, + "learning_rate": 4.6019072889535495e-07, + "logits/chosen": -1.0670897960662842, + "logits/rejected": -1.073114037513733, + "logps/chosen": -321.45001220703125, + "logps/rejected": -313.63751220703125, + "loss": 0.5413, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.2390563488006592, + "rewards/margins": 1.4928436279296875, + "rewards/rejected": -2.731982469558716, + "step": 3140 + }, + { + "epoch": 0.7941259887183689, + "grad_norm": 67.96356964111328, + "learning_rate": 4.5979272240484156e-07, + "logits/chosen": 
-1.0963256359100342, + "logits/rejected": -1.0971252918243408, + "logps/chosen": -311.7437438964844, + "logps/rejected": -280.92498779296875, + "loss": 0.5821, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9785400629043579, + "rewards/margins": 1.6365845203399658, + "rewards/rejected": -2.6151123046875, + "step": 3150 + }, + { + "epoch": 0.7966470236031891, + "grad_norm": 69.48004150390625, + "learning_rate": 4.593929100998632e-07, + "logits/chosen": -1.096643090248108, + "logits/rejected": -1.138452172279358, + "logps/chosen": -263.83123779296875, + "logps/rejected": -282.66876220703125, + "loss": 0.5359, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5240997076034546, + "rewards/margins": 1.551782250404358, + "rewards/rejected": -2.075640916824341, + "step": 3160 + }, + { + "epoch": 0.7991680584880093, + "grad_norm": 149.04946899414062, + "learning_rate": 4.5899129542184914e-07, + "logits/chosen": -1.0813720226287842, + "logits/rejected": -1.1157958507537842, + "logps/chosen": -286.39373779296875, + "logps/rejected": -295.98748779296875, + "loss": 0.5777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3289123475551605, + "rewards/margins": 1.510929822921753, + "rewards/rejected": -1.838830590248108, + "step": 3170 + }, + { + "epoch": 0.8016890933728296, + "grad_norm": 60.59016799926758, + "learning_rate": 4.5858788182774296e-07, + "logits/chosen": -1.1536865234375, + "logits/rejected": -1.177758812904358, + "logps/chosen": -285.71875, + "logps/rejected": -271.8062438964844, + "loss": 0.5454, + "rewards/accuracies": 0.753125011920929, + "rewards/chosen": -0.2949768006801605, + "rewards/margins": 1.6162230968475342, + "rewards/rejected": -1.911962866783142, + "step": 3180 + }, + { + "epoch": 0.8042101282576498, + "grad_norm": 81.2911376953125, + "learning_rate": 4.581826727899725e-07, + "logits/chosen": -1.142236351966858, + "logits/rejected": -1.137426733970642, + "logps/chosen": -293.79376220703125, + "logps/rejected": 
-280.20623779296875, + "loss": 0.5982, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3628478944301605, + "rewards/margins": 1.4190642833709717, + "rewards/rejected": -1.782073974609375, + "step": 3190 + }, + { + "epoch": 0.80673116314247, + "grad_norm": 71.78868103027344, + "learning_rate": 4.577756717964203e-07, + "logits/chosen": -1.063012719154358, + "logits/rejected": -1.068792700767517, + "logps/chosen": -312.0874938964844, + "logps/rejected": -301.5562438964844, + "loss": 0.5286, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -0.6008270382881165, + "rewards/margins": 1.579858422279358, + "rewards/rejected": -2.180126905441284, + "step": 3200 + }, + { + "epoch": 0.8092521980272902, + "grad_norm": 80.68486022949219, + "learning_rate": 4.57366882350393e-07, + "logits/chosen": -1.046118140220642, + "logits/rejected": -1.0717284679412842, + "logps/chosen": -281.98748779296875, + "logps/rejected": -282.8187561035156, + "loss": 0.5391, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.924053966999054, + "rewards/margins": 1.3265869617462158, + "rewards/rejected": -2.2493653297424316, + "step": 3210 + }, + { + "epoch": 0.8117732329121105, + "grad_norm": 61.843082427978516, + "learning_rate": 4.569563079705919e-07, + "logits/chosen": -1.1148681640625, + "logits/rejected": -1.072900414466858, + "logps/chosen": -301.4937438964844, + "logps/rejected": -293.8999938964844, + "loss": 0.5562, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7594634890556335, + "rewards/margins": 1.3990600109100342, + "rewards/rejected": -2.1579222679138184, + "step": 3220 + }, + { + "epoch": 0.8142942677969306, + "grad_norm": 62.5531005859375, + "learning_rate": 4.5654395219108224e-07, + "logits/chosen": -1.13018798828125, + "logits/rejected": -1.1165587902069092, + "logps/chosen": -317.23126220703125, + "logps/rejected": -289.3999938964844, + "loss": 0.6072, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": 
-0.8774398565292358, + "rewards/margins": 1.1413085460662842, + "rewards/rejected": -2.0191009044647217, + "step": 3230 + }, + { + "epoch": 0.8168153026817508, + "grad_norm": 83.58039093017578, + "learning_rate": 4.5612981856126264e-07, + "logits/chosen": -1.123754858970642, + "logits/rejected": -1.109716773033142, + "logps/chosen": -283.6937561035156, + "logps/rejected": -295.3062438964844, + "loss": 0.5905, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.789410412311554, + "rewards/margins": 1.2579864263534546, + "rewards/rejected": -2.0471160411834717, + "step": 3240 + }, + { + "epoch": 0.8193363375665711, + "grad_norm": 79.18340301513672, + "learning_rate": 4.55713910645835e-07, + "logits/chosen": -1.1595947742462158, + "logits/rejected": -1.1443603038787842, + "logps/chosen": -294.20623779296875, + "logps/rejected": -283.6000061035156, + "loss": 0.5571, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -0.502471923828125, + "rewards/margins": 1.348358154296875, + "rewards/rejected": -1.8505951166152954, + "step": 3250 + }, + { + "epoch": 0.8218573724513913, + "grad_norm": 79.3178939819336, + "learning_rate": 4.552962320247734e-07, + "logits/chosen": -1.1867187023162842, + "logits/rejected": -1.1916015148162842, + "logps/chosen": -311.015625, + "logps/rejected": -301.78125, + "loss": 0.6201, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.644396960735321, + "rewards/margins": 1.343725562095642, + "rewards/rejected": -1.98846435546875, + "step": 3260 + }, + { + "epoch": 0.8243784073362115, + "grad_norm": 73.20548248291016, + "learning_rate": 4.5487678629329373e-07, + "logits/chosen": NaN, + "logits/rejected": -1.068078637123108, + "logps/chosen": -288.9375, + "logps/rejected": -294.54376220703125, + "loss": 0.5417, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5860260128974915, + "rewards/margins": 1.6181671619415283, + "rewards/rejected": -2.204010009765625, + "step": 3270 + }, + { + 
"epoch": 0.8268994422210317, + "grad_norm": 76.6052474975586, + "learning_rate": 4.544555770618222e-07, + "logits/chosen": -1.139013648033142, + "logits/rejected": -1.1353759765625, + "logps/chosen": -273.1499938964844, + "logps/rejected": -298.125, + "loss": 0.601, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7464264035224915, + "rewards/margins": 1.3448486328125, + "rewards/rejected": -2.091296434402466, + "step": 3280 + }, + { + "epoch": 0.829420477105852, + "grad_norm": 48.098236083984375, + "learning_rate": 4.540326079559647e-07, + "logits/chosen": -1.07611083984375, + "logits/rejected": -1.1259765625, + "logps/chosen": -310.8999938964844, + "logps/rejected": -272.73748779296875, + "loss": 0.477, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.0783233642578125, + "rewards/margins": 1.664270043373108, + "rewards/rejected": -2.742236375808716, + "step": 3290 + }, + { + "epoch": 0.8319415119906721, + "grad_norm": 81.13333129882812, + "learning_rate": 4.5360788261647544e-07, + "logits/chosen": -1.105444312095642, + "logits/rejected": -1.031591773033142, + "logps/chosen": -311.0375061035156, + "logps/rejected": -284.4937438964844, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.309301733970642, + "rewards/margins": 1.5679442882537842, + "rewards/rejected": -2.8753418922424316, + "step": 3300 + }, + { + "epoch": 0.8344625468754924, + "grad_norm": 66.38290405273438, + "learning_rate": 4.531814046992255e-07, + "logits/chosen": -1.042199730873108, + "logits/rejected": NaN, + "logps/chosen": -295.3374938964844, + "logps/rejected": -285.51251220703125, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3630584478378296, + "rewards/margins": 1.519934058189392, + "rewards/rejected": -2.8828186988830566, + "step": 3310 + }, + { + "epoch": 0.8369835817603126, + "grad_norm": 81.1019287109375, + "learning_rate": 4.5275317787517166e-07, + "logits/chosen": -1.1407959461212158, + "logits/rejected": 
-1.1571776866912842, + "logps/chosen": -314.76251220703125, + "logps/rejected": -304.16876220703125, + "loss": 0.6103, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.140203833580017, + "rewards/margins": 1.455957055091858, + "rewards/rejected": -2.595172166824341, + "step": 3320 + }, + { + "epoch": 0.8395046166451329, + "grad_norm": 46.38509750366211, + "learning_rate": 4.5232320583032437e-07, + "logits/chosen": -1.0676758289337158, + "logits/rejected": -1.0481750965118408, + "logps/chosen": -294.16876220703125, + "logps/rejected": -275.0, + "loss": 0.5718, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3431777954101562, + "rewards/margins": 1.3079712390899658, + "rewards/rejected": -2.651416063308716, + "step": 3330 + }, + { + "epoch": 0.842025651529953, + "grad_norm": 65.28993225097656, + "learning_rate": 4.518914922657164e-07, + "logits/chosen": -1.0771973133087158, + "logits/rejected": -1.04425048828125, + "logps/chosen": -286.7250061035156, + "logps/rejected": -290.0062561035156, + "loss": 0.5264, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1759170293807983, + "rewards/margins": 1.192602515220642, + "rewards/rejected": -2.368701219558716, + "step": 3340 + }, + { + "epoch": 0.8445466864147733, + "grad_norm": 53.799381256103516, + "learning_rate": 4.5145804089737093e-07, + "logits/chosen": -1.0987670421600342, + "logits/rejected": -1.08984375, + "logps/chosen": -295.4937438964844, + "logps/rejected": -261.08123779296875, + "loss": 0.547, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1604430675506592, + "rewards/margins": 1.3437378406524658, + "rewards/rejected": -2.504638671875, + "step": 3350 + }, + { + "epoch": 0.8470677212995935, + "grad_norm": 91.82332611083984, + "learning_rate": 4.510228554562693e-07, + "logits/chosen": -1.1641356945037842, + "logits/rejected": -1.0832030773162842, + "logps/chosen": -304.3125, + "logps/rejected": -287.65625, + "loss": 0.5168, + "rewards/accuracies": 
0.731249988079071, + "rewards/chosen": -0.9715011715888977, + "rewards/margins": 1.479833960533142, + "rewards/rejected": -2.45159912109375, + "step": 3360 + }, + { + "epoch": 0.8495887561844137, + "grad_norm": 91.4314193725586, + "learning_rate": 4.505859396883192e-07, + "logits/chosen": -1.1128661632537842, + "logits/rejected": -1.1441528797149658, + "logps/chosen": -254.0656280517578, + "logps/rejected": -269.9937438964844, + "loss": 0.5498, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9849303960800171, + "rewards/margins": 1.516363501548767, + "rewards/rejected": -2.5011229515075684, + "step": 3370 + }, + { + "epoch": 0.8521097910692339, + "grad_norm": 76.95771789550781, + "learning_rate": 4.501472973543222e-07, + "logits/chosen": -1.0636154413223267, + "logits/rejected": -1.068426489830017, + "logps/chosen": -307.46875, + "logps/rejected": -297.5, + "loss": 0.61, + "rewards/accuracies": 0.671875, + "rewards/chosen": -1.4217407703399658, + "rewards/margins": 1.369140625, + "rewards/rejected": -2.790942430496216, + "step": 3380 + }, + { + "epoch": 0.8546308259540542, + "grad_norm": 83.77713775634766, + "learning_rate": 4.497069322299417e-07, + "logits/chosen": -1.122802734375, + "logits/rejected": -1.1002686023712158, + "logps/chosen": -305.2124938964844, + "logps/rejected": -291.82501220703125, + "loss": 0.5781, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.9173462390899658, + "rewards/margins": 1.332556128501892, + "rewards/rejected": -3.2500977516174316, + "step": 3390 + }, + { + "epoch": 0.8571518608388744, + "grad_norm": 63.612693786621094, + "learning_rate": 4.4926484810567e-07, + "logits/chosen": -1.072717308998108, + "logits/rejected": -1.0795409679412842, + "logps/chosen": -331.09375, + "logps/rejected": -307.88751220703125, + "loss": 0.5327, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -1.918676733970642, + "rewards/margins": 1.33758544921875, + "rewards/rejected": -3.2566895484924316, + 
"step": 3400 + }, + { + "epoch": 0.8596728957236945, + "grad_norm": 57.99867248535156, + "learning_rate": 4.4882104878679584e-07, + "logits/chosen": -1.105688452720642, + "logits/rejected": -1.032354712486267, + "logps/chosen": -281.70623779296875, + "logps/rejected": -253.40625, + "loss": 0.4861, + "rewards/accuracies": 0.7718750238418579, + "rewards/chosen": -2.032440185546875, + "rewards/margins": 1.5134704113006592, + "rewards/rejected": -3.5458984375, + "step": 3410 + }, + { + "epoch": 0.8621939306085148, + "grad_norm": 48.57344436645508, + "learning_rate": 4.4837553809337194e-07, + "logits/chosen": -1.030920386314392, + "logits/rejected": -1.0585143566131592, + "logps/chosen": -301.8125, + "logps/rejected": -283.3500061035156, + "loss": 0.6095, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -2.375805616378784, + "rewards/margins": 1.429931640625, + "rewards/rejected": -3.8055663108825684, + "step": 3420 + }, + { + "epoch": 0.864714965493335, + "grad_norm": 71.92064666748047, + "learning_rate": 4.479283198601816e-07, + "logits/chosen": -1.089746117591858, + "logits/rejected": -1.106909155845642, + "logps/chosen": -300.7562561035156, + "logps/rejected": -300.15625, + "loss": 0.5747, + "rewards/accuracies": 0.703125, + "rewards/chosen": -2.282421827316284, + "rewards/margins": 1.3709625005722046, + "rewards/rejected": -3.6548829078674316, + "step": 3430 + }, + { + "epoch": 0.8672360003781552, + "grad_norm": 59.84413146972656, + "learning_rate": 4.474793979367061e-07, + "logits/chosen": -1.0660400390625, + "logits/rejected": -1.075524926185608, + "logps/chosen": -292.2250061035156, + "logps/rejected": -282.1625061035156, + "loss": 0.638, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.9407684803009033, + "rewards/margins": 1.238427758216858, + "rewards/rejected": -3.1781249046325684, + "step": 3440 + }, + { + "epoch": 0.8697570352629754, + "grad_norm": 59.574527740478516, + "learning_rate": 4.470287761870916e-07, + 
"logits/chosen": -1.0941650867462158, + "logits/rejected": -1.115942358970642, + "logps/chosen": -299.73126220703125, + "logps/rejected": -307.7124938964844, + "loss": 0.5402, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.517303466796875, + "rewards/margins": 1.4228637218475342, + "rewards/rejected": -2.940234422683716, + "step": 3450 + }, + { + "epoch": 0.8722780701477957, + "grad_norm": 65.44307708740234, + "learning_rate": 4.465764584901152e-07, + "logits/chosen": -1.032434105873108, + "logits/rejected": -1.0498778820037842, + "logps/chosen": -258.25, + "logps/rejected": -264.54998779296875, + "loss": 0.5861, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.127105712890625, + "rewards/margins": 1.3675415515899658, + "rewards/rejected": -2.4952635765075684, + "step": 3460 + }, + { + "epoch": 0.8747991050326159, + "grad_norm": 73.56661987304688, + "learning_rate": 4.461224487391526e-07, + "logits/chosen": -1.093042016029358, + "logits/rejected": -1.0864746570587158, + "logps/chosen": -317.6312561035156, + "logps/rejected": -293.34375, + "loss": 0.501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.166131615638733, + "rewards/margins": 1.443243384361267, + "rewards/rejected": -2.6086058616638184, + "step": 3470 + }, + { + "epoch": 0.8773201399174361, + "grad_norm": 86.67921447753906, + "learning_rate": 4.456667508421438e-07, + "logits/chosen": -1.0773437023162842, + "logits/rejected": -1.0750000476837158, + "logps/chosen": -316.9375, + "logps/rejected": -292.60626220703125, + "loss": 0.6976, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -1.1835922002792358, + "rewards/margins": 1.2973754405975342, + "rewards/rejected": -2.480395555496216, + "step": 3480 + }, + { + "epoch": 0.8798411748022563, + "grad_norm": 84.18367004394531, + "learning_rate": 4.4520936872155967e-07, + "logits/chosen": -1.1320312023162842, + "logits/rejected": -1.151269555091858, + "logps/chosen": -279.5249938964844, + "logps/rejected": 
-290.98748779296875, + "loss": 0.5921, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6704986691474915, + "rewards/margins": 1.3552429676055908, + "rewards/rejected": -2.024883985519409, + "step": 3490 + }, + { + "epoch": 0.8823622096870766, + "grad_norm": 80.883056640625, + "learning_rate": 4.447503063143683e-07, + "logits/chosen": -1.1244628429412842, + "logits/rejected": -1.1005980968475342, + "logps/chosen": -300.46563720703125, + "logps/rejected": -296.7875061035156, + "loss": 0.6703, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8366973996162415, + "rewards/margins": 1.289392113685608, + "rewards/rejected": -2.125659227371216, + "step": 3500 + }, + { + "epoch": 0.8848832445718967, + "grad_norm": 83.55731964111328, + "learning_rate": 4.4428956757200096e-07, + "logits/chosen": -1.111962914466858, + "logits/rejected": -1.09796142578125, + "logps/chosen": -284.5562438964844, + "logps/rejected": -262.9937438964844, + "loss": 0.5635, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.689208984375, + "rewards/margins": 1.4481933116912842, + "rewards/rejected": -2.137737989425659, + "step": 3510 + }, + { + "epoch": 0.887404279456717, + "grad_norm": 68.90113830566406, + "learning_rate": 4.4382715646031834e-07, + "logits/chosen": -1.0959961414337158, + "logits/rejected": -1.048187255859375, + "logps/chosen": -298.9375, + "logps/rejected": -266.875, + "loss": 0.4765, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7831360101699829, + "rewards/margins": 1.63409423828125, + "rewards/rejected": -2.416159152984619, + "step": 3520 + }, + { + "epoch": 0.8899253143415372, + "grad_norm": 50.8995475769043, + "learning_rate": 4.4336307695957605e-07, + "logits/chosen": -1.07177734375, + "logits/rejected": -1.093530297279358, + "logps/chosen": -288.6875, + "logps/rejected": -294.5874938964844, + "loss": 0.6003, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -0.7579803466796875, + 
"rewards/margins": 1.699853539466858, + "rewards/rejected": -2.456860303878784, + "step": 3530 + }, + { + "epoch": 0.8924463492263575, + "grad_norm": 65.32662200927734, + "learning_rate": 4.428973330643906e-07, + "logits/chosen": -1.082128882408142, + "logits/rejected": -1.100927710533142, + "logps/chosen": -299.3500061035156, + "logps/rejected": -293.04998779296875, + "loss": 0.5206, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.817181408405304, + "rewards/margins": 1.541632056236267, + "rewards/rejected": -2.3595337867736816, + "step": 3540 + }, + { + "epoch": 0.8949673841111776, + "grad_norm": 75.93994140625, + "learning_rate": 4.4242992878370493e-07, + "logits/chosen": -1.0684661865234375, + "logits/rejected": -1.129541039466858, + "logps/chosen": -286.25311279296875, + "logps/rejected": -296.6937561035156, + "loss": 0.5547, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8977721929550171, + "rewards/margins": 1.512475609779358, + "rewards/rejected": -2.4103026390075684, + "step": 3550 + }, + { + "epoch": 0.8974884189959978, + "grad_norm": 88.91752624511719, + "learning_rate": 4.4196086814075405e-07, + "logits/chosen": -1.0927734375, + "logits/rejected": -1.099035620689392, + "logps/chosen": -288.8687438964844, + "logps/rejected": -283.9437561035156, + "loss": 0.603, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -0.9323943853378296, + "rewards/margins": 1.367132544517517, + "rewards/rejected": -2.2994384765625, + "step": 3560 + }, + { + "epoch": 0.9000094538808181, + "grad_norm": 43.13289260864258, + "learning_rate": 4.4149015517303035e-07, + "logits/chosen": -1.126708984375, + "logits/rejected": -1.063146948814392, + "logps/chosen": -271.8500061035156, + "logps/rejected": -283.4125061035156, + "loss": 0.4782, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -0.3869872987270355, + "rewards/margins": 1.6957275867462158, + "rewards/rejected": -2.083209276199341, + "step": 3570 + }, + { + "epoch": 
0.9025304887656382, + "grad_norm": 66.90422821044922, + "learning_rate": 4.410177939322484e-07, + "logits/chosen": -1.1051146984100342, + "logits/rejected": -1.123419165611267, + "logps/chosen": -312.7124938964844, + "logps/rejected": -295.29998779296875, + "loss": 0.4763, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.920245349407196, + "rewards/margins": 1.6244995594024658, + "rewards/rejected": -2.5440430641174316, + "step": 3580 + }, + { + "epoch": 0.9050515236504585, + "grad_norm": 75.59705352783203, + "learning_rate": 4.4054378848431086e-07, + "logits/chosen": -1.067163109779358, + "logits/rejected": -1.0641663074493408, + "logps/chosen": -303.83123779296875, + "logps/rejected": -297.3374938964844, + "loss": 0.545, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -1.148584008216858, + "rewards/margins": 1.770471215248108, + "rewards/rejected": -2.9188232421875, + "step": 3590 + }, + { + "epoch": 0.9075725585352787, + "grad_norm": 83.44449615478516, + "learning_rate": 4.40068142909273e-07, + "logits/chosen": -1.1025879383087158, + "logits/rejected": -1.0754272937774658, + "logps/chosen": -287.3218688964844, + "logps/rejected": -281.82501220703125, + "loss": 0.5576, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.1833312511444092, + "rewards/margins": 1.5989501476287842, + "rewards/rejected": -2.783398389816284, + "step": 3600 + }, + { + "epoch": 0.910093593420099, + "grad_norm": 78.62376403808594, + "learning_rate": 4.395908613013076e-07, + "logits/chosen": -1.113623023033142, + "logits/rejected": -1.074987769126892, + "logps/chosen": -318.90625, + "logps/rejected": -304.01873779296875, + "loss": 0.6044, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -0.873699963092804, + "rewards/margins": 1.5673949718475342, + "rewards/rejected": -2.4405517578125, + "step": 3610 + }, + { + "epoch": 0.9126146283049191, + "grad_norm": 82.23873138427734, + "learning_rate": 4.391119477686698e-07, + 
"logits/chosen": -1.0331542491912842, + "logits/rejected": -1.044946312904358, + "logps/chosen": -271.75, + "logps/rejected": -270.8531188964844, + "loss": 0.6195, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.747485339641571, + "rewards/margins": 1.320947289466858, + "rewards/rejected": -2.0690064430236816, + "step": 3620 + }, + { + "epoch": 0.9151356631897394, + "grad_norm": 75.17716217041016, + "learning_rate": 4.386314064336617e-07, + "logits/chosen": -1.060644507408142, + "logits/rejected": -1.0835692882537842, + "logps/chosen": -279.6312561035156, + "logps/rejected": -285.07501220703125, + "loss": 0.6073, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.6068664789199829, + "rewards/margins": 1.296630859375, + "rewards/rejected": -1.9022216796875, + "step": 3630 + }, + { + "epoch": 0.9176566980745596, + "grad_norm": 72.4251937866211, + "learning_rate": 4.38149241432597e-07, + "logits/chosen": -1.0910155773162842, + "logits/rejected": -1.138879418373108, + "logps/chosen": -295.5687561035156, + "logps/rejected": -281.5625, + "loss": 0.5357, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.6645095944404602, + "rewards/margins": 1.4889037609100342, + "rewards/rejected": -2.1529541015625, + "step": 3640 + }, + { + "epoch": 0.9201777329593799, + "grad_norm": 53.61591720581055, + "learning_rate": 4.3766545691576507e-07, + "logits/chosen": -1.1067383289337158, + "logits/rejected": -1.127832055091858, + "logps/chosen": -280.53125, + "logps/rejected": -278.9125061035156, + "loss": 0.5628, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6921478509902954, + "rewards/margins": 1.342340111732483, + "rewards/rejected": -2.034716844558716, + "step": 3650 + }, + { + "epoch": 0.9226987678442, + "grad_norm": 79.49798583984375, + "learning_rate": 4.3718005704739567e-07, + "logits/chosen": -1.1438171863555908, + "logits/rejected": -1.1519043445587158, + "logps/chosen": -278.85626220703125, + "logps/rejected": 
-294.26251220703125, + "loss": 0.6286, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.0359008312225342, + "rewards/margins": 1.2102782726287842, + "rewards/rejected": -2.246447801589966, + "step": 3660 + }, + { + "epoch": 0.9252198027290203, + "grad_norm": 44.99201965332031, + "learning_rate": 4.366930460056227e-07, + "logits/chosen": NaN, + "logits/rejected": -1.0991332530975342, + "logps/chosen": -291.3687438964844, + "logps/rejected": -303.41876220703125, + "loss": 0.5093, + "rewards/accuracies": 0.7593749761581421, + "rewards/chosen": -0.9651123285293579, + "rewards/margins": 1.519689917564392, + "rewards/rejected": -2.485278367996216, + "step": 3670 + }, + { + "epoch": 0.9277408376138405, + "grad_norm": 59.285888671875, + "learning_rate": 4.362044279824487e-07, + "logits/chosen": -1.093774437904358, + "logits/rejected": -1.1437256336212158, + "logps/chosen": -274.8374938964844, + "logps/rejected": -291.0625, + "loss": 0.5187, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0367004871368408, + "rewards/margins": 1.437585473060608, + "rewards/rejected": -2.4761719703674316, + "step": 3680 + }, + { + "epoch": 0.9302618724986607, + "grad_norm": 75.43614196777344, + "learning_rate": 4.357142071837081e-07, + "logits/chosen": -1.0698974132537842, + "logits/rejected": -1.0759766101837158, + "logps/chosen": -298.70623779296875, + "logps/rejected": -275.73126220703125, + "loss": 0.5363, + "rewards/accuracies": 0.703125, + "rewards/chosen": -1.1840789318084717, + "rewards/margins": 1.457769751548767, + "rewards/rejected": -2.6414551734924316, + "step": 3690 + }, + { + "epoch": 0.9327829073834809, + "grad_norm": 63.65312957763672, + "learning_rate": 4.3522238782903157e-07, + "logits/chosen": -1.1274902820587158, + "logits/rejected": -1.1024658679962158, + "logps/chosen": -302.3500061035156, + "logps/rejected": -299.7437438964844, + "loss": 0.5913, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0835082530975342, + 
"rewards/margins": 1.494378685951233, + "rewards/rejected": -2.578015089035034, + "step": 3700 + }, + { + "epoch": 0.9353039422683012, + "grad_norm": 58.389888763427734, + "learning_rate": 4.347289741518097e-07, + "logits/chosen": -1.082788109779358, + "logits/rejected": -1.055932641029358, + "logps/chosen": -304.51251220703125, + "logps/rejected": -298.7437438964844, + "loss": 0.4947, + "rewards/accuracies": 0.765625, + "rewards/chosen": -1.125823974609375, + "rewards/margins": 1.711151123046875, + "rewards/rejected": -2.836346387863159, + "step": 3710 + }, + { + "epoch": 0.9378249771531214, + "grad_norm": 58.16753005981445, + "learning_rate": 4.342339703991561e-07, + "logits/chosen": -1.1070556640625, + "logits/rejected": -1.120361328125, + "logps/chosen": -318.04998779296875, + "logps/rejected": -307.88751220703125, + "loss": 0.5755, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.029840111732483, + "rewards/margins": 1.768408179283142, + "rewards/rejected": -2.798046827316284, + "step": 3720 + }, + { + "epoch": 0.9403460120379415, + "grad_norm": 65.45536041259766, + "learning_rate": 4.337373808318713e-07, + "logits/chosen": -1.1427733898162842, + "logits/rejected": -1.125146508216858, + "logps/chosen": -296.48748779296875, + "logps/rejected": -289.7562561035156, + "loss": 0.5597, + "rewards/accuracies": 0.715624988079071, + "rewards/chosen": -0.7230895757675171, + "rewards/margins": 1.6128143072128296, + "rewards/rejected": -2.3357300758361816, + "step": 3730 + }, + { + "epoch": 0.9428670469227618, + "grad_norm": 87.32518768310547, + "learning_rate": 4.33239209724406e-07, + "logits/chosen": -1.1684081554412842, + "logits/rejected": -1.177221655845642, + "logps/chosen": -319.82501220703125, + "logps/rejected": -300.2749938964844, + "loss": 0.587, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -1.0697815418243408, + "rewards/margins": 1.7837402820587158, + "rewards/rejected": -2.8533935546875, + "step": 3740 + }, + { + "epoch": 
0.945388081807582, + "grad_norm": 52.16835403442383, + "learning_rate": 4.327394613648239e-07, + "logits/chosen": -1.135839819908142, + "logits/rejected": -1.119384765625, + "logps/chosen": -304.609375, + "logps/rejected": -283.1000061035156, + "loss": 0.4795, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.5196136236190796, + "rewards/margins": 1.75726318359375, + "rewards/rejected": -2.275927782058716, + "step": 3750 + }, + { + "epoch": 0.9479091166924022, + "grad_norm": 92.80891418457031, + "learning_rate": 4.322381400547653e-07, + "logits/chosen": -1.1255614757537842, + "logits/rejected": -1.0975220203399658, + "logps/chosen": -306.82501220703125, + "logps/rejected": -282.1187438964844, + "loss": 0.6211, + "rewards/accuracies": 0.7281249761581421, + "rewards/chosen": -0.5638717412948608, + "rewards/margins": 1.37689208984375, + "rewards/rejected": -1.9398682117462158, + "step": 3760 + }, + { + "epoch": 0.9504301515772224, + "grad_norm": 63.26457595825195, + "learning_rate": 4.317352501094099e-07, + "logits/chosen": -1.0829346179962158, + "logits/rejected": -1.0719726085662842, + "logps/chosen": -301.9375, + "logps/rejected": -275.79998779296875, + "loss": 0.5633, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.53662109375, + "rewards/margins": 1.661810278892517, + "rewards/rejected": -2.19915771484375, + "step": 3770 + }, + { + "epoch": 0.9529511864620427, + "grad_norm": 64.89823150634766, + "learning_rate": 4.3123079585743933e-07, + "logits/chosen": -1.052557349205017, + "logits/rejected": -1.072479248046875, + "logps/chosen": -287.65625, + "logps/rejected": -299.07501220703125, + "loss": 0.5662, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6813461184501648, + "rewards/margins": 1.3924071788787842, + "rewards/rejected": -2.0730528831481934, + "step": 3780 + }, + { + "epoch": 0.9554722213468629, + "grad_norm": 73.54103088378906, + "learning_rate": 4.3072478164100035e-07, + "logits/chosen": 
-1.0793945789337158, + "logits/rejected": -1.087927222251892, + "logps/chosen": -296.8187561035156, + "logps/rejected": -315.5687561035156, + "loss": 0.6007, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9742339849472046, + "rewards/margins": 1.363793969154358, + "rewards/rejected": -2.3383421897888184, + "step": 3790 + }, + { + "epoch": 0.9579932562316831, + "grad_norm": 50.553653717041016, + "learning_rate": 4.3021721181566726e-07, + "logits/chosen": -1.109521508216858, + "logits/rejected": -1.0997314453125, + "logps/chosen": -325.26251220703125, + "logps/rejected": -288.6499938964844, + "loss": 0.6171, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8993377685546875, + "rewards/margins": 1.397332787513733, + "rewards/rejected": -2.2955565452575684, + "step": 3800 + }, + { + "epoch": 0.9605142911165033, + "grad_norm": 65.36611938476562, + "learning_rate": 4.297080907504046e-07, + "logits/chosen": -1.0438506603240967, + "logits/rejected": -1.0786254405975342, + "logps/chosen": -283.39373779296875, + "logps/rejected": -275.76873779296875, + "loss": 0.5093, + "rewards/accuracies": 0.746874988079071, + "rewards/chosen": -1.195440649986267, + "rewards/margins": 1.481787085533142, + "rewards/rejected": -2.678997755050659, + "step": 3810 + }, + { + "epoch": 0.9630353260013236, + "grad_norm": 67.0266342163086, + "learning_rate": 4.2919742282752914e-07, + "logits/chosen": -1.1145751476287842, + "logits/rejected": -1.133203148841858, + "logps/chosen": -264.58905029296875, + "logps/rejected": -294.76873779296875, + "loss": 0.5937, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -1.2303345203399658, + "rewards/margins": 1.3708922863006592, + "rewards/rejected": -2.60076904296875, + "step": 3820 + }, + { + "epoch": 0.9655563608861437, + "grad_norm": 56.374698638916016, + "learning_rate": 4.2868521244267234e-07, + "logits/chosen": -1.032470703125, + "logits/rejected": -1.096923828125, + "logps/chosen": -264.90625, + 
"logps/rejected": -296.75, + "loss": 0.5216, + "rewards/accuracies": 0.721875011920929, + "rewards/chosen": -1.167138695716858, + "rewards/margins": 1.549462914466858, + "rewards/rejected": -2.7169432640075684, + "step": 3830 + }, + { + "epoch": 0.968077395770964, + "grad_norm": 81.97763061523438, + "learning_rate": 4.2817146400474293e-07, + "logits/chosen": -1.0275390148162842, + "logits/rejected": -1.0859863758087158, + "logps/chosen": -287.1781311035156, + "logps/rejected": -298.29998779296875, + "loss": 0.5933, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.2765655517578125, + "rewards/margins": 1.3431823253631592, + "rewards/rejected": -2.620379686355591, + "step": 3840 + }, + { + "epoch": 0.9705984306557842, + "grad_norm": 66.56047821044922, + "learning_rate": 4.276561819358883e-07, + "logits/chosen": -1.1472899913787842, + "logits/rejected": -1.1371643543243408, + "logps/chosen": -297.51251220703125, + "logps/rejected": -296.53125, + "loss": 0.5619, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.728131115436554, + "rewards/margins": 1.429907202720642, + "rewards/rejected": -2.1571717262268066, + "step": 3850 + }, + { + "epoch": 0.9731194655406045, + "grad_norm": 90.69190979003906, + "learning_rate": 4.271393706714569e-07, + "logits/chosen": -1.12060546875, + "logits/rejected": -1.113427758216858, + "logps/chosen": -312.5249938964844, + "logps/rejected": -294.8687438964844, + "loss": 0.5835, + "rewards/accuracies": 0.690625011920929, + "rewards/chosen": -0.7211105227470398, + "rewards/margins": 1.262664794921875, + "rewards/rejected": -1.983435034751892, + "step": 3860 + }, + { + "epoch": 0.9756405004254246, + "grad_norm": 71.51448822021484, + "learning_rate": 4.266210346599597e-07, + "logits/chosen": -1.151025414466858, + "logits/rejected": -1.1381347179412842, + "logps/chosen": -288.3812561035156, + "logps/rejected": -266.82501220703125, + "loss": 0.5589, + "rewards/accuracies": 0.71875, + "rewards/chosen": 
-0.6968048214912415, + "rewards/margins": 1.412664771080017, + "rewards/rejected": -2.109405517578125, + "step": 3870 + }, + { + "epoch": 0.9781615353102449, + "grad_norm": 66.34625244140625, + "learning_rate": 4.261011783630325e-07, + "logits/chosen": -1.1195557117462158, + "logits/rejected": -1.1390869617462158, + "logps/chosen": -296.375, + "logps/rejected": -281.1937561035156, + "loss": 0.6018, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8242248296737671, + "rewards/margins": 1.478912353515625, + "rewards/rejected": -2.3028197288513184, + "step": 3880 + }, + { + "epoch": 0.9806825701950651, + "grad_norm": 47.40770721435547, + "learning_rate": 4.255798062553966e-07, + "logits/chosen": -1.1241943836212158, + "logits/rejected": -1.1107879877090454, + "logps/chosen": -273.63751220703125, + "logps/rejected": -283.4750061035156, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1348450183868408, + "rewards/margins": 1.4095947742462158, + "rewards/rejected": -2.5440917015075684, + "step": 3890 + }, + { + "epoch": 0.9832036050798852, + "grad_norm": 69.58695983886719, + "learning_rate": 4.250569228248213e-07, + "logits/chosen": -1.0292479991912842, + "logits/rejected": -1.034875512123108, + "logps/chosen": -314.4750061035156, + "logps/rejected": -306.59063720703125, + "loss": 0.6183, + "rewards/accuracies": 0.6968749761581421, + "rewards/chosen": -1.425872802734375, + "rewards/margins": 1.4315338134765625, + "rewards/rejected": -2.8571624755859375, + "step": 3900 + }, + { + "epoch": 0.9857246399647055, + "grad_norm": 48.445335388183594, + "learning_rate": 4.245325325720844e-07, + "logits/chosen": -1.129638671875, + "logits/rejected": -1.0939452648162842, + "logps/chosen": -300.98748779296875, + "logps/rejected": -303.8812561035156, + "loss": 0.5403, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1617858409881592, + "rewards/margins": 1.6011841297149658, + "rewards/rejected": -2.7610106468200684, + "step": 
3910 + }, + { + "epoch": 0.9882456748495257, + "grad_norm": 88.5267333984375, + "learning_rate": 4.2400664001093407e-07, + "logits/chosen": -1.047967553138733, + "logits/rejected": -1.0731322765350342, + "logps/chosen": -275.98748779296875, + "logps/rejected": -276.17498779296875, + "loss": 0.5492, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2724609375, + "rewards/margins": 1.3170502185821533, + "rewards/rejected": -2.5896973609924316, + "step": 3920 + }, + { + "epoch": 0.990766709734346, + "grad_norm": 49.751216888427734, + "learning_rate": 4.234792496680497e-07, + "logits/chosen": -1.0155150890350342, + "logits/rejected": -1.0476562976837158, + "logps/chosen": -289.84686279296875, + "logps/rejected": -288.3500061035156, + "loss": 0.5058, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2067779302597046, + "rewards/margins": 1.655676245689392, + "rewards/rejected": -2.862683057785034, + "step": 3930 + }, + { + "epoch": 0.9932877446191661, + "grad_norm": 60.2679557800293, + "learning_rate": 4.2295036608300305e-07, + "logits/chosen": -1.029028296470642, + "logits/rejected": -1.086694359779358, + "logps/chosen": -298.09375, + "logps/rejected": -296.5562438964844, + "loss": 0.5761, + "rewards/accuracies": 0.7093750238418579, + "rewards/chosen": -1.298675537109375, + "rewards/margins": 1.4742920398712158, + "rewards/rejected": -2.773193359375, + "step": 3940 + }, + { + "epoch": 0.9958087795039864, + "grad_norm": 72.0003890991211, + "learning_rate": 4.224199938082191e-07, + "logits/chosen": -1.1206543445587158, + "logits/rejected": -1.135717749595642, + "logps/chosen": -299.6499938964844, + "logps/rejected": -288.7875061035156, + "loss": 0.5698, + "rewards/accuracies": 0.7406250238418579, + "rewards/chosen": -1.208764672279358, + "rewards/margins": 1.361181616783142, + "rewards/rejected": -2.568603515625, + "step": 3950 + }, + { + "epoch": 0.9983298143888066, + "grad_norm": 72.28197479248047, + "learning_rate": 4.218881374089369e-07, + 
"logits/chosen": -1.087988257408142, + "logits/rejected": -1.125512719154358, + "logps/chosen": -287.3812561035156, + "logps/rejected": -302.92498779296875, + "loss": 0.5375, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.873364269733429, + "rewards/margins": 1.751013159751892, + "rewards/rejected": -2.62274169921875, + "step": 3960 + }, + { + "epoch": 1.0010084139539281, + "grad_norm": 27.972684860229492, + "learning_rate": 4.2135480146317016e-07, + "logits/chosen": -1.102655291557312, + "logits/rejected": -1.0655343532562256, + "logps/chosen": -298.625, + "logps/rejected": -284.827392578125, + "loss": 0.4539, + "rewards/accuracies": 0.8095238208770752, + "rewards/chosen": -0.445919930934906, + "rewards/margins": 2.1674513816833496, + "rewards/rejected": -2.612729072570801, + "step": 3970 + }, + { + "epoch": 1.0035294488387483, + "grad_norm": 55.124820709228516, + "learning_rate": 4.2081999056166807e-07, + "logits/chosen": -1.1157211065292358, + "logits/rejected": -1.1358826160430908, + "logps/chosen": -331.21875, + "logps/rejected": -315.48748779296875, + "loss": 0.2347, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -0.14286498725414276, + "rewards/margins": 2.959460496902466, + "rewards/rejected": -3.1031250953674316, + "step": 3980 + }, + { + "epoch": 1.0060504837235684, + "grad_norm": 38.26470947265625, + "learning_rate": 4.202837093078756e-07, + "logits/chosen": -1.104638695716858, + "logits/rejected": -1.105981469154358, + "logps/chosen": -311.70623779296875, + "logps/rejected": -302.4937438964844, + "loss": 0.1835, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -0.2649780213832855, + "rewards/margins": 3.245849609375, + "rewards/rejected": -3.512011766433716, + "step": 3990 + }, + { + "epoch": 1.0085715186083888, + "grad_norm": 55.7176399230957, + "learning_rate": 4.1974596231789416e-07, + "logits/chosen": -0.9954498410224915, + "logits/rejected": NaN, + "logps/chosen": -292.46875, + "logps/rejected": 
-295.2437438964844, + "loss": 0.2882, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -0.7926574945449829, + "rewards/margins": 3.079882860183716, + "rewards/rejected": -3.8719725608825684, + "step": 4000 + }, + { + "epoch": 1.011092553493209, + "grad_norm": 31.28335952758789, + "learning_rate": 4.192067542204413e-07, + "logits/chosen": -1.061560034751892, + "logits/rejected": -1.023706078529358, + "logps/chosen": -297.0375061035156, + "logps/rejected": -312.42498779296875, + "loss": 0.234, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.3645355701446533, + "rewards/margins": 3.3698487281799316, + "rewards/rejected": -4.734765529632568, + "step": 4010 + }, + { + "epoch": 1.0136135883780293, + "grad_norm": 39.66532516479492, + "learning_rate": 4.186660896568116e-07, + "logits/chosen": -1.114648461341858, + "logits/rejected": -1.123077392578125, + "logps/chosen": -303.0375061035156, + "logps/rejected": -319.46875, + "loss": 0.2072, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.310919165611267, + "rewards/margins": 3.1480469703674316, + "rewards/rejected": -4.459179878234863, + "step": 4020 + }, + { + "epoch": 1.0161346232628494, + "grad_norm": 68.80731964111328, + "learning_rate": 4.1812397328083584e-07, + "logits/chosen": -1.105810523033142, + "logits/rejected": -1.0820404291152954, + "logps/chosen": -281.0062561035156, + "logps/rejected": -290.17498779296875, + "loss": 0.2325, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.6036239862442017, + "rewards/margins": 3.431103467941284, + "rewards/rejected": -4.033593654632568, + "step": 4030 + }, + { + "epoch": 1.0186556581476696, + "grad_norm": 37.8831672668457, + "learning_rate": 4.1758040975884195e-07, + "logits/chosen": -1.057647705078125, + "logits/rejected": -1.104516625404358, + "logps/chosen": -272.9750061035156, + "logps/rejected": -280.9375, + "loss": 0.2249, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 
-0.8665634393692017, + "rewards/margins": 3.409106492996216, + "rewards/rejected": -4.276953220367432, + "step": 4040 + }, + { + "epoch": 1.02117669303249, + "grad_norm": 21.118183135986328, + "learning_rate": 4.1703540376961406e-07, + "logits/chosen": -1.1152832508087158, + "logits/rejected": -1.0788085460662842, + "logps/chosen": -299.4125061035156, + "logps/rejected": -300.39373779296875, + "loss": 0.2219, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.9221404790878296, + "rewards/margins": 3.247851610183716, + "rewards/rejected": -4.170507907867432, + "step": 4050 + }, + { + "epoch": 1.02369772791731, + "grad_norm": 39.22066116333008, + "learning_rate": 4.164889600043525e-07, + "logits/chosen": NaN, + "logits/rejected": -1.0491211414337158, + "logps/chosen": -273.3031311035156, + "logps/rejected": -300.625, + "loss": 0.2248, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.1497116088867188, + "rewards/margins": 3.146044969558716, + "rewards/rejected": -4.2958984375, + "step": 4060 + }, + { + "epoch": 1.0262187628021302, + "grad_norm": 24.629756927490234, + "learning_rate": 4.1594108316663347e-07, + "logits/chosen": -1.041906714439392, + "logits/rejected": NaN, + "logps/chosen": -292.03125, + "logps/rejected": -285.01873779296875, + "loss": 0.2257, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -0.960662841796875, + "rewards/margins": 3.183337450027466, + "rewards/rejected": -4.14208984375, + "step": 4070 + }, + { + "epoch": 1.0287397976869506, + "grad_norm": 43.63905334472656, + "learning_rate": 4.153917779723686e-07, + "logits/chosen": -1.091064453125, + "logits/rejected": -1.0660521984100342, + "logps/chosen": -293.48126220703125, + "logps/rejected": -295.78125, + "loss": 0.2126, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -0.9144653081893921, + "rewards/margins": 3.102587938308716, + "rewards/rejected": -4.017382621765137, + "step": 4080 + }, + { + "epoch": 1.0312608325717707, + 
"grad_norm": 32.19606018066406, + "learning_rate": 4.14841049149764e-07, + "logits/chosen": -1.089135766029358, + "logits/rejected": -1.0004394054412842, + "logps/chosen": -288.5062561035156, + "logps/rejected": -289.11248779296875, + "loss": 0.2117, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -0.5238097906112671, + "rewards/margins": 3.288330078125, + "rewards/rejected": -3.81298828125, + "step": 4090 + }, + { + "epoch": 1.0337818674565908, + "grad_norm": 30.850324630737305, + "learning_rate": 4.142889014392802e-07, + "logits/chosen": -1.11529541015625, + "logits/rejected": -1.135351538658142, + "logps/chosen": -299.3687438964844, + "logps/rejected": -302.21875, + "loss": 0.2065, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.20770874619483948, + "rewards/margins": 3.523681640625, + "rewards/rejected": -3.730664014816284, + "step": 4100 + }, + { + "epoch": 1.0363029023414112, + "grad_norm": 34.62665557861328, + "learning_rate": 4.137353395935905e-07, + "logits/chosen": -1.090728759765625, + "logits/rejected": -1.0654175281524658, + "logps/chosen": -281.92498779296875, + "logps/rejected": -289.78125, + "loss": 0.2114, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.37632447481155396, + "rewards/margins": 3.229663133621216, + "rewards/rejected": -3.607226610183716, + "step": 4110 + }, + { + "epoch": 1.0388239372262313, + "grad_norm": 24.406591415405273, + "learning_rate": 4.13180368377541e-07, + "logits/chosen": -1.145361304283142, + "logits/rejected": -1.11834716796875, + "logps/chosen": -275.59375, + "logps/rejected": -304.63751220703125, + "loss": 0.1627, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -0.637542724609375, + "rewards/margins": 3.4898438453674316, + "rewards/rejected": -4.128710746765137, + "step": 4120 + }, + { + "epoch": 1.0413449721110515, + "grad_norm": 21.681640625, + "learning_rate": 4.126239925681088e-07, + "logits/chosen": -1.078955054283142, + "logits/rejected": 
-1.080316185951233, + "logps/chosen": -269.88751220703125, + "logps/rejected": -292.29998779296875, + "loss": 0.1666, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -0.8504608273506165, + "rewards/margins": 3.740917921066284, + "rewards/rejected": -4.592675685882568, + "step": 4130 + }, + { + "epoch": 1.0438660069958718, + "grad_norm": 34.73222732543945, + "learning_rate": 4.120662169543612e-07, + "logits/chosen": -1.1584351062774658, + "logits/rejected": -1.063623070716858, + "logps/chosen": -281.9624938964844, + "logps/rejected": -304.09375, + "loss": 0.2293, + "rewards/accuracies": 0.8843749761581421, + "rewards/chosen": -1.2438995838165283, + "rewards/margins": 3.4716553688049316, + "rewards/rejected": -4.715429782867432, + "step": 4140 + }, + { + "epoch": 1.046387041880692, + "grad_norm": 70.9593276977539, + "learning_rate": 4.1150704633741456e-07, + "logits/chosen": -1.0972900390625, + "logits/rejected": -1.084020972251892, + "logps/chosen": -300.6937561035156, + "logps/rejected": -295.1875, + "loss": 0.2751, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.6405212879180908, + "rewards/margins": 3.447216749191284, + "rewards/rejected": -5.084863185882568, + "step": 4150 + }, + { + "epoch": 1.0489080767655123, + "grad_norm": 43.35206604003906, + "learning_rate": 4.1094648553039315e-07, + "logits/chosen": -0.990277111530304, + "logits/rejected": -1.0615966320037842, + "logps/chosen": -282.14373779296875, + "logps/rejected": -327.26251220703125, + "loss": 0.2454, + "rewards/accuracies": 0.921875, + "rewards/chosen": -2.327831983566284, + "rewards/margins": 3.537353515625, + "rewards/rejected": -5.863671779632568, + "step": 4160 + }, + { + "epoch": 1.0514291116503325, + "grad_norm": 45.486671447753906, + "learning_rate": 4.103845393583868e-07, + "logits/chosen": -1.1049010753631592, + "logits/rejected": -1.0955932140350342, + "logps/chosen": -301.1499938964844, + "logps/rejected": -308.79998779296875, + "loss": 0.187, + 
"rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.193652391433716, + "rewards/margins": 3.316357374191284, + "rewards/rejected": -5.512304782867432, + "step": 4170 + }, + { + "epoch": 1.0539501465351526, + "grad_norm": 59.81594467163086, + "learning_rate": 4.0982121265841073e-07, + "logits/chosen": -1.031347632408142, + "logits/rejected": -1.0300414562225342, + "logps/chosen": -318.1937561035156, + "logps/rejected": -328.17498779296875, + "loss": 0.2188, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.973791480064392, + "rewards/margins": 3.4609131813049316, + "rewards/rejected": -5.435351371765137, + "step": 4180 + }, + { + "epoch": 1.056471181419973, + "grad_norm": 32.9062385559082, + "learning_rate": 4.092565102793628e-07, + "logits/chosen": -1.133276343345642, + "logits/rejected": -1.0665709972381592, + "logps/chosen": -282.04998779296875, + "logps/rejected": -321.0062561035156, + "loss": 0.2493, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.005548119544983, + "rewards/margins": 3.323657274246216, + "rewards/rejected": -4.329516410827637, + "step": 4190 + }, + { + "epoch": 1.0589922163047931, + "grad_norm": 36.85378646850586, + "learning_rate": 4.0869043708198224e-07, + "logits/chosen": -1.15753173828125, + "logits/rejected": -1.1194274425506592, + "logps/chosen": -332.3999938964844, + "logps/rejected": -332.04998779296875, + "loss": 0.2871, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -0.8577331304550171, + "rewards/margins": 3.3770995140075684, + "rewards/rejected": -4.233202934265137, + "step": 4200 + }, + { + "epoch": 1.0615132511896133, + "grad_norm": 34.67184829711914, + "learning_rate": 4.0812299793880785e-07, + "logits/chosen": -1.1424560546875, + "logits/rejected": -1.085229516029358, + "logps/chosen": -292.36248779296875, + "logps/rejected": -318.36248779296875, + "loss": 0.1899, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.0035202503204346, + 
"rewards/margins": 3.4837889671325684, + "rewards/rejected": -4.485547065734863, + "step": 4210 + }, + { + "epoch": 1.0640342860744336, + "grad_norm": 45.7728157043457, + "learning_rate": 4.075541977341358e-07, + "logits/chosen": -1.108056664466858, + "logits/rejected": -1.0416381359100342, + "logps/chosen": -322.3187561035156, + "logps/rejected": -324.38751220703125, + "loss": 0.2596, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.581518530845642, + "rewards/margins": 3.370800733566284, + "rewards/rejected": -4.953125, + "step": 4220 + }, + { + "epoch": 1.0665553209592538, + "grad_norm": 34.45916748046875, + "learning_rate": 4.0698404136397805e-07, + "logits/chosen": -1.1266601085662842, + "logits/rejected": -1.0173828601837158, + "logps/chosen": -293.25, + "logps/rejected": -311.20001220703125, + "loss": 0.2186, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5161864757537842, + "rewards/margins": 3.4703125953674316, + "rewards/rejected": -4.988085746765137, + "step": 4230 + }, + { + "epoch": 1.069076355844074, + "grad_norm": 36.84361267089844, + "learning_rate": 4.0641253373601957e-07, + "logits/chosen": -1.054724097251892, + "logits/rejected": -1.0573333501815796, + "logps/chosen": -282.3187561035156, + "logps/rejected": -310.42498779296875, + "loss": 0.1894, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.352813720703125, + "rewards/margins": 3.495434522628784, + "rewards/rejected": -4.846875190734863, + "step": 4240 + }, + { + "epoch": 1.0715973907288943, + "grad_norm": 34.635414123535156, + "learning_rate": 4.0583967976957654e-07, + "logits/chosen": -1.0933074951171875, + "logits/rejected": -1.059136986732483, + "logps/chosen": -305.29998779296875, + "logps/rejected": -311.6875, + "loss": 0.2144, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2784011363983154, + "rewards/margins": 3.570849657058716, + "rewards/rejected": -4.85107421875, + "step": 4250 + }, + { + "epoch": 
1.0741184256137144, + "grad_norm": 35.24597930908203, + "learning_rate": 4.0526548439555407e-07, + "logits/chosen": -1.1156494617462158, + "logits/rejected": -1.130895972251892, + "logps/chosen": -278.92498779296875, + "logps/rejected": -306.1625061035156, + "loss": 0.1801, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.1475830078125, + "rewards/margins": 3.564257860183716, + "rewards/rejected": -4.711523532867432, + "step": 4260 + }, + { + "epoch": 1.0766394604985345, + "grad_norm": 48.14378356933594, + "learning_rate": 4.046899525564034e-07, + "logits/chosen": -1.176538109779358, + "logits/rejected": -1.14471435546875, + "logps/chosen": -308.3500061035156, + "logps/rejected": -312.29376220703125, + "loss": 0.184, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.820605456829071, + "rewards/margins": 3.810253858566284, + "rewards/rejected": -4.631933689117432, + "step": 4270 + }, + { + "epoch": 1.079160495383355, + "grad_norm": 36.98891830444336, + "learning_rate": 4.0411308920607953e-07, + "logits/chosen": -1.175878882408142, + "logits/rejected": -1.155859351158142, + "logps/chosen": -312.39373779296875, + "logps/rejected": -315.13751220703125, + "loss": 0.2341, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.203314185142517, + "rewards/margins": 3.703418016433716, + "rewards/rejected": -4.906836032867432, + "step": 4280 + }, + { + "epoch": 1.081681530268175, + "grad_norm": 16.921276092529297, + "learning_rate": 4.0353489930999876e-07, + "logits/chosen": -1.178369164466858, + "logits/rejected": -1.121728539466858, + "logps/chosen": -301.1937561035156, + "logps/rejected": -306.6187438964844, + "loss": 0.233, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4920272827148438, + "rewards/margins": 3.525927782058716, + "rewards/rejected": -5.0166015625, + "step": 4290 + }, + { + "epoch": 1.0842025651529954, + "grad_norm": 47.32865905761719, + "learning_rate": 4.029553878449956e-07, + 
"logits/chosen": -1.0679442882537842, + "logits/rejected": -1.0796630382537842, + "logps/chosen": -282.92498779296875, + "logps/rejected": -314.5, + "loss": 0.1814, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5190032720565796, + "rewards/margins": 3.7252440452575684, + "rewards/rejected": -5.246679782867432, + "step": 4300 + }, + { + "epoch": 1.0867236000378155, + "grad_norm": 52.33177185058594, + "learning_rate": 4.0237455979928024e-07, + "logits/chosen": -1.1435058116912842, + "logits/rejected": -1.093946099281311, + "logps/chosen": -322.875, + "logps/rejected": -320.01873779296875, + "loss": 0.2327, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5239578485488892, + "rewards/margins": 3.4808106422424316, + "rewards/rejected": -5.002831935882568, + "step": 4310 + }, + { + "epoch": 1.0892446349226357, + "grad_norm": 38.97206115722656, + "learning_rate": 4.0179242017239544e-07, + "logits/chosen": -1.139623999595642, + "logits/rejected": -1.134057641029358, + "logps/chosen": -307.42498779296875, + "logps/rejected": -295.79998779296875, + "loss": 0.2415, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.366888403892517, + "rewards/margins": 3.210205078125, + "rewards/rejected": -4.574658393859863, + "step": 4320 + }, + { + "epoch": 1.091765669807456, + "grad_norm": 16.978683471679688, + "learning_rate": 4.012089739751735e-07, + "logits/chosen": -1.093713402748108, + "logits/rejected": -1.1429932117462158, + "logps/chosen": -282.9375, + "logps/rejected": -303.3500061035156, + "loss": 0.2093, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.5668976306915283, + "rewards/margins": 3.2781004905700684, + "rewards/rejected": -4.845898628234863, + "step": 4330 + }, + { + "epoch": 1.0942867046922762, + "grad_norm": 40.3631477355957, + "learning_rate": 4.006242262296933e-07, + "logits/chosen": -1.140039086341858, + "logits/rejected": -1.1598999500274658, + "logps/chosen": -275.3500061035156, + "logps/rejected": 
-299.23126220703125, + "loss": 0.1771, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.427282691001892, + "rewards/margins": 3.5241942405700684, + "rewards/rejected": -4.950976371765137, + "step": 4340 + }, + { + "epoch": 1.0968077395770963, + "grad_norm": 54.68492126464844, + "learning_rate": 4.0003818196923677e-07, + "logits/chosen": -1.1357421875, + "logits/rejected": -1.099816918373108, + "logps/chosen": -290.08123779296875, + "logps/rejected": -278.10626220703125, + "loss": 0.2141, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.343328833580017, + "rewards/margins": 3.33837890625, + "rewards/rejected": -4.6826171875, + "step": 4350 + }, + { + "epoch": 1.0993287744619167, + "grad_norm": 29.327800750732422, + "learning_rate": 3.994508462382459e-07, + "logits/chosen": -1.1252257823944092, + "logits/rejected": -1.1992371082305908, + "logps/chosen": -275.7437438964844, + "logps/rejected": -311.9624938964844, + "loss": 0.183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1376861333847046, + "rewards/margins": 3.511059522628784, + "rewards/rejected": -4.649218559265137, + "step": 4360 + }, + { + "epoch": 1.1018498093467368, + "grad_norm": 36.42438507080078, + "learning_rate": 3.98862224092279e-07, + "logits/chosen": -1.129235863685608, + "logits/rejected": -1.137121558189392, + "logps/chosen": -277.17498779296875, + "logps/rejected": -291.23126220703125, + "loss": 0.1848, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.118890404701233, + "rewards/margins": 3.633984327316284, + "rewards/rejected": -4.752636909484863, + "step": 4370 + }, + { + "epoch": 1.104370844231557, + "grad_norm": 47.18445587158203, + "learning_rate": 3.982723205979675e-07, + "logits/chosen": -1.212890625, + "logits/rejected": -1.2003905773162842, + "logps/chosen": -301.38751220703125, + "logps/rejected": -311.1499938964844, + "loss": 0.2051, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.223547339439392, + 
"rewards/margins": 3.695605516433716, + "rewards/rejected": -4.917187690734863, + "step": 4380 + }, + { + "epoch": 1.1068918791163773, + "grad_norm": 26.98342514038086, + "learning_rate": 3.976811408329721e-07, + "logits/chosen": -1.1772339344024658, + "logits/rejected": -1.1562378406524658, + "logps/chosen": -285.75311279296875, + "logps/rejected": -319.1312561035156, + "loss": 0.2132, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.302832007408142, + "rewards/margins": 3.7227540016174316, + "rewards/rejected": -5.026757717132568, + "step": 4390 + }, + { + "epoch": 1.1094129140011975, + "grad_norm": 18.257844924926758, + "learning_rate": 3.9708868988593916e-07, + "logits/chosen": -1.110937476158142, + "logits/rejected": -1.16046142578125, + "logps/chosen": -297.7406311035156, + "logps/rejected": -309.64373779296875, + "loss": 0.1726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.378991723060608, + "rewards/margins": 3.613525390625, + "rewards/rejected": -4.991991996765137, + "step": 4400 + }, + { + "epoch": 1.1119339488860178, + "grad_norm": 38.7391471862793, + "learning_rate": 3.9649497285645673e-07, + "logits/chosen": -1.2002990245819092, + "logits/rejected": -1.1981627941131592, + "logps/chosen": -318.8812561035156, + "logps/rejected": -331.1875, + "loss": 0.2149, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.6891753673553467, + "rewards/margins": 3.5896973609924316, + "rewards/rejected": -5.277734279632568, + "step": 4410 + }, + { + "epoch": 1.114454983770838, + "grad_norm": 28.373977661132812, + "learning_rate": 3.958999948550111e-07, + "logits/chosen": -1.178857445716858, + "logits/rejected": -1.144433617591858, + "logps/chosen": -324.8812561035156, + "logps/rejected": -305.4125061035156, + "loss": 0.2874, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.9126068353652954, + "rewards/margins": 3.287841796875, + "rewards/rejected": -5.19921875, + "step": 4420 + }, + { + "epoch": 
1.116976018655658, + "grad_norm": 28.570444107055664, + "learning_rate": 3.9530376100294236e-07, + "logits/chosen": -1.1751220226287842, + "logits/rejected": -1.147436499595642, + "logps/chosen": -301.73748779296875, + "logps/rejected": -348.5562438964844, + "loss": 0.2238, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.564123511314392, + "rewards/margins": 3.5875000953674316, + "rewards/rejected": -5.152539253234863, + "step": 4430 + }, + { + "epoch": 1.1194970535404785, + "grad_norm": 36.46849822998047, + "learning_rate": 3.9470627643240054e-07, + "logits/chosen": -1.22021484375, + "logits/rejected": -1.1673583984375, + "logps/chosen": -291.09375, + "logps/rejected": -312.8374938964844, + "loss": 0.177, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -0.914196789264679, + "rewards/margins": 3.430957078933716, + "rewards/rejected": -4.343163967132568, + "step": 4440 + }, + { + "epoch": 1.1220180884252986, + "grad_norm": 29.296356201171875, + "learning_rate": 3.941075462863011e-07, + "logits/chosen": -1.1571776866912842, + "logits/rejected": -1.1372559070587158, + "logps/chosen": -291.10626220703125, + "logps/rejected": -315.5625, + "loss": 0.1728, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.166351318359375, + "rewards/margins": 3.57763671875, + "rewards/rejected": -4.7451171875, + "step": 4450 + }, + { + "epoch": 1.1245391233101187, + "grad_norm": 29.039588928222656, + "learning_rate": 3.935075757182813e-07, + "logits/chosen": -1.1462234258651733, + "logits/rejected": -1.103796362876892, + "logps/chosen": -271.5562438964844, + "logps/rejected": -312.73126220703125, + "loss": 0.2095, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5503631830215454, + "rewards/margins": 3.8505859375, + "rewards/rejected": -5.400390625, + "step": 4460 + }, + { + "epoch": 1.127060158194939, + "grad_norm": 32.84434509277344, + "learning_rate": 3.9290636989265536e-07, + "logits/chosen": -1.152032494544983, + 
"logits/rejected": -1.0812256336212158, + "logps/chosen": -298.4624938964844, + "logps/rejected": -316.11248779296875, + "loss": 0.1636, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.385565161705017, + "rewards/margins": 4.075976371765137, + "rewards/rejected": -5.461230278015137, + "step": 4470 + }, + { + "epoch": 1.1295811930797592, + "grad_norm": 43.07621765136719, + "learning_rate": 3.923039339843699e-07, + "logits/chosen": -1.156396508216858, + "logits/rejected": -1.1776825189590454, + "logps/chosen": -292.60626220703125, + "logps/rejected": -299.4750061035156, + "loss": 0.2127, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.4523742198944092, + "rewards/margins": 3.860107421875, + "rewards/rejected": -5.315331935882568, + "step": 4480 + }, + { + "epoch": 1.1321022279645794, + "grad_norm": 38.23931121826172, + "learning_rate": 3.9170027317895993e-07, + "logits/chosen": -1.113500952720642, + "logits/rejected": -1.069494605064392, + "logps/chosen": -313.70001220703125, + "logps/rejected": -326.48126220703125, + "loss": 0.1679, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.3317139148712158, + "rewards/margins": 4.272363185882568, + "rewards/rejected": -5.60205078125, + "step": 4490 + }, + { + "epoch": 1.1346232628493997, + "grad_norm": 57.834754943847656, + "learning_rate": 3.910953926725037e-07, + "logits/chosen": -1.1160156726837158, + "logits/rejected": -1.0078856945037842, + "logps/chosen": -330.23126220703125, + "logps/rejected": -323.35626220703125, + "loss": 0.2029, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.3919098377227783, + "rewards/margins": 3.94287109375, + "rewards/rejected": -5.335253715515137, + "step": 4500 + }, + { + "epoch": 1.1371442977342199, + "grad_norm": 44.31748580932617, + "learning_rate": 3.904892976715783e-07, + "logits/chosen": -1.203222632408142, + "logits/rejected": -1.091040015220642, + "logps/chosen": -323.2875061035156, + "logps/rejected": 
-319.29376220703125, + "loss": 0.2073, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.4412353038787842, + "rewards/margins": 3.8412108421325684, + "rewards/rejected": -5.28173828125, + "step": 4510 + }, + { + "epoch": 1.1396653326190402, + "grad_norm": 29.34000587463379, + "learning_rate": 3.898819933932146e-07, + "logits/chosen": -1.1533203125, + "logits/rejected": -1.1740233898162842, + "logps/chosen": -334.1812438964844, + "logps/rejected": -323.07501220703125, + "loss": 0.2344, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.94696044921875, + "rewards/margins": 3.8212890625, + "rewards/rejected": -4.7689208984375, + "step": 4520 + }, + { + "epoch": 1.1421863675038604, + "grad_norm": 45.65370559692383, + "learning_rate": 3.8927348506485253e-07, + "logits/chosen": -1.1566283702850342, + "logits/rejected": -1.1243164539337158, + "logps/chosen": -300.3812561035156, + "logps/rejected": -317.2437438964844, + "loss": 0.2256, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.33416748046875, + "rewards/margins": 3.622729539871216, + "rewards/rejected": -4.956250190734863, + "step": 4530 + }, + { + "epoch": 1.1447074023886805, + "grad_norm": 51.753509521484375, + "learning_rate": 3.8866377792429593e-07, + "logits/chosen": -1.1453125476837158, + "logits/rejected": -1.13385009765625, + "logps/chosen": -289.14373779296875, + "logps/rejected": -302.78125, + "loss": 0.1984, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.309106469154358, + "rewards/margins": 3.377490282058716, + "rewards/rejected": -4.688086032867432, + "step": 4540 + }, + { + "epoch": 1.1472284372735007, + "grad_norm": 51.0242919921875, + "learning_rate": 3.880528772196677e-07, + "logits/chosen": null, + "logits/rejected": -1.122686743736267, + "logps/chosen": -303.51873779296875, + "logps/rejected": -315.7562561035156, + "loss": 0.1893, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.9603027105331421, + 
"rewards/margins": 3.603710889816284, + "rewards/rejected": -4.563086032867432, + "step": 4550 + }, + { + "epoch": 1.149749472158321, + "grad_norm": 64.95894622802734, + "learning_rate": 3.8744078820936445e-07, + "logits/chosen": -1.139245629310608, + "logits/rejected": -1.1644775867462158, + "logps/chosen": -307.64373779296875, + "logps/rejected": -311.42498779296875, + "loss": 0.2393, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.6932830810546875, + "rewards/margins": 3.6019043922424316, + "rewards/rejected": -4.295312404632568, + "step": 4560 + }, + { + "epoch": 1.1522705070431412, + "grad_norm": 39.9935188293457, + "learning_rate": 3.8682751616201106e-07, + "logits/chosen": -1.162573218345642, + "logits/rejected": -1.1469237804412842, + "logps/chosen": -265.61248779296875, + "logps/rejected": -276.01251220703125, + "loss": 0.2416, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8093169927597046, + "rewards/margins": 3.1261229515075684, + "rewards/rejected": -3.933300733566284, + "step": 4570 + }, + { + "epoch": 1.1547915419279615, + "grad_norm": 32.537593841552734, + "learning_rate": 3.862130663564158e-07, + "logits/chosen": -1.1119506359100342, + "logits/rejected": -1.17852783203125, + "logps/chosen": -279.26873779296875, + "logps/rejected": -284.28125, + "loss": 0.2043, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.7371917963027954, + "rewards/margins": 3.585644483566284, + "rewards/rejected": -4.320776462554932, + "step": 4580 + }, + { + "epoch": 1.1573125768127817, + "grad_norm": 34.01283645629883, + "learning_rate": 3.855974440815244e-07, + "logits/chosen": -1.1630370616912842, + "logits/rejected": -1.180322289466858, + "logps/chosen": -286.79376220703125, + "logps/rejected": -328.3187561035156, + "loss": 0.1801, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -0.8788086175918579, + "rewards/margins": 3.6949219703674316, + "rewards/rejected": -4.571972846984863, + "step": 4590 + }, + { + "epoch": 
1.1598336116976018, + "grad_norm": 39.68577575683594, + "learning_rate": 3.8498065463637505e-07, + "logits/chosen": -1.20361328125, + "logits/rejected": -1.1982910633087158, + "logps/chosen": -305.0, + "logps/rejected": -313.8812561035156, + "loss": 0.2171, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.7044006586074829, + "rewards/margins": 3.478759765625, + "rewards/rejected": -4.183569431304932, + "step": 4600 + }, + { + "epoch": 1.1623546465824222, + "grad_norm": 49.09182357788086, + "learning_rate": 3.843627033300521e-07, + "logits/chosen": -1.240966796875, + "logits/rejected": -1.192773461341858, + "logps/chosen": -281.23748779296875, + "logps/rejected": -282.0, + "loss": 0.1685, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -0.22492066025733948, + "rewards/margins": 3.614208936691284, + "rewards/rejected": -3.8397459983825684, + "step": 4610 + }, + { + "epoch": 1.1648756814672423, + "grad_norm": 31.07063102722168, + "learning_rate": 3.83743595481641e-07, + "logits/chosen": -1.22607421875, + "logits/rejected": -1.106134057044983, + "logps/chosen": -278.5687561035156, + "logps/rejected": -294.0, + "loss": 0.2021, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.831127941608429, + "rewards/margins": 3.7333984375, + "rewards/rejected": -4.563330173492432, + "step": 4620 + }, + { + "epoch": 1.1673967163520624, + "grad_norm": 18.663665771484375, + "learning_rate": 3.831233364201825e-07, + "logits/chosen": -1.205297827720642, + "logits/rejected": -1.162261962890625, + "logps/chosen": -290.859375, + "logps/rejected": -304.9624938964844, + "loss": 0.1641, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.909899890422821, + "rewards/margins": 3.749798536300659, + "rewards/rejected": -4.66015625, + "step": 4630 + }, + { + "epoch": 1.1699177512368828, + "grad_norm": 33.56493377685547, + "learning_rate": 3.8250193148462583e-07, + "logits/chosen": -1.1095702648162842, + "logits/rejected": -1.103540062904358, + 
"logps/chosen": -304.0, + "logps/rejected": -307.58123779296875, + "loss": 0.1681, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.443878173828125, + "rewards/margins": 3.925585985183716, + "rewards/rejected": -5.368456840515137, + "step": 4640 + }, + { + "epoch": 1.172438786121703, + "grad_norm": 34.2917366027832, + "learning_rate": 3.8187938602378413e-07, + "logits/chosen": null, + "logits/rejected": -1.0922362804412842, + "logps/chosen": -283.5, + "logps/rejected": -292.28125, + "loss": 0.1955, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.5723145008087158, + "rewards/margins": 3.814746141433716, + "rewards/rejected": -5.384375095367432, + "step": 4650 + }, + { + "epoch": 1.174959821006523, + "grad_norm": 15.947632789611816, + "learning_rate": 3.812557053962875e-07, + "logits/chosen": -1.072790503501892, + "logits/rejected": -1.041162133216858, + "logps/chosen": -307.6937561035156, + "logps/rejected": -297.15625, + "loss": 0.2179, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.70806884765625, + "rewards/margins": 3.767627000808716, + "rewards/rejected": -5.477734565734863, + "step": 4660 + }, + { + "epoch": 1.1774808558913434, + "grad_norm": 40.98158264160156, + "learning_rate": 3.8063089497053713e-07, + "logits/chosen": -1.0684082508087158, + "logits/rejected": -1.1115906238555908, + "logps/chosen": -282.6499938964844, + "logps/rejected": -317.4125061035156, + "loss": 0.1868, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.52655029296875, + "rewards/margins": 3.707080125808716, + "rewards/rejected": -5.233300685882568, + "step": 4670 + }, + { + "epoch": 1.1800018907761636, + "grad_norm": 41.0838623046875, + "learning_rate": 3.80004960124659e-07, + "logits/chosen": -1.1486327648162842, + "logits/rejected": -1.0456054210662842, + "logps/chosen": -295.26251220703125, + "logps/rejected": -307.0375061035156, + "loss": 0.1645, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": 
-1.7212646007537842, + "rewards/margins": 3.996386766433716, + "rewards/rejected": -5.716406345367432, + "step": 4680 + }, + { + "epoch": 1.182522925660984, + "grad_norm": 41.059024810791016, + "learning_rate": 3.7937790624645776e-07, + "logits/chosen": -1.188085913658142, + "logits/rejected": -1.183569312095642, + "logps/chosen": -320.6031188964844, + "logps/rejected": -301.5, + "loss": 0.2758, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.802056908607483, + "rewards/margins": 3.351367235183716, + "rewards/rejected": -5.156298637390137, + "step": 4690 + }, + { + "epoch": 1.185043960545804, + "grad_norm": 39.89297866821289, + "learning_rate": 3.7874973873337026e-07, + "logits/chosen": -1.094244360923767, + "logits/rejected": -1.1009094715118408, + "logps/chosen": -320.04998779296875, + "logps/rejected": -341.0375061035156, + "loss": 0.2047, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5172607898712158, + "rewards/margins": 3.868945360183716, + "rewards/rejected": -5.386328220367432, + "step": 4700 + }, + { + "epoch": 1.1875649954306242, + "grad_norm": 46.3644905090332, + "learning_rate": 3.78120462992419e-07, + "logits/chosen": -1.1583251953125, + "logits/rejected": -1.1174805164337158, + "logps/chosen": -300.0375061035156, + "logps/rejected": -312.67498779296875, + "loss": 0.2217, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.545629858970642, + "rewards/margins": 3.8341307640075684, + "rewards/rejected": -5.380078315734863, + "step": 4710 + }, + { + "epoch": 1.1900860303154446, + "grad_norm": 27.166078567504883, + "learning_rate": 3.774900844401657e-07, + "logits/chosen": -1.1233947277069092, + "logits/rejected": -1.1427733898162842, + "logps/chosen": -300.64373779296875, + "logps/rejected": -331.9750061035156, + "loss": 0.1788, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.1992919445037842, + "rewards/margins": 4.047656059265137, + "rewards/rejected": -5.247754096984863, + "step": 4720 + }, 
+ { + "epoch": 1.1926070652002647, + "grad_norm": 34.90217971801758, + "learning_rate": 3.768586085026648e-07, + "logits/chosen": -1.188079833984375, + "logits/rejected": -1.186242699623108, + "logps/chosen": -292.1625061035156, + "logps/rejected": -333.8062438964844, + "loss": 0.2236, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.3991210460662842, + "rewards/margins": 4.101513862609863, + "rewards/rejected": -5.499413967132568, + "step": 4730 + }, + { + "epoch": 1.1951281000850849, + "grad_norm": 37.92390823364258, + "learning_rate": 3.7622604061541646e-07, + "logits/chosen": -1.107110619544983, + "logits/rejected": -1.143762230873108, + "logps/chosen": -290.91876220703125, + "logps/rejected": -305.4624938964844, + "loss": 0.1878, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.101324439048767, + "rewards/margins": 3.680908203125, + "rewards/rejected": -4.785742282867432, + "step": 4740 + }, + { + "epoch": 1.1976491349699052, + "grad_norm": 48.947086334228516, + "learning_rate": 3.755923862233199e-07, + "logits/chosen": -1.1432373523712158, + "logits/rejected": -1.1442382335662842, + "logps/chosen": -269.4125061035156, + "logps/rejected": -282.82501220703125, + "loss": 0.2411, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.0486876964569092, + "rewards/margins": 3.489990234375, + "rewards/rejected": -4.5390625, + "step": 4750 + }, + { + "epoch": 1.2001701698547254, + "grad_norm": 39.559715270996094, + "learning_rate": 3.7495765078062653e-07, + "logits/chosen": -1.140527367591858, + "logits/rejected": -1.118432641029358, + "logps/chosen": -300.84375, + "logps/rejected": -302.29998779296875, + "loss": 0.2145, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.0033690929412842, + "rewards/margins": 3.4307618141174316, + "rewards/rejected": -4.433789253234863, + "step": 4760 + }, + { + "epoch": 1.2026912047395455, + "grad_norm": 49.91185760498047, + "learning_rate": 3.7432183975089326e-07, + 
"logits/chosen": -1.210205078125, + "logits/rejected": -1.1429321765899658, + "logps/chosen": -302.67498779296875, + "logps/rejected": -301.375, + "loss": 0.1675, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.054997205734253, + "rewards/margins": 3.777148485183716, + "rewards/rejected": -4.833691596984863, + "step": 4770 + }, + { + "epoch": 1.2052122396243659, + "grad_norm": 41.0870475769043, + "learning_rate": 3.7368495860693493e-07, + "logits/chosen": -1.200048804283142, + "logits/rejected": -1.1798095703125, + "logps/chosen": -293.3812561035156, + "logps/rejected": -326.7749938964844, + "loss": 0.1553, + "rewards/accuracies": 0.9468749761581421, + "rewards/chosen": -0.9073547124862671, + "rewards/margins": 4.201464653015137, + "rewards/rejected": -5.106543064117432, + "step": 4780 + }, + { + "epoch": 1.207733274509186, + "grad_norm": 60.1196403503418, + "learning_rate": 3.730470128307778e-07, + "logits/chosen": -1.1578384637832642, + "logits/rejected": -1.140051245689392, + "logps/chosen": -285.04376220703125, + "logps/rejected": -319.7124938964844, + "loss": 0.2429, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.4465453624725342, + "rewards/margins": 3.917773485183716, + "rewards/rejected": -5.362402439117432, + "step": 4790 + }, + { + "epoch": 1.2102543093940064, + "grad_norm": 41.10110092163086, + "learning_rate": 3.7240800791361176e-07, + "logits/chosen": -1.1771361827850342, + "logits/rejected": -1.1097290515899658, + "logps/chosen": -304.59375, + "logps/rejected": -302.625, + "loss": 0.213, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.100122094154358, + "rewards/margins": 4.222363471984863, + "rewards/rejected": -5.322070121765137, + "step": 4800 + }, + { + "epoch": 1.2127753442788265, + "grad_norm": 49.76337432861328, + "learning_rate": 3.717679493557437e-07, + "logits/chosen": -1.167504906654358, + "logits/rejected": -1.1450592279434204, + "logps/chosen": -295.32501220703125, + 
"logps/rejected": -322.9750061035156, + "loss": 0.2393, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -0.7194992303848267, + "rewards/margins": 4.076074123382568, + "rewards/rejected": -4.795702934265137, + "step": 4810 + }, + { + "epoch": 1.2152963791636466, + "grad_norm": 35.84429168701172, + "learning_rate": 3.7112684266654954e-07, + "logits/chosen": -1.1913330554962158, + "logits/rejected": -1.170629858970642, + "logps/chosen": -299.88751220703125, + "logps/rejected": -293.16876220703125, + "loss": 0.2405, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.5069335699081421, + "rewards/margins": 3.823779344558716, + "rewards/rejected": -4.329345703125, + "step": 4820 + }, + { + "epoch": 1.2178174140484668, + "grad_norm": 39.85462188720703, + "learning_rate": 3.7048469336442735e-07, + "logits/chosen": -1.1529052257537842, + "logits/rejected": -1.0819122791290283, + "logps/chosen": -289.34375, + "logps/rejected": -320.7749938964844, + "loss": 0.1778, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.807940661907196, + "rewards/margins": 4.0791015625, + "rewards/rejected": -4.886620998382568, + "step": 4830 + }, + { + "epoch": 1.2203384489332871, + "grad_norm": 50.39110565185547, + "learning_rate": 3.698415069767494e-07, + "logits/chosen": -1.2173583507537842, + "logits/rejected": -1.1823852062225342, + "logps/chosen": -303.1000061035156, + "logps/rejected": -301.1000061035156, + "loss": 0.2131, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.1938186883926392, + "rewards/margins": 3.8489990234375, + "rewards/rejected": -5.044238090515137, + "step": 4840 + }, + { + "epoch": 1.2228594838181073, + "grad_norm": 54.38093185424805, + "learning_rate": 3.69197289039815e-07, + "logits/chosen": -1.1825439929962158, + "logits/rejected": -1.17578125, + "logps/chosen": -293.3374938964844, + "logps/rejected": -329.4750061035156, + "loss": 0.2147, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.0760619640350342, + "rewards/margins": 
3.711376905441284, + "rewards/rejected": -4.787304878234863, + "step": 4850 + }, + { + "epoch": 1.2253805187029276, + "grad_norm": 36.31102752685547, + "learning_rate": 3.6855204509880243e-07, + "logits/chosen": -1.126806616783142, + "logits/rejected": -1.089715600013733, + "logps/chosen": -289.5062561035156, + "logps/rejected": -301.83123779296875, + "loss": 0.1686, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2572510242462158, + "rewards/margins": 4.262890815734863, + "rewards/rejected": -5.521288871765137, + "step": 4860 + }, + { + "epoch": 1.2279015535877478, + "grad_norm": 35.117347717285156, + "learning_rate": 3.6790578070772166e-07, + "logits/chosen": -1.216943383216858, + "logits/rejected": -1.17559814453125, + "logps/chosen": -285.9125061035156, + "logps/rejected": -316.5625, + "loss": 0.1947, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.0204894542694092, + "rewards/margins": 4.046972751617432, + "rewards/rejected": -5.066113471984863, + "step": 4870 + }, + { + "epoch": 1.230422588472568, + "grad_norm": 59.419010162353516, + "learning_rate": 3.672585014293661e-07, + "logits/chosen": -1.1372191905975342, + "logits/rejected": -1.0812866687774658, + "logps/chosen": -314.4750061035156, + "logps/rejected": -336.4375, + "loss": 0.1868, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.260986328125, + "rewards/margins": 4.205786228179932, + "rewards/rejected": -5.466992378234863, + "step": 4880 + }, + { + "epoch": 1.2329436233573883, + "grad_norm": 44.0726203918457, + "learning_rate": 3.666102128352649e-07, + "logits/chosen": -1.19561767578125, + "logits/rejected": -1.107458472251892, + "logps/chosen": -293.45001220703125, + "logps/rejected": -290.95001220703125, + "loss": 0.1788, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.756848156452179, + "rewards/margins": 4.030517578125, + "rewards/rejected": -4.79052734375, + "step": 4890 + }, + { + "epoch": 1.2354646582422084, + "grad_norm": 
71.85173034667969, + "learning_rate": 3.6596092050563513e-07, + "logits/chosen": -1.206689476966858, + "logits/rejected": -1.11602783203125, + "logps/chosen": -301.375, + "logps/rejected": -335.359375, + "loss": 0.2441, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.9993346929550171, + "rewards/margins": 3.8375487327575684, + "rewards/rejected": -4.838183403015137, + "step": 4900 + }, + { + "epoch": 1.2379856931270286, + "grad_norm": 37.4676399230957, + "learning_rate": 3.653106300293336e-07, + "logits/chosen": -1.1802246570587158, + "logits/rejected": -1.162255883216858, + "logps/chosen": -308.85626220703125, + "logps/rejected": -314.5, + "loss": 0.2388, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -0.76812744140625, + "rewards/margins": 3.83447265625, + "rewards/rejected": -4.603711128234863, + "step": 4910 + }, + { + "epoch": 1.240506728011849, + "grad_norm": 31.14033317565918, + "learning_rate": 3.6465934700380873e-07, + "logits/chosen": -1.173559546470642, + "logits/rejected": null, + "logps/chosen": -314.70001220703125, + "logps/rejected": -307.5874938964844, + "loss": 0.1561, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.8431457281112671, + "rewards/margins": 3.899951219558716, + "rewards/rejected": -4.7431640625, + "step": 4920 + }, + { + "epoch": 1.243027762896669, + "grad_norm": 35.24359893798828, + "learning_rate": 3.640070770350524e-07, + "logits/chosen": -1.168951392173767, + "logits/rejected": -1.149084448814392, + "logps/chosen": -298.35626220703125, + "logps/rejected": -303.17498779296875, + "loss": 0.183, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.4486083984375, + "rewards/margins": 3.719006299972534, + "rewards/rejected": -5.167870998382568, + "step": 4930 + }, + { + "epoch": 1.2455487977814892, + "grad_norm": 73.50921630859375, + "learning_rate": 3.633538257375519e-07, + "logits/chosen": -1.2502930164337158, + "logits/rejected": -1.245141625404358, + "logps/chosen": 
-314.8500061035156, + "logps/rejected": -315.08123779296875, + "loss": 0.2339, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.270166039466858, + "rewards/margins": 3.577587842941284, + "rewards/rejected": -4.847826957702637, + "step": 4940 + }, + { + "epoch": 1.2480698326663096, + "grad_norm": 43.814964294433594, + "learning_rate": 3.626995987342412e-07, + "logits/chosen": -1.182275414466858, + "logits/rejected": -1.1380615234375, + "logps/chosen": -288.65625, + "logps/rejected": -313.6000061035156, + "loss": 0.2684, + "rewards/accuracies": 0.8843749761581421, + "rewards/chosen": -0.8729751706123352, + "rewards/margins": 3.7605957984924316, + "rewards/rejected": -4.633496284484863, + "step": 4950 + }, + { + "epoch": 1.2505908675511297, + "grad_norm": 54.624507904052734, + "learning_rate": 3.620444016564528e-07, + "logits/chosen": -1.147241234779358, + "logits/rejected": -1.146948218345642, + "logps/chosen": -284.2749938964844, + "logps/rejected": -300.23126220703125, + "loss": 0.228, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.9929214715957642, + "rewards/margins": 3.8893799781799316, + "rewards/rejected": -4.886034965515137, + "step": 4960 + }, + { + "epoch": 1.25311190243595, + "grad_norm": 52.134803771972656, + "learning_rate": 3.6138824014386945e-07, + "logits/chosen": -1.160058617591858, + "logits/rejected": -1.15692138671875, + "logps/chosen": -288.23126220703125, + "logps/rejected": -314.20001220703125, + "loss": 0.1854, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8441619873046875, + "rewards/margins": 4.047338962554932, + "rewards/rejected": -4.894335746765137, + "step": 4970 + }, + { + "epoch": 1.2556329373207702, + "grad_norm": 33.152957916259766, + "learning_rate": 3.6073111984447497e-07, + "logits/chosen": -1.1812744140625, + "logits/rejected": -1.164697289466858, + "logps/chosen": -303.39373779296875, + "logps/rejected": -310.0, + "loss": 0.2534, + "rewards/accuracies": 0.903124988079071, + 
"rewards/chosen": -0.86151123046875, + "rewards/margins": 3.7456297874450684, + "rewards/rejected": -4.605175971984863, + "step": 4980 + }, + { + "epoch": 1.2581539722055903, + "grad_norm": 32.13079071044922, + "learning_rate": 3.600730464145064e-07, + "logits/chosen": -1.172521948814392, + "logits/rejected": -1.151885986328125, + "logps/chosen": -304.4468688964844, + "logps/rejected": -310.79998779296875, + "loss": 0.2877, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -0.9182235598564148, + "rewards/margins": 3.5555176734924316, + "rewards/rejected": -4.47412109375, + "step": 4990 + }, + { + "epoch": 1.2606750070904107, + "grad_norm": 25.800579071044922, + "learning_rate": 3.594140255184048e-07, + "logits/chosen": -1.1635010242462158, + "logits/rejected": -1.1095459461212158, + "logps/chosen": -269.71875, + "logps/rejected": -281.1875, + "loss": 0.2093, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -0.6491439938545227, + "rewards/margins": 3.390625, + "rewards/rejected": -4.040600776672363, + "step": 5000 + }, + { + "epoch": 1.2631960419752308, + "grad_norm": 22.663894653320312, + "learning_rate": 3.5875406282876676e-07, + "logits/chosen": -1.180810570716858, + "logits/rejected": -1.2098388671875, + "logps/chosen": -306.36248779296875, + "logps/rejected": -308.0874938964844, + "loss": 0.2132, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -0.8019149899482727, + "rewards/margins": 3.5633301734924316, + "rewards/rejected": -4.363476753234863, + "step": 5010 + }, + { + "epoch": 1.265717076860051, + "grad_norm": 29.97337532043457, + "learning_rate": 3.5809316402629533e-07, + "logits/chosen": -1.1132690906524658, + "logits/rejected": -1.1020996570587158, + "logps/chosen": -282.05938720703125, + "logps/rejected": -291.78125, + "loss": 0.2372, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.367584228515625, + "rewards/margins": 3.456860303878784, + "rewards/rejected": -4.825781345367432, + "step": 5020 + }, + { 
+ "epoch": 1.2682381117448713, + "grad_norm": 22.39763832092285, + "learning_rate": 3.5743133479975137e-07, + "logits/chosen": -1.1588592529296875, + "logits/rejected": -1.1710937023162842, + "logps/chosen": -303.28125, + "logps/rejected": -323.33123779296875, + "loss": 0.1932, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.229089379310608, + "rewards/margins": 3.6304688453674316, + "rewards/rejected": -4.860058784484863, + "step": 5030 + }, + { + "epoch": 1.2707591466296915, + "grad_norm": 54.3851432800293, + "learning_rate": 3.567685808459044e-07, + "logits/chosen": -1.1513671875, + "logits/rejected": -1.0747467279434204, + "logps/chosen": -274.8374938964844, + "logps/rejected": -301.0062561035156, + "loss": 0.1905, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.5656249523162842, + "rewards/margins": 3.6270508766174316, + "rewards/rejected": -5.195508003234863, + "step": 5040 + }, + { + "epoch": 1.2732801815145116, + "grad_norm": 41.404754638671875, + "learning_rate": 3.5610490786948353e-07, + "logits/chosen": -1.1798827648162842, + "logits/rejected": -1.154943823814392, + "logps/chosen": -297.7875061035156, + "logps/rejected": -317.70623779296875, + "loss": 0.1926, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.4342010021209717, + "rewards/margins": 3.7634034156799316, + "rewards/rejected": -5.200585842132568, + "step": 5050 + }, + { + "epoch": 1.275801216399332, + "grad_norm": 33.274147033691406, + "learning_rate": 3.5544032158312883e-07, + "logits/chosen": -1.1477782726287842, + "logits/rejected": -1.0680663585662842, + "logps/chosen": -295.86248779296875, + "logps/rejected": -328.8500061035156, + "loss": 0.1995, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.2218414545059204, + "rewards/margins": 3.9695801734924316, + "rewards/rejected": -5.192285060882568, + "step": 5060 + }, + { + "epoch": 1.2783222512841521, + "grad_norm": 47.26895523071289, + "learning_rate": 3.5477482770734137e-07, + 
"logits/chosen": -1.1197388172149658, + "logits/rejected": -1.0371825695037842, + "logps/chosen": -276.1312561035156, + "logps/rejected": -292.6000061035156, + "loss": 0.1895, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4241516590118408, + "rewards/margins": 3.542285203933716, + "rewards/rejected": -4.967089653015137, + "step": 5070 + }, + { + "epoch": 1.2808432861689725, + "grad_norm": 51.5677375793457, + "learning_rate": 3.5410843197043454e-07, + "logits/chosen": -1.1300780773162842, + "logits/rejected": -1.0869140625, + "logps/chosen": -307.4125061035156, + "logps/rejected": -321.6187438964844, + "loss": 0.2179, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.6571533679962158, + "rewards/margins": 3.7133545875549316, + "rewards/rejected": -5.372460842132568, + "step": 5080 + }, + { + "epoch": 1.2833643210537926, + "grad_norm": 32.997013092041016, + "learning_rate": 3.534411401084848e-07, + "logits/chosen": -1.159387230873108, + "logits/rejected": -1.157080054283142, + "logps/chosen": -300.1312561035156, + "logps/rejected": -300.98748779296875, + "loss": 0.202, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.5150359869003296, + "rewards/margins": 3.515869140625, + "rewards/rejected": -5.030077934265137, + "step": 5090 + }, + { + "epoch": 1.2858853559386128, + "grad_norm": 54.570343017578125, + "learning_rate": 3.5277295786528183e-07, + "logits/chosen": -1.150732398033142, + "logits/rejected": -1.17657470703125, + "logps/chosen": -275.2124938964844, + "logps/rejected": -297.0625, + "loss": 0.1461, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -1.5175079107284546, + "rewards/margins": 4.005663871765137, + "rewards/rejected": -5.522558689117432, + "step": 5100 + }, + { + "epoch": 1.288406390823433, + "grad_norm": 35.95811080932617, + "learning_rate": 3.521038909922794e-07, + "logits/chosen": -1.0597655773162842, + "logits/rejected": -1.049890160560608, + "logps/chosen": -281.54998779296875, + 
"logps/rejected": -328.9437561035156, + "loss": 0.2522, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8772491216659546, + "rewards/margins": 3.99951171875, + "rewards/rejected": -5.873827934265137, + "step": 5110 + }, + { + "epoch": 1.2909274257082533, + "grad_norm": 40.40537643432617, + "learning_rate": 3.5143394524854613e-07, + "logits/chosen": -1.1229889392852783, + "logits/rejected": -1.109014868736267, + "logps/chosen": -314.7875061035156, + "logps/rejected": -323.8125, + "loss": 0.2066, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.917639136314392, + "rewards/margins": 3.8738770484924316, + "rewards/rejected": -5.791015625, + "step": 5120 + }, + { + "epoch": 1.2934484605930734, + "grad_norm": 20.03754425048828, + "learning_rate": 3.5076312640071515e-07, + "logits/chosen": -1.137481689453125, + "logits/rejected": -1.1807434558868408, + "logps/chosen": -308.21875, + "logps/rejected": -331.3500061035156, + "loss": 0.1823, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.027453660964966, + "rewards/margins": 4.025097846984863, + "rewards/rejected": -6.052636623382568, + "step": 5130 + }, + { + "epoch": 1.2959694954778938, + "grad_norm": 44.395782470703125, + "learning_rate": 3.5009144022293533e-07, + "logits/chosen": -1.202172875404358, + "logits/rejected": -1.1327636241912842, + "logps/chosen": -336.58123779296875, + "logps/rejected": -338.54998779296875, + "loss": 0.2106, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.85870361328125, + "rewards/margins": 4.142480373382568, + "rewards/rejected": -6.0029296875, + "step": 5140 + }, + { + "epoch": 1.298490530362714, + "grad_norm": 68.62153625488281, + "learning_rate": 3.4941889249682095e-07, + "logits/chosen": -1.1988646984100342, + "logits/rejected": -1.1822509765625, + "logps/chosen": -312.5249938964844, + "logps/rejected": -334.26251220703125, + "loss": 0.1575, + "rewards/accuracies": 0.9468749761581421, + "rewards/chosen": -2.0000457763671875, + 
"rewards/margins": 4.140576362609863, + "rewards/rejected": -6.140234470367432, + "step": 5150 + }, + { + "epoch": 1.301011565247534, + "grad_norm": 34.79885482788086, + "learning_rate": 3.487454890114023e-07, + "logits/chosen": -1.132080078125, + "logits/rejected": -1.1778686046600342, + "logps/chosen": -294.10626220703125, + "logps/rejected": -334.5, + "loss": 0.1483, + "rewards/accuracies": 0.9468749761581421, + "rewards/chosen": -1.72509765625, + "rewards/margins": 4.214453220367432, + "rewards/rejected": -5.939648628234863, + "step": 5160 + }, + { + "epoch": 1.3035326001323544, + "grad_norm": 42.98704147338867, + "learning_rate": 3.480712355630757e-07, + "logits/chosen": -1.1443359851837158, + "logits/rejected": -1.146215796470642, + "logps/chosen": -325.26251220703125, + "logps/rejected": -336.8125, + "loss": 0.2341, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.732019066810608, + "rewards/margins": 3.8372559547424316, + "rewards/rejected": -5.572070121765137, + "step": 5170 + }, + { + "epoch": 1.3060536350171745, + "grad_norm": 29.974592208862305, + "learning_rate": 3.4739613795555345e-07, + "logits/chosen": -1.1156005859375, + "logits/rejected": -1.0713622570037842, + "logps/chosen": -295.01873779296875, + "logps/rejected": -321.1875, + "loss": 0.2239, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.9411742687225342, + "rewards/margins": 3.8954100608825684, + "rewards/rejected": -5.833593845367432, + "step": 5180 + }, + { + "epoch": 1.308574669901995, + "grad_norm": 24.7445068359375, + "learning_rate": 3.4672020199981414e-07, + "logits/chosen": -1.097814917564392, + "logits/rejected": -1.0927612781524658, + "logps/chosen": -301.0093688964844, + "logps/rejected": -303.35626220703125, + "loss": 0.1724, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.75927734375, + "rewards/margins": 3.8087401390075684, + "rewards/rejected": -5.568163871765137, + "step": 5190 + }, + { + "epoch": 1.311095704786815, + "grad_norm": 
58.359310150146484, + "learning_rate": 3.4604343351405276e-07, + "logits/chosen": -1.1069214344024658, + "logits/rejected": -1.1426513195037842, + "logps/chosen": -315.3812561035156, + "logps/rejected": -330.57501220703125, + "loss": 0.2514, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.066546678543091, + "rewards/margins": 3.8038086891174316, + "rewards/rejected": -5.871679782867432, + "step": 5200 + }, + { + "epoch": 1.3136167396716352, + "grad_norm": 49.104549407958984, + "learning_rate": 3.4536583832363e-07, + "logits/chosen": -1.120825171470642, + "logits/rejected": -1.0521240234375, + "logps/chosen": -270.88751220703125, + "logps/rejected": -318.5249938964844, + "loss": 0.2491, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.7755858898162842, + "rewards/margins": 3.6437010765075684, + "rewards/rejected": -5.419921875, + "step": 5210 + }, + { + "epoch": 1.3161377745564553, + "grad_norm": 23.951229095458984, + "learning_rate": 3.4468742226102285e-07, + "logits/chosen": -1.1461303234100342, + "logits/rejected": -1.1341063976287842, + "logps/chosen": -278.4437561035156, + "logps/rejected": -297.8999938964844, + "loss": 0.1853, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.2996337413787842, + "rewards/margins": 3.734326124191284, + "rewards/rejected": -5.033398628234863, + "step": 5220 + }, + { + "epoch": 1.3186588094412757, + "grad_norm": 29.159059524536133, + "learning_rate": 3.44008191165774e-07, + "logits/chosen": -1.1116211414337158, + "logits/rejected": -1.099633812904358, + "logps/chosen": -313.6312561035156, + "logps/rejected": -331.13751220703125, + "loss": 0.2361, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.46197509765625, + "rewards/margins": 3.6006836891174316, + "rewards/rejected": -5.063086032867432, + "step": 5230 + }, + { + "epoch": 1.3211798443260958, + "grad_norm": 37.64070510864258, + "learning_rate": 3.4332815088444126e-07, + "logits/chosen": -1.108544945716858, + 
"logits/rejected": -1.1564819812774658, + "logps/chosen": -308.5375061035156, + "logps/rejected": -323.32501220703125, + "loss": 0.1673, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8876953125, + "rewards/margins": 3.787109375, + "rewards/rejected": -5.674023628234863, + "step": 5240 + }, + { + "epoch": 1.3237008792109162, + "grad_norm": 20.202035903930664, + "learning_rate": 3.4264730727054813e-07, + "logits/chosen": -1.107452392578125, + "logits/rejected": -1.107476830482483, + "logps/chosen": -297.9937438964844, + "logps/rejected": -323.1000061035156, + "loss": 0.2111, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.939056396484375, + "rewards/margins": 3.8095703125, + "rewards/rejected": -5.746289253234863, + "step": 5250 + }, + { + "epoch": 1.3262219140957363, + "grad_norm": 47.521690368652344, + "learning_rate": 3.4196566618453236e-07, + "logits/chosen": -1.03961181640625, + "logits/rejected": -1.011254906654358, + "logps/chosen": -304.03125, + "logps/rejected": -335.6875, + "loss": 0.207, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5875823497772217, + "rewards/margins": 3.8901000022888184, + "rewards/rejected": -5.474413871765137, + "step": 5260 + }, + { + "epoch": 1.3287429489805564, + "grad_norm": 57.11899185180664, + "learning_rate": 3.4128323349369657e-07, + "logits/chosen": -1.1620604991912842, + "logits/rejected": -1.0976440906524658, + "logps/chosen": -332.54998779296875, + "logps/rejected": -335.1875, + "loss": 0.1966, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4975097179412842, + "rewards/margins": 4.015295505523682, + "rewards/rejected": -5.511816501617432, + "step": 5270 + }, + { + "epoch": 1.3312639838653768, + "grad_norm": 71.34944915771484, + "learning_rate": 3.4060001507215675e-07, + "logits/chosen": -1.1159179210662842, + "logits/rejected": -1.084747314453125, + "logps/chosen": -307.73748779296875, + "logps/rejected": -319.57501220703125, + "loss": 0.2152, + 
"rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.62872314453125, + "rewards/margins": 3.6752686500549316, + "rewards/rejected": -5.306445121765137, + "step": 5280 + }, + { + "epoch": 1.333785018750197, + "grad_norm": 29.985002517700195, + "learning_rate": 3.399160168007924e-07, + "logits/chosen": -1.1326415538787842, + "logits/rejected": -1.0793578624725342, + "logps/chosen": -302.38751220703125, + "logps/rejected": -297.15625, + "loss": 0.1693, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.788507103919983, + "rewards/margins": 3.4935059547424316, + "rewards/rejected": -5.283789157867432, + "step": 5290 + }, + { + "epoch": 1.3363060536350173, + "grad_norm": 28.097501754760742, + "learning_rate": 3.392312445671957e-07, + "logits/chosen": -1.118872046470642, + "logits/rejected": -1.0843017101287842, + "logps/chosen": -320.6625061035156, + "logps/rejected": -348.6625061035156, + "loss": 0.2394, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.546484351158142, + "rewards/margins": 3.837890625, + "rewards/rejected": -5.383008003234863, + "step": 5300 + }, + { + "epoch": 1.3388270885198374, + "grad_norm": 49.60430145263672, + "learning_rate": 3.385457042656206e-07, + "logits/chosen": -1.1826660633087158, + "logits/rejected": -1.0979735851287842, + "logps/chosen": -320.4437561035156, + "logps/rejected": -331.0562438964844, + "loss": 0.2391, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.9879882335662842, + "rewards/margins": 3.7488770484924316, + "rewards/rejected": -5.738965034484863, + "step": 5310 + }, + { + "epoch": 1.3413481234046576, + "grad_norm": 53.35076141357422, + "learning_rate": 3.378594017969324e-07, + "logits/chosen": -1.087255835533142, + "logits/rejected": -1.0226256847381592, + "logps/chosen": -296.01873779296875, + "logps/rejected": -303.0874938964844, + "loss": 0.2151, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -2.108654737472534, + "rewards/margins": 
3.589794874191284, + "rewards/rejected": -5.699804782867432, + "step": 5320 + }, + { + "epoch": 1.3438691582894777, + "grad_norm": 70.57498931884766, + "learning_rate": 3.3717234306855686e-07, + "logits/chosen": -1.175225853919983, + "logits/rejected": -1.122869849205017, + "logps/chosen": -318.7718811035156, + "logps/rejected": -327.54376220703125, + "loss": 0.2535, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4980590343475342, + "rewards/margins": 3.7567381858825684, + "rewards/rejected": -5.253808498382568, + "step": 5330 + }, + { + "epoch": 1.346390193174298, + "grad_norm": 49.551551818847656, + "learning_rate": 3.364845339944292e-07, + "logits/chosen": null, + "logits/rejected": -1.133337378501892, + "logps/chosen": -308.70001220703125, + "logps/rejected": -317.11248779296875, + "loss": 0.2449, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.0967658758163452, + "rewards/margins": 3.7632813453674316, + "rewards/rejected": -4.859667778015137, + "step": 5340 + }, + { + "epoch": 1.3489112280591182, + "grad_norm": 41.36616134643555, + "learning_rate": 3.357959804949435e-07, + "logits/chosen": null, + "logits/rejected": -1.105920433998108, + "logps/chosen": -273.4437561035156, + "logps/rejected": -288.01251220703125, + "loss": 0.1594, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -0.784588634967804, + "rewards/margins": 3.78125, + "rewards/rejected": -4.5654296875, + "step": 5350 + }, + { + "epoch": 1.3514322629439386, + "grad_norm": 60.294315338134766, + "learning_rate": 3.3510668849690155e-07, + "logits/chosen": -1.1477782726287842, + "logits/rejected": -1.159887671470642, + "logps/chosen": -282.60626220703125, + "logps/rejected": -291.04998779296875, + "loss": 0.2097, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.16192626953125, + "rewards/margins": 3.8250489234924316, + "rewards/rejected": -4.98779296875, + "step": 5360 + }, + { + "epoch": 1.3539532978287587, + "grad_norm": 
48.605125427246094, + "learning_rate": 3.3441666393346167e-07, + "logits/chosen": -1.1647217273712158, + "logits/rejected": -1.057653784751892, + "logps/chosen": -293.56561279296875, + "logps/rejected": -302.9375, + "loss": 0.1869, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.273126244544983, + "rewards/margins": 4.179394721984863, + "rewards/rejected": -5.451074123382568, + "step": 5370 + }, + { + "epoch": 1.3564743327135789, + "grad_norm": 30.89805030822754, + "learning_rate": 3.33725912744088e-07, + "logits/chosen": -1.164605736732483, + "logits/rejected": -1.119970679283142, + "logps/chosen": -290.17498779296875, + "logps/rejected": -314.85626220703125, + "loss": 0.2443, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.1137816905975342, + "rewards/margins": 3.905468702316284, + "rewards/rejected": -5.019629001617432, + "step": 5380 + }, + { + "epoch": 1.358995367598399, + "grad_norm": 51.46559524536133, + "learning_rate": 3.330344408744992e-07, + "logits/chosen": -1.0963866710662842, + "logits/rejected": -1.160302758216858, + "logps/chosen": -260.2875061035156, + "logps/rejected": -294.32501220703125, + "loss": 0.2679, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.488922119140625, + "rewards/margins": 3.6673340797424316, + "rewards/rejected": -5.156884670257568, + "step": 5390 + }, + { + "epoch": 1.3615164024832194, + "grad_norm": 70.4235610961914, + "learning_rate": 3.3234225427661697e-07, + "logits/chosen": -1.0990784168243408, + "logits/rejected": -1.1343505382537842, + "logps/chosen": -303.73748779296875, + "logps/rejected": -290.33123779296875, + "loss": 0.3044, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.653594970703125, + "rewards/margins": 3.6046385765075684, + "rewards/rejected": -5.256640434265137, + "step": 5400 + }, + { + "epoch": 1.3640374373680395, + "grad_norm": 63.168914794921875, + "learning_rate": 3.316493589085155e-07, + "logits/chosen": -1.148919701576233, + 
"logits/rejected": -1.170166015625, + "logps/chosen": -284.45001220703125, + "logps/rejected": -297.0, + "loss": 0.249, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.8449890613555908, + "rewards/margins": 3.6565918922424316, + "rewards/rejected": -5.50146484375, + "step": 5410 + }, + { + "epoch": 1.3665584722528599, + "grad_norm": 77.1202392578125, + "learning_rate": 3.3095576073436964e-07, + "logits/chosen": -1.07568359375, + "logits/rejected": -1.0973358154296875, + "logps/chosen": -298.73126220703125, + "logps/rejected": -321.7875061035156, + "loss": 0.3114, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.070343017578125, + "rewards/margins": 3.6612792015075684, + "rewards/rejected": -5.731249809265137, + "step": 5420 + }, + { + "epoch": 1.36907950713768, + "grad_norm": 43.10295867919922, + "learning_rate": 3.3026146572440366e-07, + "logits/chosen": -1.143774390220642, + "logits/rejected": -1.1075561046600342, + "logps/chosen": -306.2875061035156, + "logps/rejected": -294.3999938964844, + "loss": 0.214, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.480169653892517, + "rewards/margins": 3.926074266433716, + "rewards/rejected": -5.407031059265137, + "step": 5430 + }, + { + "epoch": 1.3716005420225001, + "grad_norm": 49.544525146484375, + "learning_rate": 3.295664798548401e-07, + "logits/chosen": -1.1381804943084717, + "logits/rejected": -1.108007788658142, + "logps/chosen": -302.20001220703125, + "logps/rejected": -317.4125061035156, + "loss": 0.1675, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.933416724205017, + "rewards/margins": 4.133349418640137, + "rewards/rejected": -6.068359375, + "step": 5440 + }, + { + "epoch": 1.3741215769073205, + "grad_norm": 52.61790466308594, + "learning_rate": 3.288708091078479e-07, + "logits/chosen": -1.137231469154358, + "logits/rejected": -1.139685034751892, + "logps/chosen": -301.1000061035156, + "logps/rejected": -322.0249938964844, + "loss": 0.2209, + 
"rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.884552001953125, + "rewards/margins": 3.8568358421325684, + "rewards/rejected": -5.739648342132568, + "step": 5450 + }, + { + "epoch": 1.3766426117921406, + "grad_norm": 32.10395431518555, + "learning_rate": 3.281744594714914e-07, + "logits/chosen": -1.072509765625, + "logits/rejected": -1.027063012123108, + "logps/chosen": -343.3999938964844, + "logps/rejected": -319.8500061035156, + "loss": 0.1719, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.499053955078125, + "rewards/margins": 4.1962890625, + "rewards/rejected": -5.696875095367432, + "step": 5460 + }, + { + "epoch": 1.379163646676961, + "grad_norm": 52.091915130615234, + "learning_rate": 3.274774369396783e-07, + "logits/chosen": -1.142431616783142, + "logits/rejected": -1.1350829601287842, + "logps/chosen": -294.6312561035156, + "logps/rejected": -319.4375, + "loss": 0.2173, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.5295836925506592, + "rewards/margins": 3.746264696121216, + "rewards/rejected": -5.27783203125, + "step": 5470 + }, + { + "epoch": 1.3816846815617811, + "grad_norm": 56.385223388671875, + "learning_rate": 3.267797475121087e-07, + "logits/chosen": -1.1925048828125, + "logits/rejected": -1.196374535560608, + "logps/chosen": -277.29376220703125, + "logps/rejected": -308.82501220703125, + "loss": 0.1981, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.1184570789337158, + "rewards/margins": 3.901611328125, + "rewards/rejected": -5.021679878234863, + "step": 5480 + }, + { + "epoch": 1.3842057164466013, + "grad_norm": 32.52472686767578, + "learning_rate": 3.260813971942226e-07, + "logits/chosen": -1.22039794921875, + "logits/rejected": -1.18621826171875, + "logps/chosen": -308.4750061035156, + "logps/rejected": -336.9937438964844, + "loss": 0.2595, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.998486340045929, + "rewards/margins": 3.873974561691284, + "rewards/rejected": 
-4.875781059265137, + "step": 5490 + }, + { + "epoch": 1.3867267513314214, + "grad_norm": 23.37797737121582, + "learning_rate": 3.2538239199714917e-07, + "logits/chosen": -1.09912109375, + "logits/rejected": -1.102990746498108, + "logps/chosen": -266.57501220703125, + "logps/rejected": -297.4125061035156, + "loss": 0.2051, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5279052257537842, + "rewards/margins": 3.7853760719299316, + "rewards/rejected": -5.312890529632568, + "step": 5500 + }, + { + "epoch": 1.3892477862162418, + "grad_norm": 31.321426391601562, + "learning_rate": 3.246827379376542e-07, + "logits/chosen": -1.1439940929412842, + "logits/rejected": -1.192895531654358, + "logps/chosen": -342.1000061035156, + "logps/rejected": -360.1875, + "loss": 0.1509, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.5791015625, + "rewards/margins": 4.104394435882568, + "rewards/rejected": -5.683203220367432, + "step": 5510 + }, + { + "epoch": 1.391768821101062, + "grad_norm": 33.14871597290039, + "learning_rate": 3.239824410380888e-07, + "logits/chosen": -1.122247338294983, + "logits/rejected": -1.107336401939392, + "logps/chosen": -293.85626220703125, + "logps/rejected": -302.35626220703125, + "loss": 0.1789, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7326171398162842, + "rewards/margins": 3.7352538108825684, + "rewards/rejected": -5.467382907867432, + "step": 5520 + }, + { + "epoch": 1.3942898559858823, + "grad_norm": 47.060447692871094, + "learning_rate": 3.232815073263372e-07, + "logits/chosen": -1.175439476966858, + "logits/rejected": -1.114465355873108, + "logps/chosen": -300.63751220703125, + "logps/rejected": -323.3125, + "loss": 0.2601, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.6728515625, + "rewards/margins": 3.818798780441284, + "rewards/rejected": -5.490234375, + "step": 5530 + }, + { + "epoch": 1.3968108908707024, + "grad_norm": 59.03270721435547, + "learning_rate": 
3.225799428357652e-07, + "logits/chosen": -1.1072266101837158, + "logits/rejected": -1.101220726966858, + "logps/chosen": -297.4125061035156, + "logps/rejected": -321.3062438964844, + "loss": 0.2506, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.2929320335388184, + "rewards/margins": 3.776440382003784, + "rewards/rejected": -6.070703029632568, + "step": 5540 + }, + { + "epoch": 1.3993319257555226, + "grad_norm": 29.274913787841797, + "learning_rate": 3.2187775360516827e-07, + "logits/chosen": -1.188848853111267, + "logits/rejected": -1.0924255847930908, + "logps/chosen": -338.9375, + "logps/rejected": -330.6000061035156, + "loss": 0.2505, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.454846143722534, + "rewards/margins": 3.9281249046325684, + "rewards/rejected": -6.381445407867432, + "step": 5550 + }, + { + "epoch": 1.401852960640343, + "grad_norm": 27.96847915649414, + "learning_rate": 3.2117494567871914e-07, + "logits/chosen": -1.184326171875, + "logits/rejected": -1.112554907798767, + "logps/chosen": -298.70001220703125, + "logps/rejected": -316.46875, + "loss": 0.1937, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.985376000404358, + "rewards/margins": 3.8896484375, + "rewards/rejected": -5.876367092132568, + "step": 5560 + }, + { + "epoch": 1.404373995525163, + "grad_norm": 45.046634674072266, + "learning_rate": 3.20471525105916e-07, + "logits/chosen": -1.160980224609375, + "logits/rejected": -1.121740698814392, + "logps/chosen": -296.8062438964844, + "logps/rejected": -306.4125061035156, + "loss": 0.1803, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.79638671875, + "rewards/margins": 4.122363090515137, + "rewards/rejected": -5.918359279632568, + "step": 5570 + }, + { + "epoch": 1.4068950304099834, + "grad_norm": 58.81211853027344, + "learning_rate": 3.197674979415308e-07, + "logits/chosen": -1.2167479991912842, + "logits/rejected": -1.166601538658142, + "logps/chosen": -328.6312561035156, 
+ "logps/rejected": -336.8500061035156, + "loss": 0.1951, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.554895043373108, + "rewards/margins": 4.173193454742432, + "rewards/rejected": -5.726758003234863, + "step": 5580 + }, + { + "epoch": 1.4094160652948036, + "grad_norm": 48.055259704589844, + "learning_rate": 3.190628702455565e-07, + "logits/chosen": -1.20501708984375, + "logits/rejected": -1.149023413658142, + "logps/chosen": -309.70623779296875, + "logps/rejected": -308.89373779296875, + "loss": 0.2834, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6253143548965454, + "rewards/margins": 3.464245557785034, + "rewards/rejected": -5.089062690734863, + "step": 5590 + }, + { + "epoch": 1.4119371001796237, + "grad_norm": 25.23592185974121, + "learning_rate": 3.183576480831551e-07, + "logits/chosen": -1.10205078125, + "logits/rejected": -1.1086914539337158, + "logps/chosen": -293.6875, + "logps/rejected": -317.8374938964844, + "loss": 0.2192, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.4288756847381592, + "rewards/margins": 3.617626905441284, + "rewards/rejected": -5.044140815734863, + "step": 5600 + }, + { + "epoch": 1.4144581350644438, + "grad_norm": 32.868778228759766, + "learning_rate": 3.17651837524606e-07, + "logits/chosen": -1.1915283203125, + "logits/rejected": -1.1032593250274658, + "logps/chosen": -302.6625061035156, + "logps/rejected": -319.7875061035156, + "loss": 0.2123, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.565393090248108, + "rewards/margins": 3.9247803688049316, + "rewards/rejected": -5.489062309265137, + "step": 5610 + }, + { + "epoch": 1.4169791699492642, + "grad_norm": 56.18616485595703, + "learning_rate": 3.1694544464525274e-07, + "logits/chosen": -1.14697265625, + "logits/rejected": -1.1057250499725342, + "logps/chosen": -292.6625061035156, + "logps/rejected": -316.76251220703125, + "loss": 0.1597, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": 
-1.793402075767517, + "rewards/margins": 3.890576124191284, + "rewards/rejected": -5.682226657867432, + "step": 5620 + }, + { + "epoch": 1.4195002048340843, + "grad_norm": 20.569725036621094, + "learning_rate": 3.162384755254517e-07, + "logits/chosen": -1.1304199695587158, + "logits/rejected": -1.0923950672149658, + "logps/chosen": -304.5687561035156, + "logps/rejected": -316.8500061035156, + "loss": 0.1808, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.7995116710662842, + "rewards/margins": 4.0400390625, + "rewards/rejected": -5.840039253234863, + "step": 5630 + }, + { + "epoch": 1.4220212397189047, + "grad_norm": 46.42686462402344, + "learning_rate": 3.155309362505191e-07, + "logits/chosen": -1.080780029296875, + "logits/rejected": -1.074194312095642, + "logps/chosen": -266.5093688964844, + "logps/rejected": -285.66876220703125, + "loss": 0.2078, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6823608875274658, + "rewards/margins": 3.513378858566284, + "rewards/rejected": -5.196484565734863, + "step": 5640 + }, + { + "epoch": 1.4245422746037248, + "grad_norm": 38.89899444580078, + "learning_rate": 3.1482283291067886e-07, + "logits/chosen": -1.1553223133087158, + "logits/rejected": -1.1325562000274658, + "logps/chosen": -284.4312438964844, + "logps/rejected": -292.66876220703125, + "loss": 0.1991, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.1062500476837158, + "rewards/margins": 3.6515870094299316, + "rewards/rejected": -4.758105278015137, + "step": 5650 + }, + { + "epoch": 1.427063309488545, + "grad_norm": 31.15665054321289, + "learning_rate": 3.141141716010101e-07, + "logits/chosen": -1.1189086437225342, + "logits/rejected": -1.129052758216858, + "logps/chosen": -307.15625, + "logps/rejected": -305.88751220703125, + "loss": 0.168, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -0.9198631048202515, + "rewards/margins": 4.07763671875, + "rewards/rejected": -4.99609375, + "step": 5660 + }, 
+ { + "epoch": 1.4295843443733653, + "grad_norm": 60.87965393066406, + "learning_rate": 3.134049584213949e-07, + "logits/chosen": -1.1614501476287842, + "logits/rejected": -1.225744605064392, + "logps/chosen": -304.1000061035156, + "logps/rejected": -345.1875, + "loss": 0.2647, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.2272918224334717, + "rewards/margins": 3.766308546066284, + "rewards/rejected": -4.992089748382568, + "step": 5670 + }, + { + "epoch": 1.4321053792581855, + "grad_norm": 38.806068420410156, + "learning_rate": 3.1269519947646534e-07, + "logits/chosen": -1.163183569908142, + "logits/rejected": -1.1135742664337158, + "logps/chosen": -280.0874938964844, + "logps/rejected": -322.57501220703125, + "loss": 0.2206, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.7157471179962158, + "rewards/margins": 3.8421387672424316, + "rewards/rejected": -5.557909965515137, + "step": 5680 + }, + { + "epoch": 1.4346264141430056, + "grad_norm": 22.673965454101562, + "learning_rate": 3.119849008755515e-07, + "logits/chosen": -1.2099120616912842, + "logits/rejected": -1.2332274913787842, + "logps/chosen": -338.4437561035156, + "logps/rejected": -329.23748779296875, + "loss": 0.1861, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4517090320587158, + "rewards/margins": 4.054003715515137, + "rewards/rejected": -5.509081840515137, + "step": 5690 + }, + { + "epoch": 1.437147449027826, + "grad_norm": 25.874675750732422, + "learning_rate": 3.112740687326286e-07, + "logits/chosen": -1.1012604236602783, + "logits/rejected": -1.0789794921875, + "logps/chosen": -293.23126220703125, + "logps/rejected": -304.1937561035156, + "loss": 0.2343, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.7524551153182983, + "rewards/margins": 3.746533155441284, + "rewards/rejected": -5.501562595367432, + "step": 5700 + }, + { + "epoch": 1.4396684839126461, + "grad_norm": 53.42656326293945, + "learning_rate": 3.105627091662641e-07, + 
"logits/chosen": -1.1712768077850342, + "logits/rejected": -1.115148901939392, + "logps/chosen": -280.15625, + "logps/rejected": -308.10626220703125, + "loss": 0.1776, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.9433562755584717, + "rewards/margins": 3.83447265625, + "rewards/rejected": -5.779492378234863, + "step": 5710 + }, + { + "epoch": 1.4421895187974663, + "grad_norm": 34.40613555908203, + "learning_rate": 3.098508282995657e-07, + "logits/chosen": -1.130029320716858, + "logits/rejected": -1.1058349609375, + "logps/chosen": -309.9750061035156, + "logps/rejected": -338.5375061035156, + "loss": 0.2043, + "rewards/accuracies": 0.921875, + "rewards/chosen": -2.000903367996216, + "rewards/margins": 4.115283012390137, + "rewards/rejected": -6.117773532867432, + "step": 5720 + }, + { + "epoch": 1.4447105536822866, + "grad_norm": 22.414045333862305, + "learning_rate": 3.091384322601279e-07, + "logits/chosen": -1.1539306640625, + "logits/rejected": -1.146875023841858, + "logps/chosen": -310.3812561035156, + "logps/rejected": -346.7124938964844, + "loss": 0.2165, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7687591314315796, + "rewards/margins": 4.116943359375, + "rewards/rejected": -5.885546684265137, + "step": 5730 + }, + { + "epoch": 1.4472315885671068, + "grad_norm": 59.29484176635742, + "learning_rate": 3.0842552717998e-07, + "logits/chosen": -1.112634301185608, + "logits/rejected": -1.109826683998108, + "logps/chosen": -274.4125061035156, + "logps/rejected": -334.125, + "loss": 0.2335, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -2.154559373855591, + "rewards/margins": 3.826464891433716, + "rewards/rejected": -5.981054782867432, + "step": 5740 + }, + { + "epoch": 1.4497526234519271, + "grad_norm": 38.43928909301758, + "learning_rate": 3.077121191955324e-07, + "logits/chosen": -1.1369812488555908, + "logits/rejected": -1.123291015625, + "logps/chosen": -316.89373779296875, + "logps/rejected": 
-323.0874938964844, + "loss": 0.1646, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.9062316417694092, + "rewards/margins": 4.0869140625, + "rewards/rejected": -5.992578029632568, + "step": 5750 + }, + { + "epoch": 1.4522736583367473, + "grad_norm": 40.83302307128906, + "learning_rate": 3.0699821444752484e-07, + "logits/chosen": -1.1530029773712158, + "logits/rejected": -1.1438720226287842, + "logps/chosen": -311.38751220703125, + "logps/rejected": -322.8125, + "loss": 0.2641, + "rewards/accuracies": 0.890625, + "rewards/chosen": -2.0092499256134033, + "rewards/margins": 3.8388671875, + "rewards/rejected": -5.847754001617432, + "step": 5760 + }, + { + "epoch": 1.4547946932215674, + "grad_norm": 37.13199234008789, + "learning_rate": 3.062838190809727e-07, + "logits/chosen": -1.154150366783142, + "logits/rejected": -1.152459740638733, + "logps/chosen": -329.17498779296875, + "logps/rejected": -331.86248779296875, + "loss": 0.1963, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.7770264148712158, + "rewards/margins": 4.0390625, + "rewards/rejected": -5.81640625, + "step": 5770 + }, + { + "epoch": 1.4573157281063875, + "grad_norm": 69.80667114257812, + "learning_rate": 3.055689392451144e-07, + "logits/chosen": -1.1111571788787842, + "logits/rejected": -1.1268310546875, + "logps/chosen": -282.5062561035156, + "logps/rejected": -317.53125, + "loss": 0.2322, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.979040503501892, + "rewards/margins": 3.720410108566284, + "rewards/rejected": -5.700097560882568, + "step": 5780 + }, + { + "epoch": 1.459836762991208, + "grad_norm": 52.87074661254883, + "learning_rate": 3.0485358109335875e-07, + "logits/chosen": -1.127844214439392, + "logits/rejected": -1.0900757312774658, + "logps/chosen": -289.6000061035156, + "logps/rejected": -306.3999938964844, + "loss": 0.1803, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.8661377429962158, + "rewards/margins": 4.056787014007568, 
+ "rewards/rejected": -5.922265529632568, + "step": 5790 + }, + { + "epoch": 1.462357797876028, + "grad_norm": 54.009159088134766, + "learning_rate": 3.041377507832313e-07, + "logits/chosen": -1.1489989757537842, + "logits/rejected": -1.1700439453125, + "logps/chosen": -316.9624938964844, + "logps/rejected": -341.65625, + "loss": 0.2669, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.86407470703125, + "rewards/margins": 3.7484374046325684, + "rewards/rejected": -5.61181640625, + "step": 5800 + }, + { + "epoch": 1.4648788327608484, + "grad_norm": 85.05703735351562, + "learning_rate": 3.034214544763223e-07, + "logits/chosen": -1.185888648033142, + "logits/rejected": -1.1497223377227783, + "logps/chosen": -295.5562438964844, + "logps/rejected": -303.20623779296875, + "loss": 0.3623, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6642272472381592, + "rewards/margins": 3.2563233375549316, + "rewards/rejected": -4.918359279632568, + "step": 5810 + }, + { + "epoch": 1.4673998676456685, + "grad_norm": 50.31169509887695, + "learning_rate": 3.0270469833823246e-07, + "logits/chosen": -1.1232178211212158, + "logits/rejected": -1.1271483898162842, + "logps/chosen": -278.10626220703125, + "logps/rejected": -280.1625061035156, + "loss": 0.1864, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -1.6465972661972046, + "rewards/margins": 3.705029249191284, + "rewards/rejected": -5.348828315734863, + "step": 5820 + }, + { + "epoch": 1.4699209025304887, + "grad_norm": 35.56777572631836, + "learning_rate": 3.019874885385211e-07, + "logits/chosen": -1.0975830554962158, + "logits/rejected": -1.1710418462753296, + "logps/chosen": -301.0249938964844, + "logps/rejected": -318.79998779296875, + "loss": 0.19, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.66668701171875, + "rewards/margins": 3.7491211891174316, + "rewards/rejected": -5.416796684265137, + "step": 5830 + }, + { + "epoch": 1.472441937415309, + "grad_norm": 
41.645145416259766, + "learning_rate": 3.012698312506523e-07, + "logits/chosen": -1.104577660560608, + "logits/rejected": -1.1301758289337158, + "logps/chosen": -320.40625, + "logps/rejected": -345.70623779296875, + "loss": 0.174, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4617538452148438, + "rewards/margins": 4.060351371765137, + "rewards/rejected": -5.521679878234863, + "step": 5840 + }, + { + "epoch": 1.4749629723001292, + "grad_norm": 76.22579193115234, + "learning_rate": 3.0055173265194184e-07, + "logits/chosen": -1.119897484779358, + "logits/rejected": -1.038330078125, + "logps/chosen": -273.0562438964844, + "logps/rejected": -283.375, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8042480945587158, + "rewards/margins": 3.3912596702575684, + "rewards/rejected": -5.197949409484863, + "step": 5850 + }, + { + "epoch": 1.4774840071849495, + "grad_norm": 41.23089599609375, + "learning_rate": 2.998331989235042e-07, + "logits/chosen": -0.995147705078125, + "logits/rejected": -1.065283179283142, + "logps/chosen": -296.29376220703125, + "logps/rejected": -320.5562438964844, + "loss": 0.2265, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.659918189048767, + "rewards/margins": 3.641796827316284, + "rewards/rejected": -5.302148342132568, + "step": 5860 + }, + { + "epoch": 1.4800050420697697, + "grad_norm": 20.371374130249023, + "learning_rate": 2.991142362501994e-07, + "logits/chosen": -1.1528136730194092, + "logits/rejected": -1.080206274986267, + "logps/chosen": -327.1625061035156, + "logps/rejected": -334.3374938964844, + "loss": 0.1988, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.268011450767517, + "rewards/margins": 4.265576362609863, + "rewards/rejected": -5.5341796875, + "step": 5870 + }, + { + "epoch": 1.4825260769545898, + "grad_norm": 69.50802612304688, + "learning_rate": 2.9839485082057945e-07, + "logits/chosen": -1.068603515625, + "logits/rejected": -1.0320342779159546, + 
"logps/chosen": -291.1499938964844, + "logps/rejected": -318.2875061035156, + "loss": 0.2308, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6911499500274658, + "rewards/margins": 3.997119188308716, + "rewards/rejected": -5.686230659484863, + "step": 5880 + }, + { + "epoch": 1.48504711183941, + "grad_norm": 20.13905906677246, + "learning_rate": 2.976750488268355e-07, + "logits/chosen": -1.068457007408142, + "logits/rejected": -1.037744164466858, + "logps/chosen": -308.76251220703125, + "logps/rejected": -339.63751220703125, + "loss": 0.2132, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.357421875, + "rewards/margins": 4.101513862609863, + "rewards/rejected": -5.459668159484863, + "step": 5890 + }, + { + "epoch": 1.4875681467242303, + "grad_norm": 25.19346809387207, + "learning_rate": 2.96954836464744e-07, + "logits/chosen": -1.145776391029358, + "logits/rejected": -1.1566894054412842, + "logps/chosen": -330.23126220703125, + "logps/rejected": -327.5625, + "loss": 0.1967, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.196740746498108, + "rewards/margins": 4.27294921875, + "rewards/rejected": -5.466894626617432, + "step": 5900 + }, + { + "epoch": 1.4900891816090505, + "grad_norm": 48.2508659362793, + "learning_rate": 2.9623421993361407e-07, + "logits/chosen": -1.126550316810608, + "logits/rejected": -1.1150023937225342, + "logps/chosen": -312.5375061035156, + "logps/rejected": -328.57501220703125, + "loss": 0.2607, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.260687232017517, + "rewards/margins": 3.830078125, + "rewards/rejected": -5.091796875, + "step": 5910 + }, + { + "epoch": 1.4926102164938708, + "grad_norm": 33.797828674316406, + "learning_rate": 2.955132054362335e-07, + "logits/chosen": -1.12750244140625, + "logits/rejected": -1.135461449623108, + "logps/chosen": -295.4468688964844, + "logps/rejected": -316.7749938964844, + "loss": 0.2019, + "rewards/accuracies": 
0.9125000238418579, + "rewards/chosen": -1.3718032836914062, + "rewards/margins": 3.5299315452575684, + "rewards/rejected": -4.899218559265137, + "step": 5920 + }, + { + "epoch": 1.495131251378691, + "grad_norm": 36.69331359863281, + "learning_rate": 2.9479179917881593e-07, + "logits/chosen": -1.1229248046875, + "logits/rejected": -1.0802001953125, + "logps/chosen": -308.9312438964844, + "logps/rejected": -335.57501220703125, + "loss": 0.1621, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.5434691905975342, + "rewards/margins": 4.004296779632568, + "rewards/rejected": -5.548437595367432, + "step": 5930 + }, + { + "epoch": 1.497652286263511, + "grad_norm": 61.35308837890625, + "learning_rate": 2.9407000737094655e-07, + "logits/chosen": -1.1416747570037842, + "logits/rejected": -1.08526611328125, + "logps/chosen": -297.3999938964844, + "logps/rejected": -305.2875061035156, + "loss": 0.2349, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.389404296875, + "rewards/margins": 3.870654344558716, + "rewards/rejected": -5.259179592132568, + "step": 5940 + }, + { + "epoch": 1.5001733211483312, + "grad_norm": 57.0819091796875, + "learning_rate": 2.9334783622552983e-07, + "logits/chosen": -1.0996582508087158, + "logits/rejected": -1.113623023033142, + "logps/chosen": -296.64373779296875, + "logps/rejected": -327.1875, + "loss": 0.1803, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -0.975231945514679, + "rewards/margins": 4.025097846984863, + "rewards/rejected": -5.000586032867432, + "step": 5950 + }, + { + "epoch": 1.5026943560331516, + "grad_norm": 35.29164505004883, + "learning_rate": 2.9262529195873506e-07, + "logits/chosen": -1.155432105064392, + "logits/rejected": -1.187261939048767, + "logps/chosen": -313.4125061035156, + "logps/rejected": -310.8062438964844, + "loss": 0.2001, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.1038116216659546, + "rewards/margins": 3.7796874046325684, + 
"rewards/rejected": -4.883203029632568, + "step": 5960 + }, + { + "epoch": 1.505215390917972, + "grad_norm": 20.90247917175293, + "learning_rate": 2.9190238078994326e-07, + "logits/chosen": -1.1132323741912842, + "logits/rejected": -1.115136742591858, + "logps/chosen": -313.07501220703125, + "logps/rejected": -297.5687561035156, + "loss": 0.1939, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.4117408990859985, + "rewards/margins": 3.9153809547424316, + "rewards/rejected": -5.327929496765137, + "step": 5970 + }, + { + "epoch": 1.507736425802792, + "grad_norm": 41.171974182128906, + "learning_rate": 2.911791089416938e-07, + "logits/chosen": -1.0940430164337158, + "logits/rejected": -1.1343994140625, + "logps/chosen": -294.66876220703125, + "logps/rejected": -303.4750061035156, + "loss": 0.2523, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.30426025390625, + "rewards/margins": 4.020263671875, + "rewards/rejected": -5.324926853179932, + "step": 5980 + }, + { + "epoch": 1.5102574606876122, + "grad_norm": 36.254337310791016, + "learning_rate": 2.904554826396304e-07, + "logits/chosen": -1.144506812095642, + "logits/rejected": -1.1100342273712158, + "logps/chosen": -290.5, + "logps/rejected": -324.2749938964844, + "loss": 0.2526, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.72589111328125, + "rewards/margins": 3.7916259765625, + "rewards/rejected": -5.515234470367432, + "step": 5990 + }, + { + "epoch": 1.5127784955724324, + "grad_norm": 38.0024299621582, + "learning_rate": 2.89731508112448e-07, + "logits/chosen": -1.145452857017517, + "logits/rejected": -1.141699194908142, + "logps/chosen": -310.36248779296875, + "logps/rejected": -310.3999938964844, + "loss": 0.2551, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.7955322265625, + "rewards/margins": 3.600830078125, + "rewards/rejected": -5.396777153015137, + "step": 6000 + }, + { + "epoch": 1.5152995304572527, + "grad_norm": 
13.969449996948242, + "learning_rate": 2.890071915918387e-07, + "logits/chosen": -1.14007568359375, + "logits/rejected": -1.06732177734375, + "logps/chosen": -299.98126220703125, + "logps/rejected": -323.7562561035156, + "loss": 0.1965, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.576837182044983, + "rewards/margins": 3.966747999191284, + "rewards/rejected": -5.544043064117432, + "step": 6010 + }, + { + "epoch": 1.5178205653420729, + "grad_norm": 23.30685806274414, + "learning_rate": 2.8828253931243846e-07, + "logits/chosen": -1.136743187904358, + "logits/rejected": -1.092736840248108, + "logps/chosen": -297.5625, + "logps/rejected": -295.63751220703125, + "loss": 0.2065, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.3041808605194092, + "rewards/margins": 3.9515624046325684, + "rewards/rejected": -5.256249904632568, + "step": 6020 + }, + { + "epoch": 1.5203416002268932, + "grad_norm": 55.4643669128418, + "learning_rate": 2.8755755751177333e-07, + "logits/chosen": -1.1088378429412842, + "logits/rejected": -1.1104736328125, + "logps/chosen": -298.98126220703125, + "logps/rejected": -310.26251220703125, + "loss": 0.2441, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -0.9515517950057983, + "rewards/margins": 3.7629151344299316, + "rewards/rejected": -4.714453220367432, + "step": 6030 + }, + { + "epoch": 1.5228626351117134, + "grad_norm": 38.759033203125, + "learning_rate": 2.8683225243020576e-07, + "logits/chosen": -1.1571044921875, + "logits/rejected": -1.1136596202850342, + "logps/chosen": -310.8812561035156, + "logps/rejected": -321.5, + "loss": 0.1841, + "rewards/accuracies": 0.921875, + "rewards/chosen": -0.6566246151924133, + "rewards/margins": 4.045752048492432, + "rewards/rejected": -4.702441215515137, + "step": 6040 + }, + { + "epoch": 1.5253836699965335, + "grad_norm": 66.81090545654297, + "learning_rate": 2.861066303108808e-07, + "logits/chosen": -1.144775390625, + "logits/rejected": -1.14501953125, 
+ "logps/chosen": -303.71875, + "logps/rejected": -322.5562438964844, + "loss": 0.2178, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.238885521888733, + "rewards/margins": 3.8529052734375, + "rewards/rejected": -5.09521484375, + "step": 6050 + }, + { + "epoch": 1.5279047048813537, + "grad_norm": 58.58757400512695, + "learning_rate": 2.8538069739967257e-07, + "logits/chosen": -1.160614013671875, + "logits/rejected": -1.0935790538787842, + "logps/chosen": -301.48126220703125, + "logps/rejected": -307.95001220703125, + "loss": 0.2144, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5325835943222046, + "rewards/margins": 3.763232469558716, + "rewards/rejected": -5.295312404632568, + "step": 6060 + }, + { + "epoch": 1.530425739766174, + "grad_norm": 22.95831298828125, + "learning_rate": 2.8465445994513024e-07, + "logits/chosen": -1.1400635242462158, + "logits/rejected": -1.107177734375, + "logps/chosen": -298.421875, + "logps/rejected": -296.40625, + "loss": 0.1795, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3224213123321533, + "rewards/margins": 3.811230421066284, + "rewards/rejected": -5.132910251617432, + "step": 6070 + }, + { + "epoch": 1.5329467746509944, + "grad_norm": 44.37894058227539, + "learning_rate": 2.8392792419842447e-07, + "logits/chosen": -1.174072265625, + "logits/rejected": -1.1715087890625, + "logps/chosen": -294.20001220703125, + "logps/rejected": -311.1625061035156, + "loss": 0.1708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1699402332305908, + "rewards/margins": 3.9228148460388184, + "rewards/rejected": -5.094140529632568, + "step": 6080 + }, + { + "epoch": 1.5354678095358145, + "grad_norm": 26.776601791381836, + "learning_rate": 2.832010964132934e-07, + "logits/chosen": -1.0991790294647217, + "logits/rejected": -1.1131622791290283, + "logps/chosen": -303.20001220703125, + "logps/rejected": -311.32501220703125, + "loss": 0.1787, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 
-1.362890601158142, + "rewards/margins": 3.931884765625, + "rewards/rejected": -5.296484470367432, + "step": 6090 + }, + { + "epoch": 1.5379888444206347, + "grad_norm": 34.41385269165039, + "learning_rate": 2.82473982845989e-07, + "logits/chosen": -1.1327393054962158, + "logits/rejected": -1.085089087486267, + "logps/chosen": -297.2749938964844, + "logps/rejected": -300.29376220703125, + "loss": 0.2211, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2868438959121704, + "rewards/margins": 3.897876024246216, + "rewards/rejected": -5.1845703125, + "step": 6100 + }, + { + "epoch": 1.5405098793054548, + "grad_norm": 33.31650161743164, + "learning_rate": 2.8174658975522305e-07, + "logits/chosen": -1.105615258216858, + "logits/rejected": -1.089147925376892, + "logps/chosen": -287.28125, + "logps/rejected": -295.875, + "loss": 0.2103, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.403588891029358, + "rewards/margins": 3.818066358566284, + "rewards/rejected": -5.220507621765137, + "step": 6110 + }, + { + "epoch": 1.5430309141902752, + "grad_norm": 43.380516052246094, + "learning_rate": 2.810189234021135e-07, + "logits/chosen": -1.16046142578125, + "logits/rejected": -1.108343482017517, + "logps/chosen": -301.6656188964844, + "logps/rejected": -322.0249938964844, + "loss": 0.1868, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.209442138671875, + "rewards/margins": 3.7587890625, + "rewards/rejected": -4.966210842132568, + "step": 6120 + }, + { + "epoch": 1.5455519490750953, + "grad_norm": 40.877193450927734, + "learning_rate": 2.802909900501304e-07, + "logits/chosen": -1.1108887195587158, + "logits/rejected": -1.1327393054962158, + "logps/chosen": -276.1625061035156, + "logps/rejected": -307.6312561035156, + "loss": 0.2464, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.244757056236267, + "rewards/margins": 3.612719774246216, + "rewards/rejected": -4.85546875, + "step": 6130 + }, + { + "epoch": 
1.5480729839599157, + "grad_norm": 55.626399993896484, + "learning_rate": 2.7956279596504197e-07, + "logits/chosen": -1.153906226158142, + "logits/rejected": -1.125640869140625, + "logps/chosen": -300.61248779296875, + "logps/rejected": -322.625, + "loss": 0.2489, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.4402649402618408, + "rewards/margins": 3.6713624000549316, + "rewards/rejected": -5.11181640625, + "step": 6140 + }, + { + "epoch": 1.5505940188447358, + "grad_norm": 38.66291809082031, + "learning_rate": 2.7883434741486065e-07, + "logits/chosen": -1.0975463390350342, + "logits/rejected": -1.108941674232483, + "logps/chosen": -300.625, + "logps/rejected": -321.0249938964844, + "loss": 0.2227, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.5415527820587158, + "rewards/margins": 3.765380859375, + "rewards/rejected": -5.306445121765137, + "step": 6150 + }, + { + "epoch": 1.553115053729556, + "grad_norm": 52.171146392822266, + "learning_rate": 2.7810565066978944e-07, + "logits/chosen": -1.167871117591858, + "logits/rejected": -1.1710693836212158, + "logps/chosen": -313.03125, + "logps/rejected": -323.46875, + "loss": 0.2665, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.5628662109375, + "rewards/margins": 3.578356981277466, + "rewards/rejected": -5.140820503234863, + "step": 6160 + }, + { + "epoch": 1.555636088614376, + "grad_norm": 52.03303909301758, + "learning_rate": 2.7737671200216745e-07, + "logits/chosen": -1.181249976158142, + "logits/rejected": -1.1878662109375, + "logps/chosen": -303.5874938964844, + "logps/rejected": -306.3500061035156, + "loss": 0.2064, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.112512230873108, + "rewards/margins": 3.9339842796325684, + "rewards/rejected": -5.047949314117432, + "step": 6170 + }, + { + "epoch": 1.5581571234991964, + "grad_norm": 50.642539978027344, + "learning_rate": 2.766475376864163e-07, + "logits/chosen": -1.167016625404358, + "logits/rejected": 
-1.126977562904358, + "logps/chosen": -303.61248779296875, + "logps/rejected": -309.4750061035156, + "loss": 0.3047, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.3833954334259033, + "rewards/margins": 3.9080567359924316, + "rewards/rejected": -5.290625095367432, + "step": 6180 + }, + { + "epoch": 1.5606781583840168, + "grad_norm": 22.918367385864258, + "learning_rate": 2.75918133998986e-07, + "logits/chosen": -1.1688721179962158, + "logits/rejected": -1.167626976966858, + "logps/chosen": -313.53125, + "logps/rejected": -349.125, + "loss": 0.1866, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.345666527748108, + "rewards/margins": 3.9029297828674316, + "rewards/rejected": -5.250195503234863, + "step": 6190 + }, + { + "epoch": 1.563199193268837, + "grad_norm": 44.8996467590332, + "learning_rate": 2.751885072183009e-07, + "logits/chosen": -1.181610107421875, + "logits/rejected": -1.16558837890625, + "logps/chosen": -295.9375, + "logps/rejected": -287.42498779296875, + "loss": 0.2211, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.675573706626892, + "rewards/margins": 3.5116209983825684, + "rewards/rejected": -5.189843654632568, + "step": 6200 + }, + { + "epoch": 1.565720228153657, + "grad_norm": 45.8497314453125, + "learning_rate": 2.744586636247056e-07, + "logits/chosen": -1.1245849132537842, + "logits/rejected": -1.102929711341858, + "logps/chosen": -311.15625, + "logps/rejected": -335.0, + "loss": 0.2237, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.4038269519805908, + "rewards/margins": 3.883105516433716, + "rewards/rejected": -5.289648532867432, + "step": 6210 + }, + { + "epoch": 1.5682412630384772, + "grad_norm": 55.734989166259766, + "learning_rate": 2.7372860950041085e-07, + "logits/chosen": -1.1724975109100342, + "logits/rejected": -1.0930907726287842, + "logps/chosen": -308.79376220703125, + "logps/rejected": -320.2749938964844, + "loss": 0.2928, + "rewards/accuracies": 
0.8812500238418579, + "rewards/chosen": -1.165258765220642, + "rewards/margins": 3.848950147628784, + "rewards/rejected": -5.014843940734863, + "step": 6220 + }, + { + "epoch": 1.5707622979232974, + "grad_norm": 27.495655059814453, + "learning_rate": 2.7299835112943984e-07, + "logits/chosen": -1.1748778820037842, + "logits/rejected": -1.119470238685608, + "logps/chosen": -295.79376220703125, + "logps/rejected": -305.76251220703125, + "loss": 0.2145, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.0674560070037842, + "rewards/margins": 3.8984131813049316, + "rewards/rejected": -4.965918064117432, + "step": 6230 + }, + { + "epoch": 1.5732833328081177, + "grad_norm": 49.23637008666992, + "learning_rate": 2.7226789479757355e-07, + "logits/chosen": -1.130926489830017, + "logits/rejected": -1.1825683116912842, + "logps/chosen": -286.89373779296875, + "logps/rejected": -333.01873779296875, + "loss": 0.1955, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8009887933731079, + "rewards/margins": 3.798046827316284, + "rewards/rejected": -4.597460746765137, + "step": 6240 + }, + { + "epoch": 1.575804367692938, + "grad_norm": 54.19160079956055, + "learning_rate": 2.7153724679229707e-07, + "logits/chosen": -1.167236328125, + "logits/rejected": -1.1602294445037842, + "logps/chosen": -282.60626220703125, + "logps/rejected": -316.25, + "loss": 0.2016, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.9388580322265625, + "rewards/margins": 3.560107469558716, + "rewards/rejected": -4.497851371765137, + "step": 6250 + }, + { + "epoch": 1.5783254025777582, + "grad_norm": 58.92972946166992, + "learning_rate": 2.7080641340274536e-07, + "logits/chosen": -1.1320679187774658, + "logits/rejected": -1.1422545909881592, + "logps/chosen": -276.9437561035156, + "logps/rejected": -276.73126220703125, + "loss": 0.2117, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.9053314328193665, + "rewards/margins": 3.3994140625, + 
"rewards/rejected": -4.304980278015137, + "step": 6260 + }, + { + "epoch": 1.5808464374625784, + "grad_norm": 70.27766418457031, + "learning_rate": 2.70075400919649e-07, + "logits/chosen": -1.146142601966858, + "logits/rejected": -1.141119360923767, + "logps/chosen": -308.76873779296875, + "logps/rejected": -334.01251220703125, + "loss": 0.2086, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.1092650890350342, + "rewards/margins": 3.701171875, + "rewards/rejected": -4.8095703125, + "step": 6270 + }, + { + "epoch": 1.5833674723473985, + "grad_norm": 53.645111083984375, + "learning_rate": 2.6934421563528037e-07, + "logits/chosen": -1.161352515220642, + "logits/rejected": -1.1814453601837158, + "logps/chosen": -300.6937561035156, + "logps/rejected": -303.8999938964844, + "loss": 0.2134, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4230773448944092, + "rewards/margins": 3.53857421875, + "rewards/rejected": -4.9619140625, + "step": 6280 + }, + { + "epoch": 1.5858885072322189, + "grad_norm": 61.04668045043945, + "learning_rate": 2.6861286384339884e-07, + "logits/chosen": -1.156274437904358, + "logits/rejected": -1.114892601966858, + "logps/chosen": -275.88751220703125, + "logps/rejected": -305.64373779296875, + "loss": 0.1491, + "rewards/accuracies": 0.940625011920929, + "rewards/chosen": -1.345971703529358, + "rewards/margins": 4.110595703125, + "rewards/rejected": -5.456640720367432, + "step": 6290 + }, + { + "epoch": 1.588409542117039, + "grad_norm": 41.34684753417969, + "learning_rate": 2.6788135183919743e-07, + "logits/chosen": NaN, + "logits/rejected": -1.123925805091858, + "logps/chosen": -324.57501220703125, + "logps/rejected": -337.3500061035156, + "loss": 0.2148, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.9361572265625, + "rewards/margins": 4.215234279632568, + "rewards/rejected": -6.151757717132568, + "step": 6300 + }, + { + "epoch": 1.5909305770018594, + "grad_norm": 29.185256958007812, + "learning_rate": 
2.671496859192479e-07, + "logits/chosen": -1.129492163658142, + "logits/rejected": -1.0899779796600342, + "logps/chosen": -317.42498779296875, + "logps/rejected": -328.57501220703125, + "loss": 0.2089, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.9986541271209717, + "rewards/margins": 4.295092582702637, + "rewards/rejected": -6.292187690734863, + "step": 6310 + }, + { + "epoch": 1.5934516118866795, + "grad_norm": 63.9970588684082, + "learning_rate": 2.6641787238144703e-07, + "logits/chosen": -1.07855224609375, + "logits/rejected": -1.054968237876892, + "logps/chosen": -323.3062438964844, + "logps/rejected": -328.46875, + "loss": 0.1911, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1715362071990967, + "rewards/margins": 4.328906059265137, + "rewards/rejected": -6.4990234375, + "step": 6320 + }, + { + "epoch": 1.5959726467714996, + "grad_norm": 39.56163787841797, + "learning_rate": 2.656859175249622e-07, + "logits/chosen": -1.181298851966858, + "logits/rejected": -1.049646019935608, + "logps/chosen": -304.78125, + "logps/rejected": -321.0687561035156, + "loss": 0.1864, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.1056151390075684, + "rewards/margins": 4.275586128234863, + "rewards/rejected": -6.382616996765137, + "step": 6330 + }, + { + "epoch": 1.5984936816563198, + "grad_norm": 52.43605041503906, + "learning_rate": 2.6495382765017726e-07, + "logits/chosen": -1.0822356939315796, + "logits/rejected": -1.0875732898712158, + "logps/chosen": -318.8125, + "logps/rejected": -325.84375, + "loss": 0.2078, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -2.670114040374756, + "rewards/margins": 4.224707126617432, + "rewards/rejected": -6.895117282867432, + "step": 6340 + }, + { + "epoch": 1.6010147165411401, + "grad_norm": 44.35512161254883, + "learning_rate": 2.6422160905863816e-07, + "logits/chosen": NaN, + "logits/rejected": -1.0967528820037842, + "logps/chosen": -307.70001220703125, + "logps/rejected": 
-338.08123779296875, + "loss": 0.2578, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -2.820687770843506, + "rewards/margins": 3.874316453933716, + "rewards/rejected": -6.6953125, + "step": 6350 + }, + { + "epoch": 1.6035357514259605, + "grad_norm": 38.240203857421875, + "learning_rate": 2.634892680529988e-07, + "logits/chosen": -1.10693359375, + "logits/rejected": -1.1155273914337158, + "logps/chosen": -300.4375, + "logps/rejected": -321.70001220703125, + "loss": 0.1526, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -2.11431884765625, + "rewards/margins": 4.200097560882568, + "rewards/rejected": -6.314257621765137, + "step": 6360 + }, + { + "epoch": 1.6060567863107806, + "grad_norm": 29.181943893432617, + "learning_rate": 2.627568109369668e-07, + "logits/chosen": -1.0560302734375, + "logits/rejected": -1.066827416419983, + "logps/chosen": -326.5687561035156, + "logps/rejected": -333.5249938964844, + "loss": 0.2289, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.367840528488159, + "rewards/margins": 4.1337890625, + "rewards/rejected": -6.503125190734863, + "step": 6370 + }, + { + "epoch": 1.6085778211956008, + "grad_norm": 30.766389846801758, + "learning_rate": 2.6202424401524914e-07, + "logits/chosen": -1.096716284751892, + "logits/rejected": -1.065435767173767, + "logps/chosen": -290.5562438964844, + "logps/rejected": -314.20001220703125, + "loss": 0.2036, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.8820984363555908, + "rewards/margins": 4.274706840515137, + "rewards/rejected": -6.159375190734863, + "step": 6380 + }, + { + "epoch": 1.611098856080421, + "grad_norm": 23.376792907714844, + "learning_rate": 2.6129157359349806e-07, + "logits/chosen": -1.146142601966858, + "logits/rejected": -1.0878417491912842, + "logps/chosen": -319.375, + "logps/rejected": -319.42498779296875, + "loss": 0.1467, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.89276123046875, + "rewards/margins": 
4.2249755859375, + "rewards/rejected": -6.118359565734863, + "step": 6390 + }, + { + "epoch": 1.6136198909652413, + "grad_norm": 26.900299072265625, + "learning_rate": 2.605588059782567e-07, + "logits/chosen": -1.1492187976837158, + "logits/rejected": -1.09906005859375, + "logps/chosen": -313.6187438964844, + "logps/rejected": -328.20001220703125, + "loss": 0.1892, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4521667957305908, + "rewards/margins": 4.266747951507568, + "rewards/rejected": -5.717675685882568, + "step": 6400 + }, + { + "epoch": 1.6161409258500614, + "grad_norm": 79.60032653808594, + "learning_rate": 2.5982594747690483e-07, + "logits/chosen": -1.165283203125, + "logits/rejected": -1.1063048839569092, + "logps/chosen": -308.58123779296875, + "logps/rejected": -323.95001220703125, + "loss": 0.2384, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6304610967636108, + "rewards/margins": 3.887451171875, + "rewards/rejected": -5.51953125, + "step": 6410 + }, + { + "epoch": 1.6186619607348818, + "grad_norm": 55.93655014038086, + "learning_rate": 2.590930043976044e-07, + "logits/chosen": -1.186163306236267, + "logits/rejected": -1.1176269054412842, + "logps/chosen": -319.48748779296875, + "logps/rejected": -334.7875061035156, + "loss": 0.3004, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.7587006092071533, + "rewards/margins": 4.107324123382568, + "rewards/rejected": -5.8671875, + "step": 6420 + }, + { + "epoch": 1.621182995619702, + "grad_norm": 57.82849884033203, + "learning_rate": 2.583599830492453e-07, + "logits/chosen": -1.1078612804412842, + "logits/rejected": -1.145532250404358, + "logps/chosen": -303.41876220703125, + "logps/rejected": -319.4437561035156, + "loss": 0.2404, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.714318871498108, + "rewards/margins": 3.898632764816284, + "rewards/rejected": -5.613085746765137, + "step": 6430 + }, + { + "epoch": 1.623704030504522, + 
"grad_norm": 40.817840576171875, + "learning_rate": 2.576268897413916e-07, + "logits/chosen": -1.159692406654358, + "logits/rejected": -1.152978539466858, + "logps/chosen": -281.42498779296875, + "logps/rejected": -312.35626220703125, + "loss": 0.2292, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.397894263267517, + "rewards/margins": 3.7555909156799316, + "rewards/rejected": -5.153027534484863, + "step": 6440 + }, + { + "epoch": 1.6262250653893422, + "grad_norm": 53.961727142333984, + "learning_rate": 2.5689373078422603e-07, + "logits/chosen": -1.1435668468475342, + "logits/rejected": NaN, + "logps/chosen": -331.71875, + "logps/rejected": -324.4125061035156, + "loss": 0.228, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.54876708984375, + "rewards/margins": 4.006982326507568, + "rewards/rejected": -5.555468559265137, + "step": 6450 + }, + { + "epoch": 1.6287461002741626, + "grad_norm": 27.66611671447754, + "learning_rate": 2.5616051248849707e-07, + "logits/chosen": -1.141503930091858, + "logits/rejected": -1.1709167957305908, + "logps/chosen": -302.95001220703125, + "logps/rejected": -310.3125, + "loss": 0.2209, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5449860095977783, + "rewards/margins": 4.110595703125, + "rewards/rejected": -5.657422065734863, + "step": 6460 + }, + { + "epoch": 1.631267135158983, + "grad_norm": 36.329322814941406, + "learning_rate": 2.5542724116546365e-07, + "logits/chosen": -1.1394531726837158, + "logits/rejected": -1.1514403820037842, + "logps/chosen": -315.26873779296875, + "logps/rejected": -322.5625, + "loss": 0.2073, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.494232177734375, + "rewards/margins": 3.925488233566284, + "rewards/rejected": -5.420117378234863, + "step": 6470 + }, + { + "epoch": 1.633788170043803, + "grad_norm": 34.16090393066406, + "learning_rate": 2.5469392312684123e-07, + "logits/chosen": NaN, + "logits/rejected": -1.1286499500274658, 
+ "logps/chosen": -304.8125, + "logps/rejected": -320.42498779296875, + "loss": 0.1821, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.2781188488006592, + "rewards/margins": 4.002392768859863, + "rewards/rejected": -5.278027534484863, + "step": 6480 + }, + { + "epoch": 1.6363092049286232, + "grad_norm": 43.980491638183594, + "learning_rate": 2.539605646847473e-07, + "logits/chosen": -1.1635253429412842, + "logits/rejected": -1.1563720703125, + "logps/chosen": -340.35626220703125, + "logps/rejected": -325.48748779296875, + "loss": 0.2075, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.445581078529358, + "rewards/margins": 4.289258003234863, + "rewards/rejected": -5.734179496765137, + "step": 6490 + }, + { + "epoch": 1.6388302398134433, + "grad_norm": 44.270267486572266, + "learning_rate": 2.532271721516472e-07, + "logits/chosen": -1.1770508289337158, + "logits/rejected": -1.088861107826233, + "logps/chosen": -339.67498779296875, + "logps/rejected": -346.9125061035156, + "loss": 0.2181, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1042418479919434, + "rewards/margins": 3.935253858566284, + "rewards/rejected": -6.040429592132568, + "step": 6500 + }, + { + "epoch": 1.6413512746982635, + "grad_norm": 51.83606719970703, + "learning_rate": 2.524937518402997e-07, + "logits/chosen": -1.099084496498108, + "logits/rejected": -1.062255859375, + "logps/chosen": -324.21875, + "logps/rejected": -335.28125, + "loss": 0.2701, + "rewards/accuracies": 0.8843749761581421, + "rewards/chosen": -2.1596922874450684, + "rewards/margins": 3.80712890625, + "rewards/rejected": -5.967382907867432, + "step": 6510 + }, + { + "epoch": 1.6438723095830838, + "grad_norm": 35.56692886352539, + "learning_rate": 2.5176031006370253e-07, + "logits/chosen": -1.087866187095642, + "logits/rejected": -1.108789086341858, + "logps/chosen": -305.35626220703125, + "logps/rejected": -327.5, + "loss": 0.2258, + "rewards/accuracies": 0.903124988079071, + 
"rewards/chosen": -2.174731492996216, + "rewards/margins": 3.7359862327575684, + "rewards/rejected": -5.912499904632568, + "step": 6520 + }, + { + "epoch": 1.6463933444679042, + "grad_norm": 45.89760971069336, + "learning_rate": 2.510268531350384e-07, + "logits/chosen": -1.107061743736267, + "logits/rejected": -1.056298851966858, + "logps/chosen": -297.4624938964844, + "logps/rejected": -302.9624938964844, + "loss": 0.1988, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.023632764816284, + "rewards/margins": 3.876147508621216, + "rewards/rejected": -5.900586128234863, + "step": 6530 + }, + { + "epoch": 1.6489143793527243, + "grad_norm": 43.021244049072266, + "learning_rate": 2.502933873676204e-07, + "logits/chosen": -1.1321594715118408, + "logits/rejected": -1.0499999523162842, + "logps/chosen": -293.5375061035156, + "logps/rejected": -317.54998779296875, + "loss": 0.2084, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.8830718994140625, + "rewards/margins": 4.081250190734863, + "rewards/rejected": -5.96484375, + "step": 6540 + }, + { + "epoch": 1.6514354142375445, + "grad_norm": 42.96616744995117, + "learning_rate": 2.4955991907483763e-07, + "logits/chosen": -1.1565430164337158, + "logits/rejected": -1.0712158679962158, + "logps/chosen": -320.3374938964844, + "logps/rejected": -312.1499938964844, + "loss": 0.2099, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.6129639148712158, + "rewards/margins": 4.214648246765137, + "rewards/rejected": -5.828320503234863, + "step": 6550 + }, + { + "epoch": 1.6539564491223646, + "grad_norm": 43.674652099609375, + "learning_rate": 2.4882645457010096e-07, + "logits/chosen": -1.1515624523162842, + "logits/rejected": -1.103051781654358, + "logps/chosen": -310.86248779296875, + "logps/rejected": -305.64373779296875, + "loss": 0.253, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.888769507408142, + "rewards/margins": 3.878222703933716, + "rewards/rejected": 
-5.765038967132568, + "step": 6560 + }, + { + "epoch": 1.656477484007185, + "grad_norm": 29.459733963012695, + "learning_rate": 2.480930001667887e-07, + "logits/chosen": -1.109655737876892, + "logits/rejected": -0.9969848394393921, + "logps/chosen": -314.07501220703125, + "logps/rejected": -313.45001220703125, + "loss": 0.1971, + "rewards/accuracies": 0.921875, + "rewards/chosen": -2.0628294944763184, + "rewards/margins": 4.00244140625, + "rewards/rejected": -6.064843654632568, + "step": 6570 + }, + { + "epoch": 1.6589985188920051, + "grad_norm": 41.641334533691406, + "learning_rate": 2.473595621781919e-07, + "logits/chosen": -1.125646948814392, + "logits/rejected": -1.1973145008087158, + "logps/chosen": -303.6937561035156, + "logps/rejected": -334.1000061035156, + "loss": 0.2295, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -2.050854444503784, + "rewards/margins": 3.986132860183716, + "rewards/rejected": -6.037695407867432, + "step": 6580 + }, + { + "epoch": 1.6615195537768255, + "grad_norm": 72.03292846679688, + "learning_rate": 2.4662614691746096e-07, + "logits/chosen": -1.15771484375, + "logits/rejected": -1.0685577392578125, + "logps/chosen": -292.7250061035156, + "logps/rejected": -310.7250061035156, + "loss": 0.2149, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.60009765625, + "rewards/margins": 3.970703125, + "rewards/rejected": -5.571093559265137, + "step": 6590 + }, + { + "epoch": 1.6640405886616456, + "grad_norm": 63.81726837158203, + "learning_rate": 2.4589276069754994e-07, + "logits/chosen": -1.092504858970642, + "logits/rejected": -1.0665283203125, + "logps/chosen": -289.0375061035156, + "logps/rejected": -316.7875061035156, + "loss": 0.2592, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.723236083984375, + "rewards/margins": 3.6299805641174316, + "rewards/rejected": -5.355078220367432, + "step": 6600 + }, + { + "epoch": 1.6665616235464658, + "grad_norm": 34.75861358642578, + "learning_rate": 
2.451594098311635e-07, + "logits/chosen": -1.0768005847930908, + "logits/rejected": -1.108007788658142, + "logps/chosen": -288.41876220703125, + "logps/rejected": -275.2875061035156, + "loss": 0.2183, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.7859923839569092, + "rewards/margins": 3.5325927734375, + "rewards/rejected": -5.3203125, + "step": 6610 + }, + { + "epoch": 1.669082658431286, + "grad_norm": 38.13240432739258, + "learning_rate": 2.4442610063070143e-07, + "logits/chosen": -1.0810546875, + "logits/rejected": -1.0842163562774658, + "logps/chosen": -273.73126220703125, + "logps/rejected": -293.4750061035156, + "loss": 0.2145, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.8680388927459717, + "rewards/margins": 3.718017578125, + "rewards/rejected": -5.587500095367432, + "step": 6620 + }, + { + "epoch": 1.6716036933161063, + "grad_norm": 42.62815856933594, + "learning_rate": 2.4369283940820557e-07, + "logits/chosen": -1.1541016101837158, + "logits/rejected": -1.151544213294983, + "logps/chosen": -314.98126220703125, + "logps/rejected": -334.1000061035156, + "loss": 0.2425, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.681176781654358, + "rewards/margins": 4.090722560882568, + "rewards/rejected": -5.773828029632568, + "step": 6630 + }, + { + "epoch": 1.6741247282009266, + "grad_norm": 41.75690841674805, + "learning_rate": 2.429596324753042e-07, + "logits/chosen": -1.145074486732483, + "logits/rejected": -1.09808349609375, + "logps/chosen": -322.0062561035156, + "logps/rejected": -330.26251220703125, + "loss": 0.1859, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.9456055164337158, + "rewards/margins": 4.161035060882568, + "rewards/rejected": -6.108007907867432, + "step": 6640 + }, + { + "epoch": 1.6766457630857468, + "grad_norm": 22.159154891967773, + "learning_rate": 2.422264861431584e-07, + "logits/chosen": -1.1547729969024658, + "logits/rejected": -1.1577637195587158, + 
"logps/chosen": -277.33123779296875, + "logps/rejected": -302.78125, + "loss": 0.1759, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.4130675792694092, + "rewards/margins": 4.004248142242432, + "rewards/rejected": -5.413964748382568, + "step": 6650 + }, + { + "epoch": 1.679166797970567, + "grad_norm": 53.878028869628906, + "learning_rate": 2.41493406722408e-07, + "logits/chosen": null, + "logits/rejected": -1.054284691810608, + "logps/chosen": -264.5562438964844, + "logps/rejected": -327.29998779296875, + "loss": 0.2667, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.1799819469451904, + "rewards/margins": 3.6225829124450684, + "rewards/rejected": -4.803124904632568, + "step": 6660 + }, + { + "epoch": 1.681687832855387, + "grad_norm": 15.097171783447266, + "learning_rate": 2.407604005231163e-07, + "logits/chosen": -1.246667504310608, + "logits/rejected": -1.18994140625, + "logps/chosen": -295.9312438964844, + "logps/rejected": -312.6937561035156, + "loss": 0.2059, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -0.8031219244003296, + "rewards/margins": 3.8587403297424316, + "rewards/rejected": -4.663183689117432, + "step": 6670 + }, + { + "epoch": 1.6842088677402074, + "grad_norm": 58.274845123291016, + "learning_rate": 2.4002747385471686e-07, + "logits/chosen": -1.180334448814392, + "logits/rejected": -1.1510741710662842, + "logps/chosen": -267.84375, + "logps/rejected": -305.1499938964844, + "loss": 0.246, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.9923461675643921, + "rewards/margins": 3.7740235328674316, + "rewards/rejected": -4.76513671875, + "step": 6680 + }, + { + "epoch": 1.6867299026250275, + "grad_norm": 33.82486343383789, + "learning_rate": 2.392946330259583e-07, + "logits/chosen": -1.151635766029358, + "logits/rejected": -1.0900757312774658, + "logps/chosen": -299.1499938964844, + "logps/rejected": -308.9750061035156, + "loss": 0.1602, + "rewards/accuracies": 0.9312499761581421, + 
"rewards/chosen": -0.610064685344696, + "rewards/margins": 4.367919921875, + "rewards/rejected": -4.980175971984863, + "step": 6690 + }, + { + "epoch": 1.689250937509848, + "grad_norm": 43.90882873535156, + "learning_rate": 2.385618843448507e-07, + "logits/chosen": -1.201269507408142, + "logits/rejected": -1.0997803211212158, + "logps/chosen": -284.8187561035156, + "logps/rejected": -289.1499938964844, + "loss": 0.2367, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.1014130115509033, + "rewards/margins": 3.6915040016174316, + "rewards/rejected": -4.791325569152832, + "step": 6700 + }, + { + "epoch": 1.691771972394668, + "grad_norm": 44.88566589355469, + "learning_rate": 2.378292341186107e-07, + "logits/chosen": -1.1644287109375, + "logits/rejected": -1.15838623046875, + "logps/chosen": -302.82501220703125, + "logps/rejected": -298.9937438964844, + "loss": 0.2156, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.352014183998108, + "rewards/margins": 3.757251024246216, + "rewards/rejected": -5.109570503234863, + "step": 6710 + }, + { + "epoch": 1.6942930072794882, + "grad_norm": 73.87053680419922, + "learning_rate": 2.370966886536074e-07, + "logits/chosen": -1.1813232898712158, + "logits/rejected": -1.1998291015625, + "logps/chosen": -316.10626220703125, + "logps/rejected": -330.23748779296875, + "loss": 0.2489, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.630670189857483, + "rewards/margins": 4.134839057922363, + "rewards/rejected": -5.764404296875, + "step": 6720 + }, + { + "epoch": 1.6968140421643083, + "grad_norm": 42.54814147949219, + "learning_rate": 2.3636425425530857e-07, + "logits/chosen": -1.15771484375, + "logits/rejected": -1.0667235851287842, + "logps/chosen": -333.2749938964844, + "logps/rejected": -339.6875, + "loss": 0.2005, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.386584520339966, + "rewards/margins": 3.863940477371216, + "rewards/rejected": -6.252050876617432, + 
"step": 6730 + }, + { + "epoch": 1.6993350770491287, + "grad_norm": 59.08740997314453, + "learning_rate": 2.3563193722822555e-07, + "logits/chosen": -1.107031226158142, + "logits/rejected": -1.1286437511444092, + "logps/chosen": -306.78125, + "logps/rejected": -313.9624938964844, + "loss": 0.252, + "rewards/accuracies": 0.890625, + "rewards/chosen": -2.3460936546325684, + "rewards/margins": 3.794116258621216, + "rewards/rejected": -6.141406059265137, + "step": 6740 + }, + { + "epoch": 1.701856111933949, + "grad_norm": 55.42203140258789, + "learning_rate": 2.3489974387585964e-07, + "logits/chosen": -1.159082055091858, + "logits/rejected": -1.0864746570587158, + "logps/chosen": -309.65625, + "logps/rejected": -326.42498779296875, + "loss": 0.21, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.3440918922424316, + "rewards/margins": 3.704394578933716, + "rewards/rejected": -6.046679496765137, + "step": 6750 + }, + { + "epoch": 1.7043771468187692, + "grad_norm": 34.926353454589844, + "learning_rate": 2.3416768050064739e-07, + "logits/chosen": -1.160681128501892, + "logits/rejected": -1.121313452720642, + "logps/chosen": -313.40625, + "logps/rejected": -317.76873779296875, + "loss": 0.1959, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.807916283607483, + "rewards/margins": 4.132616996765137, + "rewards/rejected": -5.940625190734863, + "step": 6760 + }, + { + "epoch": 1.7068981817035893, + "grad_norm": 58.994178771972656, + "learning_rate": 2.334357534039069e-07, + "logits/chosen": -1.1013672351837158, + "logits/rejected": -1.03985595703125, + "logps/chosen": -296.45001220703125, + "logps/rejected": -308.60626220703125, + "loss": 0.2025, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.033160448074341, + "rewards/margins": 3.9764161109924316, + "rewards/rejected": -6.009765625, + "step": 6770 + }, + { + "epoch": 1.7094192165884095, + "grad_norm": 52.459075927734375, + "learning_rate": 2.3270396888578283e-07, + 
"logits/chosen": -1.0966370105743408, + "logits/rejected": -1.0048949718475342, + "logps/chosen": -287.21875, + "logps/rejected": -310.25, + "loss": 0.226, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.9747803211212158, + "rewards/margins": 3.9398193359375, + "rewards/rejected": -5.914453029632568, + "step": 6780 + }, + { + "epoch": 1.7119402514732298, + "grad_norm": 52.806907653808594, + "learning_rate": 2.3197233324519274e-07, + "logits/chosen": -1.19219970703125, + "logits/rejected": null, + "logps/chosen": -300.26251220703125, + "logps/rejected": -328.5, + "loss": 0.2229, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.9263885021209717, + "rewards/margins": 3.865185499191284, + "rewards/rejected": -5.791015625, + "step": 6790 + }, + { + "epoch": 1.71446128635805, + "grad_norm": 54.634307861328125, + "learning_rate": 2.312408527797729e-07, + "logits/chosen": -1.1331787109375, + "logits/rejected": -1.1707031726837158, + "logps/chosen": -316.3500061035156, + "logps/rejected": -334.4375, + "loss": 0.1708, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.149493455886841, + "rewards/margins": 4.013085842132568, + "rewards/rejected": -6.160351753234863, + "step": 6800 + }, + { + "epoch": 1.7169823212428703, + "grad_norm": 36.946712493896484, + "learning_rate": 2.305095337858236e-07, + "logits/chosen": -1.110748291015625, + "logits/rejected": -1.109460473060608, + "logps/chosen": -315.21563720703125, + "logps/rejected": -321.29998779296875, + "loss": 0.2232, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.0787720680236816, + "rewards/margins": 4.053857326507568, + "rewards/rejected": -6.132226467132568, + "step": 6810 + }, + { + "epoch": 1.7195033561276905, + "grad_norm": 60.76621627807617, + "learning_rate": 2.2977838255825545e-07, + "logits/chosen": -1.2317078113555908, + "logits/rejected": -1.173803687095642, + "logps/chosen": -313.70001220703125, + "logps/rejected": -329.4750061035156, + "loss": 0.1947, + 
"rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.5935790538787842, + "rewards/margins": 4.097851753234863, + "rewards/rejected": -5.691992282867432, + "step": 6820 + }, + { + "epoch": 1.7220243910125106, + "grad_norm": 36.37934875488281, + "learning_rate": 2.2904740539053477e-07, + "logits/chosen": -1.1745116710662842, + "logits/rejected": -1.1514892578125, + "logps/chosen": -292.6812438964844, + "logps/rejected": -274.5375061035156, + "loss": 0.2504, + "rewards/accuracies": 0.8656250238418579, + "rewards/chosen": -1.1040496826171875, + "rewards/margins": 3.497354030609131, + "rewards/rejected": -4.602148532867432, + "step": 6830 + }, + { + "epoch": 1.7245454258973307, + "grad_norm": 44.25069046020508, + "learning_rate": 2.2831660857462998e-07, + "logits/chosen": -1.1512359380722046, + "logits/rejected": -1.1413360834121704, + "logps/chosen": -301.2749938964844, + "logps/rejected": -314.3812561035156, + "loss": 0.2143, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -0.8554443120956421, + "rewards/margins": 3.899218797683716, + "rewards/rejected": -4.75341796875, + "step": 6840 + }, + { + "epoch": 1.727066460782151, + "grad_norm": 61.40363693237305, + "learning_rate": 2.275859984009568e-07, + "logits/chosen": -1.1895751953125, + "logits/rejected": -1.2206542491912842, + "logps/chosen": -273.83123779296875, + "logps/rejected": -304.70001220703125, + "loss": 0.2551, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.072595238685608, + "rewards/margins": 3.430371046066284, + "rewards/rejected": -4.502050876617432, + "step": 6850 + }, + { + "epoch": 1.7295874956669715, + "grad_norm": 54.41880416870117, + "learning_rate": 2.2685558115832445e-07, + "logits/chosen": -1.2162597179412842, + "logits/rejected": -1.142572045326233, + "logps/chosen": -289.38751220703125, + "logps/rejected": -313.3999938964844, + "loss": 0.2022, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.388641357421875, + "rewards/margins": 
3.712695360183716, + "rewards/rejected": -5.100781440734863, + "step": 6860 + }, + { + "epoch": 1.7321085305517916, + "grad_norm": 14.012011528015137, + "learning_rate": 2.2612536313388172e-07, + "logits/chosen": -1.1832396984100342, + "logits/rejected": -1.1500122547149658, + "logps/chosen": -291.01873779296875, + "logps/rejected": -322.83123779296875, + "loss": 0.1987, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.426141381263733, + "rewards/margins": 3.859081983566284, + "rewards/rejected": -5.286035060882568, + "step": 6870 + }, + { + "epoch": 1.7346295654366117, + "grad_norm": 24.559688568115234, + "learning_rate": 2.253953506130622e-07, + "logits/chosen": -1.149999976158142, + "logits/rejected": -1.1400635242462158, + "logps/chosen": -274.54998779296875, + "logps/rejected": -305.1812438964844, + "loss": 0.1856, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.602081298828125, + "rewards/margins": 3.8511719703674316, + "rewards/rejected": -5.452343940734863, + "step": 6880 + }, + { + "epoch": 1.7371506003214319, + "grad_norm": 85.66828918457031, + "learning_rate": 2.2466554987953107e-07, + "logits/chosen": -1.1237976551055908, + "logits/rejected": -1.166357398033142, + "logps/chosen": -283.7124938964844, + "logps/rejected": -311.38751220703125, + "loss": 0.2423, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.662532091140747, + "rewards/margins": 3.855029344558716, + "rewards/rejected": -5.517675876617432, + "step": 6890 + }, + { + "epoch": 1.739671635206252, + "grad_norm": 81.61643981933594, + "learning_rate": 2.2393596721512994e-07, + "logits/chosen": -1.1964905261993408, + "logits/rejected": -1.189306616783142, + "logps/chosen": -296.73748779296875, + "logps/rejected": -299.70001220703125, + "loss": 0.2583, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.5518035888671875, + "rewards/margins": 4.013427734375, + "rewards/rejected": -5.566601753234863, + "step": 6900 + }, + { + "epoch": 
1.7421926700910724, + "grad_norm": 64.79661560058594, + "learning_rate": 2.23206608899824e-07, + "logits/chosen": -1.2158203125, + "logits/rejected": -1.1887085437774658, + "logps/chosen": -299.1499938964844, + "logps/rejected": -308.51873779296875, + "loss": 0.2358, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.6992003917694092, + "rewards/margins": 3.9305663108825684, + "rewards/rejected": -5.630468845367432, + "step": 6910 + }, + { + "epoch": 1.7447137049758927, + "grad_norm": 24.7640380859375, + "learning_rate": 2.2247748121164686e-07, + "logits/chosen": -1.194635033607483, + "logits/rejected": -1.178918480873108, + "logps/chosen": -289.7437438964844, + "logps/rejected": -308.0874938964844, + "loss": 0.2048, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5421264171600342, + "rewards/margins": 3.921142578125, + "rewards/rejected": -5.463183403015137, + "step": 6920 + }, + { + "epoch": 1.7472347398607129, + "grad_norm": 44.29692840576172, + "learning_rate": 2.2174859042664706e-07, + "logits/chosen": -1.230371117591858, + "logits/rejected": null, + "logps/chosen": -315.3125, + "logps/rejected": -304.9125061035156, + "loss": 0.1659, + "rewards/accuracies": 0.934374988079071, + "rewards/chosen": -1.4905884265899658, + "rewards/margins": 3.768798828125, + "rewards/rejected": -5.2587890625, + "step": 6930 + }, + { + "epoch": 1.749755774745533, + "grad_norm": 47.58283996582031, + "learning_rate": 2.210199428188343e-07, + "logits/chosen": -1.18408203125, + "logits/rejected": -1.138671875, + "logps/chosen": -312.3999938964844, + "logps/rejected": -308.0062561035156, + "loss": 0.2728, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.701696753501892, + "rewards/margins": 3.532910108566284, + "rewards/rejected": -5.235449314117432, + "step": 6940 + }, + { + "epoch": 1.7522768096303531, + "grad_norm": 32.884429931640625, + "learning_rate": 2.2029154466012466e-07, + "logits/chosen": -1.1586425304412842, + "logits/rejected": 
-1.135107398033142, + "logps/chosen": -281.6968688964844, + "logps/rejected": -332.04998779296875, + "loss": 0.1999, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.636499047279358, + "rewards/margins": 3.898730516433716, + "rewards/rejected": -5.537207126617432, + "step": 6950 + }, + { + "epoch": 1.7547978445151735, + "grad_norm": 38.59349822998047, + "learning_rate": 2.1956340222028732e-07, + "logits/chosen": -1.208898901939392, + "logits/rejected": -1.2027466297149658, + "logps/chosen": -309.78125, + "logps/rejected": -323.6875, + "loss": 0.2094, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.5239684581756592, + "rewards/margins": 3.780078172683716, + "rewards/rejected": -5.302734375, + "step": 6960 + }, + { + "epoch": 1.7573188793999936, + "grad_norm": 60.351741790771484, + "learning_rate": 2.1883552176689016e-07, + "logits/chosen": -1.214746117591858, + "logits/rejected": -1.175744652748108, + "logps/chosen": -318.6625061035156, + "logps/rejected": -330.14373779296875, + "loss": 0.1817, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.6495177745819092, + "rewards/margins": 4.301953315734863, + "rewards/rejected": -5.952538967132568, + "step": 6970 + }, + { + "epoch": 1.759839914284814, + "grad_norm": 22.56215476989746, + "learning_rate": 2.181079095652463e-07, + "logits/chosen": -1.1741211414337158, + "logits/rejected": -1.136987328529358, + "logps/chosen": -295.79376220703125, + "logps/rejected": -296.2437438964844, + "loss": 0.1906, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.6622803211212158, + "rewards/margins": 3.964404344558716, + "rewards/rejected": -5.62890625, + "step": 6980 + }, + { + "epoch": 1.7623609491696342, + "grad_norm": 23.98560333251953, + "learning_rate": 2.1738057187835952e-07, + "logits/chosen": -1.173437476158142, + "logits/rejected": -1.0842773914337158, + "logps/chosen": -303.7749938964844, + "logps/rejected": -305.2749938964844, + "loss": 0.1738, + 
"rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -1.3464279174804688, + "rewards/margins": 4.083984375, + "rewards/rejected": -5.429883003234863, + "step": 6990 + }, + { + "epoch": 1.7648819840544543, + "grad_norm": 36.42485427856445, + "learning_rate": 2.1665351496687068e-07, + "logits/chosen": -1.077172875404358, + "logits/rejected": -1.1149780750274658, + "logps/chosen": -291.0249938964844, + "logps/rejected": -330.3500061035156, + "loss": 0.2611, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8449188470840454, + "rewards/margins": 3.899365186691284, + "rewards/rejected": -5.742578029632568, + "step": 7000 + }, + { + "epoch": 1.7674030189392744, + "grad_norm": 41.8332633972168, + "learning_rate": 2.159267450890042e-07, + "logits/chosen": -1.1143372058868408, + "logits/rejected": null, + "logps/chosen": -282.1625061035156, + "logps/rejected": -318.3125, + "loss": 0.2058, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.282708764076233, + "rewards/margins": 4.120995998382568, + "rewards/rejected": -5.402539253234863, + "step": 7010 + }, + { + "epoch": 1.7699240538240948, + "grad_norm": 52.472660064697266, + "learning_rate": 2.1520026850051342e-07, + "logits/chosen": -1.2456543445587158, + "logits/rejected": -1.1812255382537842, + "logps/chosen": -307.08123779296875, + "logps/rejected": -330.25, + "loss": 0.2573, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.2497161626815796, + "rewards/margins": 3.5826783180236816, + "rewards/rejected": -4.832129001617432, + "step": 7020 + }, + { + "epoch": 1.7724450887089152, + "grad_norm": 40.5081901550293, + "learning_rate": 2.1447409145462742e-07, + "logits/chosen": -1.233862280845642, + "logits/rejected": -1.2039916515350342, + "logps/chosen": -282.20001220703125, + "logps/rejected": -295.9437561035156, + "loss": 0.2211, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.7961761355400085, + "rewards/margins": 3.987255811691284, + "rewards/rejected": 
-4.781152248382568, + "step": 7030 + }, + { + "epoch": 1.7749661235937353, + "grad_norm": 42.15639877319336, + "learning_rate": 2.1374822020199668e-07, + "logits/chosen": -1.2635376453399658, + "logits/rejected": -1.2142455577850342, + "logps/chosen": -317.4125061035156, + "logps/rejected": -310.29998779296875, + "loss": 0.1602, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -0.8376251459121704, + "rewards/margins": 4.2254638671875, + "rewards/rejected": -5.0625, + "step": 7040 + }, + { + "epoch": 1.7774871584785554, + "grad_norm": 40.584781646728516, + "learning_rate": 2.130226609906399e-07, + "logits/chosen": -1.2356750965118408, + "logits/rejected": -1.185400366783142, + "logps/chosen": -303.4437561035156, + "logps/rejected": -303.38751220703125, + "loss": 0.2227, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.0569336414337158, + "rewards/margins": 3.962695360183716, + "rewards/rejected": -5.021484375, + "step": 7050 + }, + { + "epoch": 1.7800081933633756, + "grad_norm": 71.22672271728516, + "learning_rate": 2.1229742006588953e-07, + "logits/chosen": -1.1936523914337158, + "logits/rejected": -1.193823218345642, + "logps/chosen": -293.42498779296875, + "logps/rejected": -318.6000061035156, + "loss": 0.1658, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1354248523712158, + "rewards/margins": 4.288598537445068, + "rewards/rejected": -5.423437595367432, + "step": 7060 + }, + { + "epoch": 1.782529228248196, + "grad_norm": 61.99361801147461, + "learning_rate": 2.115725036703383e-07, + "logits/chosen": -1.1898071765899658, + "logits/rejected": -1.1717712879180908, + "logps/chosen": -302.58123779296875, + "logps/rejected": -342.4750061035156, + "loss": 0.21, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.127203345298767, + "rewards/margins": 4.528173923492432, + "rewards/rejected": -5.654199123382568, + "step": 7070 + }, + { + "epoch": 1.785050263133016, + "grad_norm": 68.46553802490234, + "learning_rate": 
2.1084791804378592e-07, + "logits/chosen": -1.232263207435608, + "logits/rejected": -1.1902587413787842, + "logps/chosen": -329.109375, + "logps/rejected": -313.3062438964844, + "loss": 0.2801, + "rewards/accuracies": 0.878125011920929, + "rewards/chosen": -2.0232481956481934, + "rewards/margins": 3.694580078125, + "rewards/rejected": -5.715234279632568, + "step": 7080 + }, + { + "epoch": 1.7875712980178364, + "grad_norm": 20.95526885986328, + "learning_rate": 2.101236694231845e-07, + "logits/chosen": -1.1466553211212158, + "logits/rejected": -1.16998291015625, + "logps/chosen": -309.26251220703125, + "logps/rejected": -317.2875061035156, + "loss": 0.2012, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.7224304676055908, + "rewards/margins": 4.093945503234863, + "rewards/rejected": -5.818945407867432, + "step": 7090 + }, + { + "epoch": 1.7900923329026566, + "grad_norm": 32.11468505859375, + "learning_rate": 2.0939976404258567e-07, + "logits/chosen": -1.24072265625, + "logits/rejected": -1.1890380382537842, + "logps/chosen": -319.48748779296875, + "logps/rejected": -315.75, + "loss": 0.2403, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.635186791419983, + "rewards/margins": 3.9652342796325684, + "rewards/rejected": -5.6015625, + "step": 7100 + }, + { + "epoch": 1.7926133677874767, + "grad_norm": 45.03824234008789, + "learning_rate": 2.086762081330863e-07, + "logits/chosen": -1.1531493663787842, + "logits/rejected": -1.1338379383087158, + "logps/chosen": -310.3374938964844, + "logps/rejected": -337.125, + "loss": 0.1384, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.635858178138733, + "rewards/margins": 4.368029594421387, + "rewards/rejected": -6.004687309265137, + "step": 7110 + }, + { + "epoch": 1.7951344026722968, + "grad_norm": 51.74740982055664, + "learning_rate": 2.079530079227755e-07, + "logits/chosen": -1.1532471179962158, + "logits/rejected": -1.1278960704803467, + "logps/chosen": -303.3062438964844, 
+ "logps/rejected": -322.4125061035156, + "loss": 0.2002, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.971154808998108, + "rewards/margins": 4.110644340515137, + "rewards/rejected": -6.081445217132568, + "step": 7120 + }, + { + "epoch": 1.7976554375571172, + "grad_norm": 39.18357467651367, + "learning_rate": 2.072301696366803e-07, + "logits/chosen": -1.1811370849609375, + "logits/rejected": -1.1466064453125, + "logps/chosen": -326.42498779296875, + "logps/rejected": -306.10626220703125, + "loss": 0.236, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -2.0232481956481934, + "rewards/margins": 3.9530272483825684, + "rewards/rejected": -5.977343559265137, + "step": 7130 + }, + { + "epoch": 1.8001764724419376, + "grad_norm": 21.702817916870117, + "learning_rate": 2.0650769949671257e-07, + "logits/chosen": -1.1583983898162842, + "logits/rejected": -1.0685821771621704, + "logps/chosen": -319.9750061035156, + "logps/rejected": -321.7875061035156, + "loss": 0.1995, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.6598937511444092, + "rewards/margins": 4.170605659484863, + "rewards/rejected": -5.830078125, + "step": 7140 + }, + { + "epoch": 1.8026975073267577, + "grad_norm": 70.05231475830078, + "learning_rate": 2.057856037216155e-07, + "logits/chosen": -1.197509765625, + "logits/rejected": -1.1431152820587158, + "logps/chosen": -328.5406188964844, + "logps/rejected": -326.84375, + "loss": 0.2782, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.883947730064392, + "rewards/margins": 3.9864501953125, + "rewards/rejected": -5.870703220367432, + "step": 7150 + }, + { + "epoch": 1.8052185422115778, + "grad_norm": 51.588199615478516, + "learning_rate": 2.0506388852690958e-07, + "logits/chosen": -1.1375305652618408, + "logits/rejected": -1.0667235851287842, + "logps/chosen": -309.46875, + "logps/rejected": -319.26251220703125, + "loss": 0.2137, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": 
-1.923803687095642, + "rewards/margins": 3.979199171066284, + "rewards/rejected": -5.904296875, + "step": 7160 + }, + { + "epoch": 1.807739577096398, + "grad_norm": 50.91193389892578, + "learning_rate": 2.043425601248397e-07, + "logits/chosen": -1.1796386241912842, + "logits/rejected": -1.147558569908142, + "logps/chosen": -305.07501220703125, + "logps/rejected": -349.6000061035156, + "loss": 0.2098, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.8881622552871704, + "rewards/margins": 4.063036918640137, + "rewards/rejected": -5.951171875, + "step": 7170 + }, + { + "epoch": 1.8102606119812181, + "grad_norm": 30.04096221923828, + "learning_rate": 2.03621624724321e-07, + "logits/chosen": -1.219384789466858, + "logits/rejected": -1.1800323724746704, + "logps/chosen": -315.734375, + "logps/rejected": -317.91876220703125, + "loss": 0.1466, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.241387963294983, + "rewards/margins": 4.197802543640137, + "rewards/rejected": -5.439135551452637, + "step": 7180 + }, + { + "epoch": 1.8127816468660385, + "grad_norm": 68.7504653930664, + "learning_rate": 2.0290108853088634e-07, + "logits/chosen": -1.182397484779358, + "logits/rejected": -1.1044189929962158, + "logps/chosen": -298.3062438964844, + "logps/rejected": -339.58123779296875, + "loss": 0.2239, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.3178939819335938, + "rewards/margins": 3.9729981422424316, + "rewards/rejected": -5.290625095367432, + "step": 7190 + }, + { + "epoch": 1.8153026817508588, + "grad_norm": 36.92831039428711, + "learning_rate": 2.0218095774663197e-07, + "logits/chosen": -1.19891357421875, + "logits/rejected": -1.244384765625, + "logps/chosen": -290.9375, + "logps/rejected": -325.7875061035156, + "loss": 0.2379, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.2448089122772217, + "rewards/margins": 3.6868653297424316, + "rewards/rejected": -4.9296875, + "step": 7200 + }, + { + "epoch": 1.817823716635679, + "grad_norm": 
50.52592086791992, + "learning_rate": 2.0146123857016453e-07, + "logits/chosen": -1.220617651939392, + "logits/rejected": -1.1038939952850342, + "logps/chosen": -315.75, + "logps/rejected": -317.3125, + "loss": 0.1802, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.163214087486267, + "rewards/margins": 4.184423923492432, + "rewards/rejected": -5.347754001617432, + "step": 7210 + }, + { + "epoch": 1.8203447515204991, + "grad_norm": 29.57152557373047, + "learning_rate": 2.0074193719654803e-07, + "logits/chosen": -1.17822265625, + "logits/rejected": -1.1136901378631592, + "logps/chosen": -299.57501220703125, + "logps/rejected": -310.375, + "loss": 0.1828, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -0.9800170660018921, + "rewards/margins": 4.062353610992432, + "rewards/rejected": -5.04345703125, + "step": 7220 + }, + { + "epoch": 1.8228657864053193, + "grad_norm": 53.481231689453125, + "learning_rate": 2.0002305981724983e-07, + "logits/chosen": -1.1823852062225342, + "logits/rejected": null, + "logps/chosen": -307.79998779296875, + "logps/rejected": -343.0375061035156, + "loss": 0.2837, + "rewards/accuracies": 0.871874988079071, + "rewards/chosen": -0.9479309320449829, + "rewards/margins": 3.863085985183716, + "rewards/rejected": -4.809912204742432, + "step": 7230 + }, + { + "epoch": 1.8253868212901396, + "grad_norm": 49.503753662109375, + "learning_rate": 1.99304612620088e-07, + "logits/chosen": -1.165826439857483, + "logits/rejected": -1.1661498546600342, + "logps/chosen": -316.64373779296875, + "logps/rejected": -328.20001220703125, + "loss": 0.1966, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.039465308189392, + "rewards/margins": 4.062207221984863, + "rewards/rejected": -5.102246284484863, + "step": 7240 + }, + { + "epoch": 1.8279078561749598, + "grad_norm": 37.43131637573242, + "learning_rate": 1.9858660178917743e-07, + "logits/chosen": null, + "logits/rejected": -1.1943480968475342, + "logps/chosen": 
-300.20623779296875, + "logps/rejected": -325.29376220703125, + "loss": 0.1436, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.8471435308456421, + "rewards/margins": 4.044335842132568, + "rewards/rejected": -4.891406059265137, + "step": 7250 + }, + { + "epoch": 1.8304288910597801, + "grad_norm": 38.07368087768555, + "learning_rate": 1.9786903350487737e-07, + "logits/chosen": -1.1146240234375, + "logits/rejected": NaN, + "logps/chosen": -289.29998779296875, + "logps/rejected": -323.75, + "loss": 0.2243, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.232031226158142, + "rewards/margins": 3.966357469558716, + "rewards/rejected": -5.198339939117432, + "step": 7260 + }, + { + "epoch": 1.8329499259446003, + "grad_norm": 37.894126892089844, + "learning_rate": 1.9715191394373745e-07, + "logits/chosen": -1.1885559558868408, + "logits/rejected": -1.108056664466858, + "logps/chosen": -310.34375, + "logps/rejected": -295.4624938964844, + "loss": 0.2932, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.688812255859375, + "rewards/margins": 3.6373047828674316, + "rewards/rejected": -5.326367378234863, + "step": 7270 + }, + { + "epoch": 1.8354709608294204, + "grad_norm": 18.592529296875, + "learning_rate": 1.964352492784449e-07, + "logits/chosen": -1.239660620689392, + "logits/rejected": -1.174230933189392, + "logps/chosen": -329.13751220703125, + "logps/rejected": -322.2562561035156, + "loss": 0.2346, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.550012230873108, + "rewards/margins": 3.8741211891174316, + "rewards/rejected": -5.4228515625, + "step": 7280 + }, + { + "epoch": 1.8379919957142405, + "grad_norm": 65.15435028076172, + "learning_rate": 1.957190456777717e-07, + "logits/chosen": -1.218542456626892, + "logits/rejected": -1.170739769935608, + "logps/chosen": -336.3999938964844, + "logps/rejected": -329.73748779296875, + "loss": 0.2284, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": 
-1.6474609375, + "rewards/margins": 3.7369141578674316, + "rewards/rejected": -5.384375095367432, + "step": 7290 + }, + { + "epoch": 1.840513030599061, + "grad_norm": 48.60396957397461, + "learning_rate": 1.9500330930652073e-07, + "logits/chosen": -1.1867187023162842, + "logits/rejected": -1.112945556640625, + "logps/chosen": -315.9624938964844, + "logps/rejected": -312.76873779296875, + "loss": 0.2434, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.55596923828125, + "rewards/margins": 3.893359422683716, + "rewards/rejected": -5.448534965515137, + "step": 7300 + }, + { + "epoch": 1.8430340654838813, + "grad_norm": 49.56249237060547, + "learning_rate": 1.9428804632547348e-07, + "logits/chosen": -1.152978539466858, + "logits/rejected": -1.1125946044921875, + "logps/chosen": -302.4312438964844, + "logps/rejected": -322.86248779296875, + "loss": 0.2171, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.0119261741638184, + "rewards/margins": 3.822216749191284, + "rewards/rejected": -5.832226753234863, + "step": 7310 + }, + { + "epoch": 1.8455551003687014, + "grad_norm": 31.021007537841797, + "learning_rate": 1.9357326289133635e-07, + "logits/chosen": -1.170019507408142, + "logits/rejected": -1.130346655845642, + "logps/chosen": -294.8999938964844, + "logps/rejected": -314.86248779296875, + "loss": 0.2278, + "rewards/accuracies": 0.8843749761581421, + "rewards/chosen": -1.97314453125, + "rewards/margins": 3.8570313453674316, + "rewards/rejected": -5.830664157867432, + "step": 7320 + }, + { + "epoch": 1.8480761352535215, + "grad_norm": 27.7810001373291, + "learning_rate": 1.9285896515668841e-07, + "logits/chosen": -1.089074730873108, + "logits/rejected": -1.113073706626892, + "logps/chosen": -300.5, + "logps/rejected": -339.4750061035156, + "loss": 0.2816, + "rewards/accuracies": 0.871874988079071, + "rewards/chosen": -2.132189989089966, + "rewards/margins": 3.7269043922424316, + "rewards/rejected": -5.856249809265137, + "step": 7330 + 
}, + { + "epoch": 1.8505971701383417, + "grad_norm": 63.46874237060547, + "learning_rate": 1.9214515926992775e-07, + "logits/chosen": -1.1709716320037842, + "logits/rejected": -1.149572730064392, + "logps/chosen": -297.88751220703125, + "logps/rejected": -338.7124938964844, + "loss": 0.2044, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.0968995094299316, + "rewards/margins": 3.894238233566284, + "rewards/rejected": -5.992383003234863, + "step": 7340 + }, + { + "epoch": 1.853118205023162, + "grad_norm": 48.2232780456543, + "learning_rate": 1.9143185137521863e-07, + "logits/chosen": -1.1376526355743408, + "logits/rejected": -1.178442358970642, + "logps/chosen": -320.36248779296875, + "logps/rejected": -307.8125, + "loss": 0.22, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.9373047351837158, + "rewards/margins": 3.641845703125, + "rewards/rejected": -5.579297065734863, + "step": 7350 + }, + { + "epoch": 1.8556392399079822, + "grad_norm": 56.41461181640625, + "learning_rate": 1.9071904761243935e-07, + "logits/chosen": -1.116979956626892, + "logits/rejected": -1.0767822265625, + "logps/chosen": -292.09375, + "logps/rejected": -312.11248779296875, + "loss": 0.2359, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.8706543445587158, + "rewards/margins": 3.8099608421325684, + "rewards/rejected": -5.6826171875, + "step": 7360 + }, + { + "epoch": 1.8581602747928025, + "grad_norm": 42.00067138671875, + "learning_rate": 1.9000675411712827e-07, + "logits/chosen": -1.0812804698944092, + "logits/rejected": -1.056420922279358, + "logps/chosen": -288.96875, + "logps/rejected": -306.2124938964844, + "loss": 0.1695, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.909002661705017, + "rewards/margins": 3.8208985328674316, + "rewards/rejected": -5.732226371765137, + "step": 7370 + }, + { + "epoch": 1.8606813096776227, + "grad_norm": 32.646976470947266, + "learning_rate": 1.8929497702043194e-07, + "logits/chosen": 
-1.1477782726287842, + "logits/rejected": -1.127862572669983, + "logps/chosen": -305.01873779296875, + "logps/rejected": -340.20001220703125, + "loss": 0.2013, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.578704833984375, + "rewards/margins": 3.863208055496216, + "rewards/rejected": -5.444238185882568, + "step": 7380 + }, + { + "epoch": 1.8632023445624428, + "grad_norm": 35.01163864135742, + "learning_rate": 1.8858372244905162e-07, + "logits/chosen": -1.206610083580017, + "logits/rejected": -1.176977515220642, + "logps/chosen": -310.64373779296875, + "logps/rejected": -320.23748779296875, + "loss": 0.2215, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.5214020013809204, + "rewards/margins": 3.83935546875, + "rewards/rejected": -5.360253810882568, + "step": 7390 + }, + { + "epoch": 1.865723379447263, + "grad_norm": 60.340877532958984, + "learning_rate": 1.878729965251913e-07, + "logits/chosen": -1.2282226085662842, + "logits/rejected": -1.130517601966858, + "logps/chosen": -297.16876220703125, + "logps/rejected": -297.3999938964844, + "loss": 0.2292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.558538794517517, + "rewards/margins": 3.6513671875, + "rewards/rejected": -5.211328029632568, + "step": 7400 + }, + { + "epoch": 1.8682444143320833, + "grad_norm": 27.210193634033203, + "learning_rate": 1.871628053665043e-07, + "logits/chosen": -1.1539306640625, + "logits/rejected": -1.116418480873108, + "logps/chosen": -281.4312438964844, + "logps/rejected": -301.7250061035156, + "loss": 0.2023, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.800054907798767, + "rewards/margins": 3.984326124191284, + "rewards/rejected": -5.78125, + "step": 7410 + }, + { + "epoch": 1.8707654492169037, + "grad_norm": 62.806583404541016, + "learning_rate": 1.864531550860407e-07, + "logits/chosen": -1.128961205482483, + "logits/rejected": -1.1193358898162842, + "logps/chosen": -315.125, + "logps/rejected": 
-343.7124938964844, + "loss": 0.2706, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.7468230724334717, + "rewards/margins": 4.126611232757568, + "rewards/rejected": -5.873632907867432, + "step": 7420 + }, + { + "epoch": 1.8732864841017238, + "grad_norm": 52.17653274536133, + "learning_rate": 1.8574405179219548e-07, + "logits/chosen": -1.184814453125, + "logits/rejected": -1.18505859375, + "logps/chosen": -291.1875, + "logps/rejected": -314.29998779296875, + "loss": 0.237, + "rewards/accuracies": 0.909375011920929, + "rewards/chosen": -1.6356322765350342, + "rewards/margins": 3.832763671875, + "rewards/rejected": -5.46728515625, + "step": 7430 + }, + { + "epoch": 1.875807518986544, + "grad_norm": 56.88821792602539, + "learning_rate": 1.8503550158865476e-07, + "logits/chosen": -1.171630859375, + "logits/rejected": -1.1357085704803467, + "logps/chosen": -308.3500061035156, + "logps/rejected": -325.875, + "loss": 0.2269, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.572625756263733, + "rewards/margins": 3.9573731422424316, + "rewards/rejected": -5.529394626617432, + "step": 7440 + }, + { + "epoch": 1.878328553871364, + "grad_norm": 76.11968994140625, + "learning_rate": 1.8432751057434438e-07, + "logits/chosen": -1.2202575206756592, + "logits/rejected": -1.1420433521270752, + "logps/chosen": -300.82501220703125, + "logps/rejected": -312.4624938964844, + "loss": 0.2412, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3674484491348267, + "rewards/margins": 3.9940428733825684, + "rewards/rejected": -5.360547065734863, + "step": 7450 + }, + { + "epoch": 1.8808495887561845, + "grad_norm": 48.39967346191406, + "learning_rate": 1.8362008484337637e-07, + "logits/chosen": -1.162255883216858, + "logits/rejected": -1.19366455078125, + "logps/chosen": -280.67498779296875, + "logps/rejected": -331.71875, + "loss": 0.1942, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.4645812511444092, + "rewards/margins": 
3.917285203933716, + "rewards/rejected": -5.380078315734863, + "step": 7460 + }, + { + "epoch": 1.8833706236410046, + "grad_norm": 56.637794494628906, + "learning_rate": 1.8291323048499762e-07, + "logits/chosen": -1.2105224132537842, + "logits/rejected": -1.1904785633087158, + "logps/chosen": -301.11248779296875, + "logps/rejected": -297.3374938964844, + "loss": 0.2696, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.576446533203125, + "rewards/margins": 3.677197217941284, + "rewards/rejected": -5.254296779632568, + "step": 7470 + }, + { + "epoch": 1.885891658525825, + "grad_norm": 29.607648849487305, + "learning_rate": 1.8220695358353643e-07, + "logits/chosen": -1.1474609375, + "logits/rejected": -1.118371605873108, + "logps/chosen": -305.5375061035156, + "logps/rejected": -324.7562561035156, + "loss": 0.2046, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.4046661853790283, + "rewards/margins": 4.186132907867432, + "rewards/rejected": -5.593359470367432, + "step": 7480 + }, + { + "epoch": 1.888412693410645, + "grad_norm": 13.99630069732666, + "learning_rate": 1.815012602183506e-07, + "logits/chosen": -1.153906226158142, + "logits/rejected": -1.085168480873108, + "logps/chosen": -288.42498779296875, + "logps/rejected": -315.51251220703125, + "loss": 0.2775, + "rewards/accuracies": 0.878125011920929, + "rewards/chosen": -1.6456924676895142, + "rewards/margins": 3.4505372047424316, + "rewards/rejected": -5.094628810882568, + "step": 7490 + }, + { + "epoch": 1.8909337282954652, + "grad_norm": 46.438785552978516, + "learning_rate": 1.8079615646377535e-07, + "logits/chosen": -1.2250487804412842, + "logits/rejected": -1.2243163585662842, + "logps/chosen": -297.53125, + "logps/rejected": -304.17498779296875, + "loss": 0.2278, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.40008544921875, + "rewards/margins": 3.6764159202575684, + "rewards/rejected": -5.075976371765137, + "step": 7500 + }, + { + "epoch": 
1.8934547631802854, + "grad_norm": 60.958229064941406, + "learning_rate": 1.800916483890705e-07, + "logits/chosen": -1.116601586341858, + "logits/rejected": -1.04278564453125, + "logps/chosen": -300.8812561035156, + "logps/rejected": -331.48748779296875, + "loss": 0.2387, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.576141357421875, + "rewards/margins": 4.094872951507568, + "rewards/rejected": -5.670312404632568, + "step": 7510 + }, + { + "epoch": 1.8959757980651057, + "grad_norm": 28.104612350463867, + "learning_rate": 1.793877420583686e-07, + "logits/chosen": -1.158361792564392, + "logits/rejected": -1.134667992591858, + "logps/chosen": -315.29998779296875, + "logps/rejected": -324.6875, + "loss": 0.2139, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.790228247642517, + "rewards/margins": 3.9569334983825684, + "rewards/rejected": -5.746289253234863, + "step": 7520 + }, + { + "epoch": 1.898496832949926, + "grad_norm": 43.907108306884766, + "learning_rate": 1.786844435306225e-07, + "logits/chosen": -1.1804687976837158, + "logits/rejected": -1.0574157238006592, + "logps/chosen": -294.1312561035156, + "logps/rejected": -276.9125061035156, + "loss": 0.1924, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.487634301185608, + "rewards/margins": 3.907885789871216, + "rewards/rejected": -5.393750190734863, + "step": 7530 + }, + { + "epoch": 1.9010178678347462, + "grad_norm": 47.292930603027344, + "learning_rate": 1.7798175885955364e-07, + "logits/chosen": -1.1721069812774658, + "logits/rejected": -1.195397973060608, + "logps/chosen": -311.875, + "logps/rejected": -292.3125, + "loss": 0.2577, + "rewards/accuracies": 0.871874988079071, + "rewards/chosen": -1.620141625404358, + "rewards/margins": 3.6788086891174316, + "rewards/rejected": -5.298144340515137, + "step": 7540 + }, + { + "epoch": 1.9035389027195664, + "grad_norm": 66.42214965820312, + "learning_rate": 1.7727969409359922e-07, + "logits/chosen": 
-1.2217285633087158, + "logits/rejected": -1.190954566001892, + "logps/chosen": -283.6937561035156, + "logps/rejected": -308.70001220703125, + "loss": 0.1982, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.5985686779022217, + "rewards/margins": 3.812548875808716, + "rewards/rejected": -5.410351753234863, + "step": 7550 + }, + { + "epoch": 1.9060599376043865, + "grad_norm": 40.26822280883789, + "learning_rate": 1.7657825527586066e-07, + "logits/chosen": -1.2541015148162842, + "logits/rejected": -1.2113158702850342, + "logps/chosen": -307.51873779296875, + "logps/rejected": -322.45623779296875, + "loss": 0.2394, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.6601440906524658, + "rewards/margins": 3.708544969558716, + "rewards/rejected": -5.368359565734863, + "step": 7560 + }, + { + "epoch": 1.9085809724892067, + "grad_norm": 62.55760192871094, + "learning_rate": 1.7587744844405172e-07, + "logits/chosen": -1.2342407703399658, + "logits/rejected": -1.202978491783142, + "logps/chosen": -297.98748779296875, + "logps/rejected": -305.90625, + "loss": 0.2699, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.6728484630584717, + "rewards/margins": 3.495410203933716, + "rewards/rejected": -5.1689453125, + "step": 7570 + }, + { + "epoch": 1.911102007374027, + "grad_norm": 19.37392234802246, + "learning_rate": 1.7517727963044592e-07, + "logits/chosen": -1.1997802257537842, + "logits/rejected": -1.0855712890625, + "logps/chosen": -295.1312561035156, + "logps/rejected": -291.5687561035156, + "loss": 0.2427, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.407446265220642, + "rewards/margins": 3.903393507003784, + "rewards/rejected": -5.311816215515137, + "step": 7580 + }, + { + "epoch": 1.9136230422588474, + "grad_norm": 26.273832321166992, + "learning_rate": 1.7447775486182518e-07, + "logits/chosen": -1.254736304283142, + "logits/rejected": -1.1654876470565796, + "logps/chosen": -306.0062561035156, + 
"logps/rejected": -315.51873779296875, + "loss": 0.1584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.317114233970642, + "rewards/margins": 4.092382907867432, + "rewards/rejected": -5.406933784484863, + "step": 7590 + }, + { + "epoch": 1.9161440771436675, + "grad_norm": 39.29946517944336, + "learning_rate": 1.7377888015942748e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.188256859779358, + "logps/chosen": -293.53125, + "logps/rejected": -300.0874938964844, + "loss": 0.2223, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.52764892578125, + "rewards/margins": 3.745898485183716, + "rewards/rejected": -5.2744140625, + "step": 7600 + }, + { + "epoch": 1.9186651120284877, + "grad_norm": 23.348567962646484, + "learning_rate": 1.7308066153889578e-07, + "logits/chosen": -1.155664086341858, + "logits/rejected": -1.10052490234375, + "logps/chosen": -293.5687561035156, + "logps/rejected": -323.2749938964844, + "loss": 0.2016, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.3254272937774658, + "rewards/margins": 3.885498046875, + "rewards/rejected": -5.208105564117432, + "step": 7610 + }, + { + "epoch": 1.9211861469133078, + "grad_norm": 48.23688888549805, + "learning_rate": 1.7238310501022517e-07, + "logits/chosen": -1.153076171875, + "logits/rejected": -1.1710388660430908, + "logps/chosen": -288.125, + "logps/rejected": -305.9375, + "loss": 0.2616, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.390020728111267, + "rewards/margins": 3.6179442405700684, + "rewards/rejected": -5.009375095367432, + "step": 7620 + }, + { + "epoch": 1.9237071817981282, + "grad_norm": 23.3243465423584, + "learning_rate": 1.71686216577712e-07, + "logits/chosen": -1.1703002452850342, + "logits/rejected": -1.1937377452850342, + "logps/chosen": -293.9125061035156, + "logps/rejected": -311.5718688964844, + "loss": 0.1641, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.2073485851287842, + "rewards/margins": 
3.781494140625, + "rewards/rejected": -4.989501953125, + "step": 7630 + }, + { + "epoch": 1.9262282166829483, + "grad_norm": 70.19459533691406, + "learning_rate": 1.70990002239902e-07, + "logits/chosen": -1.2073974609375, + "logits/rejected": -1.1364624500274658, + "logps/chosen": -323.54376220703125, + "logps/rejected": -310.78125, + "loss": 0.2142, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4185912609100342, + "rewards/margins": 3.931445360183716, + "rewards/rejected": -5.3515625, + "step": 7640 + }, + { + "epoch": 1.9287492515677687, + "grad_norm": 57.75076675415039, + "learning_rate": 1.7029446798953828e-07, + "logits/chosen": -1.165252685546875, + "logits/rejected": -1.179132103919983, + "logps/chosen": -278.4375, + "logps/rejected": -337.29998779296875, + "loss": 0.1795, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.5640045404434204, + "rewards/margins": 4.150390625, + "rewards/rejected": -5.712206840515137, + "step": 7650 + }, + { + "epoch": 1.9312702864525888, + "grad_norm": 26.637720108032227, + "learning_rate": 1.6959961981351025e-07, + "logits/chosen": -1.166986107826233, + "logits/rejected": -1.143103003501892, + "logps/chosen": -304.1812438964844, + "logps/rejected": -309.17498779296875, + "loss": 0.2837, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.929724097251892, + "rewards/margins": 3.954785108566284, + "rewards/rejected": -5.8876953125, + "step": 7660 + }, + { + "epoch": 1.933791321337409, + "grad_norm": 69.45800018310547, + "learning_rate": 1.6890546369280167e-07, + "logits/chosen": -1.156982421875, + "logits/rejected": -1.2170898914337158, + "logps/chosen": -307.39373779296875, + "logps/rejected": -331.65625, + "loss": 0.2257, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.6480376720428467, + "rewards/margins": 4.034033298492432, + "rewards/rejected": -5.680859565734863, + "step": 7670 + }, + { + "epoch": 1.936312356222229, + "grad_norm": 46.982643127441406, + 
"learning_rate": 1.6821200560243963e-07, + "logits/chosen": -1.1876220703125, + "logits/rejected": -1.088720679283142, + "logps/chosen": -288.5, + "logps/rejected": -316.8125, + "loss": 0.2756, + "rewards/accuracies": 0.890625, + "rewards/chosen": -1.824945092201233, + "rewards/margins": 3.7763915061950684, + "rewards/rejected": -5.601171970367432, + "step": 7680 + }, + { + "epoch": 1.9388333911070494, + "grad_norm": 35.12260437011719, + "learning_rate": 1.6751925151144259e-07, + "logits/chosen": -1.2165100574493408, + "logits/rejected": -1.153601050376892, + "logps/chosen": -285.70001220703125, + "logps/rejected": -319.6499938964844, + "loss": 0.1952, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.728387475013733, + "rewards/margins": 3.882763624191284, + "rewards/rejected": -5.612206935882568, + "step": 7690 + }, + { + "epoch": 1.9413544259918698, + "grad_norm": 35.687042236328125, + "learning_rate": 1.6682720738276918e-07, + "logits/chosen": -1.19384765625, + "logits/rejected": -1.167883276939392, + "logps/chosen": -314.6875, + "logps/rejected": -322.61248779296875, + "loss": 0.2299, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.49139404296875, + "rewards/margins": 3.896240234375, + "rewards/rejected": -5.388281345367432, + "step": 7700 + }, + { + "epoch": 1.94387546087669, + "grad_norm": 13.64608097076416, + "learning_rate": 1.6613587917326738e-07, + "logits/chosen": -1.192773461341858, + "logits/rejected": -1.1346924304962158, + "logps/chosen": -296.86248779296875, + "logps/rejected": -321.3999938964844, + "loss": 0.1853, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.4157593250274658, + "rewards/margins": 4.344433784484863, + "rewards/rejected": -5.75927734375, + "step": 7710 + }, + { + "epoch": 1.94639649576151, + "grad_norm": 64.79629516601562, + "learning_rate": 1.6544527283362237e-07, + "logits/chosen": -1.1622436046600342, + "logits/rejected": -1.1357421875, + "logps/chosen": -287.6499938964844, + 
"logps/rejected": -302.5375061035156, + "loss": 0.1799, + "rewards/accuracies": 0.9281250238418579, + "rewards/chosen": -1.3306884765625, + "rewards/margins": 3.9735350608825684, + "rewards/rejected": -5.307226657867432, + "step": 7720 + }, + { + "epoch": 1.9489175306463302, + "grad_norm": 48.76823425292969, + "learning_rate": 1.6475539430830604e-07, + "logits/chosen": -1.1828491687774658, + "logits/rejected": -1.1461181640625, + "logps/chosen": -302.29998779296875, + "logps/rejected": -328.8374938964844, + "loss": 0.27, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.4570099115371704, + "rewards/margins": 3.7859864234924316, + "rewards/rejected": -5.241601467132568, + "step": 7730 + }, + { + "epoch": 1.9514385655311506, + "grad_norm": 47.993526458740234, + "learning_rate": 1.640662495355253e-07, + "logits/chosen": -1.2107055187225342, + "logits/rejected": -1.1277954578399658, + "logps/chosen": -314.9375, + "logps/rejected": -307.20001220703125, + "loss": 0.2817, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.59698486328125, + "rewards/margins": 3.7835450172424316, + "rewards/rejected": -5.379101753234863, + "step": 7740 + }, + { + "epoch": 1.9539596004159707, + "grad_norm": 64.34617614746094, + "learning_rate": 1.6337784444717142e-07, + "logits/chosen": -1.1866943836212158, + "logits/rejected": -1.1796753406524658, + "logps/chosen": -275.7875061035156, + "logps/rejected": -296.9750061035156, + "loss": 0.2628, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.360137939453125, + "rewards/margins": 3.6226563453674316, + "rewards/rejected": -4.983300685882568, + "step": 7750 + }, + { + "epoch": 1.956480635300791, + "grad_norm": 52.332664489746094, + "learning_rate": 1.626901849687687e-07, + "logits/chosen": -1.182946801185608, + "logits/rejected": -1.1652953624725342, + "logps/chosen": -289.2093811035156, + "logps/rejected": -308.66876220703125, + "loss": 0.1713, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 
-1.5947754383087158, + "rewards/margins": 4.042382717132568, + "rewards/rejected": -5.638867378234863, + "step": 7760 + }, + { + "epoch": 1.9590016701856112, + "grad_norm": 52.05509567260742, + "learning_rate": 1.6200327701942328e-07, + "logits/chosen": -1.251123070716858, + "logits/rejected": -1.162377953529358, + "logps/chosen": -316.75, + "logps/rejected": -322.75, + "loss": 0.2451, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.1040756702423096, + "rewards/margins": 3.8963379859924316, + "rewards/rejected": -4.997754096984863, + "step": 7770 + }, + { + "epoch": 1.9615227050704314, + "grad_norm": 22.568510055541992, + "learning_rate": 1.6131712651177288e-07, + "logits/chosen": -1.1433242559432983, + "logits/rejected": -1.0986816883087158, + "logps/chosen": -274.8999938964844, + "logps/rejected": -298.79998779296875, + "loss": 0.1936, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.565435767173767, + "rewards/margins": 3.958056688308716, + "rewards/rejected": -5.525195121765137, + "step": 7780 + }, + { + "epoch": 1.9640437399552515, + "grad_norm": 37.83517837524414, + "learning_rate": 1.6063173935193503e-07, + "logits/chosen": -1.076226830482483, + "logits/rejected": -1.0985596179962158, + "logps/chosen": -274.79376220703125, + "logps/rejected": -309.38751220703125, + "loss": 0.2418, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -1.1774413585662842, + "rewards/margins": 4.007861137390137, + "rewards/rejected": -5.183667182922363, + "step": 7790 + }, + { + "epoch": 1.9665647748400719, + "grad_norm": 33.46553039550781, + "learning_rate": 1.5994712143945693e-07, + "logits/chosen": -1.147851586341858, + "logits/rejected": -1.068884253501892, + "logps/chosen": -294.14373779296875, + "logps/rejected": -310.8999938964844, + "loss": 0.2499, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.3985474109649658, + "rewards/margins": 3.975830078125, + "rewards/rejected": -5.372656345367432, + "step": 7800 + }, + { + 
"epoch": 1.9690858097248922, + "grad_norm": 63.27622985839844, + "learning_rate": 1.592632786672642e-07, + "logits/chosen": -1.171142578125, + "logits/rejected": -1.1185424327850342, + "logps/chosen": -312.65625, + "logps/rejected": -321.3500061035156, + "loss": 0.2516, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.239068627357483, + "rewards/margins": 3.9042725563049316, + "rewards/rejected": -5.14501953125, + "step": 7810 + }, + { + "epoch": 1.9716068446097124, + "grad_norm": 53.166255950927734, + "learning_rate": 1.5858021692161054e-07, + "logits/chosen": -1.1641693115234375, + "logits/rejected": -1.1528809070587158, + "logps/chosen": -301.13751220703125, + "logps/rejected": -289.9312438964844, + "loss": 0.2474, + "rewards/accuracies": 0.8968750238418579, + "rewards/chosen": -1.0548827648162842, + "rewards/margins": 3.8763670921325684, + "rewards/rejected": -4.929931640625, + "step": 7820 + }, + { + "epoch": 1.9741278794945325, + "grad_norm": 33.32133102416992, + "learning_rate": 1.578979420820268e-07, + "logits/chosen": -1.178247094154358, + "logits/rejected": -1.154272437095642, + "logps/chosen": -291.5, + "logps/rejected": -300.875, + "loss": 0.217, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.4634521007537842, + "rewards/margins": 3.802197217941284, + "rewards/rejected": -5.264062404632568, + "step": 7830 + }, + { + "epoch": 1.9766489143793526, + "grad_norm": 32.27366638183594, + "learning_rate": 1.572164600212703e-07, + "logits/chosen": -1.191552758216858, + "logits/rejected": -1.1298401355743408, + "logps/chosen": -306.3812561035156, + "logps/rejected": -330.3374938964844, + "loss": 0.1884, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.5618469715118408, + "rewards/margins": 3.993359327316284, + "rewards/rejected": -5.555468559265137, + "step": 7840 + }, + { + "epoch": 1.9791699492641728, + "grad_norm": 66.00794982910156, + "learning_rate": 1.5653577660527474e-07, + "logits/chosen": -1.1798095703125, + 
"logits/rejected": -1.1678345203399658, + "logps/chosen": -314.6000061035156, + "logps/rejected": -314.54376220703125, + "loss": 0.2634, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.4441254138946533, + "rewards/margins": 3.921435594558716, + "rewards/rejected": -5.366015434265137, + "step": 7850 + }, + { + "epoch": 1.9816909841489931, + "grad_norm": 40.01249694824219, + "learning_rate": 1.5585589769309904e-07, + "logits/chosen": -1.217675805091858, + "logits/rejected": -1.2192871570587158, + "logps/chosen": -326.34375, + "logps/rejected": -334.41876220703125, + "loss": 0.2045, + "rewards/accuracies": 0.9156249761581421, + "rewards/chosen": -1.3330872058868408, + "rewards/margins": 4.169384956359863, + "rewards/rejected": -5.503710746765137, + "step": 7860 + }, + { + "epoch": 1.9842120190338135, + "grad_norm": 14.252655982971191, + "learning_rate": 1.5517682913687764e-07, + "logits/chosen": -1.1039550304412842, + "logits/rejected": -1.070104956626892, + "logps/chosen": -303.0062561035156, + "logps/rejected": -313.33123779296875, + "loss": 0.2002, + "rewards/accuracies": 0.921875, + "rewards/chosen": -1.8024413585662842, + "rewards/margins": 3.9036622047424316, + "rewards/rejected": -5.705078125, + "step": 7870 + }, + { + "epoch": 1.9867330539186336, + "grad_norm": 68.55850982666016, + "learning_rate": 1.544985767817693e-07, + "logits/chosen": -1.151281714439392, + "logits/rejected": -1.1851074695587158, + "logps/chosen": -304.3187561035156, + "logps/rejected": -309.3812561035156, + "loss": 0.2472, + "rewards/accuracies": 0.878125011920929, + "rewards/chosen": -1.7944762706756592, + "rewards/margins": 3.6312499046325684, + "rewards/rejected": -5.427148342132568, + "step": 7880 + }, + { + "epoch": 1.9892540888034538, + "grad_norm": 79.71931457519531, + "learning_rate": 1.5382114646590776e-07, + "logits/chosen": NaN, + "logits/rejected": -1.160913109779358, + "logps/chosen": -327.76251220703125, + "logps/rejected": -325.1875, + "loss": 0.2788, + 
"rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6047241687774658, + "rewards/margins": 3.879687547683716, + "rewards/rejected": -5.482714653015137, + "step": 7890 + }, + { + "epoch": 1.991775123688274, + "grad_norm": 86.6191177368164, + "learning_rate": 1.5314454402035055e-07, + "logits/chosen": -1.221826195716858, + "logits/rejected": -1.097039818763733, + "logps/chosen": -306.8687438964844, + "logps/rejected": -310.0687561035156, + "loss": 0.283, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.570031762123108, + "rewards/margins": 4.181323051452637, + "rewards/rejected": -5.751074314117432, + "step": 7900 + }, + { + "epoch": 1.9942961585730943, + "grad_norm": 54.50166702270508, + "learning_rate": 1.5246877526902925e-07, + "logits/chosen": -1.2642333507537842, + "logits/rejected": -1.164770483970642, + "logps/chosen": -293.1156311035156, + "logps/rejected": -290.8999938964844, + "loss": 0.2078, + "rewards/accuracies": 0.903124988079071, + "rewards/chosen": -0.9391708374023438, + "rewards/margins": 3.8685059547424316, + "rewards/rejected": -4.808203220367432, + "step": 7910 + }, + { + "epoch": 1.9968171934579144, + "grad_norm": 36.27590560913086, + "learning_rate": 1.5179384602869963e-07, + "logits/chosen": -1.250512719154358, + "logits/rejected": -1.2113158702850342, + "logps/chosen": -324.53125, + "logps/rejected": -305.98126220703125, + "loss": 0.1916, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.6489197015762329, + "rewards/margins": 3.955273389816284, + "rewards/rejected": -4.604784965515137, + "step": 7920 + }, + { + "epoch": 1.9993382283427348, + "grad_norm": 58.246299743652344, + "learning_rate": 1.5111976210889093e-07, + "logits/chosen": -1.2183349132537842, + "logits/rejected": NaN, + "logps/chosen": -306.46875, + "logps/rejected": -326.53125, + "loss": 0.2107, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.170811414718628, + "rewards/margins": 3.90380859375, + "rewards/rejected": 
-5.075293064117432, + "step": 7930 + }, + { + "epoch": 2.0020168279078563, + "grad_norm": 10.656667709350586, + "learning_rate": 1.5044652931185647e-07, + "logits/chosen": NaN, + "logits/rejected": -1.1141183376312256, + "logps/chosen": -300.76190185546875, + "logps/rejected": -312.1428527832031, + "loss": 0.1289, + "rewards/accuracies": 0.9553571343421936, + "rewards/chosen": -0.8152959942817688, + "rewards/margins": 4.322544574737549, + "rewards/rejected": -5.13876485824585, + "step": 7940 + }, + { + "epoch": 2.0045378627926764, + "grad_norm": 6.418241024017334, + "learning_rate": 1.4977415343252313e-07, + "logits/chosen": -1.1527984142303467, + "logits/rejected": -1.1155884265899658, + "logps/chosen": -291.15625, + "logps/rejected": -302.0249938964844, + "loss": 0.0487, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.9614166021347046, + "rewards/margins": 4.958788871765137, + "rewards/rejected": -5.920117378234863, + "step": 7950 + }, + { + "epoch": 2.0070588976774966, + "grad_norm": 18.13500213623047, + "learning_rate": 1.4910264025844217e-07, + "logits/chosen": -1.182641625404358, + "logits/rejected": -1.108789086341858, + "logps/chosen": -284.61248779296875, + "logps/rejected": -325.0, + "loss": 0.0575, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -1.067224144935608, + "rewards/margins": 4.996484279632568, + "rewards/rejected": -6.062695503234863, + "step": 7960 + }, + { + "epoch": 2.0095799325623167, + "grad_norm": 9.839264869689941, + "learning_rate": 1.4843199556973868e-07, + "logits/chosen": -1.1890137195587158, + "logits/rejected": -1.1378052234649658, + "logps/chosen": -328.7875061035156, + "logps/rejected": -331.54998779296875, + "loss": 0.0505, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9071899652481079, + "rewards/margins": 5.5849609375, + "rewards/rejected": -6.490624904632568, + "step": 7970 + }, + { + "epoch": 2.012100967447137, + "grad_norm": 8.16525936126709, + "learning_rate": 
1.4776222513906216e-07, + "logits/chosen": -1.2042968273162842, + "logits/rejected": -1.1714050769805908, + "logps/chosen": -302.328125, + "logps/rejected": -322.51251220703125, + "loss": 0.0756, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -1.3151428699493408, + "rewards/margins": 5.09765625, + "rewards/rejected": -6.412109375, + "step": 7980 + }, + { + "epoch": 2.0146220023319574, + "grad_norm": 9.761605262756348, + "learning_rate": 1.4709333473153717e-07, + "logits/chosen": -1.192602515220642, + "logits/rejected": -1.120324730873108, + "logps/chosen": -299.92498779296875, + "logps/rejected": -337.5249938964844, + "loss": 0.0611, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -1.3048675060272217, + "rewards/margins": 5.318945407867432, + "rewards/rejected": -6.623339653015137, + "step": 7990 + }, + { + "epoch": 2.0171430372167776, + "grad_norm": 5.7592291831970215, + "learning_rate": 1.4642533010471304e-07, + "logits/chosen": -1.21588134765625, + "logits/rejected": -1.2119872570037842, + "logps/chosen": -312.75, + "logps/rejected": -350.7250061035156, + "loss": 0.0502, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2261474132537842, + "rewards/margins": 5.638671875, + "rewards/rejected": -6.861718654632568, + "step": 8000 + }, + { + "epoch": 2.0196640721015977, + "grad_norm": 12.713602066040039, + "learning_rate": 1.4575821700851485e-07, + "logits/chosen": -1.238745093345642, + "logits/rejected": -1.130151391029358, + "logps/chosen": -294.09375, + "logps/rejected": -326.15625, + "loss": 0.0482, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.1590087413787842, + "rewards/margins": 5.390429496765137, + "rewards/rejected": -6.548632621765137, + "step": 8010 + }, + { + "epoch": 2.022185106986418, + "grad_norm": 21.444751739501953, + "learning_rate": 1.4509200118519347e-07, + "logits/chosen": -1.2036864757537842, + "logits/rejected": -1.1689269542694092, + "logps/chosen": -325.88751220703125, + 
"logps/rejected": -332.45623779296875, + "loss": 0.0487, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.475341796875, + "rewards/margins": 5.63671875, + "rewards/rejected": -7.108202934265137, + "step": 8020 + }, + { + "epoch": 2.024706141871238, + "grad_norm": 10.78364372253418, + "learning_rate": 1.444266883692768e-07, + "logits/chosen": -1.2017090320587158, + "logits/rejected": -1.1442413330078125, + "logps/chosen": -311.42498779296875, + "logps/rejected": -320.73126220703125, + "loss": 0.0449, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7278320789337158, + "rewards/margins": 5.237890720367432, + "rewards/rejected": -6.96484375, + "step": 8030 + }, + { + "epoch": 2.0272271767560586, + "grad_norm": 19.230571746826172, + "learning_rate": 1.4376228428751963e-07, + "logits/chosen": -1.237158179283142, + "logits/rejected": -1.1794312000274658, + "logps/chosen": -297.51873779296875, + "logps/rejected": -375.57501220703125, + "loss": 0.0355, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -1.4023864269256592, + "rewards/margins": 6.161718845367432, + "rewards/rejected": -7.562890529632568, + "step": 8040 + }, + { + "epoch": 2.0297482116408787, + "grad_norm": 11.802494049072266, + "learning_rate": 1.4309879465885478e-07, + "logits/chosen": -1.171606421470642, + "logits/rejected": -1.0933105945587158, + "logps/chosen": -288.84375, + "logps/rejected": -332.0625, + "loss": 0.0446, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.374725341796875, + "rewards/margins": 5.962695121765137, + "rewards/rejected": -7.337304592132568, + "step": 8050 + }, + { + "epoch": 2.032269246525699, + "grad_norm": 27.448190689086914, + "learning_rate": 1.4243622519434407e-07, + "logits/chosen": -1.0920288562774658, + "logits/rejected": -0.997802734375, + "logps/chosen": -264.1312561035156, + "logps/rejected": -327.0, + "loss": 0.0643, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.884466528892517, + 
"rewards/margins": 5.689257621765137, + "rewards/rejected": -7.575390815734863, + "step": 8060 + }, + { + "epoch": 2.034790281410519, + "grad_norm": 5.394522190093994, + "learning_rate": 1.4177458159712863e-07, + "logits/chosen": -1.201196312904358, + "logits/rejected": -1.0519500970840454, + "logps/chosen": -308.6625061035156, + "logps/rejected": -336.5874938964844, + "loss": 0.0501, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -1.566705346107483, + "rewards/margins": 6.06640625, + "rewards/rejected": -7.633593559265137, + "step": 8070 + }, + { + "epoch": 2.037311316295339, + "grad_norm": 8.081001281738281, + "learning_rate": 1.411138695623802e-07, + "logits/chosen": -1.247949242591858, + "logits/rejected": -1.1958434581756592, + "logps/chosen": -293.25, + "logps/rejected": -314.3125, + "loss": 0.0424, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.3922271728515625, + "rewards/margins": 5.664843559265137, + "rewards/rejected": -7.056836128234863, + "step": 8080 + }, + { + "epoch": 2.0398323511801593, + "grad_norm": 13.59231948852539, + "learning_rate": 1.4045409477725185e-07, + "logits/chosen": -1.230902075767517, + "logits/rejected": -1.155676245689392, + "logps/chosen": -293.3187561035156, + "logps/rejected": -328.29376220703125, + "loss": 0.0597, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -1.3070068359375, + "rewards/margins": 5.885546684265137, + "rewards/rejected": -7.192773342132568, + "step": 8090 + }, + { + "epoch": 2.04235338606498, + "grad_norm": 13.345389366149902, + "learning_rate": 1.3979526292082938e-07, + "logits/chosen": -1.145593285560608, + "logits/rejected": -1.1276366710662842, + "logps/chosen": -320.34375, + "logps/rejected": -358.61248779296875, + "loss": 0.0536, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7216370105743408, + "rewards/margins": 5.676660060882568, + "rewards/rejected": -7.401953220367432, + "step": 8100 + }, + { + "epoch": 2.0448744209498, + "grad_norm": 
11.767427444458008, + "learning_rate": 1.391373796640822e-07, + "logits/chosen": -1.1553466320037842, + "logits/rejected": -1.1388061046600342, + "logps/chosen": -288.45001220703125, + "logps/rejected": -320.2875061035156, + "loss": 0.0539, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.22845458984375, + "rewards/margins": 5.697070121765137, + "rewards/rejected": -7.925000190734863, + "step": 8110 + }, + { + "epoch": 2.04739545583462, + "grad_norm": 29.374244689941406, + "learning_rate": 1.3848045066981433e-07, + "logits/chosen": -1.238500952720642, + "logits/rejected": -1.1590087413787842, + "logps/chosen": -300.5687561035156, + "logps/rejected": -326.875, + "loss": 0.0465, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.066088914871216, + "rewards/margins": 5.727734565734863, + "rewards/rejected": -7.7958984375, + "step": 8120 + }, + { + "epoch": 2.0499164907194403, + "grad_norm": 20.105798721313477, + "learning_rate": 1.3782448159261617e-07, + "logits/chosen": -1.1344482898712158, + "logits/rejected": NaN, + "logps/chosen": -291.51251220703125, + "logps/rejected": -338.57501220703125, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4490723609924316, + "rewards/margins": 5.634961128234863, + "rewards/rejected": -8.087109565734863, + "step": 8130 + }, + { + "epoch": 2.0524375256042604, + "grad_norm": 8.567663192749023, + "learning_rate": 1.3716947807881524e-07, + "logits/chosen": -1.209387183189392, + "logits/rejected": -1.16644287109375, + "logps/chosen": -311.2875061035156, + "logps/rejected": -332.91876220703125, + "loss": 0.0613, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -1.869696021080017, + "rewards/margins": 5.961523532867432, + "rewards/rejected": -7.832421779632568, + "step": 8140 + }, + { + "epoch": 2.054958560489081, + "grad_norm": 15.12371826171875, + "learning_rate": 1.3651544576642808e-07, + "logits/chosen": -1.185278296470642, + "logits/rejected": 
-1.1482665538787842, + "logps/chosen": -278.17498779296875, + "logps/rejected": -326.45001220703125, + "loss": 0.0555, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.1180450916290283, + "rewards/margins": 6.062109470367432, + "rewards/rejected": -8.181055068969727, + "step": 8150 + }, + { + "epoch": 2.057479595373901, + "grad_norm": 27.506376266479492, + "learning_rate": 1.358623902851112e-07, + "logits/chosen": -1.194268822669983, + "logits/rejected": -1.099145531654358, + "logps/chosen": -290.54376220703125, + "logps/rejected": -349.3374938964844, + "loss": 0.0596, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.6643279790878296, + "rewards/margins": 5.855859279632568, + "rewards/rejected": -7.521484375, + "step": 8160 + }, + { + "epoch": 2.0600006302587213, + "grad_norm": 5.201971054077148, + "learning_rate": 1.3521031725611342e-07, + "logits/chosen": -1.2114379405975342, + "logits/rejected": -1.170434594154358, + "logps/chosen": -308.4125061035156, + "logps/rejected": -331.4937438964844, + "loss": 0.0449, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.5582275390625, + "rewards/margins": 5.56640625, + "rewards/rejected": -7.1220703125, + "step": 8170 + }, + { + "epoch": 2.0625216651435414, + "grad_norm": 11.198143005371094, + "learning_rate": 1.345592322922266e-07, + "logits/chosen": -1.27203369140625, + "logits/rejected": -1.1819946765899658, + "logps/chosen": -308.375, + "logps/rejected": -337.82501220703125, + "loss": 0.048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.663183569908142, + "rewards/margins": 5.663866996765137, + "rewards/rejected": -7.327538967132568, + "step": 8180 + }, + { + "epoch": 2.0650427000283615, + "grad_norm": 12.780969619750977, + "learning_rate": 1.3390914099773773e-07, + "logits/chosen": NaN, + "logits/rejected": -1.1007812023162842, + "logps/chosen": -309.8374938964844, + "logps/rejected": -351.0625, + "loss": 0.0436, + "rewards/accuracies": 0.9937499761581421, + 
"rewards/chosen": -1.851709008216858, + "rewards/margins": 5.76953125, + "rewards/rejected": -7.619921684265137, + "step": 8190 + }, + { + "epoch": 2.0675637349131817, + "grad_norm": 13.616006851196289, + "learning_rate": 1.3326004896838096e-07, + "logits/chosen": NaN, + "logits/rejected": -1.057580590248108, + "logps/chosen": -286.6187438964844, + "logps/rejected": -313.61248779296875, + "loss": 0.0424, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.9296753406524658, + "rewards/margins": 5.961133003234863, + "rewards/rejected": -7.889843940734863, + "step": 8200 + }, + { + "epoch": 2.0700847697980023, + "grad_norm": 28.805870056152344, + "learning_rate": 1.3261196179128885e-07, + "logits/chosen": -1.1484496593475342, + "logits/rejected": -1.0574462413787842, + "logps/chosen": -311.23748779296875, + "logps/rejected": -352.04998779296875, + "loss": 0.0652, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.540820360183716, + "rewards/margins": 5.760937690734863, + "rewards/rejected": -8.300585746765137, + "step": 8210 + }, + { + "epoch": 2.0726058046828224, + "grad_norm": 48.506614685058594, + "learning_rate": 1.3196488504494477e-07, + "logits/chosen": -1.084375023841858, + "logits/rejected": -1.068701148033142, + "logps/chosen": -315.0375061035156, + "logps/rejected": -340.38751220703125, + "loss": 0.0941, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -2.3023314476013184, + "rewards/margins": 6.016797065734863, + "rewards/rejected": -8.318163871765137, + "step": 8220 + }, + { + "epoch": 2.0751268395676425, + "grad_norm": 7.5023956298828125, + "learning_rate": 1.3131882429913449e-07, + "logits/chosen": -1.206689476966858, + "logits/rejected": -1.1428344249725342, + "logps/chosen": -319.7875061035156, + "logps/rejected": -331.70001220703125, + "loss": 0.0573, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -1.846343994140625, + "rewards/margins": 5.7177734375, + "rewards/rejected": -7.5654296875, + "step": 8230 
+ }, + { + "epoch": 2.0776478744524627, + "grad_norm": 11.308616638183594, + "learning_rate": 1.3067378511489865e-07, + "logits/chosen": -1.1685912609100342, + "logits/rejected": -1.1099151372909546, + "logps/chosen": -276.96875, + "logps/rejected": -319.5562438964844, + "loss": 0.0568, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.963647484779358, + "rewards/margins": 5.4619140625, + "rewards/rejected": -7.428124904632568, + "step": 8240 + }, + { + "epoch": 2.080168909337283, + "grad_norm": 25.78227424621582, + "learning_rate": 1.3002977304448477e-07, + "logits/chosen": -1.1837341785430908, + "logits/rejected": -1.12335205078125, + "logps/chosen": -296.671875, + "logps/rejected": -338.3999938964844, + "loss": 0.0544, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.9570465087890625, + "rewards/margins": 5.804491996765137, + "rewards/rejected": -7.7626953125, + "step": 8250 + }, + { + "epoch": 2.082689944222103, + "grad_norm": 8.633604049682617, + "learning_rate": 1.2938679363129896e-07, + "logits/chosen": -1.0977294445037842, + "logits/rejected": -1.002557396888733, + "logps/chosen": -279.4750061035156, + "logps/rejected": -314.23748779296875, + "loss": 0.049, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.847680687904358, + "rewards/margins": 5.765234470367432, + "rewards/rejected": -7.612109184265137, + "step": 8260 + }, + { + "epoch": 2.0852109791069235, + "grad_norm": 17.364688873291016, + "learning_rate": 1.287448524098591e-07, + "logits/chosen": -1.2186279296875, + "logits/rejected": -1.14141845703125, + "logps/chosen": -299.4437561035156, + "logps/rejected": -332.3812561035156, + "loss": 0.093, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.0953369140625, + "rewards/margins": 5.600976467132568, + "rewards/rejected": -7.696484565734863, + "step": 8270 + }, + { + "epoch": 2.0877320139917437, + "grad_norm": 13.196175575256348, + "learning_rate": 1.2810395490574637e-07, + "logits/chosen": 
-1.1842772960662842, + "logits/rejected": -1.0907989740371704, + "logps/chosen": -306.3687438964844, + "logps/rejected": -344.0874938964844, + "loss": 0.0624, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -1.4218124151229858, + "rewards/margins": 6.040234565734863, + "rewards/rejected": -7.460741996765137, + "step": 8280 + }, + { + "epoch": 2.090253048876564, + "grad_norm": 15.400144577026367, + "learning_rate": 1.2746410663555817e-07, + "logits/chosen": -1.2322509288787842, + "logits/rejected": -1.1100952625274658, + "logps/chosen": -301.8062438964844, + "logps/rejected": -318.73126220703125, + "loss": 0.045, + "rewards/accuracies": 0.984375, + "rewards/chosen": -1.8669312000274658, + "rewards/margins": 5.905077934265137, + "rewards/rejected": -7.7734375, + "step": 8290 + }, + { + "epoch": 2.092774083761384, + "grad_norm": 23.49390983581543, + "learning_rate": 1.268253131068604e-07, + "logits/chosen": -1.2156860828399658, + "logits/rejected": -1.058203101158142, + "logps/chosen": -288.01251220703125, + "logps/rejected": -310.04998779296875, + "loss": 0.0711, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -1.6891295909881592, + "rewards/margins": 6.189257621765137, + "rewards/rejected": -7.877148628234863, + "step": 8300 + }, + { + "epoch": 2.095295118646204, + "grad_norm": 8.779989242553711, + "learning_rate": 1.261875798181404e-07, + "logits/chosen": -1.279028296470642, + "logits/rejected": -1.2348754405975342, + "logps/chosen": -328.38751220703125, + "logps/rejected": -337.6625061035156, + "loss": 0.0579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.896887183189392, + "rewards/margins": 5.675585746765137, + "rewards/rejected": -7.574023246765137, + "step": 8310 + }, + { + "epoch": 2.0978161535310247, + "grad_norm": 17.310787200927734, + "learning_rate": 1.2555091225875912e-07, + "logits/chosen": -1.1862914562225342, + "logits/rejected": -1.0946991443634033, + "logps/chosen": -306.98126220703125, + 
"logps/rejected": -322.125, + "loss": 0.0748, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.109936475753784, + "rewards/margins": 5.740185737609863, + "rewards/rejected": -7.849804878234863, + "step": 8320 + }, + { + "epoch": 2.100337188415845, + "grad_norm": 4.619143009185791, + "learning_rate": 1.2491531590890413e-07, + "logits/chosen": -1.2049438953399658, + "logits/rejected": -1.130090355873108, + "logps/chosen": -333.53125, + "logps/rejected": -344.13751220703125, + "loss": 0.0397, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1664185523986816, + "rewards/margins": 6.011914253234863, + "rewards/rejected": -8.179491996765137, + "step": 8330 + }, + { + "epoch": 2.102858223300665, + "grad_norm": 29.662790298461914, + "learning_rate": 1.2428079623954274e-07, + "logits/chosen": -1.118188500404358, + "logits/rejected": -1.0854980945587158, + "logps/chosen": -296.1187438964844, + "logps/rejected": -332.85626220703125, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3039183616638184, + "rewards/margins": 5.928320407867432, + "rewards/rejected": -8.230273246765137, + "step": 8340 + }, + { + "epoch": 2.105379258185485, + "grad_norm": 37.8450813293457, + "learning_rate": 1.236473587123743e-07, + "logits/chosen": -1.144342064857483, + "logits/rejected": -1.068994164466858, + "logps/chosen": -281.9375, + "logps/rejected": -317.2437438964844, + "loss": 0.0764, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.2720947265625, + "rewards/margins": 5.494360446929932, + "rewards/rejected": -7.770312309265137, + "step": 8350 + }, + { + "epoch": 2.1079002930703052, + "grad_norm": 9.65998649597168, + "learning_rate": 1.2301500877978353e-07, + "logits/chosen": -1.212011694908142, + "logits/rejected": -1.070886254310608, + "logps/chosen": -305.9437561035156, + "logps/rejected": -311.41876220703125, + "loss": 0.0541, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": 
-1.6458313465118408, + "rewards/margins": 5.796875, + "rewards/rejected": -7.442773342132568, + "step": 8360 + }, + { + "epoch": 2.1104213279551254, + "grad_norm": 25.525508880615234, + "learning_rate": 1.2238375188479374e-07, + "logits/chosen": -1.160986304283142, + "logits/rejected": -1.1099731922149658, + "logps/chosen": -290.9624938964844, + "logps/rejected": -352.48126220703125, + "loss": 0.0488, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.065624952316284, + "rewards/margins": 6.021288871765137, + "rewards/rejected": -8.088086128234863, + "step": 8370 + }, + { + "epoch": 2.112942362839946, + "grad_norm": 22.408308029174805, + "learning_rate": 1.217535934610196e-07, + "logits/chosen": -1.1594116687774658, + "logits/rejected": -1.130712866783142, + "logps/chosen": -333.45623779296875, + "logps/rejected": -357.7875061035156, + "loss": 0.0449, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.0124573707580566, + "rewards/margins": 6.266211032867432, + "rewards/rejected": -8.2763671875, + "step": 8380 + }, + { + "epoch": 2.115463397724766, + "grad_norm": 26.604774475097656, + "learning_rate": 1.2112453893262077e-07, + "logits/chosen": -1.25897216796875, + "logits/rejected": -1.1997497081756592, + "logps/chosen": -333.39373779296875, + "logps/rejected": -361.04376220703125, + "loss": 0.0576, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.855340600013733, + "rewards/margins": 5.884961128234863, + "rewards/rejected": -7.741796970367432, + "step": 8390 + }, + { + "epoch": 2.1179844326095862, + "grad_norm": 23.80531883239746, + "learning_rate": 1.204965937142548e-07, + "logits/chosen": NaN, + "logits/rejected": -1.179907202720642, + "logps/chosen": -315.93438720703125, + "logps/rejected": -343.5062561035156, + "loss": 0.0513, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.9385802745819092, + "rewards/margins": 6.142968654632568, + "rewards/rejected": -8.082616806030273, + "step": 8400 + }, + { + 
"epoch": 2.1205054674944064, + "grad_norm": 9.067641258239746, + "learning_rate": 1.1986976321103073e-07, + "logits/chosen": -1.172882080078125, + "logits/rejected": -1.0681641101837158, + "logps/chosen": -302.9906311035156, + "logps/rejected": -322.8125, + "loss": 0.051, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.067333936691284, + "rewards/margins": 6.111718654632568, + "rewards/rejected": -8.177148818969727, + "step": 8410 + }, + { + "epoch": 2.1230265023792265, + "grad_norm": 21.33924674987793, + "learning_rate": 1.1924405281846285e-07, + "logits/chosen": NaN, + "logits/rejected": -1.105712890625, + "logps/chosen": -307.8999938964844, + "logps/rejected": -346.7437438964844, + "loss": 0.0619, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.160839796066284, + "rewards/margins": 6.102734565734863, + "rewards/rejected": -8.26214599609375, + "step": 8420 + }, + { + "epoch": 2.1255475372640467, + "grad_norm": 7.438653469085693, + "learning_rate": 1.1861946792242372e-07, + "logits/chosen": -1.1189727783203125, + "logits/rejected": -1.0370910167694092, + "logps/chosen": -291.64373779296875, + "logps/rejected": -342.0625, + "loss": 0.0616, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.2601685523986816, + "rewards/margins": 5.860742092132568, + "rewards/rejected": -8.1201171875, + "step": 8430 + }, + { + "epoch": 2.1280685721488672, + "grad_norm": 24.812488555908203, + "learning_rate": 1.1799601389909795e-07, + "logits/chosen": -1.121026635169983, + "logits/rejected": -1.0014861822128296, + "logps/chosen": -296.25311279296875, + "logps/rejected": -330.8687438964844, + "loss": 0.072, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.4051146507263184, + "rewards/margins": 5.999413967132568, + "rewards/rejected": -8.405077934265137, + "step": 8440 + }, + { + "epoch": 2.1305896070336874, + "grad_norm": 27.550500869750977, + "learning_rate": 1.1737369611493639e-07, + "logits/chosen": -1.1107299327850342, + 
"logits/rejected": -1.031671166419983, + "logps/chosen": -306.2875061035156, + "logps/rejected": -353.4937438964844, + "loss": 0.0526, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.4344239234924316, + "rewards/margins": 6.117480278015137, + "rewards/rejected": -8.553319931030273, + "step": 8450 + }, + { + "epoch": 2.1331106419185075, + "grad_norm": 24.330074310302734, + "learning_rate": 1.1675251992660931e-07, + "logits/chosen": -1.1606566905975342, + "logits/rejected": -1.1696898937225342, + "logps/chosen": -311.9375, + "logps/rejected": -375.23748779296875, + "loss": 0.0565, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.465075731277466, + "rewards/margins": 6.476953029632568, + "rewards/rejected": -8.94140625, + "step": 8460 + }, + { + "epoch": 2.1356316768033277, + "grad_norm": 10.037461280822754, + "learning_rate": 1.161324906809607e-07, + "logits/chosen": -1.108489990234375, + "logits/rejected": -1.1308135986328125, + "logps/chosen": -320.35626220703125, + "logps/rejected": -356.875, + "loss": 0.0472, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.190533399581909, + "rewards/margins": 6.083398342132568, + "rewards/rejected": -8.2724609375, + "step": 8470 + }, + { + "epoch": 2.138152711688148, + "grad_norm": 16.36578369140625, + "learning_rate": 1.155136137149619e-07, + "logits/chosen": -1.19464111328125, + "logits/rejected": -1.132653832435608, + "logps/chosen": -324.11248779296875, + "logps/rejected": -370.5249938964844, + "loss": 0.0717, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.456561326980591, + "rewards/margins": 5.934960842132568, + "rewards/rejected": -8.390233993530273, + "step": 8480 + }, + { + "epoch": 2.1406737465729684, + "grad_norm": 42.44452667236328, + "learning_rate": 1.1489589435566627e-07, + "logits/chosen": -1.1802489757537842, + "logits/rejected": -1.091644287109375, + "logps/chosen": -299.3687438964844, + "logps/rejected": -351.01251220703125, + "loss": 
0.0742, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.403979539871216, + "rewards/margins": 5.965014457702637, + "rewards/rejected": -8.366991996765137, + "step": 8490 + }, + { + "epoch": 2.1431947814577885, + "grad_norm": 4.1698479652404785, + "learning_rate": 1.1427933792016248e-07, + "logits/chosen": -1.2140624523162842, + "logits/rejected": -1.104516625404358, + "logps/chosen": -286.72186279296875, + "logps/rejected": -343.64373779296875, + "loss": 0.0438, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.132031202316284, + "rewards/margins": 5.950585842132568, + "rewards/rejected": -8.080469131469727, + "step": 8500 + }, + { + "epoch": 2.1457158163426087, + "grad_norm": 20.45341682434082, + "learning_rate": 1.1366394971552962e-07, + "logits/chosen": -1.1300170421600342, + "logits/rejected": -1.051171898841858, + "logps/chosen": -322.2875061035156, + "logps/rejected": -381.2749938964844, + "loss": 0.0657, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.202221632003784, + "rewards/margins": 6.493359565734863, + "rewards/rejected": -8.6953125, + "step": 8510 + }, + { + "epoch": 2.148236851227429, + "grad_norm": 6.115811824798584, + "learning_rate": 1.1304973503879076e-07, + "logits/chosen": -1.1869628429412842, + "logits/rejected": -1.080224633216858, + "logps/chosen": -311.23126220703125, + "logps/rejected": -320.11248779296875, + "loss": 0.0452, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.466625928878784, + "rewards/margins": 6.030859470367432, + "rewards/rejected": -8.498046875, + "step": 8520 + }, + { + "epoch": 2.150757886112249, + "grad_norm": 21.175352096557617, + "learning_rate": 1.1243669917686797e-07, + "logits/chosen": -1.100653052330017, + "logits/rejected": -1.1054198741912842, + "logps/chosen": -298.17498779296875, + "logps/rejected": -365.3999938964844, + "loss": 0.0432, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.580127000808716, + "rewards/margins": 6.098046779632568, 
+ "rewards/rejected": -8.6796875, + "step": 8530 + }, + { + "epoch": 2.153278920997069, + "grad_norm": 38.15406036376953, + "learning_rate": 1.1182484740653636e-07, + "logits/chosen": -1.07086181640625, + "logits/rejected": -1.073522925376892, + "logps/chosen": -314.73748779296875, + "logps/rejected": -354.2250061035156, + "loss": 0.0448, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.737866163253784, + "rewards/margins": 6.369531154632568, + "rewards/rejected": -9.104296684265137, + "step": 8540 + }, + { + "epoch": 2.1557999558818897, + "grad_norm": 11.452657699584961, + "learning_rate": 1.1121418499437881e-07, + "logits/chosen": -1.150183081626892, + "logits/rejected": -0.9835265874862671, + "logps/chosen": -328.6875, + "logps/rejected": -351.6499938964844, + "loss": 0.047, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.7571043968200684, + "rewards/margins": 5.888574123382568, + "rewards/rejected": -8.6484375, + "step": 8550 + }, + { + "epoch": 2.15832099076671, + "grad_norm": 9.922450065612793, + "learning_rate": 1.1060471719674092e-07, + "logits/chosen": NaN, + "logits/rejected": -1.154449462890625, + "logps/chosen": -308.09375, + "logps/rejected": -336.16876220703125, + "loss": 0.0526, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.2076447010040283, + "rewards/margins": 6.534960746765137, + "rewards/rejected": -8.737890243530273, + "step": 8560 + }, + { + "epoch": 2.16084202565153, + "grad_norm": 13.323744773864746, + "learning_rate": 1.099964492596852e-07, + "logits/chosen": -1.1337707042694092, + "logits/rejected": -1.10675048828125, + "logps/chosen": -305.2875061035156, + "logps/rejected": -357.8500061035156, + "loss": 0.0541, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.3842225074768066, + "rewards/margins": 6.028515815734863, + "rewards/rejected": -8.414843559265137, + "step": 8570 + }, + { + "epoch": 2.16336306053635, + "grad_norm": 15.773117065429688, + "learning_rate": 
1.0938938641894635e-07, + "logits/chosen": -1.1461181640625, + "logits/rejected": -1.1283690929412842, + "logps/chosen": -294.8500061035156, + "logps/rejected": -343.375, + "loss": 0.0526, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.3901610374450684, + "rewards/margins": 6.066601753234863, + "rewards/rejected": -8.456640243530273, + "step": 8580 + }, + { + "epoch": 2.16588409542117, + "grad_norm": 15.18516731262207, + "learning_rate": 1.087835338998862e-07, + "logits/chosen": -1.091589331626892, + "logits/rejected": -1.0149962902069092, + "logps/chosen": -325.7124938964844, + "logps/rejected": -351.4125061035156, + "loss": 0.0488, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1823487281799316, + "rewards/margins": 6.094336032867432, + "rewards/rejected": -8.279687881469727, + "step": 8590 + }, + { + "epoch": 2.168405130305991, + "grad_norm": 19.302209854125977, + "learning_rate": 1.0817889691744844e-07, + "logits/chosen": -1.1664917469024658, + "logits/rejected": -1.096594214439392, + "logps/chosen": -323.1812438964844, + "logps/rejected": -364.1625061035156, + "loss": 0.0395, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.308727979660034, + "rewards/margins": 6.358984470367432, + "rewards/rejected": -8.664843559265137, + "step": 8600 + }, + { + "epoch": 2.170926165190811, + "grad_norm": 22.567893981933594, + "learning_rate": 1.0757548067611388e-07, + "logits/chosen": -1.056249976158142, + "logits/rejected": -0.7930053472518921, + "logps/chosen": -323.54376220703125, + "logps/rejected": -344.2250061035156, + "loss": 0.0473, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.4972596168518066, + "rewards/margins": 6.3125, + "rewards/rejected": -8.811132431030273, + "step": 8610 + }, + { + "epoch": 2.173447200075631, + "grad_norm": 14.041077613830566, + "learning_rate": 1.0697329036985567e-07, + "logits/chosen": -1.1444275379180908, + "logits/rejected": -1.07598876953125, + "logps/chosen": 
-322.2250061035156, + "logps/rejected": -333.0625, + "loss": 0.0687, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.640765428543091, + "rewards/margins": 6.111132621765137, + "rewards/rejected": -8.752344131469727, + "step": 8620 + }, + { + "epoch": 2.175968234960451, + "grad_norm": 39.61917495727539, + "learning_rate": 1.0637233118209482e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.0608398914337158, + "logps/chosen": -301.45623779296875, + "logps/rejected": -344.54998779296875, + "loss": 0.0694, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.80816650390625, + "rewards/margins": 6.245312690734863, + "rewards/rejected": -9.0537109375, + "step": 8630 + }, + { + "epoch": 2.1784892698452714, + "grad_norm": 30.946664810180664, + "learning_rate": 1.0577260828565492e-07, + "logits/chosen": -1.172705054283142, + "logits/rejected": -1.0229613780975342, + "logps/chosen": -320.51251220703125, + "logps/rejected": -333.6625061035156, + "loss": 0.0478, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.435986280441284, + "rewards/margins": 6.369726657867432, + "rewards/rejected": -8.804296493530273, + "step": 8640 + }, + { + "epoch": 2.1810103047300915, + "grad_norm": 14.99609375, + "learning_rate": 1.0517412684271856e-07, + "logits/chosen": -1.214208960533142, + "logits/rejected": -1.089514136314392, + "logps/chosen": -319.4156188964844, + "logps/rejected": -360.3374938964844, + "loss": 0.0468, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.698712110519409, + "rewards/margins": 6.314257621765137, + "rewards/rejected": -9.019726753234863, + "step": 8650 + }, + { + "epoch": 2.183531339614912, + "grad_norm": 73.42444610595703, + "learning_rate": 1.0457689200478185e-07, + "logits/chosen": -1.110388159751892, + "logits/rejected": -1.174536108970642, + "logps/chosen": -321.03125, + "logps/rejected": -346.79998779296875, + "loss": 0.0627, + "rewards/accuracies": 0.9781249761581421, + 
"rewards/chosen": -2.540576219558716, + "rewards/margins": 6.526757717132568, + "rewards/rejected": -9.067968368530273, + "step": 8660 + }, + { + "epoch": 2.186052374499732, + "grad_norm": 25.020496368408203, + "learning_rate": 1.0398090891261105e-07, + "logits/chosen": -1.173162817955017, + "logits/rejected": -1.091455101966858, + "logps/chosen": -289.6812438964844, + "logps/rejected": -344.6625061035156, + "loss": 0.0694, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8015990257263184, + "rewards/margins": 5.858593940734863, + "rewards/rejected": -8.662500381469727, + "step": 8670 + }, + { + "epoch": 2.1885734093845524, + "grad_norm": 14.680599212646484, + "learning_rate": 1.0338618269619762e-07, + "logits/chosen": -1.21966552734375, + "logits/rejected": -1.0860412120819092, + "logps/chosen": -307.0562438964844, + "logps/rejected": -379.5249938964844, + "loss": 0.0474, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.5776000022888184, + "rewards/margins": 6.258008003234863, + "rewards/rejected": -8.833984375, + "step": 8680 + }, + { + "epoch": 2.1910944442693725, + "grad_norm": 18.51276397705078, + "learning_rate": 1.0279271847471426e-07, + "logits/chosen": -1.2114989757537842, + "logits/rejected": -1.11944580078125, + "logps/chosen": -324.66876220703125, + "logps/rejected": -363.8500061035156, + "loss": 0.0456, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.37677001953125, + "rewards/margins": 6.630468845367432, + "rewards/rejected": -10.005468368530273, + "step": 8690 + }, + { + "epoch": 2.1936154791541926, + "grad_norm": 16.530040740966797, + "learning_rate": 1.0220052135647129e-07, + "logits/chosen": -1.1586456298828125, + "logits/rejected": -1.078637719154358, + "logps/chosen": -328.6187438964844, + "logps/rejected": -369.6875, + "loss": 0.0659, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0513062477111816, + "rewards/margins": 6.638867378234863, + "rewards/rejected": -9.693359375, + 
"step": 8700 + }, + { + "epoch": 2.196136514039013, + "grad_norm": 17.24599266052246, + "learning_rate": 1.0160959643887187e-07, + "logits/chosen": -1.0912902355194092, + "logits/rejected": -1.132360816001892, + "logps/chosen": -302.2562561035156, + "logps/rejected": -347.42498779296875, + "loss": 0.0518, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.469165086746216, + "rewards/margins": 6.382616996765137, + "rewards/rejected": -9.849609375, + "step": 8710 + }, + { + "epoch": 2.1986575489238334, + "grad_norm": 53.602500915527344, + "learning_rate": 1.010199488083687e-07, + "logits/chosen": -1.150793433189392, + "logits/rejected": -1.121130347251892, + "logps/chosen": -326.2250061035156, + "logps/rejected": -357.0625, + "loss": 0.0602, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.182080030441284, + "rewards/margins": 6.177929878234863, + "rewards/rejected": -9.355859756469727, + "step": 8720 + }, + { + "epoch": 2.2011785838086535, + "grad_norm": 8.69321060180664, + "learning_rate": 1.0043158354042027e-07, + "logits/chosen": -1.0912597179412842, + "logits/rejected": -0.937573254108429, + "logps/chosen": -317.20623779296875, + "logps/rejected": -353.625, + "loss": 0.039, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.010894775390625, + "rewards/margins": 6.508398532867432, + "rewards/rejected": -9.523828506469727, + "step": 8730 + }, + { + "epoch": 2.2036996186934736, + "grad_norm": 11.698653221130371, + "learning_rate": 9.984450569944672e-08, + "logits/chosen": -1.1413695812225342, + "logits/rejected": -1.0407836437225342, + "logps/chosen": -299.03125, + "logps/rejected": -330.4624938964844, + "loss": 0.0259, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -2.7392945289611816, + "rewards/margins": 6.866796970367432, + "rewards/rejected": -9.6015625, + "step": 8740 + }, + { + "epoch": 2.2062206535782938, + "grad_norm": 26.454465866088867, + "learning_rate": 9.925872033878662e-08, + 
"logits/chosen": -1.194921851158142, + "logits/rejected": -1.0609207153320312, + "logps/chosen": -284.7124938964844, + "logps/rejected": -334.17498779296875, + "loss": 0.0482, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0052733421325684, + "rewards/margins": 6.383496284484863, + "rewards/rejected": -9.394922256469727, + "step": 8750 + }, + { + "epoch": 2.208741688463114, + "grad_norm": 12.642827033996582, + "learning_rate": 9.867423250065332e-08, + "logits/chosen": -1.101049780845642, + "logits/rejected": -1.042272925376892, + "logps/chosen": -316.96875, + "logps/rejected": -340.95001220703125, + "loss": 0.0461, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.052014112472534, + "rewards/margins": 6.135644435882568, + "rewards/rejected": -9.191015243530273, + "step": 8760 + }, + { + "epoch": 2.2112627233479345, + "grad_norm": 12.33161735534668, + "learning_rate": 9.809104721609182e-08, + "logits/chosen": -1.243432641029358, + "logits/rejected": -1.089147925376892, + "logps/chosen": -296.84375, + "logps/rejected": -350.13751220703125, + "loss": 0.0547, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.00762939453125, + "rewards/margins": 6.277734279632568, + "rewards/rejected": -9.284375190734863, + "step": 8770 + }, + { + "epoch": 2.2137837582327546, + "grad_norm": 17.560251235961914, + "learning_rate": 9.75091695049349e-08, + "logits/chosen": -1.136560082435608, + "logits/rejected": -1.00762939453125, + "logps/chosen": -302.1312561035156, + "logps/rejected": -349.82501220703125, + "loss": 0.0404, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.611377000808716, + "rewards/margins": 6.48828125, + "rewards/rejected": -9.103124618530273, + "step": 8780 + }, + { + "epoch": 2.216304793117575, + "grad_norm": 43.00522232055664, + "learning_rate": 9.692860437576061e-08, + "logits/chosen": -1.1465332508087158, + "logits/rejected": -1.110925316810608, + "logps/chosen": -297.03436279296875, + "logps/rejected": 
-354.36248779296875, + "loss": 0.0768, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.4622559547424316, + "rewards/margins": 6.297656059265137, + "rewards/rejected": -8.760156631469727, + "step": 8790 + }, + { + "epoch": 2.218825828002395, + "grad_norm": 29.944978713989258, + "learning_rate": 9.634935682584846e-08, + "logits/chosen": -1.186193823814392, + "logits/rejected": -1.1214599609375, + "logps/chosen": -312.85626220703125, + "logps/rejected": -360.07501220703125, + "loss": 0.059, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.4377198219299316, + "rewards/margins": 6.615038871765137, + "rewards/rejected": -9.050390243530273, + "step": 8800 + }, + { + "epoch": 2.221346862887215, + "grad_norm": 9.026261329650879, + "learning_rate": 9.577143184113711e-08, + "logits/chosen": -1.2394530773162842, + "logits/rejected": -0.97979736328125, + "logps/chosen": -335.4156188964844, + "logps/rejected": -352.23748779296875, + "loss": 0.0358, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.465161085128784, + "rewards/margins": 6.535546779632568, + "rewards/rejected": -9.005468368530273, + "step": 8810 + }, + { + "epoch": 2.2238678977720356, + "grad_norm": 7.27154016494751, + "learning_rate": 9.519483439618075e-08, + "logits/chosen": -1.1887328624725342, + "logits/rejected": -1.03375244140625, + "logps/chosen": -336.2124938964844, + "logps/rejected": -355.7875061035156, + "loss": 0.0385, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.6427979469299316, + "rewards/margins": 6.421679496765137, + "rewards/rejected": -9.065234184265137, + "step": 8820 + }, + { + "epoch": 2.226388932656856, + "grad_norm": 25.24820899963379, + "learning_rate": 9.461956945410676e-08, + "logits/chosen": -1.2418029308319092, + "logits/rejected": -1.0834472179412842, + "logps/chosen": -315.5249938964844, + "logps/rejected": -335.1499938964844, + "loss": 0.0358, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 
-2.6830687522888184, + "rewards/margins": 6.211133003234863, + "rewards/rejected": -8.895312309265137, + "step": 8830 + }, + { + "epoch": 2.228909967541676, + "grad_norm": 7.522816181182861, + "learning_rate": 9.404564196657298e-08, + "logits/chosen": -1.1717712879180908, + "logits/rejected": -1.060333251953125, + "logps/chosen": -331.5874938964844, + "logps/rejected": -346.82501220703125, + "loss": 0.0516, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8359375, + "rewards/margins": 6.302929878234863, + "rewards/rejected": -9.135546684265137, + "step": 8840 + }, + { + "epoch": 2.231431002426496, + "grad_norm": 34.26163864135742, + "learning_rate": 9.347305687372475e-08, + "logits/chosen": -1.2626953125, + "logits/rejected": -1.1672241687774658, + "logps/chosen": -321.9312438964844, + "logps/rejected": -375.6875, + "loss": 0.0754, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.4271607398986816, + "rewards/margins": 6.099413871765137, + "rewards/rejected": -8.525781631469727, + "step": 8850 + }, + { + "epoch": 2.233952037311316, + "grad_norm": 31.264087677001953, + "learning_rate": 9.290181910415263e-08, + "logits/chosen": -1.2138671875, + "logits/rejected": -1.154516577720642, + "logps/chosen": -339.1875, + "logps/rejected": -354.82501220703125, + "loss": 0.0674, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.8809571266174316, + "rewards/margins": 5.992383003234863, + "rewards/rejected": -8.870702743530273, + "step": 8860 + }, + { + "epoch": 2.2364730721961363, + "grad_norm": 5.758995532989502, + "learning_rate": 9.233193357485014e-08, + "logits/chosen": -1.1931030750274658, + "logits/rejected": -1.066613793373108, + "logps/chosen": -323.17498779296875, + "logps/rejected": -354.875, + "loss": 0.0329, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.0781617164611816, + "rewards/margins": 6.630859375, + "rewards/rejected": -9.706640243530273, + "step": 8870 + }, + { + "epoch": 
2.238994107080957, + "grad_norm": 18.613447189331055, + "learning_rate": 9.176340519117106e-08, + "logits/chosen": -1.114111304283142, + "logits/rejected": -1.09332275390625, + "logps/chosen": -319.75, + "logps/rejected": -359.40625, + "loss": 0.0513, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.808239698410034, + "rewards/margins": 6.661913871765137, + "rewards/rejected": -9.473047256469727, + "step": 8880 + }, + { + "epoch": 2.241515141965777, + "grad_norm": 27.966007232666016, + "learning_rate": 9.11962388467874e-08, + "logits/chosen": NaN, + "logits/rejected": -1.0225493907928467, + "logps/chosen": -306.5375061035156, + "logps/rejected": -362.3125, + "loss": 0.0408, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.0164794921875, + "rewards/margins": 6.390038967132568, + "rewards/rejected": -9.404687881469727, + "step": 8890 + }, + { + "epoch": 2.244036176850597, + "grad_norm": 11.037649154663086, + "learning_rate": 9.063043942364717e-08, + "logits/chosen": -1.1480591297149658, + "logits/rejected": -1.0681030750274658, + "logps/chosen": -308.35626220703125, + "logps/rejected": -357.2562561035156, + "loss": 0.0474, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.8564209938049316, + "rewards/margins": 6.332812309265137, + "rewards/rejected": -9.19140625, + "step": 8900 + }, + { + "epoch": 2.2465572117354173, + "grad_norm": 61.01218032836914, + "learning_rate": 9.006601179193283e-08, + "logits/chosen": -1.119531273841858, + "logits/rejected": -0.996899425983429, + "logps/chosen": -311.484375, + "logps/rejected": -325.38751220703125, + "loss": 0.0653, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.7358460426330566, + "rewards/margins": 6.0791015625, + "rewards/rejected": -8.814844131469727, + "step": 8910 + }, + { + "epoch": 2.2490782466202375, + "grad_norm": 21.154726028442383, + "learning_rate": 8.950296081001846e-08, + "logits/chosen": -1.11468505859375, + "logits/rejected": -1.112548828125, + 
"logps/chosen": -331.95623779296875, + "logps/rejected": -356.8125, + "loss": 0.0519, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.967517137527466, + "rewards/margins": 6.508008003234863, + "rewards/rejected": -9.478124618530273, + "step": 8920 + }, + { + "epoch": 2.251599281505058, + "grad_norm": 13.69310474395752, + "learning_rate": 8.894129132442898e-08, + "logits/chosen": -1.1307251453399658, + "logits/rejected": -1.028753638267517, + "logps/chosen": -309.26873779296875, + "logps/rejected": -339.32501220703125, + "loss": 0.0515, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8682618141174316, + "rewards/margins": 6.331250190734863, + "rewards/rejected": -9.195703506469727, + "step": 8930 + }, + { + "epoch": 2.254120316389878, + "grad_norm": 9.159889221191406, + "learning_rate": 8.838100816979751e-08, + "logits/chosen": -1.133874535560608, + "logits/rejected": -0.9897094964981079, + "logps/chosen": -309.54376220703125, + "logps/rejected": -360.1937561035156, + "loss": 0.046, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.882641553878784, + "rewards/margins": 6.360547065734863, + "rewards/rejected": -9.245312690734863, + "step": 8940 + }, + { + "epoch": 2.2566413512746983, + "grad_norm": 8.470044136047363, + "learning_rate": 8.782211616882451e-08, + "logits/chosen": -1.19439697265625, + "logits/rejected": -1.115759253501892, + "logps/chosen": -317.1499938964844, + "logps/rejected": -356.66876220703125, + "loss": 0.0725, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.3541502952575684, + "rewards/margins": 6.437109470367432, + "rewards/rejected": -8.792577743530273, + "step": 8950 + }, + { + "epoch": 2.2591623861595185, + "grad_norm": 29.923450469970703, + "learning_rate": 8.726462013223568e-08, + "logits/chosen": -1.2116210460662842, + "logits/rejected": -1.075598120689392, + "logps/chosen": -320.91876220703125, + "logps/rejected": -351.6000061035156, + "loss": 0.1267, + 
"rewards/accuracies": 0.965624988079071, + "rewards/chosen": -2.960766553878784, + "rewards/margins": 6.324804782867432, + "rewards/rejected": -9.283203125, + "step": 8960 + }, + { + "epoch": 2.2616834210443386, + "grad_norm": 20.205322265625, + "learning_rate": 8.67085248587408e-08, + "logits/chosen": -1.205041527748108, + "logits/rejected": -1.1007080078125, + "logps/chosen": -360.4312438964844, + "logps/rejected": -363.23748779296875, + "loss": 0.041, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.801135301589966, + "rewards/margins": 6.781445503234863, + "rewards/rejected": -9.582616806030273, + "step": 8970 + }, + { + "epoch": 2.2642044559291588, + "grad_norm": 39.44192886352539, + "learning_rate": 8.615383513499271e-08, + "logits/chosen": -1.1978759765625, + "logits/rejected": -1.099829077720642, + "logps/chosen": -332.51873779296875, + "logps/rejected": -364.6000061035156, + "loss": 0.07, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.9340453147888184, + "rewards/margins": 6.63330078125, + "rewards/rejected": -9.565234184265137, + "step": 8980 + }, + { + "epoch": 2.266725490813979, + "grad_norm": 27.780113220214844, + "learning_rate": 8.56005557355455e-08, + "logits/chosen": -1.223425269126892, + "logits/rejected": -1.102380394935608, + "logps/chosen": -299.54376220703125, + "logps/rejected": -354.45001220703125, + "loss": 0.0375, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2032470703125, + "rewards/margins": 6.170507907867432, + "rewards/rejected": -9.373827934265137, + "step": 8990 + }, + { + "epoch": 2.2692465256987995, + "grad_norm": 33.395301818847656, + "learning_rate": 8.50486914228138e-08, + "logits/chosen": -1.200769066810608, + "logits/rejected": -0.997790515422821, + "logps/chosen": -330.96875, + "logps/rejected": -359.1499938964844, + "loss": 0.0513, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.3094482421875, + "rewards/margins": 6.15234375, + "rewards/rejected": 
-9.4609375, + "step": 9000 + }, + { + "epoch": 2.2717675605836196, + "grad_norm": 12.158639907836914, + "learning_rate": 8.449824694703192e-08, + "logits/chosen": -1.0890014171600342, + "logits/rejected": -1.094213843345642, + "logps/chosen": -318.39373779296875, + "logps/rejected": -357.125, + "loss": 0.0646, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.5050995349884033, + "rewards/margins": 6.290136814117432, + "rewards/rejected": -9.798437118530273, + "step": 9010 + }, + { + "epoch": 2.2742885954684398, + "grad_norm": 19.436952590942383, + "learning_rate": 8.39492270462126e-08, + "logits/chosen": -1.182153344154358, + "logits/rejected": -1.0277221202850342, + "logps/chosen": -321.54376220703125, + "logps/rejected": -350.5874938964844, + "loss": 0.049, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.445483446121216, + "rewards/margins": 6.463281154632568, + "rewards/rejected": -9.912500381469727, + "step": 9020 + }, + { + "epoch": 2.27680963035326, + "grad_norm": 7.460870742797852, + "learning_rate": 8.340163644610634e-08, + "logits/chosen": -1.094354271888733, + "logits/rejected": -0.9900146722793579, + "logps/chosen": -307.91876220703125, + "logps/rejected": -335.13751220703125, + "loss": 0.0455, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8498778343200684, + "rewards/margins": 6.662304878234863, + "rewards/rejected": -9.508984565734863, + "step": 9030 + }, + { + "epoch": 2.2793306652380805, + "grad_norm": 6.054429531097412, + "learning_rate": 8.285547986016081e-08, + "logits/chosen": -1.124536156654358, + "logits/rejected": -1.0042846202850342, + "logps/chosen": -295.70001220703125, + "logps/rejected": -347.45001220703125, + "loss": 0.0501, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.805468797683716, + "rewards/margins": 6.591894626617432, + "rewards/rejected": -9.399609565734863, + "step": 9040 + }, + { + "epoch": 2.2818517001229006, + "grad_norm": 10.674135208129883, + 
"learning_rate": 8.231076198948044e-08, + "logits/chosen": -1.124176025390625, + "logits/rejected": -1.138848900794983, + "logps/chosen": -287.9125061035156, + "logps/rejected": -384.2875061035156, + "loss": 0.0596, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.860626220703125, + "rewards/margins": 6.3427734375, + "rewards/rejected": -9.203516006469727, + "step": 9050 + }, + { + "epoch": 2.2843727350077208, + "grad_norm": 14.529641151428223, + "learning_rate": 8.176748752278543e-08, + "logits/chosen": -1.1708495616912842, + "logits/rejected": -1.1232421398162842, + "logps/chosen": -305.9750061035156, + "logps/rejected": -369.6499938964844, + "loss": 0.0468, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.926654100418091, + "rewards/margins": 6.535742282867432, + "rewards/rejected": -9.466211318969727, + "step": 9060 + }, + { + "epoch": 2.286893769892541, + "grad_norm": 67.26555633544922, + "learning_rate": 8.122566113637203e-08, + "logits/chosen": -1.163580298423767, + "logits/rejected": -1.071862816810608, + "logps/chosen": -285.9125061035156, + "logps/rejected": -331.9624938964844, + "loss": 0.054, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.461444139480591, + "rewards/margins": 6.383984565734863, + "rewards/rejected": -8.845312118530273, + "step": 9070 + }, + { + "epoch": 2.289414804777361, + "grad_norm": 19.25269889831543, + "learning_rate": 8.068528749407169e-08, + "logits/chosen": -1.198327660560608, + "logits/rejected": -1.1657836437225342, + "logps/chosen": -316.96875, + "logps/rejected": -348.9375, + "loss": 0.0616, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.714787244796753, + "rewards/margins": 6.205859184265137, + "rewards/rejected": -8.924219131469727, + "step": 9080 + }, + { + "epoch": 2.291935839662181, + "grad_norm": 12.033791542053223, + "learning_rate": 8.014637124721149e-08, + "logits/chosen": -1.12884521484375, + "logits/rejected": -1.029595971107483, + "logps/chosen": 
-309.9937438964844, + "logps/rejected": -351.01251220703125, + "loss": 0.0369, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0118165016174316, + "rewards/margins": 6.296484470367432, + "rewards/rejected": -9.310546875, + "step": 9090 + }, + { + "epoch": 2.2944568745470013, + "grad_norm": 22.721290588378906, + "learning_rate": 7.960891703457362e-08, + "logits/chosen": -1.1833984851837158, + "logits/rejected": -1.060644507408142, + "logps/chosen": -336.0375061035156, + "logps/rejected": -352.9125061035156, + "loss": 0.0504, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.138824462890625, + "rewards/margins": 6.34765625, + "rewards/rejected": -9.487500190734863, + "step": 9100 + }, + { + "epoch": 2.296977909431822, + "grad_norm": 29.020238876342773, + "learning_rate": 7.907292948235555e-08, + "logits/chosen": -1.196441650390625, + "logits/rejected": -1.1320068836212158, + "logps/chosen": -339.8187561035156, + "logps/rejected": -365.3374938964844, + "loss": 0.038, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.8363280296325684, + "rewards/margins": 6.8603515625, + "rewards/rejected": -9.697070121765137, + "step": 9110 + }, + { + "epoch": 2.299498944316642, + "grad_norm": 6.52321195602417, + "learning_rate": 7.853841320413065e-08, + "logits/chosen": -1.078771948814392, + "logits/rejected": -1.09234619140625, + "logps/chosen": -298.78125, + "logps/rejected": -344.0874938964844, + "loss": 0.0568, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.917797803878784, + "rewards/margins": 6.548242092132568, + "rewards/rejected": -9.466796875, + "step": 9120 + }, + { + "epoch": 2.302019979201462, + "grad_norm": 2.411801338195801, + "learning_rate": 7.800537280080785e-08, + "logits/chosen": -1.173828125, + "logits/rejected": -1.09161376953125, + "logps/chosen": -348.33123779296875, + "logps/rejected": -372.1000061035156, + "loss": 0.0562, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.947094678878784, + 
"rewards/margins": 6.550000190734863, + "rewards/rejected": -9.495312690734863, + "step": 9130 + }, + { + "epoch": 2.3045410140862823, + "grad_norm": 19.04088020324707, + "learning_rate": 7.747381286059232e-08, + "logits/chosen": -1.05914306640625, + "logits/rejected": -0.9982833862304688, + "logps/chosen": -307.64373779296875, + "logps/rejected": -334.9125061035156, + "loss": 0.0653, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.0763823986053467, + "rewards/margins": 6.140527248382568, + "rewards/rejected": -9.219141006469727, + "step": 9140 + }, + { + "epoch": 2.3070620489711025, + "grad_norm": 27.200042724609375, + "learning_rate": 7.694373795894621e-08, + "logits/chosen": -1.1356079578399658, + "logits/rejected": -1.057153344154358, + "logps/chosen": -312.2124938964844, + "logps/rejected": -328.0874938964844, + "loss": 0.0635, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.725433349609375, + "rewards/margins": 6.452929496765137, + "rewards/rejected": -9.180078506469727, + "step": 9150 + }, + { + "epoch": 2.309583083855923, + "grad_norm": 12.557526588439941, + "learning_rate": 7.641515265854882e-08, + "logits/chosen": -1.166192650794983, + "logits/rejected": -1.040686011314392, + "logps/chosen": -303.3374938964844, + "logps/rejected": -348.63751220703125, + "loss": 0.0388, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.8875975608825684, + "rewards/margins": 6.574609279632568, + "rewards/rejected": -9.466601371765137, + "step": 9160 + }, + { + "epoch": 2.312104118740743, + "grad_norm": 9.453475952148438, + "learning_rate": 7.588806150925755e-08, + "logits/chosen": -1.136755347251892, + "logits/rejected": -1.0892333984375, + "logps/chosen": -350.48126220703125, + "logps/rejected": -367.7124938964844, + "loss": 0.0701, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.347241163253784, + "rewards/margins": 6.4130859375, + "rewards/rejected": -9.757421493530273, + "step": 9170 + }, + { + 
"epoch": 2.3146251536255633, + "grad_norm": 33.222103118896484, + "learning_rate": 7.536246904806878e-08, + "logits/chosen": -1.1843140125274658, + "logits/rejected": -1.15594482421875, + "logps/chosen": -323.76251220703125, + "logps/rejected": -376.875, + "loss": 0.055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.750439405441284, + "rewards/margins": 6.929296970367432, + "rewards/rejected": -9.6787109375, + "step": 9180 + }, + { + "epoch": 2.3171461885103835, + "grad_norm": 67.34077453613281, + "learning_rate": 7.483837979907886e-08, + "logits/chosen": -1.2142333984375, + "logits/rejected": -1.0245850086212158, + "logps/chosen": -307.46875, + "logps/rejected": -341.20001220703125, + "loss": 0.0518, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.303955078125, + "rewards/margins": 6.383008003234863, + "rewards/rejected": -9.687108993530273, + "step": 9190 + }, + { + "epoch": 2.3196672233952036, + "grad_norm": 11.72309684753418, + "learning_rate": 7.431579827344486e-08, + "logits/chosen": -1.1179687976837158, + "logits/rejected": -1.111181616783142, + "logps/chosen": -316.6625061035156, + "logps/rejected": -353.76251220703125, + "loss": 0.0385, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7853331565856934, + "rewards/margins": 6.5966796875, + "rewards/rejected": -9.383203506469727, + "step": 9200 + }, + { + "epoch": 2.3221882582800237, + "grad_norm": 34.17678451538086, + "learning_rate": 7.379472896934619e-08, + "logits/chosen": -1.0644652843475342, + "logits/rejected": -1.1063232421875, + "logps/chosen": -312.13751220703125, + "logps/rejected": -347.4750061035156, + "loss": 0.0559, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.909912109375, + "rewards/margins": 6.501367092132568, + "rewards/rejected": -9.409375190734863, + "step": 9210 + }, + { + "epoch": 2.3247092931648443, + "grad_norm": 22.319129943847656, + "learning_rate": 7.327517637194535e-08, + "logits/chosen": NaN, + 
"logits/rejected": -1.126708984375, + "logps/chosen": -296.3125, + "logps/rejected": -369.5625, + "loss": 0.0398, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7155518531799316, + "rewards/margins": 6.420312404632568, + "rewards/rejected": -9.133593559265137, + "step": 9220 + }, + { + "epoch": 2.3272303280496645, + "grad_norm": 59.209041595458984, + "learning_rate": 7.275714495334997e-08, + "logits/chosen": -1.2271239757537842, + "logits/rejected": -1.120031714439392, + "logps/chosen": -300.76873779296875, + "logps/rejected": -343.91876220703125, + "loss": 0.0862, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.5219664573669434, + "rewards/margins": 6.419726371765137, + "rewards/rejected": -8.942968368530273, + "step": 9230 + }, + { + "epoch": 2.3297513629344846, + "grad_norm": 6.592267990112305, + "learning_rate": 7.224063917257369e-08, + "logits/chosen": -1.124670386314392, + "logits/rejected": -1.000756859779358, + "logps/chosen": -316.08123779296875, + "logps/rejected": -371.2749938964844, + "loss": 0.0413, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.987893581390381, + "rewards/margins": 6.382421970367432, + "rewards/rejected": -9.369531631469727, + "step": 9240 + }, + { + "epoch": 2.3322723978193047, + "grad_norm": 6.588650703430176, + "learning_rate": 7.172566347549808e-08, + "logits/chosen": -1.1592895984649658, + "logits/rejected": -1.0453979969024658, + "logps/chosen": -307.1937561035156, + "logps/rejected": -376.92498779296875, + "loss": 0.066, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.601367235183716, + "rewards/margins": 6.116015434265137, + "rewards/rejected": -8.718358993530273, + "step": 9250 + }, + { + "epoch": 2.334793432704125, + "grad_norm": 6.252317905426025, + "learning_rate": 7.12122222948345e-08, + "logits/chosen": -1.1423218250274658, + "logits/rejected": -1.06195068359375, + "logps/chosen": -303.61248779296875, + "logps/rejected": -329.9750061035156, + 
"loss": 0.0515, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.7229981422424316, + "rewards/margins": 6.4677734375, + "rewards/rejected": -9.189844131469727, + "step": 9260 + }, + { + "epoch": 2.3373144675889455, + "grad_norm": 28.286134719848633, + "learning_rate": 7.070032005008567e-08, + "logits/chosen": -1.1438720226287842, + "logits/rejected": NaN, + "logps/chosen": -327.5625, + "logps/rejected": -363.13751220703125, + "loss": 0.0557, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.8480467796325684, + "rewards/margins": 6.434179782867432, + "rewards/rejected": -9.281641006469727, + "step": 9270 + }, + { + "epoch": 2.3398355024737656, + "grad_norm": 24.243497848510742, + "learning_rate": 7.018996114750766e-08, + "logits/chosen": -1.217199683189392, + "logits/rejected": -1.0411498546600342, + "logps/chosen": -352.23126220703125, + "logps/rejected": -357.5874938964844, + "loss": 0.0546, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.2514891624450684, + "rewards/margins": 6.593163967132568, + "rewards/rejected": -9.844531059265137, + "step": 9280 + }, + { + "epoch": 2.3423565373585857, + "grad_norm": 25.643503189086914, + "learning_rate": 6.968114998007232e-08, + "logits/chosen": -1.198950171470642, + "logits/rejected": -1.0894286632537842, + "logps/chosen": -321.5375061035156, + "logps/rejected": -366.3999938964844, + "loss": 0.0461, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7505125999450684, + "rewards/margins": 6.627539157867432, + "rewards/rejected": -9.380078315734863, + "step": 9290 + }, + { + "epoch": 2.344877572243406, + "grad_norm": 23.797788619995117, + "learning_rate": 6.917389092742893e-08, + "logits/chosen": -1.2471191883087158, + "logits/rejected": -1.09259033203125, + "logps/chosen": -331.9937438964844, + "logps/rejected": -369.95001220703125, + "loss": 0.0695, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.0775389671325684, + "rewards/margins": 
6.3974609375, + "rewards/rejected": -9.473437309265137, + "step": 9300 + }, + { + "epoch": 2.347398607128226, + "grad_norm": 66.45001983642578, + "learning_rate": 6.866818835586687e-08, + "logits/chosen": -1.1241180896759033, + "logits/rejected": -1.132360816001892, + "logps/chosen": -302.0687561035156, + "logps/rejected": -342.38751220703125, + "loss": 0.0405, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0251221656799316, + "rewards/margins": 6.545507907867432, + "rewards/rejected": -9.572656631469727, + "step": 9310 + }, + { + "epoch": 2.349919642013046, + "grad_norm": 17.576152801513672, + "learning_rate": 6.816404661827785e-08, + "logits/chosen": -1.078521728515625, + "logits/rejected": -1.0055725574493408, + "logps/chosen": -329.2250061035156, + "logps/rejected": -354.4624938964844, + "loss": 0.0411, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.387951612472534, + "rewards/margins": 6.321093559265137, + "rewards/rejected": -9.709765434265137, + "step": 9320 + }, + { + "epoch": 2.3524406768978667, + "grad_norm": 34.27827072143555, + "learning_rate": 6.766147005411879e-08, + "logits/chosen": -1.078393578529358, + "logits/rejected": NaN, + "logps/chosen": -310.9375, + "logps/rejected": -347.67498779296875, + "loss": 0.0779, + "rewards/accuracies": 0.953125, + "rewards/chosen": -3.5388426780700684, + "rewards/margins": 6.27001953125, + "rewards/rejected": -9.808984756469727, + "step": 9330 + }, + { + "epoch": 2.354961711782687, + "grad_norm": 17.350936889648438, + "learning_rate": 6.716046298937384e-08, + "logits/chosen": -1.1638672351837158, + "logits/rejected": -1.191857933998108, + "logps/chosen": -320.0062561035156, + "logps/rejected": -378.13751220703125, + "loss": 0.0334, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.066455125808716, + "rewards/margins": 6.711133003234863, + "rewards/rejected": -9.775781631469727, + "step": 9340 + }, + { + "epoch": 2.357482746667507, + "grad_norm": 35.49551010131836, + 
"learning_rate": 6.666102973651782e-08, + "logits/chosen": -1.2030150890350342, + "logits/rejected": -1.0501830577850342, + "logps/chosen": -332.7250061035156, + "logps/rejected": -352.5375061035156, + "loss": 0.0459, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.151684522628784, + "rewards/margins": 6.623632907867432, + "rewards/rejected": -9.774218559265137, + "step": 9350 + }, + { + "epoch": 2.360003781552327, + "grad_norm": 40.842647552490234, + "learning_rate": 6.616317459447851e-08, + "logits/chosen": -1.1645386219024658, + "logits/rejected": -1.0828826427459717, + "logps/chosen": -300.0406188964844, + "logps/rejected": -338.8125, + "loss": 0.071, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.6125426292419434, + "rewards/margins": 6.321679592132568, + "rewards/rejected": -8.932812690734863, + "step": 9360 + }, + { + "epoch": 2.3625248164371473, + "grad_norm": 14.02625846862793, + "learning_rate": 6.566690184860028e-08, + "logits/chosen": -1.1471679210662842, + "logits/rejected": -1.134558081626892, + "logps/chosen": -314.38751220703125, + "logps/rejected": -344.0625, + "loss": 0.0514, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1183838844299316, + "rewards/margins": 6.333593845367432, + "rewards/rejected": -9.451171875, + "step": 9370 + }, + { + "epoch": 2.365045851321968, + "grad_norm": 19.2847957611084, + "learning_rate": 6.517221577060644e-08, + "logits/chosen": -1.1254456043243408, + "logits/rejected": -1.0552978515625, + "logps/chosen": -308.3187561035156, + "logps/rejected": -365.5, + "loss": 0.0479, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.9517579078674316, + "rewards/margins": 6.430859565734863, + "rewards/rejected": -9.380468368530273, + "step": 9380 + }, + { + "epoch": 2.367566886206788, + "grad_norm": 14.257923126220703, + "learning_rate": 6.46791206185631e-08, + "logits/chosen": -1.1007812023162842, + "logits/rejected": -1.085473656654358, + "logps/chosen": 
-322.70623779296875, + "logps/rejected": -349.9624938964844, + "loss": 0.0551, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.730761766433716, + "rewards/margins": 6.459912300109863, + "rewards/rejected": -9.19140625, + "step": 9390 + }, + { + "epoch": 2.370087921091608, + "grad_norm": 5.871348857879639, + "learning_rate": 6.418762063684239e-08, + "logits/chosen": -1.181860327720642, + "logits/rejected": -1.190222144126892, + "logps/chosen": -309.71875, + "logps/rejected": -350.5625, + "loss": 0.0614, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.810070753097534, + "rewards/margins": 6.241406440734863, + "rewards/rejected": -9.053515434265137, + "step": 9400 + }, + { + "epoch": 2.3726089559764283, + "grad_norm": 18.109651565551758, + "learning_rate": 6.36977200560856e-08, + "logits/chosen": -1.09820556640625, + "logits/rejected": -1.0347900390625, + "logps/chosen": -330.88751220703125, + "logps/rejected": -367.125, + "loss": 0.0863, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.9474244117736816, + "rewards/margins": 6.869531154632568, + "rewards/rejected": -9.816015243530273, + "step": 9410 + }, + { + "epoch": 2.3751299908612484, + "grad_norm": 25.832246780395508, + "learning_rate": 6.320942309316704e-08, + "logits/chosen": -1.1589676141738892, + "logits/rejected": -1.07421875, + "logps/chosen": -284.4937438964844, + "logps/rejected": -326.01251220703125, + "loss": 0.091, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2571043968200684, + "rewards/margins": 6.000195503234863, + "rewards/rejected": -9.259374618530273, + "step": 9420 + }, + { + "epoch": 2.3776510257460686, + "grad_norm": 43.289939880371094, + "learning_rate": 6.272273395115794e-08, + "logits/chosen": -1.174719214439392, + "logits/rejected": -1.1595947742462158, + "logps/chosen": -342.1187438964844, + "logps/rejected": -381.57501220703125, + "loss": 0.0507, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 
-2.741284132003784, + "rewards/margins": 6.70703125, + "rewards/rejected": -9.451952934265137, + "step": 9430 + }, + { + "epoch": 2.380172060630889, + "grad_norm": 5.902862548828125, + "learning_rate": 6.223765681928977e-08, + "logits/chosen": -1.118981957435608, + "logits/rejected": -1.092565894126892, + "logps/chosen": -320.6625061035156, + "logps/rejected": -389.26251220703125, + "loss": 0.0369, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.941821336746216, + "rewards/margins": 6.872265815734863, + "rewards/rejected": -9.816797256469727, + "step": 9440 + }, + { + "epoch": 2.3826930955157093, + "grad_norm": 50.332122802734375, + "learning_rate": 6.175419587291853e-08, + "logits/chosen": -1.236669898033142, + "logits/rejected": -1.1200439929962158, + "logps/chosen": -332.0249938964844, + "logps/rejected": -367.3999938964844, + "loss": 0.0703, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.7341766357421875, + "rewards/margins": 6.459570407867432, + "rewards/rejected": -9.197265625, + "step": 9450 + }, + { + "epoch": 2.3852141304005294, + "grad_norm": 21.89457130432129, + "learning_rate": 6.127235527348862e-08, + "logits/chosen": -1.2223694324493408, + "logits/rejected": -1.11663818359375, + "logps/chosen": -335.78125, + "logps/rejected": -347.61248779296875, + "loss": 0.0593, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.610461473464966, + "rewards/margins": 6.554296970367432, + "rewards/rejected": -9.166796684265137, + "step": 9460 + }, + { + "epoch": 2.3877351652853496, + "grad_norm": 55.019737243652344, + "learning_rate": 6.079213916849737e-08, + "logits/chosen": -1.1846435070037842, + "logits/rejected": -1.0324188470840454, + "logps/chosen": -317.8374938964844, + "logps/rejected": -324.79998779296875, + "loss": 0.0501, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.8798460960388184, + "rewards/margins": 6.211133003234863, + "rewards/rejected": -9.086328506469727, + "step": 9470 + }, 
+ { + "epoch": 2.3902562001701697, + "grad_norm": 13.911991119384766, + "learning_rate": 6.031355169145882e-08, + "logits/chosen": -1.269677758216858, + "logits/rejected": -1.11077880859375, + "logps/chosen": -315.8187561035156, + "logps/rejected": -347.4125061035156, + "loss": 0.0405, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2803986072540283, + "rewards/margins": 6.312597751617432, + "rewards/rejected": -8.592187881469727, + "step": 9480 + }, + { + "epoch": 2.3927772350549903, + "grad_norm": 2.236504077911377, + "learning_rate": 5.983659696186868e-08, + "logits/chosen": -1.07550048828125, + "logits/rejected": -1.106024146080017, + "logps/chosen": -290.0843811035156, + "logps/rejected": -364.3125, + "loss": 0.0337, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.88458251953125, + "rewards/margins": 6.879296779632568, + "rewards/rejected": -9.765039443969727, + "step": 9490 + }, + { + "epoch": 2.3952982699398104, + "grad_norm": 5.578651428222656, + "learning_rate": 5.9361279085168274e-08, + "logits/chosen": -1.188720703125, + "logits/rejected": -1.0909423828125, + "logps/chosen": -308.5687561035156, + "logps/rejected": -386.76251220703125, + "loss": 0.0365, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.633349657058716, + "rewards/margins": 6.662109375, + "rewards/rejected": -9.301172256469727, + "step": 9500 + }, + { + "epoch": 2.3978193048246306, + "grad_norm": 26.881193161010742, + "learning_rate": 5.888760215270988e-08, + "logits/chosen": -1.137841820716858, + "logits/rejected": -1.0743408203125, + "logps/chosen": -294.98748779296875, + "logps/rejected": -348.79376220703125, + "loss": 0.0548, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.935420274734497, + "rewards/margins": 6.504101753234863, + "rewards/rejected": -9.437891006469727, + "step": 9510 + }, + { + "epoch": 2.4003403397094507, + "grad_norm": 13.205660820007324, + "learning_rate": 5.8415570241720916e-08, + 
"logits/chosen": -1.1109740734100342, + "logits/rejected": -1.0967925786972046, + "logps/chosen": -330.2250061035156, + "logps/rejected": -377.625, + "loss": 0.0674, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.0494627952575684, + "rewards/margins": 6.604687690734863, + "rewards/rejected": -9.655858993530273, + "step": 9520 + }, + { + "epoch": 2.402861374594271, + "grad_norm": 20.070270538330078, + "learning_rate": 5.7945187415269076e-08, + "logits/chosen": -1.134033203125, + "logits/rejected": -1.137353539466858, + "logps/chosen": -299.6312561035156, + "logps/rejected": -360.17498779296875, + "loss": 0.0416, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.712261915206909, + "rewards/margins": 6.659375190734863, + "rewards/rejected": -9.369140625, + "step": 9530 + }, + { + "epoch": 2.405382409479091, + "grad_norm": 17.515649795532227, + "learning_rate": 5.747645772222767e-08, + "logits/chosen": -1.218408226966858, + "logits/rejected": -1.003991723060608, + "logps/chosen": -316.70001220703125, + "logps/rejected": -351.42498779296875, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.394238233566284, + "rewards/margins": 6.923242092132568, + "rewards/rejected": -9.311132431030273, + "step": 9540 + }, + { + "epoch": 2.4079034443639116, + "grad_norm": 85.17425537109375, + "learning_rate": 5.700938519724016e-08, + "logits/chosen": -1.1747620105743408, + "logits/rejected": -1.0635712146759033, + "logps/chosen": -292.5625, + "logps/rejected": -363.4125061035156, + "loss": 0.0702, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.712146043777466, + "rewards/margins": 6.213086128234863, + "rewards/rejected": -8.924609184265137, + "step": 9550 + }, + { + "epoch": 2.4104244792487317, + "grad_norm": 36.34426498413086, + "learning_rate": 5.6543973860685796e-08, + "logits/chosen": -1.1073486804962158, + "logits/rejected": NaN, + "logps/chosen": -295.4375, + "logps/rejected": 
-338.26251220703125, + "loss": 0.0661, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.8862547874450684, + "rewards/margins": 6.222265720367432, + "rewards/rejected": -9.106640815734863, + "step": 9560 + }, + { + "epoch": 2.412945514133552, + "grad_norm": 36.26641082763672, + "learning_rate": 5.608022771864515e-08, + "logits/chosen": -1.1702239513397217, + "logits/rejected": -1.047338843345642, + "logps/chosen": -310.48748779296875, + "logps/rejected": -341.0375061035156, + "loss": 0.0318, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.709491014480591, + "rewards/margins": 6.6884765625, + "rewards/rejected": -9.393359184265137, + "step": 9570 + }, + { + "epoch": 2.415466549018372, + "grad_norm": 32.18575668334961, + "learning_rate": 5.56181507628653e-08, + "logits/chosen": -1.1581542491912842, + "logits/rejected": -1.124108910560608, + "logps/chosen": -318.3500061035156, + "logps/rejected": -375.20623779296875, + "loss": 0.0487, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.131308078765869, + "rewards/margins": 6.637109279632568, + "rewards/rejected": -9.770703315734863, + "step": 9580 + }, + { + "epoch": 2.417987583903192, + "grad_norm": 7.18175745010376, + "learning_rate": 5.5157746970725614e-08, + "logits/chosen": -1.1223633289337158, + "logits/rejected": -0.990283191204071, + "logps/chosen": -303.8687438964844, + "logps/rejected": -339.54998779296875, + "loss": 0.0388, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.357025146484375, + "rewards/margins": 6.460156440734863, + "rewards/rejected": -9.817187309265137, + "step": 9590 + }, + { + "epoch": 2.4205086187880127, + "grad_norm": 8.686003684997559, + "learning_rate": 5.469902030520346e-08, + "logits/chosen": -1.137231469154358, + "logits/rejected": -1.021478295326233, + "logps/chosen": -297.88751220703125, + "logps/rejected": -333.13751220703125, + "loss": 0.0338, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 
-2.8655028343200684, + "rewards/margins": 6.748046875, + "rewards/rejected": -9.61328125, + "step": 9600 + }, + { + "epoch": 2.423029653672833, + "grad_norm": 23.312318801879883, + "learning_rate": 5.424197471484041e-08, + "logits/chosen": -1.124792456626892, + "logits/rejected": -1.041748046875, + "logps/chosen": -322.3374938964844, + "logps/rejected": -375.23748779296875, + "loss": 0.0519, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.4595704078674316, + "rewards/margins": 6.5380859375, + "rewards/rejected": -9.000391006469727, + "step": 9610 + }, + { + "epoch": 2.425550688557653, + "grad_norm": 6.126657485961914, + "learning_rate": 5.378661413370761e-08, + "logits/chosen": -1.0678832530975342, + "logits/rejected": -1.0589721202850342, + "logps/chosen": -297.79998779296875, + "logps/rejected": -340.59375, + "loss": 0.066, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.125317335128784, + "rewards/margins": 6.318359375, + "rewards/rejected": -9.444140434265137, + "step": 9620 + }, + { + "epoch": 2.428071723442473, + "grad_norm": 44.89402389526367, + "learning_rate": 5.333294248137268e-08, + "logits/chosen": -1.218359351158142, + "logits/rejected": -1.157678246498108, + "logps/chosen": -332.0562438964844, + "logps/rejected": -371.98748779296875, + "loss": 0.0779, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.752542018890381, + "rewards/margins": 6.654101371765137, + "rewards/rejected": -9.4052734375, + "step": 9630 + }, + { + "epoch": 2.4305927583272933, + "grad_norm": 26.419673919677734, + "learning_rate": 5.288096366286526e-08, + "logits/chosen": -1.1844971179962158, + "logits/rejected": -1.0656158924102783, + "logps/chosen": -319.45623779296875, + "logps/rejected": -349.2749938964844, + "loss": 0.0525, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.8952879905700684, + "rewards/margins": 6.634375095367432, + "rewards/rejected": -9.530077934265137, + "step": 9640 + }, + { + "epoch": 
2.4331137932121134, + "grad_norm": 43.81555938720703, + "learning_rate": 5.243068156864405e-08, + "logits/chosen": -1.0962402820587158, + "logits/rejected": -1.0572326183319092, + "logps/chosen": -314.1937561035156, + "logps/rejected": -377.42498779296875, + "loss": 0.0641, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.0970458984375, + "rewards/margins": 6.368554592132568, + "rewards/rejected": -9.466015815734863, + "step": 9650 + }, + { + "epoch": 2.4356348280969335, + "grad_norm": 29.52123260498047, + "learning_rate": 5.1982100074562776e-08, + "logits/chosen": -1.113500952720642, + "logits/rejected": -0.996478259563446, + "logps/chosen": -306.35626220703125, + "logps/rejected": -349.6499938964844, + "loss": 0.0655, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.409008741378784, + "rewards/margins": 6.287549018859863, + "rewards/rejected": -9.693359375, + "step": 9660 + }, + { + "epoch": 2.438155862981754, + "grad_norm": 21.85391616821289, + "learning_rate": 5.153522304183702e-08, + "logits/chosen": -1.110693335533142, + "logits/rejected": -1.025354027748108, + "logps/chosen": -297.4125061035156, + "logps/rejected": -359.5625, + "loss": 0.0429, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.4086670875549316, + "rewards/margins": 6.50390625, + "rewards/rejected": -9.913671493530273, + "step": 9670 + }, + { + "epoch": 2.4406768978665743, + "grad_norm": 56.9681510925293, + "learning_rate": 5.10900543170113e-08, + "logits/chosen": -1.1635253429412842, + "logits/rejected": -1.068750023841858, + "logps/chosen": -314.7124938964844, + "logps/rejected": -357.13751220703125, + "loss": 0.0654, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.184521436691284, + "rewards/margins": 6.593945503234863, + "rewards/rejected": -9.780077934265137, + "step": 9680 + }, + { + "epoch": 2.4431979327513944, + "grad_norm": 18.39649772644043, + "learning_rate": 5.064659773192542e-08, + "logits/chosen": -1.156762719154358, + 
"logits/rejected": -1.145349144935608, + "logps/chosen": -326.375, + "logps/rejected": -356.5874938964844, + "loss": 0.0303, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.018969774246216, + "rewards/margins": 6.5927734375, + "rewards/rejected": -9.611719131469727, + "step": 9690 + }, + { + "epoch": 2.4457189676362145, + "grad_norm": 8.979217529296875, + "learning_rate": 5.020485710368177e-08, + "logits/chosen": -1.1500060558319092, + "logits/rejected": -1.086267113685608, + "logps/chosen": -323.6625061035156, + "logps/rejected": -352.36248779296875, + "loss": 0.0548, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.845294237136841, + "rewards/margins": 6.4541015625, + "rewards/rejected": -9.298828125, + "step": 9700 + }, + { + "epoch": 2.448240002521035, + "grad_norm": 23.726537704467773, + "learning_rate": 4.9764836234612665e-08, + "logits/chosen": -1.1351196765899658, + "logits/rejected": -1.0086548328399658, + "logps/chosen": -327.1000061035156, + "logps/rejected": -352.2250061035156, + "loss": 0.0448, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.0148072242736816, + "rewards/margins": 6.356249809265137, + "rewards/rejected": -9.368749618530273, + "step": 9710 + }, + { + "epoch": 2.4507610374058553, + "grad_norm": 6.30889892578125, + "learning_rate": 4.932653891224719e-08, + "logits/chosen": -1.147802710533142, + "logits/rejected": -1.0699462890625, + "logps/chosen": -299.45001220703125, + "logps/rejected": -350.3125, + "loss": 0.0433, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.856768846511841, + "rewards/margins": 6.569531440734863, + "rewards/rejected": -9.425390243530273, + "step": 9720 + }, + { + "epoch": 2.4532820722906754, + "grad_norm": 10.74817943572998, + "learning_rate": 4.8889968909278824e-08, + "logits/chosen": -1.2151610851287842, + "logits/rejected": -1.161279320716858, + "logps/chosen": -328.82501220703125, + "logps/rejected": -359.54998779296875, + "loss": 0.0598, + 
"rewards/accuracies": 0.96875, + "rewards/chosen": -3.0374083518981934, + "rewards/margins": 6.2802734375, + "rewards/rejected": -9.314453125, + "step": 9730 + }, + { + "epoch": 2.4558031071754955, + "grad_norm": 20.406330108642578, + "learning_rate": 4.845512998353296e-08, + "logits/chosen": -1.159692406654358, + "logits/rejected": -1.1065948009490967, + "logps/chosen": -322.5874938964844, + "logps/rejected": -365.1000061035156, + "loss": 0.083, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5016846656799316, + "rewards/margins": 6.512890815734863, + "rewards/rejected": -9.017187118530273, + "step": 9740 + }, + { + "epoch": 2.4583241420603157, + "grad_norm": 14.469663619995117, + "learning_rate": 4.802202587793469e-08, + "logits/chosen": NaN, + "logits/rejected": -1.0395019054412842, + "logps/chosen": -281.01873779296875, + "logps/rejected": -371.25, + "loss": 0.044, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.469738721847534, + "rewards/margins": 6.765038967132568, + "rewards/rejected": -9.234766006469727, + "step": 9750 + }, + { + "epoch": 2.460845176945136, + "grad_norm": 60.83958435058594, + "learning_rate": 4.7590660320476236e-08, + "logits/chosen": -1.204382300376892, + "logits/rejected": -1.154455542564392, + "logps/chosen": -327.3062438964844, + "logps/rejected": -346.71875, + "loss": 0.0532, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.436450242996216, + "rewards/margins": 6.305078029632568, + "rewards/rejected": -8.7431640625, + "step": 9760 + }, + { + "epoch": 2.463366211829956, + "grad_norm": 64.44366455078125, + "learning_rate": 4.716103702418528e-08, + "logits/chosen": -1.1702392101287842, + "logits/rejected": -1.071496605873108, + "logps/chosen": -295.7093811035156, + "logps/rejected": -337.0625, + "loss": 0.0604, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.7420318126678467, + "rewards/margins": 6.470312595367432, + "rewards/rejected": -9.209765434265137, + "step": 
9770 + }, + { + "epoch": 2.4658872467147765, + "grad_norm": 8.146585464477539, + "learning_rate": 4.673315968709257e-08, + "logits/chosen": -1.226287841796875, + "logits/rejected": -1.0896484851837158, + "logps/chosen": -298.45001220703125, + "logps/rejected": -336.23748779296875, + "loss": 0.0539, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.3854613304138184, + "rewards/margins": 6.4619140625, + "rewards/rejected": -8.845703125, + "step": 9780 + }, + { + "epoch": 2.4684082815995967, + "grad_norm": 8.578082084655762, + "learning_rate": 4.630703199220054e-08, + "logits/chosen": -1.158361792564392, + "logits/rejected": -1.140661597251892, + "logps/chosen": -329.29998779296875, + "logps/rejected": -345.9624938964844, + "loss": 0.0689, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5289580821990967, + "rewards/margins": 6.202734470367432, + "rewards/rejected": -8.731640815734863, + "step": 9790 + }, + { + "epoch": 2.470929316484417, + "grad_norm": 24.780441284179688, + "learning_rate": 4.588265760745125e-08, + "logits/chosen": -1.122656226158142, + "logits/rejected": -1.034277319908142, + "logps/chosen": -276.54376220703125, + "logps/rejected": -323.3374938964844, + "loss": 0.0643, + "rewards/accuracies": 0.9593750238418579, + "rewards/chosen": -2.920092821121216, + "rewards/margins": 6.118359565734863, + "rewards/rejected": -9.040234565734863, + "step": 9800 + }, + { + "epoch": 2.473450351369237, + "grad_norm": 10.892682075500488, + "learning_rate": 4.546004018569488e-08, + "logits/chosen": -1.15179443359375, + "logits/rejected": -1.143945336341858, + "logps/chosen": -320.2749938964844, + "logps/rejected": -362.1875, + "loss": 0.0485, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.894238233566284, + "rewards/margins": 6.426562309265137, + "rewards/rejected": -9.318750381469727, + "step": 9810 + }, + { + "epoch": 2.475971386254057, + "grad_norm": 5.433866500854492, + "learning_rate": 4.503918336465859e-08, + 
"logits/chosen": -1.178442358970642, + "logits/rejected": -0.9962524175643921, + "logps/chosen": -328.9312438964844, + "logps/rejected": -355.7749938964844, + "loss": 0.0585, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0894532203674316, + "rewards/margins": 6.390454292297363, + "rewards/rejected": -9.479296684265137, + "step": 9820 + }, + { + "epoch": 2.4784924211388777, + "grad_norm": 105.5046615600586, + "learning_rate": 4.462009076691472e-08, + "logits/chosen": -1.1352112293243408, + "logits/rejected": -1.079833984375, + "logps/chosen": -314.6312561035156, + "logps/rejected": -349.4125061035156, + "loss": 0.0829, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.270092725753784, + "rewards/margins": 6.248193264007568, + "rewards/rejected": -9.517969131469727, + "step": 9830 + }, + { + "epoch": 2.481013456023698, + "grad_norm": 10.480962753295898, + "learning_rate": 4.420276599984993e-08, + "logits/chosen": -1.086206078529358, + "logits/rejected": -1.0146484375, + "logps/chosen": -320.9437561035156, + "logps/rejected": -371.88751220703125, + "loss": 0.0416, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.675891160964966, + "rewards/margins": 6.841796875, + "rewards/rejected": -9.517969131469727, + "step": 9840 + }, + { + "epoch": 2.483534490908518, + "grad_norm": 19.221046447753906, + "learning_rate": 4.3787212655634234e-08, + "logits/chosen": -1.1977050304412842, + "logits/rejected": -1.037451148033142, + "logps/chosen": -306.28125, + "logps/rejected": -359.1625061035156, + "loss": 0.0425, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.0283446311950684, + "rewards/margins": 6.506054878234863, + "rewards/rejected": -9.528905868530273, + "step": 9850 + }, + { + "epoch": 2.486055525793338, + "grad_norm": 8.99593734741211, + "learning_rate": 4.337343431118973e-08, + "logits/chosen": -1.1455199718475342, + "logits/rejected": -1.07769775390625, + "logps/chosen": -307.5062561035156, + 
"logps/rejected": -359.03125, + "loss": 0.0468, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.8803467750549316, + "rewards/margins": 6.687304496765137, + "rewards/rejected": -9.559374809265137, + "step": 9860 + }, + { + "epoch": 2.4885765606781582, + "grad_norm": 23.91136360168457, + "learning_rate": 4.296143452816009e-08, + "logits/chosen": -1.186376929283142, + "logits/rejected": -1.1511108875274658, + "logps/chosen": -332.21875, + "logps/rejected": -385.92498779296875, + "loss": 0.0367, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.979846239089966, + "rewards/margins": 6.645117282867432, + "rewards/rejected": -9.623827934265137, + "step": 9870 + }, + { + "epoch": 2.4910975955629784, + "grad_norm": 10.396635055541992, + "learning_rate": 4.255121685287974e-08, + "logits/chosen": -1.1538788080215454, + "logits/rejected": -1.103967308998108, + "logps/chosen": -329.13751220703125, + "logps/rejected": -360.1499938964844, + "loss": 0.0512, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.778430223464966, + "rewards/margins": 6.522070407867432, + "rewards/rejected": -9.301953315734863, + "step": 9880 + }, + { + "epoch": 2.493618630447799, + "grad_norm": 34.48313522338867, + "learning_rate": 4.214278481634362e-08, + "logits/chosen": -1.192022681236267, + "logits/rejected": -1.1108276844024658, + "logps/chosen": -311.75, + "logps/rejected": -371.0625, + "loss": 0.0402, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.337881565093994, + "rewards/margins": 6.7109375, + "rewards/rejected": -10.049219131469727, + "step": 9890 + }, + { + "epoch": 2.496139665332619, + "grad_norm": 12.752933502197266, + "learning_rate": 4.173614193417629e-08, + "logits/chosen": -1.0169861316680908, + "logits/rejected": -1.079620361328125, + "logps/chosen": -310.76251220703125, + "logps/rejected": -359.79998779296875, + "loss": 0.0604, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.3267884254455566, + "rewards/margins": 
6.771777153015137, + "rewards/rejected": -10.099609375, + "step": 9900 + }, + { + "epoch": 2.4986607002174392, + "grad_norm": 31.37298011779785, + "learning_rate": 4.133129170660227e-08, + "logits/chosen": -1.1909668445587158, + "logits/rejected": -1.039147973060608, + "logps/chosen": -320.0062561035156, + "logps/rejected": -372.8374938964844, + "loss": 0.0512, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.055462598800659, + "rewards/margins": 6.76171875, + "rewards/rejected": -9.815625190734863, + "step": 9910 + }, + { + "epoch": 2.5011817351022594, + "grad_norm": 11.590690612792969, + "learning_rate": 4.0928237618415294e-08, + "logits/chosen": -1.1640136241912842, + "logits/rejected": -1.0757172107696533, + "logps/chosen": -334.34375, + "logps/rejected": -356.0874938964844, + "loss": 0.0348, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.127246141433716, + "rewards/margins": 6.898241996765137, + "rewards/rejected": -10.026952743530273, + "step": 9920 + }, + { + "epoch": 2.50370276998708, + "grad_norm": 3.064460039138794, + "learning_rate": 4.052698313894892e-08, + "logits/chosen": -1.18548583984375, + "logits/rejected": -1.00244140625, + "logps/chosen": -319.32501220703125, + "logps/rejected": -365.73748779296875, + "loss": 0.0497, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.109729051589966, + "rewards/margins": 6.683789253234863, + "rewards/rejected": -9.796875, + "step": 9930 + }, + { + "epoch": 2.5062238048719, + "grad_norm": 14.619731903076172, + "learning_rate": 4.0127531722046195e-08, + "logits/chosen": -1.189910888671875, + "logits/rejected": -1.1230590343475342, + "logps/chosen": -323.98748779296875, + "logps/rejected": -377.5874938964844, + "loss": 0.0398, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.218334913253784, + "rewards/margins": 7.058789253234863, + "rewards/rejected": -10.279296875, + "step": 9940 + }, + { + "epoch": 2.5087448397567202, + "grad_norm": 
4.51336669921875, + "learning_rate": 3.972988680603001e-08, + "logits/chosen": -1.147375464439392, + "logits/rejected": -1.0868377685546875, + "logps/chosen": -316.14373779296875, + "logps/rejected": -358.3125, + "loss": 0.0606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.720410108566284, + "rewards/margins": 6.407812595367432, + "rewards/rejected": -9.127344131469727, + "step": 9950 + }, + { + "epoch": 2.5112658746415404, + "grad_norm": 43.95208740234375, + "learning_rate": 3.933405181367391e-08, + "logits/chosen": -1.144555687904358, + "logits/rejected": -1.0635986328125, + "logps/chosen": -302.54376220703125, + "logps/rejected": -388.0874938964844, + "loss": 0.0742, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.041552782058716, + "rewards/margins": 6.627148628234863, + "rewards/rejected": -9.667577743530273, + "step": 9960 + }, + { + "epoch": 2.5137869095263605, + "grad_norm": 12.412017822265625, + "learning_rate": 3.894003015217206e-08, + "logits/chosen": -1.17156982421875, + "logits/rejected": -1.103387475013733, + "logps/chosen": -322.4312438964844, + "logps/rejected": -362.2124938964844, + "loss": 0.0719, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.747363328933716, + "rewards/margins": 6.265625, + "rewards/rejected": -9.013671875, + "step": 9970 + }, + { + "epoch": 2.5163079444111807, + "grad_norm": 14.251262664794922, + "learning_rate": 3.854782521311018e-08, + "logits/chosen": -1.1359131336212158, + "logits/rejected": -1.132318139076233, + "logps/chosen": -329.45001220703125, + "logps/rejected": -360.3374938964844, + "loss": 0.0762, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.1038451194763184, + "rewards/margins": 6.655468940734863, + "rewards/rejected": -9.762499809265137, + "step": 9980 + }, + { + "epoch": 2.518828979296001, + "grad_norm": 33.08596420288086, + "learning_rate": 3.815744037243651e-08, + "logits/chosen": -1.1314818859100342, + "logits/rejected": 
-1.0926392078399658, + "logps/chosen": -309.73126220703125, + "logps/rejected": -361.7562561035156, + "loss": 0.0836, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.118969678878784, + "rewards/margins": 6.161523342132568, + "rewards/rejected": -9.279687881469727, + "step": 9990 + }, + { + "epoch": 2.5213500141808214, + "grad_norm": 15.152300834655762, + "learning_rate": 3.776887899043246e-08, + "logits/chosen": -1.1444488763809204, + "logits/rejected": -1.103326439857483, + "logps/chosen": -312.76873779296875, + "logps/rejected": -382.4125061035156, + "loss": 0.0702, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.1241211891174316, + "rewards/margins": 6.632031440734863, + "rewards/rejected": -9.7568359375, + "step": 10000 + }, + { + "epoch": 2.5238710490656415, + "grad_norm": 11.703275680541992, + "learning_rate": 3.7382144411683857e-08, + "logits/chosen": -1.2043030261993408, + "logits/rejected": -1.166845679283142, + "logps/chosen": -316.3687438964844, + "logps/rejected": -334.8125, + "loss": 0.0465, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.0115113258361816, + "rewards/margins": 6.575585842132568, + "rewards/rejected": -9.583984375, + "step": 10010 + }, + { + "epoch": 2.5263920839504617, + "grad_norm": 43.19799041748047, + "learning_rate": 3.699723996505205e-08, + "logits/chosen": -1.1665023565292358, + "logits/rejected": -1.110015869140625, + "logps/chosen": -317.2749938964844, + "logps/rejected": -356.70001220703125, + "loss": 0.0522, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.833251953125, + "rewards/margins": 6.406445503234863, + "rewards/rejected": -9.242968559265137, + "step": 10020 + }, + { + "epoch": 2.528913118835282, + "grad_norm": 16.9388484954834, + "learning_rate": 3.661416896364547e-08, + "logits/chosen": -1.1895630359649658, + "logits/rejected": -1.1419556140899658, + "logps/chosen": -313.3812561035156, + "logps/rejected": -374.9624938964844, + "loss": 0.0457, + 
"rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.9688477516174316, + "rewards/margins": 6.257421970367432, + "rewards/rejected": -9.228515625, + "step": 10030 + }, + { + "epoch": 2.531434153720102, + "grad_norm": 11.054365158081055, + "learning_rate": 3.623293470479075e-08, + "logits/chosen": -1.1443603038787842, + "logits/rejected": -1.1128661632537842, + "logps/chosen": -300.0375061035156, + "logps/rejected": -341.32501220703125, + "loss": 0.0469, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.071154832839966, + "rewards/margins": 6.469140529632568, + "rewards/rejected": -9.54296875, + "step": 10040 + }, + { + "epoch": 2.5339551886049225, + "grad_norm": 30.528623580932617, + "learning_rate": 3.58535404700048e-08, + "logits/chosen": -1.153588891029358, + "logits/rejected": -1.169403076171875, + "logps/chosen": -324.4750061035156, + "logps/rejected": -348.0, + "loss": 0.0497, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.210498094558716, + "rewards/margins": 6.308398246765137, + "rewards/rejected": -9.520312309265137, + "step": 10050 + }, + { + "epoch": 2.5364762234897427, + "grad_norm": 33.49970245361328, + "learning_rate": 3.5475989524966085e-08, + "logits/chosen": -1.219335913658142, + "logits/rejected": -1.163299560546875, + "logps/chosen": -319.98748779296875, + "logps/rejected": -367.3374938964844, + "loss": 0.0298, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.139880418777466, + "rewards/margins": 6.911523342132568, + "rewards/rejected": -10.049219131469727, + "step": 10060 + }, + { + "epoch": 2.538997258374563, + "grad_norm": 13.296183586120605, + "learning_rate": 3.5100285119486926e-08, + "logits/chosen": -1.166174292564392, + "logits/rejected": -0.998913586139679, + "logps/chosen": -313.54998779296875, + "logps/rejected": -370.48748779296875, + "loss": 0.0405, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.9495301246643066, + "rewards/margins": 6.721484184265137, + 
"rewards/rejected": -9.670702934265137, + "step": 10070 + }, + { + "epoch": 2.541518293259383, + "grad_norm": 81.87371063232422, + "learning_rate": 3.472643048748525e-08, + "logits/chosen": -1.2131226062774658, + "logits/rejected": -1.116235375404358, + "logps/chosen": -338.3687438964844, + "logps/rejected": -383.0375061035156, + "loss": 0.0785, + "rewards/accuracies": 0.9593750238418579, + "rewards/chosen": -2.916888475418091, + "rewards/margins": 6.690234184265137, + "rewards/rejected": -9.607812881469727, + "step": 10080 + }, + { + "epoch": 2.544039328144203, + "grad_norm": 16.84267807006836, + "learning_rate": 3.43544288469568e-08, + "logits/chosen": -1.158911108970642, + "logits/rejected": -1.0852539539337158, + "logps/chosen": -336.7437438964844, + "logps/rejected": -347.2875061035156, + "loss": 0.0559, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.92132568359375, + "rewards/margins": 6.407910346984863, + "rewards/rejected": -9.330273628234863, + "step": 10090 + }, + { + "epoch": 2.546560363029023, + "grad_norm": 36.27756881713867, + "learning_rate": 3.398428339994763e-08, + "logits/chosen": -1.18414306640625, + "logits/rejected": -1.1505248546600342, + "logps/chosen": -302.58123779296875, + "logps/rejected": -365.1625061035156, + "loss": 0.0585, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.7022337913513184, + "rewards/margins": 6.570508003234863, + "rewards/rejected": -9.26953125, + "step": 10100 + }, + { + "epoch": 2.5490813979138434, + "grad_norm": 30.258563995361328, + "learning_rate": 3.3615997332526345e-08, + "logits/chosen": -1.2242310047149658, + "logits/rejected": -1.186425805091858, + "logps/chosen": -343.4750061035156, + "logps/rejected": -400.86248779296875, + "loss": 0.0456, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.875537157058716, + "rewards/margins": 6.746874809265137, + "rewards/rejected": -9.627734184265137, + "step": 10110 + }, + { + "epoch": 2.551602432798664, + 
"grad_norm": 35.232215881347656, + "learning_rate": 3.32495738147566e-08, + "logits/chosen": -1.159143090248108, + "logits/rejected": -1.0797851085662842, + "logps/chosen": -338.61248779296875, + "logps/rejected": -373.90625, + "loss": 0.0366, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.01007080078125, + "rewards/margins": 6.5791015625, + "rewards/rejected": -9.591015815734863, + "step": 10120 + }, + { + "epoch": 2.554123467683484, + "grad_norm": 6.726482391357422, + "learning_rate": 3.288501600067017e-08, + "logits/chosen": -1.147790551185608, + "logits/rejected": -1.033911108970642, + "logps/chosen": -314.71875, + "logps/rejected": -351.0375061035156, + "loss": 0.0989, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.8470826148986816, + "rewards/margins": 6.75390625, + "rewards/rejected": -9.600781440734863, + "step": 10130 + }, + { + "epoch": 2.5566445025683042, + "grad_norm": 3.90889835357666, + "learning_rate": 3.2522327028239456e-08, + "logits/chosen": -1.206384301185608, + "logits/rejected": -1.1241180896759033, + "logps/chosen": -318.73126220703125, + "logps/rejected": -356.17498779296875, + "loss": 0.0238, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.2820801734924316, + "rewards/margins": 6.616601467132568, + "rewards/rejected": -9.895312309265137, + "step": 10140 + }, + { + "epoch": 2.5591655374531244, + "grad_norm": 11.42145824432373, + "learning_rate": 3.2161510019350524e-08, + "logits/chosen": -1.1995360851287842, + "logits/rejected": -1.1785399913787842, + "logps/chosen": -326.63751220703125, + "logps/rejected": -358.3500061035156, + "loss": 0.0321, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8296661376953125, + "rewards/margins": 6.55078125, + "rewards/rejected": -9.386327743530273, + "step": 10150 + }, + { + "epoch": 2.561686572337945, + "grad_norm": 20.189697265625, + "learning_rate": 3.180256807977638e-08, + "logits/chosen": -1.1577575206756592, + "logits/rejected": 
-1.0860474109649658, + "logps/chosen": -315.8500061035156, + "logps/rejected": -338.70001220703125, + "loss": 0.0495, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.1072998046875, + "rewards/margins": 6.552343845367432, + "rewards/rejected": -9.6611328125, + "step": 10160 + }, + { + "epoch": 2.564207607222765, + "grad_norm": 87.76728057861328, + "learning_rate": 3.144550429915027e-08, + "logits/chosen": -1.1464354991912842, + "logits/rejected": -1.103448510169983, + "logps/chosen": -329.38751220703125, + "logps/rejected": -378.9624938964844, + "loss": 0.0394, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.390063524246216, + "rewards/margins": 6.573437690734863, + "rewards/rejected": -9.958593368530273, + "step": 10170 + }, + { + "epoch": 2.5667286421075852, + "grad_norm": 31.60454559326172, + "learning_rate": 3.10903217509387e-08, + "logits/chosen": -1.179846167564392, + "logits/rejected": NaN, + "logps/chosen": -355.6312561035156, + "logps/rejected": -361.6625061035156, + "loss": 0.0496, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.893359422683716, + "rewards/margins": 6.592968940734863, + "rewards/rejected": -9.485937118530273, + "step": 10180 + }, + { + "epoch": 2.5692496769924054, + "grad_norm": 26.464574813842773, + "learning_rate": 3.0737023492415606e-08, + "logits/chosen": -1.1988189220428467, + "logits/rejected": NaN, + "logps/chosen": -325.1812438964844, + "logps/rejected": -365.4375, + "loss": 0.0378, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.072436571121216, + "rewards/margins": 6.5830078125, + "rewards/rejected": -9.656641006469727, + "step": 10190 + }, + { + "epoch": 2.5717707118772255, + "grad_norm": 38.3802490234375, + "learning_rate": 3.0385612564635346e-08, + "logits/chosen": -1.1513671875, + "logits/rejected": -1.091589331626892, + "logps/chosen": -334.5, + "logps/rejected": -370.57501220703125, + "loss": 0.0525, + "rewards/accuracies": 0.9781249761581421, + 
"rewards/chosen": -3.2652831077575684, + "rewards/margins": 7.131445407867432, + "rewards/rejected": -10.394140243530273, + "step": 10200 + }, + { + "epoch": 2.5742917467620456, + "grad_norm": 51.423492431640625, + "learning_rate": 3.003609199240711e-08, + "logits/chosen": NaN, + "logits/rejected": -1.1724364757537842, + "logps/chosen": -316.73126220703125, + "logps/rejected": -353.6499938964844, + "loss": 0.0499, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.3993163108825684, + "rewards/margins": 6.147851467132568, + "rewards/rejected": -9.5439453125, + "step": 10210 + }, + { + "epoch": 2.576812781646866, + "grad_norm": 18.73659896850586, + "learning_rate": 2.9688464784268563e-08, + "logits/chosen": -1.090063452720642, + "logits/rejected": -1.012841820716858, + "logps/chosen": -332.8687438964844, + "logps/rejected": -375.13751220703125, + "loss": 0.0529, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.112597703933716, + "rewards/margins": 6.841015815734863, + "rewards/rejected": -9.948827743530273, + "step": 10220 + }, + { + "epoch": 2.5793338165316864, + "grad_norm": 13.378907203674316, + "learning_rate": 2.9342733932459923e-08, + "logits/chosen": -1.169531226158142, + "logits/rejected": -1.110009789466858, + "logps/chosen": -306.5249938964844, + "logps/rejected": -360.63751220703125, + "loss": 0.0491, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.4999022483825684, + "rewards/margins": 6.477148532867432, + "rewards/rejected": -9.979296684265137, + "step": 10230 + }, + { + "epoch": 2.5818548514165065, + "grad_norm": 26.04082489013672, + "learning_rate": 2.8998902412898514e-08, + "logits/chosen": -1.1234649419784546, + "logits/rejected": -1.0699462890625, + "logps/chosen": -310.61248779296875, + "logps/rejected": -369.5, + "loss": 0.0407, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.35650634765625, + "rewards/margins": 6.9501953125, + "rewards/rejected": -10.307031631469727, + "step": 10240 + }, 
+ { + "epoch": 2.5843758863013266, + "grad_norm": 12.310823440551758, + "learning_rate": 2.8656973185152754e-08, + "logits/chosen": -1.1652100086212158, + "logits/rejected": -1.130334496498108, + "logps/chosen": -315.9312438964844, + "logps/rejected": -350.1875, + "loss": 0.0565, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.19012451171875, + "rewards/margins": 6.421679496765137, + "rewards/rejected": -9.615234375, + "step": 10250 + }, + { + "epoch": 2.586896921186147, + "grad_norm": 19.390209197998047, + "learning_rate": 2.831694919241695e-08, + "logits/chosen": -1.107080101966858, + "logits/rejected": -1.073034644126892, + "logps/chosen": -318.32501220703125, + "logps/rejected": -353.79998779296875, + "loss": 0.0818, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.2092316150665283, + "rewards/margins": 6.260351657867432, + "rewards/rejected": -9.469531059265137, + "step": 10260 + }, + { + "epoch": 2.5894179560709674, + "grad_norm": 63.577392578125, + "learning_rate": 2.7978833361485933e-08, + "logits/chosen": -1.2262694835662842, + "logits/rejected": -1.148596167564392, + "logps/chosen": -320.5375061035156, + "logps/rejected": -353.3500061035156, + "loss": 0.0444, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.0331053733825684, + "rewards/margins": 6.652148246765137, + "rewards/rejected": -9.686718940734863, + "step": 10270 + }, + { + "epoch": 2.5919389909557875, + "grad_norm": 73.93006134033203, + "learning_rate": 2.7642628602729758e-08, + "logits/chosen": -1.1412231922149658, + "logits/rejected": -1.0458862781524658, + "logps/chosen": -324.2875061035156, + "logps/rejected": -384.1875, + "loss": 0.0736, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.447253465652466, + "rewards/margins": 6.890038967132568, + "rewards/rejected": -10.337890625, + "step": 10280 + }, + { + "epoch": 2.5944600258406076, + "grad_norm": 46.664955139160156, + "learning_rate": 2.7308337810068665e-08, + 
"logits/chosen": -1.2644164562225342, + "logits/rejected": -1.1243407726287842, + "logps/chosen": -338.2124938964844, + "logps/rejected": -364.8374938964844, + "loss": 0.077, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.272927761077881, + "rewards/margins": 6.66015625, + "rewards/rejected": -9.934374809265137, + "step": 10290 + }, + { + "epoch": 2.596981060725428, + "grad_norm": 3.487577438354492, + "learning_rate": 2.6975963860948247e-08, + "logits/chosen": -1.143798828125, + "logits/rejected": -1.0145995616912842, + "logps/chosen": -346.5062561035156, + "logps/rejected": -377.5249938964844, + "loss": 0.083, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.0484619140625, + "rewards/margins": 6.5068359375, + "rewards/rejected": -9.555078506469727, + "step": 10300 + }, + { + "epoch": 2.599502095610248, + "grad_norm": 23.394622802734375, + "learning_rate": 2.664550961631476e-08, + "logits/chosen": -1.1463501453399658, + "logits/rejected": -1.0346343517303467, + "logps/chosen": -313.95001220703125, + "logps/rejected": -355.1625061035156, + "loss": 0.0514, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.3636717796325684, + "rewards/margins": 6.446484565734863, + "rewards/rejected": -9.816015243530273, + "step": 10310 + }, + { + "epoch": 2.602023130495068, + "grad_norm": 12.553984642028809, + "learning_rate": 2.6316977920590234e-08, + "logits/chosen": -1.226318359375, + "logits/rejected": -1.187158226966858, + "logps/chosen": -336.13751220703125, + "logps/rejected": -367.26251220703125, + "loss": 0.0495, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.112927198410034, + "rewards/margins": 7.0615234375, + "rewards/rejected": -10.178515434265137, + "step": 10320 + }, + { + "epoch": 2.604544165379888, + "grad_norm": 76.98346710205078, + "learning_rate": 2.599037160164827e-08, + "logits/chosen": -1.0966796875, + "logits/rejected": -1.0803344249725342, + "logps/chosen": -320.20623779296875, + 
"logps/rejected": -362.2875061035156, + "loss": 0.0383, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.385519504547119, + "rewards/margins": 6.6201171875, + "rewards/rejected": -10.006640434265137, + "step": 10330 + }, + { + "epoch": 2.607065200264709, + "grad_norm": 28.021957397460938, + "learning_rate": 2.5665693470789423e-08, + "logits/chosen": -1.196313500404358, + "logits/rejected": -1.0872313976287842, + "logps/chosen": -307.57501220703125, + "logps/rejected": -356.8062438964844, + "loss": 0.053, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.856762647628784, + "rewards/margins": 6.643164157867432, + "rewards/rejected": -9.501172065734863, + "step": 10340 + }, + { + "epoch": 2.609586235149529, + "grad_norm": 29.01761245727539, + "learning_rate": 2.534294632271733e-08, + "logits/chosen": -1.1695435047149658, + "logits/rejected": -1.0758788585662842, + "logps/chosen": -337.9624938964844, + "logps/rejected": -361.7749938964844, + "loss": 0.0463, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.8138794898986816, + "rewards/margins": 6.504492282867432, + "rewards/rejected": -9.318359375, + "step": 10350 + }, + { + "epoch": 2.612107270034349, + "grad_norm": 27.207761764526367, + "learning_rate": 2.5022132935514333e-08, + "logits/chosen": -1.10498046875, + "logits/rejected": -1.078881859779358, + "logps/chosen": -298.73126220703125, + "logps/rejected": -349.2124938964844, + "loss": 0.05, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0404067039489746, + "rewards/margins": 6.368554592132568, + "rewards/rejected": -9.411230087280273, + "step": 10360 + }, + { + "epoch": 2.614628304919169, + "grad_norm": 13.011737823486328, + "learning_rate": 2.470325607061774e-08, + "logits/chosen": -1.168432593345642, + "logits/rejected": -1.0833740234375, + "logps/chosen": -323.9375, + "logps/rejected": -376.0375061035156, + "loss": 0.0649, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.160961866378784, + 
"rewards/margins": 6.526171684265137, + "rewards/rejected": -9.688672065734863, + "step": 10370 + }, + { + "epoch": 2.61714933980399, + "grad_norm": 18.515071868896484, + "learning_rate": 2.4386318472796125e-08, + "logits/chosen": -1.166589379310608, + "logits/rejected": -1.118554711341858, + "logps/chosen": -326.82501220703125, + "logps/rejected": -360.48126220703125, + "loss": 0.0548, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.373303174972534, + "rewards/margins": 6.196093559265137, + "rewards/rejected": -9.569531440734863, + "step": 10380 + }, + { + "epoch": 2.61967037468881, + "grad_norm": 12.66776180267334, + "learning_rate": 2.4071322870125475e-08, + "logits/chosen": -1.167730689048767, + "logits/rejected": -1.111474633216858, + "logps/chosen": -313.63751220703125, + "logps/rejected": -364.73748779296875, + "loss": 0.0247, + "rewards/accuracies": 0.996874988079071, + "rewards/chosen": -3.2600951194763184, + "rewards/margins": 6.756249904632568, + "rewards/rejected": -10.015233993530273, + "step": 10390 + }, + { + "epoch": 2.62219140957363, + "grad_norm": 8.786449432373047, + "learning_rate": 2.3758271973965848e-08, + "logits/chosen": -1.1691405773162842, + "logits/rejected": -1.035882592201233, + "logps/chosen": -325.86248779296875, + "logps/rejected": -366.5874938964844, + "loss": 0.0512, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.1101410388946533, + "rewards/margins": 6.529296875, + "rewards/rejected": -9.640625, + "step": 10400 + }, + { + "epoch": 2.62471244445845, + "grad_norm": 33.089996337890625, + "learning_rate": 2.344716847893813e-08, + "logits/chosen": -1.0660278797149658, + "logits/rejected": -1.1205322742462158, + "logps/chosen": -307.7875061035156, + "logps/rejected": -370.5687561035156, + "loss": 0.0577, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.206298828125, + "rewards/margins": 6.607812404632568, + "rewards/rejected": -9.816015243530273, + "step": 10410 + }, + { + "epoch": 
2.6272334793432703, + "grad_norm": 15.56445598602295, + "learning_rate": 2.313801506290064e-08, + "logits/chosen": -1.1512877941131592, + "logits/rejected": -1.077978491783142, + "logps/chosen": -324.57501220703125, + "logps/rejected": -328.9624938964844, + "loss": 0.0665, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.110943555831909, + "rewards/margins": 6.384863376617432, + "rewards/rejected": -9.496874809265137, + "step": 10420 + }, + { + "epoch": 2.6297545142280905, + "grad_norm": 42.54911804199219, + "learning_rate": 2.283081438692619e-08, + "logits/chosen": -1.1371886730194092, + "logits/rejected": -1.137750267982483, + "logps/chosen": -307.8500061035156, + "logps/rejected": -346.57501220703125, + "loss": 0.0498, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.9408812522888184, + "rewards/margins": 6.560742378234863, + "rewards/rejected": -9.499804496765137, + "step": 10430 + }, + { + "epoch": 2.6322755491129106, + "grad_norm": 18.617380142211914, + "learning_rate": 2.252556909527911e-08, + "logits/chosen": -1.1597900390625, + "logits/rejected": -1.0846679210662842, + "logps/chosen": -323.3062438964844, + "logps/rejected": -361.57501220703125, + "loss": 0.0672, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.796948194503784, + "rewards/margins": 6.628125190734863, + "rewards/rejected": -9.42578125, + "step": 10440 + }, + { + "epoch": 2.634796583997731, + "grad_norm": 50.39127731323242, + "learning_rate": 2.222228181539268e-08, + "logits/chosen": -1.1407592296600342, + "logits/rejected": -1.1284911632537842, + "logps/chosen": -326.5562438964844, + "logps/rejected": -362.8374938964844, + "loss": 0.038, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.072589159011841, + "rewards/margins": 6.466796875, + "rewards/rejected": -9.537890434265137, + "step": 10450 + }, + { + "epoch": 2.6373176188825513, + "grad_norm": 5.374075412750244, + "learning_rate": 2.1920955157846228e-08, + 
"logits/chosen": -1.1473388671875, + "logits/rejected": -1.0724060535430908, + "logps/chosen": -325.5625, + "logps/rejected": -357.25, + "loss": 0.0349, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.348498582839966, + "rewards/margins": 6.5615234375, + "rewards/rejected": -9.907031059265137, + "step": 10460 + }, + { + "epoch": 2.6398386537673715, + "grad_norm": 21.453832626342773, + "learning_rate": 2.1621591716342926e-08, + "logits/chosen": -1.152258276939392, + "logits/rejected": -1.100378394126892, + "logps/chosen": -319.2562561035156, + "logps/rejected": -351.2124938964844, + "loss": 0.0503, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1752991676330566, + "rewards/margins": 6.2255859375, + "rewards/rejected": -9.402539253234863, + "step": 10470 + }, + { + "epoch": 2.6423596886521916, + "grad_norm": 37.59913635253906, + "learning_rate": 2.1324194067687235e-08, + "logits/chosen": -1.152099609375, + "logits/rejected": -1.0808899402618408, + "logps/chosen": -323.22186279296875, + "logps/rejected": -369.36248779296875, + "loss": 0.0493, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.341320753097534, + "rewards/margins": 6.634375095367432, + "rewards/rejected": -9.969335556030273, + "step": 10480 + }, + { + "epoch": 2.644880723537012, + "grad_norm": 19.285566329956055, + "learning_rate": 2.1028764771762906e-08, + "logits/chosen": -1.062231421470642, + "logits/rejected": NaN, + "logps/chosen": -316.8500061035156, + "logps/rejected": -361.6875, + "loss": 0.0597, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.268933057785034, + "rewards/margins": 6.443359375, + "rewards/rejected": -9.713671684265137, + "step": 10490 + }, + { + "epoch": 2.6474017584218323, + "grad_norm": 18.747461318969727, + "learning_rate": 2.073530637151086e-08, + "logits/chosen": -1.1163330078125, + "logits/rejected": -1.0819275379180908, + "logps/chosen": -316.48126220703125, + "logps/rejected": -341.76251220703125, + 
"loss": 0.0613, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.099609375, + "rewards/margins": 6.423047065734863, + "rewards/rejected": -9.522656440734863, + "step": 10500 + }, + { + "epoch": 2.6499227933066525, + "grad_norm": 25.62896728515625, + "learning_rate": 2.0443821392907208e-08, + "logits/chosen": -1.1439392566680908, + "logits/rejected": -1.0073730945587158, + "logps/chosen": -298.2875061035156, + "logps/rejected": -363.98126220703125, + "loss": 0.0514, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.3671202659606934, + "rewards/margins": 6.742578029632568, + "rewards/rejected": -10.109766006469727, + "step": 10510 + }, + { + "epoch": 2.6524438281914726, + "grad_norm": 20.20026206970215, + "learning_rate": 2.0154312344941833e-08, + "logits/chosen": -1.228845238685608, + "logits/rejected": -1.0604064464569092, + "logps/chosen": -314.32501220703125, + "logps/rejected": -335.8500061035156, + "loss": 0.0542, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.132769823074341, + "rewards/margins": 6.710546970367432, + "rewards/rejected": -9.845703125, + "step": 10520 + }, + { + "epoch": 2.6549648630762928, + "grad_norm": 9.448894500732422, + "learning_rate": 1.9866781719596355e-08, + "logits/chosen": -1.099523901939392, + "logits/rejected": -1.088903784751892, + "logps/chosen": -312.1875, + "logps/rejected": -362.42498779296875, + "loss": 0.0396, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0770263671875, + "rewards/margins": 6.994921684265137, + "rewards/rejected": -10.071484565734863, + "step": 10530 + }, + { + "epoch": 2.657485897961113, + "grad_norm": 32.552032470703125, + "learning_rate": 1.9581231991823045e-08, + "logits/chosen": -1.0786864757537842, + "logits/rejected": -1.0769164562225342, + "logps/chosen": -311.8812561035156, + "logps/rejected": -366.1812438964844, + "loss": 0.0687, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.353442430496216, + "rewards/margins": 
6.366601467132568, + "rewards/rejected": -9.720312118530273, + "step": 10540 + }, + { + "epoch": 2.660006932845933, + "grad_norm": 14.566728591918945, + "learning_rate": 1.92976656195232e-08, + "logits/chosen": -1.184838891029358, + "logits/rejected": -1.0648651123046875, + "logps/chosen": -336.51873779296875, + "logps/rejected": -373.4125061035156, + "loss": 0.0634, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.2521910667419434, + "rewards/margins": 6.7919921875, + "rewards/rejected": -10.043359756469727, + "step": 10550 + }, + { + "epoch": 2.6625279677307536, + "grad_norm": 39.01841735839844, + "learning_rate": 1.9016085043526446e-08, + "logits/chosen": -1.210479736328125, + "logits/rejected": -1.082800269126892, + "logps/chosen": -334.5, + "logps/rejected": -371.7250061035156, + "loss": 0.0468, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.4530272483825684, + "rewards/margins": 6.431250095367432, + "rewards/rejected": -9.881250381469727, + "step": 10560 + }, + { + "epoch": 2.6650490026155738, + "grad_norm": 8.452898025512695, + "learning_rate": 1.8736492687569163e-08, + "logits/chosen": -1.1693603992462158, + "logits/rejected": -1.219018578529358, + "logps/chosen": -338.8374938964844, + "logps/rejected": -378.4125061035156, + "loss": 0.0582, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.443838596343994, + "rewards/margins": 6.309765815734863, + "rewards/rejected": -9.75390625, + "step": 10570 + }, + { + "epoch": 2.667570037500394, + "grad_norm": 38.938663482666016, + "learning_rate": 1.8458890958273994e-08, + "logits/chosen": -1.1510009765625, + "logits/rejected": -1.090490698814392, + "logps/chosen": -316.3374938964844, + "logps/rejected": -365.26251220703125, + "loss": 0.04, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.244241237640381, + "rewards/margins": 6.348046779632568, + "rewards/rejected": -9.593358993530273, + "step": 10580 + }, + { + "epoch": 2.670091072385214, + "grad_norm": 
15.141484260559082, + "learning_rate": 1.818328224512916e-08, + "logits/chosen": -1.1576049327850342, + "logits/rejected": -1.1728026866912842, + "logps/chosen": -320.125, + "logps/rejected": -353.07501220703125, + "loss": 0.0657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.539276123046875, + "rewards/margins": 6.409375190734863, + "rewards/rejected": -9.950390815734863, + "step": 10590 + }, + { + "epoch": 2.6726121072700346, + "grad_norm": 16.160194396972656, + "learning_rate": 1.790966892046758e-08, + "logits/chosen": -1.137274146080017, + "logits/rejected": -0.9729369878768921, + "logps/chosen": -313.70001220703125, + "logps/rejected": -372.79998779296875, + "loss": 0.0601, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.016976833343506, + "rewards/margins": 6.833788871765137, + "rewards/rejected": -9.848437309265137, + "step": 10600 + }, + { + "epoch": 2.6751331421548548, + "grad_norm": 40.86832046508789, + "learning_rate": 1.7638053339446818e-08, + "logits/chosen": -1.179101586341858, + "logits/rejected": -1.145532250404358, + "logps/chosen": -313.7250061035156, + "logps/rejected": -358.32501220703125, + "loss": 0.0633, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.262402296066284, + "rewards/margins": 6.678515434265137, + "rewards/rejected": -9.939844131469727, + "step": 10610 + }, + { + "epoch": 2.677654177039675, + "grad_norm": 23.61850357055664, + "learning_rate": 1.736843784002848e-08, + "logits/chosen": -1.1197693347930908, + "logits/rejected": -1.0531005859375, + "logps/chosen": -300.6812438964844, + "logps/rejected": -356.9375, + "loss": 0.0474, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.128833055496216, + "rewards/margins": 6.652905464172363, + "rewards/rejected": -9.782422065734863, + "step": 10620 + }, + { + "epoch": 2.680175211924495, + "grad_norm": 9.803144454956055, + "learning_rate": 1.7100824742958375e-08, + "logits/chosen": -1.1581542491912842, + 
"logits/rejected": -1.02667236328125, + "logps/chosen": -330.1499938964844, + "logps/rejected": -390.7250061035156, + "loss": 0.0409, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.4138426780700684, + "rewards/margins": 6.759765625, + "rewards/rejected": -10.170312881469727, + "step": 10630 + }, + { + "epoch": 2.682696246809315, + "grad_norm": 4.958293914794922, + "learning_rate": 1.683521635174631e-08, + "logits/chosen": -1.120001196861267, + "logits/rejected": -0.98406982421875, + "logps/chosen": -306.0625, + "logps/rejected": -363.2562561035156, + "loss": 0.0802, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.177197217941284, + "rewards/margins": 6.23828125, + "rewards/rejected": -9.4130859375, + "step": 10640 + }, + { + "epoch": 2.6852172816941353, + "grad_norm": 27.18450164794922, + "learning_rate": 1.657161495264639e-08, + "logits/chosen": -1.151159644126892, + "logits/rejected": -1.1621825695037842, + "logps/chosen": -300.67498779296875, + "logps/rejected": -341.86248779296875, + "loss": 0.0531, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.03460693359375, + "rewards/margins": 6.3662109375, + "rewards/rejected": -9.396484375, + "step": 10650 + }, + { + "epoch": 2.6877383165789555, + "grad_norm": 9.695204734802246, + "learning_rate": 1.6310022814637364e-08, + "logits/chosen": -1.13702392578125, + "logits/rejected": -1.073846459388733, + "logps/chosen": -296.03125, + "logps/rejected": -350.1875, + "loss": 0.0464, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.7076613903045654, + "rewards/margins": 6.625586032867432, + "rewards/rejected": -9.333593368530273, + "step": 10660 + }, + { + "epoch": 2.690259351463776, + "grad_norm": 6.534129619598389, + "learning_rate": 1.605044218940299e-08, + "logits/chosen": -1.184533715248108, + "logits/rejected": -1.124597191810608, + "logps/chosen": -311.46875, + "logps/rejected": -366.7124938964844, + "loss": 0.0319, + "rewards/accuracies": 
0.9937499761581421, + "rewards/chosen": -2.9455809593200684, + "rewards/margins": 6.4033203125, + "rewards/rejected": -9.347265243530273, + "step": 10670 + }, + { + "epoch": 2.692780386348596, + "grad_norm": 8.636625289916992, + "learning_rate": 1.579287531131268e-08, + "logits/chosen": -1.233251929283142, + "logits/rejected": -1.165441870689392, + "logps/chosen": -310.3500061035156, + "logps/rejected": -381.42498779296875, + "loss": 0.0326, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.940380811691284, + "rewards/margins": 6.638867378234863, + "rewards/rejected": -9.581640243530273, + "step": 10680 + }, + { + "epoch": 2.6953014212334163, + "grad_norm": 15.20366382598877, + "learning_rate": 1.553732439740227e-08, + "logits/chosen": -1.180090308189392, + "logits/rejected": -1.1100342273712158, + "logps/chosen": -318.73126220703125, + "logps/rejected": -352.4624938964844, + "loss": 0.0492, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.214587450027466, + "rewards/margins": 6.668554782867432, + "rewards/rejected": -9.882031440734863, + "step": 10690 + }, + { + "epoch": 2.6978224561182365, + "grad_norm": 12.776803016662598, + "learning_rate": 1.5283791647355133e-08, + "logits/chosen": -1.152380347251892, + "logits/rejected": -1.084497094154358, + "logps/chosen": -318.95001220703125, + "logps/rejected": -363.7250061035156, + "loss": 0.0544, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.245312452316284, + "rewards/margins": 6.268750190734863, + "rewards/rejected": -9.516016006469727, + "step": 10700 + }, + { + "epoch": 2.7003434910030566, + "grad_norm": 11.677002906799316, + "learning_rate": 1.503227924348288e-08, + "logits/chosen": -1.174353003501892, + "logits/rejected": -1.10626220703125, + "logps/chosen": -320.8500061035156, + "logps/rejected": -355.20001220703125, + "loss": 0.0653, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.3708252906799316, + "rewards/margins": 6.367968559265137, + "rewards/rejected": 
-9.73828125, + "step": 10710 + }, + { + "epoch": 2.702864525887877, + "grad_norm": 11.679018020629883, + "learning_rate": 1.4782789350706759e-08, + "logits/chosen": -1.1708984375, + "logits/rejected": -0.9654906988143921, + "logps/chosen": -310.29376220703125, + "logps/rejected": -356.9125061035156, + "loss": 0.0342, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.1154541969299316, + "rewards/margins": 6.944531440734863, + "rewards/rejected": -10.057031631469727, + "step": 10720 + }, + { + "epoch": 2.7053855607726973, + "grad_norm": 8.325181007385254, + "learning_rate": 1.4535324116539238e-08, + "logits/chosen": -1.105200171470642, + "logits/rejected": -1.017724633216858, + "logps/chosen": -283.5375061035156, + "logps/rejected": -345.45001220703125, + "loss": 0.078, + "rewards/accuracies": 0.9593750238418579, + "rewards/chosen": -3.199658155441284, + "rewards/margins": 6.387499809265137, + "rewards/rejected": -9.585156440734863, + "step": 10730 + }, + { + "epoch": 2.7079065956575175, + "grad_norm": 15.453259468078613, + "learning_rate": 1.4289885671065011e-08, + "logits/chosen": -1.168847680091858, + "logits/rejected": -0.9189468622207642, + "logps/chosen": -311.79376220703125, + "logps/rejected": -339.51251220703125, + "loss": 0.0363, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.95849609375, + "rewards/margins": 6.334765434265137, + "rewards/rejected": -9.292577743530273, + "step": 10740 + }, + { + "epoch": 2.7104276305423376, + "grad_norm": 4.290037631988525, + "learning_rate": 1.404647612692328e-08, + "logits/chosen": -1.0586121082305908, + "logits/rejected": -1.096710205078125, + "logps/chosen": -323.1875, + "logps/rejected": -374.38751220703125, + "loss": 0.0351, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.5386962890625, + "rewards/margins": 6.713281154632568, + "rewards/rejected": -10.253710746765137, + "step": 10750 + }, + { + "epoch": 2.7129486654271577, + "grad_norm": 47.159427642822266, + 
"learning_rate": 1.3805097579288938e-08, + "logits/chosen": -1.122076392173767, + "logits/rejected": -1.0566527843475342, + "logps/chosen": -327.5562438964844, + "logps/rejected": -371.1000061035156, + "loss": 0.0699, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2559571266174316, + "rewards/margins": 6.468554496765137, + "rewards/rejected": -9.725000381469727, + "step": 10760 + }, + { + "epoch": 2.715469700311978, + "grad_norm": 3.60469913482666, + "learning_rate": 1.3565752105855088e-08, + "logits/chosen": -1.1114838123321533, + "logits/rejected": -1.087731957435608, + "logps/chosen": -299.10626220703125, + "logps/rejected": -345.3062438964844, + "loss": 0.0464, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2244811058044434, + "rewards/margins": 6.491796970367432, + "rewards/rejected": -9.717187881469727, + "step": 10770 + }, + { + "epoch": 2.717990735196798, + "grad_norm": 21.518774032592773, + "learning_rate": 1.332844176681483e-08, + "logits/chosen": -1.07366943359375, + "logits/rejected": -1.0236937999725342, + "logps/chosen": -293.8999938964844, + "logps/rejected": -327.8374938964844, + "loss": 0.0511, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.255139112472534, + "rewards/margins": 6.462500095367432, + "rewards/rejected": -9.717968940734863, + "step": 10780 + }, + { + "epoch": 2.7205117700816186, + "grad_norm": 32.8369255065918, + "learning_rate": 1.3093168604843524e-08, + "logits/chosen": -1.0945556163787842, + "logits/rejected": -1.083715796470642, + "logps/chosen": -321.21875, + "logps/rejected": -366.17498779296875, + "loss": 0.0536, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.238452196121216, + "rewards/margins": 6.838086128234863, + "rewards/rejected": -10.075390815734863, + "step": 10790 + }, + { + "epoch": 2.7230328049664387, + "grad_norm": 50.40583038330078, + "learning_rate": 1.2859934645081477e-08, + "logits/chosen": -1.1863524913787842, + "logits/rejected": 
-1.061547875404358, + "logps/chosen": -329.57501220703125, + "logps/rejected": -377.45001220703125, + "loss": 0.0345, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.126269578933716, + "rewards/margins": 6.931836128234863, + "rewards/rejected": -10.059374809265137, + "step": 10800 + }, + { + "epoch": 2.725553839851259, + "grad_norm": 7.012859344482422, + "learning_rate": 1.2628741895116174e-08, + "logits/chosen": -1.117730736732483, + "logits/rejected": -1.086175560951233, + "logps/chosen": -307.26873779296875, + "logps/rejected": -356.2250061035156, + "loss": 0.0732, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.193988084793091, + "rewards/margins": 6.300976753234863, + "rewards/rejected": -9.491406440734863, + "step": 10810 + }, + { + "epoch": 2.728074874736079, + "grad_norm": 8.012761116027832, + "learning_rate": 1.2399592344965293e-08, + "logits/chosen": -1.132226586341858, + "logits/rejected": -0.9681762456893921, + "logps/chosen": -318.4125061035156, + "logps/rejected": -343.4125061035156, + "loss": 0.0429, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.9146485328674316, + "rewards/margins": 6.807226657867432, + "rewards/rejected": -9.723828315734863, + "step": 10820 + }, + { + "epoch": 2.7305959096208996, + "grad_norm": 26.902252197265625, + "learning_rate": 1.2172487967059276e-08, + "logits/chosen": -1.152307152748108, + "logits/rejected": -1.112207055091858, + "logps/chosen": -313.76873779296875, + "logps/rejected": -361.76251220703125, + "loss": 0.0757, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -2.9421753883361816, + "rewards/margins": 6.5927734375, + "rewards/rejected": -9.533984184265137, + "step": 10830 + }, + { + "epoch": 2.7331169445057197, + "grad_norm": 7.114460468292236, + "learning_rate": 1.1947430716224727e-08, + "logits/chosen": -1.0782287120819092, + "logits/rejected": -1.0315673351287842, + "logps/chosen": -307.73748779296875, + "logps/rejected": -346.0874938964844, + "loss": 0.0502, 
+ "rewards/accuracies": 0.984375, + "rewards/chosen": -3.268994092941284, + "rewards/margins": 6.445898532867432, + "rewards/rejected": -9.717187881469727, + "step": 10840 + }, + { + "epoch": 2.73563797939054, + "grad_norm": 20.74818229675293, + "learning_rate": 1.1724422529667182e-08, + "logits/chosen": -1.0840942859649658, + "logits/rejected": -1.007910132408142, + "logps/chosen": -293.20623779296875, + "logps/rejected": -352.23748779296875, + "loss": 0.0532, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.3934082984924316, + "rewards/margins": 6.584374904632568, + "rewards/rejected": -9.978124618530273, + "step": 10850 + }, + { + "epoch": 2.73815901427536, + "grad_norm": 40.46851348876953, + "learning_rate": 1.1503465326954703e-08, + "logits/chosen": -1.159704566001892, + "logits/rejected": -0.997851550579071, + "logps/chosen": -313.76873779296875, + "logps/rejected": -348.9750061035156, + "loss": 0.0844, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.4705443382263184, + "rewards/margins": 6.450976371765137, + "rewards/rejected": -9.922656059265137, + "step": 10860 + }, + { + "epoch": 2.74068004916018, + "grad_norm": 5.118013858795166, + "learning_rate": 1.1284561010001304e-08, + "logits/chosen": -1.2329833507537842, + "logits/rejected": -1.178552269935608, + "logps/chosen": -343.4375, + "logps/rejected": -379.29998779296875, + "loss": 0.0801, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.0774383544921875, + "rewards/margins": 6.67529296875, + "rewards/rejected": -9.753125190734863, + "step": 10870 + }, + { + "epoch": 2.7432010840450003, + "grad_norm": 13.006446838378906, + "learning_rate": 1.1067711463050495e-08, + "logits/chosen": -1.137304663658142, + "logits/rejected": -1.071325659751892, + "logps/chosen": -321.7875061035156, + "logps/rejected": -377.36248779296875, + "loss": 0.0602, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.237518310546875, + "rewards/margins": 
6.621679782867432, + "rewards/rejected": -9.864062309265137, + "step": 10880 + }, + { + "epoch": 2.7457221189298204, + "grad_norm": 52.72837448120117, + "learning_rate": 1.0852918552659185e-08, + "logits/chosen": -1.148718237876892, + "logits/rejected": -1.0368530750274658, + "logps/chosen": -291.86248779296875, + "logps/rejected": -337.51251220703125, + "loss": 0.0696, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.13336181640625, + "rewards/margins": 6.482226371765137, + "rewards/rejected": -9.615625381469727, + "step": 10890 + }, + { + "epoch": 2.748243153814641, + "grad_norm": 18.06778907775879, + "learning_rate": 1.0640184127681472e-08, + "logits/chosen": -1.176794409751892, + "logits/rejected": -1.083398461341858, + "logps/chosen": -315.45001220703125, + "logps/rejected": -359.98748779296875, + "loss": 0.0426, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.621978759765625, + "rewards/margins": 6.8486328125, + "rewards/rejected": -9.466992378234863, + "step": 10900 + }, + { + "epoch": 2.750764188699461, + "grad_norm": 40.34992980957031, + "learning_rate": 1.0429510019252936e-08, + "logits/chosen": -1.1633422374725342, + "logits/rejected": -1.1007201671600342, + "logps/chosen": -301.2562561035156, + "logps/rejected": -345.3374938964844, + "loss": 0.0608, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.822552442550659, + "rewards/margins": 6.488671779632568, + "rewards/rejected": -9.312108993530273, + "step": 10910 + }, + { + "epoch": 2.7532852235842813, + "grad_norm": 5.839588642120361, + "learning_rate": 1.0220898040774611e-08, + "logits/chosen": -1.0826934576034546, + "logits/rejected": -1.069067358970642, + "logps/chosen": -341.7250061035156, + "logps/rejected": -378.5375061035156, + "loss": 0.0224, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.054443359375, + "rewards/margins": 7.171288967132568, + "rewards/rejected": -10.229296684265137, + "step": 10920 + }, + { + "epoch": 
2.7558062584691014, + "grad_norm": 22.24617576599121, + "learning_rate": 1.0014349987897575e-08, + "logits/chosen": -1.0678832530975342, + "logits/rejected": -1.052160620689392, + "logps/chosen": -307.20623779296875, + "logps/rejected": -376.76251220703125, + "loss": 0.0593, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.235089063644409, + "rewards/margins": 6.379296779632568, + "rewards/rejected": -9.615234375, + "step": 10930 + }, + { + "epoch": 2.758327293353922, + "grad_norm": 4.8370442390441895, + "learning_rate": 9.809867638507468e-09, + "logits/chosen": -1.155603051185608, + "logits/rejected": -1.070288062095642, + "logps/chosen": -294.7124938964844, + "logps/rejected": -337.70001220703125, + "loss": 0.0381, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.126147508621216, + "rewards/margins": 6.896093845367432, + "rewards/rejected": -10.023046493530273, + "step": 10940 + }, + { + "epoch": 2.760848328238742, + "grad_norm": 12.15842342376709, + "learning_rate": 9.607452752709105e-09, + "logits/chosen": -1.1739990711212158, + "logits/rejected": -1.1599915027618408, + "logps/chosen": -343.6312561035156, + "logps/rejected": -389.2124938964844, + "loss": 0.0405, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8974242210388184, + "rewards/margins": 6.529687404632568, + "rewards/rejected": -9.421875, + "step": 10950 + }, + { + "epoch": 2.7633693631235623, + "grad_norm": 36.01392364501953, + "learning_rate": 9.407107072811393e-09, + "logits/chosen": -1.1204345226287842, + "logits/rejected": -1.054931640625, + "logps/chosen": -316.04376220703125, + "logps/rejected": -364.5, + "loss": 0.0616, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.126208543777466, + "rewards/margins": 6.627539157867432, + "rewards/rejected": -9.755078315734863, + "step": 10960 + }, + { + "epoch": 2.7658903980083824, + "grad_norm": 18.595779418945312, + "learning_rate": 9.208832323312293e-09, + "logits/chosen": 
-1.209130883216858, + "logits/rejected": -1.159936547279358, + "logps/chosen": -308.2749938964844, + "logps/rejected": -352.2875061035156, + "loss": 0.0421, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.1209716796875, + "rewards/margins": 6.792773246765137, + "rewards/rejected": -9.911328315734863, + "step": 10970 + }, + { + "epoch": 2.7684114328932026, + "grad_norm": 51.25161361694336, + "learning_rate": 9.012630210884053e-09, + "logits/chosen": -1.1571776866912842, + "logits/rejected": -1.007531762123108, + "logps/chosen": -299.95001220703125, + "logps/rejected": -337.2124938964844, + "loss": 0.0821, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.107006788253784, + "rewards/margins": 6.533984184265137, + "rewards/rejected": -9.644922256469727, + "step": 10980 + }, + { + "epoch": 2.7709324677780227, + "grad_norm": 22.487014770507812, + "learning_rate": 8.818502424358442e-09, + "logits/chosen": -1.1804687976837158, + "logits/rejected": -1.07830810546875, + "logps/chosen": -316.34375, + "logps/rejected": -361.2250061035156, + "loss": 0.0442, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.305590867996216, + "rewards/margins": 6.674218654632568, + "rewards/rejected": -9.978515625, + "step": 10990 + }, + { + "epoch": 2.773453502662843, + "grad_norm": 14.883648872375488, + "learning_rate": 8.62645063471218e-09, + "logits/chosen": -1.1775391101837158, + "logits/rejected": -1.16473388671875, + "logps/chosen": -335.1625061035156, + "logps/rejected": -365.0, + "loss": 0.0805, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.346142530441284, + "rewards/margins": 6.494921684265137, + "rewards/rejected": -9.839648246765137, + "step": 11000 + }, + { + "epoch": 2.7759745375476634, + "grad_norm": 4.850130081176758, + "learning_rate": 8.43647649505269e-09, + "logits/chosen": -1.093542456626892, + "logits/rejected": -1.0819518566131592, + "logps/chosen": -308.9750061035156, + "logps/rejected": 
-364.6000061035156, + "loss": 0.0385, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1682372093200684, + "rewards/margins": 6.779589653015137, + "rewards/rejected": -9.9453125, + "step": 11010 + }, + { + "epoch": 2.7784955724324836, + "grad_norm": 4.473644733428955, + "learning_rate": 8.248581640603741e-09, + "logits/chosen": -1.1863892078399658, + "logits/rejected": -1.0742676258087158, + "logps/chosen": -320.23748779296875, + "logps/rejected": -358.23748779296875, + "loss": 0.071, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.234020948410034, + "rewards/margins": 6.4443359375, + "rewards/rejected": -9.680468559265137, + "step": 11020 + }, + { + "epoch": 2.7810166073173037, + "grad_norm": 4.190011024475098, + "learning_rate": 8.062767688691463e-09, + "logits/chosen": -1.12969970703125, + "logits/rejected": -1.104284644126892, + "logps/chosen": -309.92498779296875, + "logps/rejected": -369.4624938964844, + "loss": 0.0418, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.3687682151794434, + "rewards/margins": 6.503222465515137, + "rewards/rejected": -9.873046875, + "step": 11030 + }, + { + "epoch": 2.783537642202124, + "grad_norm": 26.84963607788086, + "learning_rate": 7.879036238730319e-09, + "logits/chosen": -1.153222680091858, + "logits/rejected": -1.0390503406524658, + "logps/chosen": -329.21875, + "logps/rejected": -362.76873779296875, + "loss": 0.0579, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.3701539039611816, + "rewards/margins": 6.40478515625, + "rewards/rejected": -9.776562690734863, + "step": 11040 + }, + { + "epoch": 2.7860586770869444, + "grad_norm": 9.767061233520508, + "learning_rate": 7.697388872209498e-09, + "logits/chosen": -1.1641967296600342, + "logits/rejected": -1.1090209484100342, + "logps/chosen": -301.60626220703125, + "logps/rejected": -350.04998779296875, + "loss": 0.0605, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.3002867698669434, + 
"rewards/margins": 6.400097846984863, + "rewards/rejected": -9.69921875, + "step": 11050 + }, + { + "epoch": 2.7885797119717646, + "grad_norm": 14.56065559387207, + "learning_rate": 7.517827152679096e-09, + "logits/chosen": -1.053826928138733, + "logits/rejected": NaN, + "logps/chosen": -304.91876220703125, + "logps/rejected": -358.9624938964844, + "loss": 0.0569, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.956677198410034, + "rewards/margins": 6.554883003234863, + "rewards/rejected": -9.513671875, + "step": 11060 + }, + { + "epoch": 2.7911007468565847, + "grad_norm": 25.90925407409668, + "learning_rate": 7.34035262573679e-09, + "logits/chosen": -1.037420630455017, + "logits/rejected": NaN, + "logps/chosen": -320.1499938964844, + "logps/rejected": -370.57501220703125, + "loss": 0.0611, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.003216505050659, + "rewards/margins": 6.718847751617432, + "rewards/rejected": -9.721484184265137, + "step": 11070 + }, + { + "epoch": 2.793621781741405, + "grad_norm": 12.51234245300293, + "learning_rate": 7.164966819014628e-09, + "logits/chosen": -1.05120849609375, + "logits/rejected": -0.9507385492324829, + "logps/chosen": -318.5, + "logps/rejected": -362.82501220703125, + "loss": 0.0728, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.9307618141174316, + "rewards/margins": 6.7783203125, + "rewards/rejected": -9.710156440734863, + "step": 11080 + }, + { + "epoch": 2.796142816626225, + "grad_norm": 6.220926284790039, + "learning_rate": 6.991671242165625e-09, + "logits/chosen": -1.0963134765625, + "logits/rejected": -0.9919189214706421, + "logps/chosen": -319.6875, + "logps/rejected": -366.61248779296875, + "loss": 0.0661, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.267333984375, + "rewards/margins": 6.716406345367432, + "rewards/rejected": -9.983202934265137, + "step": 11090 + }, + { + "epoch": 2.798663851511045, + "grad_norm": 10.663161277770996, + "learning_rate": 
6.820467386850964e-09, + "logits/chosen": -1.1235229969024658, + "logits/rejected": -1.0130188465118408, + "logps/chosen": -310.10626220703125, + "logps/rejected": -365.3125, + "loss": 0.0469, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.08074951171875, + "rewards/margins": 6.567773342132568, + "rewards/rejected": -9.647265434265137, + "step": 11100 + }, + { + "epoch": 2.8011848863958653, + "grad_norm": 5.685007572174072, + "learning_rate": 6.651356726727064e-09, + "logits/chosen": -1.0920593738555908, + "logits/rejected": -1.085046410560608, + "logps/chosen": -321.2437438964844, + "logps/rejected": -360.26873779296875, + "loss": 0.0377, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.092785596847534, + "rewards/margins": 6.755859375, + "rewards/rejected": -9.852343559265137, + "step": 11110 + }, + { + "epoch": 2.803705921280686, + "grad_norm": 26.812538146972656, + "learning_rate": 6.4843407174330065e-09, + "logits/chosen": -1.1303924322128296, + "logits/rejected": -1.0161926746368408, + "logps/chosen": -321.63751220703125, + "logps/rejected": -350.48126220703125, + "loss": 0.0483, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.475689649581909, + "rewards/margins": 6.286328315734863, + "rewards/rejected": -9.763671875, + "step": 11120 + }, + { + "epoch": 2.806226956165506, + "grad_norm": 14.367386817932129, + "learning_rate": 6.319420796577879e-09, + "logits/chosen": -1.116998314857483, + "logits/rejected": -0.963482677936554, + "logps/chosen": -314.5562438964844, + "logps/rejected": -325.3999938964844, + "loss": 0.0349, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.049877882003784, + "rewards/margins": 6.791015625, + "rewards/rejected": -9.841796875, + "step": 11130 + }, + { + "epoch": 2.808747991050326, + "grad_norm": 37.2702522277832, + "learning_rate": 6.156598383728451e-09, + "logits/chosen": -1.148046851158142, + "logits/rejected": -1.056848168373108, + "logps/chosen": -334.6499938964844, 
+ "logps/rejected": -371.125, + "loss": 0.0476, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0511107444763184, + "rewards/margins": 6.743554592132568, + "rewards/rejected": -9.795702934265137, + "step": 11140 + }, + { + "epoch": 2.8112690259351463, + "grad_norm": 35.44508361816406, + "learning_rate": 5.995874880396962e-09, + "logits/chosen": -1.1152832508087158, + "logits/rejected": -0.964923083782196, + "logps/chosen": -315.0062561035156, + "logps/rejected": -358.625, + "loss": 0.0626, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.4000244140625, + "rewards/margins": 6.755273342132568, + "rewards/rejected": -10.154687881469727, + "step": 11150 + }, + { + "epoch": 2.813790060819967, + "grad_norm": 11.824356079101562, + "learning_rate": 5.83725167002902e-09, + "logits/chosen": -1.146704077720642, + "logits/rejected": -1.1065673828125, + "logps/chosen": -307.8999938964844, + "logps/rejected": -359.7875061035156, + "loss": 0.0314, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.1994690895080566, + "rewards/margins": 6.810351371765137, + "rewards/rejected": -10.010937690734863, + "step": 11160 + }, + { + "epoch": 2.816311095704787, + "grad_norm": 8.701419830322266, + "learning_rate": 5.680730117991833e-09, + "logits/chosen": -1.1065521240234375, + "logits/rejected": -1.0941650867462158, + "logps/chosen": -317.51251220703125, + "logps/rejected": -370.5874938964844, + "loss": 0.0426, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.38165283203125, + "rewards/margins": 6.740429878234863, + "rewards/rejected": -10.119140625, + "step": 11170 + }, + { + "epoch": 2.818832130589607, + "grad_norm": 7.306780815124512, + "learning_rate": 5.5263115715621925e-09, + "logits/chosen": -1.107873558998108, + "logits/rejected": -1.008874535560608, + "logps/chosen": -343.46875, + "logps/rejected": -359.0249938964844, + "loss": 0.0392, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 
-3.1722412109375, + "rewards/margins": 6.995019435882568, + "rewards/rejected": -10.165624618530273, + "step": 11180 + }, + { + "epoch": 2.8213531654744273, + "grad_norm": 6.022578716278076, + "learning_rate": 5.373997359915172e-09, + "logits/chosen": -1.1570250988006592, + "logits/rejected": -1.118261694908142, + "logps/chosen": -327.78125, + "logps/rejected": -365.73748779296875, + "loss": 0.0276, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9304442405700684, + "rewards/margins": 7.023633003234863, + "rewards/rejected": -9.959765434265137, + "step": 11190 + }, + { + "epoch": 2.8238742003592474, + "grad_norm": 42.67591857910156, + "learning_rate": 5.223788794112449e-09, + "logits/chosen": -1.149041771888733, + "logits/rejected": -1.1051025390625, + "logps/chosen": -324.1937561035156, + "logps/rejected": -339.5375061035156, + "loss": 0.0517, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.979199171066284, + "rewards/margins": 6.374218940734863, + "rewards/rejected": -9.354687690734863, + "step": 11200 + }, + { + "epoch": 2.8263952352440675, + "grad_norm": 41.82542419433594, + "learning_rate": 5.075687167091169e-09, + "logits/chosen": -1.199365258216858, + "logits/rejected": -1.0837891101837158, + "logps/chosen": -307.8062438964844, + "logps/rejected": -345.5, + "loss": 0.0501, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.337634325027466, + "rewards/margins": 6.540625095367432, + "rewards/rejected": -9.876953125, + "step": 11210 + }, + { + "epoch": 2.8289162701288877, + "grad_norm": 34.202144622802734, + "learning_rate": 4.9296937536527635e-09, + "logits/chosen": -1.106774926185608, + "logits/rejected": -1.017059326171875, + "logps/chosen": -291.04998779296875, + "logps/rejected": -336.13751220703125, + "loss": 0.0554, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.25830078125, + "rewards/margins": 6.3466796875, + "rewards/rejected": -9.604687690734863, + "step": 11220 + }, + { + 
"epoch": 2.8314373050137083, + "grad_norm": 5.910395622253418, + "learning_rate": 4.785809810451958e-09, + "logits/chosen": -1.172827124595642, + "logits/rejected": -1.027063012123108, + "logps/chosen": -322.0062561035156, + "logps/rejected": -366.8374938964844, + "loss": 0.0392, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.98138427734375, + "rewards/margins": 6.910937309265137, + "rewards/rejected": -9.889452934265137, + "step": 11230 + }, + { + "epoch": 2.8339583398985284, + "grad_norm": 8.160270690917969, + "learning_rate": 4.644036575985999e-09, + "logits/chosen": -1.1519683599472046, + "logits/rejected": -1.0647369623184204, + "logps/chosen": -320.91876220703125, + "logps/rejected": -370.6000061035156, + "loss": 0.0506, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.365161180496216, + "rewards/margins": 6.4072265625, + "rewards/rejected": -9.774609565734863, + "step": 11240 + }, + { + "epoch": 2.8364793747833486, + "grad_norm": 8.186322212219238, + "learning_rate": 4.504375270583921e-09, + "logits/chosen": -1.1119384765625, + "logits/rejected": -1.0484497547149658, + "logps/chosen": -327.3374938964844, + "logps/rejected": -388.8999938964844, + "loss": 0.03, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.942028760910034, + "rewards/margins": 6.8125, + "rewards/rejected": -9.753515243530273, + "step": 11250 + }, + { + "epoch": 2.8390004096681687, + "grad_norm": 10.144037246704102, + "learning_rate": 4.366827096396131e-09, + "logits/chosen": -1.1428711414337158, + "logits/rejected": -1.105322241783142, + "logps/chosen": -320.90625, + "logps/rejected": -377.32501220703125, + "loss": 0.0424, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.176776170730591, + "rewards/margins": 6.537695407867432, + "rewards/rejected": -9.714452743530273, + "step": 11260 + }, + { + "epoch": 2.8415214445529893, + "grad_norm": 7.901134490966797, + "learning_rate": 4.231393237384057e-09, + "logits/chosen": 
-1.158837914466858, + "logits/rejected": -1.12847900390625, + "logps/chosen": -296.9781188964844, + "logps/rejected": -359.51251220703125, + "loss": 0.0498, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.226635694503784, + "rewards/margins": 6.946679592132568, + "rewards/rejected": -10.176953315734863, + "step": 11270 + }, + { + "epoch": 2.8440424794378094, + "grad_norm": 12.296640396118164, + "learning_rate": 4.098074859309825e-09, + "logits/chosen": -1.1344115734100342, + "logits/rejected": -1.0700805187225342, + "logps/chosen": -303.98126220703125, + "logps/rejected": -351.42498779296875, + "loss": 0.0402, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.12567138671875, + "rewards/margins": 6.556836128234863, + "rewards/rejected": -9.681249618530273, + "step": 11280 + }, + { + "epoch": 2.8465635143226296, + "grad_norm": 35.342552185058594, + "learning_rate": 3.9668731097264315e-09, + "logits/chosen": -1.194067358970642, + "logits/rejected": -1.0934569835662842, + "logps/chosen": -304.54376220703125, + "logps/rejected": -355.8500061035156, + "loss": 0.0369, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.982250928878784, + "rewards/margins": 6.919921875, + "rewards/rejected": -9.901952743530273, + "step": 11290 + }, + { + "epoch": 2.8490845492074497, + "grad_norm": 17.08281707763672, + "learning_rate": 3.837789117967643e-09, + "logits/chosen": -1.2255370616912842, + "logits/rejected": -1.1570312976837158, + "logps/chosen": -335.9750061035156, + "logps/rejected": -362.5874938964844, + "loss": 0.0454, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.254565477371216, + "rewards/margins": 6.574609279632568, + "rewards/rejected": -9.8251953125, + "step": 11300 + }, + { + "epoch": 2.85160558409227, + "grad_norm": 5.149807453155518, + "learning_rate": 3.7108239951385014e-09, + "logits/chosen": -1.203222632408142, + "logits/rejected": -1.130133032798767, + "logps/chosen": -315.8500061035156, + "logps/rejected": 
-349.1000061035156, + "loss": 0.0512, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.07989501953125, + "rewards/margins": 6.508203029632568, + "rewards/rejected": -9.590624809265137, + "step": 11310 + }, + { + "epoch": 2.85412661897709, + "grad_norm": 11.808905601501465, + "learning_rate": 3.585978834105524e-09, + "logits/chosen": -1.2020752429962158, + "logits/rejected": -1.12860107421875, + "logps/chosen": -344.4125061035156, + "logps/rejected": -343.3374938964844, + "loss": 0.0456, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.5422699451446533, + "rewards/margins": 6.391015529632568, + "rewards/rejected": -9.932812690734863, + "step": 11320 + }, + { + "epoch": 2.85664765386191, + "grad_norm": 51.579833984375, + "learning_rate": 3.463254709487551e-09, + "logits/chosen": -1.134191870689392, + "logits/rejected": -1.115087866783142, + "logps/chosen": -338.9375, + "logps/rejected": -377.70001220703125, + "loss": 0.0509, + "rewards/accuracies": 0.96875, + "rewards/chosen": -3.247082471847534, + "rewards/margins": 6.463281154632568, + "rewards/rejected": -9.707616806030273, + "step": 11330 + }, + { + "epoch": 2.8591686887467307, + "grad_norm": 32.98530578613281, + "learning_rate": 3.342652677646246e-09, + "logits/chosen": -1.137359619140625, + "logits/rejected": -1.135321021080017, + "logps/chosen": -298.20623779296875, + "logps/rejected": -338.25, + "loss": 0.0641, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.1140379905700684, + "rewards/margins": 6.370995998382568, + "rewards/rejected": -9.483789443969727, + "step": 11340 + }, + { + "epoch": 2.861689723631551, + "grad_norm": 13.890373229980469, + "learning_rate": 3.2241737766771637e-09, + "logits/chosen": -1.16510009765625, + "logits/rejected": -1.0728271007537842, + "logps/chosen": -309.79998779296875, + "logps/rejected": -363.6499938964844, + "loss": 0.0382, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.141650438308716, + "rewards/margins": 
6.865820407867432, + "rewards/rejected": -10.006250381469727, + "step": 11350 + }, + { + "epoch": 2.864210758516371, + "grad_norm": 6.062414169311523, + "learning_rate": 3.1078190264008376e-09, + "logits/chosen": -1.18145751953125, + "logits/rejected": -1.05517578125, + "logps/chosen": -330.86248779296875, + "logps/rejected": -350.13751220703125, + "loss": 0.0365, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.515551805496216, + "rewards/margins": 6.795117378234863, + "rewards/rejected": -10.306640625, + "step": 11360 + }, + { + "epoch": 2.866731793401191, + "grad_norm": 21.793306350708008, + "learning_rate": 2.9935894283538154e-09, + "logits/chosen": -1.05755615234375, + "logits/rejected": -1.0051085948944092, + "logps/chosen": -302.0, + "logps/rejected": -359.3374938964844, + "loss": 0.0476, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.3633484840393066, + "rewards/margins": 6.482226371765137, + "rewards/rejected": -9.845312118530273, + "step": 11370 + }, + { + "epoch": 2.8692528282860112, + "grad_norm": 12.266679763793945, + "learning_rate": 2.8814859657802227e-09, + "logits/chosen": -1.20147705078125, + "logits/rejected": -1.0837676525115967, + "logps/chosen": -314.2250061035156, + "logps/rejected": -351.25, + "loss": 0.0419, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3489317893981934, + "rewards/margins": 6.389062404632568, + "rewards/rejected": -9.737500190734863, + "step": 11380 + }, + { + "epoch": 2.871773863170832, + "grad_norm": 58.831626892089844, + "learning_rate": 2.77150960362324e-09, + "logits/chosen": -1.199304223060608, + "logits/rejected": -1.1606323719024658, + "logps/chosen": -331.2749938964844, + "logps/rejected": -355.26251220703125, + "loss": 0.0426, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2643675804138184, + "rewards/margins": 6.762499809265137, + "rewards/rejected": -10.026171684265137, + "step": 11390 + }, + { + "epoch": 2.874294898055652, + 
"grad_norm": 24.17465591430664, + "learning_rate": 2.6636612885167775e-09, + "logits/chosen": -1.146575927734375, + "logits/rejected": -0.9815276861190796, + "logps/chosen": -324.6875, + "logps/rejected": -342.2562561035156, + "loss": 0.065, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.903027296066284, + "rewards/margins": 6.735156059265137, + "rewards/rejected": -9.639452934265137, + "step": 11400 + }, + { + "epoch": 2.876815932940472, + "grad_norm": 16.471965789794922, + "learning_rate": 2.5579419487773424e-09, + "logits/chosen": -1.0539124011993408, + "logits/rejected": -1.096160888671875, + "logps/chosen": -335.2250061035156, + "logps/rejected": -360.125, + "loss": 0.0752, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.3601441383361816, + "rewards/margins": 6.7421875, + "rewards/rejected": -10.101171493530273, + "step": 11410 + }, + { + "epoch": 2.8793369678252922, + "grad_norm": 2.419618844985962, + "learning_rate": 2.4543524943960448e-09, + "logits/chosen": -1.16253662109375, + "logits/rejected": -1.125451683998108, + "logps/chosen": -325.9375, + "logps/rejected": -362.36248779296875, + "loss": 0.0452, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.241986036300659, + "rewards/margins": 6.643750190734863, + "rewards/rejected": -9.885156631469727, + "step": 11420 + }, + { + "epoch": 2.8818580027101124, + "grad_norm": 17.081071853637695, + "learning_rate": 2.352893817030799e-09, + "logits/chosen": -1.224328637123108, + "logits/rejected": -1.121362328529358, + "logps/chosen": -320.3999938964844, + "logps/rejected": -373.11248779296875, + "loss": 0.073, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.23199462890625, + "rewards/margins": 6.426562309265137, + "rewards/rejected": -9.658203125, + "step": 11430 + }, + { + "epoch": 2.8843790375949325, + "grad_norm": 48.259456634521484, + "learning_rate": 2.253566789998523e-09, + "logits/chosen": -1.1755859851837158, + "logits/rejected": 
-1.0964233875274658, + "logps/chosen": -342.7124938964844, + "logps/rejected": -360.3999938964844, + "loss": 0.0517, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.3258299827575684, + "rewards/margins": 6.460741996765137, + "rewards/rejected": -9.78515625, + "step": 11440 + }, + { + "epoch": 2.8869000724797527, + "grad_norm": 3.223163366317749, + "learning_rate": 2.156372268267842e-09, + "logits/chosen": -1.1101500988006592, + "logits/rejected": -1.1073486804962158, + "logps/chosen": -323.92498779296875, + "logps/rejected": -375.7749938964844, + "loss": 0.0528, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.199572801589966, + "rewards/margins": 6.766992092132568, + "rewards/rejected": -9.963671684265137, + "step": 11450 + }, + { + "epoch": 2.8894211073645732, + "grad_norm": 36.076942443847656, + "learning_rate": 2.061311088451506e-09, + "logits/chosen": -1.1073119640350342, + "logits/rejected": -1.0601990222930908, + "logps/chosen": -308.61248779296875, + "logps/rejected": -347.95001220703125, + "loss": 0.0507, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.39984130859375, + "rewards/margins": 6.541015625, + "rewards/rejected": -9.941797256469727, + "step": 11460 + }, + { + "epoch": 2.8919421422493934, + "grad_norm": 3.8748505115509033, + "learning_rate": 1.9683840687993448e-09, + "logits/chosen": -1.1381317377090454, + "logits/rejected": -1.048803687095642, + "logps/chosen": -299.88751220703125, + "logps/rejected": -332.2875061035156, + "loss": 0.0497, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.089038133621216, + "rewards/margins": 6.600781440734863, + "rewards/rejected": -9.693359375, + "step": 11470 + }, + { + "epoch": 2.8944631771342135, + "grad_norm": 24.626474380493164, + "learning_rate": 1.8775920091911034e-09, + "logits/chosen": -1.156835913658142, + "logits/rejected": -1.0944945812225342, + "logps/chosen": -335.8125, + "logps/rejected": -382.32501220703125, + "loss": 0.0483, 
+ "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.244555711746216, + "rewards/margins": 6.644726753234863, + "rewards/rejected": -9.890625, + "step": 11480 + }, + { + "epoch": 2.8969842120190337, + "grad_norm": 7.209171772003174, + "learning_rate": 1.7889356911296448e-09, + "logits/chosen": -1.0824127197265625, + "logits/rejected": -1.033239722251892, + "logps/chosen": -335.0874938964844, + "logps/rejected": -366.29998779296875, + "loss": 0.0627, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.234814405441284, + "rewards/margins": 6.557421684265137, + "rewards/rejected": -9.793359756469727, + "step": 11490 + }, + { + "epoch": 2.8995052469038542, + "grad_norm": 27.53736686706543, + "learning_rate": 1.702415877734259e-09, + "logits/chosen": -1.1435120105743408, + "logits/rejected": -1.096521019935608, + "logps/chosen": -311.3500061035156, + "logps/rejected": -349.125, + "loss": 0.0386, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.0798583030700684, + "rewards/margins": 6.843554496765137, + "rewards/rejected": -9.927734375, + "step": 11500 + }, + { + "epoch": 2.9020262817886744, + "grad_norm": 8.53842544555664, + "learning_rate": 1.6180333137339186e-09, + "logits/chosen": -1.103082299232483, + "logits/rejected": -1.1004149913787842, + "logps/chosen": -321.3500061035156, + "logps/rejected": -356.6187438964844, + "loss": 0.0433, + "rewards/accuracies": 0.96875, + "rewards/chosen": -2.960705518722534, + "rewards/margins": 6.835156440734863, + "rewards/rejected": -9.796093940734863, + "step": 11510 + }, + { + "epoch": 2.9045473166734945, + "grad_norm": 6.406665802001953, + "learning_rate": 1.5357887254610623e-09, + "logits/chosen": -1.1190612316131592, + "logits/rejected": -1.068109154701233, + "logps/chosen": -317.70001220703125, + "logps/rejected": -363.09375, + "loss": 0.0604, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.0172119140625, + "rewards/margins": 6.927929878234863, + 
"rewards/rejected": -9.950390815734863, + "step": 11520 + }, + { + "epoch": 2.9070683515583147, + "grad_norm": 21.32110023498535, + "learning_rate": 1.4556828208452388e-09, + "logits/chosen": -1.2355225086212158, + "logits/rejected": -1.145727515220642, + "logps/chosen": -322.8374938964844, + "logps/rejected": -352.8999938964844, + "loss": 0.0416, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -2.916674852371216, + "rewards/margins": 6.700585842132568, + "rewards/rejected": -9.616796493530273, + "step": 11530 + }, + { + "epoch": 2.909589386443135, + "grad_norm": 8.482006072998047, + "learning_rate": 1.3777162894070272e-09, + "logits/chosen": -1.1676514148712158, + "logits/rejected": -1.1561400890350342, + "logps/chosen": -333.25, + "logps/rejected": -370.5874938964844, + "loss": 0.0573, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.3196778297424316, + "rewards/margins": 6.361620903015137, + "rewards/rejected": -9.681249618530273, + "step": 11540 + }, + { + "epoch": 2.912110421327955, + "grad_norm": 30.82038116455078, + "learning_rate": 1.3018898022521263e-09, + "logits/chosen": -1.105224609375, + "logits/rejected": -1.039794921875, + "logps/chosen": -303.375, + "logps/rejected": -355.2875061035156, + "loss": 0.0562, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.277972459793091, + "rewards/margins": 6.635351657867432, + "rewards/rejected": -9.911718368530273, + "step": 11550 + }, + { + "epoch": 2.914631456212775, + "grad_norm": 6.3294572830200195, + "learning_rate": 1.2282040120655534e-09, + "logits/chosen": -1.093475341796875, + "logits/rejected": -1.021142601966858, + "logps/chosen": -310.42498779296875, + "logps/rejected": -342.11248779296875, + "loss": 0.045, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.444873094558716, + "rewards/margins": 6.439257621765137, + "rewards/rejected": -9.880078315734863, + "step": 11560 + }, + { + "epoch": 2.9171524910975957, + "grad_norm": 
19.349315643310547, + "learning_rate": 1.1566595531060374e-09, + "logits/chosen": -1.14337158203125, + "logits/rejected": -1.151037573814392, + "logps/chosen": -310.63751220703125, + "logps/rejected": -334.29998779296875, + "loss": 0.0534, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -2.9684205055236816, + "rewards/margins": 6.611132621765137, + "rewards/rejected": -9.581640243530273, + "step": 11570 + }, + { + "epoch": 2.919673525982416, + "grad_norm": 12.430782318115234, + "learning_rate": 1.087257041200551e-09, + "logits/chosen": -1.119287133216858, + "logits/rejected": -1.060430884361267, + "logps/chosen": -319.375, + "logps/rejected": -360.17498779296875, + "loss": 0.0449, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.308154344558716, + "rewards/margins": 6.586865425109863, + "rewards/rejected": -9.897265434265137, + "step": 11580 + }, + { + "epoch": 2.922194560867236, + "grad_norm": 18.37933349609375, + "learning_rate": 1.019997073739065e-09, + "logits/chosen": -1.1892883777618408, + "logits/rejected": -1.063391089439392, + "logps/chosen": -339.0375061035156, + "logps/rejected": -377.34375, + "loss": 0.0452, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.057421922683716, + "rewards/margins": 6.838086128234863, + "rewards/rejected": -9.889843940734863, + "step": 11590 + }, + { + "epoch": 2.924715595752056, + "grad_norm": 43.751304626464844, + "learning_rate": 9.548802296692749e-10, + "logits/chosen": -1.097900390625, + "logits/rejected": -1.026818871498108, + "logps/chosen": -312.35626220703125, + "logps/rejected": -363.70001220703125, + "loss": 0.0495, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.345568895339966, + "rewards/margins": 6.401757717132568, + "rewards/rejected": -9.747265815734863, + "step": 11600 + }, + { + "epoch": 2.9272366306368767, + "grad_norm": 6.898996353149414, + "learning_rate": 8.919070694917708e-10, + "logits/chosen": -1.1406066417694092, + "logits/rejected": 
-1.0116455554962158, + "logps/chosen": -313.4125061035156, + "logps/rejected": -346.4375, + "loss": 0.0294, + "rewards/accuracies": 0.9906250238418579, + "rewards/chosen": -3.204620361328125, + "rewards/margins": 6.840429782867432, + "rewards/rejected": -10.045312881469727, + "step": 11610 + }, + { + "epoch": 2.929757665521697, + "grad_norm": 34.288265228271484, + "learning_rate": 8.310781352550977e-10, + "logits/chosen": -1.161535620689392, + "logits/rejected": -1.116845726966858, + "logps/chosen": -324.70623779296875, + "logps/rejected": -363.375, + "loss": 0.0865, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.22119140625, + "rewards/margins": 6.787109375, + "rewards/rejected": -10.008984565734863, + "step": 11620 + }, + { + "epoch": 2.932278700406517, + "grad_norm": 11.433563232421875, + "learning_rate": 7.723939505511478e-10, + "logits/chosen": -1.154638648033142, + "logits/rejected": -1.136083960533142, + "logps/chosen": -306.57501220703125, + "logps/rejected": -366.2562561035156, + "loss": 0.0342, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.341662645339966, + "rewards/margins": 7.1279296875, + "rewards/rejected": -10.459375381469727, + "step": 11630 + }, + { + "epoch": 2.934799735291337, + "grad_norm": 28.94364356994629, + "learning_rate": 7.15855020510664e-10, + "logits/chosen": -1.1469237804412842, + "logits/rejected": -1.078161597251892, + "logps/chosen": -286.7124938964844, + "logps/rejected": -351.3374938964844, + "loss": 0.0367, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1283507347106934, + "rewards/margins": 6.691992282867432, + "rewards/rejected": -9.822461128234863, + "step": 11640 + }, + { + "epoch": 2.9373207701761572, + "grad_norm": 10.941765785217285, + "learning_rate": 6.614618317988263e-10, + "logits/chosen": -1.072302222251892, + "logits/rejected": -1.0020020008087158, + "logps/chosen": -307.6000061035156, + "logps/rejected": -357.13751220703125, + "loss": 0.033, + 
"rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.409228563308716, + "rewards/margins": 6.709765434265137, + "rewards/rejected": -10.120702743530273, + "step": 11650 + }, + { + "epoch": 2.9398418050609774, + "grad_norm": 41.47216796875, + "learning_rate": 6.092148526111451e-10, + "logits/chosen": -1.1149444580078125, + "logits/rejected": -1.0994384288787842, + "logps/chosen": -303.2250061035156, + "logps/rejected": -388.0, + "loss": 0.0552, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.141491651535034, + "rewards/margins": 6.625781059265137, + "rewards/rejected": -9.762109756469727, + "step": 11660 + }, + { + "epoch": 2.9423628399457975, + "grad_norm": 7.507928371429443, + "learning_rate": 5.591145326693525e-10, + "logits/chosen": -1.175994873046875, + "logits/rejected": NaN, + "logps/chosen": -314.6937561035156, + "logps/rejected": -340.17498779296875, + "loss": 0.0529, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.896167039871216, + "rewards/margins": 6.885937690734863, + "rewards/rejected": -9.782031059265137, + "step": 11670 + }, + { + "epoch": 2.944883874830618, + "grad_norm": 10.430318832397461, + "learning_rate": 5.111613032176277e-10, + "logits/chosen": -1.146691918373108, + "logits/rejected": -1.0836150646209717, + "logps/chosen": -311.76873779296875, + "logps/rejected": -389.25, + "loss": 0.0547, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.265576124191284, + "rewards/margins": 6.797461032867432, + "rewards/rejected": -10.062891006469727, + "step": 11680 + }, + { + "epoch": 2.9474049097154382, + "grad_norm": 16.366127014160156, + "learning_rate": 4.6535557701873896e-10, + "logits/chosen": -1.174015760421753, + "logits/rejected": -1.119775414466858, + "logps/chosen": -324.90625, + "logps/rejected": -346.57501220703125, + "loss": 0.045, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.0557494163513184, + "rewards/margins": 6.736523628234863, + "rewards/rejected": -9.793164253234863, 
+ "step": 11690 + }, + { + "epoch": 2.9499259446002584, + "grad_norm": 33.4983024597168, + "learning_rate": 4.216977483506856e-10, + "logits/chosen": -1.176855444908142, + "logits/rejected": -1.0918090343475342, + "logps/chosen": -361.5874938964844, + "logps/rejected": -373.4624938964844, + "loss": 0.0647, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -3.1922364234924316, + "rewards/margins": 6.511328220367432, + "rewards/rejected": -9.704297065734863, + "step": 11700 + }, + { + "epoch": 2.9524469794850785, + "grad_norm": 36.88227462768555, + "learning_rate": 3.8018819300308925e-10, + "logits/chosen": NaN, + "logits/rejected": -1.0539124011993408, + "logps/chosen": -297.8812561035156, + "logps/rejected": -354.9375, + "loss": 0.0489, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.350720167160034, + "rewards/margins": 6.738085746765137, + "rewards/rejected": -10.090624809265137, + "step": 11710 + }, + { + "epoch": 2.954968014369899, + "grad_norm": 16.585119247436523, + "learning_rate": 3.408272682741409e-10, + "logits/chosen": -1.1395142078399658, + "logits/rejected": -1.1004638671875, + "logps/chosen": -320.4125061035156, + "logps/rejected": -357.67498779296875, + "loss": 0.044, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.1923828125, + "rewards/margins": 6.826757907867432, + "rewards/rejected": -10.017578125, + "step": 11720 + }, + { + "epoch": 2.9574890492547192, + "grad_norm": 65.93038940429688, + "learning_rate": 3.036153129674368e-10, + "logits/chosen": -1.170751929283142, + "logits/rejected": -1.065454125404358, + "logps/chosen": -307.82501220703125, + "logps/rejected": -332.1000061035156, + "loss": 0.0543, + "rewards/accuracies": 0.984375, + "rewards/chosen": -2.9806151390075684, + "rewards/margins": 6.692480564117432, + "rewards/rejected": -9.671093940734863, + "step": 11730 + }, + { + "epoch": 2.9600100841395394, + "grad_norm": 10.023194313049316, + "learning_rate": 2.685526473890365e-10, + 
"logits/chosen": -1.092157006263733, + "logits/rejected": -1.047845482826233, + "logps/chosen": -301.8187561035156, + "logps/rejected": -351.0249938964844, + "loss": 0.0412, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9297118186950684, + "rewards/margins": 7.150976657867432, + "rewards/rejected": -10.081250190734863, + "step": 11740 + }, + { + "epoch": 2.9625311190243595, + "grad_norm": 6.3317365646362305, + "learning_rate": 2.3563957334482575e-10, + "logits/chosen": -1.0945556163787842, + "logits/rejected": -1.085790991783142, + "logps/chosen": -300.01251220703125, + "logps/rejected": -373.1499938964844, + "loss": 0.0629, + "rewards/accuracies": 0.971875011920929, + "rewards/chosen": -2.794238328933716, + "rewards/margins": 6.842382907867432, + "rewards/rejected": -9.637890815734863, + "step": 11750 + }, + { + "epoch": 2.9650521539091796, + "grad_norm": 8.28089714050293, + "learning_rate": 2.0487637413776903e-10, + "logits/chosen": -1.1538817882537842, + "logits/rejected": -1.142126441001892, + "logps/chosen": -295.25, + "logps/rejected": -366.6625061035156, + "loss": 0.0474, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.4425110816955566, + "rewards/margins": 6.487890720367432, + "rewards/rejected": -9.92578125, + "step": 11760 + }, + { + "epoch": 2.967573188794, + "grad_norm": 11.035055160522461, + "learning_rate": 1.762633145655501e-10, + "logits/chosen": -1.125695824623108, + "logits/rejected": -1.102941870689392, + "logps/chosen": -311.45001220703125, + "logps/rejected": -386.98748779296875, + "loss": 0.0442, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.2675232887268066, + "rewards/margins": 6.892187595367432, + "rewards/rejected": -10.164453506469727, + "step": 11770 + }, + { + "epoch": 2.97009422367882, + "grad_norm": 13.995767593383789, + "learning_rate": 1.4980064091835166e-10, + "logits/chosen": -1.1941039562225342, + "logits/rejected": -1.138085961341858, + "logps/chosen": -322.6499938964844, + 
"logps/rejected": -358.5249938964844, + "loss": 0.0554, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.00164794921875, + "rewards/margins": 6.380663871765137, + "rewards/rejected": -9.379687309265137, + "step": 11780 + }, + { + "epoch": 2.9726152585636405, + "grad_norm": 27.741172790527344, + "learning_rate": 1.2548858097655157e-10, + "logits/chosen": -1.1244628429412842, + "logits/rejected": -1.027490258216858, + "logps/chosen": -306.54998779296875, + "logps/rejected": -369.76251220703125, + "loss": 0.0662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.30889892578125, + "rewards/margins": 6.887304782867432, + "rewards/rejected": -10.193750381469727, + "step": 11790 + }, + { + "epoch": 2.9751362934484606, + "grad_norm": 9.173103332519531, + "learning_rate": 1.0332734400897437e-10, + "logits/chosen": -1.211492896080017, + "logits/rejected": -1.057397484779358, + "logps/chosen": -300.70001220703125, + "logps/rejected": -351.7875061035156, + "loss": 0.0317, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -3.187939405441284, + "rewards/margins": 6.911718845367432, + "rewards/rejected": -10.100390434265137, + "step": 11800 + }, + { + "epoch": 2.977657328333281, + "grad_norm": 22.094951629638672, + "learning_rate": 8.331712077094821e-11, + "logits/chosen": -1.1710205078125, + "logits/rejected": -1.077600121498108, + "logps/chosen": -335.67498779296875, + "logps/rejected": -369.125, + "loss": 0.033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8613648414611816, + "rewards/margins": 6.783984184265137, + "rewards/rejected": -9.644335746765137, + "step": 11810 + }, + { + "epoch": 2.980178363218101, + "grad_norm": 17.2425537109375, + "learning_rate": 6.545808350272297e-11, + "logits/chosen": -1.195837378501892, + "logits/rejected": -1.118493676185608, + "logps/chosen": -300.3999938964844, + "logps/rejected": -356.70001220703125, + "loss": 0.0313, + "rewards/accuracies": 0.9937499761581421, + 
"rewards/chosen": -2.832348585128784, + "rewards/margins": 6.853906154632568, + "rewards/rejected": -9.684374809265137, + "step": 11820 + }, + { + "epoch": 2.9826993981029215, + "grad_norm": 20.294713973999023, + "learning_rate": 4.9750385927971315e-11, + "logits/chosen": -1.142907738685608, + "logits/rejected": -1.128930687904358, + "logps/chosen": -328.4437561035156, + "logps/rejected": -386.25, + "loss": 0.0473, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -3.2728271484375, + "rewards/margins": 6.649609565734863, + "rewards/rejected": -9.918359756469727, + "step": 11830 + }, + { + "epoch": 2.9852204329877416, + "grad_norm": 12.122188568115234, + "learning_rate": 3.619416325251201e-11, + "logits/chosen": -1.1633621454238892, + "logits/rejected": -1.13446044921875, + "logps/chosen": -310.07501220703125, + "logps/rejected": -371.1625061035156, + "loss": 0.0601, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.8426756858825684, + "rewards/margins": 6.851366996765137, + "rewards/rejected": -9.696484565734863, + "step": 11840 + }, + { + "epoch": 2.987741467872562, + "grad_norm": 32.661895751953125, + "learning_rate": 2.4789532162977632e-11, + "logits/chosen": -1.1033751964569092, + "logits/rejected": -1.0698974132537842, + "logps/chosen": -323.9937438964844, + "logps/rejected": -352.88751220703125, + "loss": 0.0678, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.232617139816284, + "rewards/margins": 6.576757907867432, + "rewards/rejected": -9.807421684265137, + "step": 11850 + }, + { + "epoch": 2.990262502757382, + "grad_norm": 41.79987716674805, + "learning_rate": 1.5536590826065177e-11, + "logits/chosen": -1.104711890220642, + "logits/rejected": -1.12957763671875, + "logps/chosen": -299.4312438964844, + "logps/rejected": -342.6875, + "loss": 0.0549, + "rewards/accuracies": 0.9781249761581421, + "rewards/chosen": -3.066140651702881, + "rewards/margins": 6.560351371765137, + "rewards/rejected": 
-9.627344131469727, + "step": 11860 + }, + { + "epoch": 2.992783537642202, + "grad_norm": 3.907005548477173, + "learning_rate": 8.435418887509094e-12, + "logits/chosen": -1.119897484779358, + "logits/rejected": -1.130041480064392, + "logps/chosen": -316.83123779296875, + "logps/rejected": -372.1875, + "loss": 0.0502, + "rewards/accuracies": 0.984375, + "rewards/chosen": -3.2793946266174316, + "rewards/margins": 6.817578315734863, + "rewards/rejected": -10.09765625, + "step": 11870 + }, + { + "epoch": 2.995304572527022, + "grad_norm": 49.77162170410156, + "learning_rate": 3.486077471415161e-12, + "logits/chosen": -1.1683471202850342, + "logits/rejected": -1.020263671875, + "logps/chosen": -318.46875, + "logps/rejected": -362.42498779296875, + "loss": 0.0627, + "rewards/accuracies": 0.965624988079071, + "rewards/chosen": -3.028076171875, + "rewards/margins": 6.994336128234863, + "rewards/rejected": -10.017969131469727, + "step": 11880 + }, + { + "epoch": 2.9978256074118423, + "grad_norm": 22.62541961669922, + "learning_rate": 6.886091798441462e-13, + "logits/chosen": -1.0814208984375, + "logits/rejected": -1.0946776866912842, + "logps/chosen": -336.9937438964844, + "logps/rejected": -395.875, + "loss": 0.0526, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.205029249191284, + "rewards/margins": 6.807714939117432, + "rewards/rejected": -10.0146484375, + "step": 11890 + } + ], + "logging_steps": 10, + "max_steps": 11898, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}