{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 2000.0, "global_step": 5796, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005175983436853002, "grad_norm": 48.2049674987793, "learning_rate": 3.448275862068966e-08, "loss": 1.083831787109375, "rewards/accuracies": 0.3359375, "rewards/chosen": -1.0772185325622559, "rewards/margins": -0.4424896240234375, "rewards/rejected": -0.6347158551216125, "step": 1 }, { "epoch": 0.0010351966873706005, "grad_norm": 45.072975158691406, "learning_rate": 6.896551724137931e-08, "loss": 1.004425048828125, "rewards/accuracies": 0.390625, "rewards/chosen": -0.9120941162109375, "rewards/margins": -0.38445281982421875, "rewards/rejected": -0.5279655456542969, "step": 2 }, { "epoch": 0.0015527950310559005, "grad_norm": 48.458595275878906, "learning_rate": 1.0344827586206898e-07, "loss": 0.9905319213867188, "rewards/accuracies": 0.40625, "rewards/chosen": -0.8525078296661377, "rewards/margins": -0.28421783447265625, "rewards/rejected": -0.568293571472168, "step": 3 }, { "epoch": 0.002070393374741201, "grad_norm": 43.84965896606445, "learning_rate": 1.3793103448275863e-07, "loss": 1.0067901611328125, "rewards/accuracies": 0.3671875, "rewards/chosen": -0.9737570285797119, "rewards/margins": -0.3477134704589844, "rewards/rejected": -0.6261043548583984, "step": 4 }, { "epoch": 0.002587991718426501, "grad_norm": 45.529727935791016, "learning_rate": 1.7241379310344828e-07, "loss": 0.9122695922851562, "rewards/accuracies": 0.3984375, "rewards/chosen": -0.9060955047607422, "rewards/margins": -0.218780517578125, "rewards/rejected": -0.6871734857559204, "step": 5 }, { "epoch": 0.003105590062111801, "grad_norm": 41.73444366455078, "learning_rate": 2.0689655172413796e-07, "loss": 0.9801483154296875, "rewards/accuracies": 0.3828125, "rewards/chosen": -1.0084846019744873, "rewards/margins": -0.3373279571533203, "rewards/rejected": -0.6713771820068359, "step": 6 }, { "epoch": 0.0036231884057971015, "grad_norm": 45.47935104370117, "learning_rate": 2.413793103448276e-07, "loss": 1.0174560546875, "rewards/accuracies": 0.3671875, "rewards/chosen": -1.002016544342041, "rewards/margins": -0.3533210754394531, "rewards/rejected": -0.6490609645843506, "step": 7 }, { "epoch": 0.004140786749482402, "grad_norm": 42.251953125, "learning_rate": 2.7586206896551726e-07, "loss": 0.9105300903320312, "rewards/accuracies": 0.421875, "rewards/chosen": -0.9067145586013794, "rewards/margins": -0.20062255859375, "rewards/rejected": -0.7063684463500977, "step": 8 }, { "epoch": 0.004658385093167702, "grad_norm": 44.25387191772461, "learning_rate": 3.1034482758620694e-07, "loss": 0.8822383880615234, "rewards/accuracies": 0.4453125, "rewards/chosen": -0.9774932861328125, "rewards/margins": -0.08828353881835938, "rewards/rejected": -0.88932204246521, "step": 9 }, { "epoch": 0.005175983436853002, "grad_norm": 44.05298614501953, "learning_rate": 3.4482758620689656e-07, "loss": 0.9727096557617188, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.9577093124389648, "rewards/margins": -0.3033866882324219, "rewards/rejected": -0.6542892456054688, "step": 10 }, { "epoch": 0.005693581780538302, "grad_norm": 41.977760314941406, "learning_rate": 3.7931034482758624e-07, "loss": 0.8754386901855469, "rewards/accuracies": 0.421875, "rewards/chosen": -0.9901323318481445, "rewards/margins": -0.1000213623046875, "rewards/rejected": -0.8898963928222656, "step": 11 }, { "epoch": 0.006211180124223602, "grad_norm": 42.35674285888672, "learning_rate": 4.137931034482759e-07, "loss": 0.8957366943359375, "rewards/accuracies": 0.421875, "rewards/chosen": -0.8082046508789062, "rewards/margins": -0.17309284210205078, "rewards/rejected": -0.6349000930786133, "step": 12 }, { "epoch": 0.006728778467908903, "grad_norm": 39.23073959350586, "learning_rate": 4.4827586206896554e-07, "loss": 0.7859992980957031, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.8918161392211914, "rewards/margins": 0.011762619018554688, "rewards/rejected": -0.9033931493759155, "step": 13 }, { "epoch": 0.007246376811594203, "grad_norm": 44.760677337646484, "learning_rate": 4.827586206896552e-07, "loss": 0.9120330810546875, "rewards/accuracies": 0.4296875, "rewards/chosen": -0.8293094635009766, "rewards/margins": -0.21861648559570312, "rewards/rejected": -0.6107501983642578, "step": 14 }, { "epoch": 0.007763975155279503, "grad_norm": 47.34134292602539, "learning_rate": 5.172413793103449e-07, "loss": 0.8465843200683594, "rewards/accuracies": 0.453125, "rewards/chosen": -0.8080534934997559, "rewards/margins": -0.08687400817871094, "rewards/rejected": -0.7210164070129395, "step": 15 }, { "epoch": 0.008281573498964804, "grad_norm": 39.59687423706055, "learning_rate": 5.517241379310345e-07, "loss": 0.8404617309570312, "rewards/accuracies": 0.46875, "rewards/chosen": -0.9085178375244141, "rewards/margins": -0.10288238525390625, "rewards/rejected": -0.8055275678634644, "step": 16 }, { "epoch": 0.008799171842650104, "grad_norm": 35.034393310546875, "learning_rate": 5.862068965517241e-07, "loss": 0.6821613311767578, "rewards/accuracies": 0.640625, "rewards/chosen": -0.6771728992462158, "rewards/margins": 0.29137706756591797, "rewards/rejected": -0.9687380790710449, "step": 17 }, { "epoch": 0.009316770186335404, "grad_norm": 33.379783630371094, "learning_rate": 6.206896551724139e-07, "loss": 0.6682834625244141, "rewards/accuracies": 0.578125, "rewards/chosen": -0.7273998856544495, "rewards/margins": 0.28704833984375, "rewards/rejected": -1.0143239498138428, "step": 18 }, { "epoch": 0.009834368530020704, "grad_norm": 26.55755615234375, "learning_rate": 6.551724137931036e-07, "loss": 0.6690101623535156, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.6442853808403015, "rewards/margins": 0.3290138244628906, "rewards/rejected": -0.9734554290771484, "step": 19 }, { "epoch": 0.010351966873706004, "grad_norm": 26.10276985168457, "learning_rate": 6.896551724137931e-07, "loss": 0.6066074371337891, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5577621459960938, "rewards/margins": 0.45133209228515625, "rewards/rejected": -1.0093493461608887, "step": 20 }, { "epoch": 0.010869565217391304, "grad_norm": 26.049327850341797, "learning_rate": 7.241379310344829e-07, "loss": 0.55633544921875, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.4684741497039795, "rewards/margins": 0.6337356567382812, "rewards/rejected": -1.1024508476257324, "step": 21 }, { "epoch": 0.011387163561076604, "grad_norm": 26.775634765625, "learning_rate": 7.586206896551725e-07, "loss": 0.6049613952636719, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6939582824707031, "rewards/margins": 0.46094512939453125, "rewards/rejected": -1.154770851135254, "step": 22 }, { "epoch": 0.011904761904761904, "grad_norm": 22.341524124145508, "learning_rate": 7.931034482758622e-07, "loss": 0.5002975463867188, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.21880149841308594, "rewards/margins": 0.8792877197265625, "rewards/rejected": -1.0983409881591797, "step": 23 }, { "epoch": 0.012422360248447204, "grad_norm": 15.25274658203125, "learning_rate": 8.275862068965518e-07, "loss": 0.386630654335022, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1032910943031311, "rewards/margins": 1.6306686401367188, "rewards/rejected": -1.5275182723999023, "step": 24 }, { "epoch": 0.012939958592132506, "grad_norm": 13.09037971496582, "learning_rate": 8.620689655172415e-07, "loss": 0.3648037910461426, "rewards/accuracies": 0.7890625, "rewards/chosen": 0.5113210678100586, "rewards/margins": 1.8884124755859375, "rewards/rejected": -1.376150131225586, "step": 25 }, { "epoch": 0.013457556935817806, "grad_norm": 13.49074649810791, "learning_rate": 8.965517241379311e-07, "loss": 0.38571280241012573, "rewards/accuracies": 0.8125, "rewards/chosen": 0.47081470489501953, "rewards/margins": 1.933868408203125, "rewards/rejected": -1.4627585411071777, "step": 26 }, { "epoch": 0.013975155279503106, "grad_norm": 13.919214248657227, "learning_rate": 9.310344827586208e-07, "loss": 0.36539316177368164, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.5168247222900391, "rewards/margins": 2.2050094604492188, "rewards/rejected": -1.6883668899536133, "step": 27 }, { "epoch": 0.014492753623188406, "grad_norm": 12.811761856079102, "learning_rate": 9.655172413793103e-07, "loss": 0.4134490489959717, "rewards/accuracies": 0.796875, "rewards/chosen": 0.6066207885742188, "rewards/margins": 1.9363021850585938, "rewards/rejected": -1.328992247581482, "step": 28 }, { "epoch": 0.015010351966873706, "grad_norm": 10.256165504455566, "learning_rate": 1.0000000000000002e-06, "loss": 0.2946581542491913, "rewards/accuracies": 0.890625, "rewards/chosen": 0.8624057769775391, "rewards/margins": 2.5798187255859375, "rewards/rejected": -1.7168846130371094, "step": 29 }, { "epoch": 0.015527950310559006, "grad_norm": 12.839567184448242, "learning_rate": 1.0344827586206898e-06, "loss": 0.37921789288520813, "rewards/accuracies": 0.796875, "rewards/chosen": 0.5605030059814453, "rewards/margins": 2.1115875244140625, "rewards/rejected": -1.5518708229064941, "step": 30 }, { "epoch": 0.016045548654244308, "grad_norm": 10.127750396728516, "learning_rate": 1.0689655172413794e-06, "loss": 0.3018431067466736, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.9873199462890625, "rewards/margins": 2.58941650390625, "rewards/rejected": -1.6019840240478516, "step": 31 }, { "epoch": 0.016563146997929608, "grad_norm": 10.548563957214355, "learning_rate": 1.103448275862069e-06, "loss": 0.2733133137226105, "rewards/accuracies": 0.875, "rewards/chosen": 1.0029640197753906, "rewards/margins": 2.910369873046875, "rewards/rejected": -1.9071698188781738, "step": 32 }, { "epoch": 0.017080745341614908, "grad_norm": 8.960685729980469, "learning_rate": 1.1379310344827587e-06, "loss": 0.24977490305900574, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4177436828613281, "rewards/margins": 3.6296844482421875, "rewards/rejected": -2.21282958984375, "step": 33 }, { "epoch": 0.017598343685300208, "grad_norm": 15.828512191772461, "learning_rate": 1.1724137931034483e-06, "loss": 0.3778592646121979, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7404170036315918, "rewards/margins": 4.0125274658203125, "rewards/rejected": -2.2714805603027344, "step": 34 }, { "epoch": 0.018115942028985508, "grad_norm": 12.7484130859375, "learning_rate": 1.2068965517241381e-06, "loss": 0.37811312079429626, "rewards/accuracies": 0.859375, "rewards/chosen": 2.5473480224609375, "rewards/margins": 4.65875244140625, "rewards/rejected": -2.111738920211792, "step": 35 }, { "epoch": 0.018633540372670808, "grad_norm": 12.850502014160156, "learning_rate": 1.2413793103448277e-06, "loss": 0.3069648742675781, "rewards/accuracies": 0.828125, "rewards/chosen": 2.566946029663086, "rewards/margins": 4.4907073974609375, "rewards/rejected": -1.9243345260620117, "step": 36 }, { "epoch": 0.019151138716356108, "grad_norm": 12.96895694732666, "learning_rate": 1.2758620689655174e-06, "loss": 0.33785420656204224, "rewards/accuracies": 0.8359375, "rewards/chosen": 2.5041112899780273, "rewards/margins": 4.652183532714844, "rewards/rejected": -2.147531509399414, "step": 37 }, { "epoch": 0.019668737060041408, "grad_norm": 13.604329109191895, "learning_rate": 1.3103448275862072e-06, "loss": 0.3025935888290405, "rewards/accuracies": 0.8828125, "rewards/chosen": 2.741917610168457, "rewards/margins": 4.6793670654296875, "rewards/rejected": -1.936326026916504, "step": 38 }, { "epoch": 0.020186335403726708, "grad_norm": 10.640780448913574, "learning_rate": 1.3448275862068966e-06, "loss": 0.28457096219062805, "rewards/accuracies": 0.828125, "rewards/chosen": 3.029025077819824, "rewards/margins": 4.707756042480469, "rewards/rejected": -1.6775760650634766, "step": 39 }, { "epoch": 0.020703933747412008, "grad_norm": 11.47895622253418, "learning_rate": 1.3793103448275862e-06, "loss": 0.32411718368530273, "rewards/accuracies": 0.8359375, "rewards/chosen": 2.909743309020996, "rewards/margins": 4.38018798828125, "rewards/rejected": -1.4700984954833984, "step": 40 }, { "epoch": 0.021221532091097308, "grad_norm": 10.991114616394043, "learning_rate": 1.413793103448276e-06, "loss": 0.26999741792678833, "rewards/accuracies": 0.890625, "rewards/chosen": 2.715801239013672, "rewards/margins": 4.490989685058594, "rewards/rejected": -1.7742102146148682, "step": 41 }, { "epoch": 0.021739130434782608, "grad_norm": 9.835562705993652, "learning_rate": 1.4482758620689657e-06, "loss": 0.2944469749927521, "rewards/accuracies": 0.8828125, "rewards/chosen": 2.964317798614502, "rewards/margins": 4.663124084472656, "rewards/rejected": -1.6984739303588867, "step": 42 }, { "epoch": 0.022256728778467908, "grad_norm": 11.343695640563965, "learning_rate": 1.4827586206896551e-06, "loss": 0.30344757437705994, "rewards/accuracies": 0.859375, "rewards/chosen": 2.230806350708008, "rewards/margins": 4.203956604003906, "rewards/rejected": -1.9737091064453125, "step": 43 }, { "epoch": 0.022774327122153208, "grad_norm": 10.302155494689941, "learning_rate": 1.517241379310345e-06, "loss": 0.261209636926651, "rewards/accuracies": 0.8515625, "rewards/chosen": 2.0937347412109375, "rewards/margins": 4.122032165527344, "rewards/rejected": -2.028186798095703, "step": 44 }, { "epoch": 0.023291925465838508, "grad_norm": 9.306882858276367, "learning_rate": 1.5517241379310346e-06, "loss": 0.23994797468185425, "rewards/accuracies": 0.875, "rewards/chosen": 2.254694938659668, "rewards/margins": 4.4925689697265625, "rewards/rejected": -2.2377147674560547, "step": 45 }, { "epoch": 0.023809523809523808, "grad_norm": 10.854596138000488, "learning_rate": 1.5862068965517244e-06, "loss": 0.3857841491699219, "rewards/accuracies": 0.8203125, "rewards/chosen": 1.614305019378662, "rewards/margins": 3.599822998046875, "rewards/rejected": -1.9858779907226562, "step": 46 }, { "epoch": 0.024327122153209108, "grad_norm": 8.835027694702148, "learning_rate": 1.6206896551724138e-06, "loss": 0.25571921467781067, "rewards/accuracies": 0.8671875, "rewards/chosen": 1.2926921844482422, "rewards/margins": 3.8629913330078125, "rewards/rejected": -2.5701956748962402, "step": 47 }, { "epoch": 0.024844720496894408, "grad_norm": 8.45661735534668, "learning_rate": 1.6551724137931037e-06, "loss": 0.19672498106956482, "rewards/accuracies": 0.90625, "rewards/chosen": 2.4632530212402344, "rewards/margins": 5.0066986083984375, "rewards/rejected": -2.5410542488098145, "step": 48 }, { "epoch": 0.025362318840579712, "grad_norm": 8.497535705566406, "learning_rate": 1.6896551724137933e-06, "loss": 0.28328895568847656, "rewards/accuracies": 0.859375, "rewards/chosen": 1.7526006698608398, "rewards/margins": 4.342864990234375, "rewards/rejected": -2.5891971588134766, "step": 49 }, { "epoch": 0.025879917184265012, "grad_norm": 8.405112266540527, "learning_rate": 1.724137931034483e-06, "loss": 0.2809831500053406, "rewards/accuracies": 0.828125, "rewards/chosen": 1.181169033050537, "rewards/margins": 3.98590087890625, "rewards/rejected": -2.8031864166259766, "step": 50 }, { "epoch": 0.026397515527950312, "grad_norm": 7.6570611000061035, "learning_rate": 1.7586206896551725e-06, "loss": 0.23254112899303436, "rewards/accuracies": 0.859375, "rewards/chosen": 1.404963493347168, "rewards/margins": 4.419586181640625, "rewards/rejected": -3.014293670654297, "step": 51 }, { "epoch": 0.026915113871635612, "grad_norm": 7.07657527923584, "learning_rate": 1.7931034482758622e-06, "loss": 0.20844317972660065, "rewards/accuracies": 0.875, "rewards/chosen": 1.3794879913330078, "rewards/margins": 4.466835021972656, "rewards/rejected": -3.0870285034179688, "step": 52 }, { "epoch": 0.027432712215320912, "grad_norm": 7.393006324768066, "learning_rate": 1.827586206896552e-06, "loss": 0.23888538777828217, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.930023193359375, "rewards/margins": 4.996307373046875, "rewards/rejected": -3.0675735473632812, "step": 53 }, { "epoch": 0.027950310559006212, "grad_norm": 7.192559242248535, "learning_rate": 1.8620689655172416e-06, "loss": 0.2699611186981201, "rewards/accuracies": 0.8359375, "rewards/chosen": 2.024496078491211, "rewards/margins": 4.945713043212891, "rewards/rejected": -2.921794891357422, "step": 54 }, { "epoch": 0.028467908902691512, "grad_norm": 6.613773345947266, "learning_rate": 1.896551724137931e-06, "loss": 0.22987009584903717, "rewards/accuracies": 0.890625, "rewards/chosen": 1.325765609741211, "rewards/margins": 4.534263610839844, "rewards/rejected": -3.2088022232055664, "step": 55 }, { "epoch": 0.028985507246376812, "grad_norm": 5.5472025871276855, "learning_rate": 1.9310344827586207e-06, "loss": 0.2156962901353836, "rewards/accuracies": 0.890625, "rewards/chosen": 1.5941896438598633, "rewards/margins": 4.903923034667969, "rewards/rejected": -3.30877685546875, "step": 56 }, { "epoch": 0.029503105590062112, "grad_norm": 7.1225409507751465, "learning_rate": 1.9655172413793105e-06, "loss": 0.2327854037284851, "rewards/accuracies": 0.859375, "rewards/chosen": 1.6421279907226562, "rewards/margins": 5.091796875, "rewards/rejected": -3.4467334747314453, "step": 57 }, { "epoch": 0.030020703933747412, "grad_norm": 7.794469356536865, "learning_rate": 2.0000000000000003e-06, "loss": 0.30076295137405396, "rewards/accuracies": 0.8359375, "rewards/chosen": 1.611318588256836, "rewards/margins": 4.837432861328125, "rewards/rejected": -3.2256813049316406, "step": 58 }, { "epoch": 0.030538302277432712, "grad_norm": 7.667142868041992, "learning_rate": 2.0344827586206897e-06, "loss": 0.23074333369731903, "rewards/accuracies": 0.8828125, "rewards/chosen": 2.6159400939941406, "rewards/margins": 6.105182647705078, "rewards/rejected": -3.487103223800659, "step": 59 }, { "epoch": 0.031055900621118012, "grad_norm": 5.354184150695801, "learning_rate": 2.0689655172413796e-06, "loss": 0.2220200002193451, "rewards/accuracies": 0.9140625, "rewards/chosen": 2.19027042388916, "rewards/margins": 5.570831298828125, "rewards/rejected": -3.381047487258911, "step": 60 }, { "epoch": 0.03157349896480331, "grad_norm": 11.22486686706543, "learning_rate": 2.103448275862069e-06, "loss": 0.32622677087783813, "rewards/accuracies": 0.8515625, "rewards/chosen": 1.8544044494628906, "rewards/margins": 5.73260498046875, "rewards/rejected": -3.8797593116760254, "step": 61 }, { "epoch": 0.032091097308488616, "grad_norm": 5.300561904907227, "learning_rate": 2.137931034482759e-06, "loss": 0.22179463505744934, "rewards/accuracies": 0.8828125, "rewards/chosen": 1.7433032989501953, "rewards/margins": 5.1464385986328125, "rewards/rejected": -3.4032440185546875, "step": 62 }, { "epoch": 0.03260869565217391, "grad_norm": 4.371739864349365, "learning_rate": 2.1724137931034482e-06, "loss": 0.16307851672172546, "rewards/accuracies": 0.9375, "rewards/chosen": 2.3416213989257812, "rewards/margins": 5.7353668212890625, "rewards/rejected": -3.3960189819335938, "step": 63 }, { "epoch": 0.033126293995859216, "grad_norm": 7.2413554191589355, "learning_rate": 2.206896551724138e-06, "loss": 0.28656789660453796, "rewards/accuracies": 0.84375, "rewards/chosen": 2.020063638687134, "rewards/margins": 5.401271820068359, "rewards/rejected": -3.381772041320801, "step": 64 }, { "epoch": 0.03364389233954451, "grad_norm": 6.977203369140625, "learning_rate": 2.241379310344828e-06, "loss": 0.21912862360477448, "rewards/accuracies": 0.8828125, "rewards/chosen": 2.175431728363037, "rewards/margins": 5.7516326904296875, "rewards/rejected": -3.5762176513671875, "step": 65 }, { "epoch": 0.034161490683229816, "grad_norm": 4.711480140686035, "learning_rate": 2.2758620689655173e-06, "loss": 0.2144620567560196, "rewards/accuracies": 0.890625, "rewards/chosen": 1.949575424194336, "rewards/margins": 5.216545104980469, "rewards/rejected": -3.2672901153564453, "step": 66 }, { "epoch": 0.03467908902691511, "grad_norm": 4.689819812774658, "learning_rate": 2.310344827586207e-06, "loss": 0.20214474201202393, "rewards/accuracies": 0.9140625, "rewards/chosen": 2.098992347717285, "rewards/margins": 5.3301239013671875, "rewards/rejected": -3.230680465698242, "step": 67 }, { "epoch": 0.035196687370600416, "grad_norm": 6.961520195007324, "learning_rate": 2.3448275862068966e-06, "loss": 0.2924998998641968, "rewards/accuracies": 0.8359375, "rewards/chosen": 2.20432186126709, "rewards/margins": 5.165321350097656, "rewards/rejected": -2.9615478515625, "step": 68 }, { "epoch": 0.03571428571428571, "grad_norm": 5.829561233520508, "learning_rate": 2.3793103448275864e-06, "loss": 0.23486930131912231, "rewards/accuracies": 0.8359375, "rewards/chosen": 2.69102144241333, "rewards/margins": 5.454460144042969, "rewards/rejected": -2.7629432678222656, "step": 69 }, { "epoch": 0.036231884057971016, "grad_norm": 6.91633415222168, "learning_rate": 2.4137931034482762e-06, "loss": 0.28765711188316345, "rewards/accuracies": 0.8203125, "rewards/chosen": 2.8162155151367188, "rewards/margins": 5.396453857421875, "rewards/rejected": -2.5793418884277344, "step": 70 }, { "epoch": 0.03674948240165631, "grad_norm": 4.960651874542236, "learning_rate": 2.4482758620689657e-06, "loss": 0.2830408811569214, "rewards/accuracies": 0.828125, "rewards/chosen": 2.8425827026367188, "rewards/margins": 5.101139068603516, "rewards/rejected": -2.2586593627929688, "step": 71 }, { "epoch": 0.037267080745341616, "grad_norm": 4.590120315551758, "learning_rate": 2.4827586206896555e-06, "loss": 0.22722870111465454, "rewards/accuracies": 0.8984375, "rewards/chosen": 2.9951343536376953, "rewards/margins": 5.406761169433594, "rewards/rejected": -2.411226272583008, "step": 72 }, { "epoch": 0.03778467908902691, "grad_norm": 4.352262496948242, "learning_rate": 2.517241379310345e-06, "loss": 0.2528707981109619, "rewards/accuracies": 0.890625, "rewards/chosen": 3.758772850036621, "rewards/margins": 5.431526184082031, "rewards/rejected": -1.6721410751342773, "step": 73 }, { "epoch": 0.038302277432712216, "grad_norm": 4.152829647064209, "learning_rate": 2.5517241379310347e-06, "loss": 0.19320592284202576, "rewards/accuracies": 0.8984375, "rewards/chosen": 4.060552597045898, "rewards/margins": 5.5336456298828125, "rewards/rejected": -1.4723625183105469, "step": 74 }, { "epoch": 0.03881987577639751, "grad_norm": 5.311220169067383, "learning_rate": 2.5862068965517246e-06, "loss": 0.25355514883995056, "rewards/accuracies": 0.8515625, "rewards/chosen": 3.590068817138672, "rewards/margins": 5.038959503173828, "rewards/rejected": -1.4488134384155273, "step": 75 }, { "epoch": 0.039337474120082816, "grad_norm": 3.8752493858337402, "learning_rate": 2.6206896551724144e-06, "loss": 0.24533961713314056, "rewards/accuracies": 0.8828125, "rewards/chosen": 3.8125228881835938, "rewards/margins": 5.119049072265625, "rewards/rejected": -1.307586669921875, "step": 76 }, { "epoch": 0.03985507246376811, "grad_norm": 4.50554084777832, "learning_rate": 2.6551724137931034e-06, "loss": 0.25242823362350464, "rewards/accuracies": 0.875, "rewards/chosen": 3.328738212585449, "rewards/margins": 4.302162170410156, "rewards/rejected": -0.9732961654663086, "step": 77 }, { "epoch": 0.040372670807453416, "grad_norm": 3.6948342323303223, "learning_rate": 2.6896551724137932e-06, "loss": 0.23185782134532928, "rewards/accuracies": 0.8828125, "rewards/chosen": 3.4577343463897705, "rewards/margins": 4.520668029785156, "rewards/rejected": -1.0631942749023438, "step": 78 }, { "epoch": 0.04089026915113872, "grad_norm": 4.421131610870361, "learning_rate": 2.724137931034483e-06, "loss": 0.2255813181400299, "rewards/accuracies": 0.8828125, "rewards/chosen": 3.026304244995117, "rewards/margins": 4.0570526123046875, "rewards/rejected": -1.031341552734375, "step": 79 }, { "epoch": 0.041407867494824016, "grad_norm": 4.664706707000732, "learning_rate": 2.7586206896551725e-06, "loss": 0.27192920446395874, "rewards/accuracies": 0.859375, "rewards/chosen": 3.1169862747192383, "rewards/margins": 3.790935516357422, "rewards/rejected": -0.6747136116027832, "step": 80 }, { "epoch": 0.04192546583850932, "grad_norm": 3.4651546478271484, "learning_rate": 2.7931034482758623e-06, "loss": 0.2195420265197754, "rewards/accuracies": 0.8984375, "rewards/chosen": 3.5071935653686523, "rewards/margins": 4.310028076171875, "rewards/rejected": -0.8003101348876953, "step": 81 }, { "epoch": 0.042443064182194616, "grad_norm": 3.620100736618042, "learning_rate": 2.827586206896552e-06, "loss": 0.25485241413116455, "rewards/accuracies": 0.8828125, "rewards/chosen": 4.055027008056641, "rewards/margins": 4.566795349121094, "rewards/rejected": -0.5107104778289795, "step": 82 }, { "epoch": 0.04296066252587992, "grad_norm": 3.5596368312835693, "learning_rate": 2.8620689655172416e-06, "loss": 0.21502657234668732, "rewards/accuracies": 0.8828125, "rewards/chosen": 4.326511383056641, "rewards/margins": 4.809326171875, "rewards/rejected": -0.48273658752441406, "step": 83 }, { "epoch": 0.043478260869565216, "grad_norm": 4.525784492492676, "learning_rate": 2.8965517241379314e-06, "loss": 0.25564074516296387, "rewards/accuracies": 0.859375, "rewards/chosen": 4.023506164550781, "rewards/margins": 4.350673675537109, "rewards/rejected": -0.3275279998779297, "step": 84 }, { "epoch": 0.04399585921325052, "grad_norm": 4.533627986907959, "learning_rate": 2.931034482758621e-06, "loss": 0.1834975779056549, "rewards/accuracies": 0.90625, "rewards/chosen": 5.082733154296875, "rewards/margins": 5.396587371826172, "rewards/rejected": -0.3152717351913452, "step": 85 }, { "epoch": 0.044513457556935816, "grad_norm": 3.375351905822754, "learning_rate": 2.9655172413793102e-06, "loss": 0.2103222757577896, "rewards/accuracies": 0.90625, "rewards/chosen": 5.308265686035156, "rewards/margins": 5.089984893798828, "rewards/rejected": 0.21733355522155762, "step": 86 }, { "epoch": 0.04503105590062112, "grad_norm": 7.328167915344238, "learning_rate": 3e-06, "loss": 0.4192392826080322, "rewards/accuracies": 0.796875, "rewards/chosen": 4.825883865356445, "rewards/margins": 4.3303375244140625, "rewards/rejected": 0.4957275390625, "step": 87 }, { "epoch": 0.045548654244306416, "grad_norm": 4.116085529327393, "learning_rate": 3.03448275862069e-06, "loss": 0.20810335874557495, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.591590881347656, "rewards/margins": 5.3447723388671875, "rewards/rejected": 0.24741601943969727, "step": 88 }, { "epoch": 0.04606625258799172, "grad_norm": 5.2743120193481445, "learning_rate": 3.0689655172413797e-06, "loss": 0.279384970664978, "rewards/accuracies": 0.8359375, "rewards/chosen": 6.911870956420898, "rewards/margins": 6.380683898925781, "rewards/rejected": 0.5300607681274414, "step": 89 }, { "epoch": 0.046583850931677016, "grad_norm": 5.964112281799316, "learning_rate": 3.103448275862069e-06, "loss": 0.29090574383735657, "rewards/accuracies": 0.84375, "rewards/chosen": 6.682191848754883, "rewards/margins": 6.1328887939453125, "rewards/rejected": 0.5474948883056641, "step": 90 }, { "epoch": 0.04710144927536232, "grad_norm": 4.039856910705566, "learning_rate": 3.137931034482759e-06, "loss": 0.2368823140859604, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.574067115783691, "rewards/margins": 6.821388244628906, "rewards/rejected": 0.7526569366455078, "step": 91 }, { "epoch": 0.047619047619047616, "grad_norm": 4.16628885269165, "learning_rate": 3.172413793103449e-06, "loss": 0.23034977912902832, "rewards/accuracies": 0.9140625, "rewards/chosen": 7.429656505584717, "rewards/margins": 6.7918701171875, "rewards/rejected": 0.6374797821044922, "step": 92 }, { "epoch": 0.04813664596273292, "grad_norm": 3.9315900802612305, "learning_rate": 3.206896551724138e-06, "loss": 0.2369752675294876, "rewards/accuracies": 0.875, "rewards/chosen": 6.609086990356445, "rewards/margins": 6.355995178222656, "rewards/rejected": 0.2534351348876953, "step": 93 }, { "epoch": 0.048654244306418216, "grad_norm": 4.366872310638428, "learning_rate": 3.2413793103448277e-06, "loss": 0.24396899342536926, "rewards/accuracies": 0.875, "rewards/chosen": 5.969452857971191, "rewards/margins": 6.086357593536377, "rewards/rejected": -0.11756908893585205, "step": 94 }, { "epoch": 0.04917184265010352, "grad_norm": 5.242311477661133, "learning_rate": 3.2758620689655175e-06, "loss": 0.32999032735824585, "rewards/accuracies": 0.828125, "rewards/chosen": 5.597328186035156, "rewards/margins": 5.2629241943359375, "rewards/rejected": 0.33338499069213867, "step": 95 }, { "epoch": 0.049689440993788817, "grad_norm": 4.329901695251465, "learning_rate": 3.3103448275862073e-06, "loss": 0.2847708463668823, "rewards/accuracies": 0.8359375, "rewards/chosen": 5.744209289550781, "rewards/margins": 5.709251403808594, "rewards/rejected": 0.035422325134277344, "step": 96 }, { "epoch": 0.05020703933747412, "grad_norm": 2.951371431350708, "learning_rate": 3.3448275862068967e-06, "loss": 0.22503235936164856, "rewards/accuracies": 0.859375, "rewards/chosen": 6.126384258270264, "rewards/margins": 5.926750183105469, "rewards/rejected": 0.1996205449104309, "step": 97 }, { "epoch": 0.050724637681159424, "grad_norm": 2.794062852859497, "learning_rate": 3.3793103448275866e-06, "loss": 0.20629601180553436, "rewards/accuracies": 0.8828125, "rewards/chosen": 6.31500244140625, "rewards/margins": 6.464008331298828, "rewards/rejected": -0.14941120147705078, "step": 98 }, { "epoch": 0.05124223602484472, "grad_norm": 3.470243453979492, "learning_rate": 3.4137931034482764e-06, "loss": 0.25665509700775146, "rewards/accuracies": 0.8515625, "rewards/chosen": 5.20316219329834, "rewards/margins": 5.103030204772949, "rewards/rejected": 0.10032773017883301, "step": 99 }, { "epoch": 0.051759834368530024, "grad_norm": 4.312779903411865, "learning_rate": 3.448275862068966e-06, "loss": 0.22987432777881622, "rewards/accuracies": 0.8828125, "rewards/chosen": 5.993282318115234, "rewards/margins": 5.828159332275391, "rewards/rejected": 0.16502141952514648, "step": 100 }, { "epoch": 0.05227743271221532, "grad_norm": 3.295339345932007, "learning_rate": 3.4827586206896552e-06, "loss": 0.2707698941230774, "rewards/accuracies": 0.828125, "rewards/chosen": 4.922730922698975, "rewards/margins": 4.858928680419922, "rewards/rejected": 0.061961352825164795, "step": 101 }, { "epoch": 0.052795031055900624, "grad_norm": 3.5643320083618164, "learning_rate": 3.517241379310345e-06, "loss": 0.24709820747375488, "rewards/accuracies": 0.8515625, "rewards/chosen": 6.040592193603516, "rewards/margins": 6.137733459472656, "rewards/rejected": -0.09538745880126953, "step": 102 }, { "epoch": 0.05331262939958592, "grad_norm": 3.11250638961792, "learning_rate": 3.5517241379310345e-06, "loss": 0.19388288259506226, "rewards/accuracies": 0.90625, "rewards/chosen": 6.15880012512207, "rewards/margins": 6.119895935058594, "rewards/rejected": 0.03891921043395996, "step": 103 }, { "epoch": 0.053830227743271224, "grad_norm": 4.298996448516846, "learning_rate": 3.5862068965517243e-06, "loss": 0.22563359141349792, "rewards/accuracies": 0.8984375, "rewards/chosen": 6.855976104736328, "rewards/margins": 6.4120635986328125, "rewards/rejected": 0.44263482093811035, "step": 104 }, { "epoch": 0.05434782608695652, "grad_norm": 4.5972514152526855, "learning_rate": 3.620689655172414e-06, "loss": 0.24349021911621094, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.379327297210693, "rewards/margins": 7.028190612792969, "rewards/rejected": 0.35115838050842285, "step": 105 }, { "epoch": 0.054865424430641824, "grad_norm": 3.959170341491699, "learning_rate": 3.655172413793104e-06, "loss": 0.21213866770267487, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.830653190612793, "rewards/margins": 7.196240425109863, "rewards/rejected": 0.6352625489234924, "step": 106 }, { "epoch": 0.05538302277432712, "grad_norm": 2.844444751739502, "learning_rate": 3.6896551724137934e-06, "loss": 0.2215038686990738, "rewards/accuracies": 0.875, "rewards/chosen": 8.133943557739258, "rewards/margins": 7.763607025146484, "rewards/rejected": 0.3690147399902344, "step": 107 }, { "epoch": 0.055900621118012424, "grad_norm": 4.546253681182861, "learning_rate": 3.7241379310344832e-06, "loss": 0.21082745492458344, "rewards/accuracies": 0.8671875, "rewards/chosen": 8.51050090789795, "rewards/margins": 7.930381774902344, "rewards/rejected": 0.5794758796691895, "step": 108 }, { "epoch": 0.05641821946169772, "grad_norm": 3.4497928619384766, "learning_rate": 3.7586206896551727e-06, "loss": 0.2306499034166336, "rewards/accuracies": 0.875, "rewards/chosen": 8.696076393127441, "rewards/margins": 8.107215881347656, "rewards/rejected": 0.5862927436828613, "step": 109 }, { "epoch": 0.056935817805383024, "grad_norm": 4.145035266876221, "learning_rate": 3.793103448275862e-06, "loss": 0.2138179987668991, "rewards/accuracies": 0.875, "rewards/chosen": 9.017444610595703, "rewards/margins": 8.854373931884766, "rewards/rejected": 0.16342544555664062, "step": 110 }, { "epoch": 0.05745341614906832, "grad_norm": 5.57577657699585, "learning_rate": 3.827586206896552e-06, "loss": 0.29666340351104736, "rewards/accuracies": 0.8515625, "rewards/chosen": 8.207283020019531, "rewards/margins": 7.9043731689453125, "rewards/rejected": 0.30182576179504395, "step": 111 }, { "epoch": 0.057971014492753624, "grad_norm": 4.1178789138793945, "learning_rate": 3.862068965517241e-06, "loss": 0.25105446577072144, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.980827331542969, "rewards/margins": 7.55621337890625, "rewards/rejected": 0.423738956451416, "step": 112 }, { "epoch": 0.05848861283643892, "grad_norm": 3.5516297817230225, "learning_rate": 3.896551724137932e-06, "loss": 0.2023918628692627, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.101219177246094, "rewards/margins": 8.315422058105469, "rewards/rejected": -0.21261024475097656, "step": 113 }, { "epoch": 0.059006211180124224, "grad_norm": 3.791825294494629, "learning_rate": 3.931034482758621e-06, "loss": 0.23399779200553894, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.14630126953125, "rewards/margins": 7.268451690673828, "rewards/rejected": -0.12101554870605469, "step": 114 }, { "epoch": 0.05952380952380952, "grad_norm": 4.820603847503662, "learning_rate": 3.96551724137931e-06, "loss": 0.2700290083885193, "rewards/accuracies": 0.8515625, "rewards/chosen": 6.405097961425781, "rewards/margins": 6.654510498046875, "rewards/rejected": -0.25106334686279297, "step": 115 }, { "epoch": 0.060041407867494824, "grad_norm": 2.830296754837036, "learning_rate": 4.000000000000001e-06, "loss": 0.19523203372955322, "rewards/accuracies": 0.8828125, "rewards/chosen": 5.746395111083984, "rewards/margins": 6.110160827636719, "rewards/rejected": -0.36402416229248047, "step": 116 }, { "epoch": 0.06055900621118013, "grad_norm": 3.363717794418335, "learning_rate": 4.03448275862069e-06, "loss": 0.26650330424308777, "rewards/accuracies": 0.859375, "rewards/chosen": 4.6106390953063965, "rewards/margins": 5.133277893066406, "rewards/rejected": -0.5250282287597656, "step": 117 }, { "epoch": 0.061076604554865424, "grad_norm": 3.708811044692993, "learning_rate": 4.0689655172413795e-06, "loss": 0.2535266876220703, "rewards/accuracies": 0.859375, "rewards/chosen": 4.349265098571777, "rewards/margins": 4.5873823165893555, "rewards/rejected": -0.23874986171722412, "step": 118 }, { "epoch": 0.06159420289855073, "grad_norm": 4.413633823394775, "learning_rate": 4.103448275862069e-06, "loss": 0.2510981857776642, "rewards/accuracies": 0.90625, "rewards/chosen": 3.875882625579834, "rewards/margins": 4.293292999267578, "rewards/rejected": -0.4181241989135742, "step": 119 }, { "epoch": 0.062111801242236024, "grad_norm": 2.7525200843811035, "learning_rate": 4.137931034482759e-06, "loss": 0.20319271087646484, "rewards/accuracies": 0.8984375, "rewards/chosen": 4.161672115325928, "rewards/margins": 4.218410491943359, "rewards/rejected": -0.05804473161697388, "step": 120 }, { "epoch": 0.06262939958592133, "grad_norm": 3.0021684169769287, "learning_rate": 4.1724137931034486e-06, "loss": 0.1342526376247406, "rewards/accuracies": 0.9453125, "rewards/chosen": 4.430843830108643, "rewards/margins": 4.7910308837890625, "rewards/rejected": -0.36023521423339844, "step": 121 }, { "epoch": 0.06314699792960662, "grad_norm": 3.2003989219665527, "learning_rate": 4.206896551724138e-06, "loss": 0.20217397809028625, "rewards/accuracies": 0.890625, "rewards/chosen": 4.996942043304443, "rewards/margins": 4.937568664550781, "rewards/rejected": 0.058243632316589355, "step": 122 }, { "epoch": 0.06366459627329192, "grad_norm": 3.0498130321502686, "learning_rate": 4.241379310344828e-06, "loss": 0.1778711974620819, "rewards/accuracies": 0.921875, "rewards/chosen": 5.921104431152344, "rewards/margins": 6.267280578613281, "rewards/rejected": -0.34682369232177734, "step": 123 }, { "epoch": 0.06418219461697723, "grad_norm": 3.9329278469085693, "learning_rate": 4.275862068965518e-06, "loss": 0.2734295129776001, "rewards/accuracies": 0.859375, "rewards/chosen": 7.526716232299805, "rewards/margins": 6.675483703613281, "rewards/rejected": 0.8511772155761719, "step": 124 }, { "epoch": 0.06469979296066253, "grad_norm": 8.391997337341309, "learning_rate": 4.310344827586207e-06, "loss": 0.37814921140670776, "rewards/accuracies": 0.84375, "rewards/chosen": 8.395095825195312, "rewards/margins": 7.69207763671875, "rewards/rejected": 0.7040977478027344, "step": 125 }, { "epoch": 0.06521739130434782, "grad_norm": 5.7145843505859375, "learning_rate": 4.3448275862068965e-06, "loss": 0.22430801391601562, "rewards/accuracies": 0.890625, "rewards/chosen": 9.65426254272461, "rewards/margins": 8.611358642578125, "rewards/rejected": 1.0406208038330078, "step": 126 }, { "epoch": 0.06573498964803312, "grad_norm": 3.248743772506714, "learning_rate": 4.379310344827587e-06, "loss": 0.243032768368721, "rewards/accuracies": 0.84375, "rewards/chosen": 6.772552490234375, "rewards/margins": 6.3575286865234375, "rewards/rejected": 0.414020299911499, "step": 127 }, { "epoch": 0.06625258799171843, "grad_norm": 3.3589351177215576, "learning_rate": 4.413793103448276e-06, "loss": 0.2956920266151428, "rewards/accuracies": 0.859375, "rewards/chosen": 5.922929763793945, "rewards/margins": 5.614051818847656, "rewards/rejected": 0.30785226821899414, "step": 128 }, { "epoch": 0.06677018633540373, "grad_norm": 3.2474303245544434, "learning_rate": 4.4482758620689656e-06, "loss": 0.2243586927652359, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.0830078125, "rewards/margins": 6.5447540283203125, "rewards/rejected": 0.538588285446167, "step": 129 }, { "epoch": 0.06728778467908902, "grad_norm": 2.9475674629211426, "learning_rate": 4.482758620689656e-06, "loss": 0.22349676489830017, "rewards/accuracies": 0.90625, "rewards/chosen": 5.273468017578125, "rewards/margins": 5.259525299072266, "rewards/rejected": 0.01402735710144043, "step": 130 }, { "epoch": 0.06780538302277432, "grad_norm": 3.580512762069702, "learning_rate": 4.517241379310345e-06, "loss": 0.30566564202308655, "rewards/accuracies": 0.8671875, "rewards/chosen": 4.425535678863525, "rewards/margins": 4.646553039550781, "rewards/rejected": -0.2220740020275116, "step": 131 }, { "epoch": 0.06832298136645963, "grad_norm": 2.818904161453247, "learning_rate": 4.551724137931035e-06, "loss": 0.16598084568977356, "rewards/accuracies": 0.953125, "rewards/chosen": 4.5356903076171875, "rewards/margins": 4.697669982910156, "rewards/rejected": -0.16407464444637299, "step": 132 }, { "epoch": 0.06884057971014493, "grad_norm": 2.31406831741333, "learning_rate": 4.586206896551724e-06, "loss": 0.22995474934577942, "rewards/accuracies": 0.859375, "rewards/chosen": 4.023626327514648, "rewards/margins": 4.2607269287109375, "rewards/rejected": -0.23825621604919434, "step": 133 }, { "epoch": 0.06935817805383022, "grad_norm": 2.6001508235931396, "learning_rate": 4.620689655172414e-06, "loss": 0.2408226728439331, "rewards/accuracies": 0.921875, "rewards/chosen": 3.886568069458008, "rewards/margins": 3.9985313415527344, "rewards/rejected": -0.111572265625, "step": 134 }, { "epoch": 0.06987577639751552, "grad_norm": 3.5911591053009033, "learning_rate": 4.655172413793104e-06, "loss": 0.259224534034729, "rewards/accuracies": 0.875, "rewards/chosen": 4.475061416625977, "rewards/margins": 4.553377151489258, "rewards/rejected": -0.0790863037109375, "step": 135 }, { "epoch": 0.07039337474120083, "grad_norm": 2.9334218502044678, "learning_rate": 4.689655172413793e-06, "loss": 0.22279417514801025, "rewards/accuracies": 0.8515625, "rewards/chosen": 4.295520782470703, "rewards/margins": 4.484519958496094, "rewards/rejected": -0.18933868408203125, "step": 136 }, { "epoch": 0.07091097308488613, "grad_norm": 3.0323708057403564, "learning_rate": 4.724137931034483e-06, "loss": 0.19944319128990173, "rewards/accuracies": 0.890625, "rewards/chosen": 4.774393081665039, "rewards/margins": 4.969047546386719, "rewards/rejected": -0.1945204734802246, "step": 137 }, { "epoch": 0.07142857142857142, "grad_norm": 2.4684181213378906, "learning_rate": 4.758620689655173e-06, "loss": 0.19854456186294556, "rewards/accuracies": 0.8828125, "rewards/chosen": 4.964786529541016, "rewards/margins": 5.056251525878906, "rewards/rejected": -0.09120655059814453, "step": 138 }, { "epoch": 0.07194616977225674, "grad_norm": 2.1721370220184326, "learning_rate": 4.793103448275862e-06, "loss": 0.15876822173595428, "rewards/accuracies": 0.921875, "rewards/chosen": 5.395758628845215, "rewards/margins": 5.4211883544921875, "rewards/rejected": -0.024718046188354492, "step": 139 }, { "epoch": 0.07246376811594203, "grad_norm": 3.001483917236328, "learning_rate": 4.8275862068965525e-06, "loss": 0.20648987591266632, "rewards/accuracies": 0.921875, "rewards/chosen": 5.93328857421875, "rewards/margins": 5.7122955322265625, "rewards/rejected": 0.2214984893798828, "step": 140 }, { "epoch": 0.07298136645962733, "grad_norm": 3.5151872634887695, "learning_rate": 4.862068965517242e-06, "loss": 0.28198397159576416, "rewards/accuracies": 0.8125, "rewards/chosen": 4.738870620727539, "rewards/margins": 4.6703338623046875, "rewards/rejected": 0.07021713256835938, "step": 141 }, { "epoch": 0.07349896480331262, "grad_norm": 2.5221495628356934, "learning_rate": 4.896551724137931e-06, "loss": 0.14456534385681152, "rewards/accuracies": 0.9296875, "rewards/chosen": 7.656902313232422, "rewards/margins": 7.442535400390625, "rewards/rejected": 0.21190738677978516, "step": 142 }, { "epoch": 0.07401656314699794, "grad_norm": 4.747554779052734, "learning_rate": 4.931034482758621e-06, "loss": 0.24441996216773987, "rewards/accuracies": 0.9140625, "rewards/chosen": 7.390690803527832, "rewards/margins": 6.976676940917969, "rewards/rejected": 0.41434478759765625, "step": 143 }, { "epoch": 0.07453416149068323, "grad_norm": 5.073159694671631, "learning_rate": 4.965517241379311e-06, "loss": 0.24412517249584198, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.693696975708008, "rewards/margins": 7.130706787109375, "rewards/rejected": 0.5635738372802734, "step": 144 }, { "epoch": 0.07505175983436853, "grad_norm": 3.858149766921997, "learning_rate": 5e-06, "loss": 0.2046666145324707, "rewards/accuracies": 0.9140625, "rewards/chosen": 8.284231185913086, "rewards/margins": 7.5475311279296875, "rewards/rejected": 0.736842155456543, "step": 145 }, { "epoch": 0.07556935817805382, "grad_norm": 2.562715530395508, "learning_rate": 5.03448275862069e-06, "loss": 0.12593409419059753, "rewards/accuracies": 0.953125, "rewards/chosen": 8.647171020507812, "rewards/margins": 8.144027709960938, "rewards/rejected": 0.5023422241210938, "step": 146 }, { "epoch": 0.07608695652173914, "grad_norm": 3.9407992362976074, "learning_rate": 5.06896551724138e-06, "loss": 0.1957118809223175, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.409391403198242, "rewards/margins": 7.8927764892578125, "rewards/rejected": 0.5171413421630859, "step": 147 }, { "epoch": 0.07660455486542443, "grad_norm": 3.80920672416687, "learning_rate": 5.1034482758620695e-06, "loss": 0.2024538218975067, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.07573413848877, "rewards/margins": 9.070404052734375, "rewards/rejected": 0.0074748992919921875, "step": 148 }, { "epoch": 0.07712215320910973, "grad_norm": 4.323078632354736, "learning_rate": 5.137931034482759e-06, "loss": 0.21427389979362488, "rewards/accuracies": 0.890625, "rewards/chosen": 7.934221267700195, "rewards/margins": 8.16440200805664, "rewards/rejected": -0.23260784149169922, "step": 149 }, { "epoch": 0.07763975155279502, "grad_norm": 3.627352476119995, "learning_rate": 5.172413793103449e-06, "loss": 0.1888948380947113, "rewards/accuracies": 0.890625, "rewards/chosen": 7.843244552612305, "rewards/margins": 8.23651123046875, "rewards/rejected": -0.39284276962280273, "step": 150 }, { "epoch": 0.07815734989648034, "grad_norm": 2.7934701442718506, "learning_rate": 5.206896551724139e-06, "loss": 0.20681074261665344, "rewards/accuracies": 0.890625, "rewards/chosen": 7.290185928344727, "rewards/margins": 8.136085510253906, "rewards/rejected": -0.8455495834350586, "step": 151 }, { "epoch": 0.07867494824016563, "grad_norm": 3.117034435272217, "learning_rate": 5.241379310344829e-06, "loss": 0.21005499362945557, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.436241149902344, "rewards/margins": 8.314788818359375, "rewards/rejected": -0.8792552947998047, "step": 152 }, { "epoch": 0.07919254658385093, "grad_norm": 3.0545687675476074, "learning_rate": 5.275862068965518e-06, "loss": 0.2219194620847702, "rewards/accuracies": 0.890625, "rewards/chosen": 6.950272560119629, "rewards/margins": 7.7178955078125, "rewards/rejected": -0.7694873809814453, "step": 153 }, { "epoch": 0.07971014492753623, "grad_norm": 2.4228034019470215, "learning_rate": 5.310344827586207e-06, "loss": 0.15850505232810974, "rewards/accuracies": 0.9375, "rewards/chosen": 6.484825134277344, "rewards/margins": 7.4555511474609375, "rewards/rejected": -0.9709720611572266, "step": 154 }, { "epoch": 0.08022774327122154, "grad_norm": 3.518083095550537, "learning_rate": 5.344827586206896e-06, "loss": 0.27404341101646423, "rewards/accuracies": 0.828125, "rewards/chosen": 6.0443267822265625, "rewards/margins": 6.940288543701172, "rewards/rejected": -0.8969223499298096, "step": 155 }, { "epoch": 0.08074534161490683, "grad_norm": 4.202250957489014, "learning_rate": 5.3793103448275865e-06, "loss": 0.2870408594608307, "rewards/accuracies": 0.875, "rewards/chosen": 5.900764465332031, "rewards/margins": 6.813152313232422, "rewards/rejected": -0.9099369049072266, "step": 156 }, { "epoch": 0.08126293995859213, "grad_norm": 3.3425400257110596, "learning_rate": 5.413793103448276e-06, "loss": 0.21553939580917358, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.892276763916016, "rewards/margins": 6.5855560302734375, "rewards/rejected": -0.6924905776977539, "step": 157 }, { "epoch": 0.08178053830227744, "grad_norm": 3.5685362815856934, "learning_rate": 5.448275862068966e-06, "loss": 0.2510266900062561, "rewards/accuracies": 0.859375, "rewards/chosen": 4.818975448608398, "rewards/margins": 5.492988586425781, "rewards/rejected": -0.6742821335792542, "step": 158 }, { "epoch": 0.08229813664596274, "grad_norm": 2.6757326126098633, "learning_rate": 5.4827586206896556e-06, "loss": 0.22104737162590027, "rewards/accuracies": 0.859375, "rewards/chosen": 4.149998664855957, "rewards/margins": 4.717832565307617, "rewards/rejected": -0.5668549537658691, "step": 159 }, { "epoch": 0.08281573498964803, "grad_norm": 2.3128397464752197, "learning_rate": 5.517241379310345e-06, "loss": 0.29207995533943176, "rewards/accuracies": 0.8515625, "rewards/chosen": 3.3240222930908203, "rewards/margins": 3.7563323974609375, "rewards/rejected": -0.43264293670654297, "step": 160 }, { "epoch": 0.08333333333333333, "grad_norm": 2.0929207801818848, "learning_rate": 5.551724137931035e-06, "loss": 0.16451774537563324, "rewards/accuracies": 0.9296875, "rewards/chosen": 4.102639198303223, "rewards/margins": 4.611797332763672, "rewards/rejected": -0.5086708068847656, "step": 161 }, { "epoch": 0.08385093167701864, "grad_norm": 1.9980943202972412, "learning_rate": 5.586206896551725e-06, "loss": 0.18911148607730865, "rewards/accuracies": 0.9296875, "rewards/chosen": 4.472187042236328, "rewards/margins": 4.848907470703125, "rewards/rejected": -0.3773655891418457, "step": 162 }, { "epoch": 0.08436853002070394, "grad_norm": 2.3767037391662598, "learning_rate": 5.620689655172414e-06, "loss": 0.17838048934936523, "rewards/accuracies": 0.890625, "rewards/chosen": 5.044076919555664, "rewards/margins": 5.069732666015625, "rewards/rejected": -0.025303781032562256, "step": 163 }, { "epoch": 0.08488612836438923, "grad_norm": 2.7127199172973633, "learning_rate": 5.655172413793104e-06, "loss": 0.16401441395282745, "rewards/accuracies": 0.9375, "rewards/chosen": 6.158172607421875, "rewards/margins": 6.071739196777344, "rewards/rejected": 0.08669376373291016, "step": 164 }, { "epoch": 0.08540372670807453, "grad_norm": 3.8918888568878174, "learning_rate": 5.689655172413794e-06, "loss": 0.2132033407688141, "rewards/accuracies": 0.9140625, "rewards/chosen": 8.645675659179688, "rewards/margins": 7.831062316894531, "rewards/rejected": 0.8161388039588928, "step": 165 }, { "epoch": 0.08592132505175984, "grad_norm": 5.050085067749023, "learning_rate": 5.724137931034483e-06, "loss": 0.27192097902297974, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.603803634643555, "rewards/margins": 9.184959411621094, "rewards/rejected": 1.422576904296875, "step": 166 }, { "epoch": 0.08643892339544514, "grad_norm": 4.419585227966309, "learning_rate": 5.758620689655173e-06, "loss": 0.2242995649576187, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.686050415039062, "rewards/margins": 9.754470825195312, "rewards/rejected": 0.9311666488647461, "step": 167 }, { "epoch": 0.08695652173913043, "grad_norm": 5.062690734863281, "learning_rate": 5.793103448275863e-06, "loss": 0.19873052835464478, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.556732177734375, "rewards/margins": 10.892166137695312, "rewards/rejected": 1.665597915649414, "step": 168 }, { "epoch": 0.08747412008281573, "grad_norm": 9.362822532653809, "learning_rate": 5.827586206896553e-06, "loss": 0.27822431921958923, "rewards/accuracies": 0.8671875, "rewards/chosen": 9.131708145141602, "rewards/margins": 8.010162353515625, "rewards/rejected": 1.1231460571289062, "step": 169 }, { "epoch": 0.08799171842650104, "grad_norm": 10.091778755187988, "learning_rate": 5.862068965517242e-06, "loss": 0.23418116569519043, "rewards/accuracies": 0.90625, "rewards/chosen": 9.61709213256836, "rewards/margins": 8.599853515625, "rewards/rejected": 1.0148515701293945, "step": 170 }, { "epoch": 0.08850931677018634, "grad_norm": 7.210886001586914, "learning_rate": 5.896551724137931e-06, "loss": 0.25554293394088745, "rewards/accuracies": 0.8671875, "rewards/chosen": 8.279373168945312, "rewards/margins": 7.4268646240234375, "rewards/rejected": 0.8523902893066406, "step": 171 }, { "epoch": 0.08902691511387163, "grad_norm": 7.06812858581543, "learning_rate": 5.9310344827586205e-06, "loss": 0.2737012505531311, "rewards/accuracies": 0.859375, "rewards/chosen": 7.133033752441406, "rewards/margins": 6.920867919921875, "rewards/rejected": 0.21203231811523438, "step": 172 }, { "epoch": 0.08954451345755693, "grad_norm": 3.772188901901245, "learning_rate": 5.965517241379311e-06, "loss": 0.2246614247560501, "rewards/accuracies": 0.8671875, "rewards/chosen": 6.845235824584961, "rewards/margins": 6.300140380859375, "rewards/rejected": 0.5446624755859375, "step": 173 }, { "epoch": 0.09006211180124224, "grad_norm": 3.0162353515625, "learning_rate": 6e-06, "loss": 0.24408206343650818, "rewards/accuracies": 0.875, "rewards/chosen": 4.56376314163208, "rewards/margins": 4.6045379638671875, "rewards/rejected": -0.04137682914733887, "step": 174 }, { "epoch": 0.09057971014492754, "grad_norm": 2.3115155696868896, "learning_rate": 6.03448275862069e-06, "loss": 0.1931992471218109, "rewards/accuracies": 0.8515625, "rewards/chosen": 4.520101547241211, "rewards/margins": 4.496494293212891, "rewards/rejected": 0.02373790740966797, "step": 175 }, { "epoch": 0.09109730848861283, "grad_norm": 3.0156493186950684, "learning_rate": 6.06896551724138e-06, "loss": 0.21788984537124634, "rewards/accuracies": 0.8984375, "rewards/chosen": 4.451257705688477, "rewards/margins": 4.383567810058594, "rewards/rejected": 0.06749248504638672, "step": 176 }, { "epoch": 0.09161490683229814, "grad_norm": 3.1794795989990234, "learning_rate": 6.103448275862069e-06, "loss": 0.25536036491394043, "rewards/accuracies": 0.859375, "rewards/chosen": 4.714051246643066, "rewards/margins": 4.823036193847656, "rewards/rejected": -0.10979270935058594, "step": 177 }, { "epoch": 0.09213250517598344, "grad_norm": 2.5448710918426514, "learning_rate": 6.1379310344827595e-06, "loss": 0.2501446008682251, "rewards/accuracies": 0.8828125, "rewards/chosen": 5.407260894775391, "rewards/margins": 5.274909973144531, "rewards/rejected": 0.13178467750549316, "step": 178 }, { "epoch": 0.09265010351966874, "grad_norm": 1.7469847202301025, "learning_rate": 6.172413793103449e-06, "loss": 0.21517708897590637, "rewards/accuracies": 0.890625, "rewards/chosen": 6.636538028717041, "rewards/margins": 6.244964599609375, "rewards/rejected": 0.39082908630371094, "step": 179 }, { "epoch": 0.09316770186335403, "grad_norm": 3.7995853424072266, "learning_rate": 6.206896551724138e-06, "loss": 0.24525505304336548, "rewards/accuracies": 0.8671875, "rewards/chosen": 7.025608062744141, "rewards/margins": 6.763542175292969, "rewards/rejected": 0.2619600296020508, "step": 180 }, { "epoch": 0.09368530020703934, "grad_norm": 2.5189290046691895, "learning_rate": 6.241379310344829e-06, "loss": 0.23161813616752625, "rewards/accuracies": 0.859375, "rewards/chosen": 6.776201248168945, "rewards/margins": 6.152565002441406, "rewards/rejected": 0.6252789497375488, "step": 181 }, { "epoch": 0.09420289855072464, "grad_norm": 3.003974676132202, "learning_rate": 6.275862068965518e-06, "loss": 0.2351873517036438, "rewards/accuracies": 0.90625, "rewards/chosen": 6.06651496887207, "rewards/margins": 5.9562835693359375, "rewards/rejected": 0.111358642578125, "step": 182 }, { "epoch": 0.09472049689440994, "grad_norm": 3.7025020122528076, "learning_rate": 6.310344827586207e-06, "loss": 0.2549179196357727, "rewards/accuracies": 0.875, "rewards/chosen": 5.944145202636719, "rewards/margins": 5.660408020019531, "rewards/rejected": 0.2822732925415039, "step": 183 }, { "epoch": 0.09523809523809523, "grad_norm": 2.1832337379455566, "learning_rate": 6.344827586206898e-06, "loss": 0.1733209192752838, "rewards/accuracies": 0.921875, "rewards/chosen": 6.804145812988281, "rewards/margins": 6.6430206298828125, "rewards/rejected": 0.16199791431427002, "step": 184 }, { "epoch": 0.09575569358178054, "grad_norm": 1.832918405532837, "learning_rate": 6.379310344827587e-06, "loss": 0.16203154623508453, "rewards/accuracies": 0.921875, "rewards/chosen": 5.721090793609619, "rewards/margins": 5.8545074462890625, "rewards/rejected": -0.13193821907043457, "step": 185 }, { "epoch": 0.09627329192546584, "grad_norm": 2.032155990600586, "learning_rate": 6.413793103448276e-06, "loss": 0.1763077825307846, "rewards/accuracies": 0.8984375, "rewards/chosen": 5.804854393005371, "rewards/margins": 6.078094482421875, "rewards/rejected": -0.27344465255737305, "step": 186 }, { "epoch": 0.09679089026915114, "grad_norm": 3.9641056060791016, "learning_rate": 6.448275862068966e-06, "loss": 0.28207463026046753, "rewards/accuracies": 0.8515625, "rewards/chosen": 5.51280403137207, "rewards/margins": 5.6769256591796875, "rewards/rejected": -0.16321563720703125, "step": 187 }, { "epoch": 0.09730848861283643, "grad_norm": 1.6066087484359741, "learning_rate": 6.482758620689655e-06, "loss": 0.1404041349887848, "rewards/accuracies": 0.9453125, "rewards/chosen": 6.1212592124938965, "rewards/margins": 6.4435882568359375, "rewards/rejected": -0.31936216354370117, "step": 188 }, { "epoch": 0.09782608695652174, "grad_norm": 5.209835529327393, "learning_rate": 6.517241379310345e-06, "loss": 0.25775688886642456, "rewards/accuracies": 0.90625, "rewards/chosen": 7.072452545166016, "rewards/margins": 7.00079345703125, "rewards/rejected": 0.07188034057617188, "step": 189 }, { "epoch": 0.09834368530020704, "grad_norm": 2.794898271560669, "learning_rate": 6.551724137931035e-06, "loss": 0.16506701707839966, "rewards/accuracies": 0.90625, "rewards/chosen": 8.477439880371094, "rewards/margins": 8.276565551757812, "rewards/rejected": 0.20029330253601074, "step": 190 }, { "epoch": 0.09886128364389234, "grad_norm": 4.1689581871032715, "learning_rate": 6.586206896551724e-06, "loss": 0.2519005239009857, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.707365036010742, "rewards/margins": 7.4911041259765625, "rewards/rejected": 0.21370935440063477, "step": 191 }, { "epoch": 0.09937888198757763, "grad_norm": 3.968194007873535, "learning_rate": 6.620689655172415e-06, "loss": 0.2639862895011902, "rewards/accuracies": 0.8515625, "rewards/chosen": 7.284564971923828, "rewards/margins": 7.48193359375, "rewards/rejected": -0.1966533660888672, "step": 192 }, { "epoch": 0.09989648033126294, "grad_norm": 2.5770657062530518, "learning_rate": 6.655172413793104e-06, "loss": 0.21228626370429993, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.009006500244141, "rewards/margins": 7.211616516113281, "rewards/rejected": -0.20424413681030273, "step": 193 }, { "epoch": 0.10041407867494824, "grad_norm": 2.4862773418426514, "learning_rate": 6.6896551724137935e-06, "loss": 0.20977795124053955, "rewards/accuracies": 0.8671875, "rewards/chosen": 6.5721282958984375, "rewards/margins": 7.203151702880859, "rewards/rejected": -0.6307659149169922, "step": 194 }, { "epoch": 0.10093167701863354, "grad_norm": 3.894951581954956, "learning_rate": 6.724137931034484e-06, "loss": 0.24303269386291504, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.614631652832031, "rewards/margins": 6.987369537353516, "rewards/rejected": -1.3707866668701172, "step": 195 }, { "epoch": 0.10144927536231885, "grad_norm": 2.5351788997650146, "learning_rate": 6.758620689655173e-06, "loss": 0.16289079189300537, "rewards/accuracies": 0.921875, "rewards/chosen": 5.256660461425781, "rewards/margins": 6.5727081298828125, "rewards/rejected": -1.3152574300765991, "step": 196 }, { "epoch": 0.10196687370600414, "grad_norm": 3.9556884765625, "learning_rate": 6.7931034482758626e-06, "loss": 0.2523379921913147, "rewards/accuracies": 0.8671875, "rewards/chosen": 3.841085433959961, "rewards/margins": 5.573272705078125, "rewards/rejected": -1.7310104370117188, "step": 197 }, { "epoch": 0.10248447204968944, "grad_norm": 2.397601366043091, "learning_rate": 6.827586206896553e-06, "loss": 0.20325139164924622, "rewards/accuracies": 0.890625, "rewards/chosen": 4.441802978515625, "rewards/margins": 6.0892333984375, "rewards/rejected": -1.6461563110351562, "step": 198 }, { "epoch": 0.10300207039337474, "grad_norm": 2.99214243888855, "learning_rate": 6.862068965517242e-06, "loss": 0.20292547345161438, "rewards/accuracies": 0.890625, "rewards/chosen": 4.745257377624512, "rewards/margins": 6.111076354980469, "rewards/rejected": -1.36492919921875, "step": 199 }, { "epoch": 0.10351966873706005, "grad_norm": 2.915168046951294, "learning_rate": 6.896551724137932e-06, "loss": 0.21414293348789215, "rewards/accuracies": 0.890625, "rewards/chosen": 5.617238998413086, "rewards/margins": 6.7811737060546875, "rewards/rejected": -1.1618876457214355, "step": 200 }, { "epoch": 0.10403726708074534, "grad_norm": 2.040475368499756, "learning_rate": 6.931034482758622e-06, "loss": 0.19753502309322357, "rewards/accuracies": 0.90625, "rewards/chosen": 6.381113052368164, "rewards/margins": 7.475444793701172, "rewards/rejected": -1.0931092500686646, "step": 201 }, { "epoch": 0.10455486542443064, "grad_norm": 2.1780588626861572, "learning_rate": 6.9655172413793105e-06, "loss": 0.1748240888118744, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.2174577713012695, "rewards/margins": 7.996986389160156, "rewards/rejected": -0.7812404632568359, "step": 202 }, { "epoch": 0.10507246376811594, "grad_norm": 3.2861979007720947, "learning_rate": 7e-06, "loss": 0.25179386138916016, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.346494674682617, "rewards/margins": 7.8382110595703125, "rewards/rejected": -0.49148082733154297, "step": 203 }, { "epoch": 0.10559006211180125, "grad_norm": 2.6741127967834473, "learning_rate": 7.03448275862069e-06, "loss": 0.18927589058876038, "rewards/accuracies": 0.90625, "rewards/chosen": 10.29409408569336, "rewards/margins": 9.896896362304688, "rewards/rejected": 0.39438724517822266, "step": 204 }, { "epoch": 0.10610766045548654, "grad_norm": 2.5855772495269775, "learning_rate": 7.0689655172413796e-06, "loss": 0.20429542660713196, "rewards/accuracies": 0.90625, "rewards/chosen": 12.042890548706055, "rewards/margins": 10.993606567382812, "rewards/rejected": 1.0487275123596191, "step": 205 }, { "epoch": 0.10662525879917184, "grad_norm": 2.9226601123809814, "learning_rate": 7.103448275862069e-06, "loss": 0.2217361032962799, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.595125198364258, "rewards/margins": 10.699455261230469, "rewards/rejected": 1.894576072692871, "step": 206 }, { "epoch": 0.10714285714285714, "grad_norm": 3.9867329597473145, "learning_rate": 7.137931034482759e-06, "loss": 0.17997032403945923, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.512931823730469, "rewards/margins": 11.144630432128906, "rewards/rejected": 3.369504451751709, "step": 207 }, { "epoch": 0.10766045548654245, "grad_norm": 2.467853546142578, "learning_rate": 7.172413793103449e-06, "loss": 0.20003923773765564, "rewards/accuracies": 0.90625, "rewards/chosen": 12.831315040588379, "rewards/margins": 9.59442138671875, "rewards/rejected": 3.238265037536621, "step": 208 }, { "epoch": 0.10817805383022774, "grad_norm": 3.147653818130493, "learning_rate": 7.206896551724139e-06, "loss": 0.2037552148103714, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.851629257202148, "rewards/margins": 9.878007888793945, "rewards/rejected": 2.9733657836914062, "step": 209 }, { "epoch": 0.10869565217391304, "grad_norm": 2.7109484672546387, "learning_rate": 7.241379310344828e-06, "loss": 0.21130940318107605, "rewards/accuracies": 0.90625, "rewards/chosen": 11.312034606933594, "rewards/margins": 8.062484741210938, "rewards/rejected": 3.249074935913086, "step": 210 }, { "epoch": 0.10921325051759834, "grad_norm": 2.6506736278533936, "learning_rate": 7.275862068965518e-06, "loss": 0.20813900232315063, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.00140380859375, "rewards/margins": 8.012947082519531, "rewards/rejected": 2.9895801544189453, "step": 211 }, { "epoch": 0.10973084886128365, "grad_norm": 2.7473080158233643, "learning_rate": 7.310344827586208e-06, "loss": 0.24156281352043152, "rewards/accuracies": 0.859375, "rewards/chosen": 7.527558326721191, "rewards/margins": 5.890714645385742, "rewards/rejected": 1.638339638710022, "step": 212 }, { "epoch": 0.11024844720496894, "grad_norm": 1.9914803504943848, "learning_rate": 7.344827586206897e-06, "loss": 0.2101120948791504, "rewards/accuracies": 0.921875, "rewards/chosen": 8.08825969696045, "rewards/margins": 6.143337249755859, "rewards/rejected": 1.9431968927383423, "step": 213 }, { "epoch": 0.11076604554865424, "grad_norm": 2.063199043273926, "learning_rate": 7.379310344827587e-06, "loss": 0.19904275238513947, "rewards/accuracies": 0.8828125, "rewards/chosen": 6.112207412719727, "rewards/margins": 4.738250732421875, "rewards/rejected": 1.371159553527832, "step": 214 }, { "epoch": 0.11128364389233955, "grad_norm": 2.608781099319458, "learning_rate": 7.413793103448277e-06, "loss": 0.18735438585281372, "rewards/accuracies": 0.921875, "rewards/chosen": 7.113151550292969, "rewards/margins": 5.480751037597656, "rewards/rejected": 1.6326608657836914, "step": 215 }, { "epoch": 0.11180124223602485, "grad_norm": 2.1588714122772217, "learning_rate": 7.4482758620689665e-06, "loss": 0.21452173590660095, "rewards/accuracies": 0.875, "rewards/chosen": 5.9847564697265625, "rewards/margins": 4.3297576904296875, "rewards/rejected": 1.6540918350219727, "step": 216 }, { "epoch": 0.11231884057971014, "grad_norm": 1.731970191001892, "learning_rate": 7.482758620689656e-06, "loss": 0.17825260758399963, "rewards/accuracies": 0.90625, "rewards/chosen": 6.3281755447387695, "rewards/margins": 5.044750213623047, "rewards/rejected": 1.2843036651611328, "step": 217 }, { "epoch": 0.11283643892339544, "grad_norm": 2.7535080909729004, "learning_rate": 7.517241379310345e-06, "loss": 0.1913154572248459, "rewards/accuracies": 0.890625, "rewards/chosen": 7.265018463134766, "rewards/margins": 5.533054351806641, "rewards/rejected": 1.7322654724121094, "step": 218 }, { "epoch": 0.11335403726708075, "grad_norm": 6.402608394622803, "learning_rate": 7.551724137931035e-06, "loss": 0.36325544118881226, "rewards/accuracies": 0.8359375, "rewards/chosen": 8.469062805175781, "rewards/margins": 6.6209716796875, "rewards/rejected": 1.8477697372436523, "step": 219 }, { "epoch": 0.11387163561076605, "grad_norm": 6.399933815002441, "learning_rate": 7.586206896551724e-06, "loss": 0.251491904258728, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.484442710876465, "rewards/margins": 8.589340209960938, "rewards/rejected": 1.8935699462890625, "step": 220 }, { "epoch": 0.11438923395445134, "grad_norm": 4.1363019943237305, "learning_rate": 7.620689655172414e-06, "loss": 0.23238638043403625, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.73145866394043, "rewards/margins": 8.210037231445312, "rewards/rejected": 1.5214099884033203, "step": 221 }, { "epoch": 0.11490683229813664, "grad_norm": 3.4244210720062256, "learning_rate": 7.655172413793104e-06, "loss": 0.21498161554336548, "rewards/accuracies": 0.875, "rewards/chosen": 11.413873672485352, "rewards/margins": 9.474334716796875, "rewards/rejected": 1.935028076171875, "step": 222 }, { "epoch": 0.11542443064182195, "grad_norm": 3.8852500915527344, "learning_rate": 7.689655172413794e-06, "loss": 0.31183600425720215, "rewards/accuracies": 0.8359375, "rewards/chosen": 12.448497772216797, "rewards/margins": 10.687408447265625, "rewards/rejected": 1.7581405639648438, "step": 223 }, { "epoch": 0.11594202898550725, "grad_norm": 3.3366451263427734, "learning_rate": 7.724137931034483e-06, "loss": 0.27820664644241333, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.42987060546875, "rewards/margins": 10.020004272460938, "rewards/rejected": 1.410904884338379, "step": 224 }, { "epoch": 0.11645962732919254, "grad_norm": 2.270314931869507, "learning_rate": 7.758620689655173e-06, "loss": 0.20100301504135132, "rewards/accuracies": 0.890625, "rewards/chosen": 11.044153213500977, "rewards/margins": 9.903924942016602, "rewards/rejected": 1.1398160457611084, "step": 225 }, { "epoch": 0.11697722567287784, "grad_norm": 2.6024491786956787, "learning_rate": 7.793103448275863e-06, "loss": 0.30248594284057617, "rewards/accuracies": 0.828125, "rewards/chosen": 7.579994201660156, "rewards/margins": 7.19537353515625, "rewards/rejected": 0.38408851623535156, "step": 226 }, { "epoch": 0.11749482401656315, "grad_norm": 2.0437488555908203, "learning_rate": 7.827586206896552e-06, "loss": 0.2474503219127655, "rewards/accuracies": 0.875, "rewards/chosen": 6.974701881408691, "rewards/margins": 6.885841369628906, "rewards/rejected": 0.08844675123691559, "step": 227 }, { "epoch": 0.11801242236024845, "grad_norm": 2.1182823181152344, "learning_rate": 7.862068965517242e-06, "loss": 0.2575953006744385, "rewards/accuracies": 0.8515625, "rewards/chosen": 7.29926872253418, "rewards/margins": 7.635112762451172, "rewards/rejected": -0.3346579670906067, "step": 228 }, { "epoch": 0.11853002070393374, "grad_norm": 1.715200662612915, "learning_rate": 7.896551724137932e-06, "loss": 0.2075975239276886, "rewards/accuracies": 0.90625, "rewards/chosen": 6.216377258300781, "rewards/margins": 6.722738265991211, "rewards/rejected": -0.507381796836853, "step": 229 }, { "epoch": 0.11904761904761904, "grad_norm": 1.3703280687332153, "learning_rate": 7.93103448275862e-06, "loss": 0.24846577644348145, "rewards/accuracies": 0.890625, "rewards/chosen": 7.990478038787842, "rewards/margins": 8.63449478149414, "rewards/rejected": -0.6443800926208496, "step": 230 }, { "epoch": 0.11956521739130435, "grad_norm": 1.7259703874588013, "learning_rate": 7.965517241379311e-06, "loss": 0.22122596204280853, "rewards/accuracies": 0.875, "rewards/chosen": 8.52807331085205, "rewards/margins": 9.183788299560547, "rewards/rejected": -0.6550376415252686, "step": 231 }, { "epoch": 0.12008281573498965, "grad_norm": 1.8707060813903809, "learning_rate": 8.000000000000001e-06, "loss": 0.16938063502311707, "rewards/accuracies": 0.9453125, "rewards/chosen": 10.403680801391602, "rewards/margins": 11.015655517578125, "rewards/rejected": -0.6079463958740234, "step": 232 }, { "epoch": 0.12060041407867494, "grad_norm": 2.902740240097046, "learning_rate": 8.034482758620692e-06, "loss": 0.25176379084587097, "rewards/accuracies": 0.8359375, "rewards/chosen": 14.188962936401367, "rewards/margins": 14.041156768798828, "rewards/rejected": 0.15050840377807617, "step": 233 }, { "epoch": 0.12111801242236025, "grad_norm": 3.372603416442871, "learning_rate": 8.06896551724138e-06, "loss": 0.2507181763648987, "rewards/accuracies": 0.8515625, "rewards/chosen": 12.697463989257812, "rewards/margins": 12.92315673828125, "rewards/rejected": -0.23006153106689453, "step": 234 }, { "epoch": 0.12163561076604555, "grad_norm": 3.3291831016540527, "learning_rate": 8.103448275862069e-06, "loss": 0.24064069986343384, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.655258178710938, "rewards/margins": 13.287837982177734, "rewards/rejected": 0.3652053475379944, "step": 235 }, { "epoch": 0.12215320910973085, "grad_norm": 3.2167322635650635, "learning_rate": 8.137931034482759e-06, "loss": 0.2311703860759735, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.409290313720703, "rewards/margins": 14.731231689453125, "rewards/rejected": 0.6764330267906189, "step": 236 }, { "epoch": 0.12267080745341614, "grad_norm": 4.2148118019104, "learning_rate": 8.17241379310345e-06, "loss": 0.2515755295753479, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.998466491699219, "rewards/margins": 12.864639282226562, "rewards/rejected": 0.13188648223876953, "step": 237 }, { "epoch": 0.12318840579710146, "grad_norm": 3.1535379886627197, "learning_rate": 8.206896551724138e-06, "loss": 0.29858365654945374, "rewards/accuracies": 0.890625, "rewards/chosen": 8.98647689819336, "rewards/margins": 8.650604248046875, "rewards/rejected": 0.33537524938583374, "step": 238 }, { "epoch": 0.12370600414078675, "grad_norm": 2.247323513031006, "learning_rate": 8.241379310344828e-06, "loss": 0.2201038897037506, "rewards/accuracies": 0.90625, "rewards/chosen": 7.527856826782227, "rewards/margins": 7.124931335449219, "rewards/rejected": 0.4005002975463867, "step": 239 }, { "epoch": 0.12422360248447205, "grad_norm": 1.9552971124649048, "learning_rate": 8.275862068965518e-06, "loss": 0.21770413219928741, "rewards/accuracies": 0.890625, "rewards/chosen": 5.942288875579834, "rewards/margins": 5.4145355224609375, "rewards/rejected": 0.5277833938598633, "step": 240 }, { "epoch": 0.12474120082815734, "grad_norm": 2.250195026397705, "learning_rate": 8.310344827586207e-06, "loss": 0.2592617869377136, "rewards/accuracies": 0.8671875, "rewards/chosen": 4.742359161376953, "rewards/margins": 4.561302185058594, "rewards/rejected": 0.1813812255859375, "step": 241 }, { "epoch": 0.12525879917184266, "grad_norm": 2.62225341796875, "learning_rate": 8.344827586206897e-06, "loss": 0.25238215923309326, "rewards/accuracies": 0.8359375, "rewards/chosen": 4.6609649658203125, "rewards/margins": 4.343574523925781, "rewards/rejected": 0.31891441345214844, "step": 242 }, { "epoch": 0.12577639751552794, "grad_norm": 1.7805598974227905, "learning_rate": 8.379310344827587e-06, "loss": 0.24205711483955383, "rewards/accuracies": 0.875, "rewards/chosen": 6.5013580322265625, "rewards/margins": 6.124664306640625, "rewards/rejected": 0.37790536880493164, "step": 243 }, { "epoch": 0.12629399585921325, "grad_norm": 1.3015867471694946, "learning_rate": 8.413793103448276e-06, "loss": 0.2209964096546173, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.446584701538086, "rewards/margins": 9.814922332763672, "rewards/rejected": 0.6301708221435547, "step": 244 }, { "epoch": 0.12681159420289856, "grad_norm": 1.382839322090149, "learning_rate": 8.448275862068966e-06, "loss": 0.21118295192718506, "rewards/accuracies": 0.875, "rewards/chosen": 16.31012725830078, "rewards/margins": 14.724334716796875, "rewards/rejected": 1.590958595275879, "step": 245 }, { "epoch": 0.12732919254658384, "grad_norm": 3.9185824394226074, "learning_rate": 8.482758620689656e-06, "loss": 0.25964242219924927, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.42668342590332, "rewards/margins": 16.11170196533203, "rewards/rejected": 2.314695358276367, "step": 246 }, { "epoch": 0.12784679089026915, "grad_norm": 9.399092674255371, "learning_rate": 8.517241379310345e-06, "loss": 0.201684832572937, "rewards/accuracies": 0.890625, "rewards/chosen": 22.031951904296875, "rewards/margins": 17.540977478027344, "rewards/rejected": 4.484088897705078, "step": 247 }, { "epoch": 0.12836438923395446, "grad_norm": 1.9454642534255981, "learning_rate": 8.551724137931035e-06, "loss": 0.1984676867723465, "rewards/accuracies": 0.875, "rewards/chosen": 17.03447723388672, "rewards/margins": 14.425643920898438, "rewards/rejected": 2.610637664794922, "step": 248 }, { "epoch": 0.12888198757763975, "grad_norm": 2.666651725769043, "learning_rate": 8.586206896551726e-06, "loss": 0.2462148517370224, "rewards/accuracies": 0.875, "rewards/chosen": 12.765146255493164, "rewards/margins": 10.862495422363281, "rewards/rejected": 1.9010772705078125, "step": 249 }, { "epoch": 0.12939958592132506, "grad_norm": 1.901346206665039, "learning_rate": 8.620689655172414e-06, "loss": 0.2528451085090637, "rewards/accuracies": 0.875, "rewards/chosen": 10.018794059753418, "rewards/margins": 8.844440460205078, "rewards/rejected": 1.1754584312438965, "step": 250 }, { "epoch": 0.12991718426501034, "grad_norm": 2.8950910568237305, "learning_rate": 8.655172413793104e-06, "loss": 0.28738176822662354, "rewards/accuracies": 0.875, "rewards/chosen": 8.215190887451172, "rewards/margins": 7.5971832275390625, "rewards/rejected": 0.618053138256073, "step": 251 }, { "epoch": 0.13043478260869565, "grad_norm": 1.6956056356430054, "learning_rate": 8.689655172413793e-06, "loss": 0.1963912546634674, "rewards/accuracies": 0.921875, "rewards/chosen": 6.5314836502075195, "rewards/margins": 6.203926086425781, "rewards/rejected": 0.3276028633117676, "step": 252 }, { "epoch": 0.13095238095238096, "grad_norm": 2.256371021270752, "learning_rate": 8.724137931034483e-06, "loss": 0.20976683497428894, "rewards/accuracies": 0.890625, "rewards/chosen": 6.440571308135986, "rewards/margins": 6.343986511230469, "rewards/rejected": 0.0969691276550293, "step": 253 }, { "epoch": 0.13146997929606624, "grad_norm": 2.3494701385498047, "learning_rate": 8.758620689655173e-06, "loss": 0.21739542484283447, "rewards/accuracies": 0.90625, "rewards/chosen": 5.3040008544921875, "rewards/margins": 5.661338806152344, "rewards/rejected": -0.3569049835205078, "step": 254 }, { "epoch": 0.13198757763975155, "grad_norm": 1.6773978471755981, "learning_rate": 8.793103448275862e-06, "loss": 0.21385446190834045, "rewards/accuracies": 0.875, "rewards/chosen": 5.0317535400390625, "rewards/margins": 5.5125885009765625, "rewards/rejected": -0.4812588691711426, "step": 255 }, { "epoch": 0.13250517598343686, "grad_norm": 2.923190116882324, "learning_rate": 8.827586206896552e-06, "loss": 0.2837335467338562, "rewards/accuracies": 0.8828125, "rewards/chosen": 4.670840740203857, "rewards/margins": 4.9426422119140625, "rewards/rejected": -0.2729562520980835, "step": 256 }, { "epoch": 0.13302277432712215, "grad_norm": 3.5380661487579346, "learning_rate": 8.862068965517243e-06, "loss": 0.21272902190685272, "rewards/accuracies": 0.875, "rewards/chosen": 5.394340515136719, "rewards/margins": 5.842014312744141, "rewards/rejected": -0.44539451599121094, "step": 257 }, { "epoch": 0.13354037267080746, "grad_norm": 2.066338300704956, "learning_rate": 8.896551724137931e-06, "loss": 0.20796406269073486, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.027666091918945, "rewards/margins": 5.4012451171875, "rewards/rejected": -0.3737631142139435, "step": 258 }, { "epoch": 0.13405797101449277, "grad_norm": 2.3536829948425293, "learning_rate": 8.931034482758621e-06, "loss": 0.27382412552833557, "rewards/accuracies": 0.8671875, "rewards/chosen": 4.858287811279297, "rewards/margins": 5.200431823730469, "rewards/rejected": -0.3420848846435547, "step": 259 }, { "epoch": 0.13457556935817805, "grad_norm": 2.255383014678955, "learning_rate": 8.965517241379312e-06, "loss": 0.21926961839199066, "rewards/accuracies": 0.890625, "rewards/chosen": 5.132537841796875, "rewards/margins": 5.657020568847656, "rewards/rejected": -0.5253932476043701, "step": 260 }, { "epoch": 0.13509316770186336, "grad_norm": 1.8507894277572632, "learning_rate": 9e-06, "loss": 0.19941721856594086, "rewards/accuracies": 0.8828125, "rewards/chosen": 6.057213306427002, "rewards/margins": 6.5167694091796875, "rewards/rejected": -0.45963335037231445, "step": 261 }, { "epoch": 0.13561076604554864, "grad_norm": 1.770410180091858, "learning_rate": 9.03448275862069e-06, "loss": 0.20628973841667175, "rewards/accuracies": 0.921875, "rewards/chosen": 5.289228439331055, "rewards/margins": 5.912769317626953, "rewards/rejected": -0.624737024307251, "step": 262 }, { "epoch": 0.13612836438923395, "grad_norm": 1.6308132410049438, "learning_rate": 9.06896551724138e-06, "loss": 0.19954480230808258, "rewards/accuracies": 0.9453125, "rewards/chosen": 6.36961555480957, "rewards/margins": 6.90850830078125, "rewards/rejected": -0.539618968963623, "step": 263 }, { "epoch": 0.13664596273291926, "grad_norm": 1.6981799602508545, "learning_rate": 9.10344827586207e-06, "loss": 0.1594250351190567, "rewards/accuracies": 0.921875, "rewards/chosen": 8.276189804077148, "rewards/margins": 8.524307250976562, "rewards/rejected": -0.2463083267211914, "step": 264 }, { "epoch": 0.13716356107660455, "grad_norm": 3.3816490173339844, "learning_rate": 9.13793103448276e-06, "loss": 0.2105809450149536, "rewards/accuracies": 0.90625, "rewards/chosen": 8.230661392211914, "rewards/margins": 8.702850341796875, "rewards/rejected": -0.4729900360107422, "step": 265 }, { "epoch": 0.13768115942028986, "grad_norm": 3.2372965812683105, "learning_rate": 9.172413793103448e-06, "loss": 0.21365058422088623, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.912827491760254, "rewards/margins": 8.929004669189453, "rewards/rejected": -0.015391707420349121, "step": 266 }, { "epoch": 0.13819875776397517, "grad_norm": 4.221584796905518, "learning_rate": 9.206896551724138e-06, "loss": 0.2854365110397339, "rewards/accuracies": 0.8515625, "rewards/chosen": 8.842072486877441, "rewards/margins": 8.690696716308594, "rewards/rejected": 0.15248540043830872, "step": 267 }, { "epoch": 0.13871635610766045, "grad_norm": 3.2963063716888428, "learning_rate": 9.241379310344829e-06, "loss": 0.18514829874038696, "rewards/accuracies": 0.90625, "rewards/chosen": 8.880885124206543, "rewards/margins": 9.177711486816406, "rewards/rejected": -0.2920198440551758, "step": 268 }, { "epoch": 0.13923395445134576, "grad_norm": 1.9397252798080444, "learning_rate": 9.275862068965517e-06, "loss": 0.17752791941165924, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.38494873046875, "rewards/margins": 9.693115234375, "rewards/rejected": -0.30860424041748047, "step": 269 }, { "epoch": 0.13975155279503104, "grad_norm": 2.3615615367889404, "learning_rate": 9.310344827586207e-06, "loss": 0.19824755191802979, "rewards/accuracies": 0.921875, "rewards/chosen": 9.354759216308594, "rewards/margins": 9.466609954833984, "rewards/rejected": -0.11202716827392578, "step": 270 }, { "epoch": 0.14026915113871635, "grad_norm": 1.9087435007095337, "learning_rate": 9.344827586206898e-06, "loss": 0.2140769064426422, "rewards/accuracies": 0.859375, "rewards/chosen": 9.220741271972656, "rewards/margins": 9.227081298828125, "rewards/rejected": -0.009258270263671875, "step": 271 }, { "epoch": 0.14078674948240166, "grad_norm": 9.074627876281738, "learning_rate": 9.379310344827586e-06, "loss": 0.1746196299791336, "rewards/accuracies": 0.9296875, "rewards/chosen": 10.301717758178711, "rewards/margins": 10.426467895507812, "rewards/rejected": -0.12199783325195312, "step": 272 }, { "epoch": 0.14130434782608695, "grad_norm": 2.663912296295166, "learning_rate": 9.413793103448277e-06, "loss": 0.23550286889076233, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.149480819702148, "rewards/margins": 9.975170135498047, "rewards/rejected": 0.17427635192871094, "step": 273 }, { "epoch": 0.14182194616977226, "grad_norm": 3.086787700653076, "learning_rate": 9.448275862068967e-06, "loss": 0.1701279580593109, "rewards/accuracies": 0.90625, "rewards/chosen": 10.412595748901367, "rewards/margins": 10.06475830078125, "rewards/rejected": 0.34625959396362305, "step": 274 }, { "epoch": 0.14233954451345757, "grad_norm": 1.8937456607818604, "learning_rate": 9.482758620689655e-06, "loss": 0.1671372950077057, "rewards/accuracies": 0.921875, "rewards/chosen": 10.385675430297852, "rewards/margins": 9.635833740234375, "rewards/rejected": 0.7503261566162109, "step": 275 }, { "epoch": 0.14285714285714285, "grad_norm": 3.753880023956299, "learning_rate": 9.517241379310346e-06, "loss": 0.28825220465660095, "rewards/accuracies": 0.8203125, "rewards/chosen": 11.89054012298584, "rewards/margins": 10.30612564086914, "rewards/rejected": 1.5797786712646484, "step": 276 }, { "epoch": 0.14337474120082816, "grad_norm": 3.2461674213409424, "learning_rate": 9.551724137931036e-06, "loss": 0.16936606168746948, "rewards/accuracies": 0.9296875, "rewards/chosen": 12.382884979248047, "rewards/margins": 10.46630859375, "rewards/rejected": 1.9141576290130615, "step": 277 }, { "epoch": 0.14389233954451347, "grad_norm": 3.651427745819092, "learning_rate": 9.586206896551724e-06, "loss": 0.20370003581047058, "rewards/accuracies": 0.9296875, "rewards/chosen": 11.590801239013672, "rewards/margins": 9.455806732177734, "rewards/rejected": 2.1332874298095703, "step": 278 }, { "epoch": 0.14440993788819875, "grad_norm": 3.0242056846618652, "learning_rate": 9.620689655172415e-06, "loss": 0.2170196771621704, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.020275115966797, "rewards/margins": 8.964248657226562, "rewards/rejected": 2.053852081298828, "step": 279 }, { "epoch": 0.14492753623188406, "grad_norm": 1.442434549331665, "learning_rate": 9.655172413793105e-06, "loss": 0.158839151263237, "rewards/accuracies": 0.9296875, "rewards/chosen": 9.824812889099121, "rewards/margins": 8.410797119140625, "rewards/rejected": 1.4137516021728516, "step": 280 }, { "epoch": 0.14544513457556935, "grad_norm": 2.1023261547088623, "learning_rate": 9.689655172413794e-06, "loss": 0.2161823809146881, "rewards/accuracies": 0.8671875, "rewards/chosen": 9.973901748657227, "rewards/margins": 8.404098510742188, "rewards/rejected": 1.5706195831298828, "step": 281 }, { "epoch": 0.14596273291925466, "grad_norm": 1.8753163814544678, "learning_rate": 9.724137931034484e-06, "loss": 0.23404759168624878, "rewards/accuracies": 0.890625, "rewards/chosen": 8.194415092468262, "rewards/margins": 7.079681396484375, "rewards/rejected": 1.1141449213027954, "step": 282 }, { "epoch": 0.14648033126293997, "grad_norm": 1.5978686809539795, "learning_rate": 9.758620689655172e-06, "loss": 0.173639178276062, "rewards/accuracies": 0.9140625, "rewards/chosen": 6.82762336730957, "rewards/margins": 6.187110900878906, "rewards/rejected": 0.6418533325195312, "step": 283 }, { "epoch": 0.14699792960662525, "grad_norm": 1.5851083993911743, "learning_rate": 9.793103448275863e-06, "loss": 0.1847650706768036, "rewards/accuracies": 0.9140625, "rewards/chosen": 7.40822696685791, "rewards/margins": 6.7275390625, "rewards/rejected": 0.6792135238647461, "step": 284 }, { "epoch": 0.14751552795031056, "grad_norm": 2.598728895187378, "learning_rate": 9.827586206896553e-06, "loss": 0.22473889589309692, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.768566131591797, "rewards/margins": 6.781707763671875, "rewards/rejected": 0.9859156608581543, "step": 285 }, { "epoch": 0.14803312629399587, "grad_norm": 2.3576033115386963, "learning_rate": 9.862068965517241e-06, "loss": 0.25064146518707275, "rewards/accuracies": 0.8671875, "rewards/chosen": 8.045787811279297, "rewards/margins": 6.6392364501953125, "rewards/rejected": 1.4071722030639648, "step": 286 }, { "epoch": 0.14855072463768115, "grad_norm": 2.5561797618865967, "learning_rate": 9.896551724137932e-06, "loss": 0.20256751775741577, "rewards/accuracies": 0.875, "rewards/chosen": 8.518714904785156, "rewards/margins": 7.3582305908203125, "rewards/rejected": 1.159598469734192, "step": 287 }, { "epoch": 0.14906832298136646, "grad_norm": 2.9704222679138184, "learning_rate": 9.931034482758622e-06, "loss": 0.2554802894592285, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.205301284790039, "rewards/margins": 7.0975341796875, "rewards/rejected": 1.1093482971191406, "step": 288 }, { "epoch": 0.14958592132505175, "grad_norm": 1.5625061988830566, "learning_rate": 9.96551724137931e-06, "loss": 0.21226607263088226, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.634744644165039, "rewards/margins": 7.5275421142578125, "rewards/rejected": 1.1064832210540771, "step": 289 }, { "epoch": 0.15010351966873706, "grad_norm": 2.016613483428955, "learning_rate": 1e-05, "loss": 0.26286089420318604, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.367624282836914, "rewards/margins": 6.6857147216796875, "rewards/rejected": 0.6819162368774414, "step": 290 }, { "epoch": 0.15062111801242237, "grad_norm": 2.3038909435272217, "learning_rate": 9.999999186106972e-06, "loss": 0.25904959440231323, "rewards/accuracies": 0.890625, "rewards/chosen": 6.946113586425781, "rewards/margins": 6.235176086425781, "rewards/rejected": 0.7128462791442871, "step": 291 }, { "epoch": 0.15113871635610765, "grad_norm": 1.3088080883026123, "learning_rate": 9.999996744428155e-06, "loss": 0.1998758763074875, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.754183769226074, "rewards/margins": 7.0343017578125, "rewards/rejected": 0.7201957702636719, "step": 292 }, { "epoch": 0.15165631469979296, "grad_norm": 1.5174890756607056, "learning_rate": 9.999992674964339e-06, "loss": 0.20783424377441406, "rewards/accuracies": 0.8671875, "rewards/chosen": 6.383505821228027, "rewards/margins": 5.9628448486328125, "rewards/rejected": 0.4214414954185486, "step": 293 }, { "epoch": 0.15217391304347827, "grad_norm": 1.2592238187789917, "learning_rate": 9.999986977716854e-06, "loss": 0.19968725740909576, "rewards/accuracies": 0.9140625, "rewards/chosen": 6.495319366455078, "rewards/margins": 6.157585144042969, "rewards/rejected": 0.33786964416503906, "step": 294 }, { "epoch": 0.15269151138716355, "grad_norm": 2.3639047145843506, "learning_rate": 9.999979652687553e-06, "loss": 0.2353985756635666, "rewards/accuracies": 0.90625, "rewards/chosen": 7.859314918518066, "rewards/margins": 7.572782516479492, "rewards/rejected": 0.2848196029663086, "step": 295 }, { "epoch": 0.15320910973084886, "grad_norm": 1.470066785812378, "learning_rate": 9.999970699878818e-06, "loss": 0.20574454963207245, "rewards/accuracies": 0.9296875, "rewards/chosen": 8.355605125427246, "rewards/margins": 7.8968505859375, "rewards/rejected": 0.4603919982910156, "step": 296 }, { "epoch": 0.15372670807453417, "grad_norm": 1.4339123964309692, "learning_rate": 9.999960119293569e-06, "loss": 0.20367267727851868, "rewards/accuracies": 0.890625, "rewards/chosen": 8.01531982421875, "rewards/margins": 7.819660186767578, "rewards/rejected": 0.1944449543952942, "step": 297 }, { "epoch": 0.15424430641821946, "grad_norm": 1.8903917074203491, "learning_rate": 9.999947910935246e-06, "loss": 0.24078181385993958, "rewards/accuracies": 0.890625, "rewards/chosen": 7.988908290863037, "rewards/margins": 7.884746551513672, "rewards/rejected": 0.10451459884643555, "step": 298 }, { "epoch": 0.15476190476190477, "grad_norm": 2.8035969734191895, "learning_rate": 9.999934074807826e-06, "loss": 0.2869589030742645, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.068763732910156, "rewards/margins": 9.423614501953125, "rewards/rejected": 0.6460936069488525, "step": 299 }, { "epoch": 0.15527950310559005, "grad_norm": 1.3655554056167603, "learning_rate": 9.999918610915811e-06, "loss": 0.15645471215248108, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.25417709350586, "rewards/margins": 11.267711639404297, "rewards/rejected": 0.9853239059448242, "step": 300 }, { "epoch": 0.15579710144927536, "grad_norm": 2.1462244987487793, "learning_rate": 9.99990151926424e-06, "loss": 0.2547569274902344, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.257326126098633, "rewards/margins": 10.106109619140625, "rewards/rejected": 1.1522979736328125, "step": 301 }, { "epoch": 0.15631469979296067, "grad_norm": 1.549450159072876, "learning_rate": 9.999882799858672e-06, "loss": 0.23291070759296417, "rewards/accuracies": 0.9375, "rewards/chosen": 9.9843111038208, "rewards/margins": 9.365142822265625, "rewards/rejected": 0.6180973052978516, "step": 302 }, { "epoch": 0.15683229813664595, "grad_norm": 1.640860915184021, "learning_rate": 9.999862452705205e-06, "loss": 0.21733495593070984, "rewards/accuracies": 0.890625, "rewards/chosen": 8.039352416992188, "rewards/margins": 7.588539123535156, "rewards/rejected": 0.4502146244049072, "step": 303 }, { "epoch": 0.15734989648033126, "grad_norm": 1.3480236530303955, "learning_rate": 9.99984047781046e-06, "loss": 0.24784758687019348, "rewards/accuracies": 0.890625, "rewards/chosen": 6.461722373962402, "rewards/margins": 6.32978630065918, "rewards/rejected": 0.13147063553333282, "step": 304 }, { "epoch": 0.15786749482401657, "grad_norm": 1.428492784500122, "learning_rate": 9.999816875181594e-06, "loss": 0.25768375396728516, "rewards/accuracies": 0.8515625, "rewards/chosen": 8.334461212158203, "rewards/margins": 7.976692199707031, "rewards/rejected": 0.35948944091796875, "step": 305 }, { "epoch": 0.15838509316770186, "grad_norm": 0.8768028020858765, "learning_rate": 9.99979164482629e-06, "loss": 0.22603681683540344, "rewards/accuracies": 0.9140625, "rewards/chosen": 6.945880889892578, "rewards/margins": 6.821364402770996, "rewards/rejected": 0.1232461929321289, "step": 306 }, { "epoch": 0.15890269151138717, "grad_norm": 0.9895910024642944, "learning_rate": 9.999764786752762e-06, "loss": 0.18995092809200287, "rewards/accuracies": 0.90625, "rewards/chosen": 6.240644454956055, "rewards/margins": 6.093299865722656, "rewards/rejected": 0.1483783721923828, "step": 307 }, { "epoch": 0.15942028985507245, "grad_norm": 1.3566256761550903, "learning_rate": 9.999736300969753e-06, "loss": 0.1818409264087677, "rewards/accuracies": 0.890625, "rewards/chosen": 6.47297477722168, "rewards/margins": 6.1208038330078125, "rewards/rejected": 0.3524658679962158, "step": 308 }, { "epoch": 0.15993788819875776, "grad_norm": 1.1778770685195923, "learning_rate": 9.999706187486538e-06, "loss": 0.24347373843193054, "rewards/accuracies": 0.875, "rewards/chosen": 6.7227253913879395, "rewards/margins": 6.369235992431641, "rewards/rejected": 0.35248422622680664, "step": 309 }, { "epoch": 0.16045548654244307, "grad_norm": 1.5524368286132812, "learning_rate": 9.99967444631292e-06, "loss": 0.1609380841255188, "rewards/accuracies": 0.9453125, "rewards/chosen": 8.931562423706055, "rewards/margins": 8.45211410522461, "rewards/rejected": 0.47887611389160156, "step": 310 }, { "epoch": 0.16097308488612835, "grad_norm": 2.4862828254699707, "learning_rate": 9.99964107745923e-06, "loss": 0.27143576741218567, "rewards/accuracies": 0.8125, "rewards/chosen": 9.938273429870605, "rewards/margins": 9.367141723632812, "rewards/rejected": 0.5710122585296631, "step": 311 }, { "epoch": 0.16149068322981366, "grad_norm": 3.7003300189971924, "learning_rate": 9.999606080936337e-06, "loss": 0.2803800702095032, "rewards/accuracies": 0.875, "rewards/chosen": 10.976669311523438, "rewards/margins": 10.58298110961914, "rewards/rejected": 0.3917818069458008, "step": 312 }, { "epoch": 0.16200828157349897, "grad_norm": 1.5716559886932373, "learning_rate": 9.99956945675563e-06, "loss": 0.27279821038246155, "rewards/accuracies": 0.828125, "rewards/chosen": 8.630704879760742, "rewards/margins": 8.127124786376953, "rewards/rejected": 0.5038738250732422, "step": 313 }, { "epoch": 0.16252587991718426, "grad_norm": 1.1252890825271606, "learning_rate": 9.999531204929033e-06, "loss": 0.19315248727798462, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.852890014648438, "rewards/margins": 11.474533081054688, "rewards/rejected": 0.38113975524902344, "step": 314 }, { "epoch": 0.16304347826086957, "grad_norm": 1.633805513381958, "learning_rate": 9.999491325469003e-06, "loss": 0.1886914074420929, "rewards/accuracies": 0.90625, "rewards/chosen": 11.63082504272461, "rewards/margins": 11.17950439453125, "rewards/rejected": 0.4528520107269287, "step": 315 }, { "epoch": 0.16356107660455488, "grad_norm": 2.0420100688934326, "learning_rate": 9.999449818388518e-06, "loss": 0.23143571615219116, "rewards/accuracies": 0.8671875, "rewards/chosen": 12.75953483581543, "rewards/margins": 12.52484130859375, "rewards/rejected": 0.23691654205322266, "step": 316 }, { "epoch": 0.16407867494824016, "grad_norm": 0.9941200613975525, "learning_rate": 9.999406683701093e-06, "loss": 0.13080629706382751, "rewards/accuracies": 0.9375, "rewards/chosen": 15.070016860961914, "rewards/margins": 14.676895141601562, "rewards/rejected": 0.3921537399291992, "step": 317 }, { "epoch": 0.16459627329192547, "grad_norm": 1.9598439931869507, "learning_rate": 9.999361921420771e-06, "loss": 0.21350613236427307, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.296974182128906, "rewards/margins": 12.725784301757812, "rewards/rejected": 0.5735445022583008, "step": 318 }, { "epoch": 0.16511387163561075, "grad_norm": 2.8362462520599365, "learning_rate": 9.999315531562123e-06, "loss": 0.15801352262496948, "rewards/accuracies": 0.921875, "rewards/chosen": 15.509181022644043, "rewards/margins": 15.211814880371094, "rewards/rejected": 0.2960472106933594, "step": 319 }, { "epoch": 0.16563146997929606, "grad_norm": 1.6333720684051514, "learning_rate": 9.999267514140253e-06, "loss": 0.1580343395471573, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.55813217163086, "rewards/margins": 14.402740478515625, "rewards/rejected": 0.15900850296020508, "step": 320 }, { "epoch": 0.16614906832298137, "grad_norm": 2.437072515487671, "learning_rate": 9.999217869170794e-06, "loss": 0.21139870584011078, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.434356689453125, "rewards/margins": 15.872230529785156, "rewards/rejected": 0.5646119117736816, "step": 321 }, { "epoch": 0.16666666666666666, "grad_norm": 3.344059944152832, "learning_rate": 9.99916659666991e-06, "loss": 0.3240286707878113, "rewards/accuracies": 0.875, "rewards/chosen": 14.506539344787598, "rewards/margins": 13.831802368164062, "rewards/rejected": 0.6699380874633789, "step": 322 }, { "epoch": 0.16718426501035197, "grad_norm": 2.1693432331085205, "learning_rate": 9.999113696654287e-06, "loss": 0.23359832167625427, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.211593627929688, "rewards/margins": 14.211647033691406, "rewards/rejected": 0.0037016868591308594, "step": 323 }, { "epoch": 0.16770186335403728, "grad_norm": 2.510045289993286, "learning_rate": 9.999059169141151e-06, "loss": 0.21890634298324585, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.131677627563477, "rewards/margins": 12.689567565917969, "rewards/rejected": 0.4416086673736572, "step": 324 }, { "epoch": 0.16821946169772256, "grad_norm": 1.6228848695755005, "learning_rate": 9.999003014148256e-06, "loss": 0.21170870959758759, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.339622497558594, "rewards/margins": 10.120288848876953, "rewards/rejected": 0.22220993041992188, "step": 325 }, { "epoch": 0.16873706004140787, "grad_norm": 1.8136521577835083, "learning_rate": 9.998945231693879e-06, "loss": 0.19252490997314453, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.230788230895996, "rewards/margins": 11.000228881835938, "rewards/rejected": 0.2291126251220703, "step": 326 }, { "epoch": 0.16925465838509315, "grad_norm": 1.6939325332641602, "learning_rate": 9.998885821796835e-06, "loss": 0.18224065005779266, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.126434326171875, "rewards/margins": 8.990684509277344, "rewards/rejected": 0.13754558563232422, "step": 327 }, { "epoch": 0.16977225672877846, "grad_norm": 1.2913175821304321, "learning_rate": 9.998824784476461e-06, "loss": 0.17218409478664398, "rewards/accuracies": 0.90625, "rewards/chosen": 9.95547866821289, "rewards/margins": 9.616344451904297, "rewards/rejected": 0.33854568004608154, "step": 328 }, { "epoch": 0.17028985507246377, "grad_norm": 1.2791138887405396, "learning_rate": 9.998762119752635e-06, "loss": 0.20179453492164612, "rewards/accuracies": 0.90625, "rewards/chosen": 11.608981132507324, "rewards/margins": 11.17465591430664, "rewards/rejected": 0.43747615814208984, "step": 329 }, { "epoch": 0.17080745341614906, "grad_norm": 1.505529522895813, "learning_rate": 9.998697827645752e-06, "loss": 0.21974706649780273, "rewards/accuracies": 0.921875, "rewards/chosen": 12.102209091186523, "rewards/margins": 11.131027221679688, "rewards/rejected": 0.9728975296020508, "step": 330 }, { "epoch": 0.17132505175983437, "grad_norm": 2.2732365131378174, "learning_rate": 9.998631908176747e-06, "loss": 0.28188470005989075, "rewards/accuracies": 0.859375, "rewards/chosen": 12.393178939819336, "rewards/margins": 11.537422180175781, "rewards/rejected": 0.8632965087890625, "step": 331 }, { "epoch": 0.17184265010351968, "grad_norm": 3.025439500808716, "learning_rate": 9.998564361367075e-06, "loss": 0.27506425976753235, "rewards/accuracies": 0.859375, "rewards/chosen": 12.576831817626953, "rewards/margins": 11.593017578125, "rewards/rejected": 0.982973575592041, "step": 332 }, { "epoch": 0.17236024844720496, "grad_norm": 1.2333215475082397, "learning_rate": 9.998495187238733e-06, "loss": 0.19102880358695984, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.156707763671875, "rewards/margins": 10.08102035522461, "rewards/rejected": 1.0786542892456055, "step": 333 }, { "epoch": 0.17287784679089027, "grad_norm": 1.376479148864746, "learning_rate": 9.998424385814236e-06, "loss": 0.20910987257957458, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.748714447021484, "rewards/margins": 8.839920043945312, "rewards/rejected": 0.9079318046569824, "step": 334 }, { "epoch": 0.17339544513457558, "grad_norm": 1.7545956373214722, "learning_rate": 9.998351957116637e-06, "loss": 0.2040308713912964, "rewards/accuracies": 0.90625, "rewards/chosen": 10.64791488647461, "rewards/margins": 9.295364379882812, "rewards/rejected": 1.352029800415039, "step": 335 }, { "epoch": 0.17391304347826086, "grad_norm": 1.8316025733947754, "learning_rate": 9.998277901169512e-06, "loss": 0.15127389132976532, "rewards/accuracies": 0.9296875, "rewards/chosen": 12.640644073486328, "rewards/margins": 10.932846069335938, "rewards/rejected": 1.7102813720703125, "step": 336 }, { "epoch": 0.17443064182194618, "grad_norm": 1.5782065391540527, "learning_rate": 9.998202217996976e-06, "loss": 0.23237654566764832, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.160348892211914, "rewards/margins": 7.832319259643555, "rewards/rejected": 1.3296339511871338, "step": 337 }, { "epoch": 0.17494824016563146, "grad_norm": 1.6015006303787231, "learning_rate": 9.998124907623663e-06, "loss": 0.19967813789844513, "rewards/accuracies": 0.875, "rewards/chosen": 10.612103462219238, "rewards/margins": 8.843326568603516, "rewards/rejected": 1.7653677463531494, "step": 338 }, { "epoch": 0.17546583850931677, "grad_norm": 1.7383421659469604, "learning_rate": 9.998045970074745e-06, "loss": 0.20289386808872223, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.058679580688477, "rewards/margins": 9.736854553222656, "rewards/rejected": 2.3227624893188477, "step": 339 }, { "epoch": 0.17598343685300208, "grad_norm": 1.3404197692871094, "learning_rate": 9.99796540537592e-06, "loss": 0.16027720272541046, "rewards/accuracies": 0.9453125, "rewards/chosen": 11.329574584960938, "rewards/margins": 9.196929931640625, "rewards/rejected": 2.1327438354492188, "step": 340 }, { "epoch": 0.17650103519668736, "grad_norm": 1.2948949337005615, "learning_rate": 9.997883213553416e-06, "loss": 0.1458686888217926, "rewards/accuracies": 0.9375, "rewards/chosen": 12.262645721435547, "rewards/margins": 9.696868896484375, "rewards/rejected": 2.565678596496582, "step": 341 }, { "epoch": 0.17701863354037267, "grad_norm": 2.5847525596618652, "learning_rate": 9.997799394633992e-06, "loss": 0.22541622817516327, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.053245544433594, "rewards/margins": 9.299407958984375, "rewards/rejected": 2.754680633544922, "step": 342 }, { "epoch": 0.17753623188405798, "grad_norm": 3.2882492542266846, "learning_rate": 9.997713948644934e-06, "loss": 0.252351313829422, "rewards/accuracies": 0.875, "rewards/chosen": 12.288223266601562, "rewards/margins": 9.166244506835938, "rewards/rejected": 3.12213134765625, "step": 343 }, { "epoch": 0.17805383022774326, "grad_norm": 1.8604824542999268, "learning_rate": 9.99762687561406e-06, "loss": 0.20315676927566528, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.612213134765625, "rewards/margins": 8.287498474121094, "rewards/rejected": 3.325885772705078, "step": 344 }, { "epoch": 0.17857142857142858, "grad_norm": 1.4152722358703613, "learning_rate": 9.997538175569719e-06, "loss": 0.16665785014629364, "rewards/accuracies": 0.9296875, "rewards/chosen": 12.358062744140625, "rewards/margins": 8.690032958984375, "rewards/rejected": 3.6708984375, "step": 345 }, { "epoch": 0.17908902691511386, "grad_norm": 1.77675199508667, "learning_rate": 9.997447848540788e-06, "loss": 0.1813628226518631, "rewards/accuracies": 0.921875, "rewards/chosen": 9.804885864257812, "rewards/margins": 6.5296783447265625, "rewards/rejected": 3.2741832733154297, "step": 346 }, { "epoch": 0.17960662525879917, "grad_norm": 1.2923022508621216, "learning_rate": 9.997355894556671e-06, "loss": 0.15573357045650482, "rewards/accuracies": 0.9140625, "rewards/chosen": 8.522613525390625, "rewards/margins": 6.199371337890625, "rewards/rejected": 2.323162078857422, "step": 347 }, { "epoch": 0.18012422360248448, "grad_norm": 1.5173183679580688, "learning_rate": 9.997262313647308e-06, "loss": 0.18820320069789886, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.9517669677734375, "rewards/margins": 6.2382659912109375, "rewards/rejected": 1.7120490074157715, "step": 348 }, { "epoch": 0.18064182194616976, "grad_norm": 2.8092143535614014, "learning_rate": 9.99716710584316e-06, "loss": 0.2582106292247772, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.323287963867188, "rewards/margins": 6.2989501953125, "rewards/rejected": 2.024038314819336, "step": 349 }, { "epoch": 0.18115942028985507, "grad_norm": 1.2790024280548096, "learning_rate": 9.997070271175227e-06, "loss": 0.16617293655872345, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.64833927154541, "rewards/margins": 8.0113525390625, "rewards/rejected": 1.6348623037338257, "step": 350 }, { "epoch": 0.18167701863354038, "grad_norm": 1.7790294885635376, "learning_rate": 9.996971809675032e-06, "loss": 0.18131229281425476, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.440765380859375, "rewards/margins": 8.013008117675781, "rewards/rejected": 1.4263200759887695, "step": 351 }, { "epoch": 0.18219461697722567, "grad_norm": 1.5023657083511353, "learning_rate": 9.99687172137463e-06, "loss": 0.16100716590881348, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.855876922607422, "rewards/margins": 8.887237548828125, "rewards/rejected": 0.9661502838134766, "step": 352 }, { "epoch": 0.18271221532091098, "grad_norm": 2.2923319339752197, "learning_rate": 9.996770006306606e-06, "loss": 0.18080009520053864, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.211082458496094, "rewards/margins": 9.5692138671875, "rewards/rejected": 0.6435012817382812, "step": 353 }, { "epoch": 0.18322981366459629, "grad_norm": 1.578690528869629, "learning_rate": 9.996666664504075e-06, "loss": 0.14962731301784515, "rewards/accuracies": 0.921875, "rewards/chosen": 10.620281219482422, "rewards/margins": 10.329055786132812, "rewards/rejected": 0.2926054000854492, "step": 354 }, { "epoch": 0.18374741200828157, "grad_norm": 3.8335375785827637, "learning_rate": 9.99656169600068e-06, "loss": 0.31179916858673096, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.60049057006836, "rewards/margins": 13.542915344238281, "rewards/rejected": 1.057931900024414, "step": 355 }, { "epoch": 0.18426501035196688, "grad_norm": 2.479617118835449, "learning_rate": 9.996455100830594e-06, "loss": 0.1816507875919342, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.769702911376953, "rewards/margins": 13.879547119140625, "rewards/rejected": 0.8872184753417969, "step": 356 }, { "epoch": 0.18478260869565216, "grad_norm": 3.726712226867676, "learning_rate": 9.996346879028518e-06, "loss": 0.22479361295700073, "rewards/accuracies": 0.90625, "rewards/chosen": 16.189884185791016, "rewards/margins": 15.350006103515625, "rewards/rejected": 0.8413114547729492, "step": 357 }, { "epoch": 0.18530020703933747, "grad_norm": 4.456371307373047, "learning_rate": 9.996237030629686e-06, "loss": 0.2622082531452179, "rewards/accuracies": 0.875, "rewards/chosen": 16.42713165283203, "rewards/margins": 15.763959884643555, "rewards/rejected": 0.6623954772949219, "step": 358 }, { "epoch": 0.18581780538302278, "grad_norm": 1.4591349363327026, "learning_rate": 9.99612555566986e-06, "loss": 0.15157286822795868, "rewards/accuracies": 0.9453125, "rewards/chosen": 15.799739837646484, "rewards/margins": 15.119644165039062, "rewards/rejected": 0.67913818359375, "step": 359 }, { "epoch": 0.18633540372670807, "grad_norm": 2.923647880554199, "learning_rate": 9.996012454185333e-06, "loss": 0.2757996916770935, "rewards/accuracies": 0.890625, "rewards/chosen": 18.97726058959961, "rewards/margins": 17.2359619140625, "rewards/rejected": 1.7409939765930176, "step": 360 }, { "epoch": 0.18685300207039338, "grad_norm": 1.7121108770370483, "learning_rate": 9.995897726212924e-06, "loss": 0.1595596969127655, "rewards/accuracies": 0.921875, "rewards/chosen": 19.00571060180664, "rewards/margins": 17.292083740234375, "rewards/rejected": 1.7162666320800781, "step": 361 }, { "epoch": 0.1873706004140787, "grad_norm": 3.7420847415924072, "learning_rate": 9.995781371789986e-06, "loss": 0.21050846576690674, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.931259155273438, "rewards/margins": 17.05467987060547, "rewards/rejected": 0.8709416389465332, "step": 362 }, { "epoch": 0.18788819875776397, "grad_norm": 1.858075737953186, "learning_rate": 9.995663390954395e-06, "loss": 0.18917763233184814, "rewards/accuracies": 0.890625, "rewards/chosen": 21.06511688232422, "rewards/margins": 19.60382080078125, "rewards/rejected": 1.4581542015075684, "step": 363 }, { "epoch": 0.18840579710144928, "grad_norm": 1.6430879831314087, "learning_rate": 9.995543783744562e-06, "loss": 0.21693730354309082, "rewards/accuracies": 0.875, "rewards/chosen": 20.37842559814453, "rewards/margins": 18.410781860351562, "rewards/rejected": 1.9661369323730469, "step": 364 }, { "epoch": 0.18892339544513456, "grad_norm": 1.7680705785751343, "learning_rate": 9.995422550199427e-06, "loss": 0.1891089379787445, "rewards/accuracies": 0.875, "rewards/chosen": 21.33560562133789, "rewards/margins": 19.543128967285156, "rewards/rejected": 1.781900405883789, "step": 365 }, { "epoch": 0.18944099378881987, "grad_norm": 1.085108995437622, "learning_rate": 9.995299690358459e-06, "loss": 0.16534316539764404, "rewards/accuracies": 0.921875, "rewards/chosen": 24.20030975341797, "rewards/margins": 21.33423614501953, "rewards/rejected": 2.8659396171569824, "step": 366 }, { "epoch": 0.18995859213250518, "grad_norm": 1.5297446250915527, "learning_rate": 9.995175204261653e-06, "loss": 0.18201197683811188, "rewards/accuracies": 0.90625, "rewards/chosen": 22.443302154541016, "rewards/margins": 20.953231811523438, "rewards/rejected": 1.4909675121307373, "step": 367 }, { "epoch": 0.19047619047619047, "grad_norm": 1.61960768699646, "learning_rate": 9.99504909194954e-06, "loss": 0.1733381301164627, "rewards/accuracies": 0.90625, "rewards/chosen": 26.969810485839844, "rewards/margins": 24.2742919921875, "rewards/rejected": 2.694276809692383, "step": 368 }, { "epoch": 0.19099378881987578, "grad_norm": 1.8131269216537476, "learning_rate": 9.994921353463175e-06, "loss": 0.18072715401649475, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.9658145904541, "rewards/margins": 24.696876525878906, "rewards/rejected": 2.273747682571411, "step": 369 }, { "epoch": 0.1915113871635611, "grad_norm": 2.1326169967651367, "learning_rate": 9.994791988844143e-06, "loss": 0.24823354184627533, "rewards/accuracies": 0.890625, "rewards/chosen": 23.82813262939453, "rewards/margins": 21.426891326904297, "rewards/rejected": 2.4044876098632812, "step": 370 }, { "epoch": 0.19202898550724637, "grad_norm": 1.643402099609375, "learning_rate": 9.994660998134562e-06, "loss": 0.13940924406051636, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.58007049560547, "rewards/margins": 24.0789794921875, "rewards/rejected": 1.503767967224121, "step": 371 }, { "epoch": 0.19254658385093168, "grad_norm": 1.3466029167175293, "learning_rate": 9.994528381377076e-06, "loss": 0.18371237814426422, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.24700164794922, "rewards/margins": 21.92211151123047, "rewards/rejected": 1.3284080028533936, "step": 372 }, { "epoch": 0.193064182194617, "grad_norm": 2.21983003616333, "learning_rate": 9.994394138614859e-06, "loss": 0.2076111137866974, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.299638748168945, "rewards/margins": 24.607406616210938, "rewards/rejected": 1.6921329498291016, "step": 373 }, { "epoch": 0.19358178053830227, "grad_norm": 1.792041301727295, "learning_rate": 9.994258269891614e-06, "loss": 0.2718816101551056, "rewards/accuracies": 0.859375, "rewards/chosen": 19.334003448486328, "rewards/margins": 18.58306884765625, "rewards/rejected": 0.7501411437988281, "step": 374 }, { "epoch": 0.19409937888198758, "grad_norm": 2.5320253372192383, "learning_rate": 9.994120775251575e-06, "loss": 0.2494504302740097, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.945449829101562, "rewards/margins": 16.439971923828125, "rewards/rejected": 0.5043797492980957, "step": 375 }, { "epoch": 0.19461697722567287, "grad_norm": 1.569858431816101, "learning_rate": 9.993981654739506e-06, "loss": 0.15126332640647888, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.75914192199707, "rewards/margins": 16.8184814453125, "rewards/rejected": -0.06337642669677734, "step": 376 }, { "epoch": 0.19513457556935818, "grad_norm": 1.335792899131775, "learning_rate": 9.993840908400696e-06, "loss": 0.1884060800075531, "rewards/accuracies": 0.921875, "rewards/chosen": 16.399566650390625, "rewards/margins": 16.384849548339844, "rewards/rejected": 0.013163566589355469, "step": 377 }, { "epoch": 0.1956521739130435, "grad_norm": 2.4782066345214844, "learning_rate": 9.993698536280965e-06, "loss": 0.14569184184074402, "rewards/accuracies": 0.9453125, "rewards/chosen": 13.575206756591797, "rewards/margins": 14.002067565917969, "rewards/rejected": -0.42243385314941406, "step": 378 }, { "epoch": 0.19616977225672877, "grad_norm": 4.402243137359619, "learning_rate": 9.993554538426667e-06, "loss": 0.20770612359046936, "rewards/accuracies": 0.875, "rewards/chosen": 13.718903541564941, "rewards/margins": 14.359001159667969, "rewards/rejected": -0.6373879909515381, "step": 379 }, { "epoch": 0.19668737060041408, "grad_norm": 1.4924671649932861, "learning_rate": 9.99340891488468e-06, "loss": 0.1259123533964157, "rewards/accuracies": 0.921875, "rewards/chosen": 17.368680953979492, "rewards/margins": 18.28443145751953, "rewards/rejected": -0.9108567237854004, "step": 380 }, { "epoch": 0.1972049689440994, "grad_norm": 3.3061087131500244, "learning_rate": 9.993261665702412e-06, "loss": 0.3112160563468933, "rewards/accuracies": 0.828125, "rewards/chosen": 13.71865463256836, "rewards/margins": 14.607330322265625, "rewards/rejected": -0.8920221328735352, "step": 381 }, { "epoch": 0.19772256728778467, "grad_norm": 1.2219380140304565, "learning_rate": 9.993112790927801e-06, "loss": 0.13373172283172607, "rewards/accuracies": 0.953125, "rewards/chosen": 13.922248840332031, "rewards/margins": 15.089492797851562, "rewards/rejected": -1.1685171127319336, "step": 382 }, { "epoch": 0.19824016563146998, "grad_norm": 3.0634233951568604, "learning_rate": 9.992962290609317e-06, "loss": 0.20828449726104736, "rewards/accuracies": 0.8671875, "rewards/chosen": 15.99919319152832, "rewards/margins": 16.841400146484375, "rewards/rejected": -0.8398046493530273, "step": 383 }, { "epoch": 0.19875776397515527, "grad_norm": 2.281729221343994, "learning_rate": 9.992810164795951e-06, "loss": 0.3232704699039459, "rewards/accuracies": 0.828125, "rewards/chosen": 14.326903343200684, "rewards/margins": 15.321563720703125, "rewards/rejected": -0.998903751373291, "step": 384 }, { "epoch": 0.19927536231884058, "grad_norm": 1.5694836378097534, "learning_rate": 9.992656413537234e-06, "loss": 0.1992853879928589, "rewards/accuracies": 0.921875, "rewards/chosen": 16.637168884277344, "rewards/margins": 17.798629760742188, "rewards/rejected": -1.1677217483520508, "step": 385 }, { "epoch": 0.1997929606625259, "grad_norm": 1.9771742820739746, "learning_rate": 9.992501036883219e-06, "loss": 0.24153347313404083, "rewards/accuracies": 0.859375, "rewards/chosen": 12.987592697143555, "rewards/margins": 13.818164825439453, "rewards/rejected": -0.8291244506835938, "step": 386 }, { "epoch": 0.20031055900621117, "grad_norm": 4.257767200469971, "learning_rate": 9.992344034884489e-06, "loss": 0.23192353546619415, "rewards/accuracies": 0.875, "rewards/chosen": 19.12600326538086, "rewards/margins": 19.867652893066406, "rewards/rejected": -0.7443099021911621, "step": 387 }, { "epoch": 0.20082815734989648, "grad_norm": 1.4571160078048706, "learning_rate": 9.992185407592157e-06, "loss": 0.17928430438041687, "rewards/accuracies": 0.890625, "rewards/chosen": 20.68939208984375, "rewards/margins": 21.232208251953125, "rewards/rejected": -0.5421361923217773, "step": 388 }, { "epoch": 0.2013457556935818, "grad_norm": 1.3563796281814575, "learning_rate": 9.992025155057867e-06, "loss": 0.2237900048494339, "rewards/accuracies": 0.890625, "rewards/chosen": 18.23358726501465, "rewards/margins": 18.67279052734375, "rewards/rejected": -0.4358501434326172, "step": 389 }, { "epoch": 0.20186335403726707, "grad_norm": 2.886350154876709, "learning_rate": 9.991863277333792e-06, "loss": 0.27317994832992554, "rewards/accuracies": 0.8046875, "rewards/chosen": 16.327821731567383, "rewards/margins": 16.874855041503906, "rewards/rejected": -0.5494664907455444, "step": 390 }, { "epoch": 0.20238095238095238, "grad_norm": 2.2978427410125732, "learning_rate": 9.991699774472626e-06, "loss": 0.1686873883008957, "rewards/accuracies": 0.9375, "rewards/chosen": 20.182857513427734, "rewards/margins": 20.350540161132812, "rewards/rejected": -0.17121124267578125, "step": 391 }, { "epoch": 0.2028985507246377, "grad_norm": 2.6083946228027344, "learning_rate": 9.991534646527605e-06, "loss": 0.24029187858104706, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.072166442871094, "rewards/margins": 21.115741729736328, "rewards/rejected": -0.05040550231933594, "step": 392 }, { "epoch": 0.20341614906832298, "grad_norm": 2.733712673187256, "learning_rate": 9.991367893552483e-06, "loss": 0.2587130069732666, "rewards/accuracies": 0.828125, "rewards/chosen": 19.513450622558594, "rewards/margins": 19.936050415039062, "rewards/rejected": -0.4243645668029785, "step": 393 }, { "epoch": 0.2039337474120083, "grad_norm": 1.3802770376205444, "learning_rate": 9.991199515601551e-06, "loss": 0.17780299484729767, "rewards/accuracies": 0.90625, "rewards/chosen": 19.27436637878418, "rewards/margins": 19.380809783935547, "rewards/rejected": -0.10560989379882812, "step": 394 }, { "epoch": 0.20445134575569357, "grad_norm": 1.1477593183517456, "learning_rate": 9.991029512729625e-06, "loss": 0.1817449927330017, "rewards/accuracies": 0.890625, "rewards/chosen": 19.683494567871094, "rewards/margins": 19.89611053466797, "rewards/rejected": -0.2248544692993164, "step": 395 }, { "epoch": 0.20496894409937888, "grad_norm": 1.6946632862091064, "learning_rate": 9.99085788499205e-06, "loss": 0.15596336126327515, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.543920516967773, "rewards/margins": 22.629981994628906, "rewards/rejected": -0.08704662322998047, "step": 396 }, { "epoch": 0.2054865424430642, "grad_norm": 1.4592242240905762, "learning_rate": 9.9906846324447e-06, "loss": 0.20626920461654663, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.67060089111328, "rewards/margins": 20.294445037841797, "rewards/rejected": 0.3773367404937744, "step": 397 }, { "epoch": 0.20600414078674947, "grad_norm": 1.5067862272262573, "learning_rate": 9.99050975514398e-06, "loss": 0.2505643963813782, "rewards/accuracies": 0.84375, "rewards/chosen": 20.879840850830078, "rewards/margins": 20.299381256103516, "rewards/rejected": 0.5763545036315918, "step": 398 }, { "epoch": 0.20652173913043478, "grad_norm": 2.10825252532959, "learning_rate": 9.990333253146821e-06, "loss": 0.34345221519470215, "rewards/accuracies": 0.8359375, "rewards/chosen": 15.663850784301758, "rewards/margins": 15.808143615722656, "rewards/rejected": -0.1464991569519043, "step": 399 }, { "epoch": 0.2070393374741201, "grad_norm": 2.9493205547332764, "learning_rate": 9.990155126510686e-06, "loss": 0.22711236774921417, "rewards/accuracies": 0.8671875, "rewards/chosen": 23.468463897705078, "rewards/margins": 22.635787963867188, "rewards/rejected": 0.8276259899139404, "step": 400 }, { "epoch": 0.20755693581780538, "grad_norm": 3.0505855083465576, "learning_rate": 9.989975375293564e-06, "loss": 0.23640576004981995, "rewards/accuracies": 0.890625, "rewards/chosen": 23.778217315673828, "rewards/margins": 22.44412612915039, "rewards/rejected": 1.3329200744628906, "step": 401 }, { "epoch": 0.2080745341614907, "grad_norm": 3.0469703674316406, "learning_rate": 9.989793999553977e-06, "loss": 0.24279874563217163, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.08559799194336, "rewards/margins": 20.919677734375, "rewards/rejected": 0.1619720458984375, "step": 402 }, { "epoch": 0.20859213250517597, "grad_norm": 2.9994211196899414, "learning_rate": 9.98961099935097e-06, "loss": 0.23942670226097107, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.6706485748291, "rewards/margins": 21.258956909179688, "rewards/rejected": 0.41832971572875977, "step": 403 }, { "epoch": 0.20910973084886128, "grad_norm": 3.7607855796813965, "learning_rate": 9.98942637474412e-06, "loss": 0.25371798872947693, "rewards/accuracies": 0.8671875, "rewards/chosen": 20.630084991455078, "rewards/margins": 20.473770141601562, "rewards/rejected": 0.15196847915649414, "step": 404 }, { "epoch": 0.2096273291925466, "grad_norm": 2.831838846206665, "learning_rate": 9.989240125793536e-06, "loss": 0.2296416014432907, "rewards/accuracies": 0.8828125, "rewards/chosen": 18.792057037353516, "rewards/margins": 18.058143615722656, "rewards/rejected": 0.7318172454833984, "step": 405 }, { "epoch": 0.21014492753623187, "grad_norm": 1.4647343158721924, "learning_rate": 9.98905225255985e-06, "loss": 0.17252878844738007, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.33477210998535, "rewards/margins": 23.436290740966797, "rewards/rejected": 0.9053256511688232, "step": 406 }, { "epoch": 0.21066252587991718, "grad_norm": 3.6364035606384277, "learning_rate": 9.988862755104227e-06, "loss": 0.24211207032203674, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.263246536254883, "rewards/margins": 22.202861785888672, "rewards/rejected": 1.0633516311645508, "step": 407 }, { "epoch": 0.2111801242236025, "grad_norm": 1.8441115617752075, "learning_rate": 9.988671633488359e-06, "loss": 0.24533091485500336, "rewards/accuracies": 0.859375, "rewards/chosen": 22.291698455810547, "rewards/margins": 20.771739959716797, "rewards/rejected": 1.5182685852050781, "step": 408 }, { "epoch": 0.21169772256728778, "grad_norm": 1.2402875423431396, "learning_rate": 9.988478887774465e-06, "loss": 0.1769273281097412, "rewards/accuracies": 0.890625, "rewards/chosen": 24.738849639892578, "rewards/margins": 23.54839324951172, "rewards/rejected": 1.1896870136260986, "step": 409 }, { "epoch": 0.2122153209109731, "grad_norm": 1.193552851676941, "learning_rate": 9.988284518025298e-06, "loss": 0.17090292274951935, "rewards/accuracies": 0.9375, "rewards/chosen": 23.41476058959961, "rewards/margins": 21.467041015625, "rewards/rejected": 1.9477100372314453, "step": 410 }, { "epoch": 0.2127329192546584, "grad_norm": 1.578840732574463, "learning_rate": 9.988088524304134e-06, "loss": 0.2197371870279312, "rewards/accuracies": 0.890625, "rewards/chosen": 23.75815200805664, "rewards/margins": 21.088285446166992, "rewards/rejected": 2.6741979122161865, "step": 411 }, { "epoch": 0.21325051759834368, "grad_norm": 2.463118076324463, "learning_rate": 9.987890906674782e-06, "loss": 0.2336515188217163, "rewards/accuracies": 0.8515625, "rewards/chosen": 25.073854446411133, "rewards/margins": 23.444259643554688, "rewards/rejected": 1.6387405395507812, "step": 412 }, { "epoch": 0.213768115942029, "grad_norm": 1.665520429611206, "learning_rate": 9.987691665201575e-06, "loss": 0.2414189577102661, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.525955200195312, "rewards/margins": 20.752124786376953, "rewards/rejected": 0.7640690803527832, "step": 413 }, { "epoch": 0.21428571428571427, "grad_norm": 1.2811424732208252, "learning_rate": 9.987490799949382e-06, "loss": 0.1447194218635559, "rewards/accuracies": 0.953125, "rewards/chosen": 25.750186920166016, "rewards/margins": 24.137557983398438, "rewards/rejected": 1.611318826675415, "step": 414 }, { "epoch": 0.21480331262939958, "grad_norm": 2.260317325592041, "learning_rate": 9.987288310983592e-06, "loss": 0.22364425659179688, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.215505599975586, "rewards/margins": 17.782291412353516, "rewards/rejected": 0.4375457763671875, "step": 415 }, { "epoch": 0.2153209109730849, "grad_norm": 1.2754099369049072, "learning_rate": 9.987084198370128e-06, "loss": 0.1803591251373291, "rewards/accuracies": 0.90625, "rewards/chosen": 21.81165313720703, "rewards/margins": 21.151710510253906, "rewards/rejected": 0.6608614921569824, "step": 416 }, { "epoch": 0.21583850931677018, "grad_norm": 1.8743205070495605, "learning_rate": 9.986878462175443e-06, "loss": 0.24019378423690796, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.875835418701172, "rewards/margins": 17.161720275878906, "rewards/rejected": 0.7218494415283203, "step": 417 }, { "epoch": 0.2163561076604555, "grad_norm": 2.434114933013916, "learning_rate": 9.98667110246651e-06, "loss": 0.18984957039356232, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.502811431884766, "rewards/margins": 18.45587158203125, "rewards/rejected": 1.0464887619018555, "step": 418 }, { "epoch": 0.2168737060041408, "grad_norm": 2.7789134979248047, "learning_rate": 9.986462119310843e-06, "loss": 0.23309674859046936, "rewards/accuracies": 0.890625, "rewards/chosen": 18.863679885864258, "rewards/margins": 17.80144500732422, "rewards/rejected": 1.0639276504516602, "step": 419 }, { "epoch": 0.21739130434782608, "grad_norm": 2.043396472930908, "learning_rate": 9.986251512776472e-06, "loss": 0.3183768391609192, "rewards/accuracies": 0.8125, "rewards/chosen": 16.445154190063477, "rewards/margins": 15.046501159667969, "rewards/rejected": 1.4007688760757446, "step": 420 }, { "epoch": 0.2179089026915114, "grad_norm": 1.9924782514572144, "learning_rate": 9.986039282931966e-06, "loss": 0.1904914677143097, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.376419067382812, "rewards/margins": 15.387847900390625, "rewards/rejected": 1.9839692115783691, "step": 421 }, { "epoch": 0.21842650103519667, "grad_norm": 2.6096913814544678, "learning_rate": 9.985825429846417e-06, "loss": 0.2185894399881363, "rewards/accuracies": 0.890625, "rewards/chosen": 16.257823944091797, "rewards/margins": 14.5181884765625, "rewards/rejected": 1.7385101318359375, "step": 422 }, { "epoch": 0.21894409937888198, "grad_norm": 1.8904589414596558, "learning_rate": 9.985609953589444e-06, "loss": 0.26365748047828674, "rewards/accuracies": 0.8203125, "rewards/chosen": 16.20684051513672, "rewards/margins": 14.375526428222656, "rewards/rejected": 1.8307762145996094, "step": 423 }, { "epoch": 0.2194616977225673, "grad_norm": 1.2376911640167236, "learning_rate": 9.9853928542312e-06, "loss": 0.22924350202083588, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.579020500183105, "rewards/margins": 12.911178588867188, "rewards/rejected": 1.6674957275390625, "step": 424 }, { "epoch": 0.21997929606625258, "grad_norm": 0.8664976358413696, "learning_rate": 9.985174131842362e-06, "loss": 0.14026525616645813, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.894989013671875, "rewards/margins": 13.18743896484375, "rewards/rejected": 1.7124395370483398, "step": 425 }, { "epoch": 0.2204968944099379, "grad_norm": 1.6311167478561401, "learning_rate": 9.984953786494135e-06, "loss": 0.23952679336071014, "rewards/accuracies": 0.875, "rewards/chosen": 15.647045135498047, "rewards/margins": 13.366180419921875, "rewards/rejected": 2.282093048095703, "step": 426 }, { "epoch": 0.2210144927536232, "grad_norm": 2.0896763801574707, "learning_rate": 9.984731818258258e-06, "loss": 0.24399171769618988, "rewards/accuracies": 0.875, "rewards/chosen": 14.296956062316895, "rewards/margins": 12.3375244140625, "rewards/rejected": 1.9613451957702637, "step": 427 }, { "epoch": 0.22153209109730848, "grad_norm": 1.565897822380066, "learning_rate": 9.984508227206989e-06, "loss": 0.20977577567100525, "rewards/accuracies": 0.8984375, "rewards/chosen": 16.196792602539062, "rewards/margins": 13.981246948242188, "rewards/rejected": 2.215728282928467, "step": 428 }, { "epoch": 0.2220496894409938, "grad_norm": 1.5178693532943726, "learning_rate": 9.984283013413125e-06, "loss": 0.21057718992233276, "rewards/accuracies": 0.90625, "rewards/chosen": 17.04328155517578, "rewards/margins": 14.617279052734375, "rewards/rejected": 2.427814483642578, "step": 429 }, { "epoch": 0.2225672877846791, "grad_norm": 1.4279240369796753, "learning_rate": 9.984056176949981e-06, "loss": 0.24564434587955475, "rewards/accuracies": 0.890625, "rewards/chosen": 16.064979553222656, "rewards/margins": 13.560821533203125, "rewards/rejected": 2.5039498805999756, "step": 430 }, { "epoch": 0.22308488612836438, "grad_norm": 1.1903696060180664, "learning_rate": 9.98382771789141e-06, "loss": 0.20068907737731934, "rewards/accuracies": 0.90625, "rewards/chosen": 16.23546600341797, "rewards/margins": 13.895767211914062, "rewards/rejected": 2.340129852294922, "step": 431 }, { "epoch": 0.2236024844720497, "grad_norm": 1.1284343004226685, "learning_rate": 9.983597636311786e-06, "loss": 0.20926332473754883, "rewards/accuracies": 0.875, "rewards/chosen": 19.186168670654297, "rewards/margins": 16.451416015625, "rewards/rejected": 2.7363245487213135, "step": 432 }, { "epoch": 0.22412008281573498, "grad_norm": 1.0745435953140259, "learning_rate": 9.983365932286015e-06, "loss": 0.18114331364631653, "rewards/accuracies": 0.890625, "rewards/chosen": 18.932743072509766, "rewards/margins": 16.3890380859375, "rewards/rejected": 2.543609619140625, "step": 433 }, { "epoch": 0.2246376811594203, "grad_norm": 1.3133763074874878, "learning_rate": 9.983132605889527e-06, "loss": 0.22046880424022675, "rewards/accuracies": 0.921875, "rewards/chosen": 17.40200424194336, "rewards/margins": 15.459636688232422, "rewards/rejected": 1.938119888305664, "step": 434 }, { "epoch": 0.2251552795031056, "grad_norm": 1.9727643728256226, "learning_rate": 9.982897657198285e-06, "loss": 0.2668749988079071, "rewards/accuracies": 0.859375, "rewards/chosen": 19.55832290649414, "rewards/margins": 16.57635498046875, "rewards/rejected": 2.9879837036132812, "step": 435 }, { "epoch": 0.22567287784679088, "grad_norm": 1.3885324001312256, "learning_rate": 9.98266108628878e-06, "loss": 0.19332611560821533, "rewards/accuracies": 0.8828125, "rewards/chosen": 23.946929931640625, "rewards/margins": 20.573440551757812, "rewards/rejected": 3.3732833862304688, "step": 436 }, { "epoch": 0.2261904761904762, "grad_norm": 1.0520036220550537, "learning_rate": 9.982422893238027e-06, "loss": 0.17317140102386475, "rewards/accuracies": 0.890625, "rewards/chosen": 21.578954696655273, "rewards/margins": 18.844154357910156, "rewards/rejected": 2.737539291381836, "step": 437 }, { "epoch": 0.2267080745341615, "grad_norm": 1.9961613416671753, "learning_rate": 9.98218307812357e-06, "loss": 0.24177545309066772, "rewards/accuracies": 0.890625, "rewards/chosen": 19.822677612304688, "rewards/margins": 17.106361389160156, "rewards/rejected": 2.7129621505737305, "step": 438 }, { "epoch": 0.22722567287784678, "grad_norm": 1.3097484111785889, "learning_rate": 9.981941641023486e-06, "loss": 0.2095707505941391, "rewards/accuracies": 0.875, "rewards/chosen": 19.960559844970703, "rewards/margins": 17.822345733642578, "rewards/rejected": 2.137465000152588, "step": 439 }, { "epoch": 0.2277432712215321, "grad_norm": 1.057589054107666, "learning_rate": 9.981698582016376e-06, "loss": 0.14078965783119202, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.521007537841797, "rewards/margins": 18.62169647216797, "rewards/rejected": 1.9001989364624023, "step": 440 }, { "epoch": 0.22826086956521738, "grad_norm": 2.3683736324310303, "learning_rate": 9.981453901181369e-06, "loss": 0.2754810154438019, "rewards/accuracies": 0.859375, "rewards/chosen": 19.31133270263672, "rewards/margins": 16.86279296875, "rewards/rejected": 2.44449520111084, "step": 441 }, { "epoch": 0.2287784679089027, "grad_norm": 1.482633113861084, "learning_rate": 9.981207598598122e-06, "loss": 0.17820532619953156, "rewards/accuracies": 0.890625, "rewards/chosen": 21.784469604492188, "rewards/margins": 19.254966735839844, "rewards/rejected": 2.5342795848846436, "step": 442 }, { "epoch": 0.229296066252588, "grad_norm": 2.907179594039917, "learning_rate": 9.980959674346822e-06, "loss": 0.2927647531032562, "rewards/accuracies": 0.8515625, "rewards/chosen": 20.255645751953125, "rewards/margins": 17.622390747070312, "rewards/rejected": 2.6336002349853516, "step": 443 }, { "epoch": 0.22981366459627328, "grad_norm": 1.673408031463623, "learning_rate": 9.98071012850818e-06, "loss": 0.1797415167093277, "rewards/accuracies": 0.921875, "rewards/chosen": 22.505203247070312, "rewards/margins": 19.16876792907715, "rewards/rejected": 3.3372364044189453, "step": 444 }, { "epoch": 0.2303312629399586, "grad_norm": 2.051011800765991, "learning_rate": 9.98045896116344e-06, "loss": 0.20134451985359192, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.16704559326172, "rewards/margins": 17.810218811035156, "rewards/rejected": 3.3590872287750244, "step": 445 }, { "epoch": 0.2308488612836439, "grad_norm": 1.7858200073242188, "learning_rate": 9.980206172394369e-06, "loss": 0.18798375129699707, "rewards/accuracies": 0.890625, "rewards/chosen": 22.45452308654785, "rewards/margins": 19.426742553710938, "rewards/rejected": 3.0177955627441406, "step": 446 }, { "epoch": 0.23136645962732919, "grad_norm": 2.5253710746765137, "learning_rate": 9.979951762283266e-06, "loss": 0.2445852905511856, "rewards/accuracies": 0.875, "rewards/chosen": 18.707931518554688, "rewards/margins": 16.83362579345703, "rewards/rejected": 1.871156096458435, "step": 447 }, { "epoch": 0.2318840579710145, "grad_norm": 1.3259180784225464, "learning_rate": 9.979695730912957e-06, "loss": 0.19774147868156433, "rewards/accuracies": 0.90625, "rewards/chosen": 17.449047088623047, "rewards/margins": 15.002677917480469, "rewards/rejected": 2.441375732421875, "step": 448 }, { "epoch": 0.2324016563146998, "grad_norm": 1.0917471647262573, "learning_rate": 9.979438078366792e-06, "loss": 0.16958241164684296, "rewards/accuracies": 0.90625, "rewards/chosen": 15.680097579956055, "rewards/margins": 14.25311279296875, "rewards/rejected": 1.4251885414123535, "step": 449 }, { "epoch": 0.2329192546583851, "grad_norm": 1.074837565422058, "learning_rate": 9.979178804728653e-06, "loss": 0.1377275586128235, "rewards/accuracies": 0.9609375, "rewards/chosen": 15.445392608642578, "rewards/margins": 14.535484313964844, "rewards/rejected": 0.9082059860229492, "step": 450 }, { "epoch": 0.2334368530020704, "grad_norm": 1.5042724609375, "learning_rate": 9.978917910082951e-06, "loss": 0.21743148565292358, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.311351776123047, "rewards/margins": 10.724990844726562, "rewards/rejected": 0.5887451171875, "step": 451 }, { "epoch": 0.23395445134575568, "grad_norm": 1.5947628021240234, "learning_rate": 9.978655394514616e-06, "loss": 0.17858141660690308, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.516464233398438, "rewards/margins": 12.516719818115234, "rewards/rejected": 1.0027446746826172, "step": 452 }, { "epoch": 0.234472049689441, "grad_norm": 1.5098865032196045, "learning_rate": 9.978391258109118e-06, "loss": 0.18337737023830414, "rewards/accuracies": 0.890625, "rewards/chosen": 14.029739379882812, "rewards/margins": 12.395904541015625, "rewards/rejected": 1.6289863586425781, "step": 453 }, { "epoch": 0.2349896480331263, "grad_norm": 1.94092857837677, "learning_rate": 9.978125500952444e-06, "loss": 0.23216362297534943, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.593650817871094, "rewards/margins": 12.179550170898438, "rewards/rejected": 1.413032054901123, "step": 454 }, { "epoch": 0.23550724637681159, "grad_norm": 2.2473223209381104, "learning_rate": 9.977858123131116e-06, "loss": 0.2310502529144287, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.425643920898438, "rewards/margins": 12.661958694458008, "rewards/rejected": 1.762678623199463, "step": 455 }, { "epoch": 0.2360248447204969, "grad_norm": 2.5656816959381104, "learning_rate": 9.97758912473218e-06, "loss": 0.19512373208999634, "rewards/accuracies": 0.90625, "rewards/chosen": 15.931177139282227, "rewards/margins": 14.253936767578125, "rewards/rejected": 1.6728248596191406, "step": 456 }, { "epoch": 0.2365424430641822, "grad_norm": 1.9556019306182861, "learning_rate": 9.977318505843212e-06, "loss": 0.22639548778533936, "rewards/accuracies": 0.8515625, "rewards/chosen": 15.405473709106445, "rewards/margins": 12.888641357421875, "rewards/rejected": 2.517025947570801, "step": 457 }, { "epoch": 0.2370600414078675, "grad_norm": 1.8218886852264404, "learning_rate": 9.97704626655231e-06, "loss": 0.23431620001792908, "rewards/accuracies": 0.859375, "rewards/chosen": 15.320068359375, "rewards/margins": 13.076103210449219, "rewards/rejected": 2.2419090270996094, "step": 458 }, { "epoch": 0.2375776397515528, "grad_norm": 1.7387171983718872, "learning_rate": 9.976772406948107e-06, "loss": 0.17164675891399384, "rewards/accuracies": 0.9375, "rewards/chosen": 12.053359031677246, "rewards/margins": 10.7080078125, "rewards/rejected": 1.3394479751586914, "step": 459 }, { "epoch": 0.23809523809523808, "grad_norm": 1.412011981010437, "learning_rate": 9.976496927119759e-06, "loss": 0.25544273853302, "rewards/accuracies": 0.890625, "rewards/chosen": 11.176074981689453, "rewards/margins": 9.804143905639648, "rewards/rejected": 1.3645212650299072, "step": 460 }, { "epoch": 0.2386128364389234, "grad_norm": 1.1013981103897095, "learning_rate": 9.976219827156948e-06, "loss": 0.13503716886043549, "rewards/accuracies": 0.9453125, "rewards/chosen": 13.00779914855957, "rewards/margins": 10.817405700683594, "rewards/rejected": 2.191300392150879, "step": 461 }, { "epoch": 0.2391304347826087, "grad_norm": 1.4477970600128174, "learning_rate": 9.975941107149889e-06, "loss": 0.1649777889251709, "rewards/accuracies": 0.9140625, "rewards/chosen": 10.388660430908203, "rewards/margins": 9.743965148925781, "rewards/rejected": 0.645416259765625, "step": 462 }, { "epoch": 0.23964803312629399, "grad_norm": 1.9927901029586792, "learning_rate": 9.975660767189322e-06, "loss": 0.2596425414085388, "rewards/accuracies": 0.875, "rewards/chosen": 13.584277153015137, "rewards/margins": 12.123588562011719, "rewards/rejected": 1.4582481384277344, "step": 463 }, { "epoch": 0.2401656314699793, "grad_norm": 1.232395887374878, "learning_rate": 9.975378807366509e-06, "loss": 0.17412787675857544, "rewards/accuracies": 0.921875, "rewards/chosen": 14.06436824798584, "rewards/margins": 12.372367858886719, "rewards/rejected": 1.69281005859375, "step": 464 }, { "epoch": 0.2406832298136646, "grad_norm": 1.5666428804397583, "learning_rate": 9.97509522777325e-06, "loss": 0.24720174074172974, "rewards/accuracies": 0.859375, "rewards/chosen": 15.059967041015625, "rewards/margins": 13.026992797851562, "rewards/rejected": 2.0340919494628906, "step": 465 }, { "epoch": 0.2412008281573499, "grad_norm": 1.0606863498687744, "learning_rate": 9.974810028501861e-06, "loss": 0.14667846262454987, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.612289428710938, "rewards/margins": 14.879043579101562, "rewards/rejected": 2.732410430908203, "step": 466 }, { "epoch": 0.2417184265010352, "grad_norm": 1.5033855438232422, "learning_rate": 9.974523209645195e-06, "loss": 0.1603599637746811, "rewards/accuracies": 0.921875, "rewards/chosen": 18.559185028076172, "rewards/margins": 15.726303100585938, "rewards/rejected": 2.8311634063720703, "step": 467 }, { "epoch": 0.2422360248447205, "grad_norm": 1.2986754179000854, "learning_rate": 9.974234771296624e-06, "loss": 0.2079959511756897, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.53369140625, "rewards/margins": 15.609420776367188, "rewards/rejected": 1.9251670837402344, "step": 468 }, { "epoch": 0.2427536231884058, "grad_norm": 1.354543924331665, "learning_rate": 9.973944713550055e-06, "loss": 0.19028997421264648, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.56753158569336, "rewards/margins": 15.21636962890625, "rewards/rejected": 2.351531982421875, "step": 469 }, { "epoch": 0.2432712215320911, "grad_norm": 4.5658698081970215, "learning_rate": 9.973653036499916e-06, "loss": 0.3075636625289917, "rewards/accuracies": 0.8671875, "rewards/chosen": 15.057229995727539, "rewards/margins": 13.951438903808594, "rewards/rejected": 1.1011285781860352, "step": 470 }, { "epoch": 0.24378881987577639, "grad_norm": 1.066131353378296, "learning_rate": 9.973359740241166e-06, "loss": 0.2068997025489807, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.261648178100586, "rewards/margins": 12.454753875732422, "rewards/rejected": 0.8108353614807129, "step": 471 }, { "epoch": 0.2443064182194617, "grad_norm": 1.6297146081924438, "learning_rate": 9.973064824869288e-06, "loss": 0.20937500894069672, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.211381912231445, "rewards/margins": 11.231582641601562, "rewards/rejected": 1.9819974899291992, "step": 472 }, { "epoch": 0.244824016563147, "grad_norm": 1.9518334865570068, "learning_rate": 9.972768290480294e-06, "loss": 0.22094669938087463, "rewards/accuracies": 0.875, "rewards/chosen": 11.158130645751953, "rewards/margins": 9.466552734375, "rewards/rejected": 1.693960189819336, "step": 473 }, { "epoch": 0.2453416149068323, "grad_norm": 1.4899828433990479, "learning_rate": 9.972470137170724e-06, "loss": 0.22774989902973175, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.551305770874023, "rewards/margins": 7.576255798339844, "rewards/rejected": 0.9779901504516602, "step": 474 }, { "epoch": 0.2458592132505176, "grad_norm": 1.5995782613754272, "learning_rate": 9.972170365037646e-06, "loss": 0.2623751163482666, "rewards/accuracies": 0.8359375, "rewards/chosen": 7.033412933349609, "rewards/margins": 6.444602966308594, "rewards/rejected": 0.5897235870361328, "step": 475 }, { "epoch": 0.2463768115942029, "grad_norm": 1.194636583328247, "learning_rate": 9.971868974178648e-06, "loss": 0.17139342427253723, "rewards/accuracies": 0.921875, "rewards/chosen": 5.719137191772461, "rewards/margins": 5.7759246826171875, "rewards/rejected": -0.056304931640625, "step": 476 }, { "epoch": 0.2468944099378882, "grad_norm": 1.4181357622146606, "learning_rate": 9.971565964691853e-06, "loss": 0.19486898183822632, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.394126892089844, "rewards/margins": 5.3746185302734375, "rewards/rejected": 0.021041393280029297, "step": 477 }, { "epoch": 0.2474120082815735, "grad_norm": 1.4824163913726807, "learning_rate": 9.971261336675909e-06, "loss": 0.19153925776481628, "rewards/accuracies": 0.921875, "rewards/chosen": 4.717862606048584, "rewards/margins": 4.883735656738281, "rewards/rejected": -0.16655349731445312, "step": 478 }, { "epoch": 0.24792960662525879, "grad_norm": 3.375779867172241, "learning_rate": 9.970955090229985e-06, "loss": 0.23943865299224854, "rewards/accuracies": 0.8828125, "rewards/chosen": 4.626834869384766, "rewards/margins": 4.667877197265625, "rewards/rejected": -0.04193687438964844, "step": 479 }, { "epoch": 0.2484472049689441, "grad_norm": 1.6912013292312622, "learning_rate": 9.970647225453788e-06, "loss": 0.16557720303535461, "rewards/accuracies": 0.9296875, "rewards/chosen": 5.435704231262207, "rewards/margins": 5.483673095703125, "rewards/rejected": -0.04781341552734375, "step": 480 }, { "epoch": 0.2489648033126294, "grad_norm": 0.7783794403076172, "learning_rate": 9.970337742447543e-06, "loss": 0.13690656423568726, "rewards/accuracies": 0.9140625, "rewards/chosen": 6.890083312988281, "rewards/margins": 6.47491455078125, "rewards/rejected": 0.4156327247619629, "step": 481 }, { "epoch": 0.2494824016563147, "grad_norm": 1.4833050966262817, "learning_rate": 9.970026641312003e-06, "loss": 0.15868127346038818, "rewards/accuracies": 0.9140625, "rewards/chosen": 7.817325592041016, "rewards/margins": 7.507606506347656, "rewards/rejected": 0.30887603759765625, "step": 482 }, { "epoch": 0.25, "grad_norm": 3.164123058319092, "learning_rate": 9.96971392214845e-06, "loss": 0.24464935064315796, "rewards/accuracies": 0.8671875, "rewards/chosen": 5.963600158691406, "rewards/margins": 6.2571258544921875, "rewards/rejected": -0.29492950439453125, "step": 483 }, { "epoch": 0.2505175983436853, "grad_norm": 2.5477805137634277, "learning_rate": 9.969399585058693e-06, "loss": 0.2598860263824463, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.755904197692871, "rewards/margins": 7.570106506347656, "rewards/rejected": 0.18356478214263916, "step": 484 }, { "epoch": 0.2510351966873706, "grad_norm": 2.0858702659606934, "learning_rate": 9.969083630145065e-06, "loss": 0.17584267258644104, "rewards/accuracies": 0.875, "rewards/chosen": 8.528717041015625, "rewards/margins": 8.506057739257812, "rewards/rejected": 0.022237777709960938, "step": 485 }, { "epoch": 0.2515527950310559, "grad_norm": 0.9318129420280457, "learning_rate": 9.968766057510429e-06, "loss": 0.07950849831104279, "rewards/accuracies": 0.9609375, "rewards/chosen": 11.132513046264648, "rewards/margins": 9.884063720703125, "rewards/rejected": 1.2470027208328247, "step": 486 }, { "epoch": 0.2520703933747412, "grad_norm": 4.179053783416748, "learning_rate": 9.968446867258173e-06, "loss": 0.2385377287864685, "rewards/accuracies": 0.890625, "rewards/chosen": 9.178909301757812, "rewards/margins": 8.474021911621094, "rewards/rejected": 0.7023210525512695, "step": 487 }, { "epoch": 0.2525879917184265, "grad_norm": 2.3199350833892822, "learning_rate": 9.96812605949221e-06, "loss": 0.16660627722740173, "rewards/accuracies": 0.9453125, "rewards/chosen": 10.164794921875, "rewards/margins": 9.707733154296875, "rewards/rejected": 0.4570279121398926, "step": 488 }, { "epoch": 0.2531055900621118, "grad_norm": 2.7379465103149414, "learning_rate": 9.967803634316982e-06, "loss": 0.19203075766563416, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.986185073852539, "rewards/margins": 10.570419311523438, "rewards/rejected": 0.4146547317504883, "step": 489 }, { "epoch": 0.2536231884057971, "grad_norm": 3.113711357116699, "learning_rate": 9.967479591837458e-06, "loss": 0.2908574342727661, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.120677947998047, "rewards/margins": 10.569488525390625, "rewards/rejected": 0.5503854751586914, "step": 490 }, { "epoch": 0.25414078674948243, "grad_norm": 3.1231634616851807, "learning_rate": 9.967153932159133e-06, "loss": 0.28801968693733215, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.177937507629395, "rewards/margins": 11.3468017578125, "rewards/rejected": 0.8304786682128906, "step": 491 }, { "epoch": 0.2546583850931677, "grad_norm": 1.4000405073165894, "learning_rate": 9.966826655388025e-06, "loss": 0.20096413791179657, "rewards/accuracies": 0.890625, "rewards/chosen": 11.270009994506836, "rewards/margins": 10.095260620117188, "rewards/rejected": 1.175994873046875, "step": 492 }, { "epoch": 0.255175983436853, "grad_norm": 1.7292300462722778, "learning_rate": 9.966497761630683e-06, "loss": 0.24622221291065216, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.276634216308594, "rewards/margins": 10.59390640258789, "rewards/rejected": 1.6818037033081055, "step": 493 }, { "epoch": 0.2556935817805383, "grad_norm": 1.1370189189910889, "learning_rate": 9.966167250994181e-06, "loss": 0.1791878640651703, "rewards/accuracies": 0.890625, "rewards/chosen": 10.351832389831543, "rewards/margins": 10.294952392578125, "rewards/rejected": 0.05559539794921875, "step": 494 }, { "epoch": 0.2562111801242236, "grad_norm": 1.1885560750961304, "learning_rate": 9.965835123586118e-06, "loss": 0.21292749047279358, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.618864059448242, "rewards/margins": 10.384674072265625, "rewards/rejected": 0.23369598388671875, "step": 495 }, { "epoch": 0.2567287784679089, "grad_norm": 1.076744794845581, "learning_rate": 9.965501379514622e-06, "loss": 0.22716882824897766, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.093405723571777, "rewards/margins": 10.070274353027344, "rewards/rejected": 0.020294189453125, "step": 496 }, { "epoch": 0.2572463768115942, "grad_norm": 0.9300025701522827, "learning_rate": 9.965166018888343e-06, "loss": 0.22924081981182098, "rewards/accuracies": 0.859375, "rewards/chosen": 10.87254524230957, "rewards/margins": 10.428436279296875, "rewards/rejected": 0.4445066452026367, "step": 497 }, { "epoch": 0.2577639751552795, "grad_norm": 0.8625953793525696, "learning_rate": 9.964829041816464e-06, "loss": 0.2003854513168335, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.0751953125, "rewards/margins": 10.889373779296875, "rewards/rejected": 0.18706321716308594, "step": 498 }, { "epoch": 0.2582815734989648, "grad_norm": 0.7677565217018127, "learning_rate": 9.964490448408687e-06, "loss": 0.217991441488266, "rewards/accuracies": 0.890625, "rewards/chosen": 10.116218566894531, "rewards/margins": 9.85174560546875, "rewards/rejected": 0.27459001541137695, "step": 499 }, { "epoch": 0.2587991718426501, "grad_norm": 0.7766988277435303, "learning_rate": 9.964150238775246e-06, "loss": 0.2133076786994934, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.959606170654297, "rewards/margins": 10.691787719726562, "rewards/rejected": 0.26590538024902344, "step": 500 }, { "epoch": 0.2593167701863354, "grad_norm": 1.475355625152588, "learning_rate": 9.963808413026897e-06, "loss": 0.21650557219982147, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.371070861816406, "rewards/margins": 10.266426086425781, "rewards/rejected": 1.1081180572509766, "step": 501 }, { "epoch": 0.2598343685300207, "grad_norm": 1.6603233814239502, "learning_rate": 9.963464971274924e-06, "loss": 0.22377422451972961, "rewards/accuracies": 0.875, "rewards/chosen": 14.06554889678955, "rewards/margins": 11.283702850341797, "rewards/rejected": 2.7824838161468506, "step": 502 }, { "epoch": 0.260351966873706, "grad_norm": 1.2625937461853027, "learning_rate": 9.963119913631138e-06, "loss": 0.19243265688419342, "rewards/accuracies": 0.90625, "rewards/chosen": 13.396598815917969, "rewards/margins": 10.90151596069336, "rewards/rejected": 2.4970340728759766, "step": 503 }, { "epoch": 0.2608695652173913, "grad_norm": 1.5849553346633911, "learning_rate": 9.962773240207873e-06, "loss": 0.22234094142913818, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.772165298461914, "rewards/margins": 10.594352722167969, "rewards/rejected": 4.179746627807617, "step": 504 }, { "epoch": 0.2613871635610766, "grad_norm": 1.4769943952560425, "learning_rate": 9.962424951117994e-06, "loss": 0.15178534388542175, "rewards/accuracies": 0.96875, "rewards/chosen": 16.330604553222656, "rewards/margins": 11.629928588867188, "rewards/rejected": 4.700504302978516, "step": 505 }, { "epoch": 0.2619047619047619, "grad_norm": 2.2524192333221436, "learning_rate": 9.962075046474886e-06, "loss": 0.2123090922832489, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.76593589782715, "rewards/margins": 11.599990844726562, "rewards/rejected": 6.1673583984375, "step": 506 }, { "epoch": 0.26242236024844723, "grad_norm": 2.2239491939544678, "learning_rate": 9.961723526392467e-06, "loss": 0.1616435945034027, "rewards/accuracies": 0.9375, "rewards/chosen": 19.055110931396484, "rewards/margins": 12.510101318359375, "rewards/rejected": 6.544642448425293, "step": 507 }, { "epoch": 0.2629399585921325, "grad_norm": 3.173062324523926, "learning_rate": 9.96137039098517e-06, "loss": 0.2923268973827362, "rewards/accuracies": 0.859375, "rewards/chosen": 16.91805648803711, "rewards/margins": 11.380142211914062, "rewards/rejected": 5.538825035095215, "step": 508 }, { "epoch": 0.2634575569358178, "grad_norm": 1.8058083057403564, "learning_rate": 9.961015640367968e-06, "loss": 0.22124876081943512, "rewards/accuracies": 0.875, "rewards/chosen": 17.1639404296875, "rewards/margins": 12.714691162109375, "rewards/rejected": 4.443309783935547, "step": 509 }, { "epoch": 0.2639751552795031, "grad_norm": 1.8715733289718628, "learning_rate": 9.96065927465635e-06, "loss": 0.20378603041172028, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.16684341430664, "rewards/margins": 12.171808242797852, "rewards/rejected": 3.9953932762145996, "step": 510 }, { "epoch": 0.2644927536231884, "grad_norm": 2.8789448738098145, "learning_rate": 9.96030129396633e-06, "loss": 0.21532641351222992, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.554914474487305, "rewards/margins": 12.751556396484375, "rewards/rejected": 2.8053550720214844, "step": 511 }, { "epoch": 0.2650103519668737, "grad_norm": 2.070885181427002, "learning_rate": 9.959941698414457e-06, "loss": 0.1976078748703003, "rewards/accuracies": 0.890625, "rewards/chosen": 15.237865447998047, "rewards/margins": 12.725547790527344, "rewards/rejected": 2.515730381011963, "step": 512 }, { "epoch": 0.265527950310559, "grad_norm": 1.2216182947158813, "learning_rate": 9.959580488117797e-06, "loss": 0.22816267609596252, "rewards/accuracies": 0.859375, "rewards/chosen": 13.700084686279297, "rewards/margins": 12.312553405761719, "rewards/rejected": 1.3880594968795776, "step": 513 }, { "epoch": 0.2660455486542443, "grad_norm": 2.1387016773223877, "learning_rate": 9.959217663193942e-06, "loss": 0.2456865906715393, "rewards/accuracies": 0.90625, "rewards/chosen": 15.264714241027832, "rewards/margins": 13.445499420166016, "rewards/rejected": 1.8185858726501465, "step": 514 }, { "epoch": 0.2665631469979296, "grad_norm": 1.9373724460601807, "learning_rate": 9.95885322376102e-06, "loss": 0.2093198299407959, "rewards/accuracies": 0.890625, "rewards/chosen": 15.206686973571777, "rewards/margins": 14.475414276123047, "rewards/rejected": 0.7312049865722656, "step": 515 }, { "epoch": 0.2670807453416149, "grad_norm": 0.9820029735565186, "learning_rate": 9.958487169937668e-06, "loss": 0.16088764369487762, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.286128997802734, "rewards/margins": 13.501384735107422, "rewards/rejected": 0.7843761444091797, "step": 516 }, { "epoch": 0.2675983436853002, "grad_norm": 1.032500147819519, "learning_rate": 9.958119501843063e-06, "loss": 0.187415212392807, "rewards/accuracies": 0.921875, "rewards/chosen": 11.489227294921875, "rewards/margins": 11.392284393310547, "rewards/rejected": 0.09243202209472656, "step": 517 }, { "epoch": 0.26811594202898553, "grad_norm": 1.3078575134277344, "learning_rate": 9.9577502195969e-06, "loss": 0.21166031062602997, "rewards/accuracies": 0.84375, "rewards/chosen": 15.540435791015625, "rewards/margins": 14.896141052246094, "rewards/rejected": 0.6474251747131348, "step": 518 }, { "epoch": 0.2686335403726708, "grad_norm": 1.0431935787200928, "learning_rate": 9.957379323319403e-06, "loss": 0.19106359779834747, "rewards/accuracies": 0.921875, "rewards/chosen": 15.131132125854492, "rewards/margins": 14.868522644042969, "rewards/rejected": 0.2663288116455078, "step": 519 }, { "epoch": 0.2691511387163561, "grad_norm": 0.8389433026313782, "learning_rate": 9.957006813131317e-06, "loss": 0.165076345205307, "rewards/accuracies": 0.9375, "rewards/chosen": 16.65725326538086, "rewards/margins": 16.064287185668945, "rewards/rejected": 0.5930202007293701, "step": 520 }, { "epoch": 0.2696687370600414, "grad_norm": 1.117532730102539, "learning_rate": 9.956632689153919e-06, "loss": 0.20640361309051514, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.557321548461914, "rewards/margins": 14.058700561523438, "rewards/rejected": 0.49602794647216797, "step": 521 }, { "epoch": 0.2701863354037267, "grad_norm": 1.3262124061584473, "learning_rate": 9.956256951509004e-06, "loss": 0.19239945709705353, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.334970474243164, "rewards/margins": 16.1802978515625, "rewards/rejected": 1.1518001556396484, "step": 522 }, { "epoch": 0.27070393374741203, "grad_norm": 1.1582266092300415, "learning_rate": 9.955879600318899e-06, "loss": 0.15193912386894226, "rewards/accuracies": 0.8984375, "rewards/chosen": 16.931720733642578, "rewards/margins": 16.362380981445312, "rewards/rejected": 0.5672178268432617, "step": 523 }, { "epoch": 0.2712215320910973, "grad_norm": 2.414443016052246, "learning_rate": 9.955500635706455e-06, "loss": 0.21463844180107117, "rewards/accuracies": 0.890625, "rewards/chosen": 14.927433013916016, "rewards/margins": 14.831871032714844, "rewards/rejected": 0.09708356857299805, "step": 524 }, { "epoch": 0.2717391304347826, "grad_norm": 1.1281472444534302, "learning_rate": 9.95512005779504e-06, "loss": 0.16546308994293213, "rewards/accuracies": 0.921875, "rewards/chosen": 16.882938385009766, "rewards/margins": 16.42493438720703, "rewards/rejected": 0.45577457547187805, "step": 525 }, { "epoch": 0.2722567287784679, "grad_norm": 2.3890576362609863, "learning_rate": 9.954737866708561e-06, "loss": 0.249289870262146, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.157594680786133, "rewards/margins": 15.324878692626953, "rewards/rejected": 0.8310937881469727, "step": 526 }, { "epoch": 0.2727743271221532, "grad_norm": 3.801687717437744, "learning_rate": 9.954354062571438e-06, "loss": 0.24487493932247162, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.265287399291992, "rewards/margins": 16.94414520263672, "rewards/rejected": 0.3242664337158203, "step": 527 }, { "epoch": 0.2732919254658385, "grad_norm": 2.0271449089050293, "learning_rate": 9.953968645508624e-06, "loss": 0.19009965658187866, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.219791412353516, "rewards/margins": 16.652462005615234, "rewards/rejected": 0.5636787414550781, "step": 528 }, { "epoch": 0.27380952380952384, "grad_norm": 3.601979970932007, "learning_rate": 9.953581615645595e-06, "loss": 0.2593082785606384, "rewards/accuracies": 0.84375, "rewards/chosen": 14.265119552612305, "rewards/margins": 14.13405990600586, "rewards/rejected": 0.1336507797241211, "step": 529 }, { "epoch": 0.2743271221532091, "grad_norm": 2.034799098968506, "learning_rate": 9.95319297310835e-06, "loss": 0.17954835295677185, "rewards/accuracies": 0.890625, "rewards/chosen": 15.795302391052246, "rewards/margins": 15.164779663085938, "rewards/rejected": 0.6338481903076172, "step": 530 }, { "epoch": 0.2748447204968944, "grad_norm": 1.919663906097412, "learning_rate": 9.952802718023413e-06, "loss": 0.21567673981189728, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.811651229858398, "rewards/margins": 14.109941482543945, "rewards/rejected": 0.7040506601333618, "step": 531 }, { "epoch": 0.2753623188405797, "grad_norm": 2.3167037963867188, "learning_rate": 9.952410850517838e-06, "loss": 0.17929431796073914, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.355104446411133, "rewards/margins": 13.349746704101562, "rewards/rejected": 0.0032777786254882812, "step": 532 }, { "epoch": 0.275879917184265, "grad_norm": 1.190753698348999, "learning_rate": 9.952017370719196e-06, "loss": 0.1443869173526764, "rewards/accuracies": 0.921875, "rewards/chosen": 13.99411392211914, "rewards/margins": 13.905731201171875, "rewards/rejected": 0.09156504273414612, "step": 533 }, { "epoch": 0.27639751552795033, "grad_norm": 1.8212453126907349, "learning_rate": 9.951622278755588e-06, "loss": 0.1905536651611328, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.548151016235352, "rewards/margins": 11.69964599609375, "rewards/rejected": -0.14928054809570312, "step": 534 }, { "epoch": 0.2769151138716356, "grad_norm": 1.6511930227279663, "learning_rate": 9.951225574755644e-06, "loss": 0.21858671307563782, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.995285034179688, "rewards/margins": 11.95145034790039, "rewards/rejected": 0.04732251167297363, "step": 535 }, { "epoch": 0.2774327122153209, "grad_norm": 1.746800422668457, "learning_rate": 9.950827258848505e-06, "loss": 0.2539050579071045, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.078146934509277, "rewards/margins": 11.755645751953125, "rewards/rejected": 0.321002721786499, "step": 536 }, { "epoch": 0.2779503105590062, "grad_norm": 1.6260056495666504, "learning_rate": 9.950427331163854e-06, "loss": 0.25397342443466187, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.453300476074219, "rewards/margins": 11.154617309570312, "rewards/rejected": 0.2998979091644287, "step": 537 }, { "epoch": 0.2784679089026915, "grad_norm": 2.764343738555908, "learning_rate": 9.950025791831888e-06, "loss": 0.26802366971969604, "rewards/accuracies": 0.84375, "rewards/chosen": 12.085838317871094, "rewards/margins": 11.773582458496094, "rewards/rejected": 0.3113856315612793, "step": 538 }, { "epoch": 0.27898550724637683, "grad_norm": 0.735768735408783, "learning_rate": 9.949622640983329e-06, "loss": 0.19401338696479797, "rewards/accuracies": 0.890625, "rewards/chosen": 12.297783851623535, "rewards/margins": 12.224868774414062, "rewards/rejected": 0.07619285583496094, "step": 539 }, { "epoch": 0.2795031055900621, "grad_norm": 1.5540246963500977, "learning_rate": 9.949217878749426e-06, "loss": 0.22250765562057495, "rewards/accuracies": 0.859375, "rewards/chosen": 12.807881355285645, "rewards/margins": 12.542068481445312, "rewards/rejected": 0.26633167266845703, "step": 540 }, { "epoch": 0.2800207039337474, "grad_norm": 1.9596487283706665, "learning_rate": 9.948811505261953e-06, "loss": 0.26121455430984497, "rewards/accuracies": 0.84375, "rewards/chosen": 13.877643585205078, "rewards/margins": 13.246994018554688, "rewards/rejected": 0.6315336227416992, "step": 541 }, { "epoch": 0.2805383022774327, "grad_norm": 2.631831645965576, "learning_rate": 9.948403520653208e-06, "loss": 0.26089367270469666, "rewards/accuracies": 0.8515625, "rewards/chosen": 12.091769218444824, "rewards/margins": 11.976654052734375, "rewards/rejected": 0.11569404602050781, "step": 542 }, { "epoch": 0.281055900621118, "grad_norm": 1.564047932624817, "learning_rate": 9.947993925056014e-06, "loss": 0.25189492106437683, "rewards/accuracies": 0.84375, "rewards/chosen": 13.447505950927734, "rewards/margins": 13.054229736328125, "rewards/rejected": 0.3937760591506958, "step": 543 }, { "epoch": 0.2815734989648033, "grad_norm": 0.851467490196228, "learning_rate": 9.947582718603716e-06, "loss": 0.16565699875354767, "rewards/accuracies": 0.921875, "rewards/chosen": 14.90335464477539, "rewards/margins": 14.242935180664062, "rewards/rejected": 0.6672601699829102, "step": 544 }, { "epoch": 0.28209109730848864, "grad_norm": 0.8774410486221313, "learning_rate": 9.947169901430186e-06, "loss": 0.1800113171339035, "rewards/accuracies": 0.90625, "rewards/chosen": 13.39242172241211, "rewards/margins": 13.188896179199219, "rewards/rejected": 0.202178955078125, "step": 545 }, { "epoch": 0.2826086956521739, "grad_norm": 1.170512318611145, "learning_rate": 9.946755473669823e-06, "loss": 0.20370444655418396, "rewards/accuracies": 0.90625, "rewards/chosen": 15.66415786743164, "rewards/margins": 14.9647216796875, "rewards/rejected": 0.699202299118042, "step": 546 }, { "epoch": 0.2831262939958592, "grad_norm": 0.8157145380973816, "learning_rate": 9.94633943545754e-06, "loss": 0.20435398817062378, "rewards/accuracies": 0.9296875, "rewards/chosen": 13.810525894165039, "rewards/margins": 13.475669860839844, "rewards/rejected": 0.33533191680908203, "step": 547 }, { "epoch": 0.2836438923395445, "grad_norm": 1.4334744215011597, "learning_rate": 9.945921786928787e-06, "loss": 0.2717939019203186, "rewards/accuracies": 0.84375, "rewards/chosen": 15.649981498718262, "rewards/margins": 15.058784484863281, "rewards/rejected": 0.5980587005615234, "step": 548 }, { "epoch": 0.2841614906832298, "grad_norm": 1.4086095094680786, "learning_rate": 9.945502528219528e-06, "loss": 0.3034875988960266, "rewards/accuracies": 0.84375, "rewards/chosen": 14.600052833557129, "rewards/margins": 13.642951965332031, "rewards/rejected": 0.9626979827880859, "step": 549 }, { "epoch": 0.28467908902691513, "grad_norm": 1.1536351442337036, "learning_rate": 9.94508165946626e-06, "loss": 0.20339462161064148, "rewards/accuracies": 0.890625, "rewards/chosen": 16.968420028686523, "rewards/margins": 15.844257354736328, "rewards/rejected": 1.1232757568359375, "step": 550 }, { "epoch": 0.2851966873706004, "grad_norm": 1.7403939962387085, "learning_rate": 9.944659180805997e-06, "loss": 0.2362908124923706, "rewards/accuracies": 0.8828125, "rewards/chosen": 13.82784652709961, "rewards/margins": 12.579132080078125, "rewards/rejected": 1.2489128112792969, "step": 551 }, { "epoch": 0.2857142857142857, "grad_norm": 1.3350701332092285, "learning_rate": 9.944235092376283e-06, "loss": 0.1807347536087036, "rewards/accuracies": 0.90625, "rewards/chosen": 17.802352905273438, "rewards/margins": 16.06829833984375, "rewards/rejected": 1.7383699417114258, "step": 552 }, { "epoch": 0.286231884057971, "grad_norm": 1.2687690258026123, "learning_rate": 9.94380939431518e-06, "loss": 0.17135019600391388, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.213581085205078, "rewards/margins": 17.628311157226562, "rewards/rejected": 3.5844645500183105, "step": 553 }, { "epoch": 0.2867494824016563, "grad_norm": 2.08290433883667, "learning_rate": 9.94338208676128e-06, "loss": 0.17921368777751923, "rewards/accuracies": 0.9375, "rewards/chosen": 20.762184143066406, "rewards/margins": 17.328521728515625, "rewards/rejected": 3.4312896728515625, "step": 554 }, { "epoch": 0.28726708074534163, "grad_norm": 1.7978266477584839, "learning_rate": 9.94295316985369e-06, "loss": 0.26691120862960815, "rewards/accuracies": 0.8515625, "rewards/chosen": 18.903440475463867, "rewards/margins": 15.309772491455078, "rewards/rejected": 3.592404365539551, "step": 555 }, { "epoch": 0.28778467908902694, "grad_norm": 2.5764780044555664, "learning_rate": 9.942522643732053e-06, "loss": 0.2751501798629761, "rewards/accuracies": 0.8671875, "rewards/chosen": 17.611780166625977, "rewards/margins": 14.273975372314453, "rewards/rejected": 3.3391194343566895, "step": 556 }, { "epoch": 0.2883022774327122, "grad_norm": 1.461378812789917, "learning_rate": 9.94209050853653e-06, "loss": 0.2702863812446594, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.615386962890625, "rewards/margins": 12.2313232421875, "rewards/rejected": 2.3850255012512207, "step": 557 }, { "epoch": 0.2888198757763975, "grad_norm": 1.2239618301391602, "learning_rate": 9.941656764407802e-06, "loss": 0.21418528258800507, "rewards/accuracies": 0.890625, "rewards/chosen": 12.671236038208008, "rewards/margins": 10.8031005859375, "rewards/rejected": 1.8708958625793457, "step": 558 }, { "epoch": 0.2893374741200828, "grad_norm": 1.509720802307129, "learning_rate": 9.94122141148708e-06, "loss": 0.16678854823112488, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.669410705566406, "rewards/margins": 11.93756103515625, "rewards/rejected": 2.7319793701171875, "step": 559 }, { "epoch": 0.2898550724637681, "grad_norm": 1.6088621616363525, "learning_rate": 9.940784449916095e-06, "loss": 0.18094469606876373, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.246931076049805, "rewards/margins": 10.657485961914062, "rewards/rejected": 1.5908641815185547, "step": 560 }, { "epoch": 0.29037267080745344, "grad_norm": 1.4806898832321167, "learning_rate": 9.940345879837104e-06, "loss": 0.22937357425689697, "rewards/accuracies": 0.890625, "rewards/chosen": 10.117192268371582, "rewards/margins": 8.716381072998047, "rewards/rejected": 1.4011871814727783, "step": 561 }, { "epoch": 0.2908902691511387, "grad_norm": 2.111309766769409, "learning_rate": 9.939905701392884e-06, "loss": 0.21852892637252808, "rewards/accuracies": 0.90625, "rewards/chosen": 10.111226081848145, "rewards/margins": 8.64815902709961, "rewards/rejected": 1.4662189483642578, "step": 562 }, { "epoch": 0.291407867494824, "grad_norm": 1.2246311902999878, "learning_rate": 9.939463914726744e-06, "loss": 0.21398866176605225, "rewards/accuracies": 0.921875, "rewards/chosen": 8.450116157531738, "rewards/margins": 7.5570220947265625, "rewards/rejected": 0.8956489562988281, "step": 563 }, { "epoch": 0.2919254658385093, "grad_norm": 1.2003090381622314, "learning_rate": 9.939020519982505e-06, "loss": 0.20074176788330078, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.804313659667969, "rewards/margins": 7.489238739013672, "rewards/rejected": 1.3135782480239868, "step": 564 }, { "epoch": 0.2924430641821946, "grad_norm": 1.3060970306396484, "learning_rate": 9.93857551730452e-06, "loss": 0.1788884848356247, "rewards/accuracies": 0.9296875, "rewards/chosen": 9.534000396728516, "rewards/margins": 8.482879638671875, "rewards/rejected": 1.0533618927001953, "step": 565 }, { "epoch": 0.29296066252587993, "grad_norm": 1.5956860780715942, "learning_rate": 9.938128906837663e-06, "loss": 0.20638170838356018, "rewards/accuracies": 0.90625, "rewards/chosen": 10.504261016845703, "rewards/margins": 8.762359619140625, "rewards/rejected": 1.7417449951171875, "step": 566 }, { "epoch": 0.29347826086956524, "grad_norm": 2.0796847343444824, "learning_rate": 9.937680688727331e-06, "loss": 0.20697762072086334, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.708523750305176, "rewards/margins": 7.9439239501953125, "rewards/rejected": 1.766357421875, "step": 567 }, { "epoch": 0.2939958592132505, "grad_norm": 1.2924466133117676, "learning_rate": 9.937230863119445e-06, "loss": 0.24671563506126404, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.957775115966797, "rewards/margins": 6.8355865478515625, "rewards/rejected": 1.1216850280761719, "step": 568 }, { "epoch": 0.2945134575569358, "grad_norm": 1.4043413400650024, "learning_rate": 9.936779430160446e-06, "loss": 0.2561960518360138, "rewards/accuracies": 0.859375, "rewards/chosen": 7.109494209289551, "rewards/margins": 6.114250183105469, "rewards/rejected": 0.9941041469573975, "step": 569 }, { "epoch": 0.2950310559006211, "grad_norm": 1.3275638818740845, "learning_rate": 9.936326389997306e-06, "loss": 0.28533387184143066, "rewards/accuracies": 0.8515625, "rewards/chosen": 6.367978572845459, "rewards/margins": 5.626472473144531, "rewards/rejected": 0.7408447265625, "step": 570 }, { "epoch": 0.29554865424430643, "grad_norm": 1.8101046085357666, "learning_rate": 9.935871742777511e-06, "loss": 0.18490169942378998, "rewards/accuracies": 0.9140625, "rewards/chosen": 7.2448272705078125, "rewards/margins": 6.4420318603515625, "rewards/rejected": 0.8039340972900391, "step": 571 }, { "epoch": 0.29606625258799174, "grad_norm": 0.8834984302520752, "learning_rate": 9.935415488649081e-06, "loss": 0.21933136880397797, "rewards/accuracies": 0.8828125, "rewards/chosen": 6.051387786865234, "rewards/margins": 5.454105377197266, "rewards/rejected": 0.5982933044433594, "step": 572 }, { "epoch": 0.296583850931677, "grad_norm": 0.9094173312187195, "learning_rate": 9.934957627760544e-06, "loss": 0.1969948410987854, "rewards/accuracies": 0.9296875, "rewards/chosen": 7.164540767669678, "rewards/margins": 6.6734161376953125, "rewards/rejected": 0.4911212921142578, "step": 573 }, { "epoch": 0.2971014492753623, "grad_norm": 1.0302265882492065, "learning_rate": 9.934498160260967e-06, "loss": 0.16748756170272827, "rewards/accuracies": 0.921875, "rewards/chosen": 7.514167785644531, "rewards/margins": 7.198646545410156, "rewards/rejected": 0.3145083785057068, "step": 574 }, { "epoch": 0.2976190476190476, "grad_norm": 1.079383373260498, "learning_rate": 9.93403708629993e-06, "loss": 0.17468981444835663, "rewards/accuracies": 0.9140625, "rewards/chosen": 8.47476577758789, "rewards/margins": 7.8444976806640625, "rewards/rejected": 0.6286683082580566, "step": 575 }, { "epoch": 0.2981366459627329, "grad_norm": 1.6494760513305664, "learning_rate": 9.93357440602754e-06, "loss": 0.23874086141586304, "rewards/accuracies": 0.859375, "rewards/chosen": 9.320554733276367, "rewards/margins": 8.70220947265625, "rewards/rejected": 0.6169910430908203, "step": 576 }, { "epoch": 0.29865424430641824, "grad_norm": 1.3788167238235474, "learning_rate": 9.933110119594424e-06, "loss": 0.23529238998889923, "rewards/accuracies": 0.875, "rewards/chosen": 9.232376098632812, "rewards/margins": 8.6982421875, "rewards/rejected": 0.5321066379547119, "step": 577 }, { "epoch": 0.2991718426501035, "grad_norm": 0.8713099956512451, "learning_rate": 9.932644227151735e-06, "loss": 0.1762845516204834, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.88144302368164, "rewards/margins": 10.969642639160156, "rewards/rejected": 0.9098281860351562, "step": 578 }, { "epoch": 0.2996894409937888, "grad_norm": 1.9590224027633667, "learning_rate": 9.93217672885115e-06, "loss": 0.20559823513031006, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.561071395874023, "rewards/margins": 12.773639678955078, "rewards/rejected": 0.7938766479492188, "step": 579 }, { "epoch": 0.3002070393374741, "grad_norm": 2.468783378601074, "learning_rate": 9.931707624844861e-06, "loss": 0.20525695383548737, "rewards/accuracies": 0.8671875, "rewards/chosen": 15.2840576171875, "rewards/margins": 14.591842651367188, "rewards/rejected": 0.6912784576416016, "step": 580 }, { "epoch": 0.3007246376811594, "grad_norm": 1.6712921857833862, "learning_rate": 9.931236915285594e-06, "loss": 0.14179003238677979, "rewards/accuracies": 0.9375, "rewards/chosen": 15.42292594909668, "rewards/margins": 14.497396469116211, "rewards/rejected": 0.9264993667602539, "step": 581 }, { "epoch": 0.30124223602484473, "grad_norm": 1.6632952690124512, "learning_rate": 9.93076460032659e-06, "loss": 0.233038529753685, "rewards/accuracies": 0.8828125, "rewards/chosen": 18.03629493713379, "rewards/margins": 16.48668670654297, "rewards/rejected": 1.5483664274215698, "step": 582 }, { "epoch": 0.30175983436853004, "grad_norm": 1.2824625968933105, "learning_rate": 9.930290680121613e-06, "loss": 0.1632089763879776, "rewards/accuracies": 0.921875, "rewards/chosen": 17.03852081298828, "rewards/margins": 15.766437530517578, "rewards/rejected": 1.2734482288360596, "step": 583 }, { "epoch": 0.3022774327122153, "grad_norm": 2.083472728729248, "learning_rate": 9.929815154824952e-06, "loss": 0.214199960231781, "rewards/accuracies": 0.90625, "rewards/chosen": 18.17736053466797, "rewards/margins": 16.63818359375, "rewards/rejected": 1.5424957275390625, "step": 584 }, { "epoch": 0.3027950310559006, "grad_norm": 1.323072910308838, "learning_rate": 9.929338024591417e-06, "loss": 0.1749277412891388, "rewards/accuracies": 0.90625, "rewards/chosen": 21.85660171508789, "rewards/margins": 19.699981689453125, "rewards/rejected": 2.151355504989624, "step": 585 }, { "epoch": 0.3033126293995859, "grad_norm": 2.0553977489471436, "learning_rate": 9.928859289576341e-06, "loss": 0.21788126230239868, "rewards/accuracies": 0.90625, "rewards/chosen": 19.867586135864258, "rewards/margins": 18.322647094726562, "rewards/rejected": 1.5463714599609375, "step": 586 }, { "epoch": 0.30383022774327123, "grad_norm": 1.7130460739135742, "learning_rate": 9.928378949935581e-06, "loss": 0.19736257195472717, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.75101089477539, "rewards/margins": 17.4593505859375, "rewards/rejected": 1.2898025512695312, "step": 587 }, { "epoch": 0.30434782608695654, "grad_norm": 1.2065800428390503, "learning_rate": 9.927897005825517e-06, "loss": 0.177854984998703, "rewards/accuracies": 0.921875, "rewards/chosen": 19.934879302978516, "rewards/margins": 18.729782104492188, "rewards/rejected": 1.20489501953125, "step": 588 }, { "epoch": 0.3048654244306418, "grad_norm": 2.215606689453125, "learning_rate": 9.927413457403045e-06, "loss": 0.21983034908771515, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.942649841308594, "rewards/margins": 18.88092041015625, "rewards/rejected": 2.0589990615844727, "step": 589 }, { "epoch": 0.3053830227743271, "grad_norm": 1.7614802122116089, "learning_rate": 9.926928304825589e-06, "loss": 0.16312141716480255, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.67112159729004, "rewards/margins": 19.241661071777344, "rewards/rejected": 1.4251594543457031, "step": 590 }, { "epoch": 0.3059006211180124, "grad_norm": 1.717372179031372, "learning_rate": 9.926441548251095e-06, "loss": 0.2175162434577942, "rewards/accuracies": 0.890625, "rewards/chosen": 17.62645721435547, "rewards/margins": 16.66936492919922, "rewards/rejected": 0.957310676574707, "step": 591 }, { "epoch": 0.3064182194616977, "grad_norm": 1.6291944980621338, "learning_rate": 9.92595318783803e-06, "loss": 0.2135971337556839, "rewards/accuracies": 0.875, "rewards/chosen": 18.94097137451172, "rewards/margins": 16.87061309814453, "rewards/rejected": 2.0676536560058594, "step": 592 }, { "epoch": 0.30693581780538304, "grad_norm": 1.815757155418396, "learning_rate": 9.925463223745383e-06, "loss": 0.19868582487106323, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.220386505126953, "rewards/margins": 17.615005493164062, "rewards/rejected": 1.6046733856201172, "step": 593 }, { "epoch": 0.30745341614906835, "grad_norm": 1.2714723348617554, "learning_rate": 9.924971656132666e-06, "loss": 0.2103961706161499, "rewards/accuracies": 0.90625, "rewards/chosen": 18.21908950805664, "rewards/margins": 17.445266723632812, "rewards/rejected": 0.7716693878173828, "step": 594 }, { "epoch": 0.3079710144927536, "grad_norm": 1.742552638053894, "learning_rate": 9.92447848515991e-06, "loss": 0.23669302463531494, "rewards/accuracies": 0.875, "rewards/chosen": 15.3349609375, "rewards/margins": 14.562744140625, "rewards/rejected": 0.7750778198242188, "step": 595 }, { "epoch": 0.3084886128364389, "grad_norm": 3.2729482650756836, "learning_rate": 9.923983710987674e-06, "loss": 0.2402246594429016, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.860855102539062, "rewards/margins": 14.434944152832031, "rewards/rejected": 0.4209728240966797, "step": 596 }, { "epoch": 0.3090062111801242, "grad_norm": 2.001089572906494, "learning_rate": 9.923487333777033e-06, "loss": 0.21420298516750336, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.709827423095703, "rewards/margins": 12.840461730957031, "rewards/rejected": -0.13401174545288086, "step": 597 }, { "epoch": 0.30952380952380953, "grad_norm": 1.7958060503005981, "learning_rate": 9.922989353689584e-06, "loss": 0.22336557507514954, "rewards/accuracies": 0.90625, "rewards/chosen": 13.24460220336914, "rewards/margins": 12.835514068603516, "rewards/rejected": 0.4063549041748047, "step": 598 }, { "epoch": 0.31004140786749484, "grad_norm": 0.9401001930236816, "learning_rate": 9.922489770887452e-06, "loss": 0.16433310508728027, "rewards/accuracies": 0.9375, "rewards/chosen": 10.765438079833984, "rewards/margins": 11.109184265136719, "rewards/rejected": -0.3422584533691406, "step": 599 }, { "epoch": 0.3105590062111801, "grad_norm": 1.440361738204956, "learning_rate": 9.921988585533278e-06, "loss": 0.21900436282157898, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.583906173706055, "rewards/margins": 8.949546813964844, "rewards/rejected": -0.36365556716918945, "step": 600 }, { "epoch": 0.3110766045548654, "grad_norm": 1.3350083827972412, "learning_rate": 9.921485797790228e-06, "loss": 0.2707397937774658, "rewards/accuracies": 0.8359375, "rewards/chosen": 7.370853424072266, "rewards/margins": 7.7816314697265625, "rewards/rejected": -0.41006946563720703, "step": 601 }, { "epoch": 0.3115942028985507, "grad_norm": 0.9887576699256897, "learning_rate": 9.920981407821985e-06, "loss": 0.23165109753608704, "rewards/accuracies": 0.8671875, "rewards/chosen": 9.893196105957031, "rewards/margins": 10.008949279785156, "rewards/rejected": -0.11330032348632812, "step": 602 }, { "epoch": 0.31211180124223603, "grad_norm": 0.9169860482215881, "learning_rate": 9.92047541579276e-06, "loss": 0.22629766166210175, "rewards/accuracies": 0.8828125, "rewards/chosen": 9.110601425170898, "rewards/margins": 9.251113891601562, "rewards/rejected": -0.1405162811279297, "step": 603 }, { "epoch": 0.31262939958592134, "grad_norm": 0.9107227325439453, "learning_rate": 9.919967821867283e-06, "loss": 0.1644425094127655, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.55673599243164, "rewards/margins": 9.79852294921875, "rewards/rejected": -0.24074220657348633, "step": 604 }, { "epoch": 0.31314699792960665, "grad_norm": 1.1093322038650513, "learning_rate": 9.9194586262108e-06, "loss": 0.17748253047466278, "rewards/accuracies": 0.921875, "rewards/chosen": 10.718835830688477, "rewards/margins": 10.674858093261719, "rewards/rejected": 0.0479433536529541, "step": 605 }, { "epoch": 0.3136645962732919, "grad_norm": 0.9128037095069885, "learning_rate": 9.918947828989087e-06, "loss": 0.13419568538665771, "rewards/accuracies": 0.9609375, "rewards/chosen": 12.159064292907715, "rewards/margins": 11.822608947753906, "rewards/rejected": 0.33499622344970703, "step": 606 }, { "epoch": 0.3141821946169772, "grad_norm": 3.0758743286132812, "learning_rate": 9.918435430368437e-06, "loss": 0.21136784553527832, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.346248626708984, "rewards/margins": 14.008255004882812, "rewards/rejected": 1.341172218322754, "step": 607 }, { "epoch": 0.3146997929606625, "grad_norm": 2.734839916229248, "learning_rate": 9.917921430515664e-06, "loss": 0.21933971345424652, "rewards/accuracies": 0.90625, "rewards/chosen": 15.424964904785156, "rewards/margins": 14.542999267578125, "rewards/rejected": 0.88262939453125, "step": 608 }, { "epoch": 0.31521739130434784, "grad_norm": 2.4219889640808105, "learning_rate": 9.917405829598109e-06, "loss": 0.29714661836624146, "rewards/accuracies": 0.8671875, "rewards/chosen": 15.259773254394531, "rewards/margins": 14.178627014160156, "rewards/rejected": 1.0841913223266602, "step": 609 }, { "epoch": 0.31573498964803315, "grad_norm": 4.213411331176758, "learning_rate": 9.916888627783623e-06, "loss": 0.21113766729831696, "rewards/accuracies": 0.859375, "rewards/chosen": 17.90924835205078, "rewards/margins": 16.299774169921875, "rewards/rejected": 1.6069488525390625, "step": 610 }, { "epoch": 0.3162525879917184, "grad_norm": 1.4516339302062988, "learning_rate": 9.916369825240588e-06, "loss": 0.17712561786174774, "rewards/accuracies": 0.90625, "rewards/chosen": 19.77344512939453, "rewards/margins": 18.179931640625, "rewards/rejected": 1.5947065353393555, "step": 611 }, { "epoch": 0.3167701863354037, "grad_norm": 2.5402121543884277, "learning_rate": 9.915849422137904e-06, "loss": 0.23838941752910614, "rewards/accuracies": 0.890625, "rewards/chosen": 19.147859573364258, "rewards/margins": 17.17394256591797, "rewards/rejected": 1.9700183868408203, "step": 612 }, { "epoch": 0.317287784679089, "grad_norm": 1.4870567321777344, "learning_rate": 9.915327418644991e-06, "loss": 0.22042928636074066, "rewards/accuracies": 0.890625, "rewards/chosen": 19.329343795776367, "rewards/margins": 16.926605224609375, "rewards/rejected": 2.399171829223633, "step": 613 }, { "epoch": 0.31780538302277433, "grad_norm": 1.223986029624939, "learning_rate": 9.914803814931793e-06, "loss": 0.1807408630847931, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.563711166381836, "rewards/margins": 17.45074462890625, "rewards/rejected": 3.117621898651123, "step": 614 }, { "epoch": 0.31832298136645965, "grad_norm": 2.0997982025146484, "learning_rate": 9.914278611168772e-06, "loss": 0.23436032235622406, "rewards/accuracies": 0.890625, "rewards/chosen": 20.97020721435547, "rewards/margins": 18.3140869140625, "rewards/rejected": 2.6572816371917725, "step": 615 }, { "epoch": 0.3188405797101449, "grad_norm": 2.1007699966430664, "learning_rate": 9.91375180752691e-06, "loss": 0.2803199291229248, "rewards/accuracies": 0.875, "rewards/chosen": 22.381359100341797, "rewards/margins": 19.379806518554688, "rewards/rejected": 3.0063657760620117, "step": 616 }, { "epoch": 0.3193581780538302, "grad_norm": 5.082787990570068, "learning_rate": 9.913223404177713e-06, "loss": 0.28709912300109863, "rewards/accuracies": 0.875, "rewards/chosen": 24.581199645996094, "rewards/margins": 20.275604248046875, "rewards/rejected": 4.304925918579102, "step": 617 }, { "epoch": 0.3198757763975155, "grad_norm": 1.1521228551864624, "learning_rate": 9.912693401293209e-06, "loss": 0.25307074189186096, "rewards/accuracies": 0.859375, "rewards/chosen": 17.650646209716797, "rewards/margins": 15.578248023986816, "rewards/rejected": 2.0727593898773193, "step": 618 }, { "epoch": 0.32039337474120083, "grad_norm": 1.472977638244629, "learning_rate": 9.91216179904594e-06, "loss": 0.21027511358261108, "rewards/accuracies": 0.890625, "rewards/chosen": 15.941875457763672, "rewards/margins": 14.44857406616211, "rewards/rejected": 1.496103286743164, "step": 619 }, { "epoch": 0.32091097308488614, "grad_norm": 1.1763569116592407, "learning_rate": 9.911628597608975e-06, "loss": 0.1937856674194336, "rewards/accuracies": 0.890625, "rewards/chosen": 14.310306549072266, "rewards/margins": 13.255898475646973, "rewards/rejected": 1.0495272874832153, "step": 620 }, { "epoch": 0.32142857142857145, "grad_norm": 1.0252782106399536, "learning_rate": 9.911093797155903e-06, "loss": 0.22209149599075317, "rewards/accuracies": 0.875, "rewards/chosen": 13.53994369506836, "rewards/margins": 12.131561279296875, "rewards/rejected": 1.4043426513671875, "step": 621 }, { "epoch": 0.3219461697722567, "grad_norm": 1.4448363780975342, "learning_rate": 9.91055739786083e-06, "loss": 0.2328757345676422, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.24992561340332, "rewards/margins": 9.607704162597656, "rewards/rejected": 0.6400671005249023, "step": 622 }, { "epoch": 0.322463768115942, "grad_norm": 1.324575662612915, "learning_rate": 9.910019399898385e-06, "loss": 0.2724957764148712, "rewards/accuracies": 0.8359375, "rewards/chosen": 7.658723831176758, "rewards/margins": 7.102195739746094, "rewards/rejected": 0.5575332641601562, "step": 623 }, { "epoch": 0.32298136645962733, "grad_norm": 0.8344225287437439, "learning_rate": 9.909479803443718e-06, "loss": 0.15176929533481598, "rewards/accuracies": 0.9609375, "rewards/chosen": 8.379586219787598, "rewards/margins": 7.709768295288086, "rewards/rejected": 0.6687307357788086, "step": 624 }, { "epoch": 0.32349896480331264, "grad_norm": 0.8200079798698425, "learning_rate": 9.908938608672499e-06, "loss": 0.2340242564678192, "rewards/accuracies": 0.8828125, "rewards/chosen": 5.763397216796875, "rewards/margins": 5.253173828125, "rewards/rejected": 0.5095529556274414, "step": 625 }, { "epoch": 0.32401656314699795, "grad_norm": 1.0462838411331177, "learning_rate": 9.908395815760916e-06, "loss": 0.21996109187602997, "rewards/accuracies": 0.890625, "rewards/chosen": 5.8109540939331055, "rewards/margins": 5.186788558959961, "rewards/rejected": 0.6231498718261719, "step": 626 }, { "epoch": 0.3245341614906832, "grad_norm": 0.9711755514144897, "learning_rate": 9.907851424885678e-06, "loss": 0.21167057752609253, "rewards/accuracies": 0.890625, "rewards/chosen": 5.33924674987793, "rewards/margins": 4.974067687988281, "rewards/rejected": 0.3644566535949707, "step": 627 }, { "epoch": 0.3250517598343685, "grad_norm": 0.8579201102256775, "learning_rate": 9.907305436224019e-06, "loss": 0.20277070999145508, "rewards/accuracies": 0.90625, "rewards/chosen": 6.499109745025635, "rewards/margins": 5.9703216552734375, "rewards/rejected": 0.5283927917480469, "step": 628 }, { "epoch": 0.3255693581780538, "grad_norm": 0.8208149671554565, "learning_rate": 9.906757849953688e-06, "loss": 0.20215214788913727, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.025569915771484, "rewards/margins": 6.866354942321777, "rewards/rejected": 1.1576061248779297, "step": 629 }, { "epoch": 0.32608695652173914, "grad_norm": 0.6668251752853394, "learning_rate": 9.906208666252957e-06, "loss": 0.16005167365074158, "rewards/accuracies": 0.921875, "rewards/chosen": 7.97953987121582, "rewards/margins": 7.084907531738281, "rewards/rejected": 0.8952293395996094, "step": 630 }, { "epoch": 0.32660455486542445, "grad_norm": 0.9819477796554565, "learning_rate": 9.905657885300613e-06, "loss": 0.21623611450195312, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.525577545166016, "rewards/margins": 7.628391265869141, "rewards/rejected": 0.8983802795410156, "step": 631 }, { "epoch": 0.32712215320910976, "grad_norm": 1.5493088960647583, "learning_rate": 9.90510550727597e-06, "loss": 0.1964128613471985, "rewards/accuracies": 0.921875, "rewards/chosen": 10.604228973388672, "rewards/margins": 9.090423583984375, "rewards/rejected": 1.5115318298339844, "step": 632 }, { "epoch": 0.327639751552795, "grad_norm": 2.2705562114715576, "learning_rate": 9.904551532358856e-06, "loss": 0.2335338294506073, "rewards/accuracies": 0.8515625, "rewards/chosen": 11.780649185180664, "rewards/margins": 9.930923461914062, "rewards/rejected": 1.8503761291503906, "step": 633 }, { "epoch": 0.3281573498964803, "grad_norm": 0.8905113339424133, "learning_rate": 9.903995960729626e-06, "loss": 0.14994925260543823, "rewards/accuracies": 0.90625, "rewards/chosen": 12.49986457824707, "rewards/margins": 10.929779052734375, "rewards/rejected": 1.5687141418457031, "step": 634 }, { "epoch": 0.32867494824016563, "grad_norm": 2.4685096740722656, "learning_rate": 9.903438792569145e-06, "loss": 0.2903164029121399, "rewards/accuracies": 0.828125, "rewards/chosen": 12.110740661621094, "rewards/margins": 10.47601318359375, "rewards/rejected": 1.6357746124267578, "step": 635 }, { "epoch": 0.32919254658385094, "grad_norm": 1.8666797876358032, "learning_rate": 9.902880028058808e-06, "loss": 0.2713615298271179, "rewards/accuracies": 0.859375, "rewards/chosen": 14.653749465942383, "rewards/margins": 12.352691650390625, "rewards/rejected": 2.2996673583984375, "step": 636 }, { "epoch": 0.32971014492753625, "grad_norm": 2.1280579566955566, "learning_rate": 9.90231966738052e-06, "loss": 0.2084934413433075, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.13671875, "rewards/margins": 11.425506591796875, "rewards/rejected": 1.7100677490234375, "step": 637 }, { "epoch": 0.3302277432712215, "grad_norm": 2.0200304985046387, "learning_rate": 9.901757710716714e-06, "loss": 0.17842420935630798, "rewards/accuracies": 0.90625, "rewards/chosen": 16.57128143310547, "rewards/margins": 14.15301513671875, "rewards/rejected": 2.422152519226074, "step": 638 }, { "epoch": 0.3307453416149068, "grad_norm": 1.5393787622451782, "learning_rate": 9.901194158250338e-06, "loss": 0.14661435782909393, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.226848602294922, "rewards/margins": 15.753265380859375, "rewards/rejected": 2.4719715118408203, "step": 639 }, { "epoch": 0.33126293995859213, "grad_norm": 1.9366540908813477, "learning_rate": 9.900629010164862e-06, "loss": 0.2642735540866852, "rewards/accuracies": 0.859375, "rewards/chosen": 14.535299301147461, "rewards/margins": 12.503639221191406, "rewards/rejected": 2.0281641483306885, "step": 640 }, { "epoch": 0.33178053830227744, "grad_norm": 1.4314534664154053, "learning_rate": 9.90006226664427e-06, "loss": 0.1600450873374939, "rewards/accuracies": 0.9296875, "rewards/chosen": 13.95367431640625, "rewards/margins": 12.396293640136719, "rewards/rejected": 1.5578193664550781, "step": 641 }, { "epoch": 0.33229813664596275, "grad_norm": 2.159153461456299, "learning_rate": 9.899493927873074e-06, "loss": 0.310558021068573, "rewards/accuracies": 0.8359375, "rewards/chosen": 16.84554672241211, "rewards/margins": 13.804012298583984, "rewards/rejected": 3.043773651123047, "step": 642 }, { "epoch": 0.33281573498964806, "grad_norm": 1.6121805906295776, "learning_rate": 9.898923994036299e-06, "loss": 0.24468617141246796, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.667926788330078, "rewards/margins": 14.580581665039062, "rewards/rejected": 2.083505630493164, "step": 643 }, { "epoch": 0.3333333333333333, "grad_norm": 2.3005197048187256, "learning_rate": 9.898352465319488e-06, "loss": 0.28313547372817993, "rewards/accuracies": 0.84375, "rewards/chosen": 17.670177459716797, "rewards/margins": 15.288135528564453, "rewards/rejected": 2.385263442993164, "step": 644 }, { "epoch": 0.3338509316770186, "grad_norm": 1.4119545221328735, "learning_rate": 9.897779341908711e-06, "loss": 0.24120190739631653, "rewards/accuracies": 0.890625, "rewards/chosen": 15.860149383544922, "rewards/margins": 13.465591430664062, "rewards/rejected": 2.3970422744750977, "step": 645 }, { "epoch": 0.33436853002070394, "grad_norm": 6.371023654937744, "learning_rate": 9.897204623990551e-06, "loss": 0.3932299017906189, "rewards/accuracies": 0.8203125, "rewards/chosen": 13.3781099319458, "rewards/margins": 11.824121475219727, "rewards/rejected": 1.5567326545715332, "step": 646 }, { "epoch": 0.33488612836438925, "grad_norm": 1.4919666051864624, "learning_rate": 9.89662831175211e-06, "loss": 0.19445623457431793, "rewards/accuracies": 0.9140625, "rewards/chosen": 15.016319274902344, "rewards/margins": 13.905285835266113, "rewards/rejected": 1.1112031936645508, "step": 647 }, { "epoch": 0.33540372670807456, "grad_norm": 1.6738531589508057, "learning_rate": 9.896050405381012e-06, "loss": 0.2016451358795166, "rewards/accuracies": 0.921875, "rewards/chosen": 14.388004302978516, "rewards/margins": 12.877479553222656, "rewards/rejected": 1.5098991394042969, "step": 648 }, { "epoch": 0.3359213250517598, "grad_norm": 0.8379554152488708, "learning_rate": 9.8954709050654e-06, "loss": 0.1815718114376068, "rewards/accuracies": 0.953125, "rewards/chosen": 12.79991340637207, "rewards/margins": 11.830284118652344, "rewards/rejected": 0.9701639413833618, "step": 649 }, { "epoch": 0.3364389233954451, "grad_norm": 1.367712378501892, "learning_rate": 9.894889810993932e-06, "loss": 0.2073855698108673, "rewards/accuracies": 0.890625, "rewards/chosen": 10.67745590209961, "rewards/margins": 9.983505249023438, "rewards/rejected": 0.6951198577880859, "step": 650 }, { "epoch": 0.33695652173913043, "grad_norm": 0.6603106260299683, "learning_rate": 9.894307123355787e-06, "loss": 0.16634726524353027, "rewards/accuracies": 0.9453125, "rewards/chosen": 11.819572448730469, "rewards/margins": 10.766738891601562, "rewards/rejected": 1.0548315048217773, "step": 651 }, { "epoch": 0.33747412008281574, "grad_norm": 1.4020755290985107, "learning_rate": 9.893722842340666e-06, "loss": 0.23590701818466187, "rewards/accuracies": 0.8671875, "rewards/chosen": 9.00428295135498, "rewards/margins": 8.360389709472656, "rewards/rejected": 0.6426255702972412, "step": 652 }, { "epoch": 0.33799171842650105, "grad_norm": 0.7604324221611023, "learning_rate": 9.893136968138784e-06, "loss": 0.13563381135463715, "rewards/accuracies": 0.9375, "rewards/chosen": 12.96198844909668, "rewards/margins": 11.459266662597656, "rewards/rejected": 1.5034637451171875, "step": 653 }, { "epoch": 0.3385093167701863, "grad_norm": 1.3179911375045776, "learning_rate": 9.892549500940875e-06, "loss": 0.20217075943946838, "rewards/accuracies": 0.875, "rewards/chosen": 11.375846862792969, "rewards/margins": 10.098007202148438, "rewards/rejected": 1.2792315483093262, "step": 654 }, { "epoch": 0.3390269151138716, "grad_norm": 1.2673726081848145, "learning_rate": 9.891960440938198e-06, "loss": 0.18860264122486115, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.656497955322266, "rewards/margins": 9.763580322265625, "rewards/rejected": 1.8946330547332764, "step": 655 }, { "epoch": 0.33954451345755693, "grad_norm": 1.63986074924469, "learning_rate": 9.89136978832252e-06, "loss": 0.22808513045310974, "rewards/accuracies": 0.8515625, "rewards/chosen": 11.689105987548828, "rewards/margins": 10.324398040771484, "rewards/rejected": 1.3648271560668945, "step": 656 }, { "epoch": 0.34006211180124224, "grad_norm": 1.684954285621643, "learning_rate": 9.890777543286139e-06, "loss": 0.2489134818315506, "rewards/accuracies": 0.875, "rewards/chosen": 10.373855590820312, "rewards/margins": 8.925018310546875, "rewards/rejected": 1.4465961456298828, "step": 657 }, { "epoch": 0.34057971014492755, "grad_norm": 1.121510624885559, "learning_rate": 9.890183706021856e-06, "loss": 0.20389732718467712, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.436188697814941, "rewards/margins": 9.074630737304688, "rewards/rejected": 1.3625268936157227, "step": 658 }, { "epoch": 0.34109730848861286, "grad_norm": 1.1396479606628418, "learning_rate": 9.889588276723007e-06, "loss": 0.15830647945404053, "rewards/accuracies": 0.9453125, "rewards/chosen": 11.190448760986328, "rewards/margins": 9.790931701660156, "rewards/rejected": 1.397648811340332, "step": 659 }, { "epoch": 0.3416149068322981, "grad_norm": 1.1562222242355347, "learning_rate": 9.888991255583434e-06, "loss": 0.1620607078075409, "rewards/accuracies": 0.9296875, "rewards/chosen": 9.499820709228516, "rewards/margins": 8.182849884033203, "rewards/rejected": 1.3196821212768555, "step": 660 }, { "epoch": 0.3421325051759834, "grad_norm": 1.0226198434829712, "learning_rate": 9.888392642797501e-06, "loss": 0.19993546605110168, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.38839054107666, "rewards/margins": 9.689979553222656, "rewards/rejected": 1.6991052627563477, "step": 661 }, { "epoch": 0.34265010351966874, "grad_norm": 2.7094125747680664, "learning_rate": 9.887792438560092e-06, "loss": 0.2679567337036133, "rewards/accuracies": 0.875, "rewards/chosen": 10.140437126159668, "rewards/margins": 8.535125732421875, "rewards/rejected": 1.6018600463867188, "step": 662 }, { "epoch": 0.34316770186335405, "grad_norm": 1.5353482961654663, "learning_rate": 9.88719064306661e-06, "loss": 0.17127859592437744, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.779842376708984, "rewards/margins": 9.577926635742188, "rewards/rejected": 1.2036914825439453, "step": 663 }, { "epoch": 0.34368530020703936, "grad_norm": 1.7688270807266235, "learning_rate": 9.886587256512971e-06, "loss": 0.18828684091567993, "rewards/accuracies": 0.921875, "rewards/chosen": 9.939226150512695, "rewards/margins": 9.067718505859375, "rewards/rejected": 0.8721761703491211, "step": 664 }, { "epoch": 0.3442028985507246, "grad_norm": 1.9372658729553223, "learning_rate": 9.885982279095614e-06, "loss": 0.22555062174797058, "rewards/accuracies": 0.828125, "rewards/chosen": 10.586036682128906, "rewards/margins": 9.062309265136719, "rewards/rejected": 1.523641586303711, "step": 665 }, { "epoch": 0.3447204968944099, "grad_norm": 2.0873546600341797, "learning_rate": 9.88537571101149e-06, "loss": 0.16781426966190338, "rewards/accuracies": 0.9296875, "rewards/chosen": 11.470203399658203, "rewards/margins": 10.001155853271484, "rewards/rejected": 1.4675464630126953, "step": 666 }, { "epoch": 0.34523809523809523, "grad_norm": 2.134169578552246, "learning_rate": 9.884767552458077e-06, "loss": 0.2536834180355072, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.816091537475586, "rewards/margins": 9.898368835449219, "rewards/rejected": 0.919097900390625, "step": 667 }, { "epoch": 0.34575569358178054, "grad_norm": 1.5438681840896606, "learning_rate": 9.884157803633361e-06, "loss": 0.17543494701385498, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.176239013671875, "rewards/margins": 10.270042419433594, "rewards/rejected": 1.9027414321899414, "step": 668 }, { "epoch": 0.34627329192546585, "grad_norm": 1.4963594675064087, "learning_rate": 9.883546464735852e-06, "loss": 0.22308418154716492, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.588534355163574, "rewards/margins": 10.803627014160156, "rewards/rejected": 1.78363037109375, "step": 669 }, { "epoch": 0.34679089026915116, "grad_norm": 2.9115285873413086, "learning_rate": 9.882933535964575e-06, "loss": 0.2782153785228729, "rewards/accuracies": 0.8203125, "rewards/chosen": 8.152481079101562, "rewards/margins": 7.331596374511719, "rewards/rejected": 0.8197851181030273, "step": 670 }, { "epoch": 0.3473084886128364, "grad_norm": 1.3372458219528198, "learning_rate": 9.882319017519075e-06, "loss": 0.21009975671768188, "rewards/accuracies": 0.90625, "rewards/chosen": 10.386603355407715, "rewards/margins": 9.010124206542969, "rewards/rejected": 1.377359390258789, "step": 671 }, { "epoch": 0.34782608695652173, "grad_norm": 1.1388192176818848, "learning_rate": 9.881702909599408e-06, "loss": 0.20492568612098694, "rewards/accuracies": 0.8828125, "rewards/chosen": 9.779683113098145, "rewards/margins": 8.500907897949219, "rewards/rejected": 1.2779045104980469, "step": 672 }, { "epoch": 0.34834368530020704, "grad_norm": 1.422365427017212, "learning_rate": 9.881085212406161e-06, "loss": 0.16526654362678528, "rewards/accuracies": 0.9296875, "rewards/chosen": 11.571056365966797, "rewards/margins": 10.150634765625, "rewards/rejected": 1.4177231788635254, "step": 673 }, { "epoch": 0.34886128364389235, "grad_norm": 1.2197246551513672, "learning_rate": 9.880465926140421e-06, "loss": 0.2296680212020874, "rewards/accuracies": 0.8515625, "rewards/chosen": 8.788795471191406, "rewards/margins": 8.009521484375, "rewards/rejected": 0.7816991806030273, "step": 674 }, { "epoch": 0.34937888198757766, "grad_norm": 2.3935647010803223, "learning_rate": 9.879845051003807e-06, "loss": 0.2618173360824585, "rewards/accuracies": 0.875, "rewards/chosen": 11.135452270507812, "rewards/margins": 9.714092254638672, "rewards/rejected": 1.423811435699463, "step": 675 }, { "epoch": 0.3498964803312629, "grad_norm": 0.8867794275283813, "learning_rate": 9.879222587198447e-06, "loss": 0.16712293028831482, "rewards/accuracies": 0.9140625, "rewards/chosen": 10.388751983642578, "rewards/margins": 9.429893493652344, "rewards/rejected": 0.9555778503417969, "step": 676 }, { "epoch": 0.3504140786749482, "grad_norm": 0.8978283405303955, "learning_rate": 9.878598534926988e-06, "loss": 0.154879629611969, "rewards/accuracies": 0.9140625, "rewards/chosen": 10.452975273132324, "rewards/margins": 9.642330169677734, "rewards/rejected": 0.805419921875, "step": 677 }, { "epoch": 0.35093167701863354, "grad_norm": 0.9781925678253174, "learning_rate": 9.877972894392598e-06, "loss": 0.21816027164459229, "rewards/accuracies": 0.90625, "rewards/chosen": 8.929679870605469, "rewards/margins": 8.085235595703125, "rewards/rejected": 0.8473825454711914, "step": 678 }, { "epoch": 0.35144927536231885, "grad_norm": 0.985307514667511, "learning_rate": 9.877345665798955e-06, "loss": 0.2299910932779312, "rewards/accuracies": 0.8671875, "rewards/chosen": 11.212242126464844, "rewards/margins": 10.479949951171875, "rewards/rejected": 0.7326860427856445, "step": 679 }, { "epoch": 0.35196687370600416, "grad_norm": 1.109583854675293, "learning_rate": 9.876716849350259e-06, "loss": 0.20900282263755798, "rewards/accuracies": 0.921875, "rewards/chosen": 11.644654273986816, "rewards/margins": 10.69573974609375, "rewards/rejected": 0.9490400552749634, "step": 680 }, { "epoch": 0.35248447204968947, "grad_norm": 1.0813488960266113, "learning_rate": 9.876086445251226e-06, "loss": 0.17529582977294922, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.049407958984375, "rewards/margins": 10.6807861328125, "rewards/rejected": 0.37227869033813477, "step": 681 }, { "epoch": 0.3530020703933747, "grad_norm": 1.7123258113861084, "learning_rate": 9.875454453707087e-06, "loss": 0.16354402899742126, "rewards/accuracies": 0.921875, "rewards/chosen": 14.409059524536133, "rewards/margins": 13.015365600585938, "rewards/rejected": 1.3888556957244873, "step": 682 }, { "epoch": 0.35351966873706003, "grad_norm": 0.9586467146873474, "learning_rate": 9.874820874923595e-06, "loss": 0.19821816682815552, "rewards/accuracies": 0.90625, "rewards/chosen": 13.668146133422852, "rewards/margins": 12.485685348510742, "rewards/rejected": 1.1841919422149658, "step": 683 }, { "epoch": 0.35403726708074534, "grad_norm": 1.2200785875320435, "learning_rate": 9.874185709107013e-06, "loss": 0.2203201949596405, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.55337142944336, "rewards/margins": 12.836151123046875, "rewards/rejected": 1.7183279991149902, "step": 684 }, { "epoch": 0.35455486542443065, "grad_norm": 1.863709568977356, "learning_rate": 9.873548956464126e-06, "loss": 0.23604939877986908, "rewards/accuracies": 0.875, "rewards/chosen": 15.734115600585938, "rewards/margins": 13.57867431640625, "rewards/rejected": 2.1532249450683594, "step": 685 }, { "epoch": 0.35507246376811596, "grad_norm": 1.6216795444488525, "learning_rate": 9.872910617202231e-06, "loss": 0.2104652225971222, "rewards/accuracies": 0.890625, "rewards/chosen": 14.233806610107422, "rewards/margins": 12.232986450195312, "rewards/rejected": 2.0018765926361084, "step": 686 }, { "epoch": 0.3555900621118012, "grad_norm": 1.24228835105896, "learning_rate": 9.872270691529147e-06, "loss": 0.1621025800704956, "rewards/accuracies": 0.9296875, "rewards/chosen": 13.871445655822754, "rewards/margins": 12.090293884277344, "rewards/rejected": 1.7811341285705566, "step": 687 }, { "epoch": 0.35610766045548653, "grad_norm": 1.2680209875106812, "learning_rate": 9.871629179653204e-06, "loss": 0.2047460824251175, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.082252502441406, "rewards/margins": 13.329414367675781, "rewards/rejected": 2.751729965209961, "step": 688 }, { "epoch": 0.35662525879917184, "grad_norm": 3.3603193759918213, "learning_rate": 9.870986081783251e-06, "loss": 0.25814276933670044, "rewards/accuracies": 0.890625, "rewards/chosen": 16.436262130737305, "rewards/margins": 13.499404907226562, "rewards/rejected": 2.9418811798095703, "step": 689 }, { "epoch": 0.35714285714285715, "grad_norm": 2.3630001544952393, "learning_rate": 9.870341398128653e-06, "loss": 0.27606767416000366, "rewards/accuracies": 0.8515625, "rewards/chosen": 15.269547462463379, "rewards/margins": 13.060562133789062, "rewards/rejected": 2.2067489624023438, "step": 690 }, { "epoch": 0.35766045548654246, "grad_norm": 2.5295281410217285, "learning_rate": 9.869695128899293e-06, "loss": 0.22747920453548431, "rewards/accuracies": 0.921875, "rewards/chosen": 14.463844299316406, "rewards/margins": 12.504928588867188, "rewards/rejected": 1.9599065780639648, "step": 691 }, { "epoch": 0.3581780538302277, "grad_norm": 1.3294848203659058, "learning_rate": 9.869047274305569e-06, "loss": 0.19287431240081787, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.53384017944336, "rewards/margins": 11.450729370117188, "rewards/rejected": 2.0863685607910156, "step": 692 }, { "epoch": 0.358695652173913, "grad_norm": 0.9391077756881714, "learning_rate": 9.868397834558392e-06, "loss": 0.18334294855594635, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.058794021606445, "rewards/margins": 10.484916687011719, "rewards/rejected": 1.5740165710449219, "step": 693 }, { "epoch": 0.35921325051759834, "grad_norm": 1.3390146493911743, "learning_rate": 9.867746809869193e-06, "loss": 0.19378723204135895, "rewards/accuracies": 0.9453125, "rewards/chosen": 12.474096298217773, "rewards/margins": 10.770286560058594, "rewards/rejected": 1.7007989883422852, "step": 694 }, { "epoch": 0.35973084886128365, "grad_norm": 1.4321084022521973, "learning_rate": 9.867094200449918e-06, "loss": 0.14748318493366241, "rewards/accuracies": 0.9296875, "rewards/chosen": 12.827241897583008, "rewards/margins": 11.138870239257812, "rewards/rejected": 1.6897525787353516, "step": 695 }, { "epoch": 0.36024844720496896, "grad_norm": 1.5679813623428345, "learning_rate": 9.86644000651303e-06, "loss": 0.2396484911441803, "rewards/accuracies": 0.8515625, "rewards/chosen": 12.085580825805664, "rewards/margins": 9.893074035644531, "rewards/rejected": 2.1909732818603516, "step": 696 }, { "epoch": 0.36076604554865427, "grad_norm": 2.282423973083496, "learning_rate": 9.865784228271502e-06, "loss": 0.22802041471004486, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.15512466430664, "rewards/margins": 10.8961181640625, "rewards/rejected": 2.2594528198242188, "step": 697 }, { "epoch": 0.3612836438923395, "grad_norm": 1.3051247596740723, "learning_rate": 9.865126865938834e-06, "loss": 0.16270782053470612, "rewards/accuracies": 0.921875, "rewards/chosen": 12.872295379638672, "rewards/margins": 10.325859069824219, "rewards/rejected": 2.5443878173828125, "step": 698 }, { "epoch": 0.36180124223602483, "grad_norm": 1.2219178676605225, "learning_rate": 9.86446791972903e-06, "loss": 0.2023090124130249, "rewards/accuracies": 0.90625, "rewards/chosen": 12.1094970703125, "rewards/margins": 9.643814086914062, "rewards/rejected": 2.4686813354492188, "step": 699 }, { "epoch": 0.36231884057971014, "grad_norm": 0.9794842600822449, "learning_rate": 9.863807389856617e-06, "loss": 0.13064387440681458, "rewards/accuracies": 0.953125, "rewards/chosen": 12.367950439453125, "rewards/margins": 10.267448425292969, "rewards/rejected": 2.0985264778137207, "step": 700 }, { "epoch": 0.36283643892339545, "grad_norm": 2.1817426681518555, "learning_rate": 9.863145276536633e-06, "loss": 0.22173917293548584, "rewards/accuracies": 0.890625, "rewards/chosen": 11.650558471679688, "rewards/margins": 9.525367736816406, "rewards/rejected": 2.126711845397949, "step": 701 }, { "epoch": 0.36335403726708076, "grad_norm": 1.7932425737380981, "learning_rate": 9.862481579984639e-06, "loss": 0.23643463850021362, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.347505569458008, "rewards/margins": 8.614715576171875, "rewards/rejected": 1.7311019897460938, "step": 702 }, { "epoch": 0.363871635610766, "grad_norm": 1.6758486032485962, "learning_rate": 9.8618163004167e-06, "loss": 0.16047516465187073, "rewards/accuracies": 0.9375, "rewards/chosen": 10.21586799621582, "rewards/margins": 8.179580688476562, "rewards/rejected": 2.033824920654297, "step": 703 }, { "epoch": 0.36438923395445133, "grad_norm": 1.1363674402236938, "learning_rate": 9.861149438049404e-06, "loss": 0.16323590278625488, "rewards/accuracies": 0.90625, "rewards/chosen": 11.605562210083008, "rewards/margins": 9.02264404296875, "rewards/rejected": 2.585338592529297, "step": 704 }, { "epoch": 0.36490683229813664, "grad_norm": 1.1203621625900269, "learning_rate": 9.860480993099857e-06, "loss": 0.2117902934551239, "rewards/accuracies": 0.875, "rewards/chosen": 9.829582214355469, "rewards/margins": 8.001480102539062, "rewards/rejected": 1.829437255859375, "step": 705 }, { "epoch": 0.36542443064182195, "grad_norm": 0.9027115106582642, "learning_rate": 9.85981096578567e-06, "loss": 0.1472318321466446, "rewards/accuracies": 0.890625, "rewards/chosen": 11.774772644042969, "rewards/margins": 9.385665893554688, "rewards/rejected": 2.3861846923828125, "step": 706 }, { "epoch": 0.36594202898550726, "grad_norm": 1.121370792388916, "learning_rate": 9.85913935632498e-06, "loss": 0.17204608023166656, "rewards/accuracies": 0.9296875, "rewards/chosen": 10.739067077636719, "rewards/margins": 8.644989013671875, "rewards/rejected": 2.0957870483398438, "step": 707 }, { "epoch": 0.36645962732919257, "grad_norm": 1.353528380393982, "learning_rate": 9.858466164936432e-06, "loss": 0.2264186143875122, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.760334014892578, "rewards/margins": 8.370361328125, "rewards/rejected": 2.3922462463378906, "step": 708 }, { "epoch": 0.3669772256728778, "grad_norm": 1.3089797496795654, "learning_rate": 9.857791391839189e-06, "loss": 0.1842292845249176, "rewards/accuracies": 0.921875, "rewards/chosen": 12.216727256774902, "rewards/margins": 9.999000549316406, "rewards/rejected": 2.217592239379883, "step": 709 }, { "epoch": 0.36749482401656314, "grad_norm": 1.5421844720840454, "learning_rate": 9.85711503725293e-06, "loss": 0.22604401409626007, "rewards/accuracies": 0.890625, "rewards/chosen": 10.040304183959961, "rewards/margins": 8.541473388671875, "rewards/rejected": 1.4955298900604248, "step": 710 }, { "epoch": 0.36801242236024845, "grad_norm": 2.3834340572357178, "learning_rate": 9.856437101397842e-06, "loss": 0.2615843713283539, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.507847785949707, "rewards/margins": 9.966682434082031, "rewards/rejected": 1.5399017333984375, "step": 711 }, { "epoch": 0.36853002070393376, "grad_norm": 2.371129035949707, "learning_rate": 9.855757584494637e-06, "loss": 0.22886544466018677, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.762950897216797, "rewards/margins": 12.200729370117188, "rewards/rejected": 2.5616302490234375, "step": 712 }, { "epoch": 0.36904761904761907, "grad_norm": 1.7247058153152466, "learning_rate": 9.855076486764535e-06, "loss": 0.1967819333076477, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.846283912658691, "rewards/margins": 12.351699829101562, "rewards/rejected": 1.4948360919952393, "step": 713 }, { "epoch": 0.3695652173913043, "grad_norm": 1.0988969802856445, "learning_rate": 9.85439380842927e-06, "loss": 0.2104225754737854, "rewards/accuracies": 0.9140625, "rewards/chosen": 10.923382759094238, "rewards/margins": 9.911128997802734, "rewards/rejected": 1.010697364807129, "step": 714 }, { "epoch": 0.37008281573498963, "grad_norm": 1.308025598526001, "learning_rate": 9.853709549711096e-06, "loss": 0.19058549404144287, "rewards/accuracies": 0.90625, "rewards/chosen": 12.529895782470703, "rewards/margins": 11.019638061523438, "rewards/rejected": 1.5100188255310059, "step": 715 }, { "epoch": 0.37060041407867494, "grad_norm": 1.013482928276062, "learning_rate": 9.853023710832777e-06, "loss": 0.1748560667037964, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.53900146484375, "rewards/margins": 10.32568359375, "rewards/rejected": 1.2133636474609375, "step": 716 }, { "epoch": 0.37111801242236025, "grad_norm": 1.246631383895874, "learning_rate": 9.852336292017594e-06, "loss": 0.21893465518951416, "rewards/accuracies": 0.90625, "rewards/chosen": 12.4205904006958, "rewards/margins": 10.694534301757812, "rewards/rejected": 1.7232780456542969, "step": 717 }, { "epoch": 0.37163561076604557, "grad_norm": 1.2134424448013306, "learning_rate": 9.85164729348934e-06, "loss": 0.19538694620132446, "rewards/accuracies": 0.875, "rewards/chosen": 15.10056209564209, "rewards/margins": 13.683124542236328, "rewards/rejected": 1.417464256286621, "step": 718 }, { "epoch": 0.3721532091097309, "grad_norm": 1.189948320388794, "learning_rate": 9.850956715472323e-06, "loss": 0.19093729555606842, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.28436279296875, "rewards/margins": 12.035655975341797, "rewards/rejected": 1.2447232007980347, "step": 719 }, { "epoch": 0.37267080745341613, "grad_norm": 1.144694447517395, "learning_rate": 9.850264558191368e-06, "loss": 0.21149639785289764, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.299911499023438, "rewards/margins": 13.263633728027344, "rewards/rejected": 1.0370941162109375, "step": 720 }, { "epoch": 0.37318840579710144, "grad_norm": 0.970544695854187, "learning_rate": 9.849570821871807e-06, "loss": 0.19663366675376892, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.755945205688477, "rewards/margins": 17.673248291015625, "rewards/rejected": 2.082141399383545, "step": 721 }, { "epoch": 0.37370600414078675, "grad_norm": 2.061924934387207, "learning_rate": 9.848875506739495e-06, "loss": 0.2536396086215973, "rewards/accuracies": 0.8828125, "rewards/chosen": 19.693710327148438, "rewards/margins": 17.4732666015625, "rewards/rejected": 2.224567413330078, "step": 722 }, { "epoch": 0.37422360248447206, "grad_norm": 1.3001724481582642, "learning_rate": 9.848178613020798e-06, "loss": 0.1957395374774933, "rewards/accuracies": 0.890625, "rewards/chosen": 21.436784744262695, "rewards/margins": 18.677169799804688, "rewards/rejected": 2.763286590576172, "step": 723 }, { "epoch": 0.3747412008281574, "grad_norm": 4.245030879974365, "learning_rate": 9.84748014094259e-06, "loss": 0.22883765399456024, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.562042236328125, "rewards/margins": 19.1759033203125, "rewards/rejected": 3.3896865844726562, "step": 724 }, { "epoch": 0.3752587991718426, "grad_norm": 0.8638401031494141, "learning_rate": 9.846780090732267e-06, "loss": 0.15514934062957764, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.32501983642578, "rewards/margins": 20.750396728515625, "rewards/rejected": 3.573060989379883, "step": 725 }, { "epoch": 0.37577639751552794, "grad_norm": 0.9417202472686768, "learning_rate": 9.846078462617736e-06, "loss": 0.18029022216796875, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.458534240722656, "rewards/margins": 21.109634399414062, "rewards/rejected": 5.349494934082031, "step": 726 }, { "epoch": 0.37629399585921325, "grad_norm": 1.2996535301208496, "learning_rate": 9.845375256827414e-06, "loss": 0.1493099331855774, "rewards/accuracies": 0.953125, "rewards/chosen": 28.81271743774414, "rewards/margins": 20.916595458984375, "rewards/rejected": 7.896046161651611, "step": 727 }, { "epoch": 0.37681159420289856, "grad_norm": 1.231242060661316, "learning_rate": 9.844670473590236e-06, "loss": 0.2013387829065323, "rewards/accuracies": 0.890625, "rewards/chosen": 27.234569549560547, "rewards/margins": 19.1563720703125, "rewards/rejected": 8.070560455322266, "step": 728 }, { "epoch": 0.37732919254658387, "grad_norm": 1.1173583269119263, "learning_rate": 9.84396411313565e-06, "loss": 0.19843898713588715, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.9787540435791, "rewards/margins": 22.25183868408203, "rewards/rejected": 8.73012924194336, "step": 729 }, { "epoch": 0.3778467908902691, "grad_norm": 2.7980971336364746, "learning_rate": 9.843256175693618e-06, "loss": 0.17357225716114044, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.95194625854492, "rewards/margins": 22.376846313476562, "rewards/rejected": 11.581255912780762, "step": 730 }, { "epoch": 0.37836438923395443, "grad_norm": 1.2545956373214722, "learning_rate": 9.842546661494612e-06, "loss": 0.18574625253677368, "rewards/accuracies": 0.8984375, "rewards/chosen": 33.75956344604492, "rewards/margins": 19.721923828125, "rewards/rejected": 14.035364151000977, "step": 731 }, { "epoch": 0.37888198757763975, "grad_norm": 3.8442747592926025, "learning_rate": 9.84183557076962e-06, "loss": 0.2028842568397522, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.95536231994629, "rewards/margins": 17.983726501464844, "rewards/rejected": 10.975410461425781, "step": 732 }, { "epoch": 0.37939958592132506, "grad_norm": 1.2975488901138306, "learning_rate": 9.841122903750143e-06, "loss": 0.17833256721496582, "rewards/accuracies": 0.921875, "rewards/chosen": 25.422279357910156, "rewards/margins": 15.587646484375, "rewards/rejected": 9.83026123046875, "step": 733 }, { "epoch": 0.37991718426501037, "grad_norm": 0.7177997827529907, "learning_rate": 9.840408660668192e-06, "loss": 0.1194600760936737, "rewards/accuracies": 0.921875, "rewards/chosen": 19.988651275634766, "rewards/margins": 13.621047973632812, "rewards/rejected": 6.3693132400512695, "step": 734 }, { "epoch": 0.3804347826086957, "grad_norm": 1.410979986190796, "learning_rate": 9.8396928417563e-06, "loss": 0.18717487156391144, "rewards/accuracies": 0.90625, "rewards/chosen": 18.91668128967285, "rewards/margins": 12.287940979003906, "rewards/rejected": 6.628827095031738, "step": 735 }, { "epoch": 0.38095238095238093, "grad_norm": 1.5316991806030273, "learning_rate": 9.838975447247501e-06, "loss": 0.22402599453926086, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.71817398071289, "rewards/margins": 9.7073974609375, "rewards/rejected": 7.012667179107666, "step": 736 }, { "epoch": 0.38146997929606624, "grad_norm": 1.457931399345398, "learning_rate": 9.838256477375352e-06, "loss": 0.20337113738059998, "rewards/accuracies": 0.875, "rewards/chosen": 11.848058700561523, "rewards/margins": 8.01446533203125, "rewards/rejected": 3.830585479736328, "step": 737 }, { "epoch": 0.38198757763975155, "grad_norm": 2.0316028594970703, "learning_rate": 9.837535932373916e-06, "loss": 0.2609621584415436, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.14752197265625, "rewards/margins": 7.346168518066406, "rewards/rejected": 2.800968647003174, "step": 738 }, { "epoch": 0.38250517598343686, "grad_norm": 1.9904621839523315, "learning_rate": 9.836813812477775e-06, "loss": 0.24515026807785034, "rewards/accuracies": 0.8515625, "rewards/chosen": 10.551675796508789, "rewards/margins": 6.936149597167969, "rewards/rejected": 3.614940643310547, "step": 739 }, { "epoch": 0.3830227743271222, "grad_norm": 1.6718487739562988, "learning_rate": 9.836090117922018e-06, "loss": 0.21575842797756195, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.794353485107422, "rewards/margins": 7.0853271484375, "rewards/rejected": 2.706226348876953, "step": 740 }, { "epoch": 0.38354037267080743, "grad_norm": 1.2334612607955933, "learning_rate": 9.835364848942248e-06, "loss": 0.20035839080810547, "rewards/accuracies": 0.875, "rewards/chosen": 8.039630889892578, "rewards/margins": 6.23614501953125, "rewards/rejected": 1.803044319152832, "step": 741 }, { "epoch": 0.38405797101449274, "grad_norm": 1.0956913232803345, "learning_rate": 9.834638005774584e-06, "loss": 0.2002929002046585, "rewards/accuracies": 0.8828125, "rewards/chosen": 9.543233871459961, "rewards/margins": 6.8964691162109375, "rewards/rejected": 2.6465635299682617, "step": 742 }, { "epoch": 0.38457556935817805, "grad_norm": 1.070222020149231, "learning_rate": 9.833909588655654e-06, "loss": 0.16410647332668304, "rewards/accuracies": 0.90625, "rewards/chosen": 8.58647346496582, "rewards/margins": 7.193004608154297, "rewards/rejected": 1.3968358039855957, "step": 743 }, { "epoch": 0.38509316770186336, "grad_norm": 2.546692371368408, "learning_rate": 9.8331795978226e-06, "loss": 0.2442653477191925, "rewards/accuracies": 0.875, "rewards/chosen": 7.686740875244141, "rewards/margins": 6.0121002197265625, "rewards/rejected": 1.6739654541015625, "step": 744 }, { "epoch": 0.38561076604554867, "grad_norm": 1.602497935295105, "learning_rate": 9.832448033513074e-06, "loss": 0.2742897868156433, "rewards/accuracies": 0.875, "rewards/chosen": 8.3975830078125, "rewards/margins": 6.7347412109375, "rewards/rejected": 1.6634302139282227, "step": 745 }, { "epoch": 0.386128364389234, "grad_norm": 0.9265283346176147, "learning_rate": 9.831714895965242e-06, "loss": 0.15222987532615662, "rewards/accuracies": 0.921875, "rewards/chosen": 10.222671508789062, "rewards/margins": 8.292327880859375, "rewards/rejected": 1.9316349029541016, "step": 746 }, { "epoch": 0.38664596273291924, "grad_norm": 1.079360008239746, "learning_rate": 9.830980185417784e-06, "loss": 0.226613849401474, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.379386901855469, "rewards/margins": 7.930213928222656, "rewards/rejected": 1.4479765892028809, "step": 747 }, { "epoch": 0.38716356107660455, "grad_norm": 1.1119275093078613, "learning_rate": 9.830243902109891e-06, "loss": 0.23032860457897186, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.297677993774414, "rewards/margins": 8.152019500732422, "rewards/rejected": 1.1472721099853516, "step": 748 }, { "epoch": 0.38768115942028986, "grad_norm": 1.6068172454833984, "learning_rate": 9.829506046281263e-06, "loss": 0.25824350118637085, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.014997482299805, "rewards/margins": 8.491485595703125, "rewards/rejected": 1.5214762687683105, "step": 749 }, { "epoch": 0.38819875776397517, "grad_norm": 0.7818301916122437, "learning_rate": 9.828766618172116e-06, "loss": 0.1454024314880371, "rewards/accuracies": 0.9296875, "rewards/chosen": 11.330547332763672, "rewards/margins": 10.110069274902344, "rewards/rejected": 1.2196426391601562, "step": 750 }, { "epoch": 0.3887163561076605, "grad_norm": 1.0108332633972168, "learning_rate": 9.828025618023177e-06, "loss": 0.15869858860969543, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.124991416931152, "rewards/margins": 10.54047679901123, "rewards/rejected": 1.5874390602111816, "step": 751 }, { "epoch": 0.38923395445134573, "grad_norm": 1.0238827466964722, "learning_rate": 9.82728304607568e-06, "loss": 0.1730051040649414, "rewards/accuracies": 0.921875, "rewards/chosen": 12.937259674072266, "rewards/margins": 11.583984375, "rewards/rejected": 1.3531599044799805, "step": 752 }, { "epoch": 0.38975155279503104, "grad_norm": 1.2673447132110596, "learning_rate": 9.826538902571377e-06, "loss": 0.18208575248718262, "rewards/accuracies": 0.890625, "rewards/chosen": 14.684038162231445, "rewards/margins": 12.779914855957031, "rewards/rejected": 1.9047527313232422, "step": 753 }, { "epoch": 0.39026915113871635, "grad_norm": 0.9809436798095703, "learning_rate": 9.825793187752531e-06, "loss": 0.19570180773735046, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.825353622436523, "rewards/margins": 11.862892150878906, "rewards/rejected": 1.9628658294677734, "step": 754 }, { "epoch": 0.39078674948240166, "grad_norm": 1.5118844509124756, "learning_rate": 9.825045901861911e-06, "loss": 0.17989546060562134, "rewards/accuracies": 0.921875, "rewards/chosen": 16.794849395751953, "rewards/margins": 14.430755615234375, "rewards/rejected": 2.368557929992676, "step": 755 }, { "epoch": 0.391304347826087, "grad_norm": 0.8239021897315979, "learning_rate": 9.824297045142806e-06, "loss": 0.12050564587116241, "rewards/accuracies": 0.9453125, "rewards/chosen": 18.260486602783203, "rewards/margins": 15.728729248046875, "rewards/rejected": 2.534764289855957, "step": 756 }, { "epoch": 0.3918219461697723, "grad_norm": 2.078847646713257, "learning_rate": 9.823546617839007e-06, "loss": 0.2842577397823334, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.9869384765625, "rewards/margins": 15.6689453125, "rewards/rejected": 3.3169612884521484, "step": 757 }, { "epoch": 0.39233954451345754, "grad_norm": 1.6347492933273315, "learning_rate": 9.822794620194824e-06, "loss": 0.23144373297691345, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.74720001220703, "rewards/margins": 16.15949249267578, "rewards/rejected": 2.5896568298339844, "step": 758 }, { "epoch": 0.39285714285714285, "grad_norm": 1.258353352546692, "learning_rate": 9.822041052455074e-06, "loss": 0.24161086976528168, "rewards/accuracies": 0.875, "rewards/chosen": 18.441253662109375, "rewards/margins": 15.15610122680664, "rewards/rejected": 3.289344310760498, "step": 759 }, { "epoch": 0.39337474120082816, "grad_norm": 0.8840195536613464, "learning_rate": 9.821285914865088e-06, "loss": 0.15893453359603882, "rewards/accuracies": 0.9375, "rewards/chosen": 20.700027465820312, "rewards/margins": 17.158599853515625, "rewards/rejected": 3.545527458190918, "step": 760 }, { "epoch": 0.39389233954451347, "grad_norm": 1.2519925832748413, "learning_rate": 9.820529207670706e-06, "loss": 0.2166917622089386, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.416166305541992, "rewards/margins": 17.10157012939453, "rewards/rejected": 3.3144454956054688, "step": 761 }, { "epoch": 0.3944099378881988, "grad_norm": 1.0762138366699219, "learning_rate": 9.819770931118277e-06, "loss": 0.19426599144935608, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.817806243896484, "rewards/margins": 18.340072631835938, "rewards/rejected": 4.487735748291016, "step": 762 }, { "epoch": 0.39492753623188404, "grad_norm": 2.2689883708953857, "learning_rate": 9.819011085454666e-06, "loss": 0.3453158736228943, "rewards/accuracies": 0.7890625, "rewards/chosen": 18.616058349609375, "rewards/margins": 15.493370056152344, "rewards/rejected": 3.131479263305664, "step": 763 }, { "epoch": 0.39544513457556935, "grad_norm": 1.3786132335662842, "learning_rate": 9.818249670927246e-06, "loss": 0.2071910947561264, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.157947540283203, "rewards/margins": 18.238555908203125, "rewards/rejected": 3.9139866828918457, "step": 764 }, { "epoch": 0.39596273291925466, "grad_norm": 1.4006268978118896, "learning_rate": 9.817486687783901e-06, "loss": 0.23690073192119598, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.30471420288086, "rewards/margins": 16.721717834472656, "rewards/rejected": 3.5817413330078125, "step": 765 }, { "epoch": 0.39648033126293997, "grad_norm": 0.9719031453132629, "learning_rate": 9.816722136273024e-06, "loss": 0.17282602190971375, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.76789093017578, "rewards/margins": 16.85662841796875, "rewards/rejected": 3.9118762016296387, "step": 766 }, { "epoch": 0.3969979296066253, "grad_norm": 1.5405819416046143, "learning_rate": 9.815956016643522e-06, "loss": 0.23001828789710999, "rewards/accuracies": 0.890625, "rewards/chosen": 18.78411865234375, "rewards/margins": 15.861465454101562, "rewards/rejected": 2.9198875427246094, "step": 767 }, { "epoch": 0.39751552795031053, "grad_norm": 1.259480595588684, "learning_rate": 9.815188329144809e-06, "loss": 0.2280757874250412, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.730323791503906, "rewards/margins": 18.092998504638672, "rewards/rejected": 3.6419460773468018, "step": 768 }, { "epoch": 0.39803312629399584, "grad_norm": 1.0625183582305908, "learning_rate": 9.814419074026814e-06, "loss": 0.2301289439201355, "rewards/accuracies": 0.875, "rewards/chosen": 22.782777786254883, "rewards/margins": 18.861534118652344, "rewards/rejected": 3.922138214111328, "step": 769 }, { "epoch": 0.39855072463768115, "grad_norm": 1.150001883506775, "learning_rate": 9.813648251539971e-06, "loss": 0.1792278289794922, "rewards/accuracies": 0.890625, "rewards/chosen": 21.73516273498535, "rewards/margins": 17.60491180419922, "rewards/rejected": 4.12347412109375, "step": 770 }, { "epoch": 0.39906832298136646, "grad_norm": 0.7456644177436829, "learning_rate": 9.812875861935229e-06, "loss": 0.18013307452201843, "rewards/accuracies": 0.921875, "rewards/chosen": 21.083425521850586, "rewards/margins": 16.93808937072754, "rewards/rejected": 4.14506721496582, "step": 771 }, { "epoch": 0.3995859213250518, "grad_norm": 1.2431856393814087, "learning_rate": 9.812101905464043e-06, "loss": 0.25051817297935486, "rewards/accuracies": 0.875, "rewards/chosen": 19.98161506652832, "rewards/margins": 16.09241485595703, "rewards/rejected": 3.8905410766601562, "step": 772 }, { "epoch": 0.4001035196687371, "grad_norm": 0.7587676644325256, "learning_rate": 9.81132638237838e-06, "loss": 0.1910700798034668, "rewards/accuracies": 0.90625, "rewards/chosen": 21.4517822265625, "rewards/margins": 17.072547912597656, "rewards/rejected": 4.380359649658203, "step": 773 }, { "epoch": 0.40062111801242234, "grad_norm": 0.6740064024925232, "learning_rate": 9.810549292930718e-06, "loss": 0.16134238243103027, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.105844497680664, "rewards/margins": 17.22258758544922, "rewards/rejected": 3.8796091079711914, "step": 774 }, { "epoch": 0.40113871635610765, "grad_norm": 1.1945514678955078, "learning_rate": 9.809770637374046e-06, "loss": 0.21174287796020508, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.453231811523438, "rewards/margins": 16.75200653076172, "rewards/rejected": 4.706138610839844, "step": 775 }, { "epoch": 0.40165631469979296, "grad_norm": 1.0767347812652588, "learning_rate": 9.808990415961858e-06, "loss": 0.17042604088783264, "rewards/accuracies": 0.9375, "rewards/chosen": 20.929555892944336, "rewards/margins": 16.25391387939453, "rewards/rejected": 4.680454254150391, "step": 776 }, { "epoch": 0.40217391304347827, "grad_norm": 2.8255748748779297, "learning_rate": 9.808208628948161e-06, "loss": 0.211513489484787, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.304943084716797, "rewards/margins": 16.856468200683594, "rewards/rejected": 5.445032119750977, "step": 777 }, { "epoch": 0.4026915113871636, "grad_norm": 1.243708848953247, "learning_rate": 9.807425276587473e-06, "loss": 0.18465031683444977, "rewards/accuracies": 0.921875, "rewards/chosen": 16.938617706298828, "rewards/margins": 13.455680847167969, "rewards/rejected": 3.4851741790771484, "step": 778 }, { "epoch": 0.40320910973084884, "grad_norm": 1.8138669729232788, "learning_rate": 9.806640359134819e-06, "loss": 0.18371735513210297, "rewards/accuracies": 0.9140625, "rewards/chosen": 15.424005508422852, "rewards/margins": 12.536155700683594, "rewards/rejected": 2.8883819580078125, "step": 779 }, { "epoch": 0.40372670807453415, "grad_norm": 1.7039378881454468, "learning_rate": 9.805853876845734e-06, "loss": 0.19788311421871185, "rewards/accuracies": 0.890625, "rewards/chosen": 18.135122299194336, "rewards/margins": 14.618331909179688, "rewards/rejected": 3.51456356048584, "step": 780 }, { "epoch": 0.40424430641821946, "grad_norm": 0.8874423503875732, "learning_rate": 9.805065829976263e-06, "loss": 0.1695145219564438, "rewards/accuracies": 0.890625, "rewards/chosen": 13.955020904541016, "rewards/margins": 11.536781311035156, "rewards/rejected": 2.4193649291992188, "step": 781 }, { "epoch": 0.40476190476190477, "grad_norm": 0.9339531064033508, "learning_rate": 9.804276218782964e-06, "loss": 0.1677173376083374, "rewards/accuracies": 0.8828125, "rewards/chosen": 13.0989990234375, "rewards/margins": 11.224740982055664, "rewards/rejected": 1.876412034034729, "step": 782 }, { "epoch": 0.4052795031055901, "grad_norm": 1.3641997575759888, "learning_rate": 9.803485043522895e-06, "loss": 0.22793632745742798, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.688286781311035, "rewards/margins": 9.114837646484375, "rewards/rejected": 1.5708274841308594, "step": 783 }, { "epoch": 0.4057971014492754, "grad_norm": 1.2544397115707397, "learning_rate": 9.802692304453631e-06, "loss": 0.21246817708015442, "rewards/accuracies": 0.90625, "rewards/chosen": 10.745420455932617, "rewards/margins": 8.96368408203125, "rewards/rejected": 1.7837371826171875, "step": 784 }, { "epoch": 0.40631469979296064, "grad_norm": 2.067718982696533, "learning_rate": 9.801898001833257e-06, "loss": 0.14453184604644775, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.109672546386719, "rewards/margins": 9.091484069824219, "rewards/rejected": 2.0160675048828125, "step": 785 }, { "epoch": 0.40683229813664595, "grad_norm": 2.2342376708984375, "learning_rate": 9.801102135920359e-06, "loss": 0.19286280870437622, "rewards/accuracies": 0.875, "rewards/chosen": 9.836006164550781, "rewards/margins": 8.149124145507812, "rewards/rejected": 1.6888799667358398, "step": 786 }, { "epoch": 0.40734989648033126, "grad_norm": 1.4714553356170654, "learning_rate": 9.800304706974041e-06, "loss": 0.1691381335258484, "rewards/accuracies": 0.921875, "rewards/chosen": 9.176868438720703, "rewards/margins": 7.934028625488281, "rewards/rejected": 1.240386962890625, "step": 787 }, { "epoch": 0.4078674948240166, "grad_norm": 3.2511892318725586, "learning_rate": 9.799505715253908e-06, "loss": 0.18824255466461182, "rewards/accuracies": 0.921875, "rewards/chosen": 8.749549865722656, "rewards/margins": 7.301307678222656, "rewards/rejected": 1.4489936828613281, "step": 788 }, { "epoch": 0.4083850931677019, "grad_norm": 1.784614086151123, "learning_rate": 9.798705161020081e-06, "loss": 0.25624823570251465, "rewards/accuracies": 0.8671875, "rewards/chosen": 9.511926651000977, "rewards/margins": 8.212875366210938, "rewards/rejected": 1.2987086772918701, "step": 789 }, { "epoch": 0.40890269151138714, "grad_norm": 2.871861457824707, "learning_rate": 9.797903044533184e-06, "loss": 0.2903103828430176, "rewards/accuracies": 0.890625, "rewards/chosen": 9.176839828491211, "rewards/margins": 7.621574401855469, "rewards/rejected": 1.5566024780273438, "step": 790 }, { "epoch": 0.40942028985507245, "grad_norm": 1.7746596336364746, "learning_rate": 9.797099366054352e-06, "loss": 0.24460569024085999, "rewards/accuracies": 0.890625, "rewards/chosen": 8.140655517578125, "rewards/margins": 6.9302978515625, "rewards/rejected": 1.2128753662109375, "step": 791 }, { "epoch": 0.40993788819875776, "grad_norm": 4.441712379455566, "learning_rate": 9.79629412584523e-06, "loss": 0.19758102297782898, "rewards/accuracies": 0.890625, "rewards/chosen": 12.570899963378906, "rewards/margins": 10.64895248413086, "rewards/rejected": 1.9192695617675781, "step": 792 }, { "epoch": 0.41045548654244307, "grad_norm": 1.0158239603042603, "learning_rate": 9.795487324167966e-06, "loss": 0.15432527661323547, "rewards/accuracies": 0.9296875, "rewards/chosen": 13.976432800292969, "rewards/margins": 12.049545288085938, "rewards/rejected": 1.9225006103515625, "step": 793 }, { "epoch": 0.4109730848861284, "grad_norm": 0.6805354952812195, "learning_rate": 9.794678961285226e-06, "loss": 0.11704470962285995, "rewards/accuracies": 0.953125, "rewards/chosen": 15.131904602050781, "rewards/margins": 12.526283264160156, "rewards/rejected": 2.603504180908203, "step": 794 }, { "epoch": 0.4114906832298137, "grad_norm": 1.2786864042282104, "learning_rate": 9.793869037460172e-06, "loss": 0.22743770480155945, "rewards/accuracies": 0.890625, "rewards/chosen": 15.503776550292969, "rewards/margins": 13.215591430664062, "rewards/rejected": 2.2928810119628906, "step": 795 }, { "epoch": 0.41200828157349895, "grad_norm": 1.1055896282196045, "learning_rate": 9.793057552956486e-06, "loss": 0.17810559272766113, "rewards/accuracies": 0.90625, "rewards/chosen": 16.581531524658203, "rewards/margins": 14.12200927734375, "rewards/rejected": 2.4613876342773438, "step": 796 }, { "epoch": 0.41252587991718426, "grad_norm": 0.8924383521080017, "learning_rate": 9.792244508038349e-06, "loss": 0.14294368028640747, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.648704528808594, "rewards/margins": 16.2022705078125, "rewards/rejected": 3.4433176517486572, "step": 797 }, { "epoch": 0.41304347826086957, "grad_norm": 3.4473252296447754, "learning_rate": 9.791429902970454e-06, "loss": 0.2730567455291748, "rewards/accuracies": 0.859375, "rewards/chosen": 19.414043426513672, "rewards/margins": 16.016204833984375, "rewards/rejected": 3.3965682983398438, "step": 798 }, { "epoch": 0.4135610766045549, "grad_norm": 1.4583888053894043, "learning_rate": 9.790613738018004e-06, "loss": 0.2335437387228012, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.747220993041992, "rewards/margins": 17.5216064453125, "rewards/rejected": 4.230310440063477, "step": 799 }, { "epoch": 0.4140786749482402, "grad_norm": 1.1060925722122192, "learning_rate": 9.789796013446705e-06, "loss": 0.1688200831413269, "rewards/accuracies": 0.875, "rewards/chosen": 26.402870178222656, "rewards/margins": 21.165634155273438, "rewards/rejected": 5.236255645751953, "step": 800 }, { "epoch": 0.41459627329192544, "grad_norm": 1.5555874109268188, "learning_rate": 9.788976729522774e-06, "loss": 0.2459203600883484, "rewards/accuracies": 0.859375, "rewards/chosen": 21.44751739501953, "rewards/margins": 17.796432495117188, "rewards/rejected": 3.6460113525390625, "step": 801 }, { "epoch": 0.41511387163561075, "grad_norm": 1.6679444313049316, "learning_rate": 9.788155886512935e-06, "loss": 0.2888544797897339, "rewards/accuracies": 0.8203125, "rewards/chosen": 23.626876831054688, "rewards/margins": 18.63892364501953, "rewards/rejected": 4.984996795654297, "step": 802 }, { "epoch": 0.41563146997929606, "grad_norm": 1.0737512111663818, "learning_rate": 9.787333484684418e-06, "loss": 0.1791670024394989, "rewards/accuracies": 0.921875, "rewards/chosen": 25.46088409423828, "rewards/margins": 20.410926818847656, "rewards/rejected": 5.048062801361084, "step": 803 }, { "epoch": 0.4161490683229814, "grad_norm": 1.1706535816192627, "learning_rate": 9.786509524304963e-06, "loss": 0.22595301270484924, "rewards/accuracies": 0.8984375, "rewards/chosen": 25.721670150756836, "rewards/margins": 20.594772338867188, "rewards/rejected": 5.126001358032227, "step": 804 }, { "epoch": 0.4166666666666667, "grad_norm": 1.1380809545516968, "learning_rate": 9.785684005642816e-06, "loss": 0.20053280889987946, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.380325317382812, "rewards/margins": 18.1368408203125, "rewards/rejected": 3.2414512634277344, "step": 805 }, { "epoch": 0.41718426501035194, "grad_norm": 0.6992011070251465, "learning_rate": 9.784856928966732e-06, "loss": 0.15656645596027374, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.417200088500977, "rewards/margins": 20.75360870361328, "rewards/rejected": 3.6624069213867188, "step": 806 }, { "epoch": 0.41770186335403725, "grad_norm": 0.9150616526603699, "learning_rate": 9.78402829454597e-06, "loss": 0.2215949445962906, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.950504302978516, "rewards/margins": 18.32208251953125, "rewards/rejected": 3.6319379806518555, "step": 807 }, { "epoch": 0.41821946169772256, "grad_norm": 1.0275514125823975, "learning_rate": 9.783198102650298e-06, "loss": 0.2102573812007904, "rewards/accuracies": 0.890625, "rewards/chosen": 20.662710189819336, "rewards/margins": 17.383216857910156, "rewards/rejected": 3.2734549045562744, "step": 808 }, { "epoch": 0.41873706004140787, "grad_norm": 1.257975459098816, "learning_rate": 9.782366353549992e-06, "loss": 0.27980244159698486, "rewards/accuracies": 0.8046875, "rewards/chosen": 20.789966583251953, "rewards/margins": 17.6636962890625, "rewards/rejected": 3.1255569458007812, "step": 809 }, { "epoch": 0.4192546583850932, "grad_norm": 0.9574421644210815, "learning_rate": 9.781533047515832e-06, "loss": 0.17503947019577026, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.537109375, "rewards/margins": 19.665740966796875, "rewards/rejected": 3.86529541015625, "step": 810 }, { "epoch": 0.4197722567287785, "grad_norm": 0.8174079656600952, "learning_rate": 9.780698184819108e-06, "loss": 0.1759582906961441, "rewards/accuracies": 0.890625, "rewards/chosen": 24.067991256713867, "rewards/margins": 20.497894287109375, "rewards/rejected": 3.580531120300293, "step": 811 }, { "epoch": 0.42028985507246375, "grad_norm": 1.700168251991272, "learning_rate": 9.779861765731616e-06, "loss": 0.21252557635307312, "rewards/accuracies": 0.90625, "rewards/chosen": 23.556678771972656, "rewards/margins": 19.378639221191406, "rewards/rejected": 4.179678916931152, "step": 812 }, { "epoch": 0.42080745341614906, "grad_norm": 1.589462161064148, "learning_rate": 9.779023790525658e-06, "loss": 0.2079090178012848, "rewards/accuracies": 0.90625, "rewards/chosen": 23.940855026245117, "rewards/margins": 19.97411346435547, "rewards/rejected": 3.9765920639038086, "step": 813 }, { "epoch": 0.42132505175983437, "grad_norm": 0.9135864973068237, "learning_rate": 9.778184259474044e-06, "loss": 0.16427713632583618, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.2706298828125, "rewards/margins": 20.197555541992188, "rewards/rejected": 4.07098388671875, "step": 814 }, { "epoch": 0.4218426501035197, "grad_norm": 1.3698989152908325, "learning_rate": 9.777343172850086e-06, "loss": 0.18449291586875916, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.978395462036133, "rewards/margins": 20.347976684570312, "rewards/rejected": 4.632026672363281, "step": 815 }, { "epoch": 0.422360248447205, "grad_norm": 1.2395448684692383, "learning_rate": 9.776500530927609e-06, "loss": 0.20697848498821259, "rewards/accuracies": 0.875, "rewards/chosen": 24.25595474243164, "rewards/margins": 19.3062744140625, "rewards/rejected": 4.956084251403809, "step": 816 }, { "epoch": 0.42287784679089024, "grad_norm": 2.114337921142578, "learning_rate": 9.77565633398094e-06, "loss": 0.17527279257774353, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.905616760253906, "rewards/margins": 24.91663360595703, "rewards/rejected": 5.98866081237793, "step": 817 }, { "epoch": 0.42339544513457555, "grad_norm": 1.336929440498352, "learning_rate": 9.774810582284913e-06, "loss": 0.1864233911037445, "rewards/accuracies": 0.90625, "rewards/chosen": 25.589534759521484, "rewards/margins": 20.980396270751953, "rewards/rejected": 4.604620933532715, "step": 818 }, { "epoch": 0.42391304347826086, "grad_norm": 1.1310011148452759, "learning_rate": 9.77396327611487e-06, "loss": 0.14228308200836182, "rewards/accuracies": 0.921875, "rewards/chosen": 25.01062774658203, "rewards/margins": 20.501022338867188, "rewards/rejected": 4.511472702026367, "step": 819 }, { "epoch": 0.4244306418219462, "grad_norm": 1.8936266899108887, "learning_rate": 9.773114415746657e-06, "loss": 0.2572318911552429, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.636795043945312, "rewards/margins": 17.512496948242188, "rewards/rejected": 3.1290597915649414, "step": 820 }, { "epoch": 0.4249482401656315, "grad_norm": 4.396151542663574, "learning_rate": 9.772264001456623e-06, "loss": 0.300519198179245, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.466861724853516, "rewards/margins": 17.918792724609375, "rewards/rejected": 3.550806999206543, "step": 821 }, { "epoch": 0.4254658385093168, "grad_norm": 1.9850369691848755, "learning_rate": 9.771412033521634e-06, "loss": 0.21970364451408386, "rewards/accuracies": 0.890625, "rewards/chosen": 21.501232147216797, "rewards/margins": 18.71337127685547, "rewards/rejected": 2.787281036376953, "step": 822 }, { "epoch": 0.42598343685300205, "grad_norm": 2.1570611000061035, "learning_rate": 9.770558512219049e-06, "loss": 0.2113747000694275, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.461204528808594, "rewards/margins": 17.38214111328125, "rewards/rejected": 3.07827091217041, "step": 823 }, { "epoch": 0.42650103519668736, "grad_norm": 1.540856957435608, "learning_rate": 9.769703437826737e-06, "loss": 0.20443600416183472, "rewards/accuracies": 0.890625, "rewards/chosen": 19.275527954101562, "rewards/margins": 17.224456787109375, "rewards/rejected": 2.0468406677246094, "step": 824 }, { "epoch": 0.42701863354037267, "grad_norm": 2.1991546154022217, "learning_rate": 9.768846810623078e-06, "loss": 0.33278506994247437, "rewards/accuracies": 0.859375, "rewards/chosen": 17.37996482849121, "rewards/margins": 15.12786865234375, "rewards/rejected": 2.2525205612182617, "step": 825 }, { "epoch": 0.427536231884058, "grad_norm": 2.3043510913848877, "learning_rate": 9.76798863088695e-06, "loss": 0.18941235542297363, "rewards/accuracies": 0.890625, "rewards/chosen": 15.077163696289062, "rewards/margins": 13.4759521484375, "rewards/rejected": 1.6015119552612305, "step": 826 }, { "epoch": 0.4280538302277433, "grad_norm": 1.6939955949783325, "learning_rate": 9.76712889889774e-06, "loss": 0.24997150897979736, "rewards/accuracies": 0.890625, "rewards/chosen": 14.694540023803711, "rewards/margins": 13.089088439941406, "rewards/rejected": 1.604513168334961, "step": 827 }, { "epoch": 0.42857142857142855, "grad_norm": 0.9616042375564575, "learning_rate": 9.766267614935342e-06, "loss": 0.1795942485332489, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.826009750366211, "rewards/margins": 13.584613800048828, "rewards/rejected": 1.2441387176513672, "step": 828 }, { "epoch": 0.42908902691511386, "grad_norm": 1.0883750915527344, "learning_rate": 9.765404779280151e-06, "loss": 0.20178912580013275, "rewards/accuracies": 0.90625, "rewards/chosen": 15.351655960083008, "rewards/margins": 13.453689575195312, "rewards/rejected": 1.8957748413085938, "step": 829 }, { "epoch": 0.42960662525879917, "grad_norm": 1.3544753789901733, "learning_rate": 9.76454039221307e-06, "loss": 0.24630728363990784, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.386497497558594, "rewards/margins": 10.625991821289062, "rewards/rejected": 0.7624657154083252, "step": 830 }, { "epoch": 0.4301242236024845, "grad_norm": 1.4327460527420044, "learning_rate": 9.763674454015506e-06, "loss": 0.23618444800376892, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.195838928222656, "rewards/margins": 12.245628356933594, "rewards/rejected": 0.9469442367553711, "step": 831 }, { "epoch": 0.4306418219461698, "grad_norm": 0.9512755274772644, "learning_rate": 9.762806964969372e-06, "loss": 0.20237095654010773, "rewards/accuracies": 0.90625, "rewards/chosen": 12.237289428710938, "rewards/margins": 11.04766845703125, "rewards/rejected": 1.190042495727539, "step": 832 }, { "epoch": 0.4311594202898551, "grad_norm": 1.2067569494247437, "learning_rate": 9.761937925357086e-06, "loss": 0.2291947901248932, "rewards/accuracies": 0.8515625, "rewards/chosen": 13.05411434173584, "rewards/margins": 11.798053741455078, "rewards/rejected": 1.252912998199463, "step": 833 }, { "epoch": 0.43167701863354035, "grad_norm": 1.2156050205230713, "learning_rate": 9.76106733546157e-06, "loss": 0.23221099376678467, "rewards/accuracies": 0.859375, "rewards/chosen": 13.006240844726562, "rewards/margins": 11.965888977050781, "rewards/rejected": 1.0430984497070312, "step": 834 }, { "epoch": 0.43219461697722567, "grad_norm": 1.28280508518219, "learning_rate": 9.760195195566248e-06, "loss": 0.2366698980331421, "rewards/accuracies": 0.875, "rewards/chosen": 13.060577392578125, "rewards/margins": 11.933486938476562, "rewards/rejected": 1.1317615509033203, "step": 835 }, { "epoch": 0.432712215320911, "grad_norm": 0.8626473546028137, "learning_rate": 9.759321505955055e-06, "loss": 0.20035813748836517, "rewards/accuracies": 0.9375, "rewards/chosen": 13.088546752929688, "rewards/margins": 12.122161865234375, "rewards/rejected": 0.9632134437561035, "step": 836 }, { "epoch": 0.4332298136645963, "grad_norm": 1.0725581645965576, "learning_rate": 9.758446266912425e-06, "loss": 0.15771488845348358, "rewards/accuracies": 0.9375, "rewards/chosen": 17.96158790588379, "rewards/margins": 16.608383178710938, "rewards/rejected": 1.353085994720459, "step": 837 }, { "epoch": 0.4337474120082816, "grad_norm": 1.4432188272476196, "learning_rate": 9.7575694787233e-06, "loss": 0.1610233187675476, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.563961029052734, "rewards/margins": 14.792320251464844, "rewards/rejected": 1.7729568481445312, "step": 838 }, { "epoch": 0.43426501035196685, "grad_norm": 0.9213033318519592, "learning_rate": 9.756691141673121e-06, "loss": 0.18999728560447693, "rewards/accuracies": 0.890625, "rewards/chosen": 17.30863380432129, "rewards/margins": 15.858657836914062, "rewards/rejected": 1.4472885131835938, "step": 839 }, { "epoch": 0.43478260869565216, "grad_norm": 1.166054368019104, "learning_rate": 9.755811256047842e-06, "loss": 0.188326895236969, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.89948844909668, "rewards/margins": 15.347267150878906, "rewards/rejected": 1.5454959869384766, "step": 840 }, { "epoch": 0.4353002070393375, "grad_norm": 1.3031243085861206, "learning_rate": 9.754929822133914e-06, "loss": 0.2548946142196655, "rewards/accuracies": 0.875, "rewards/chosen": 19.966711044311523, "rewards/margins": 17.485031127929688, "rewards/rejected": 2.484375, "step": 841 }, { "epoch": 0.4358178053830228, "grad_norm": 1.1804670095443726, "learning_rate": 9.754046840218292e-06, "loss": 0.26743462681770325, "rewards/accuracies": 0.875, "rewards/chosen": 21.839305877685547, "rewards/margins": 19.383392333984375, "rewards/rejected": 2.462463617324829, "step": 842 }, { "epoch": 0.4363354037267081, "grad_norm": 1.1584434509277344, "learning_rate": 9.75316231058844e-06, "loss": 0.21667474508285522, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.350875854492188, "rewards/margins": 19.161087036132812, "rewards/rejected": 2.196065902709961, "step": 843 }, { "epoch": 0.43685300207039335, "grad_norm": 1.037217140197754, "learning_rate": 9.752276233532322e-06, "loss": 0.23116208612918854, "rewards/accuracies": 0.8671875, "rewards/chosen": 23.310914993286133, "rewards/margins": 20.2078857421875, "rewards/rejected": 3.102022647857666, "step": 844 }, { "epoch": 0.43737060041407866, "grad_norm": 0.6697558164596558, "learning_rate": 9.751388609338407e-06, "loss": 0.15853825211524963, "rewards/accuracies": 0.921875, "rewards/chosen": 25.76512908935547, "rewards/margins": 21.863624572753906, "rewards/rejected": 3.9006080627441406, "step": 845 }, { "epoch": 0.43788819875776397, "grad_norm": 0.8569114804267883, "learning_rate": 9.750499438295667e-06, "loss": 0.1792735755443573, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.957040786743164, "rewards/margins": 22.344619750976562, "rewards/rejected": 4.619014739990234, "step": 846 }, { "epoch": 0.4384057971014493, "grad_norm": 1.2976925373077393, "learning_rate": 9.749608720693578e-06, "loss": 0.1790296733379364, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.990625381469727, "rewards/margins": 23.62164306640625, "rewards/rejected": 4.360935211181641, "step": 847 }, { "epoch": 0.4389233954451346, "grad_norm": 1.4810380935668945, "learning_rate": 9.74871645682212e-06, "loss": 0.1676499843597412, "rewards/accuracies": 0.90625, "rewards/chosen": 30.990318298339844, "rewards/margins": 25.320587158203125, "rewards/rejected": 5.672186374664307, "step": 848 }, { "epoch": 0.4394409937888199, "grad_norm": 1.3064736127853394, "learning_rate": 9.747822646971777e-06, "loss": 0.2007116675376892, "rewards/accuracies": 0.890625, "rewards/chosen": 30.532459259033203, "rewards/margins": 24.936569213867188, "rewards/rejected": 5.586456298828125, "step": 849 }, { "epoch": 0.43995859213250516, "grad_norm": 1.7098931074142456, "learning_rate": 9.746927291433532e-06, "loss": 0.193487286567688, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.230304718017578, "rewards/margins": 24.152679443359375, "rewards/rejected": 7.076604843139648, "step": 850 }, { "epoch": 0.44047619047619047, "grad_norm": 1.3382493257522583, "learning_rate": 9.746030390498877e-06, "loss": 0.21135176718235016, "rewards/accuracies": 0.890625, "rewards/chosen": 31.027488708496094, "rewards/margins": 23.036869049072266, "rewards/rejected": 7.996651649475098, "step": 851 }, { "epoch": 0.4409937888198758, "grad_norm": 1.3908381462097168, "learning_rate": 9.745131944459804e-06, "loss": 0.22137451171875, "rewards/accuracies": 0.8828125, "rewards/chosen": 31.161819458007812, "rewards/margins": 23.998519897460938, "rewards/rejected": 7.164905071258545, "step": 852 }, { "epoch": 0.4415113871635611, "grad_norm": 0.948134183883667, "learning_rate": 9.74423195360881e-06, "loss": 0.16102342307567596, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.373626708984375, "rewards/margins": 23.15696144104004, "rewards/rejected": 7.220305442810059, "step": 853 }, { "epoch": 0.4420289855072464, "grad_norm": 0.772343635559082, "learning_rate": 9.743330418238892e-06, "loss": 0.1681777387857437, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.07157897949219, "rewards/margins": 24.57958984375, "rewards/rejected": 7.492275238037109, "step": 854 }, { "epoch": 0.44254658385093165, "grad_norm": 1.1609911918640137, "learning_rate": 9.74242733864355e-06, "loss": 0.1927567720413208, "rewards/accuracies": 0.8828125, "rewards/chosen": 34.2916145324707, "rewards/margins": 24.562652587890625, "rewards/rejected": 9.733749389648438, "step": 855 }, { "epoch": 0.44306418219461696, "grad_norm": 1.3254281282424927, "learning_rate": 9.741522715116792e-06, "loss": 0.24966266751289368, "rewards/accuracies": 0.859375, "rewards/chosen": 33.425594329833984, "rewards/margins": 24.483978271484375, "rewards/rejected": 8.942936897277832, "step": 856 }, { "epoch": 0.4435817805383023, "grad_norm": 0.8504253029823303, "learning_rate": 9.740616547953118e-06, "loss": 0.1693175733089447, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.997299194335938, "rewards/margins": 23.888519287109375, "rewards/rejected": 7.1021270751953125, "step": 857 }, { "epoch": 0.4440993788819876, "grad_norm": 2.431537628173828, "learning_rate": 9.739708837447544e-06, "loss": 0.2345900535583496, "rewards/accuracies": 0.875, "rewards/chosen": 30.47187614440918, "rewards/margins": 23.456802368164062, "rewards/rejected": 7.023408889770508, "step": 858 }, { "epoch": 0.4446169772256729, "grad_norm": 3.0177531242370605, "learning_rate": 9.73879958389558e-06, "loss": 0.25166717171669006, "rewards/accuracies": 0.8828125, "rewards/chosen": 37.40204620361328, "rewards/margins": 26.4898681640625, "rewards/rejected": 10.911603927612305, "step": 859 }, { "epoch": 0.4451345755693582, "grad_norm": 1.7441684007644653, "learning_rate": 9.737888787593238e-06, "loss": 0.30412647128105164, "rewards/accuracies": 0.8359375, "rewards/chosen": 29.21139144897461, "rewards/margins": 21.54074478149414, "rewards/rejected": 7.669666290283203, "step": 860 }, { "epoch": 0.44565217391304346, "grad_norm": 2.5366711616516113, "learning_rate": 9.736976448837037e-06, "loss": 0.1774640679359436, "rewards/accuracies": 0.921875, "rewards/chosen": 32.585105895996094, "rewards/margins": 24.57990264892578, "rewards/rejected": 8.020057678222656, "step": 861 }, { "epoch": 0.44616977225672877, "grad_norm": 1.9754431247711182, "learning_rate": 9.736062567923991e-06, "loss": 0.197488933801651, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.23027801513672, "rewards/margins": 20.672714233398438, "rewards/rejected": 6.568197250366211, "step": 862 }, { "epoch": 0.4466873706004141, "grad_norm": 0.9413161873817444, "learning_rate": 9.735147145151626e-06, "loss": 0.19837945699691772, "rewards/accuracies": 0.890625, "rewards/chosen": 26.225601196289062, "rewards/margins": 21.149032592773438, "rewards/rejected": 5.068975448608398, "step": 863 }, { "epoch": 0.4472049689440994, "grad_norm": 0.5125924348831177, "learning_rate": 9.734230180817962e-06, "loss": 0.13502928614616394, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.475048065185547, "rewards/margins": 23.581817626953125, "rewards/rejected": 6.895469665527344, "step": 864 }, { "epoch": 0.4477225672877847, "grad_norm": 0.9092521071434021, "learning_rate": 9.733311675221523e-06, "loss": 0.1897944062948227, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.123886108398438, "rewards/margins": 20.740463256835938, "rewards/rejected": 6.388175964355469, "step": 865 }, { "epoch": 0.44824016563146996, "grad_norm": 1.2203881740570068, "learning_rate": 9.732391628661337e-06, "loss": 0.18869855999946594, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.63132095336914, "rewards/margins": 18.475433349609375, "rewards/rejected": 4.1674957275390625, "step": 866 }, { "epoch": 0.44875776397515527, "grad_norm": 0.8329616189002991, "learning_rate": 9.731470041436928e-06, "loss": 0.22963076829910278, "rewards/accuracies": 0.890625, "rewards/chosen": 22.44137954711914, "rewards/margins": 18.410903930664062, "rewards/rejected": 4.0330047607421875, "step": 867 }, { "epoch": 0.4492753623188406, "grad_norm": 2.3714637756347656, "learning_rate": 9.73054691384833e-06, "loss": 0.2191944420337677, "rewards/accuracies": 0.90625, "rewards/chosen": 28.055850982666016, "rewards/margins": 22.0716552734375, "rewards/rejected": 5.984643936157227, "step": 868 }, { "epoch": 0.4497929606625259, "grad_norm": 2.056036949157715, "learning_rate": 9.729622246196068e-06, "loss": 0.21803830564022064, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.30110740661621, "rewards/margins": 20.645965576171875, "rewards/rejected": 5.65496826171875, "step": 869 }, { "epoch": 0.4503105590062112, "grad_norm": 0.7741504907608032, "learning_rate": 9.72869603878118e-06, "loss": 0.17965054512023926, "rewards/accuracies": 0.90625, "rewards/chosen": 27.7071533203125, "rewards/margins": 22.082477569580078, "rewards/rejected": 5.623069763183594, "step": 870 }, { "epoch": 0.4508281573498965, "grad_norm": 6.523375988006592, "learning_rate": 9.727768291905197e-06, "loss": 0.23604577779769897, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.23073959350586, "rewards/margins": 20.344270706176758, "rewards/rejected": 5.888750076293945, "step": 871 }, { "epoch": 0.45134575569358176, "grad_norm": 1.1431113481521606, "learning_rate": 9.726839005870155e-06, "loss": 0.17230229079723358, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.38302993774414, "rewards/margins": 21.161468505859375, "rewards/rejected": 6.224234580993652, "step": 872 }, { "epoch": 0.4518633540372671, "grad_norm": 1.3157683610916138, "learning_rate": 9.725908180978586e-06, "loss": 0.24968098104000092, "rewards/accuracies": 0.875, "rewards/chosen": 25.385398864746094, "rewards/margins": 19.6649169921875, "rewards/rejected": 5.720767974853516, "step": 873 }, { "epoch": 0.4523809523809524, "grad_norm": 1.6045035123825073, "learning_rate": 9.72497581753353e-06, "loss": 0.20353245735168457, "rewards/accuracies": 0.90625, "rewards/chosen": 24.017780303955078, "rewards/margins": 18.8126220703125, "rewards/rejected": 5.2048797607421875, "step": 874 }, { "epoch": 0.4528985507246377, "grad_norm": 1.4288020133972168, "learning_rate": 9.724041915838527e-06, "loss": 0.2519252896308899, "rewards/accuracies": 0.875, "rewards/chosen": 22.90102767944336, "rewards/margins": 17.973403930664062, "rewards/rejected": 4.925367832183838, "step": 875 }, { "epoch": 0.453416149068323, "grad_norm": 1.2924222946166992, "learning_rate": 9.72310647619761e-06, "loss": 0.1812881976366043, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.341941833496094, "rewards/margins": 17.408489227294922, "rewards/rejected": 4.936702728271484, "step": 876 }, { "epoch": 0.45393374741200826, "grad_norm": 0.8866689205169678, "learning_rate": 9.722169498915322e-06, "loss": 0.15513765811920166, "rewards/accuracies": 0.90625, "rewards/chosen": 24.882963180541992, "rewards/margins": 19.66254234313965, "rewards/rejected": 5.214433670043945, "step": 877 }, { "epoch": 0.45445134575569357, "grad_norm": 0.9296771287918091, "learning_rate": 9.7212309842967e-06, "loss": 0.2070356160402298, "rewards/accuracies": 0.890625, "rewards/chosen": 24.331703186035156, "rewards/margins": 18.942224502563477, "rewards/rejected": 5.394290447235107, "step": 878 }, { "epoch": 0.4549689440993789, "grad_norm": 0.7788568139076233, "learning_rate": 9.720290932647285e-06, "loss": 0.20058801770210266, "rewards/accuracies": 0.875, "rewards/chosen": 23.76136016845703, "rewards/margins": 18.3035888671875, "rewards/rejected": 5.461972236633301, "step": 879 }, { "epoch": 0.4554865424430642, "grad_norm": 0.6393506526947021, "learning_rate": 9.71934934427312e-06, "loss": 0.11409547924995422, "rewards/accuracies": 0.9921875, "rewards/chosen": 23.001426696777344, "rewards/margins": 18.740020751953125, "rewards/rejected": 4.25999641418457, "step": 880 }, { "epoch": 0.4560041407867495, "grad_norm": 1.0313520431518555, "learning_rate": 9.718406219480743e-06, "loss": 0.1839844435453415, "rewards/accuracies": 0.90625, "rewards/chosen": 24.24999237060547, "rewards/margins": 18.88005828857422, "rewards/rejected": 5.369495391845703, "step": 881 }, { "epoch": 0.45652173913043476, "grad_norm": 1.1338461637496948, "learning_rate": 9.717461558577196e-06, "loss": 0.22856669127941132, "rewards/accuracies": 0.859375, "rewards/chosen": 22.282718658447266, "rewards/margins": 18.076316833496094, "rewards/rejected": 4.199375152587891, "step": 882 }, { "epoch": 0.45703933747412007, "grad_norm": 0.845371663570404, "learning_rate": 9.71651536187002e-06, "loss": 0.1411432921886444, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.812522888183594, "rewards/margins": 18.767093658447266, "rewards/rejected": 6.040464401245117, "step": 883 }, { "epoch": 0.4575569358178054, "grad_norm": 1.7430275678634644, "learning_rate": 9.715567629667258e-06, "loss": 0.24179551005363464, "rewards/accuracies": 0.875, "rewards/chosen": 23.153186798095703, "rewards/margins": 18.226119995117188, "rewards/rejected": 4.934782981872559, "step": 884 }, { "epoch": 0.4580745341614907, "grad_norm": 1.2459484338760376, "learning_rate": 9.714618362277448e-06, "loss": 0.2435063123703003, "rewards/accuracies": 0.875, "rewards/chosen": 22.194229125976562, "rewards/margins": 16.933868408203125, "rewards/rejected": 5.257377624511719, "step": 885 }, { "epoch": 0.458592132505176, "grad_norm": 0.9218516945838928, "learning_rate": 9.713667560009634e-06, "loss": 0.20572927594184875, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.914642333984375, "rewards/margins": 18.754348754882812, "rewards/rejected": 6.1683502197265625, "step": 886 }, { "epoch": 0.4591097308488613, "grad_norm": 0.9683829545974731, "learning_rate": 9.712715223173353e-06, "loss": 0.1952836513519287, "rewards/accuracies": 0.890625, "rewards/chosen": 23.963523864746094, "rewards/margins": 18.447242736816406, "rewards/rejected": 5.5160651206970215, "step": 887 }, { "epoch": 0.45962732919254656, "grad_norm": 1.3154886960983276, "learning_rate": 9.711761352078647e-06, "loss": 0.21839675307273865, "rewards/accuracies": 0.875, "rewards/chosen": 20.41115951538086, "rewards/margins": 15.51611328125, "rewards/rejected": 4.893119812011719, "step": 888 }, { "epoch": 0.4601449275362319, "grad_norm": 1.2550817728042603, "learning_rate": 9.710805947036056e-06, "loss": 0.2036689668893814, "rewards/accuracies": 0.859375, "rewards/chosen": 25.444622039794922, "rewards/margins": 18.404319763183594, "rewards/rejected": 7.044772148132324, "step": 889 }, { "epoch": 0.4606625258799172, "grad_norm": 1.320436954498291, "learning_rate": 9.70984900835662e-06, "loss": 0.2061077207326889, "rewards/accuracies": 0.890625, "rewards/chosen": 23.441287994384766, "rewards/margins": 17.06476593017578, "rewards/rejected": 6.377277374267578, "step": 890 }, { "epoch": 0.4611801242236025, "grad_norm": 0.9859340190887451, "learning_rate": 9.708890536351876e-06, "loss": 0.18707916140556335, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.1773738861084, "rewards/margins": 19.10034942626953, "rewards/rejected": 8.08099365234375, "step": 891 }, { "epoch": 0.4616977225672878, "grad_norm": 0.9229241013526917, "learning_rate": 9.70793053133386e-06, "loss": 0.1865554004907608, "rewards/accuracies": 0.890625, "rewards/chosen": 25.68806266784668, "rewards/margins": 17.924331665039062, "rewards/rejected": 7.761576175689697, "step": 892 }, { "epoch": 0.46221532091097306, "grad_norm": 0.7595138549804688, "learning_rate": 9.70696899361511e-06, "loss": 0.18164023756980896, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.00794219970703, "rewards/margins": 19.126327514648438, "rewards/rejected": 9.879430770874023, "step": 893 }, { "epoch": 0.46273291925465837, "grad_norm": 1.0132794380187988, "learning_rate": 9.70600592350866e-06, "loss": 0.16697299480438232, "rewards/accuracies": 0.890625, "rewards/chosen": 30.423809051513672, "rewards/margins": 20.877525329589844, "rewards/rejected": 9.541358947753906, "step": 894 }, { "epoch": 0.4632505175983437, "grad_norm": 1.0264478921890259, "learning_rate": 9.705041321328049e-06, "loss": 0.1742437779903412, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.93800163269043, "rewards/margins": 18.766448974609375, "rewards/rejected": 9.169198036193848, "step": 895 }, { "epoch": 0.463768115942029, "grad_norm": 1.243565320968628, "learning_rate": 9.704075187387306e-06, "loss": 0.19741135835647583, "rewards/accuracies": 0.875, "rewards/chosen": 30.545028686523438, "rewards/margins": 20.190994262695312, "rewards/rejected": 10.355323791503906, "step": 896 }, { "epoch": 0.4642857142857143, "grad_norm": 1.3876864910125732, "learning_rate": 9.703107522000963e-06, "loss": 0.19762219488620758, "rewards/accuracies": 0.921875, "rewards/chosen": 30.86492347717285, "rewards/margins": 19.111717224121094, "rewards/rejected": 11.747735977172852, "step": 897 }, { "epoch": 0.4648033126293996, "grad_norm": 1.0701409578323364, "learning_rate": 9.702138325484052e-06, "loss": 0.1615380346775055, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.023326873779297, "rewards/margins": 18.683792114257812, "rewards/rejected": 11.343198776245117, "step": 898 }, { "epoch": 0.46532091097308487, "grad_norm": 2.0453202724456787, "learning_rate": 9.701167598152102e-06, "loss": 0.19370950758457184, "rewards/accuracies": 0.890625, "rewards/chosen": 30.79253387451172, "rewards/margins": 19.62054443359375, "rewards/rejected": 11.176063537597656, "step": 899 }, { "epoch": 0.4658385093167702, "grad_norm": 4.026604652404785, "learning_rate": 9.700195340321138e-06, "loss": 0.17493370175361633, "rewards/accuracies": 0.8828125, "rewards/chosen": 30.753559112548828, "rewards/margins": 19.451522827148438, "rewards/rejected": 11.299476623535156, "step": 900 }, { "epoch": 0.4663561076604555, "grad_norm": 0.6045064926147461, "learning_rate": 9.699221552307689e-06, "loss": 0.11954589933156967, "rewards/accuracies": 0.9375, "rewards/chosen": 30.88296127319336, "rewards/margins": 19.691726684570312, "rewards/rejected": 11.192008972167969, "step": 901 }, { "epoch": 0.4668737060041408, "grad_norm": 0.6589952111244202, "learning_rate": 9.698246234428774e-06, "loss": 0.13978451490402222, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.916656494140625, "rewards/margins": 20.88440704345703, "rewards/rejected": 10.022963523864746, "step": 902 }, { "epoch": 0.4673913043478261, "grad_norm": 1.6964291334152222, "learning_rate": 9.69726938700192e-06, "loss": 0.15587040781974792, "rewards/accuracies": 0.921875, "rewards/chosen": 29.485637664794922, "rewards/margins": 19.510940551757812, "rewards/rejected": 9.974424362182617, "step": 903 }, { "epoch": 0.46790890269151136, "grad_norm": 1.2443686723709106, "learning_rate": 9.696291010345145e-06, "loss": 0.16035738587379456, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.837982177734375, "rewards/margins": 19.91888427734375, "rewards/rejected": 8.907604217529297, "step": 904 }, { "epoch": 0.4684265010351967, "grad_norm": 1.0984383821487427, "learning_rate": 9.695311104776963e-06, "loss": 0.21848493814468384, "rewards/accuracies": 0.8671875, "rewards/chosen": 30.313932418823242, "rewards/margins": 21.067550659179688, "rewards/rejected": 9.238407135009766, "step": 905 }, { "epoch": 0.468944099378882, "grad_norm": 1.582338571548462, "learning_rate": 9.694329670616395e-06, "loss": 0.16056492924690247, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.26396179199219, "rewards/margins": 22.063358306884766, "rewards/rejected": 10.200042724609375, "step": 906 }, { "epoch": 0.4694616977225673, "grad_norm": 1.5227437019348145, "learning_rate": 9.69334670818295e-06, "loss": 0.22355492413043976, "rewards/accuracies": 0.875, "rewards/chosen": 34.63679885864258, "rewards/margins": 23.581092834472656, "rewards/rejected": 11.052753448486328, "step": 907 }, { "epoch": 0.4699792960662526, "grad_norm": 2.559962272644043, "learning_rate": 9.69236221779664e-06, "loss": 0.28724151849746704, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.332298278808594, "rewards/margins": 18.930709838867188, "rewards/rejected": 9.399200439453125, "step": 908 }, { "epoch": 0.4704968944099379, "grad_norm": 2.877380132675171, "learning_rate": 9.691376199777972e-06, "loss": 0.2591070830821991, "rewards/accuracies": 0.875, "rewards/chosen": 30.546205520629883, "rewards/margins": 21.773448944091797, "rewards/rejected": 8.770769119262695, "step": 909 }, { "epoch": 0.47101449275362317, "grad_norm": 1.1091169118881226, "learning_rate": 9.690388654447954e-06, "loss": 0.16023170948028564, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.774383544921875, "rewards/margins": 22.645355224609375, "rewards/rejected": 10.1256103515625, "step": 910 }, { "epoch": 0.4715320910973085, "grad_norm": 3.2706122398376465, "learning_rate": 9.689399582128084e-06, "loss": 0.2987600564956665, "rewards/accuracies": 0.8828125, "rewards/chosen": 27.159387588500977, "rewards/margins": 19.790023803710938, "rewards/rejected": 7.365901947021484, "step": 911 }, { "epoch": 0.4720496894409938, "grad_norm": 1.220887541770935, "learning_rate": 9.688408983140365e-06, "loss": 0.19084306061267853, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.39801025390625, "rewards/margins": 24.027210235595703, "rewards/rejected": 6.3660125732421875, "step": 912 }, { "epoch": 0.4725672877846791, "grad_norm": 0.7260750532150269, "learning_rate": 9.687416857807293e-06, "loss": 0.14451715350151062, "rewards/accuracies": 0.9375, "rewards/chosen": 30.44465446472168, "rewards/margins": 25.54986572265625, "rewards/rejected": 4.890815734863281, "step": 913 }, { "epoch": 0.4730848861283644, "grad_norm": 0.8670312762260437, "learning_rate": 9.68642320645186e-06, "loss": 0.15560562908649445, "rewards/accuracies": 0.90625, "rewards/chosen": 26.64767837524414, "rewards/margins": 22.330734252929688, "rewards/rejected": 4.317806243896484, "step": 914 }, { "epoch": 0.47360248447204967, "grad_norm": 1.1256954669952393, "learning_rate": 9.68542802939756e-06, "loss": 0.15068377554416656, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.711034774780273, "rewards/margins": 24.329132080078125, "rewards/rejected": 4.391343593597412, "step": 915 }, { "epoch": 0.474120082815735, "grad_norm": 2.7772769927978516, "learning_rate": 9.684431326968375e-06, "loss": 0.20283013582229614, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.237564086914062, "rewards/margins": 24.287139892578125, "rewards/rejected": 3.948699951171875, "step": 916 }, { "epoch": 0.4746376811594203, "grad_norm": 1.882514476776123, "learning_rate": 9.683433099488793e-06, "loss": 0.22763672471046448, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.661531448364258, "rewards/margins": 20.504547119140625, "rewards/rejected": 2.158720016479492, "step": 917 }, { "epoch": 0.4751552795031056, "grad_norm": 28.357908248901367, "learning_rate": 9.682433347283792e-06, "loss": 0.29598468542099, "rewards/accuracies": 0.859375, "rewards/chosen": 21.72283172607422, "rewards/margins": 18.65985870361328, "rewards/rejected": 3.066819429397583, "step": 918 }, { "epoch": 0.4756728778467909, "grad_norm": 1.1113324165344238, "learning_rate": 9.68143207067885e-06, "loss": 0.1780395805835724, "rewards/accuracies": 0.90625, "rewards/chosen": 19.419723510742188, "rewards/margins": 17.722423553466797, "rewards/rejected": 1.695089340209961, "step": 919 }, { "epoch": 0.47619047619047616, "grad_norm": 0.9125515818595886, "learning_rate": 9.680429269999937e-06, "loss": 0.19196079671382904, "rewards/accuracies": 0.921875, "rewards/chosen": 20.44489288330078, "rewards/margins": 18.452239990234375, "rewards/rejected": 1.9907150268554688, "step": 920 }, { "epoch": 0.4767080745341615, "grad_norm": 1.424607276916504, "learning_rate": 9.679424945573526e-06, "loss": 0.21441644430160522, "rewards/accuracies": 0.8984375, "rewards/chosen": 16.3156681060791, "rewards/margins": 15.057090759277344, "rewards/rejected": 1.2597875595092773, "step": 921 }, { "epoch": 0.4772256728778468, "grad_norm": 1.0485131740570068, "learning_rate": 9.678419097726578e-06, "loss": 0.19131919741630554, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.494728088378906, "rewards/margins": 15.325668334960938, "rewards/rejected": 1.1675910949707031, "step": 922 }, { "epoch": 0.4777432712215321, "grad_norm": 1.774236798286438, "learning_rate": 9.677411726786556e-06, "loss": 0.20353950560092926, "rewards/accuracies": 0.875, "rewards/chosen": 17.01683807373047, "rewards/margins": 15.8331298828125, "rewards/rejected": 1.1809215545654297, "step": 923 }, { "epoch": 0.4782608695652174, "grad_norm": 1.2170634269714355, "learning_rate": 9.676402833081418e-06, "loss": 0.19539043307304382, "rewards/accuracies": 0.890625, "rewards/chosen": 14.016143798828125, "rewards/margins": 12.6822509765625, "rewards/rejected": 1.3343467712402344, "step": 924 }, { "epoch": 0.4787784679089027, "grad_norm": 1.637129306793213, "learning_rate": 9.675392416939616e-06, "loss": 0.16674882173538208, "rewards/accuracies": 0.9375, "rewards/chosen": 13.490849494934082, "rewards/margins": 12.362730026245117, "rewards/rejected": 1.1266136169433594, "step": 925 }, { "epoch": 0.47929606625258797, "grad_norm": 0.9031149744987488, "learning_rate": 9.674380478690096e-06, "loss": 0.13729308545589447, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.647594451904297, "rewards/margins": 14.377979278564453, "rewards/rejected": 1.2665905952453613, "step": 926 }, { "epoch": 0.4798136645962733, "grad_norm": 1.0032752752304077, "learning_rate": 9.673367018662305e-06, "loss": 0.13978861272335052, "rewards/accuracies": 0.953125, "rewards/chosen": 13.820758819580078, "rewards/margins": 13.060714721679688, "rewards/rejected": 0.7604255676269531, "step": 927 }, { "epoch": 0.4803312629399586, "grad_norm": 1.299494981765747, "learning_rate": 9.672352037186179e-06, "loss": 0.21569636464118958, "rewards/accuracies": 0.90625, "rewards/chosen": 17.07156753540039, "rewards/margins": 15.49664306640625, "rewards/rejected": 1.5759506225585938, "step": 928 }, { "epoch": 0.4808488612836439, "grad_norm": 1.4557706117630005, "learning_rate": 9.671335534592155e-06, "loss": 0.18110057711601257, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.353775024414062, "rewards/margins": 19.420166015625, "rewards/rejected": 1.931783676147461, "step": 929 }, { "epoch": 0.4813664596273292, "grad_norm": 1.3235127925872803, "learning_rate": 9.670317511211163e-06, "loss": 0.17226585745811462, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.309722900390625, "rewards/margins": 21.5582275390625, "rewards/rejected": 2.7521514892578125, "step": 930 }, { "epoch": 0.48188405797101447, "grad_norm": 1.9867454767227173, "learning_rate": 9.669297967374626e-06, "loss": 0.2188989818096161, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.9805908203125, "rewards/margins": 21.16883087158203, "rewards/rejected": 2.8163070678710938, "step": 931 }, { "epoch": 0.4824016563146998, "grad_norm": 1.2073966264724731, "learning_rate": 9.668276903414463e-06, "loss": 0.16373075544834137, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.73526954650879, "rewards/margins": 26.761322021484375, "rewards/rejected": 3.9721031188964844, "step": 932 }, { "epoch": 0.4829192546583851, "grad_norm": 4.304364204406738, "learning_rate": 9.667254319663094e-06, "loss": 0.2387881875038147, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.399866104125977, "rewards/margins": 24.721054077148438, "rewards/rejected": 2.6749706268310547, "step": 933 }, { "epoch": 0.4834368530020704, "grad_norm": 3.4335553646087646, "learning_rate": 9.666230216453423e-06, "loss": 0.2316027134656906, "rewards/accuracies": 0.875, "rewards/chosen": 30.080713272094727, "rewards/margins": 25.463363647460938, "rewards/rejected": 4.612068176269531, "step": 934 }, { "epoch": 0.4839544513457557, "grad_norm": 1.4192261695861816, "learning_rate": 9.665204594118856e-06, "loss": 0.12985867261886597, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.38195037841797, "rewards/margins": 28.777328491210938, "rewards/rejected": 5.612545013427734, "step": 935 }, { "epoch": 0.484472049689441, "grad_norm": 0.8914869427680969, "learning_rate": 9.664177452993293e-06, "loss": 0.14956167340278625, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.476783752441406, "rewards/margins": 29.091773986816406, "rewards/rejected": 6.386919021606445, "step": 936 }, { "epoch": 0.4849896480331263, "grad_norm": 2.3256733417510986, "learning_rate": 9.663148793411125e-06, "loss": 0.21017679572105408, "rewards/accuracies": 0.921875, "rewards/chosen": 33.08848190307617, "rewards/margins": 26.02108383178711, "rewards/rejected": 7.065614700317383, "step": 937 }, { "epoch": 0.4855072463768116, "grad_norm": 1.8115071058273315, "learning_rate": 9.66211861570724e-06, "loss": 0.22778615355491638, "rewards/accuracies": 0.90625, "rewards/chosen": 36.05594253540039, "rewards/margins": 28.563514709472656, "rewards/rejected": 7.486625671386719, "step": 938 }, { "epoch": 0.4860248447204969, "grad_norm": 2.763756036758423, "learning_rate": 9.661086920217022e-06, "loss": 0.2320554554462433, "rewards/accuracies": 0.8828125, "rewards/chosen": 34.5411491394043, "rewards/margins": 27.43773651123047, "rewards/rejected": 7.102848052978516, "step": 939 }, { "epoch": 0.4865424430641822, "grad_norm": 6.156484127044678, "learning_rate": 9.660053707276345e-06, "loss": 0.32444536685943604, "rewards/accuracies": 0.8359375, "rewards/chosen": 30.801429748535156, "rewards/margins": 24.491729736328125, "rewards/rejected": 6.308145523071289, "step": 940 }, { "epoch": 0.4870600414078675, "grad_norm": 1.1893357038497925, "learning_rate": 9.659018977221579e-06, "loss": 0.21899881958961487, "rewards/accuracies": 0.8671875, "rewards/chosen": 29.77642822265625, "rewards/margins": 24.22265625, "rewards/rejected": 5.558124542236328, "step": 941 }, { "epoch": 0.48757763975155277, "grad_norm": 0.8333641886711121, "learning_rate": 9.657982730389587e-06, "loss": 0.18924351036548615, "rewards/accuracies": 0.90625, "rewards/chosen": 27.252792358398438, "rewards/margins": 23.128929138183594, "rewards/rejected": 4.123425483703613, "step": 942 }, { "epoch": 0.4880952380952381, "grad_norm": 0.8756508827209473, "learning_rate": 9.656944967117729e-06, "loss": 0.21084022521972656, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.316280364990234, "rewards/margins": 23.74077606201172, "rewards/rejected": 4.568946838378906, "step": 943 }, { "epoch": 0.4886128364389234, "grad_norm": 1.971666693687439, "learning_rate": 9.655905687743857e-06, "loss": 0.2659340500831604, "rewards/accuracies": 0.875, "rewards/chosen": 23.07428741455078, "rewards/margins": 20.039108276367188, "rewards/rejected": 3.043489456176758, "step": 944 }, { "epoch": 0.4891304347826087, "grad_norm": 0.8506483435630798, "learning_rate": 9.654864892606311e-06, "loss": 0.16418501734733582, "rewards/accuracies": 0.921875, "rewards/chosen": 24.803730010986328, "rewards/margins": 21.780364990234375, "rewards/rejected": 3.0199522972106934, "step": 945 }, { "epoch": 0.489648033126294, "grad_norm": 1.6914011240005493, "learning_rate": 9.653822582043932e-06, "loss": 0.2398051619529724, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.589313507080078, "rewards/margins": 15.179397583007812, "rewards/rejected": 1.4071648120880127, "step": 946 }, { "epoch": 0.4901656314699793, "grad_norm": 0.8224114775657654, "learning_rate": 9.652778756396053e-06, "loss": 0.21505150198936462, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.39750099182129, "rewards/margins": 16.277420043945312, "rewards/rejected": 2.113232374191284, "step": 947 }, { "epoch": 0.4906832298136646, "grad_norm": 0.7782872319221497, "learning_rate": 9.651733416002498e-06, "loss": 0.14650790393352509, "rewards/accuracies": 0.953125, "rewards/chosen": 14.908072471618652, "rewards/margins": 14.030792236328125, "rewards/rejected": 0.8761682510375977, "step": 948 }, { "epoch": 0.4912008281573499, "grad_norm": 2.594977378845215, "learning_rate": 9.650686561203586e-06, "loss": 0.1925375908613205, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.6039981842041, "rewards/margins": 15.357833862304688, "rewards/rejected": 1.249643325805664, "step": 949 }, { "epoch": 0.4917184265010352, "grad_norm": 0.8032169342041016, "learning_rate": 9.649638192340126e-06, "loss": 0.17353272438049316, "rewards/accuracies": 0.953125, "rewards/chosen": 14.78088092803955, "rewards/margins": 13.605628967285156, "rewards/rejected": 1.1770601272583008, "step": 950 }, { "epoch": 0.4922360248447205, "grad_norm": 0.7615339159965515, "learning_rate": 9.648588309753421e-06, "loss": 0.20503467321395874, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.412683486938477, "rewards/margins": 13.614051818847656, "rewards/rejected": 0.796966552734375, "step": 951 }, { "epoch": 0.4927536231884058, "grad_norm": 0.7160313725471497, "learning_rate": 9.647536913785273e-06, "loss": 0.150642991065979, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.178244590759277, "rewards/margins": 13.183929443359375, "rewards/rejected": 0.9885501861572266, "step": 952 }, { "epoch": 0.4932712215320911, "grad_norm": 1.4000940322875977, "learning_rate": 9.64648400477797e-06, "loss": 0.2567790746688843, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.267948150634766, "rewards/margins": 14.381614685058594, "rewards/rejected": 0.8864564895629883, "step": 953 }, { "epoch": 0.4937888198757764, "grad_norm": 1.0727025270462036, "learning_rate": 9.64542958307429e-06, "loss": 0.19102665781974792, "rewards/accuracies": 0.90625, "rewards/chosen": 15.125574111938477, "rewards/margins": 14.002548217773438, "rewards/rejected": 1.1191349029541016, "step": 954 }, { "epoch": 0.4943064182194617, "grad_norm": 0.8339785933494568, "learning_rate": 9.644373649017512e-06, "loss": 0.1457386314868927, "rewards/accuracies": 0.9375, "rewards/chosen": 15.920902252197266, "rewards/margins": 14.807952880859375, "rewards/rejected": 1.111541748046875, "step": 955 }, { "epoch": 0.494824016563147, "grad_norm": 0.8054856657981873, "learning_rate": 9.6433162029514e-06, "loss": 0.1568450629711151, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.679275512695312, "rewards/margins": 16.555221557617188, "rewards/rejected": 1.1229019165039062, "step": 956 }, { "epoch": 0.4953416149068323, "grad_norm": 0.8554779887199402, "learning_rate": 9.642257245220214e-06, "loss": 0.15055446326732635, "rewards/accuracies": 0.921875, "rewards/chosen": 18.834077835083008, "rewards/margins": 17.162826538085938, "rewards/rejected": 1.6701202392578125, "step": 957 }, { "epoch": 0.49585921325051757, "grad_norm": 1.8305960893630981, "learning_rate": 9.641196776168706e-06, "loss": 0.23038911819458008, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.351089477539062, "rewards/margins": 18.155155181884766, "rewards/rejected": 1.1965632438659668, "step": 958 }, { "epoch": 0.4963768115942029, "grad_norm": 0.9867692589759827, "learning_rate": 9.64013479614212e-06, "loss": 0.16054925322532654, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.754281997680664, "rewards/margins": 18.868988037109375, "rewards/rejected": 1.8812613487243652, "step": 959 }, { "epoch": 0.4968944099378882, "grad_norm": 1.3919628858566284, "learning_rate": 9.639071305486191e-06, "loss": 0.20403632521629333, "rewards/accuracies": 0.890625, "rewards/chosen": 18.6817684173584, "rewards/margins": 17.543838500976562, "rewards/rejected": 1.1440134048461914, "step": 960 }, { "epoch": 0.4974120082815735, "grad_norm": 2.0568606853485107, "learning_rate": 9.638006304547145e-06, "loss": 0.2387722134590149, "rewards/accuracies": 0.890625, "rewards/chosen": 14.276222229003906, "rewards/margins": 14.139328002929688, "rewards/rejected": 0.1332838535308838, "step": 961 }, { "epoch": 0.4979296066252588, "grad_norm": 1.5677679777145386, "learning_rate": 9.6369397936717e-06, "loss": 0.14355888962745667, "rewards/accuracies": 0.953125, "rewards/chosen": 19.288246154785156, "rewards/margins": 17.661407470703125, "rewards/rejected": 1.6270439624786377, "step": 962 }, { "epoch": 0.4984472049689441, "grad_norm": 1.02091383934021, "learning_rate": 9.635871773207068e-06, "loss": 0.17920224368572235, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.05545425415039, "rewards/margins": 18.350547790527344, "rewards/rejected": 1.708261489868164, "step": 963 }, { "epoch": 0.4989648033126294, "grad_norm": 1.613412618637085, "learning_rate": 9.63480224350095e-06, "loss": 0.2825629711151123, "rewards/accuracies": 0.84375, "rewards/chosen": 15.933371543884277, "rewards/margins": 15.249420166015625, "rewards/rejected": 0.6814651489257812, "step": 964 }, { "epoch": 0.4994824016563147, "grad_norm": 2.6944141387939453, "learning_rate": 9.633731204901541e-06, "loss": 0.233730286359787, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.217077255249023, "rewards/margins": 17.05365753173828, "rewards/rejected": 1.162602424621582, "step": 965 }, { "epoch": 0.5, "grad_norm": 1.8605424165725708, "learning_rate": 9.632658657757523e-06, "loss": 0.33192065358161926, "rewards/accuracies": 0.8203125, "rewards/chosen": 11.199005126953125, "rewards/margins": 10.842864990234375, "rewards/rejected": 0.35730409622192383, "step": 966 }, { "epoch": 0.5005175983436853, "grad_norm": 2.088106393814087, "learning_rate": 9.631584602418071e-06, "loss": 0.20823785662651062, "rewards/accuracies": 0.890625, "rewards/chosen": 14.691139221191406, "rewards/margins": 14.0714111328125, "rewards/rejected": 0.6217631697654724, "step": 967 }, { "epoch": 0.5010351966873706, "grad_norm": 1.4408437013626099, "learning_rate": 9.630509039232855e-06, "loss": 0.2040206789970398, "rewards/accuracies": 0.890625, "rewards/chosen": 13.348074913024902, "rewards/margins": 12.365951538085938, "rewards/rejected": 0.9830436706542969, "step": 968 }, { "epoch": 0.5015527950310559, "grad_norm": 0.9482591152191162, "learning_rate": 9.629431968552027e-06, "loss": 0.21960978209972382, "rewards/accuracies": 0.8984375, "rewards/chosen": 9.690041542053223, "rewards/margins": 9.51104736328125, "rewards/rejected": 0.18045520782470703, "step": 969 }, { "epoch": 0.5020703933747412, "grad_norm": 1.3422707319259644, "learning_rate": 9.62835339072624e-06, "loss": 0.20699769258499146, "rewards/accuracies": 0.890625, "rewards/chosen": 8.651660919189453, "rewards/margins": 8.5401611328125, "rewards/rejected": 0.11021232604980469, "step": 970 }, { "epoch": 0.5025879917184265, "grad_norm": 0.9160814881324768, "learning_rate": 9.627273306106631e-06, "loss": 0.20317116379737854, "rewards/accuracies": 0.90625, "rewards/chosen": 7.667210102081299, "rewards/margins": 7.444538116455078, "rewards/rejected": 0.22253793478012085, "step": 971 }, { "epoch": 0.5031055900621118, "grad_norm": 0.6442532539367676, "learning_rate": 9.62619171504483e-06, "loss": 0.18580344319343567, "rewards/accuracies": 0.90625, "rewards/chosen": 8.424522399902344, "rewards/margins": 8.12091064453125, "rewards/rejected": 0.3036574721336365, "step": 972 }, { "epoch": 0.5036231884057971, "grad_norm": 1.14492928981781, "learning_rate": 9.625108617892953e-06, "loss": 0.2253810167312622, "rewards/accuracies": 0.875, "rewards/chosen": 8.627880096435547, "rewards/margins": 8.361930847167969, "rewards/rejected": 0.2653827667236328, "step": 973 }, { "epoch": 0.5041407867494824, "grad_norm": 0.7656651139259338, "learning_rate": 9.624024015003615e-06, "loss": 0.2054899036884308, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.13283920288086, "rewards/margins": 9.716629028320312, "rewards/rejected": 0.41605520248413086, "step": 974 }, { "epoch": 0.5046583850931677, "grad_norm": 1.3336414098739624, "learning_rate": 9.622937906729915e-06, "loss": 0.19266054034233093, "rewards/accuracies": 0.9296875, "rewards/chosen": 9.450719833374023, "rewards/margins": 8.998573303222656, "rewards/rejected": 0.45325493812561035, "step": 975 }, { "epoch": 0.505175983436853, "grad_norm": 0.8923190236091614, "learning_rate": 9.62185029342544e-06, "loss": 0.19763986766338348, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.593093872070312, "rewards/margins": 9.348495483398438, "rewards/rejected": 0.2459622621536255, "step": 976 }, { "epoch": 0.5056935817805382, "grad_norm": 0.6561490893363953, "learning_rate": 9.620761175444277e-06, "loss": 0.13610032200813293, "rewards/accuracies": 0.9453125, "rewards/chosen": 11.940195083618164, "rewards/margins": 11.264419555664062, "rewards/rejected": 0.6752703189849854, "step": 977 }, { "epoch": 0.5062111801242236, "grad_norm": 0.8460367321968079, "learning_rate": 9.61967055314099e-06, "loss": 0.20845402777194977, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.949675559997559, "rewards/margins": 11.0556640625, "rewards/rejected": 0.8934826850891113, "step": 978 }, { "epoch": 0.5067287784679089, "grad_norm": 1.6146243810653687, "learning_rate": 9.618578426870642e-06, "loss": 0.2568325996398926, "rewards/accuracies": 0.875, "rewards/chosen": 12.188835144042969, "rewards/margins": 11.117660522460938, "rewards/rejected": 1.0754737854003906, "step": 979 }, { "epoch": 0.5072463768115942, "grad_norm": 1.5988067388534546, "learning_rate": 9.61748479698878e-06, "loss": 0.24852780997753143, "rewards/accuracies": 0.875, "rewards/chosen": 13.868875503540039, "rewards/margins": 12.424468994140625, "rewards/rejected": 1.4491653442382812, "step": 980 }, { "epoch": 0.5077639751552795, "grad_norm": 1.2539726495742798, "learning_rate": 9.616389663851447e-06, "loss": 0.20265565812587738, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.339092254638672, "rewards/margins": 13.931175231933594, "rewards/rejected": 1.4073562622070312, "step": 981 }, { "epoch": 0.5082815734989649, "grad_norm": 1.6999602317810059, "learning_rate": 9.61529302781517e-06, "loss": 0.2361280918121338, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.23263931274414, "rewards/margins": 14.625442504882812, "rewards/rejected": 1.6048126220703125, "step": 982 }, { "epoch": 0.5087991718426501, "grad_norm": 1.2726210355758667, "learning_rate": 9.614194889236966e-06, "loss": 0.1533457189798355, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.045425415039062, "rewards/margins": 16.718849182128906, "rewards/rejected": 2.3261070251464844, "step": 983 }, { "epoch": 0.5093167701863354, "grad_norm": 1.4478330612182617, "learning_rate": 9.61309524847434e-06, "loss": 0.1653497815132141, "rewards/accuracies": 0.9296875, "rewards/chosen": 18.606060028076172, "rewards/margins": 16.071670532226562, "rewards/rejected": 2.538572311401367, "step": 984 }, { "epoch": 0.5098343685300207, "grad_norm": 1.1977053880691528, "learning_rate": 9.611994105885292e-06, "loss": 0.21331512928009033, "rewards/accuracies": 0.875, "rewards/chosen": 23.675479888916016, "rewards/margins": 20.204681396484375, "rewards/rejected": 3.4684505462646484, "step": 985 }, { "epoch": 0.510351966873706, "grad_norm": 1.9331355094909668, "learning_rate": 9.610891461828304e-06, "loss": 0.18356949090957642, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.864700317382812, "rewards/margins": 20.273849487304688, "rewards/rejected": 3.5925750732421875, "step": 986 }, { "epoch": 0.5108695652173914, "grad_norm": 0.7963559031486511, "learning_rate": 9.609787316662351e-06, "loss": 0.14530092477798462, "rewards/accuracies": 0.9375, "rewards/chosen": 25.53081512451172, "rewards/margins": 21.631637573242188, "rewards/rejected": 3.897247314453125, "step": 987 }, { "epoch": 0.5113871635610766, "grad_norm": 1.0403493642807007, "learning_rate": 9.608681670746895e-06, "loss": 0.1716160625219345, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.403656005859375, "rewards/margins": 20.664840698242188, "rewards/rejected": 3.738536834716797, "step": 988 }, { "epoch": 0.5119047619047619, "grad_norm": 2.009256362915039, "learning_rate": 9.607574524441887e-06, "loss": 0.2153051793575287, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.02775001525879, "rewards/margins": 20.68206787109375, "rewards/rejected": 3.3451175689697266, "step": 989 }, { "epoch": 0.5124223602484472, "grad_norm": 1.6759254932403564, "learning_rate": 9.606465878107768e-06, "loss": 0.2329484224319458, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.400184631347656, "rewards/margins": 21.393226623535156, "rewards/rejected": 3.0115013122558594, "step": 990 }, { "epoch": 0.5129399585921325, "grad_norm": 0.9658243656158447, "learning_rate": 9.605355732105464e-06, "loss": 0.2212233543395996, "rewards/accuracies": 0.875, "rewards/chosen": 25.99927520751953, "rewards/margins": 22.258432388305664, "rewards/rejected": 3.737100601196289, "step": 991 }, { "epoch": 0.5134575569358178, "grad_norm": 1.5597198009490967, "learning_rate": 9.60424408679639e-06, "loss": 0.2231193482875824, "rewards/accuracies": 0.875, "rewards/chosen": 28.381122589111328, "rewards/margins": 23.6983642578125, "rewards/rejected": 4.677574157714844, "step": 992 }, { "epoch": 0.5139751552795031, "grad_norm": 1.5391877889633179, "learning_rate": 9.603130942542453e-06, "loss": 0.1948878914117813, "rewards/accuracies": 0.890625, "rewards/chosen": 26.846525192260742, "rewards/margins": 22.991683959960938, "rewards/rejected": 3.861705780029297, "step": 993 }, { "epoch": 0.5144927536231884, "grad_norm": 1.2276558876037598, "learning_rate": 9.602016299706045e-06, "loss": 0.17917105555534363, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.94525909423828, "rewards/margins": 21.1795654296875, "rewards/rejected": 2.76373291015625, "step": 994 }, { "epoch": 0.5150103519668737, "grad_norm": 1.0730119943618774, "learning_rate": 9.600900158650043e-06, "loss": 0.20104897022247314, "rewards/accuracies": 0.890625, "rewards/chosen": 22.88974380493164, "rewards/margins": 20.207260131835938, "rewards/rejected": 2.6833724975585938, "step": 995 }, { "epoch": 0.515527950310559, "grad_norm": 0.7535142302513123, "learning_rate": 9.599782519737817e-06, "loss": 0.15123799443244934, "rewards/accuracies": 0.90625, "rewards/chosen": 26.022022247314453, "rewards/margins": 23.447036743164062, "rewards/rejected": 2.5705394744873047, "step": 996 }, { "epoch": 0.5160455486542443, "grad_norm": 1.4209158420562744, "learning_rate": 9.598663383333223e-06, "loss": 0.2170817255973816, "rewards/accuracies": 0.890625, "rewards/chosen": 23.972410202026367, "rewards/margins": 21.397804260253906, "rewards/rejected": 2.5769271850585938, "step": 997 }, { "epoch": 0.5165631469979296, "grad_norm": 1.089407205581665, "learning_rate": 9.597542749800601e-06, "loss": 0.17807818949222565, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.2110652923584, "rewards/margins": 22.960426330566406, "rewards/rejected": 2.248713493347168, "step": 998 }, { "epoch": 0.5170807453416149, "grad_norm": 0.825153112411499, "learning_rate": 9.596420619504783e-06, "loss": 0.15889689326286316, "rewards/accuracies": 0.9375, "rewards/chosen": 24.129207611083984, "rewards/margins": 21.503896713256836, "rewards/rejected": 2.618278980255127, "step": 999 }, { "epoch": 0.5175983436853002, "grad_norm": 0.9759520888328552, "learning_rate": 9.59529699281109e-06, "loss": 0.1527688205242157, "rewards/accuracies": 0.8984375, "rewards/chosen": 25.221359252929688, "rewards/margins": 22.4833984375, "rewards/rejected": 2.7282700538635254, "step": 1000 }, { "epoch": 0.5181159420289855, "grad_norm": 0.8179099559783936, "learning_rate": 9.594171870085321e-06, "loss": 0.15071851015090942, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.988525390625, "rewards/margins": 21.999977111816406, "rewards/rejected": 1.9869766235351562, "step": 1001 }, { "epoch": 0.5186335403726708, "grad_norm": 1.4861894845962524, "learning_rate": 9.59304525169377e-06, "loss": 0.21251749992370605, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.8700008392334, "rewards/margins": 18.019577026367188, "rewards/rejected": 1.8530641794204712, "step": 1002 }, { "epoch": 0.5191511387163561, "grad_norm": 1.0325689315795898, "learning_rate": 9.591917138003218e-06, "loss": 0.1501820683479309, "rewards/accuracies": 0.9375, "rewards/chosen": 27.33783721923828, "rewards/margins": 24.341094970703125, "rewards/rejected": 3.0001611709594727, "step": 1003 }, { "epoch": 0.5196687370600414, "grad_norm": 0.8304260969161987, "learning_rate": 9.590787529380927e-06, "loss": 0.1728104203939438, "rewards/accuracies": 0.890625, "rewards/chosen": 25.405302047729492, "rewards/margins": 21.973312377929688, "rewards/rejected": 3.4296607971191406, "step": 1004 }, { "epoch": 0.5201863354037267, "grad_norm": 1.1018284559249878, "learning_rate": 9.589656426194652e-06, "loss": 0.21944312751293182, "rewards/accuracies": 0.8671875, "rewards/chosen": 25.087017059326172, "rewards/margins": 21.72235107421875, "rewards/rejected": 3.369859218597412, "step": 1005 }, { "epoch": 0.520703933747412, "grad_norm": 0.9002290964126587, "learning_rate": 9.588523828812631e-06, "loss": 0.18412330746650696, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.94888687133789, "rewards/margins": 20.334571838378906, "rewards/rejected": 2.6129531860351562, "step": 1006 }, { "epoch": 0.5212215320910973, "grad_norm": 1.5041085481643677, "learning_rate": 9.587389737603587e-06, "loss": 0.215728759765625, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.414703369140625, "rewards/margins": 22.05230712890625, "rewards/rejected": 4.365886688232422, "step": 1007 }, { "epoch": 0.5217391304347826, "grad_norm": 1.1494468450546265, "learning_rate": 9.586254152936736e-06, "loss": 0.23017941415309906, "rewards/accuracies": 0.890625, "rewards/chosen": 23.790958404541016, "rewards/margins": 20.33751678466797, "rewards/rejected": 3.4588871002197266, "step": 1008 }, { "epoch": 0.522256728778468, "grad_norm": 1.3858795166015625, "learning_rate": 9.585117075181771e-06, "loss": 0.19649402797222137, "rewards/accuracies": 0.90625, "rewards/chosen": 25.56825828552246, "rewards/margins": 21.61931610107422, "rewards/rejected": 3.950397491455078, "step": 1009 }, { "epoch": 0.5227743271221532, "grad_norm": 1.0141627788543701, "learning_rate": 9.583978504708881e-06, "loss": 0.19195005297660828, "rewards/accuracies": 0.90625, "rewards/chosen": 24.218585968017578, "rewards/margins": 20.55096435546875, "rewards/rejected": 3.6713905334472656, "step": 1010 }, { "epoch": 0.5232919254658385, "grad_norm": 1.1340292692184448, "learning_rate": 9.582838441888732e-06, "loss": 0.24778889119625092, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.344066619873047, "rewards/margins": 19.186622619628906, "rewards/rejected": 3.1577658653259277, "step": 1011 }, { "epoch": 0.5238095238095238, "grad_norm": 0.7676370739936829, "learning_rate": 9.58169688709248e-06, "loss": 0.1560109257698059, "rewards/accuracies": 0.9375, "rewards/chosen": 23.481769561767578, "rewards/margins": 20.549598693847656, "rewards/rejected": 2.9352798461914062, "step": 1012 }, { "epoch": 0.5243271221532091, "grad_norm": 0.8497444987297058, "learning_rate": 9.580553840691766e-06, "loss": 0.14434891939163208, "rewards/accuracies": 0.921875, "rewards/chosen": 23.913915634155273, "rewards/margins": 20.999618530273438, "rewards/rejected": 2.9100403785705566, "step": 1013 }, { "epoch": 0.5248447204968945, "grad_norm": 1.0375654697418213, "learning_rate": 9.57940930305872e-06, "loss": 0.13166433572769165, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.675220489501953, "rewards/margins": 23.76251220703125, "rewards/rejected": 3.914276123046875, "step": 1014 }, { "epoch": 0.5253623188405797, "grad_norm": 0.7837033271789551, "learning_rate": 9.578263274565953e-06, "loss": 0.16822552680969238, "rewards/accuracies": 0.875, "rewards/chosen": 24.706863403320312, "rewards/margins": 21.44255828857422, "rewards/rejected": 3.2681503295898438, "step": 1015 }, { "epoch": 0.525879917184265, "grad_norm": 0.871133029460907, "learning_rate": 9.577115755586562e-06, "loss": 0.16871914267539978, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.233989715576172, "rewards/margins": 20.943126678466797, "rewards/rejected": 4.291678428649902, "step": 1016 }, { "epoch": 0.5263975155279503, "grad_norm": 0.8455839157104492, "learning_rate": 9.575966746494129e-06, "loss": 0.20871898531913757, "rewards/accuracies": 0.90625, "rewards/chosen": 18.278900146484375, "rewards/margins": 15.643569946289062, "rewards/rejected": 2.6342697143554688, "step": 1017 }, { "epoch": 0.5269151138716356, "grad_norm": 0.8064062595367432, "learning_rate": 9.574816247662724e-06, "loss": 0.138287752866745, "rewards/accuracies": 0.921875, "rewards/chosen": 20.698326110839844, "rewards/margins": 17.512535095214844, "rewards/rejected": 3.1857986450195312, "step": 1018 }, { "epoch": 0.527432712215321, "grad_norm": 1.3482372760772705, "learning_rate": 9.573664259466901e-06, "loss": 0.20790864527225494, "rewards/accuracies": 0.921875, "rewards/chosen": 16.815710067749023, "rewards/margins": 14.372940063476562, "rewards/rejected": 2.4422008991241455, "step": 1019 }, { "epoch": 0.5279503105590062, "grad_norm": 1.0485543012619019, "learning_rate": 9.572510782281696e-06, "loss": 0.17142602801322937, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.04730224609375, "rewards/margins": 15.309783935546875, "rewards/rejected": 3.74102783203125, "step": 1020 }, { "epoch": 0.5284679089026915, "grad_norm": 1.6550976037979126, "learning_rate": 9.571355816482635e-06, "loss": 0.19077126681804657, "rewards/accuracies": 0.890625, "rewards/chosen": 16.444419860839844, "rewards/margins": 13.965744018554688, "rewards/rejected": 2.474858045578003, "step": 1021 }, { "epoch": 0.5289855072463768, "grad_norm": 0.8719319701194763, "learning_rate": 9.570199362445721e-06, "loss": 0.2116825133562088, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.968804359436035, "rewards/margins": 12.743507385253906, "rewards/rejected": 2.2215232849121094, "step": 1022 }, { "epoch": 0.5295031055900621, "grad_norm": 1.5311130285263062, "learning_rate": 9.569041420547449e-06, "loss": 0.18468442559242249, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.900590896606445, "rewards/margins": 12.67034912109375, "rewards/rejected": 2.234590530395508, "step": 1023 }, { "epoch": 0.5300207039337475, "grad_norm": 1.4122745990753174, "learning_rate": 9.567881991164794e-06, "loss": 0.21379521489143372, "rewards/accuracies": 0.8984375, "rewards/chosen": 16.53953742980957, "rewards/margins": 13.843765258789062, "rewards/rejected": 2.6941795349121094, "step": 1024 }, { "epoch": 0.5305383022774327, "grad_norm": 1.0804017782211304, "learning_rate": 9.566721074675216e-06, "loss": 0.1686239391565323, "rewards/accuracies": 0.90625, "rewards/chosen": 15.093730926513672, "rewards/margins": 13.302978515625, "rewards/rejected": 1.791299819946289, "step": 1025 }, { "epoch": 0.531055900621118, "grad_norm": 1.1537365913391113, "learning_rate": 9.565558671456663e-06, "loss": 0.16587285697460175, "rewards/accuracies": 0.921875, "rewards/chosen": 13.27450942993164, "rewards/margins": 11.339176177978516, "rewards/rejected": 1.9345273971557617, "step": 1026 }, { "epoch": 0.5315734989648033, "grad_norm": 0.8104296326637268, "learning_rate": 9.56439478188756e-06, "loss": 0.17859257757663727, "rewards/accuracies": 0.921875, "rewards/chosen": 15.18771743774414, "rewards/margins": 12.831649780273438, "rewards/rejected": 2.3564224243164062, "step": 1027 }, { "epoch": 0.5320910973084886, "grad_norm": 1.517494559288025, "learning_rate": 9.563229406346822e-06, "loss": 0.2062055915594101, "rewards/accuracies": 0.90625, "rewards/chosen": 12.480184555053711, "rewards/margins": 10.645469665527344, "rewards/rejected": 1.8374388217926025, "step": 1028 }, { "epoch": 0.532608695652174, "grad_norm": 0.7360749840736389, "learning_rate": 9.562062545213843e-06, "loss": 0.13671833276748657, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.162812232971191, "rewards/margins": 12.802764892578125, "rewards/rejected": 1.3630752563476562, "step": 1029 }, { "epoch": 0.5331262939958592, "grad_norm": 2.00601863861084, "learning_rate": 9.560894198868508e-06, "loss": 0.2087382823228836, "rewards/accuracies": 0.921875, "rewards/chosen": 13.64607048034668, "rewards/margins": 11.47894287109375, "rewards/rejected": 2.1665382385253906, "step": 1030 }, { "epoch": 0.5336438923395446, "grad_norm": 1.184588074684143, "learning_rate": 9.559724367691175e-06, "loss": 0.1660866141319275, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.454607963562012, "rewards/margins": 12.783866882324219, "rewards/rejected": 1.670938491821289, "step": 1031 }, { "epoch": 0.5341614906832298, "grad_norm": 2.1375041007995605, "learning_rate": 9.558553052062694e-06, "loss": 0.16973896324634552, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.258661270141602, "rewards/margins": 12.814064025878906, "rewards/rejected": 1.4392552375793457, "step": 1032 }, { "epoch": 0.5346790890269151, "grad_norm": 1.7567634582519531, "learning_rate": 9.557380252364395e-06, "loss": 0.20166964828968048, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.816169738769531, "rewards/margins": 11.595062255859375, "rewards/rejected": 1.221053123474121, "step": 1033 }, { "epoch": 0.5351966873706004, "grad_norm": 0.9262790083885193, "learning_rate": 9.556205968978089e-06, "loss": 0.16641193628311157, "rewards/accuracies": 0.890625, "rewards/chosen": 12.185199737548828, "rewards/margins": 11.347190856933594, "rewards/rejected": 0.8337393999099731, "step": 1034 }, { "epoch": 0.5357142857142857, "grad_norm": 1.463330626487732, "learning_rate": 9.555030202286075e-06, "loss": 0.18767346441745758, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.850503921508789, "rewards/margins": 12.63116455078125, "rewards/rejected": 1.219822883605957, "step": 1035 }, { "epoch": 0.5362318840579711, "grad_norm": 1.8835541009902954, "learning_rate": 9.553852952671133e-06, "loss": 0.18856345117092133, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.135501861572266, "rewards/margins": 12.783882141113281, "rewards/rejected": 1.347121000289917, "step": 1036 }, { "epoch": 0.5367494824016563, "grad_norm": 2.0737195014953613, "learning_rate": 9.552674220516522e-06, "loss": 0.26028677821159363, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.024474143981934, "rewards/margins": 12.910293579101562, "rewards/rejected": 1.107717514038086, "step": 1037 }, { "epoch": 0.5372670807453416, "grad_norm": 1.7120792865753174, "learning_rate": 9.551494006205989e-06, "loss": 0.1931799352169037, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.609277725219727, "rewards/margins": 14.146743774414062, "rewards/rejected": 1.464842677116394, "step": 1038 }, { "epoch": 0.5377846790890269, "grad_norm": 1.262455701828003, "learning_rate": 9.55031231012376e-06, "loss": 0.2579077482223511, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.129861831665039, "rewards/margins": 13.152847290039062, "rewards/rejected": 0.9760513305664062, "step": 1039 }, { "epoch": 0.5383022774327122, "grad_norm": 1.3780388832092285, "learning_rate": 9.549129132654547e-06, "loss": 0.23276865482330322, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.396340370178223, "rewards/margins": 12.828872680664062, "rewards/rejected": 0.5681228637695312, "step": 1040 }, { "epoch": 0.5388198757763976, "grad_norm": 1.428139567375183, "learning_rate": 9.54794447418354e-06, "loss": 0.2012377381324768, "rewards/accuracies": 0.8828125, "rewards/chosen": 13.496007919311523, "rewards/margins": 12.68756103515625, "rewards/rejected": 0.8086965084075928, "step": 1041 }, { "epoch": 0.5393374741200828, "grad_norm": 0.716242790222168, "learning_rate": 9.546758335096413e-06, "loss": 0.18408194184303284, "rewards/accuracies": 0.890625, "rewards/chosen": 13.81317138671875, "rewards/margins": 13.079605102539062, "rewards/rejected": 0.7331957817077637, "step": 1042 }, { "epoch": 0.5398550724637681, "grad_norm": 0.8139130473136902, "learning_rate": 9.545570715779324e-06, "loss": 0.1966930329799652, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.2269926071167, "rewards/margins": 13.145683288574219, "rewards/rejected": 1.080413818359375, "step": 1043 }, { "epoch": 0.5403726708074534, "grad_norm": 1.0291306972503662, "learning_rate": 9.544381616618907e-06, "loss": 0.24137413501739502, "rewards/accuracies": 0.84375, "rewards/chosen": 11.554701805114746, "rewards/margins": 11.002254486083984, "rewards/rejected": 0.5544652938842773, "step": 1044 }, { "epoch": 0.5408902691511387, "grad_norm": 0.8144820928573608, "learning_rate": 9.543191038002286e-06, "loss": 0.24332758784294128, "rewards/accuracies": 0.875, "rewards/chosen": 12.60621452331543, "rewards/margins": 11.704093933105469, "rewards/rejected": 0.8992795944213867, "step": 1045 }, { "epoch": 0.5414078674948241, "grad_norm": 1.5391104221343994, "learning_rate": 9.541998980317062e-06, "loss": 0.14952652156352997, "rewards/accuracies": 0.90625, "rewards/chosen": 16.895652770996094, "rewards/margins": 15.127723693847656, "rewards/rejected": 1.7660760879516602, "step": 1046 }, { "epoch": 0.5419254658385093, "grad_norm": 0.7591637969017029, "learning_rate": 9.540805443951315e-06, "loss": 0.1723807454109192, "rewards/accuracies": 0.875, "rewards/chosen": 17.926589965820312, "rewards/margins": 16.155960083007812, "rewards/rejected": 1.7696857452392578, "step": 1047 }, { "epoch": 0.5424430641821946, "grad_norm": 0.8164039850234985, "learning_rate": 9.539610429293613e-06, "loss": 0.22788864374160767, "rewards/accuracies": 0.84375, "rewards/chosen": 16.84758186340332, "rewards/margins": 15.233306884765625, "rewards/rejected": 1.6168994903564453, "step": 1048 }, { "epoch": 0.5429606625258799, "grad_norm": 1.1728311777114868, "learning_rate": 9.538413936733e-06, "loss": 0.18073295056819916, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.289581298828125, "rewards/margins": 18.970321655273438, "rewards/rejected": 2.318298816680908, "step": 1049 }, { "epoch": 0.5434782608695652, "grad_norm": 0.6369011402130127, "learning_rate": 9.537215966659e-06, "loss": 0.17677590250968933, "rewards/accuracies": 0.9296875, "rewards/chosen": 17.786317825317383, "rewards/margins": 15.70880126953125, "rewards/rejected": 2.0769786834716797, "step": 1050 }, { "epoch": 0.5439958592132506, "grad_norm": 1.0393826961517334, "learning_rate": 9.536016519461625e-06, "loss": 0.18550002574920654, "rewards/accuracies": 0.890625, "rewards/chosen": 18.102880477905273, "rewards/margins": 15.42230224609375, "rewards/rejected": 2.6795787811279297, "step": 1051 }, { "epoch": 0.5445134575569358, "grad_norm": 0.7893950939178467, "learning_rate": 9.534815595531364e-06, "loss": 0.18900330364704132, "rewards/accuracies": 0.8671875, "rewards/chosen": 19.0313777923584, "rewards/margins": 16.17131805419922, "rewards/rejected": 2.8613433837890625, "step": 1052 }, { "epoch": 0.5450310559006211, "grad_norm": 0.9822365641593933, "learning_rate": 9.533613195259184e-06, "loss": 0.17226022481918335, "rewards/accuracies": 0.8671875, "rewards/chosen": 20.606842041015625, "rewards/margins": 18.408248901367188, "rewards/rejected": 2.1960763931274414, "step": 1053 }, { "epoch": 0.5455486542443064, "grad_norm": 1.278863787651062, "learning_rate": 9.532409319036533e-06, "loss": 0.2035791277885437, "rewards/accuracies": 0.875, "rewards/chosen": 20.316213607788086, "rewards/margins": 17.702510833740234, "rewards/rejected": 2.6148300170898438, "step": 1054 }, { "epoch": 0.5460662525879917, "grad_norm": 1.3668186664581299, "learning_rate": 9.531203967255347e-06, "loss": 0.2265324592590332, "rewards/accuracies": 0.890625, "rewards/chosen": 20.70029640197754, "rewards/margins": 18.008094787597656, "rewards/rejected": 2.6912314891815186, "step": 1055 }, { "epoch": 0.546583850931677, "grad_norm": 2.61751389503479, "learning_rate": 9.529997140308033e-06, "loss": 0.1781187355518341, "rewards/accuracies": 0.921875, "rewards/chosen": 23.928661346435547, "rewards/margins": 20.647415161132812, "rewards/rejected": 3.2793197631835938, "step": 1056 }, { "epoch": 0.5471014492753623, "grad_norm": 1.4854580163955688, "learning_rate": 9.528788838587483e-06, "loss": 0.23325125873088837, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.661270141601562, "rewards/margins": 18.838821411132812, "rewards/rejected": 3.828481674194336, "step": 1057 }, { "epoch": 0.5476190476190477, "grad_norm": 1.0076662302017212, "learning_rate": 9.527579062487072e-06, "loss": 0.17694690823554993, "rewards/accuracies": 0.921875, "rewards/chosen": 23.109939575195312, "rewards/margins": 19.73028564453125, "rewards/rejected": 3.37744140625, "step": 1058 }, { "epoch": 0.5481366459627329, "grad_norm": 1.6192500591278076, "learning_rate": 9.526367812400645e-06, "loss": 0.2660568058490753, "rewards/accuracies": 0.8671875, "rewards/chosen": 19.361181259155273, "rewards/margins": 16.95703125, "rewards/rejected": 2.3969311714172363, "step": 1059 }, { "epoch": 0.5486542443064182, "grad_norm": 1.811784029006958, "learning_rate": 9.525155088722537e-06, "loss": 0.23734481632709503, "rewards/accuracies": 0.875, "rewards/chosen": 22.18756866455078, "rewards/margins": 19.691326141357422, "rewards/rejected": 2.4979419708251953, "step": 1060 }, { "epoch": 0.5491718426501035, "grad_norm": 1.363356351852417, "learning_rate": 9.523940891847558e-06, "loss": 0.23642289638519287, "rewards/accuracies": 0.890625, "rewards/chosen": 19.651437759399414, "rewards/margins": 17.167938232421875, "rewards/rejected": 2.488679885864258, "step": 1061 }, { "epoch": 0.5496894409937888, "grad_norm": 0.8611990809440613, "learning_rate": 9.522725222171e-06, "loss": 0.17281585931777954, "rewards/accuracies": 0.890625, "rewards/chosen": 21.493118286132812, "rewards/margins": 18.981590270996094, "rewards/rejected": 2.5160751342773438, "step": 1062 }, { "epoch": 0.5502070393374742, "grad_norm": 1.3499091863632202, "learning_rate": 9.521508080088631e-06, "loss": 0.1907254159450531, "rewards/accuracies": 0.9453125, "rewards/chosen": 19.295804977416992, "rewards/margins": 17.416519165039062, "rewards/rejected": 1.873819351196289, "step": 1063 }, { "epoch": 0.5507246376811594, "grad_norm": 1.1157424449920654, "learning_rate": 9.520289465996701e-06, "loss": 0.18730661273002625, "rewards/accuracies": 0.921875, "rewards/chosen": 20.46238136291504, "rewards/margins": 17.902114868164062, "rewards/rejected": 2.5590269565582275, "step": 1064 }, { "epoch": 0.5512422360248447, "grad_norm": 1.3759225606918335, "learning_rate": 9.51906938029194e-06, "loss": 0.17207226157188416, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.906768798828125, "rewards/margins": 17.63165283203125, "rewards/rejected": 2.271352767944336, "step": 1065 }, { "epoch": 0.55175983436853, "grad_norm": 0.9625940918922424, "learning_rate": 9.517847823371554e-06, "loss": 0.1813361793756485, "rewards/accuracies": 0.90625, "rewards/chosen": 18.70419692993164, "rewards/margins": 17.347305297851562, "rewards/rejected": 1.3555259704589844, "step": 1066 }, { "epoch": 0.5522774327122153, "grad_norm": 0.7058466672897339, "learning_rate": 9.51662479563323e-06, "loss": 0.20227324962615967, "rewards/accuracies": 0.890625, "rewards/chosen": 18.035457611083984, "rewards/margins": 16.694236755371094, "rewards/rejected": 1.3367271423339844, "step": 1067 }, { "epoch": 0.5527950310559007, "grad_norm": 2.3420767784118652, "learning_rate": 9.515400297475134e-06, "loss": 0.23403187096118927, "rewards/accuracies": 0.90625, "rewards/chosen": 19.42890167236328, "rewards/margins": 18.33592987060547, "rewards/rejected": 1.0894775390625, "step": 1068 }, { "epoch": 0.5533126293995859, "grad_norm": 1.6719664335250854, "learning_rate": 9.51417432929591e-06, "loss": 0.174893319606781, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.43799591064453, "rewards/margins": 18.864267349243164, "rewards/rejected": 1.5709528923034668, "step": 1069 }, { "epoch": 0.5538302277432712, "grad_norm": 0.6283978819847107, "learning_rate": 9.51294689149468e-06, "loss": 0.1555269956588745, "rewards/accuracies": 0.921875, "rewards/chosen": 23.33591079711914, "rewards/margins": 21.231781005859375, "rewards/rejected": 2.1030712127685547, "step": 1070 }, { "epoch": 0.5543478260869565, "grad_norm": 1.6179972887039185, "learning_rate": 9.511717984471047e-06, "loss": 0.25125446915626526, "rewards/accuracies": 0.8671875, "rewards/chosen": 22.972557067871094, "rewards/margins": 20.84191131591797, "rewards/rejected": 2.1326065063476562, "step": 1071 }, { "epoch": 0.5548654244306418, "grad_norm": 1.5454970598220825, "learning_rate": 9.510487608625088e-06, "loss": 0.24505247175693512, "rewards/accuracies": 0.8671875, "rewards/chosen": 19.86808204650879, "rewards/margins": 18.312484741210938, "rewards/rejected": 1.5546209812164307, "step": 1072 }, { "epoch": 0.5553830227743272, "grad_norm": 1.5287762880325317, "learning_rate": 9.509255764357364e-06, "loss": 0.24998974800109863, "rewards/accuracies": 0.8671875, "rewards/chosen": 23.267627716064453, "rewards/margins": 20.572616577148438, "rewards/rejected": 2.6919751167297363, "step": 1073 }, { "epoch": 0.5559006211180124, "grad_norm": 1.0983130931854248, "learning_rate": 9.508022452068909e-06, "loss": 0.15413516759872437, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.296987533569336, "rewards/margins": 21.997512817382812, "rewards/rejected": 2.2988662719726562, "step": 1074 }, { "epoch": 0.5564182194616977, "grad_norm": 0.9306723475456238, "learning_rate": 9.506787672161235e-06, "loss": 0.19774962961673737, "rewards/accuracies": 0.890625, "rewards/chosen": 27.043481826782227, "rewards/margins": 23.009811401367188, "rewards/rejected": 4.0252227783203125, "step": 1075 }, { "epoch": 0.556935817805383, "grad_norm": 1.2997864484786987, "learning_rate": 9.505551425036334e-06, "loss": 0.25975245237350464, "rewards/accuracies": 0.875, "rewards/chosen": 24.34911346435547, "rewards/margins": 21.865463256835938, "rewards/rejected": 2.487396240234375, "step": 1076 }, { "epoch": 0.5574534161490683, "grad_norm": 1.1589936017990112, "learning_rate": 9.50431371109668e-06, "loss": 0.2177754044532776, "rewards/accuracies": 0.875, "rewards/chosen": 24.117534637451172, "rewards/margins": 21.446441650390625, "rewards/rejected": 2.6744470596313477, "step": 1077 }, { "epoch": 0.5579710144927537, "grad_norm": 1.6692172288894653, "learning_rate": 9.503074530745213e-06, "loss": 0.24332502484321594, "rewards/accuracies": 0.875, "rewards/chosen": 23.07440948486328, "rewards/margins": 20.922073364257812, "rewards/rejected": 2.1497669219970703, "step": 1078 }, { "epoch": 0.5584886128364389, "grad_norm": 0.6490630507469177, "learning_rate": 9.501833884385362e-06, "loss": 0.18354731798171997, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.401885986328125, "rewards/margins": 20.658668518066406, "rewards/rejected": 2.7469406127929688, "step": 1079 }, { "epoch": 0.5590062111801242, "grad_norm": 0.9191430807113647, "learning_rate": 9.500591772421026e-06, "loss": 0.271847665309906, "rewards/accuracies": 0.859375, "rewards/chosen": 18.262819290161133, "rewards/margins": 16.34012222290039, "rewards/rejected": 1.9278640747070312, "step": 1080 }, { "epoch": 0.5595238095238095, "grad_norm": 0.9357141852378845, "learning_rate": 9.499348195256583e-06, "loss": 0.21963810920715332, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.71210479736328, "rewards/margins": 20.590835571289062, "rewards/rejected": 2.1202354431152344, "step": 1081 }, { "epoch": 0.5600414078674948, "grad_norm": 1.3613892793655396, "learning_rate": 9.498103153296891e-06, "loss": 0.2188246101140976, "rewards/accuracies": 0.875, "rewards/chosen": 20.773128509521484, "rewards/margins": 19.389862060546875, "rewards/rejected": 1.3876609802246094, "step": 1082 }, { "epoch": 0.5605590062111802, "grad_norm": 0.6936904788017273, "learning_rate": 9.496856646947282e-06, "loss": 0.23407654464244843, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.175872802734375, "rewards/margins": 17.093421936035156, "rewards/rejected": 1.0851964950561523, "step": 1083 }, { "epoch": 0.5610766045548654, "grad_norm": 1.1575621366500854, "learning_rate": 9.495608676613562e-06, "loss": 0.23604047298431396, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.362865447998047, "rewards/margins": 17.955543518066406, "rewards/rejected": 1.4108753204345703, "step": 1084 }, { "epoch": 0.5615942028985508, "grad_norm": 1.79826819896698, "learning_rate": 9.494359242702019e-06, "loss": 0.21322976052761078, "rewards/accuracies": 0.875, "rewards/chosen": 22.179630279541016, "rewards/margins": 19.723617553710938, "rewards/rejected": 2.4599075317382812, "step": 1085 }, { "epoch": 0.562111801242236, "grad_norm": 1.5440943241119385, "learning_rate": 9.493108345619416e-06, "loss": 0.1547357439994812, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.39641761779785, "rewards/margins": 22.108505249023438, "rewards/rejected": 2.2867431640625, "step": 1086 }, { "epoch": 0.5626293995859213, "grad_norm": 0.8181976079940796, "learning_rate": 9.49185598577299e-06, "loss": 0.19026368856430054, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.836883544921875, "rewards/margins": 19.749534606933594, "rewards/rejected": 2.097801446914673, "step": 1087 }, { "epoch": 0.5631469979296067, "grad_norm": 1.4732437133789062, "learning_rate": 9.490602163570457e-06, "loss": 0.17911607027053833, "rewards/accuracies": 0.875, "rewards/chosen": 24.333328247070312, "rewards/margins": 21.47943115234375, "rewards/rejected": 2.855144500732422, "step": 1088 }, { "epoch": 0.5636645962732919, "grad_norm": 0.812047004699707, "learning_rate": 9.489346879420006e-06, "loss": 0.214286208152771, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.23663902282715, "rewards/margins": 19.789989471435547, "rewards/rejected": 2.4523239135742188, "step": 1089 }, { "epoch": 0.5641821946169773, "grad_norm": 1.5689811706542969, "learning_rate": 9.488090133730306e-06, "loss": 0.17990738153457642, "rewards/accuracies": 0.921875, "rewards/chosen": 27.488983154296875, "rewards/margins": 23.93506622314453, "rewards/rejected": 3.5511698722839355, "step": 1090 }, { "epoch": 0.5646997929606625, "grad_norm": 0.8119063377380371, "learning_rate": 9.486831926910497e-06, "loss": 0.18077492713928223, "rewards/accuracies": 0.890625, "rewards/chosen": 24.728805541992188, "rewards/margins": 22.232986450195312, "rewards/rejected": 2.5098648071289062, "step": 1091 }, { "epoch": 0.5652173913043478, "grad_norm": 1.1845674514770508, "learning_rate": 9.485572259370202e-06, "loss": 0.2068442702293396, "rewards/accuracies": 0.921875, "rewards/chosen": 30.419601440429688, "rewards/margins": 26.22515106201172, "rewards/rejected": 4.197284698486328, "step": 1092 }, { "epoch": 0.5657349896480331, "grad_norm": 2.70651912689209, "learning_rate": 9.48431113151951e-06, "loss": 0.21414059400558472, "rewards/accuracies": 0.875, "rewards/chosen": 28.49837303161621, "rewards/margins": 24.787841796875, "rewards/rejected": 3.714890480041504, "step": 1093 }, { "epoch": 0.5662525879917184, "grad_norm": 1.6814411878585815, "learning_rate": 9.483048543768993e-06, "loss": 0.19974273443222046, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.307655334472656, "rewards/margins": 23.444183349609375, "rewards/rejected": 3.8658676147460938, "step": 1094 }, { "epoch": 0.5667701863354038, "grad_norm": 1.6046689748764038, "learning_rate": 9.481784496529693e-06, "loss": 0.19097906351089478, "rewards/accuracies": 0.90625, "rewards/chosen": 29.674030303955078, "rewards/margins": 26.075668334960938, "rewards/rejected": 3.600147247314453, "step": 1095 }, { "epoch": 0.567287784679089, "grad_norm": 1.774436116218567, "learning_rate": 9.480518990213133e-06, "loss": 0.209476500749588, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.26837921142578, "rewards/margins": 25.561050415039062, "rewards/rejected": 3.7032766342163086, "step": 1096 }, { "epoch": 0.5678053830227743, "grad_norm": 2.1648950576782227, "learning_rate": 9.479252025231305e-06, "loss": 0.17493797838687897, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.366641998291016, "rewards/margins": 27.886871337890625, "rewards/rejected": 4.481842041015625, "step": 1097 }, { "epoch": 0.5683229813664596, "grad_norm": 0.9007256031036377, "learning_rate": 9.47798360199668e-06, "loss": 0.13983464241027832, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.018516540527344, "rewards/margins": 29.836746215820312, "rewards/rejected": 5.186481475830078, "step": 1098 }, { "epoch": 0.5688405797101449, "grad_norm": 5.985708713531494, "learning_rate": 9.476713720922202e-06, "loss": 0.22832196950912476, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.89671325683594, "rewards/margins": 28.03472900390625, "rewards/rejected": 5.872625350952148, "step": 1099 }, { "epoch": 0.5693581780538303, "grad_norm": 2.4276020526885986, "learning_rate": 9.475442382421289e-06, "loss": 0.18647676706314087, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.27457618713379, "rewards/margins": 27.300003051757812, "rewards/rejected": 3.9796972274780273, "step": 1100 }, { "epoch": 0.5698757763975155, "grad_norm": 2.846463441848755, "learning_rate": 9.474169586907837e-06, "loss": 0.1885489821434021, "rewards/accuracies": 0.890625, "rewards/chosen": 25.603805541992188, "rewards/margins": 22.454666137695312, "rewards/rejected": 3.1448001861572266, "step": 1101 }, { "epoch": 0.5703933747412008, "grad_norm": 2.6890571117401123, "learning_rate": 9.47289533479621e-06, "loss": 0.2216123342514038, "rewards/accuracies": 0.90625, "rewards/chosen": 27.327919006347656, "rewards/margins": 23.003875732421875, "rewards/rejected": 4.329944610595703, "step": 1102 }, { "epoch": 0.5709109730848861, "grad_norm": 2.263275623321533, "learning_rate": 9.471619626501255e-06, "loss": 0.1784677505493164, "rewards/accuracies": 0.890625, "rewards/chosen": 21.397167205810547, "rewards/margins": 18.144927978515625, "rewards/rejected": 3.254119396209717, "step": 1103 }, { "epoch": 0.5714285714285714, "grad_norm": 1.7947797775268555, "learning_rate": 9.470342462438283e-06, "loss": 0.251262366771698, "rewards/accuracies": 0.875, "rewards/chosen": 18.210872650146484, "rewards/margins": 15.211395263671875, "rewards/rejected": 2.9986534118652344, "step": 1104 }, { "epoch": 0.5719461697722568, "grad_norm": 2.2599658966064453, "learning_rate": 9.469063843023086e-06, "loss": 0.190598726272583, "rewards/accuracies": 0.90625, "rewards/chosen": 18.55632781982422, "rewards/margins": 15.37652587890625, "rewards/rejected": 3.1781139373779297, "step": 1105 }, { "epoch": 0.572463768115942, "grad_norm": 1.9544732570648193, "learning_rate": 9.467783768671928e-06, "loss": 0.15306085348129272, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.245378494262695, "rewards/margins": 13.502174377441406, "rewards/rejected": 1.7431044578552246, "step": 1106 }, { "epoch": 0.5729813664596274, "grad_norm": 1.4396910667419434, "learning_rate": 9.466502239801545e-06, "loss": 0.15763896703720093, "rewards/accuracies": 0.90625, "rewards/chosen": 13.165611267089844, "rewards/margins": 11.097042083740234, "rewards/rejected": 2.0712356567382812, "step": 1107 }, { "epoch": 0.5734989648033126, "grad_norm": 2.1500678062438965, "learning_rate": 9.46521925682915e-06, "loss": 0.2409793883562088, "rewards/accuracies": 0.890625, "rewards/chosen": 11.115104675292969, "rewards/margins": 10.003646850585938, "rewards/rejected": 1.113142967224121, "step": 1108 }, { "epoch": 0.5740165631469979, "grad_norm": 1.4921936988830566, "learning_rate": 9.463934820172429e-06, "loss": 0.19427089393138885, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.465940475463867, "rewards/margins": 9.311393737792969, "rewards/rejected": 1.154940128326416, "step": 1109 }, { "epoch": 0.5745341614906833, "grad_norm": 1.7076348066329956, "learning_rate": 9.462648930249534e-06, "loss": 0.24690663814544678, "rewards/accuracies": 0.875, "rewards/chosen": 9.144275665283203, "rewards/margins": 7.9877166748046875, "rewards/rejected": 1.1566791534423828, "step": 1110 }, { "epoch": 0.5750517598343685, "grad_norm": 1.153093934059143, "learning_rate": 9.461361587479102e-06, "loss": 0.205497145652771, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.91655158996582, "rewards/margins": 7.674980163574219, "rewards/rejected": 0.23921489715576172, "step": 1111 }, { "epoch": 0.5755693581780539, "grad_norm": 1.6643601655960083, "learning_rate": 9.46007279228023e-06, "loss": 0.27580830454826355, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.292036056518555, "rewards/margins": 6.394294738769531, "rewards/rejected": 0.8996238708496094, "step": 1112 }, { "epoch": 0.5760869565217391, "grad_norm": 1.506687879562378, "learning_rate": 9.4587825450725e-06, "loss": 0.16378092765808105, "rewards/accuracies": 0.90625, "rewards/chosen": 8.238407135009766, "rewards/margins": 6.960662841796875, "rewards/rejected": 1.2772765159606934, "step": 1113 }, { "epoch": 0.5766045548654244, "grad_norm": 1.9508212804794312, "learning_rate": 9.45749084627596e-06, "loss": 0.21551048755645752, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.662153244018555, "rewards/margins": 6.6468353271484375, "rewards/rejected": 1.0154228210449219, "step": 1114 }, { "epoch": 0.5771221532091098, "grad_norm": 1.2179105281829834, "learning_rate": 9.456197696311132e-06, "loss": 0.2060977816581726, "rewards/accuracies": 0.875, "rewards/chosen": 7.325770378112793, "rewards/margins": 6.655235290527344, "rewards/rejected": 0.66827392578125, "step": 1115 }, { "epoch": 0.577639751552795, "grad_norm": 0.8334563970565796, "learning_rate": 9.454903095599008e-06, "loss": 0.21662777662277222, "rewards/accuracies": 0.9453125, "rewards/chosen": 7.330172538757324, "rewards/margins": 5.8072967529296875, "rewards/rejected": 1.5226325988769531, "step": 1116 }, { "epoch": 0.5781573498964804, "grad_norm": 0.9033461213111877, "learning_rate": 9.453607044561056e-06, "loss": 0.22189201414585114, "rewards/accuracies": 0.8984375, "rewards/chosen": 7.495290756225586, "rewards/margins": 6.4341888427734375, "rewards/rejected": 1.0584750175476074, "step": 1117 }, { "epoch": 0.5786749482401656, "grad_norm": 1.2258509397506714, "learning_rate": 9.452309543619215e-06, "loss": 0.21562562882900238, "rewards/accuracies": 0.8671875, "rewards/chosen": 8.48095989227295, "rewards/margins": 7.1507568359375, "rewards/rejected": 1.3308048248291016, "step": 1118 }, { "epoch": 0.5791925465838509, "grad_norm": 1.2179851531982422, "learning_rate": 9.451010593195898e-06, "loss": 0.18197396397590637, "rewards/accuracies": 0.9375, "rewards/chosen": 10.29671859741211, "rewards/margins": 8.950370788574219, "rewards/rejected": 1.346242904663086, "step": 1119 }, { "epoch": 0.5797101449275363, "grad_norm": 0.9083751440048218, "learning_rate": 9.449710193713983e-06, "loss": 0.22102053463459015, "rewards/accuracies": 0.8515625, "rewards/chosen": 11.412010192871094, "rewards/margins": 10.142379760742188, "rewards/rejected": 1.2718276977539062, "step": 1120 }, { "epoch": 0.5802277432712215, "grad_norm": 0.9707618951797485, "learning_rate": 9.44840834559683e-06, "loss": 0.20354634523391724, "rewards/accuracies": 0.890625, "rewards/chosen": 13.31680679321289, "rewards/margins": 11.9876708984375, "rewards/rejected": 1.3281021118164062, "step": 1121 }, { "epoch": 0.5807453416149069, "grad_norm": 0.5020038485527039, "learning_rate": 9.44710504926826e-06, "loss": 0.18202504515647888, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.012689590454102, "rewards/margins": 12.498489379882812, "rewards/rejected": 1.5153112411499023, "step": 1122 }, { "epoch": 0.5812629399585921, "grad_norm": 1.352580189704895, "learning_rate": 9.445800305152571e-06, "loss": 0.22656720876693726, "rewards/accuracies": 0.90625, "rewards/chosen": 15.329170227050781, "rewards/margins": 13.390310287475586, "rewards/rejected": 1.9363373517990112, "step": 1123 }, { "epoch": 0.5817805383022774, "grad_norm": 1.0994857549667358, "learning_rate": 9.444494113674535e-06, "loss": 0.22658005356788635, "rewards/accuracies": 0.875, "rewards/chosen": 16.72476577758789, "rewards/margins": 14.932083129882812, "rewards/rejected": 1.7958219051361084, "step": 1124 }, { "epoch": 0.5822981366459627, "grad_norm": 0.9805395603179932, "learning_rate": 9.44318647525939e-06, "loss": 0.19743621349334717, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.10726547241211, "rewards/margins": 20.28924560546875, "rewards/rejected": 2.809248924255371, "step": 1125 }, { "epoch": 0.582815734989648, "grad_norm": 1.698123812675476, "learning_rate": 9.441877390332847e-06, "loss": 0.23680591583251953, "rewards/accuracies": 0.890625, "rewards/chosen": 18.575103759765625, "rewards/margins": 16.466339111328125, "rewards/rejected": 2.1081321239471436, "step": 1126 }, { "epoch": 0.5833333333333334, "grad_norm": 0.9498289227485657, "learning_rate": 9.440566859321089e-06, "loss": 0.17780783772468567, "rewards/accuracies": 0.875, "rewards/chosen": 27.728782653808594, "rewards/margins": 24.86749267578125, "rewards/rejected": 2.8585805892944336, "step": 1127 }, { "epoch": 0.5838509316770186, "grad_norm": 1.165877103805542, "learning_rate": 9.43925488265077e-06, "loss": 0.2056291550397873, "rewards/accuracies": 0.90625, "rewards/chosen": 26.171051025390625, "rewards/margins": 23.888717651367188, "rewards/rejected": 2.287525177001953, "step": 1128 }, { "epoch": 0.5843685300207039, "grad_norm": 2.1665890216827393, "learning_rate": 9.43794146074901e-06, "loss": 0.17275741696357727, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.73052215576172, "rewards/margins": 22.867366790771484, "rewards/rejected": 1.8547134399414062, "step": 1129 }, { "epoch": 0.5848861283643892, "grad_norm": 1.0789085626602173, "learning_rate": 9.436626594043405e-06, "loss": 0.21729573607444763, "rewards/accuracies": 0.8828125, "rewards/chosen": 31.084360122680664, "rewards/margins": 27.8890380859375, "rewards/rejected": 3.1951847076416016, "step": 1130 }, { "epoch": 0.5854037267080745, "grad_norm": 1.4224506616592407, "learning_rate": 9.435310282962018e-06, "loss": 0.2216237187385559, "rewards/accuracies": 0.890625, "rewards/chosen": 25.648218154907227, "rewards/margins": 23.142986297607422, "rewards/rejected": 2.4993667602539062, "step": 1131 }, { "epoch": 0.5859213250517599, "grad_norm": 1.7972543239593506, "learning_rate": 9.433992527933387e-06, "loss": 0.26424288749694824, "rewards/accuracies": 0.8515625, "rewards/chosen": 24.07469940185547, "rewards/margins": 21.942764282226562, "rewards/rejected": 2.1340904235839844, "step": 1132 }, { "epoch": 0.5864389233954451, "grad_norm": 0.5637362003326416, "learning_rate": 9.432673329386512e-06, "loss": 0.13574957847595215, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.647113800048828, "rewards/margins": 25.73577880859375, "rewards/rejected": 2.9099979400634766, "step": 1133 }, { "epoch": 0.5869565217391305, "grad_norm": 1.41737699508667, "learning_rate": 9.431352687750873e-06, "loss": 0.22015802562236786, "rewards/accuracies": 0.90625, "rewards/chosen": 25.60013198852539, "rewards/margins": 23.20499038696289, "rewards/rejected": 2.3973426818847656, "step": 1134 }, { "epoch": 0.5874741200828157, "grad_norm": 0.7344648241996765, "learning_rate": 9.430030603456409e-06, "loss": 0.18708574771881104, "rewards/accuracies": 0.921875, "rewards/chosen": 22.987239837646484, "rewards/margins": 21.269550323486328, "rewards/rejected": 1.720654010772705, "step": 1135 }, { "epoch": 0.587991718426501, "grad_norm": 1.0991239547729492, "learning_rate": 9.428707076933537e-06, "loss": 0.1985001564025879, "rewards/accuracies": 0.8515625, "rewards/chosen": 19.95955467224121, "rewards/margins": 17.86883544921875, "rewards/rejected": 2.094379425048828, "step": 1136 }, { "epoch": 0.5885093167701864, "grad_norm": 0.7028840780258179, "learning_rate": 9.427382108613139e-06, "loss": 0.18404915928840637, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.52181625366211, "rewards/margins": 19.439300537109375, "rewards/rejected": 2.0782909393310547, "step": 1137 }, { "epoch": 0.5890269151138716, "grad_norm": 1.5922980308532715, "learning_rate": 9.426055698926571e-06, "loss": 0.2212137132883072, "rewards/accuracies": 0.890625, "rewards/chosen": 24.359651565551758, "rewards/margins": 22.614959716796875, "rewards/rejected": 1.7387523651123047, "step": 1138 }, { "epoch": 0.589544513457557, "grad_norm": 2.7323951721191406, "learning_rate": 9.42472784830565e-06, "loss": 0.1746257245540619, "rewards/accuracies": 0.921875, "rewards/chosen": 19.596609115600586, "rewards/margins": 17.579723358154297, "rewards/rejected": 2.018402099609375, "step": 1139 }, { "epoch": 0.5900621118012422, "grad_norm": 0.7010484933853149, "learning_rate": 9.423398557182674e-06, "loss": 0.17798849940299988, "rewards/accuracies": 0.890625, "rewards/chosen": 19.753868103027344, "rewards/margins": 17.6185302734375, "rewards/rejected": 2.141636371612549, "step": 1140 }, { "epoch": 0.5905797101449275, "grad_norm": 0.9180212020874023, "learning_rate": 9.422067825990398e-06, "loss": 0.21877215802669525, "rewards/accuracies": 0.875, "rewards/chosen": 15.882464408874512, "rewards/margins": 14.60748291015625, "rewards/rejected": 1.2764892578125, "step": 1141 }, { "epoch": 0.5910973084886129, "grad_norm": 1.5507748126983643, "learning_rate": 9.420735655162055e-06, "loss": 0.2049822360277176, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.641780853271484, "rewards/margins": 17.0614013671875, "rewards/rejected": 1.5772204399108887, "step": 1142 }, { "epoch": 0.5916149068322981, "grad_norm": 0.8266518712043762, "learning_rate": 9.419402045131339e-06, "loss": 0.1535211205482483, "rewards/accuracies": 0.90625, "rewards/chosen": 18.445240020751953, "rewards/margins": 16.836875915527344, "rewards/rejected": 1.6071648597717285, "step": 1143 }, { "epoch": 0.5921325051759835, "grad_norm": 0.6167104840278625, "learning_rate": 9.418066996332417e-06, "loss": 0.15551716089248657, "rewards/accuracies": 0.9453125, "rewards/chosen": 18.122243881225586, "rewards/margins": 17.12738037109375, "rewards/rejected": 0.9936841726303101, "step": 1144 }, { "epoch": 0.5926501035196687, "grad_norm": 1.0787560939788818, "learning_rate": 9.416730509199929e-06, "loss": 0.16800205409526825, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.398784637451172, "rewards/margins": 19.132831573486328, "rewards/rejected": 1.2679805755615234, "step": 1145 }, { "epoch": 0.593167701863354, "grad_norm": 1.0997488498687744, "learning_rate": 9.415392584168972e-06, "loss": 0.1607409119606018, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.97568130493164, "rewards/margins": 19.1680908203125, "rewards/rejected": 1.8039512634277344, "step": 1146 }, { "epoch": 0.5936853002070394, "grad_norm": 1.5686802864074707, "learning_rate": 9.414053221675119e-06, "loss": 0.16742515563964844, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.508750915527344, "rewards/margins": 21.073654174804688, "rewards/rejected": 1.433908462524414, "step": 1147 }, { "epoch": 0.5942028985507246, "grad_norm": 1.0031017065048218, "learning_rate": 9.412712422154409e-06, "loss": 0.13214810192584991, "rewards/accuracies": 0.921875, "rewards/chosen": 28.523033142089844, "rewards/margins": 27.015869140625, "rewards/rejected": 1.5048432350158691, "step": 1148 }, { "epoch": 0.59472049689441, "grad_norm": 1.3357698917388916, "learning_rate": 9.411370186043349e-06, "loss": 0.09710448235273361, "rewards/accuracies": 0.9375, "rewards/chosen": 30.603042602539062, "rewards/margins": 28.2254638671875, "rewards/rejected": 2.3897604942321777, "step": 1149 }, { "epoch": 0.5952380952380952, "grad_norm": 1.619217038154602, "learning_rate": 9.410026513778914e-06, "loss": 0.20224173367023468, "rewards/accuracies": 0.90625, "rewards/chosen": 26.816089630126953, "rewards/margins": 25.262786865234375, "rewards/rejected": 1.5473839044570923, "step": 1150 }, { "epoch": 0.5957556935817805, "grad_norm": 1.8802623748779297, "learning_rate": 9.408681405798548e-06, "loss": 0.22418615221977234, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.084909439086914, "rewards/margins": 25.20206642150879, "rewards/rejected": 0.8743354082107544, "step": 1151 }, { "epoch": 0.5962732919254659, "grad_norm": 2.8570785522460938, "learning_rate": 9.407334862540156e-06, "loss": 0.2673377990722656, "rewards/accuracies": 0.875, "rewards/chosen": 31.88318634033203, "rewards/margins": 28.274324417114258, "rewards/rejected": 3.6161859035491943, "step": 1152 }, { "epoch": 0.5967908902691511, "grad_norm": 1.301896333694458, "learning_rate": 9.40598688444212e-06, "loss": 0.1467619389295578, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.885421752929688, "rewards/margins": 28.408706665039062, "rewards/rejected": 1.4782800674438477, "step": 1153 }, { "epoch": 0.5973084886128365, "grad_norm": 1.7358914613723755, "learning_rate": 9.40463747194328e-06, "loss": 0.2846102714538574, "rewards/accuracies": 0.859375, "rewards/chosen": 32.18105697631836, "rewards/margins": 29.182151794433594, "rewards/rejected": 2.9997544288635254, "step": 1154 }, { "epoch": 0.5978260869565217, "grad_norm": 1.6510276794433594, "learning_rate": 9.403286625482947e-06, "loss": 0.26361408829689026, "rewards/accuracies": 0.8671875, "rewards/chosen": 34.767112731933594, "rewards/margins": 31.910152435302734, "rewards/rejected": 2.846834421157837, "step": 1155 }, { "epoch": 0.598343685300207, "grad_norm": 1.3621810674667358, "learning_rate": 9.401934345500902e-06, "loss": 0.22550922632217407, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.33378601074219, "rewards/margins": 32.310367584228516, "rewards/rejected": 3.0285024642944336, "step": 1156 }, { "epoch": 0.5988612836438924, "grad_norm": 1.4072010517120361, "learning_rate": 9.400580632437389e-06, "loss": 0.2777825593948364, "rewards/accuracies": 0.8203125, "rewards/chosen": 24.881181716918945, "rewards/margins": 22.841110229492188, "rewards/rejected": 2.03941011428833, "step": 1157 }, { "epoch": 0.5993788819875776, "grad_norm": 2.189290761947632, "learning_rate": 9.399225486733117e-06, "loss": 0.14369794726371765, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.09868240356445, "rewards/margins": 32.83076477050781, "rewards/rejected": 5.265682220458984, "step": 1158 }, { "epoch": 0.599896480331263, "grad_norm": 1.0858937501907349, "learning_rate": 9.397868908829264e-06, "loss": 0.19978266954421997, "rewards/accuracies": 0.890625, "rewards/chosen": 28.645709991455078, "rewards/margins": 25.642044067382812, "rewards/rejected": 3.0104551315307617, "step": 1159 }, { "epoch": 0.6004140786749482, "grad_norm": 1.3800806999206543, "learning_rate": 9.396510899167474e-06, "loss": 0.21659034490585327, "rewards/accuracies": 0.8671875, "rewards/chosen": 26.054851531982422, "rewards/margins": 23.655969619750977, "rewards/rejected": 2.405433177947998, "step": 1160 }, { "epoch": 0.6009316770186336, "grad_norm": 1.114641785621643, "learning_rate": 9.395151458189856e-06, "loss": 0.1926756054162979, "rewards/accuracies": 0.90625, "rewards/chosen": 20.426021575927734, "rewards/margins": 18.6300048828125, "rewards/rejected": 1.794553518295288, "step": 1161 }, { "epoch": 0.6014492753623188, "grad_norm": 1.066551923751831, "learning_rate": 9.393790586338987e-06, "loss": 0.19501665234565735, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.613109588623047, "rewards/margins": 18.589035034179688, "rewards/rejected": 2.023387908935547, "step": 1162 }, { "epoch": 0.6019668737060041, "grad_norm": 0.7004244327545166, "learning_rate": 9.39242828405791e-06, "loss": 0.2070743292570114, "rewards/accuracies": 0.890625, "rewards/chosen": 17.578649520874023, "rewards/margins": 16.1859130859375, "rewards/rejected": 1.3912334442138672, "step": 1163 }, { "epoch": 0.6024844720496895, "grad_norm": 1.2243574857711792, "learning_rate": 9.391064551790129e-06, "loss": 0.17860965430736542, "rewards/accuracies": 0.921875, "rewards/chosen": 20.08776092529297, "rewards/margins": 17.87339973449707, "rewards/rejected": 2.2191460132598877, "step": 1164 }, { "epoch": 0.6030020703933747, "grad_norm": 0.7419143319129944, "learning_rate": 9.38969938997962e-06, "loss": 0.15697765350341797, "rewards/accuracies": 0.9140625, "rewards/chosen": 15.821288108825684, "rewards/margins": 14.370880126953125, "rewards/rejected": 1.4480609893798828, "step": 1165 }, { "epoch": 0.6035196687370601, "grad_norm": 0.9116652011871338, "learning_rate": 9.388332799070818e-06, "loss": 0.1916111409664154, "rewards/accuracies": 0.90625, "rewards/chosen": 13.722000122070312, "rewards/margins": 12.577911376953125, "rewards/rejected": 1.1453286409378052, "step": 1166 }, { "epoch": 0.6040372670807453, "grad_norm": 0.7064950466156006, "learning_rate": 9.38696477950863e-06, "loss": 0.23993340134620667, "rewards/accuracies": 0.890625, "rewards/chosen": 11.57155704498291, "rewards/margins": 10.611099243164062, "rewards/rejected": 0.9567098617553711, "step": 1167 }, { "epoch": 0.6045548654244306, "grad_norm": 0.924568772315979, "learning_rate": 9.385595331738423e-06, "loss": 0.198556587100029, "rewards/accuracies": 0.875, "rewards/chosen": 13.517974853515625, "rewards/margins": 12.63406753540039, "rewards/rejected": 0.8866195678710938, "step": 1168 }, { "epoch": 0.605072463768116, "grad_norm": 0.5752053260803223, "learning_rate": 9.384224456206028e-06, "loss": 0.17207735776901245, "rewards/accuracies": 0.9375, "rewards/chosen": 12.626489639282227, "rewards/margins": 11.739700317382812, "rewards/rejected": 0.8877372741699219, "step": 1169 }, { "epoch": 0.6055900621118012, "grad_norm": 1.3362364768981934, "learning_rate": 9.38285215335775e-06, "loss": 0.23461303114891052, "rewards/accuracies": 0.890625, "rewards/chosen": 9.877897262573242, "rewards/margins": 9.372611999511719, "rewards/rejected": 0.5067057609558105, "step": 1170 }, { "epoch": 0.6061076604554866, "grad_norm": 1.0859979391098022, "learning_rate": 9.381478423640345e-06, "loss": 0.2204655408859253, "rewards/accuracies": 0.8671875, "rewards/chosen": 11.925994873046875, "rewards/margins": 11.000293731689453, "rewards/rejected": 0.9248583912849426, "step": 1171 }, { "epoch": 0.6066252587991718, "grad_norm": 0.6475357413291931, "learning_rate": 9.380103267501043e-06, "loss": 0.18153506517410278, "rewards/accuracies": 0.921875, "rewards/chosen": 11.795379638671875, "rewards/margins": 10.832588195800781, "rewards/rejected": 0.9604048728942871, "step": 1172 }, { "epoch": 0.6071428571428571, "grad_norm": 0.7571757435798645, "learning_rate": 9.378726685387538e-06, "loss": 0.16901150345802307, "rewards/accuracies": 0.9296875, "rewards/chosen": 13.94481086730957, "rewards/margins": 12.654449462890625, "rewards/rejected": 1.2888853549957275, "step": 1173 }, { "epoch": 0.6076604554865425, "grad_norm": 0.740782618522644, "learning_rate": 9.377348677747985e-06, "loss": 0.19883978366851807, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.979181289672852, "rewards/margins": 11.2569580078125, "rewards/rejected": 0.7215213775634766, "step": 1174 }, { "epoch": 0.6081780538302277, "grad_norm": 0.9208011031150818, "learning_rate": 9.375969245031005e-06, "loss": 0.21146729588508606, "rewards/accuracies": 0.90625, "rewards/chosen": 11.29835033416748, "rewards/margins": 10.46942138671875, "rewards/rejected": 0.8292431831359863, "step": 1175 }, { "epoch": 0.6086956521739131, "grad_norm": 1.5533169507980347, "learning_rate": 9.374588387685681e-06, "loss": 0.15563583374023438, "rewards/accuracies": 0.9375, "rewards/chosen": 11.763809204101562, "rewards/margins": 11.18628215789795, "rewards/rejected": 0.5742810964584351, "step": 1176 }, { "epoch": 0.6092132505175983, "grad_norm": 1.3379074335098267, "learning_rate": 9.373206106161562e-06, "loss": 0.25060024857521057, "rewards/accuracies": 0.875, "rewards/chosen": 10.82728099822998, "rewards/margins": 10.370918273925781, "rewards/rejected": 0.4588181972503662, "step": 1177 }, { "epoch": 0.6097308488612836, "grad_norm": 0.7929995059967041, "learning_rate": 9.371822400908659e-06, "loss": 0.18930363655090332, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.825605392456055, "rewards/margins": 11.631797790527344, "rewards/rejected": 1.1935081481933594, "step": 1178 }, { "epoch": 0.610248447204969, "grad_norm": 1.8284677267074585, "learning_rate": 9.370437272377446e-06, "loss": 0.205423966050148, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.033554077148438, "rewards/margins": 12.692146301269531, "rewards/rejected": 1.3394384384155273, "step": 1179 }, { "epoch": 0.6107660455486542, "grad_norm": 2.6773955821990967, "learning_rate": 9.369050721018864e-06, "loss": 0.1810423582792282, "rewards/accuracies": 0.921875, "rewards/chosen": 14.368502616882324, "rewards/margins": 12.961166381835938, "rewards/rejected": 1.4081687927246094, "step": 1180 }, { "epoch": 0.6112836438923396, "grad_norm": 0.8835278749465942, "learning_rate": 9.367662747284314e-06, "loss": 0.2033078372478485, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.24575138092041, "rewards/margins": 13.611419677734375, "rewards/rejected": 1.6378498077392578, "step": 1181 }, { "epoch": 0.6118012422360248, "grad_norm": 2.4334115982055664, "learning_rate": 9.366273351625662e-06, "loss": 0.2560943365097046, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.532272338867188, "rewards/margins": 13.607002258300781, "rewards/rejected": 1.9247474670410156, "step": 1182 }, { "epoch": 0.6123188405797102, "grad_norm": 1.2947174310684204, "learning_rate": 9.364882534495232e-06, "loss": 0.23428964614868164, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.701593399047852, "rewards/margins": 13.144515991210938, "rewards/rejected": 1.554313063621521, "step": 1183 }, { "epoch": 0.6128364389233955, "grad_norm": 1.2801332473754883, "learning_rate": 9.363490296345819e-06, "loss": 0.1851804256439209, "rewards/accuracies": 0.90625, "rewards/chosen": 16.38460350036621, "rewards/margins": 14.335586547851562, "rewards/rejected": 2.045969009399414, "step": 1184 }, { "epoch": 0.6133540372670807, "grad_norm": 1.0471729040145874, "learning_rate": 9.362096637630674e-06, "loss": 0.17168503999710083, "rewards/accuracies": 0.90625, "rewards/chosen": 13.095271110534668, "rewards/margins": 12.3671875, "rewards/rejected": 0.7272472381591797, "step": 1185 }, { "epoch": 0.6138716356107661, "grad_norm": 0.7096533179283142, "learning_rate": 9.360701558803511e-06, "loss": 0.1627969592809677, "rewards/accuracies": 0.8984375, "rewards/chosen": 13.001190185546875, "rewards/margins": 12.0361328125, "rewards/rejected": 0.9655075073242188, "step": 1186 }, { "epoch": 0.6143892339544513, "grad_norm": 2.25537109375, "learning_rate": 9.359305060318513e-06, "loss": 0.19055920839309692, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.460342407226562, "rewards/margins": 13.237228393554688, "rewards/rejected": 2.2219653129577637, "step": 1187 }, { "epoch": 0.6149068322981367, "grad_norm": 2.2404379844665527, "learning_rate": 9.357907142630316e-06, "loss": 0.25037992000579834, "rewards/accuracies": 0.859375, "rewards/chosen": 12.922399520874023, "rewards/margins": 11.403099060058594, "rewards/rejected": 1.5190138816833496, "step": 1188 }, { "epoch": 0.615424430641822, "grad_norm": 1.836772084236145, "learning_rate": 9.356507806194021e-06, "loss": 0.17358191311359406, "rewards/accuracies": 0.875, "rewards/chosen": 14.295320510864258, "rewards/margins": 12.67306900024414, "rewards/rejected": 1.6249548196792603, "step": 1189 }, { "epoch": 0.6159420289855072, "grad_norm": 1.516567587852478, "learning_rate": 9.355107051465197e-06, "loss": 0.19970789551734924, "rewards/accuracies": 0.921875, "rewards/chosen": 12.644084930419922, "rewards/margins": 11.387283325195312, "rewards/rejected": 1.2552852630615234, "step": 1190 }, { "epoch": 0.6164596273291926, "grad_norm": 1.105060338973999, "learning_rate": 9.353704878899866e-06, "loss": 0.16554045677185059, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.577354431152344, "rewards/margins": 10.358207702636719, "rewards/rejected": 1.2230061292648315, "step": 1191 }, { "epoch": 0.6169772256728778, "grad_norm": 1.0675833225250244, "learning_rate": 9.352301288954517e-06, "loss": 0.28062042593955994, "rewards/accuracies": 0.8125, "rewards/chosen": 11.308958053588867, "rewards/margins": 9.7183837890625, "rewards/rejected": 1.5887413024902344, "step": 1192 }, { "epoch": 0.6174948240165632, "grad_norm": 1.1727842092514038, "learning_rate": 9.350896282086098e-06, "loss": 0.20581918954849243, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.249134063720703, "rewards/margins": 9.769584655761719, "rewards/rejected": 1.4797592163085938, "step": 1193 }, { "epoch": 0.6180124223602484, "grad_norm": 0.8450350165367126, "learning_rate": 9.349489858752018e-06, "loss": 0.11849479377269745, "rewards/accuracies": 0.9453125, "rewards/chosen": 11.351070404052734, "rewards/margins": 9.782962799072266, "rewards/rejected": 1.5689287185668945, "step": 1194 }, { "epoch": 0.6185300207039337, "grad_norm": 0.9124978184700012, "learning_rate": 9.34808201941015e-06, "loss": 0.1842747926712036, "rewards/accuracies": 0.890625, "rewards/chosen": 11.16278076171875, "rewards/margins": 9.54107666015625, "rewards/rejected": 1.6227130889892578, "step": 1195 }, { "epoch": 0.6190476190476191, "grad_norm": 1.2434331178665161, "learning_rate": 9.346672764518828e-06, "loss": 0.17131149768829346, "rewards/accuracies": 0.9140625, "rewards/chosen": 12.140689849853516, "rewards/margins": 9.33737564086914, "rewards/rejected": 2.804997682571411, "step": 1196 }, { "epoch": 0.6195652173913043, "grad_norm": 0.9958962798118591, "learning_rate": 9.345262094536842e-06, "loss": 0.17537418007850647, "rewards/accuracies": 0.9140625, "rewards/chosen": 11.376279830932617, "rewards/margins": 8.526641845703125, "rewards/rejected": 2.8524889945983887, "step": 1197 }, { "epoch": 0.6200828157349897, "grad_norm": 1.0037972927093506, "learning_rate": 9.343850009923445e-06, "loss": 0.18979407846927643, "rewards/accuracies": 0.921875, "rewards/chosen": 10.978545188903809, "rewards/margins": 8.50501823425293, "rewards/rejected": 2.470907211303711, "step": 1198 }, { "epoch": 0.620600414078675, "grad_norm": 0.6397344470024109, "learning_rate": 9.342436511138355e-06, "loss": 0.10035943984985352, "rewards/accuracies": 0.9609375, "rewards/chosen": 12.78073787689209, "rewards/margins": 9.016143798828125, "rewards/rejected": 3.764575958251953, "step": 1199 }, { "epoch": 0.6211180124223602, "grad_norm": 1.5139826536178589, "learning_rate": 9.341021598641743e-06, "loss": 0.15147200226783752, "rewards/accuracies": 0.9296875, "rewards/chosen": 11.703086853027344, "rewards/margins": 8.026496887207031, "rewards/rejected": 3.6744041442871094, "step": 1200 }, { "epoch": 0.6216356107660456, "grad_norm": 1.00386643409729, "learning_rate": 9.339605272894247e-06, "loss": 0.16367939114570618, "rewards/accuracies": 0.90625, "rewards/chosen": 10.82330322265625, "rewards/margins": 7.6653289794921875, "rewards/rejected": 3.1581835746765137, "step": 1201 }, { "epoch": 0.6221532091097308, "grad_norm": 1.4913699626922607, "learning_rate": 9.338187534356961e-06, "loss": 0.18896803259849548, "rewards/accuracies": 0.90625, "rewards/chosen": 12.821014404296875, "rewards/margins": 8.702713012695312, "rewards/rejected": 4.118408203125, "step": 1202 }, { "epoch": 0.6226708074534162, "grad_norm": 1.717130184173584, "learning_rate": 9.336768383491442e-06, "loss": 0.2212887704372406, "rewards/accuracies": 0.8828125, "rewards/chosen": 13.623594284057617, "rewards/margins": 9.249725341796875, "rewards/rejected": 4.373199462890625, "step": 1203 }, { "epoch": 0.6231884057971014, "grad_norm": 1.3595256805419922, "learning_rate": 9.3353478207597e-06, "loss": 0.22125869989395142, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.141510009765625, "rewards/margins": 7.7503662109375, "rewards/rejected": 4.390224456787109, "step": 1204 }, { "epoch": 0.6237060041407867, "grad_norm": 5.821560382843018, "learning_rate": 9.333925846624212e-06, "loss": 0.18385538458824158, "rewards/accuracies": 0.921875, "rewards/chosen": 12.55118179321289, "rewards/margins": 8.523818969726562, "rewards/rejected": 4.025357246398926, "step": 1205 }, { "epoch": 0.6242236024844721, "grad_norm": 1.2662310600280762, "learning_rate": 9.332502461547912e-06, "loss": 0.1688217669725418, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.125282287597656, "rewards/margins": 9.039566040039062, "rewards/rejected": 4.086917877197266, "step": 1206 }, { "epoch": 0.6247412008281573, "grad_norm": 2.1603481769561768, "learning_rate": 9.331077665994195e-06, "loss": 0.14997908473014832, "rewards/accuracies": 0.9375, "rewards/chosen": 14.289810180664062, "rewards/margins": 9.870437622070312, "rewards/rejected": 4.420463562011719, "step": 1207 }, { "epoch": 0.6252587991718427, "grad_norm": 1.2074068784713745, "learning_rate": 9.329651460426911e-06, "loss": 0.22292399406433105, "rewards/accuracies": 0.859375, "rewards/chosen": 12.141204833984375, "rewards/margins": 8.304046630859375, "rewards/rejected": 3.8348422050476074, "step": 1208 }, { "epoch": 0.6257763975155279, "grad_norm": 1.7599769830703735, "learning_rate": 9.328223845310371e-06, "loss": 0.2719898223876953, "rewards/accuracies": 0.8671875, "rewards/chosen": 11.4805908203125, "rewards/margins": 9.025238037109375, "rewards/rejected": 2.4577102661132812, "step": 1209 }, { "epoch": 0.6262939958592133, "grad_norm": 1.2084285020828247, "learning_rate": 9.326794821109349e-06, "loss": 0.21920114755630493, "rewards/accuracies": 0.8671875, "rewards/chosen": 10.80828857421875, "rewards/margins": 7.7718658447265625, "rewards/rejected": 3.0363082885742188, "step": 1210 }, { "epoch": 0.6268115942028986, "grad_norm": 3.478959083557129, "learning_rate": 9.32536438828907e-06, "loss": 0.19766484200954437, "rewards/accuracies": 0.9296875, "rewards/chosen": 10.020153045654297, "rewards/margins": 7.960121154785156, "rewards/rejected": 2.0594825744628906, "step": 1211 }, { "epoch": 0.6273291925465838, "grad_norm": 1.1777187585830688, "learning_rate": 9.323932547315224e-06, "loss": 0.16822999715805054, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.695226669311523, "rewards/margins": 8.492477416992188, "rewards/rejected": 2.2020437717437744, "step": 1212 }, { "epoch": 0.6278467908902692, "grad_norm": 1.7512165307998657, "learning_rate": 9.322499298653956e-06, "loss": 0.2828027606010437, "rewards/accuracies": 0.90625, "rewards/chosen": 9.029092788696289, "rewards/margins": 7.143730163574219, "rewards/rejected": 1.8835067749023438, "step": 1213 }, { "epoch": 0.6283643892339544, "grad_norm": 0.9993857145309448, "learning_rate": 9.321064642771871e-06, "loss": 0.19745337963104248, "rewards/accuracies": 0.890625, "rewards/chosen": 9.672113418579102, "rewards/margins": 7.85272216796875, "rewards/rejected": 1.8230133056640625, "step": 1214 }, { "epoch": 0.6288819875776398, "grad_norm": 0.884291410446167, "learning_rate": 9.319628580136033e-06, "loss": 0.26314640045166016, "rewards/accuracies": 0.859375, "rewards/chosen": 7.7761993408203125, "rewards/margins": 6.12725830078125, "rewards/rejected": 1.6512126922607422, "step": 1215 }, { "epoch": 0.629399585921325, "grad_norm": 0.8218127489089966, "learning_rate": 9.318191111213961e-06, "loss": 0.16433575749397278, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.100879669189453, "rewards/margins": 7.4789581298828125, "rewards/rejected": 1.6233940124511719, "step": 1216 }, { "epoch": 0.6299171842650103, "grad_norm": 1.6168915033340454, "learning_rate": 9.316752236473634e-06, "loss": 0.19615978002548218, "rewards/accuracies": 0.890625, "rewards/chosen": 8.072834014892578, "rewards/margins": 6.6960906982421875, "rewards/rejected": 1.3764362335205078, "step": 1217 }, { "epoch": 0.6304347826086957, "grad_norm": 1.2212954759597778, "learning_rate": 9.315311956383487e-06, "loss": 0.2299848347902298, "rewards/accuracies": 0.8828125, "rewards/chosen": 7.899492263793945, "rewards/margins": 6.5317535400390625, "rewards/rejected": 1.3698387145996094, "step": 1218 }, { "epoch": 0.6309523809523809, "grad_norm": 0.9233412146568298, "learning_rate": 9.313870271412415e-06, "loss": 0.15781790018081665, "rewards/accuracies": 0.9296875, "rewards/chosen": 8.402828216552734, "rewards/margins": 7.42083740234375, "rewards/rejected": 0.9833221435546875, "step": 1219 }, { "epoch": 0.6314699792960663, "grad_norm": 1.3166133165359497, "learning_rate": 9.312427182029767e-06, "loss": 0.20956221222877502, "rewards/accuracies": 0.8984375, "rewards/chosen": 8.349105834960938, "rewards/margins": 7.285003662109375, "rewards/rejected": 1.0659408569335938, "step": 1220 }, { "epoch": 0.6319875776397516, "grad_norm": 0.8298109769821167, "learning_rate": 9.310982688705352e-06, "loss": 0.14856237173080444, "rewards/accuracies": 0.921875, "rewards/chosen": 9.010905265808105, "rewards/margins": 7.886516571044922, "rewards/rejected": 1.1249313354492188, "step": 1221 }, { "epoch": 0.6325051759834368, "grad_norm": 1.037455439567566, "learning_rate": 9.309536791909437e-06, "loss": 0.22077402472496033, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.440345764160156, "rewards/margins": 7.3006591796875, "rewards/rejected": 1.1421051025390625, "step": 1222 }, { "epoch": 0.6330227743271222, "grad_norm": 2.3908638954162598, "learning_rate": 9.308089492112742e-06, "loss": 0.24171237647533417, "rewards/accuracies": 0.8828125, "rewards/chosen": 10.42243766784668, "rewards/margins": 8.759971618652344, "rewards/rejected": 1.6609554290771484, "step": 1223 }, { "epoch": 0.6335403726708074, "grad_norm": 1.0167337656021118, "learning_rate": 9.306640789786447e-06, "loss": 0.17711549997329712, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.119064331054688, "rewards/margins": 11.502471923828125, "rewards/rejected": 1.6169357299804688, "step": 1224 }, { "epoch": 0.6340579710144928, "grad_norm": 0.8355826735496521, "learning_rate": 9.305190685402186e-06, "loss": 0.22086384892463684, "rewards/accuracies": 0.8828125, "rewards/chosen": 11.025405883789062, "rewards/margins": 9.417045593261719, "rewards/rejected": 1.6072040796279907, "step": 1225 }, { "epoch": 0.634575569358178, "grad_norm": 1.2230256795883179, "learning_rate": 9.303739179432053e-06, "loss": 0.15522828698158264, "rewards/accuracies": 0.90625, "rewards/chosen": 12.752021789550781, "rewards/margins": 11.486984252929688, "rewards/rejected": 1.264627456665039, "step": 1226 }, { "epoch": 0.6350931677018633, "grad_norm": 1.2544686794281006, "learning_rate": 9.302286272348595e-06, "loss": 0.20562411844730377, "rewards/accuracies": 0.8828125, "rewards/chosen": 13.372230529785156, "rewards/margins": 11.621646881103516, "rewards/rejected": 1.7537245750427246, "step": 1227 }, { "epoch": 0.6356107660455487, "grad_norm": 0.7242883443832397, "learning_rate": 9.300831964624818e-06, "loss": 0.20242001116275787, "rewards/accuracies": 0.90625, "rewards/chosen": 14.492460250854492, "rewards/margins": 12.561660766601562, "rewards/rejected": 1.9309368133544922, "step": 1228 }, { "epoch": 0.6361283643892339, "grad_norm": 1.6302626132965088, "learning_rate": 9.299376256734177e-06, "loss": 0.2037270963191986, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.748695373535156, "rewards/margins": 13.309249877929688, "rewards/rejected": 1.4369051456451416, "step": 1229 }, { "epoch": 0.6366459627329193, "grad_norm": 0.9983348846435547, "learning_rate": 9.297919149150594e-06, "loss": 0.2042813003063202, "rewards/accuracies": 0.90625, "rewards/chosen": 15.273534774780273, "rewards/margins": 13.249496459960938, "rewards/rejected": 2.023780345916748, "step": 1230 }, { "epoch": 0.6371635610766045, "grad_norm": 0.8275240659713745, "learning_rate": 9.29646064234844e-06, "loss": 0.20136971771717072, "rewards/accuracies": 0.8671875, "rewards/chosen": 17.520614624023438, "rewards/margins": 14.825874328613281, "rewards/rejected": 2.700333595275879, "step": 1231 }, { "epoch": 0.6376811594202898, "grad_norm": 0.6847676634788513, "learning_rate": 9.295000736802543e-06, "loss": 0.20804372429847717, "rewards/accuracies": 0.875, "rewards/chosen": 18.02080535888672, "rewards/margins": 15.45550537109375, "rewards/rejected": 2.5616374015808105, "step": 1232 }, { "epoch": 0.6381987577639752, "grad_norm": 1.5987164974212646, "learning_rate": 9.29353943298818e-06, "loss": 0.2551984190940857, "rewards/accuracies": 0.84375, "rewards/chosen": 18.57342529296875, "rewards/margins": 16.006072998046875, "rewards/rejected": 2.5720691680908203, "step": 1233 }, { "epoch": 0.6387163561076604, "grad_norm": 0.8293893933296204, "learning_rate": 9.292076731381096e-06, "loss": 0.1999211460351944, "rewards/accuracies": 0.875, "rewards/chosen": 19.329998016357422, "rewards/margins": 16.1385498046875, "rewards/rejected": 3.187765121459961, "step": 1234 }, { "epoch": 0.6392339544513458, "grad_norm": 1.3190339803695679, "learning_rate": 9.29061263245748e-06, "loss": 0.1972501575946808, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.485572814941406, "rewards/margins": 17.4554443359375, "rewards/rejected": 3.024991989135742, "step": 1235 }, { "epoch": 0.639751552795031, "grad_norm": 1.9549524784088135, "learning_rate": 9.289147136693983e-06, "loss": 0.21349702775478363, "rewards/accuracies": 0.859375, "rewards/chosen": 20.704811096191406, "rewards/margins": 17.339645385742188, "rewards/rejected": 3.3722476959228516, "step": 1236 }, { "epoch": 0.6402691511387164, "grad_norm": 0.8978866934776306, "learning_rate": 9.287680244567705e-06, "loss": 0.23461391031742096, "rewards/accuracies": 0.890625, "rewards/chosen": 21.959022521972656, "rewards/margins": 19.372329711914062, "rewards/rejected": 2.586650848388672, "step": 1237 }, { "epoch": 0.6407867494824017, "grad_norm": 0.8125931620597839, "learning_rate": 9.286211956556204e-06, "loss": 0.1900244653224945, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.33867645263672, "rewards/margins": 19.517333984375, "rewards/rejected": 2.8126461505889893, "step": 1238 }, { "epoch": 0.6413043478260869, "grad_norm": 1.6695592403411865, "learning_rate": 9.284742273137493e-06, "loss": 0.19234371185302734, "rewards/accuracies": 0.8671875, "rewards/chosen": 20.434276580810547, "rewards/margins": 18.260116577148438, "rewards/rejected": 2.1704559326171875, "step": 1239 }, { "epoch": 0.6418219461697723, "grad_norm": 0.880363941192627, "learning_rate": 9.283271194790036e-06, "loss": 0.13240200281143188, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.80956268310547, "rewards/margins": 20.811492919921875, "rewards/rejected": 2.001201629638672, "step": 1240 }, { "epoch": 0.6423395445134575, "grad_norm": 0.5566720366477966, "learning_rate": 9.281798721992754e-06, "loss": 0.18351970613002777, "rewards/accuracies": 0.921875, "rewards/chosen": 23.960147857666016, "rewards/margins": 21.456512451171875, "rewards/rejected": 2.5042572021484375, "step": 1241 }, { "epoch": 0.6428571428571429, "grad_norm": 1.2121750116348267, "learning_rate": 9.280324855225022e-06, "loss": 0.221431165933609, "rewards/accuracies": 0.875, "rewards/chosen": 23.017822265625, "rewards/margins": 20.660537719726562, "rewards/rejected": 2.360454559326172, "step": 1242 }, { "epoch": 0.6433747412008282, "grad_norm": 0.7991037964820862, "learning_rate": 9.278849594966668e-06, "loss": 0.17557625472545624, "rewards/accuracies": 0.90625, "rewards/chosen": 25.778564453125, "rewards/margins": 23.179473876953125, "rewards/rejected": 2.5961246490478516, "step": 1243 }, { "epoch": 0.6438923395445134, "grad_norm": 2.6158103942871094, "learning_rate": 9.277372941697972e-06, "loss": 0.2117687463760376, "rewards/accuracies": 0.90625, "rewards/chosen": 26.378036499023438, "rewards/margins": 23.211700439453125, "rewards/rejected": 3.1686668395996094, "step": 1244 }, { "epoch": 0.6444099378881988, "grad_norm": 2.040585994720459, "learning_rate": 9.275894895899671e-06, "loss": 0.25360897183418274, "rewards/accuracies": 0.875, "rewards/chosen": 23.58232879638672, "rewards/margins": 21.414520263671875, "rewards/rejected": 2.177485466003418, "step": 1245 }, { "epoch": 0.644927536231884, "grad_norm": 0.834797203540802, "learning_rate": 9.274415458052951e-06, "loss": 0.21402384340763092, "rewards/accuracies": 0.8671875, "rewards/chosen": 22.581008911132812, "rewards/margins": 20.463478088378906, "rewards/rejected": 2.1183395385742188, "step": 1246 }, { "epoch": 0.6454451345755694, "grad_norm": 1.6777454614639282, "learning_rate": 9.272934628639456e-06, "loss": 0.22887355089187622, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.30739974975586, "rewards/margins": 23.877655029296875, "rewards/rejected": 2.4360122680664062, "step": 1247 }, { "epoch": 0.6459627329192547, "grad_norm": 1.7156487703323364, "learning_rate": 9.271452408141279e-06, "loss": 0.20064637064933777, "rewards/accuracies": 0.875, "rewards/chosen": 28.915855407714844, "rewards/margins": 26.326507568359375, "rewards/rejected": 2.586637496948242, "step": 1248 }, { "epoch": 0.6464803312629399, "grad_norm": 0.8386070728302002, "learning_rate": 9.26996879704097e-06, "loss": 0.22302722930908203, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.862060546875, "rewards/margins": 22.727340698242188, "rewards/rejected": 2.1314048767089844, "step": 1249 }, { "epoch": 0.6469979296066253, "grad_norm": 0.8946912288665771, "learning_rate": 9.268483795821526e-06, "loss": 0.1828184276819229, "rewards/accuracies": 0.890625, "rewards/chosen": 27.290313720703125, "rewards/margins": 24.47113037109375, "rewards/rejected": 2.8319034576416016, "step": 1250 }, { "epoch": 0.6475155279503105, "grad_norm": 1.3719916343688965, "learning_rate": 9.266997404966404e-06, "loss": 0.19497400522232056, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.581024169921875, "rewards/margins": 21.897979736328125, "rewards/rejected": 2.683692455291748, "step": 1251 }, { "epoch": 0.6480331262939959, "grad_norm": 0.7477267384529114, "learning_rate": 9.265509624959505e-06, "loss": 0.26632606983184814, "rewards/accuracies": 0.875, "rewards/chosen": 21.431079864501953, "rewards/margins": 19.48431396484375, "rewards/rejected": 1.9384021759033203, "step": 1252 }, { "epoch": 0.6485507246376812, "grad_norm": 1.3243072032928467, "learning_rate": 9.26402045628519e-06, "loss": 0.1646067351102829, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.75632095336914, "rewards/margins": 21.88219451904297, "rewards/rejected": 1.8800349235534668, "step": 1253 }, { "epoch": 0.6490683229813664, "grad_norm": 1.2774678468704224, "learning_rate": 9.262529899428265e-06, "loss": 0.1814902126789093, "rewards/accuracies": 0.921875, "rewards/chosen": 23.97879409790039, "rewards/margins": 21.77353286743164, "rewards/rejected": 2.2030351161956787, "step": 1254 }, { "epoch": 0.6495859213250518, "grad_norm": 0.7546804547309875, "learning_rate": 9.261037954873993e-06, "loss": 0.18841907382011414, "rewards/accuracies": 0.90625, "rewards/chosen": 22.68861198425293, "rewards/margins": 21.064117431640625, "rewards/rejected": 1.6238903999328613, "step": 1255 }, { "epoch": 0.650103519668737, "grad_norm": 0.8507958054542542, "learning_rate": 9.25954462310809e-06, "loss": 0.21278822422027588, "rewards/accuracies": 0.875, "rewards/chosen": 24.414838790893555, "rewards/margins": 21.628875732421875, "rewards/rejected": 2.7854862213134766, "step": 1256 }, { "epoch": 0.6506211180124224, "grad_norm": 0.7805787324905396, "learning_rate": 9.258049904616718e-06, "loss": 0.1864362359046936, "rewards/accuracies": 0.90625, "rewards/chosen": 25.72922134399414, "rewards/margins": 23.936782836914062, "rewards/rejected": 1.7964706420898438, "step": 1257 }, { "epoch": 0.6511387163561076, "grad_norm": 1.0843364000320435, "learning_rate": 9.256553799886492e-06, "loss": 0.23298074305057526, "rewards/accuracies": 0.890625, "rewards/chosen": 26.268747329711914, "rewards/margins": 23.832351684570312, "rewards/rejected": 2.4400405883789062, "step": 1258 }, { "epoch": 0.651656314699793, "grad_norm": 1.751908540725708, "learning_rate": 9.255056309404485e-06, "loss": 0.23045793175697327, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.740421295166016, "rewards/margins": 20.561065673828125, "rewards/rejected": 2.178792953491211, "step": 1259 }, { "epoch": 0.6521739130434783, "grad_norm": 0.45898494124412537, "learning_rate": 9.25355743365821e-06, "loss": 0.13536101579666138, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.844919204711914, "rewards/margins": 27.432506561279297, "rewards/rejected": 3.4114503860473633, "step": 1260 }, { "epoch": 0.6526915113871635, "grad_norm": 0.85532146692276, "learning_rate": 9.252057173135642e-06, "loss": 0.17843231558799744, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.97574234008789, "rewards/margins": 24.156875610351562, "rewards/rejected": 2.8169822692871094, "step": 1261 }, { "epoch": 0.6532091097308489, "grad_norm": 1.65006685256958, "learning_rate": 9.250555528325198e-06, "loss": 0.2342672497034073, "rewards/accuracies": 0.875, "rewards/chosen": 26.68126678466797, "rewards/margins": 23.91278839111328, "rewards/rejected": 2.768524169921875, "step": 1262 }, { "epoch": 0.6537267080745341, "grad_norm": 0.9914736151695251, "learning_rate": 9.249052499715749e-06, "loss": 0.17317648231983185, "rewards/accuracies": 0.90625, "rewards/chosen": 24.272441864013672, "rewards/margins": 22.43426513671875, "rewards/rejected": 1.8460206985473633, "step": 1263 }, { "epoch": 0.6542443064182195, "grad_norm": 1.1935365200042725, "learning_rate": 9.247548087796619e-06, "loss": 0.24496787786483765, "rewards/accuracies": 0.8828125, "rewards/chosen": 23.190807342529297, "rewards/margins": 21.131607055664062, "rewards/rejected": 2.0579986572265625, "step": 1264 }, { "epoch": 0.6547619047619048, "grad_norm": 3.5776021480560303, "learning_rate": 9.24604229305758e-06, "loss": 0.25370466709136963, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.263832092285156, "rewards/margins": 20.08690643310547, "rewards/rejected": 2.181030750274658, "step": 1265 }, { "epoch": 0.65527950310559, "grad_norm": 1.198742151260376, "learning_rate": 9.244535115988853e-06, "loss": 0.11589348316192627, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.35784912109375, "rewards/margins": 24.4381103515625, "rewards/rejected": 1.9166431427001953, "step": 1266 }, { "epoch": 0.6557971014492754, "grad_norm": 0.9044090509414673, "learning_rate": 9.243026557081112e-06, "loss": 0.15980744361877441, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.137353897094727, "rewards/margins": 21.310791015625, "rewards/rejected": 1.8236579895019531, "step": 1267 }, { "epoch": 0.6563146997929606, "grad_norm": 1.0732759237289429, "learning_rate": 9.241516616825477e-06, "loss": 0.18521589040756226, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.734085083007812, "rewards/margins": 20.892837524414062, "rewards/rejected": 1.8360815048217773, "step": 1268 }, { "epoch": 0.656832298136646, "grad_norm": 0.760454535484314, "learning_rate": 9.240005295713522e-06, "loss": 0.19947686791419983, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.38655662536621, "rewards/margins": 17.173416137695312, "rewards/rejected": 1.2111244201660156, "step": 1269 }, { "epoch": 0.6573498964803313, "grad_norm": 1.4194722175598145, "learning_rate": 9.238492594237266e-06, "loss": 0.22102661430835724, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.07192611694336, "rewards/margins": 20.45812225341797, "rewards/rejected": 1.6150736808776855, "step": 1270 }, { "epoch": 0.6578674948240165, "grad_norm": 1.0243079662322998, "learning_rate": 9.236978512889183e-06, "loss": 0.20272725820541382, "rewards/accuracies": 0.90625, "rewards/chosen": 21.75662612915039, "rewards/margins": 20.09961700439453, "rewards/rejected": 1.657371997833252, "step": 1271 }, { "epoch": 0.6583850931677019, "grad_norm": 2.1736111640930176, "learning_rate": 9.235463052162192e-06, "loss": 0.32017815113067627, "rewards/accuracies": 0.84375, "rewards/chosen": 19.784961700439453, "rewards/margins": 18.07221221923828, "rewards/rejected": 1.7143096923828125, "step": 1272 }, { "epoch": 0.6589026915113871, "grad_norm": 1.1641662120819092, "learning_rate": 9.233946212549661e-06, "loss": 0.2529900074005127, "rewards/accuracies": 0.84375, "rewards/chosen": 19.890775680541992, "rewards/margins": 18.27234649658203, "rewards/rejected": 1.6235055923461914, "step": 1273 }, { "epoch": 0.6594202898550725, "grad_norm": 1.8732624053955078, "learning_rate": 9.23242799454541e-06, "loss": 0.1914527714252472, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.25074005126953, "rewards/margins": 21.187524795532227, "rewards/rejected": 1.0566768646240234, "step": 1274 }, { "epoch": 0.6599378881987578, "grad_norm": 1.1416393518447876, "learning_rate": 9.230908398643704e-06, "loss": 0.1611400544643402, "rewards/accuracies": 0.921875, "rewards/chosen": 21.626216888427734, "rewards/margins": 20.439586639404297, "rewards/rejected": 1.1845837831497192, "step": 1275 }, { "epoch": 0.660455486542443, "grad_norm": 0.8527940511703491, "learning_rate": 9.229387425339258e-06, "loss": 0.18513637781143188, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.129173278808594, "rewards/margins": 20.72198486328125, "rewards/rejected": 1.408609390258789, "step": 1276 }, { "epoch": 0.6609730848861284, "grad_norm": 1.2110137939453125, "learning_rate": 9.227865075127238e-06, "loss": 0.2449793517589569, "rewards/accuracies": 0.890625, "rewards/chosen": 20.84657859802246, "rewards/margins": 19.802627563476562, "rewards/rejected": 1.0432395935058594, "step": 1277 }, { "epoch": 0.6614906832298136, "grad_norm": 0.7830080986022949, "learning_rate": 9.226341348503253e-06, "loss": 0.2186403125524521, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.059104919433594, "rewards/margins": 17.910179138183594, "rewards/rejected": 1.1515103578567505, "step": 1278 }, { "epoch": 0.662008281573499, "grad_norm": 0.6263836026191711, "learning_rate": 9.224816245963368e-06, "loss": 0.16327553987503052, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.837812423706055, "rewards/margins": 19.827774047851562, "rewards/rejected": 1.0076990127563477, "step": 1279 }, { "epoch": 0.6625258799171843, "grad_norm": 1.0197832584381104, "learning_rate": 9.223289768004087e-06, "loss": 0.17718923091888428, "rewards/accuracies": 0.9453125, "rewards/chosen": 18.99722671508789, "rewards/margins": 18.161468505859375, "rewards/rejected": 0.8371391296386719, "step": 1280 }, { "epoch": 0.6630434782608695, "grad_norm": 2.067589044570923, "learning_rate": 9.221761915122369e-06, "loss": 0.2411029189825058, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.53807830810547, "rewards/margins": 17.65808868408203, "rewards/rejected": 0.8738126754760742, "step": 1281 }, { "epoch": 0.6635610766045549, "grad_norm": 1.7307547330856323, "learning_rate": 9.220232687815613e-06, "loss": 0.2481992244720459, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.673805236816406, "rewards/margins": 17.672950744628906, "rewards/rejected": 0.9958553314208984, "step": 1282 }, { "epoch": 0.6640786749482401, "grad_norm": 0.7780036330223083, "learning_rate": 9.218702086581674e-06, "loss": 0.18750686943531036, "rewards/accuracies": 0.90625, "rewards/chosen": 18.880434036254883, "rewards/margins": 18.0849609375, "rewards/rejected": 0.8001060485839844, "step": 1283 }, { "epoch": 0.6645962732919255, "grad_norm": 1.7264230251312256, "learning_rate": 9.21717011191885e-06, "loss": 0.1798616647720337, "rewards/accuracies": 0.90625, "rewards/chosen": 18.350955963134766, "rewards/margins": 17.53555679321289, "rewards/rejected": 0.8159685134887695, "step": 1284 }, { "epoch": 0.6651138716356108, "grad_norm": 1.3467695713043213, "learning_rate": 9.215636764325884e-06, "loss": 0.2023860663175583, "rewards/accuracies": 0.890625, "rewards/chosen": 21.9722843170166, "rewards/margins": 20.93192481994629, "rewards/rejected": 1.040679931640625, "step": 1285 }, { "epoch": 0.6656314699792961, "grad_norm": 1.0547963380813599, "learning_rate": 9.21410204430197e-06, "loss": 0.18111561238765717, "rewards/accuracies": 0.890625, "rewards/chosen": 18.130643844604492, "rewards/margins": 17.422813415527344, "rewards/rejected": 0.7156581878662109, "step": 1286 }, { "epoch": 0.6661490683229814, "grad_norm": 1.3397023677825928, "learning_rate": 9.212565952346749e-06, "loss": 0.21619927883148193, "rewards/accuracies": 0.875, "rewards/chosen": 20.486669540405273, "rewards/margins": 19.742019653320312, "rewards/rejected": 0.7411112785339355, "step": 1287 }, { "epoch": 0.6666666666666666, "grad_norm": 1.7967398166656494, "learning_rate": 9.211028488960301e-06, "loss": 0.26676714420318604, "rewards/accuracies": 0.8671875, "rewards/chosen": 17.78597640991211, "rewards/margins": 17.094146728515625, "rewards/rejected": 0.6931018829345703, "step": 1288 }, { "epoch": 0.667184265010352, "grad_norm": 1.1996545791625977, "learning_rate": 9.209489654643164e-06, "loss": 0.20248675346374512, "rewards/accuracies": 0.90625, "rewards/chosen": 16.634599685668945, "rewards/margins": 16.207443237304688, "rewards/rejected": 0.4184532165527344, "step": 1289 }, { "epoch": 0.6677018633540373, "grad_norm": 0.7170881628990173, "learning_rate": 9.207949449896316e-06, "loss": 0.17872072756290436, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.852354049682617, "rewards/margins": 18.145431518554688, "rewards/rejected": 0.7028408050537109, "step": 1290 }, { "epoch": 0.6682194616977226, "grad_norm": 1.1667670011520386, "learning_rate": 9.20640787522118e-06, "loss": 0.30234453082084656, "rewards/accuracies": 0.8125, "rewards/chosen": 16.638042449951172, "rewards/margins": 16.179122924804688, "rewards/rejected": 0.4601631164550781, "step": 1291 }, { "epoch": 0.6687370600414079, "grad_norm": 1.1727049350738525, "learning_rate": 9.204864931119624e-06, "loss": 0.19457614421844482, "rewards/accuracies": 0.90625, "rewards/chosen": 18.575021743774414, "rewards/margins": 17.787612915039062, "rewards/rejected": 0.7893422842025757, "step": 1292 }, { "epoch": 0.6692546583850931, "grad_norm": 0.7192240953445435, "learning_rate": 9.203320618093972e-06, "loss": 0.16645529866218567, "rewards/accuracies": 0.890625, "rewards/chosen": 20.76183319091797, "rewards/margins": 19.450286865234375, "rewards/rejected": 1.3108596801757812, "step": 1293 }, { "epoch": 0.6697722567287785, "grad_norm": 1.4059886932373047, "learning_rate": 9.20177493664698e-06, "loss": 0.22489440441131592, "rewards/accuracies": 0.890625, "rewards/chosen": 19.12509536743164, "rewards/margins": 18.314208984375, "rewards/rejected": 0.8096513748168945, "step": 1294 }, { "epoch": 0.6702898550724637, "grad_norm": 0.9314298629760742, "learning_rate": 9.200227887281857e-06, "loss": 0.21781837940216064, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.856590270996094, "rewards/margins": 17.873939514160156, "rewards/rejected": 0.9829082489013672, "step": 1295 }, { "epoch": 0.6708074534161491, "grad_norm": 0.8607335686683655, "learning_rate": 9.198679470502258e-06, "loss": 0.20304995775222778, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.268604278564453, "rewards/margins": 19.425342559814453, "rewards/rejected": 0.8346920013427734, "step": 1296 }, { "epoch": 0.6713250517598344, "grad_norm": 0.9062114357948303, "learning_rate": 9.19712968681228e-06, "loss": 0.19736596941947937, "rewards/accuracies": 0.890625, "rewards/chosen": 19.160850524902344, "rewards/margins": 18.28618621826172, "rewards/rejected": 0.8799314498901367, "step": 1297 }, { "epoch": 0.6718426501035196, "grad_norm": 1.1238476037979126, "learning_rate": 9.195578536716466e-06, "loss": 0.18273663520812988, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.080245971679688, "rewards/margins": 20.331310272216797, "rewards/rejected": 0.7483675479888916, "step": 1298 }, { "epoch": 0.672360248447205, "grad_norm": 1.303166389465332, "learning_rate": 9.194026020719805e-06, "loss": 0.20072205364704132, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.2105655670166, "rewards/margins": 19.765560150146484, "rewards/rejected": 1.4460294246673584, "step": 1299 }, { "epoch": 0.6728778467908902, "grad_norm": 1.2207096815109253, "learning_rate": 9.192472139327728e-06, "loss": 0.2380986213684082, "rewards/accuracies": 0.859375, "rewards/chosen": 21.22260093688965, "rewards/margins": 19.544784545898438, "rewards/rejected": 1.678598403930664, "step": 1300 }, { "epoch": 0.6733954451345756, "grad_norm": 1.165256381034851, "learning_rate": 9.190916893046114e-06, "loss": 0.20203688740730286, "rewards/accuracies": 0.875, "rewards/chosen": 23.235382080078125, "rewards/margins": 20.942276000976562, "rewards/rejected": 2.28817081451416, "step": 1301 }, { "epoch": 0.6739130434782609, "grad_norm": 1.1027843952178955, "learning_rate": 9.189360282381284e-06, "loss": 0.21498525142669678, "rewards/accuracies": 0.875, "rewards/chosen": 24.28992462158203, "rewards/margins": 22.767135620117188, "rewards/rejected": 1.5212154388427734, "step": 1302 }, { "epoch": 0.6744306418219461, "grad_norm": 1.0120524168014526, "learning_rate": 9.187802307840004e-06, "loss": 0.1528276801109314, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.945146560668945, "rewards/margins": 22.70134735107422, "rewards/rejected": 1.2426800727844238, "step": 1303 }, { "epoch": 0.6749482401656315, "grad_norm": 0.7160503268241882, "learning_rate": 9.186242969929485e-06, "loss": 0.1776420921087265, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.712417602539062, "rewards/margins": 22.682029724121094, "rewards/rejected": 2.0256714820861816, "step": 1304 }, { "epoch": 0.6754658385093167, "grad_norm": 0.7739724516868591, "learning_rate": 9.184682269157378e-06, "loss": 0.15550777316093445, "rewards/accuracies": 0.90625, "rewards/chosen": 25.473989486694336, "rewards/margins": 23.99044418334961, "rewards/rejected": 1.4811639785766602, "step": 1305 }, { "epoch": 0.6759834368530021, "grad_norm": 1.8986196517944336, "learning_rate": 9.183120206031782e-06, "loss": 0.26909780502319336, "rewards/accuracies": 0.8671875, "rewards/chosen": 24.86916732788086, "rewards/margins": 23.041366577148438, "rewards/rejected": 1.832148551940918, "step": 1306 }, { "epoch": 0.6765010351966874, "grad_norm": 1.6511943340301514, "learning_rate": 9.181556781061239e-06, "loss": 0.22489973902702332, "rewards/accuracies": 0.8671875, "rewards/chosen": 25.07845687866211, "rewards/margins": 22.812362670898438, "rewards/rejected": 2.266786575317383, "step": 1307 }, { "epoch": 0.6770186335403726, "grad_norm": 1.0730396509170532, "learning_rate": 9.179991994754732e-06, "loss": 0.21391743421554565, "rewards/accuracies": 0.875, "rewards/chosen": 25.427261352539062, "rewards/margins": 22.9490966796875, "rewards/rejected": 2.4801712036132812, "step": 1308 }, { "epoch": 0.677536231884058, "grad_norm": 1.4335588216781616, "learning_rate": 9.178425847621688e-06, "loss": 0.17885653674602509, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.45723533630371, "rewards/margins": 21.382125854492188, "rewards/rejected": 3.074970245361328, "step": 1309 }, { "epoch": 0.6780538302277432, "grad_norm": 0.9167799353599548, "learning_rate": 9.176858340171978e-06, "loss": 0.13736702501773834, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.131969451904297, "rewards/margins": 25.960189819335938, "rewards/rejected": 3.1779212951660156, "step": 1310 }, { "epoch": 0.6785714285714286, "grad_norm": 0.6389718651771545, "learning_rate": 9.175289472915914e-06, "loss": 0.13680490851402283, "rewards/accuracies": 0.921875, "rewards/chosen": 25.788185119628906, "rewards/margins": 22.011489868164062, "rewards/rejected": 3.7768020629882812, "step": 1311 }, { "epoch": 0.6790890269151139, "grad_norm": 1.1500709056854248, "learning_rate": 9.173719246364254e-06, "loss": 0.17480936646461487, "rewards/accuracies": 0.90625, "rewards/chosen": 23.708019256591797, "rewards/margins": 20.348098754882812, "rewards/rejected": 3.3659133911132812, "step": 1312 }, { "epoch": 0.6796066252587992, "grad_norm": 1.6373974084854126, "learning_rate": 9.172147661028198e-06, "loss": 0.1740565001964569, "rewards/accuracies": 0.921875, "rewards/chosen": 25.156192779541016, "rewards/margins": 21.081573486328125, "rewards/rejected": 4.077949523925781, "step": 1313 }, { "epoch": 0.6801242236024845, "grad_norm": 1.273927927017212, "learning_rate": 9.170574717419383e-06, "loss": 0.20035222172737122, "rewards/accuracies": 0.90625, "rewards/chosen": 27.12838363647461, "rewards/margins": 22.046764373779297, "rewards/rejected": 5.081492900848389, "step": 1314 }, { "epoch": 0.6806418219461697, "grad_norm": 0.7532134652137756, "learning_rate": 9.169000416049895e-06, "loss": 0.17771971225738525, "rewards/accuracies": 0.890625, "rewards/chosen": 24.397720336914062, "rewards/margins": 20.52861785888672, "rewards/rejected": 3.8699188232421875, "step": 1315 }, { "epoch": 0.6811594202898551, "grad_norm": 0.9208282232284546, "learning_rate": 9.167424757432259e-06, "loss": 0.13611328601837158, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.310009002685547, "rewards/margins": 23.869117736816406, "rewards/rejected": 6.438699722290039, "step": 1316 }, { "epoch": 0.6816770186335404, "grad_norm": 2.2703983783721924, "learning_rate": 9.16584774207944e-06, "loss": 0.2529163658618927, "rewards/accuracies": 0.859375, "rewards/chosen": 27.826004028320312, "rewards/margins": 22.499557495117188, "rewards/rejected": 5.331981658935547, "step": 1317 }, { "epoch": 0.6821946169772257, "grad_norm": 1.2312365770339966, "learning_rate": 9.164269370504849e-06, "loss": 0.18015334010124207, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.326356887817383, "rewards/margins": 24.398481369018555, "rewards/rejected": 6.937896728515625, "step": 1318 }, { "epoch": 0.682712215320911, "grad_norm": 2.024482250213623, "learning_rate": 9.162689643222334e-06, "loss": 0.2555825114250183, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.17434310913086, "rewards/margins": 23.59033203125, "rewards/rejected": 7.580425262451172, "step": 1319 }, { "epoch": 0.6832298136645962, "grad_norm": 0.9819433689117432, "learning_rate": 9.161108560746189e-06, "loss": 0.22991667687892914, "rewards/accuracies": 0.890625, "rewards/chosen": 28.731163024902344, "rewards/margins": 21.923084259033203, "rewards/rejected": 6.814210414886475, "step": 1320 }, { "epoch": 0.6837474120082816, "grad_norm": 1.3692436218261719, "learning_rate": 9.159526123591145e-06, "loss": 0.21622861921787262, "rewards/accuracies": 0.90625, "rewards/chosen": 32.214263916015625, "rewards/margins": 25.320144653320312, "rewards/rejected": 6.88899040222168, "step": 1321 }, { "epoch": 0.6842650103519669, "grad_norm": 0.8291259407997131, "learning_rate": 9.157942332272376e-06, "loss": 0.1659584641456604, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.82189178466797, "rewards/margins": 25.63019561767578, "rewards/rejected": 8.194530487060547, "step": 1322 }, { "epoch": 0.6847826086956522, "grad_norm": 1.7550246715545654, "learning_rate": 9.156357187305498e-06, "loss": 0.1608346700668335, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.7252082824707, "rewards/margins": 26.857894897460938, "rewards/rejected": 7.864612579345703, "step": 1323 }, { "epoch": 0.6853002070393375, "grad_norm": 2.0242230892181396, "learning_rate": 9.154770689206566e-06, "loss": 0.19019997119903564, "rewards/accuracies": 0.921875, "rewards/chosen": 28.34109878540039, "rewards/margins": 23.64269256591797, "rewards/rejected": 4.707019805908203, "step": 1324 }, { "epoch": 0.6858178053830227, "grad_norm": 1.308225154876709, "learning_rate": 9.153182838492075e-06, "loss": 0.1815166473388672, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.79560089111328, "rewards/margins": 23.930557250976562, "rewards/rejected": 5.860624313354492, "step": 1325 }, { "epoch": 0.6863354037267081, "grad_norm": 1.6708952188491821, "learning_rate": 9.151593635678962e-06, "loss": 0.2995710074901581, "rewards/accuracies": 0.8515625, "rewards/chosen": 25.283418655395508, "rewards/margins": 20.500757217407227, "rewards/rejected": 4.783730983734131, "step": 1326 }, { "epoch": 0.6868530020703933, "grad_norm": 1.3415136337280273, "learning_rate": 9.150003081284602e-06, "loss": 0.18355891108512878, "rewards/accuracies": 0.890625, "rewards/chosen": 25.661441802978516, "rewards/margins": 21.7808837890625, "rewards/rejected": 3.8741531372070312, "step": 1327 }, { "epoch": 0.6873706004140787, "grad_norm": 1.319084644317627, "learning_rate": 9.148411175826813e-06, "loss": 0.25006330013275146, "rewards/accuracies": 0.8671875, "rewards/chosen": 26.86202049255371, "rewards/margins": 22.582626342773438, "rewards/rejected": 4.283008575439453, "step": 1328 }, { "epoch": 0.687888198757764, "grad_norm": 0.8592891693115234, "learning_rate": 9.146817919823852e-06, "loss": 0.22824305295944214, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.585830688476562, "rewards/margins": 21.354949951171875, "rewards/rejected": 3.2368898391723633, "step": 1329 }, { "epoch": 0.6884057971014492, "grad_norm": 0.8375071883201599, "learning_rate": 9.145223313794412e-06, "loss": 0.18755027651786804, "rewards/accuracies": 0.9375, "rewards/chosen": 25.328094482421875, "rewards/margins": 22.610023498535156, "rewards/rejected": 2.719696044921875, "step": 1330 }, { "epoch": 0.6889233954451346, "grad_norm": 1.2954602241516113, "learning_rate": 9.143627358257632e-06, "loss": 0.25863274931907654, "rewards/accuracies": 0.8671875, "rewards/chosen": 24.297794342041016, "rewards/margins": 22.220638275146484, "rewards/rejected": 2.0836455821990967, "step": 1331 }, { "epoch": 0.6894409937888198, "grad_norm": 0.7399251461029053, "learning_rate": 9.142030053733084e-06, "loss": 0.19446179270744324, "rewards/accuracies": 0.90625, "rewards/chosen": 21.010334014892578, "rewards/margins": 20.26837158203125, "rewards/rejected": 0.7458702325820923, "step": 1332 }, { "epoch": 0.6899585921325052, "grad_norm": 0.5623348951339722, "learning_rate": 9.140431400740784e-06, "loss": 0.1893460750579834, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.942337036132812, "rewards/margins": 20.22519874572754, "rewards/rejected": 0.7212691307067871, "step": 1333 }, { "epoch": 0.6904761904761905, "grad_norm": 1.0847899913787842, "learning_rate": 9.138831399801185e-06, "loss": 0.19697999954223633, "rewards/accuracies": 0.890625, "rewards/chosen": 19.97966766357422, "rewards/margins": 19.0745849609375, "rewards/rejected": 0.9053959846496582, "step": 1334 }, { "epoch": 0.6909937888198758, "grad_norm": 1.0999796390533447, "learning_rate": 9.137230051435176e-06, "loss": 0.2236247956752777, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.11138153076172, "rewards/margins": 20.4957332611084, "rewards/rejected": 0.6181235313415527, "step": 1335 }, { "epoch": 0.6915113871635611, "grad_norm": 0.7735489010810852, "learning_rate": 9.135627356164091e-06, "loss": 0.21809044480323792, "rewards/accuracies": 0.875, "rewards/chosen": 17.603160858154297, "rewards/margins": 17.34027099609375, "rewards/rejected": 0.26343727111816406, "step": 1336 }, { "epoch": 0.6920289855072463, "grad_norm": 0.6420530080795288, "learning_rate": 9.134023314509697e-06, "loss": 0.16856566071510315, "rewards/accuracies": 0.9453125, "rewards/chosen": 17.585689544677734, "rewards/margins": 17.528079986572266, "rewards/rejected": 0.05525314807891846, "step": 1337 }, { "epoch": 0.6925465838509317, "grad_norm": 0.72064608335495, "learning_rate": 9.132417926994203e-06, "loss": 0.16934219002723694, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.118850708007812, "rewards/margins": 18.92425537109375, "rewards/rejected": 0.19336891174316406, "step": 1338 }, { "epoch": 0.693064182194617, "grad_norm": 0.8514429330825806, "learning_rate": 9.130811194140252e-06, "loss": 0.17642340064048767, "rewards/accuracies": 0.9296875, "rewards/chosen": 16.212814331054688, "rewards/margins": 16.009296417236328, "rewards/rejected": 0.1933574676513672, "step": 1339 }, { "epoch": 0.6935817805383023, "grad_norm": 0.872772216796875, "learning_rate": 9.129203116470929e-06, "loss": 0.20122754573822021, "rewards/accuracies": 0.890625, "rewards/chosen": 19.758140563964844, "rewards/margins": 19.54796600341797, "rewards/rejected": 0.20848560333251953, "step": 1340 }, { "epoch": 0.6940993788819876, "grad_norm": 1.3550087213516235, "learning_rate": 9.127593694509755e-06, "loss": 0.2125709503889084, "rewards/accuracies": 0.859375, "rewards/chosen": 19.84581756591797, "rewards/margins": 19.432220458984375, "rewards/rejected": 0.41961538791656494, "step": 1341 }, { "epoch": 0.6946169772256728, "grad_norm": 0.6667285561561584, "learning_rate": 9.12598292878069e-06, "loss": 0.1703319549560547, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.95026397705078, "rewards/margins": 22.495437622070312, "rewards/rejected": 0.4560856819152832, "step": 1342 }, { "epoch": 0.6951345755693582, "grad_norm": 0.7629976272583008, "learning_rate": 9.124370819808129e-06, "loss": 0.14812442660331726, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.548961639404297, "rewards/margins": 21.20972442626953, "rewards/rejected": 0.3355884552001953, "step": 1343 }, { "epoch": 0.6956521739130435, "grad_norm": 1.0128034353256226, "learning_rate": 9.122757368116906e-06, "loss": 0.17225775122642517, "rewards/accuracies": 0.890625, "rewards/chosen": 20.54340171813965, "rewards/margins": 20.26911163330078, "rewards/rejected": 0.2684040069580078, "step": 1344 }, { "epoch": 0.6961697722567288, "grad_norm": 1.1975429058074951, "learning_rate": 9.121142574232293e-06, "loss": 0.20572859048843384, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.411239624023438, "rewards/margins": 20.790695190429688, "rewards/rejected": 0.6287851333618164, "step": 1345 }, { "epoch": 0.6966873706004141, "grad_norm": 1.4247691631317139, "learning_rate": 9.119526438679996e-06, "loss": 0.25060296058654785, "rewards/accuracies": 0.84375, "rewards/chosen": 23.763166427612305, "rewards/margins": 23.326980590820312, "rewards/rejected": 0.4362555742263794, "step": 1346 }, { "epoch": 0.6972049689440993, "grad_norm": 1.2370586395263672, "learning_rate": 9.11790896198616e-06, "loss": 0.17602688074111938, "rewards/accuracies": 0.890625, "rewards/chosen": 21.82217788696289, "rewards/margins": 21.391403198242188, "rewards/rejected": 0.4337291717529297, "step": 1347 }, { "epoch": 0.6977225672877847, "grad_norm": 1.0492851734161377, "learning_rate": 9.116290144677366e-06, "loss": 0.18380510807037354, "rewards/accuracies": 0.921875, "rewards/chosen": 28.04817771911621, "rewards/margins": 26.812103271484375, "rewards/rejected": 1.2368077039718628, "step": 1348 }, { "epoch": 0.69824016563147, "grad_norm": 1.0655252933502197, "learning_rate": 9.114669987280634e-06, "loss": 0.22399762272834778, "rewards/accuracies": 0.859375, "rewards/chosen": 21.066038131713867, "rewards/margins": 20.21006965637207, "rewards/rejected": 0.8539767265319824, "step": 1349 }, { "epoch": 0.6987577639751553, "grad_norm": 0.6349027156829834, "learning_rate": 9.113048490323414e-06, "loss": 0.12275466322898865, "rewards/accuracies": 0.921875, "rewards/chosen": 30.442970275878906, "rewards/margins": 28.3310546875, "rewards/rejected": 2.112253189086914, "step": 1350 }, { "epoch": 0.6992753623188406, "grad_norm": 1.1198253631591797, "learning_rate": 9.1114256543336e-06, "loss": 0.18908628821372986, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.056442260742188, "rewards/margins": 26.932815551757812, "rewards/rejected": 2.119182586669922, "step": 1351 }, { "epoch": 0.6997929606625258, "grad_norm": 1.491450548171997, "learning_rate": 9.109801479839516e-06, "loss": 0.2544964551925659, "rewards/accuracies": 0.890625, "rewards/chosen": 24.282329559326172, "rewards/margins": 22.491920471191406, "rewards/rejected": 1.7906246185302734, "step": 1352 }, { "epoch": 0.7003105590062112, "grad_norm": 0.9790498614311218, "learning_rate": 9.108175967369922e-06, "loss": 0.16947965323925018, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.36570358276367, "rewards/margins": 30.505210876464844, "rewards/rejected": 1.868142008781433, "step": 1353 }, { "epoch": 0.7008281573498965, "grad_norm": 2.434474229812622, "learning_rate": 9.106549117454019e-06, "loss": 0.2036934345960617, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.50330352783203, "rewards/margins": 26.400985717773438, "rewards/rejected": 2.101212501525879, "step": 1354 }, { "epoch": 0.7013457556935818, "grad_norm": 0.9444133639335632, "learning_rate": 9.104920930621438e-06, "loss": 0.19222131371498108, "rewards/accuracies": 0.890625, "rewards/chosen": 28.71910858154297, "rewards/margins": 25.755817413330078, "rewards/rejected": 2.9633216857910156, "step": 1355 }, { "epoch": 0.7018633540372671, "grad_norm": 1.8941590785980225, "learning_rate": 9.103291407402245e-06, "loss": 0.20423783361911774, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.6138916015625, "rewards/margins": 26.998519897460938, "rewards/rejected": 2.6175079345703125, "step": 1356 }, { "epoch": 0.7023809523809523, "grad_norm": 0.739834189414978, "learning_rate": 9.101660548326945e-06, "loss": 0.1792633831501007, "rewards/accuracies": 0.921875, "rewards/chosen": 30.992748260498047, "rewards/margins": 27.133407592773438, "rewards/rejected": 3.872452735900879, "step": 1357 }, { "epoch": 0.7028985507246377, "grad_norm": 0.9672386646270752, "learning_rate": 9.100028353926477e-06, "loss": 0.20944438874721527, "rewards/accuracies": 0.875, "rewards/chosen": 28.388641357421875, "rewards/margins": 25.001327514648438, "rewards/rejected": 3.3877487182617188, "step": 1358 }, { "epoch": 0.703416149068323, "grad_norm": 1.562570333480835, "learning_rate": 9.098394824732212e-06, "loss": 0.19923897087574005, "rewards/accuracies": 0.890625, "rewards/chosen": 31.110795974731445, "rewards/margins": 26.497528076171875, "rewards/rejected": 4.619503021240234, "step": 1359 }, { "epoch": 0.7039337474120083, "grad_norm": 0.7407254576683044, "learning_rate": 9.096759961275959e-06, "loss": 0.18391233682632446, "rewards/accuracies": 0.90625, "rewards/chosen": 30.577682495117188, "rewards/margins": 27.929962158203125, "rewards/rejected": 2.6533775329589844, "step": 1360 }, { "epoch": 0.7044513457556936, "grad_norm": 0.8308426141738892, "learning_rate": 9.095123764089956e-06, "loss": 0.14010632038116455, "rewards/accuracies": 0.953125, "rewards/chosen": 32.2430419921875, "rewards/margins": 29.461769104003906, "rewards/rejected": 2.787609100341797, "step": 1361 }, { "epoch": 0.7049689440993789, "grad_norm": 1.6652028560638428, "learning_rate": 9.093486233706883e-06, "loss": 0.19353961944580078, "rewards/accuracies": 0.875, "rewards/chosen": 29.508525848388672, "rewards/margins": 26.09996795654297, "rewards/rejected": 3.41217041015625, "step": 1362 }, { "epoch": 0.7054865424430642, "grad_norm": 1.272396445274353, "learning_rate": 9.091847370659846e-06, "loss": 0.18422818183898926, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.735254287719727, "rewards/margins": 25.340255737304688, "rewards/rejected": 2.3910112380981445, "step": 1363 }, { "epoch": 0.7060041407867494, "grad_norm": 0.5694389343261719, "learning_rate": 9.090207175482393e-06, "loss": 0.19233182072639465, "rewards/accuracies": 0.890625, "rewards/chosen": 32.17008972167969, "rewards/margins": 28.064617156982422, "rewards/rejected": 4.101161956787109, "step": 1364 }, { "epoch": 0.7065217391304348, "grad_norm": 3.280459403991699, "learning_rate": 9.088565648708496e-06, "loss": 0.21663567423820496, "rewards/accuracies": 0.921875, "rewards/chosen": 37.068939208984375, "rewards/margins": 30.900665283203125, "rewards/rejected": 6.175901889801025, "step": 1365 }, { "epoch": 0.7070393374741201, "grad_norm": 1.654721736907959, "learning_rate": 9.08692279087257e-06, "loss": 0.2306644767522812, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.664859771728516, "rewards/margins": 24.083053588867188, "rewards/rejected": 5.58358097076416, "step": 1366 }, { "epoch": 0.7075569358178054, "grad_norm": 1.6725047826766968, "learning_rate": 9.085278602509459e-06, "loss": 0.18538422882556915, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.262989044189453, "rewards/margins": 20.24151611328125, "rewards/rejected": 6.026542663574219, "step": 1367 }, { "epoch": 0.7080745341614907, "grad_norm": 2.307328462600708, "learning_rate": 9.08363308415444e-06, "loss": 0.160853773355484, "rewards/accuracies": 0.921875, "rewards/chosen": 25.742103576660156, "rewards/margins": 20.78931427001953, "rewards/rejected": 4.956123352050781, "step": 1368 }, { "epoch": 0.7085921325051759, "grad_norm": 2.014719009399414, "learning_rate": 9.081986236343221e-06, "loss": 0.20976361632347107, "rewards/accuracies": 0.90625, "rewards/chosen": 20.17919921875, "rewards/margins": 15.446189880371094, "rewards/rejected": 4.732151031494141, "step": 1369 }, { "epoch": 0.7091097308488613, "grad_norm": 1.2881731986999512, "learning_rate": 9.080338059611947e-06, "loss": 0.24536669254302979, "rewards/accuracies": 0.875, "rewards/chosen": 17.775096893310547, "rewards/margins": 13.672225952148438, "rewards/rejected": 4.102752685546875, "step": 1370 }, { "epoch": 0.7096273291925466, "grad_norm": 0.8941617608070374, "learning_rate": 9.078688554497194e-06, "loss": 0.19124944508075714, "rewards/accuracies": 0.890625, "rewards/chosen": 18.2625732421875, "rewards/margins": 13.802467346191406, "rewards/rejected": 4.46283483505249, "step": 1371 }, { "epoch": 0.7101449275362319, "grad_norm": 1.1311272382736206, "learning_rate": 9.077037721535969e-06, "loss": 0.2274496853351593, "rewards/accuracies": 0.859375, "rewards/chosen": 16.93252182006836, "rewards/margins": 12.603561401367188, "rewards/rejected": 4.32739782333374, "step": 1372 }, { "epoch": 0.7106625258799172, "grad_norm": 1.3665632009506226, "learning_rate": 9.075385561265715e-06, "loss": 0.19051426649093628, "rewards/accuracies": 0.90625, "rewards/chosen": 14.796882629394531, "rewards/margins": 11.72406005859375, "rewards/rejected": 3.0730228424072266, "step": 1373 }, { "epoch": 0.7111801242236024, "grad_norm": 1.4815901517868042, "learning_rate": 9.073732074224305e-06, "loss": 0.2603257894515991, "rewards/accuracies": 0.859375, "rewards/chosen": 14.196693420410156, "rewards/margins": 11.036727905273438, "rewards/rejected": 3.1621391773223877, "step": 1374 }, { "epoch": 0.7116977225672878, "grad_norm": 1.4022293090820312, "learning_rate": 9.072077260950039e-06, "loss": 0.2472599595785141, "rewards/accuracies": 0.8671875, "rewards/chosen": 11.878910064697266, "rewards/margins": 10.001152038574219, "rewards/rejected": 1.8739583492279053, "step": 1375 }, { "epoch": 0.7122153209109731, "grad_norm": 1.5161688327789307, "learning_rate": 9.070421121981656e-06, "loss": 0.2376619428396225, "rewards/accuracies": 0.8515625, "rewards/chosen": 11.688241958618164, "rewards/margins": 9.577224731445312, "rewards/rejected": 2.1124210357666016, "step": 1376 }, { "epoch": 0.7127329192546584, "grad_norm": 0.8882049918174744, "learning_rate": 9.068763657858326e-06, "loss": 0.25370943546295166, "rewards/accuracies": 0.8828125, "rewards/chosen": 8.01506519317627, "rewards/margins": 7.061550140380859, "rewards/rejected": 0.9547767639160156, "step": 1377 }, { "epoch": 0.7132505175983437, "grad_norm": 0.5337031483650208, "learning_rate": 9.067104869119647e-06, "loss": 0.179534912109375, "rewards/accuracies": 0.9296875, "rewards/chosen": 8.210456848144531, "rewards/margins": 7.0832977294921875, "rewards/rejected": 1.1263749599456787, "step": 1378 }, { "epoch": 0.7137681159420289, "grad_norm": 0.7689961791038513, "learning_rate": 9.065444756305647e-06, "loss": 0.2390490174293518, "rewards/accuracies": 0.875, "rewards/chosen": 6.46431303024292, "rewards/margins": 5.750083923339844, "rewards/rejected": 0.7155394554138184, "step": 1379 }, { "epoch": 0.7142857142857143, "grad_norm": 0.6899635195732117, "learning_rate": 9.063783319956794e-06, "loss": 0.19429297745227814, "rewards/accuracies": 0.9453125, "rewards/chosen": 7.744737148284912, "rewards/margins": 6.864688873291016, "rewards/rejected": 0.8791694641113281, "step": 1380 }, { "epoch": 0.7148033126293996, "grad_norm": 0.6171004772186279, "learning_rate": 9.062120560613974e-06, "loss": 0.1865394115447998, "rewards/accuracies": 0.921875, "rewards/chosen": 6.140463829040527, "rewards/margins": 5.472801208496094, "rewards/rejected": 0.6679048538208008, "step": 1381 }, { "epoch": 0.7153209109730849, "grad_norm": 0.5622376203536987, "learning_rate": 9.060456478818513e-06, "loss": 0.18752142786979675, "rewards/accuracies": 0.8984375, "rewards/chosen": 6.759646892547607, "rewards/margins": 5.9488677978515625, "rewards/rejected": 0.8127365112304688, "step": 1382 }, { "epoch": 0.7158385093167702, "grad_norm": 0.5578168034553528, "learning_rate": 9.058791075112165e-06, "loss": 0.1789579689502716, "rewards/accuracies": 0.9140625, "rewards/chosen": 6.596990585327148, "rewards/margins": 6.033660888671875, "rewards/rejected": 0.5627708435058594, "step": 1383 }, { "epoch": 0.7163561076604554, "grad_norm": 0.6336413025856018, "learning_rate": 9.057124350037115e-06, "loss": 0.15666547417640686, "rewards/accuracies": 0.921875, "rewards/chosen": 7.140059471130371, "rewards/margins": 6.3482513427734375, "rewards/rejected": 0.7928409576416016, "step": 1384 }, { "epoch": 0.7168737060041408, "grad_norm": 0.7965361475944519, "learning_rate": 9.055456304135975e-06, "loss": 0.16673815250396729, "rewards/accuracies": 0.921875, "rewards/chosen": 7.967185974121094, "rewards/margins": 7.363533020019531, "rewards/rejected": 0.6028361320495605, "step": 1385 }, { "epoch": 0.717391304347826, "grad_norm": 1.1101059913635254, "learning_rate": 9.053786937951791e-06, "loss": 0.26542380452156067, "rewards/accuracies": 0.859375, "rewards/chosen": 6.718764781951904, "rewards/margins": 6.368007659912109, "rewards/rejected": 0.35072898864746094, "step": 1386 }, { "epoch": 0.7179089026915114, "grad_norm": 0.8569159507751465, "learning_rate": 9.052116252028036e-06, "loss": 0.17976102232933044, "rewards/accuracies": 0.9140625, "rewards/chosen": 9.860431671142578, "rewards/margins": 8.900299072265625, "rewards/rejected": 0.9588692784309387, "step": 1387 }, { "epoch": 0.7184265010351967, "grad_norm": 0.7783175706863403, "learning_rate": 9.050444246908616e-06, "loss": 0.1825607866048813, "rewards/accuracies": 0.890625, "rewards/chosen": 9.121198654174805, "rewards/margins": 8.553157806396484, "rewards/rejected": 0.5675525665283203, "step": 1388 }, { "epoch": 0.718944099378882, "grad_norm": 1.2613853216171265, "learning_rate": 9.048770923137863e-06, "loss": 0.20712049305438995, "rewards/accuracies": 0.890625, "rewards/chosen": 10.66488265991211, "rewards/margins": 9.715362548828125, "rewards/rejected": 0.9479866027832031, "step": 1389 }, { "epoch": 0.7194616977225673, "grad_norm": 1.7391120195388794, "learning_rate": 9.047096281260539e-06, "loss": 0.16777709126472473, "rewards/accuracies": 0.8984375, "rewards/chosen": 11.522988319396973, "rewards/margins": 9.992843627929688, "rewards/rejected": 1.5297794342041016, "step": 1390 }, { "epoch": 0.7199792960662525, "grad_norm": 2.233380079269409, "learning_rate": 9.045420321821836e-06, "loss": 0.23799896240234375, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.757766723632812, "rewards/margins": 11.223655700683594, "rewards/rejected": 1.5361120700836182, "step": 1391 }, { "epoch": 0.7204968944099379, "grad_norm": 1.3134551048278809, "learning_rate": 9.043743045367376e-06, "loss": 0.1600206196308136, "rewards/accuracies": 0.921875, "rewards/chosen": 12.731891632080078, "rewards/margins": 11.351608276367188, "rewards/rejected": 1.3807525634765625, "step": 1392 }, { "epoch": 0.7210144927536232, "grad_norm": 2.316394090652466, "learning_rate": 9.042064452443209e-06, "loss": 0.1694008708000183, "rewards/accuracies": 0.921875, "rewards/chosen": 15.162251472473145, "rewards/margins": 13.079025268554688, "rewards/rejected": 2.08432674407959, "step": 1393 }, { "epoch": 0.7215320910973085, "grad_norm": 1.3751099109649658, "learning_rate": 9.040384543595808e-06, "loss": 0.18279334902763367, "rewards/accuracies": 0.890625, "rewards/chosen": 13.619007110595703, "rewards/margins": 11.958572387695312, "rewards/rejected": 1.6628570556640625, "step": 1394 }, { "epoch": 0.7220496894409938, "grad_norm": 0.8175764083862305, "learning_rate": 9.038703319372086e-06, "loss": 0.1662716567516327, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.607196807861328, "rewards/margins": 13.667457580566406, "rewards/rejected": 1.9412574768066406, "step": 1395 }, { "epoch": 0.722567287784679, "grad_norm": 1.6694748401641846, "learning_rate": 9.037020780319373e-06, "loss": 0.19874805212020874, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.920303344726562, "rewards/margins": 12.725341796875, "rewards/rejected": 2.1945042610168457, "step": 1396 }, { "epoch": 0.7230848861283644, "grad_norm": 1.2957613468170166, "learning_rate": 9.035336926985434e-06, "loss": 0.21329811215400696, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.632877349853516, "rewards/margins": 12.5728759765625, "rewards/rejected": 2.0596771240234375, "step": 1397 }, { "epoch": 0.7236024844720497, "grad_norm": 0.6284244656562805, "learning_rate": 9.033651759918459e-06, "loss": 0.1824466586112976, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.508407592773438, "rewards/margins": 13.382583618164062, "rewards/rejected": 2.120919704437256, "step": 1398 }, { "epoch": 0.724120082815735, "grad_norm": 0.6933081150054932, "learning_rate": 9.031965279667067e-06, "loss": 0.15940305590629578, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.57194709777832, "rewards/margins": 14.068145751953125, "rewards/rejected": 2.5027828216552734, "step": 1399 }, { "epoch": 0.7246376811594203, "grad_norm": 1.7116320133209229, "learning_rate": 9.030277486780302e-06, "loss": 0.19635358452796936, "rewards/accuracies": 0.890625, "rewards/chosen": 15.806144714355469, "rewards/margins": 13.38726806640625, "rewards/rejected": 2.420403003692627, "step": 1400 }, { "epoch": 0.7251552795031055, "grad_norm": 0.6296753287315369, "learning_rate": 9.028588381807639e-06, "loss": 0.19731122255325317, "rewards/accuracies": 0.8671875, "rewards/chosen": 17.217178344726562, "rewards/margins": 13.86151123046875, "rewards/rejected": 3.3546581268310547, "step": 1401 }, { "epoch": 0.7256728778467909, "grad_norm": 0.9792534112930298, "learning_rate": 9.026897965298977e-06, "loss": 0.18650078773498535, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.996742248535156, "rewards/margins": 12.269889831542969, "rewards/rejected": 2.7266578674316406, "step": 1402 }, { "epoch": 0.7261904761904762, "grad_norm": 1.0023552179336548, "learning_rate": 9.025206237804645e-06, "loss": 0.21221783757209778, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.013444900512695, "rewards/margins": 13.526268005371094, "rewards/rejected": 2.4896488189697266, "step": 1403 }, { "epoch": 0.7267080745341615, "grad_norm": 0.983181357383728, "learning_rate": 9.023513199875393e-06, "loss": 0.21147403120994568, "rewards/accuracies": 0.90625, "rewards/chosen": 15.611959457397461, "rewards/margins": 12.805349349975586, "rewards/rejected": 2.809286117553711, "step": 1404 }, { "epoch": 0.7272256728778468, "grad_norm": 1.292016863822937, "learning_rate": 9.021818852062407e-06, "loss": 0.300828754901886, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.74149513244629, "rewards/margins": 13.395156860351562, "rewards/rejected": 3.347292900085449, "step": 1405 }, { "epoch": 0.727743271221532, "grad_norm": 1.1053454875946045, "learning_rate": 9.020123194917292e-06, "loss": 0.17658060789108276, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.71920394897461, "rewards/margins": 15.049690246582031, "rewards/rejected": 2.6672415733337402, "step": 1406 }, { "epoch": 0.7282608695652174, "grad_norm": 0.5531654953956604, "learning_rate": 9.01842622899208e-06, "loss": 0.173194020986557, "rewards/accuracies": 0.890625, "rewards/chosen": 14.935661315917969, "rewards/margins": 12.650848388671875, "rewards/rejected": 2.285675048828125, "step": 1407 }, { "epoch": 0.7287784679089027, "grad_norm": 0.7204647660255432, "learning_rate": 9.016727954839233e-06, "loss": 0.2081482708454132, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.829099655151367, "rewards/margins": 14.042362213134766, "rewards/rejected": 2.788266181945801, "step": 1408 }, { "epoch": 0.729296066252588, "grad_norm": 0.8333263993263245, "learning_rate": 9.015028373011634e-06, "loss": 0.2640780210494995, "rewards/accuracies": 0.8515625, "rewards/chosen": 14.28097915649414, "rewards/margins": 12.078828811645508, "rewards/rejected": 2.2017579078674316, "step": 1409 }, { "epoch": 0.7298136645962733, "grad_norm": 2.2384159564971924, "learning_rate": 9.013327484062595e-06, "loss": 0.19229944050312042, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.3006591796875, "rewards/margins": 13.627510070800781, "rewards/rejected": 2.6766719818115234, "step": 1410 }, { "epoch": 0.7303312629399586, "grad_norm": 2.22210693359375, "learning_rate": 9.011625288545853e-06, "loss": 0.2517033815383911, "rewards/accuracies": 0.875, "rewards/chosen": 15.43032169342041, "rewards/margins": 13.335487365722656, "rewards/rejected": 2.096334457397461, "step": 1411 }, { "epoch": 0.7308488612836439, "grad_norm": 0.7236201763153076, "learning_rate": 9.00992178701557e-06, "loss": 0.20701925456523895, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.702438354492188, "rewards/margins": 12.7125244140625, "rewards/rejected": 1.9831724166870117, "step": 1412 }, { "epoch": 0.7313664596273292, "grad_norm": 1.9738175868988037, "learning_rate": 9.008216980026332e-06, "loss": 0.20372304320335388, "rewards/accuracies": 0.90625, "rewards/chosen": 15.874675750732422, "rewards/margins": 13.359451293945312, "rewards/rejected": 2.514993667602539, "step": 1413 }, { "epoch": 0.7318840579710145, "grad_norm": 0.9132247567176819, "learning_rate": 9.006510868133155e-06, "loss": 0.21220523118972778, "rewards/accuracies": 0.875, "rewards/chosen": 16.01559066772461, "rewards/margins": 13.698007583618164, "rewards/rejected": 2.3195393085479736, "step": 1414 }, { "epoch": 0.7324016563146998, "grad_norm": 1.3518539667129517, "learning_rate": 9.004803451891471e-06, "loss": 0.18518395721912384, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.987306594848633, "rewards/margins": 15.616737365722656, "rewards/rejected": 2.3698577880859375, "step": 1415 }, { "epoch": 0.7329192546583851, "grad_norm": 2.0675911903381348, "learning_rate": 9.003094731857145e-06, "loss": 0.22317779064178467, "rewards/accuracies": 0.890625, "rewards/chosen": 17.397796630859375, "rewards/margins": 15.047470092773438, "rewards/rejected": 2.351055145263672, "step": 1416 }, { "epoch": 0.7334368530020704, "grad_norm": 1.4062004089355469, "learning_rate": 9.001384708586462e-06, "loss": 0.21882736682891846, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.629478454589844, "rewards/margins": 14.664785385131836, "rewards/rejected": 1.9633369445800781, "step": 1417 }, { "epoch": 0.7339544513457557, "grad_norm": 1.1767443418502808, "learning_rate": 8.999673382636133e-06, "loss": 0.25457271933555603, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.13777732849121, "rewards/margins": 14.268234252929688, "rewards/rejected": 1.8772516250610352, "step": 1418 }, { "epoch": 0.734472049689441, "grad_norm": 0.8707999587059021, "learning_rate": 8.99796075456329e-06, "loss": 0.23302920162677765, "rewards/accuracies": 0.859375, "rewards/chosen": 17.719417572021484, "rewards/margins": 14.862102508544922, "rewards/rejected": 2.853282928466797, "step": 1419 }, { "epoch": 0.7349896480331263, "grad_norm": 0.8310787081718445, "learning_rate": 8.996246824925497e-06, "loss": 0.23557816445827484, "rewards/accuracies": 0.875, "rewards/chosen": 16.843326568603516, "rewards/margins": 14.419639587402344, "rewards/rejected": 2.422366142272949, "step": 1420 }, { "epoch": 0.7355072463768116, "grad_norm": 0.791991114616394, "learning_rate": 8.99453159428073e-06, "loss": 0.18276065587997437, "rewards/accuracies": 0.890625, "rewards/chosen": 17.082178115844727, "rewards/margins": 14.700607299804688, "rewards/rejected": 2.381049633026123, "step": 1421 }, { "epoch": 0.7360248447204969, "grad_norm": 1.2348823547363281, "learning_rate": 8.992815063187398e-06, "loss": 0.1757214516401291, "rewards/accuracies": 0.9296875, "rewards/chosen": 17.879276275634766, "rewards/margins": 15.180160522460938, "rewards/rejected": 2.6989173889160156, "step": 1422 }, { "epoch": 0.7365424430641822, "grad_norm": 0.5384665131568909, "learning_rate": 8.99109723220433e-06, "loss": 0.1760246455669403, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.895271301269531, "rewards/margins": 13.291015625, "rewards/rejected": 2.6026999950408936, "step": 1423 }, { "epoch": 0.7370600414078675, "grad_norm": 0.8818725943565369, "learning_rate": 8.989378101890776e-06, "loss": 0.2117079794406891, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.85445785522461, "rewards/margins": 14.931556701660156, "rewards/rejected": 1.9262537956237793, "step": 1424 }, { "epoch": 0.7375776397515528, "grad_norm": 0.6354365348815918, "learning_rate": 8.987657672806413e-06, "loss": 0.15136995911598206, "rewards/accuracies": 0.90625, "rewards/chosen": 22.35857391357422, "rewards/margins": 18.303543090820312, "rewards/rejected": 4.0554351806640625, "step": 1425 }, { "epoch": 0.7380952380952381, "grad_norm": 2.2108612060546875, "learning_rate": 8.985935945511341e-06, "loss": 0.2752939462661743, "rewards/accuracies": 0.8515625, "rewards/chosen": 20.194530487060547, "rewards/margins": 16.089630126953125, "rewards/rejected": 4.110324859619141, "step": 1426 }, { "epoch": 0.7386128364389234, "grad_norm": 1.125920057296753, "learning_rate": 8.984212920566077e-06, "loss": 0.16585256159305573, "rewards/accuracies": 0.9296875, "rewards/chosen": 17.76483154296875, "rewards/margins": 14.735359191894531, "rewards/rejected": 3.0358104705810547, "step": 1427 }, { "epoch": 0.7391304347826086, "grad_norm": 1.4771485328674316, "learning_rate": 8.982488598531566e-06, "loss": 0.2176550328731537, "rewards/accuracies": 0.8671875, "rewards/chosen": 17.50960922241211, "rewards/margins": 14.238465309143066, "rewards/rejected": 3.2718610763549805, "step": 1428 }, { "epoch": 0.739648033126294, "grad_norm": 1.0057226419448853, "learning_rate": 8.980762979969174e-06, "loss": 0.21368913352489471, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.396255493164062, "rewards/margins": 14.299568176269531, "rewards/rejected": 3.094266891479492, "step": 1429 }, { "epoch": 0.7401656314699793, "grad_norm": 1.1063734292984009, "learning_rate": 8.979036065440688e-06, "loss": 0.19911251962184906, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.36687469482422, "rewards/margins": 14.20229721069336, "rewards/rejected": 3.16546630859375, "step": 1430 }, { "epoch": 0.7406832298136646, "grad_norm": 1.169405460357666, "learning_rate": 8.977307855508317e-06, "loss": 0.2268175482749939, "rewards/accuracies": 0.8671875, "rewards/chosen": 15.483415603637695, "rewards/margins": 13.019424438476562, "rewards/rejected": 2.4599742889404297, "step": 1431 }, { "epoch": 0.7412008281573499, "grad_norm": 1.5813778638839722, "learning_rate": 8.975578350734694e-06, "loss": 0.19956491887569427, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.669666290283203, "rewards/margins": 15.522109985351562, "rewards/rejected": 3.147111415863037, "step": 1432 }, { "epoch": 0.7417184265010351, "grad_norm": 1.1775343418121338, "learning_rate": 8.97384755168287e-06, "loss": 0.1868738979101181, "rewards/accuracies": 0.890625, "rewards/chosen": 20.18726348876953, "rewards/margins": 16.331520080566406, "rewards/rejected": 3.8567700386047363, "step": 1433 }, { "epoch": 0.7422360248447205, "grad_norm": 0.5792251825332642, "learning_rate": 8.972115458916318e-06, "loss": 0.11561395227909088, "rewards/accuracies": 0.9453125, "rewards/chosen": 19.457138061523438, "rewards/margins": 16.986419677734375, "rewards/rejected": 2.4716973304748535, "step": 1434 }, { "epoch": 0.7427536231884058, "grad_norm": 0.9254934191703796, "learning_rate": 8.970382072998936e-06, "loss": 0.18263037502765656, "rewards/accuracies": 0.921875, "rewards/chosen": 17.018861770629883, "rewards/margins": 13.981948852539062, "rewards/rejected": 3.034907579421997, "step": 1435 }, { "epoch": 0.7432712215320911, "grad_norm": 1.8943204879760742, "learning_rate": 8.968647394495039e-06, "loss": 0.2294696569442749, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.013553619384766, "rewards/margins": 14.814773559570312, "rewards/rejected": 2.202268123626709, "step": 1436 }, { "epoch": 0.7437888198757764, "grad_norm": 0.74191814661026, "learning_rate": 8.966911423969364e-06, "loss": 0.16480597853660583, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.564373016357422, "rewards/margins": 15.2069091796875, "rewards/rejected": 2.3599166870117188, "step": 1437 }, { "epoch": 0.7443064182194618, "grad_norm": 0.8497828841209412, "learning_rate": 8.96517416198707e-06, "loss": 0.19115635752677917, "rewards/accuracies": 0.90625, "rewards/chosen": 16.42060089111328, "rewards/margins": 14.834671020507812, "rewards/rejected": 1.5889053344726562, "step": 1438 }, { "epoch": 0.744824016563147, "grad_norm": 0.5980980396270752, "learning_rate": 8.963435609113732e-06, "loss": 0.15360309183597565, "rewards/accuracies": 0.9296875, "rewards/chosen": 15.448204040527344, "rewards/margins": 13.71435546875, "rewards/rejected": 1.7305870056152344, "step": 1439 }, { "epoch": 0.7453416149068323, "grad_norm": 1.9813683032989502, "learning_rate": 8.961695765915352e-06, "loss": 0.2489539086818695, "rewards/accuracies": 0.859375, "rewards/chosen": 17.431438446044922, "rewards/margins": 15.089847564697266, "rewards/rejected": 2.344792366027832, "step": 1440 }, { "epoch": 0.7458592132505176, "grad_norm": 0.9115517735481262, "learning_rate": 8.959954632958348e-06, "loss": 0.16742101311683655, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.009475708007812, "rewards/margins": 16.191879272460938, "rewards/rejected": 1.8184070587158203, "step": 1441 }, { "epoch": 0.7463768115942029, "grad_norm": 1.8355610370635986, "learning_rate": 8.958212210809554e-06, "loss": 0.21744287014007568, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.5859375, "rewards/margins": 16.65125274658203, "rewards/rejected": 1.9349536895751953, "step": 1442 }, { "epoch": 0.7468944099378882, "grad_norm": 3.3399481773376465, "learning_rate": 8.956468500036235e-06, "loss": 0.20347833633422852, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.002649307250977, "rewards/margins": 14.856033325195312, "rewards/rejected": 2.1477417945861816, "step": 1443 }, { "epoch": 0.7474120082815735, "grad_norm": 1.3455231189727783, "learning_rate": 8.954723501206063e-06, "loss": 0.11530603468418121, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.752132415771484, "rewards/margins": 17.790481567382812, "rewards/rejected": 1.9620819091796875, "step": 1444 }, { "epoch": 0.7479296066252588, "grad_norm": 1.0398051738739014, "learning_rate": 8.952977214887137e-06, "loss": 0.15024533867835999, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.379302978515625, "rewards/margins": 17.14178466796875, "rewards/rejected": 2.238922119140625, "step": 1445 }, { "epoch": 0.7484472049689441, "grad_norm": 1.9070775508880615, "learning_rate": 8.951229641647975e-06, "loss": 0.29281264543533325, "rewards/accuracies": 0.859375, "rewards/chosen": 17.096240997314453, "rewards/margins": 14.56976318359375, "rewards/rejected": 2.527113914489746, "step": 1446 }, { "epoch": 0.7489648033126294, "grad_norm": 4.486841201782227, "learning_rate": 8.949480782057509e-06, "loss": 0.2191551923751831, "rewards/accuracies": 0.8515625, "rewards/chosen": 18.517433166503906, "rewards/margins": 16.176002502441406, "rewards/rejected": 2.340740203857422, "step": 1447 }, { "epoch": 0.7494824016563147, "grad_norm": 1.0377509593963623, "learning_rate": 8.947730636685094e-06, "loss": 0.16786587238311768, "rewards/accuracies": 0.8984375, "rewards/chosen": 17.968109130859375, "rewards/margins": 16.283859252929688, "rewards/rejected": 1.6832447052001953, "step": 1448 }, { "epoch": 0.75, "grad_norm": 1.5001031160354614, "learning_rate": 8.945979206100504e-06, "loss": 0.2028660774230957, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.488556861877441, "rewards/margins": 13.727310180664062, "rewards/rejected": 1.7574286460876465, "step": 1449 }, { "epoch": 0.7505175983436853, "grad_norm": 1.8152403831481934, "learning_rate": 8.944226490873925e-06, "loss": 0.16328346729278564, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.317054748535156, "rewards/margins": 16.098400115966797, "rewards/rejected": 2.212003707885742, "step": 1450 }, { "epoch": 0.7510351966873706, "grad_norm": 0.7823221683502197, "learning_rate": 8.942472491575974e-06, "loss": 0.18369054794311523, "rewards/accuracies": 0.8828125, "rewards/chosen": 15.882791519165039, "rewards/margins": 13.709587097167969, "rewards/rejected": 2.1703720092773438, "step": 1451 }, { "epoch": 0.7515527950310559, "grad_norm": 0.9756906032562256, "learning_rate": 8.940717208777672e-06, "loss": 0.15060070157051086, "rewards/accuracies": 0.9296875, "rewards/chosen": 16.944416046142578, "rewards/margins": 15.1220703125, "rewards/rejected": 1.823087215423584, "step": 1452 }, { "epoch": 0.7520703933747412, "grad_norm": 3.1017675399780273, "learning_rate": 8.938960643050465e-06, "loss": 0.2144334316253662, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.42477798461914, "rewards/margins": 14.439842224121094, "rewards/rejected": 1.9847335815429688, "step": 1453 }, { "epoch": 0.7525879917184265, "grad_norm": 1.0287601947784424, "learning_rate": 8.937202794966215e-06, "loss": 0.17859336733818054, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.916366577148438, "rewards/margins": 14.993408203125, "rewards/rejected": 1.924142837524414, "step": 1454 }, { "epoch": 0.7531055900621118, "grad_norm": 2.6598565578460693, "learning_rate": 8.935443665097206e-06, "loss": 0.21409323811531067, "rewards/accuracies": 0.890625, "rewards/chosen": 16.54621124267578, "rewards/margins": 14.395606994628906, "rewards/rejected": 2.1524276733398438, "step": 1455 }, { "epoch": 0.7536231884057971, "grad_norm": 1.5604733228683472, "learning_rate": 8.93368325401613e-06, "loss": 0.12902404367923737, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.53628921508789, "rewards/margins": 15.683502197265625, "rewards/rejected": 1.8528976440429688, "step": 1456 }, { "epoch": 0.7541407867494824, "grad_norm": 0.9747694134712219, "learning_rate": 8.931921562296106e-06, "loss": 0.17548677325248718, "rewards/accuracies": 0.8984375, "rewards/chosen": 18.046504974365234, "rewards/margins": 15.944812774658203, "rewards/rejected": 2.101207733154297, "step": 1457 }, { "epoch": 0.7546583850931677, "grad_norm": 1.1698284149169922, "learning_rate": 8.930158590510662e-06, "loss": 0.16958856582641602, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.309791564941406, "rewards/margins": 15.558029174804688, "rewards/rejected": 1.7492828369140625, "step": 1458 }, { "epoch": 0.755175983436853, "grad_norm": 1.728611946105957, "learning_rate": 8.92839433923375e-06, "loss": 0.1803603172302246, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.012046813964844, "rewards/margins": 15.174179077148438, "rewards/rejected": 1.8350505828857422, "step": 1459 }, { "epoch": 0.7556935817805382, "grad_norm": 2.464827299118042, "learning_rate": 8.926628809039732e-06, "loss": 0.23268504440784454, "rewards/accuracies": 0.8828125, "rewards/chosen": 17.850914001464844, "rewards/margins": 15.787246704101562, "rewards/rejected": 2.061285972595215, "step": 1460 }, { "epoch": 0.7562111801242236, "grad_norm": 1.8440402746200562, "learning_rate": 8.92486200050339e-06, "loss": 0.2014925628900528, "rewards/accuracies": 0.890625, "rewards/chosen": 19.465877532958984, "rewards/margins": 17.392333984375, "rewards/rejected": 2.0760345458984375, "step": 1461 }, { "epoch": 0.7567287784679089, "grad_norm": 0.9258764386177063, "learning_rate": 8.92309391419992e-06, "loss": 0.20160970091819763, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.306184768676758, "rewards/margins": 16.701438903808594, "rewards/rejected": 2.606328010559082, "step": 1462 }, { "epoch": 0.7572463768115942, "grad_norm": 2.156285047531128, "learning_rate": 8.921324550704935e-06, "loss": 0.23181787133216858, "rewards/accuracies": 0.875, "rewards/chosen": 15.170717239379883, "rewards/margins": 13.665580749511719, "rewards/rejected": 1.5137519836425781, "step": 1463 }, { "epoch": 0.7577639751552795, "grad_norm": 1.757704257965088, "learning_rate": 8.919553910594467e-06, "loss": 0.23098258674144745, "rewards/accuracies": 0.890625, "rewards/chosen": 17.053407669067383, "rewards/margins": 15.257194519042969, "rewards/rejected": 1.8006420135498047, "step": 1464 }, { "epoch": 0.7582815734989649, "grad_norm": 1.0850026607513428, "learning_rate": 8.917781994444958e-06, "loss": 0.17460864782333374, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.69921875, "rewards/margins": 16.47930908203125, "rewards/rejected": 2.2124576568603516, "step": 1465 }, { "epoch": 0.7587991718426501, "grad_norm": 0.8841776251792908, "learning_rate": 8.916008802833271e-06, "loss": 0.17370560765266418, "rewards/accuracies": 0.90625, "rewards/chosen": 17.201385498046875, "rewards/margins": 15.053939819335938, "rewards/rejected": 2.147909164428711, "step": 1466 }, { "epoch": 0.7593167701863354, "grad_norm": 0.9515126943588257, "learning_rate": 8.914234336336678e-06, "loss": 0.2517049312591553, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.04637908935547, "rewards/margins": 16.117431640625, "rewards/rejected": 1.9355182647705078, "step": 1467 }, { "epoch": 0.7598343685300207, "grad_norm": 0.6261983513832092, "learning_rate": 8.912458595532871e-06, "loss": 0.21249297261238098, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.60962677001953, "rewards/margins": 14.892684936523438, "rewards/rejected": 1.7152576446533203, "step": 1468 }, { "epoch": 0.760351966873706, "grad_norm": 0.6025402545928955, "learning_rate": 8.910681580999953e-06, "loss": 0.18511870503425598, "rewards/accuracies": 0.90625, "rewards/chosen": 18.611146926879883, "rewards/margins": 16.42829132080078, "rewards/rejected": 2.1844472885131836, "step": 1469 }, { "epoch": 0.7608695652173914, "grad_norm": 0.8031814694404602, "learning_rate": 8.90890329331645e-06, "loss": 0.17603525519371033, "rewards/accuracies": 0.90625, "rewards/chosen": 17.676925659179688, "rewards/margins": 15.977523803710938, "rewards/rejected": 1.6995437145233154, "step": 1470 }, { "epoch": 0.7613871635610766, "grad_norm": 1.2275804281234741, "learning_rate": 8.907123733061287e-06, "loss": 0.1519744098186493, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.757287979125977, "rewards/margins": 17.32750701904297, "rewards/rejected": 2.4308242797851562, "step": 1471 }, { "epoch": 0.7619047619047619, "grad_norm": 1.495090126991272, "learning_rate": 8.905342900813821e-06, "loss": 0.23126643896102905, "rewards/accuracies": 0.875, "rewards/chosen": 17.58441162109375, "rewards/margins": 15.614521026611328, "rewards/rejected": 1.9736671447753906, "step": 1472 }, { "epoch": 0.7624223602484472, "grad_norm": 0.6886290311813354, "learning_rate": 8.903560797153812e-06, "loss": 0.17629486322402954, "rewards/accuracies": 0.890625, "rewards/chosen": 19.287933349609375, "rewards/margins": 17.764423370361328, "rewards/rejected": 1.5185871124267578, "step": 1473 }, { "epoch": 0.7629399585921325, "grad_norm": 0.9780370593070984, "learning_rate": 8.901777422661435e-06, "loss": 0.2302759289741516, "rewards/accuracies": 0.875, "rewards/chosen": 17.677677154541016, "rewards/margins": 15.430442810058594, "rewards/rejected": 2.2460479736328125, "step": 1474 }, { "epoch": 0.7634575569358178, "grad_norm": 1.611624002456665, "learning_rate": 8.89999277791728e-06, "loss": 0.20577198266983032, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.039817810058594, "rewards/margins": 17.030319213867188, "rewards/rejected": 2.0051841735839844, "step": 1475 }, { "epoch": 0.7639751552795031, "grad_norm": 0.6739711761474609, "learning_rate": 8.898206863502356e-06, "loss": 0.16281016170978546, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.620561599731445, "rewards/margins": 20.207534790039062, "rewards/rejected": 2.410503387451172, "step": 1476 }, { "epoch": 0.7644927536231884, "grad_norm": 0.7843494415283203, "learning_rate": 8.896419679998076e-06, "loss": 0.1702946424484253, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.82879638671875, "rewards/margins": 19.733680725097656, "rewards/rejected": 2.0931453704833984, "step": 1477 }, { "epoch": 0.7650103519668737, "grad_norm": 0.5784754753112793, "learning_rate": 8.894631227986273e-06, "loss": 0.17233455181121826, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.833871841430664, "rewards/margins": 18.552169799804688, "rewards/rejected": 2.2841100692749023, "step": 1478 }, { "epoch": 0.765527950310559, "grad_norm": 0.9342695474624634, "learning_rate": 8.892841508049186e-06, "loss": 0.17205660045146942, "rewards/accuracies": 0.875, "rewards/chosen": 24.005096435546875, "rewards/margins": 21.309371948242188, "rewards/rejected": 2.6900253295898438, "step": 1479 }, { "epoch": 0.7660455486542443, "grad_norm": 0.8070079684257507, "learning_rate": 8.891050520769475e-06, "loss": 0.19478121399879456, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.393495559692383, "rewards/margins": 19.98950958251953, "rewards/rejected": 2.4044013023376465, "step": 1480 }, { "epoch": 0.7665631469979296, "grad_norm": 1.100319743156433, "learning_rate": 8.889258266730209e-06, "loss": 0.1322392076253891, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.490575790405273, "rewards/margins": 20.22613525390625, "rewards/rejected": 3.2689015865325928, "step": 1481 }, { "epoch": 0.7670807453416149, "grad_norm": 2.526069164276123, "learning_rate": 8.887464746514867e-06, "loss": 0.2008875012397766, "rewards/accuracies": 0.890625, "rewards/chosen": 23.998043060302734, "rewards/margins": 20.340972900390625, "rewards/rejected": 3.6578369140625, "step": 1482 }, { "epoch": 0.7675983436853002, "grad_norm": 1.3564074039459229, "learning_rate": 8.885669960707344e-06, "loss": 0.14863519370555878, "rewards/accuracies": 0.9375, "rewards/chosen": 23.13595962524414, "rewards/margins": 19.364593505859375, "rewards/rejected": 3.7764244079589844, "step": 1483 }, { "epoch": 0.7681159420289855, "grad_norm": 1.7211799621582031, "learning_rate": 8.883873909891948e-06, "loss": 0.18175527453422546, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.438430786132812, "rewards/margins": 18.269989013671875, "rewards/rejected": 4.16477108001709, "step": 1484 }, { "epoch": 0.7686335403726708, "grad_norm": 0.7827067375183105, "learning_rate": 8.882076594653389e-06, "loss": 0.17024463415145874, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.656158447265625, "rewards/margins": 19.501480102539062, "rewards/rejected": 4.157527923583984, "step": 1485 }, { "epoch": 0.7691511387163561, "grad_norm": 1.803146243095398, "learning_rate": 8.880278015576802e-06, "loss": 0.24867156147956848, "rewards/accuracies": 0.8828125, "rewards/chosen": 19.896223068237305, "rewards/margins": 16.340606689453125, "rewards/rejected": 3.5522117614746094, "step": 1486 }, { "epoch": 0.7696687370600414, "grad_norm": 0.8031122088432312, "learning_rate": 8.878478173247726e-06, "loss": 0.12823569774627686, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.941295623779297, "rewards/margins": 20.718345642089844, "rewards/rejected": 5.223346710205078, "step": 1487 }, { "epoch": 0.7701863354037267, "grad_norm": 5.100681781768799, "learning_rate": 8.876677068252114e-06, "loss": 0.3044777512550354, "rewards/accuracies": 0.8828125, "rewards/chosen": 22.2027587890625, "rewards/margins": 17.12091064453125, "rewards/rejected": 5.081676483154297, "step": 1488 }, { "epoch": 0.770703933747412, "grad_norm": 0.83672034740448, "learning_rate": 8.874874701176324e-06, "loss": 0.1766539365053177, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.940765380859375, "rewards/margins": 15.022964477539062, "rewards/rejected": 3.9171295166015625, "step": 1489 }, { "epoch": 0.7712215320910973, "grad_norm": 0.9325284361839294, "learning_rate": 8.873071072607136e-06, "loss": 0.13872192800045013, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.013071060180664, "rewards/margins": 16.449600219726562, "rewards/rejected": 3.5619983673095703, "step": 1490 }, { "epoch": 0.7717391304347826, "grad_norm": 0.7905405759811401, "learning_rate": 8.871266183131728e-06, "loss": 0.16523262858390808, "rewards/accuracies": 0.921875, "rewards/chosen": 18.75323486328125, "rewards/margins": 14.457298278808594, "rewards/rejected": 4.292872428894043, "step": 1491 }, { "epoch": 0.772256728778468, "grad_norm": 1.9373527765274048, "learning_rate": 8.8694600333377e-06, "loss": 0.2173001915216446, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.44860076904297, "rewards/margins": 13.356025695800781, "rewards/rejected": 3.0917510986328125, "step": 1492 }, { "epoch": 0.7727743271221532, "grad_norm": 1.1849809885025024, "learning_rate": 8.867652623813054e-06, "loss": 0.23686255514621735, "rewards/accuracies": 0.9140625, "rewards/chosen": 15.366565704345703, "rewards/margins": 12.648075103759766, "rewards/rejected": 2.7175755500793457, "step": 1493 }, { "epoch": 0.7732919254658385, "grad_norm": 1.5489188432693481, "learning_rate": 8.865843955146207e-06, "loss": 0.16476599872112274, "rewards/accuracies": 0.921875, "rewards/chosen": 19.297386169433594, "rewards/margins": 15.247268676757812, "rewards/rejected": 4.052450656890869, "step": 1494 }, { "epoch": 0.7738095238095238, "grad_norm": 0.9366694092750549, "learning_rate": 8.86403402792598e-06, "loss": 0.22880345582962036, "rewards/accuracies": 0.8671875, "rewards/chosen": 16.591657638549805, "rewards/margins": 12.930404663085938, "rewards/rejected": 3.657747268676758, "step": 1495 }, { "epoch": 0.7743271221532091, "grad_norm": 0.8965792655944824, "learning_rate": 8.862222842741614e-06, "loss": 0.19148999452590942, "rewards/accuracies": 0.90625, "rewards/chosen": 17.116580963134766, "rewards/margins": 13.781745910644531, "rewards/rejected": 3.3358588218688965, "step": 1496 }, { "epoch": 0.7748447204968945, "grad_norm": 0.623699426651001, "learning_rate": 8.86041040018275e-06, "loss": 0.12594890594482422, "rewards/accuracies": 0.9296875, "rewards/chosen": 18.238872528076172, "rewards/margins": 14.442169189453125, "rewards/rejected": 3.795799732208252, "step": 1497 }, { "epoch": 0.7753623188405797, "grad_norm": 0.8674623966217041, "learning_rate": 8.858596700839442e-06, "loss": 0.1968538612127304, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.93071174621582, "rewards/margins": 12.91887092590332, "rewards/rejected": 3.014780044555664, "step": 1498 }, { "epoch": 0.775879917184265, "grad_norm": 0.7967759966850281, "learning_rate": 8.856781745302153e-06, "loss": 0.17997068166732788, "rewards/accuracies": 0.90625, "rewards/chosen": 15.573661804199219, "rewards/margins": 12.045547485351562, "rewards/rejected": 3.5264663696289062, "step": 1499 }, { "epoch": 0.7763975155279503, "grad_norm": 2.040335178375244, "learning_rate": 8.854965534161755e-06, "loss": 0.23669210076332092, "rewards/accuracies": 0.859375, "rewards/chosen": 16.380624771118164, "rewards/margins": 12.469097137451172, "rewards/rejected": 3.9117064476013184, "step": 1500 }, { "epoch": 0.7769151138716356, "grad_norm": 1.0819971561431885, "learning_rate": 8.85314806800953e-06, "loss": 0.1601833999156952, "rewards/accuracies": 0.921875, "rewards/chosen": 17.408935546875, "rewards/margins": 13.886566162109375, "rewards/rejected": 3.52215576171875, "step": 1501 }, { "epoch": 0.777432712215321, "grad_norm": 1.4061471223831177, "learning_rate": 8.851329347437162e-06, "loss": 0.19732382893562317, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.920772552490234, "rewards/margins": 12.742721557617188, "rewards/rejected": 3.174118995666504, "step": 1502 }, { "epoch": 0.7779503105590062, "grad_norm": 0.8286568522453308, "learning_rate": 8.849509373036758e-06, "loss": 0.1772901862859726, "rewards/accuracies": 0.8828125, "rewards/chosen": 16.916202545166016, "rewards/margins": 13.827957153320312, "rewards/rejected": 3.093104600906372, "step": 1503 }, { "epoch": 0.7784679089026915, "grad_norm": 1.1629923582077026, "learning_rate": 8.847688145400814e-06, "loss": 0.17780031263828278, "rewards/accuracies": 0.9296875, "rewards/chosen": 16.47076416015625, "rewards/margins": 13.592987060546875, "rewards/rejected": 2.8766462802886963, "step": 1504 }, { "epoch": 0.7789855072463768, "grad_norm": 1.0967460870742798, "learning_rate": 8.84586566512225e-06, "loss": 0.24584175646305084, "rewards/accuracies": 0.8515625, "rewards/chosen": 15.787979125976562, "rewards/margins": 12.435333251953125, "rewards/rejected": 3.3528594970703125, "step": 1505 }, { "epoch": 0.7795031055900621, "grad_norm": 1.1708232164382935, "learning_rate": 8.844041932794385e-06, "loss": 0.20121368765830994, "rewards/accuracies": 0.890625, "rewards/chosen": 16.866973876953125, "rewards/margins": 13.837478637695312, "rewards/rejected": 3.029022216796875, "step": 1506 }, { "epoch": 0.7800207039337475, "grad_norm": 0.8356667757034302, "learning_rate": 8.842216949010949e-06, "loss": 0.16476988792419434, "rewards/accuracies": 0.90625, "rewards/chosen": 17.675308227539062, "rewards/margins": 14.334358215332031, "rewards/rejected": 3.3460311889648438, "step": 1507 }, { "epoch": 0.7805383022774327, "grad_norm": 0.870667576789856, "learning_rate": 8.840390714366077e-06, "loss": 0.1738722324371338, "rewards/accuracies": 0.921875, "rewards/chosen": 18.199630737304688, "rewards/margins": 15.170318603515625, "rewards/rejected": 3.0283522605895996, "step": 1508 }, { "epoch": 0.781055900621118, "grad_norm": 0.9731136560440063, "learning_rate": 8.838563229454316e-06, "loss": 0.2000979781150818, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.159934997558594, "rewards/margins": 13.954544067382812, "rewards/rejected": 2.2036561965942383, "step": 1509 }, { "epoch": 0.7815734989648033, "grad_norm": 0.8482347726821899, "learning_rate": 8.836734494870613e-06, "loss": 0.23493081331253052, "rewards/accuracies": 0.90625, "rewards/chosen": 15.26577377319336, "rewards/margins": 13.14654541015625, "rewards/rejected": 2.1229171752929688, "step": 1510 }, { "epoch": 0.7820910973084886, "grad_norm": 0.7827019691467285, "learning_rate": 8.83490451121033e-06, "loss": 0.19168873131275177, "rewards/accuracies": 0.890625, "rewards/chosen": 15.104169845581055, "rewards/margins": 12.7998046875, "rewards/rejected": 2.3010706901550293, "step": 1511 }, { "epoch": 0.782608695652174, "grad_norm": 0.9073243141174316, "learning_rate": 8.83307327906923e-06, "loss": 0.18639439344406128, "rewards/accuracies": 0.90625, "rewards/chosen": 17.622621536254883, "rewards/margins": 15.219047546386719, "rewards/rejected": 2.4002151489257812, "step": 1512 }, { "epoch": 0.7831262939958592, "grad_norm": 2.38716721534729, "learning_rate": 8.831240799043481e-06, "loss": 0.14889411628246307, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.888233184814453, "rewards/margins": 16.478389739990234, "rewards/rejected": 2.4188919067382812, "step": 1513 }, { "epoch": 0.7836438923395446, "grad_norm": 0.46340861916542053, "learning_rate": 8.829407071729663e-06, "loss": 0.10494781285524368, "rewards/accuracies": 0.9765625, "rewards/chosen": 19.66411018371582, "rewards/margins": 17.194839477539062, "rewards/rejected": 2.471550226211548, "step": 1514 }, { "epoch": 0.7841614906832298, "grad_norm": 0.5877734422683716, "learning_rate": 8.82757209772476e-06, "loss": 0.16260260343551636, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.897043228149414, "rewards/margins": 15.357307434082031, "rewards/rejected": 1.540414810180664, "step": 1515 }, { "epoch": 0.7846790890269151, "grad_norm": 1.1876869201660156, "learning_rate": 8.825735877626158e-06, "loss": 0.25460055470466614, "rewards/accuracies": 0.875, "rewards/chosen": 17.109630584716797, "rewards/margins": 14.989044189453125, "rewards/rejected": 2.120525360107422, "step": 1516 }, { "epoch": 0.7851966873706004, "grad_norm": 1.2132128477096558, "learning_rate": 8.823898412031652e-06, "loss": 0.20249810814857483, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.600414276123047, "rewards/margins": 13.922134399414062, "rewards/rejected": 1.678971290588379, "step": 1517 }, { "epoch": 0.7857142857142857, "grad_norm": 1.362892985343933, "learning_rate": 8.822059701539444e-06, "loss": 0.182047039270401, "rewards/accuracies": 0.9375, "rewards/chosen": 17.390949249267578, "rewards/margins": 15.754287719726562, "rewards/rejected": 1.6383113861083984, "step": 1518 }, { "epoch": 0.7862318840579711, "grad_norm": 0.610079824924469, "learning_rate": 8.820219746748139e-06, "loss": 0.1858803778886795, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.975902557373047, "rewards/margins": 15.293464660644531, "rewards/rejected": 1.6816139221191406, "step": 1519 }, { "epoch": 0.7867494824016563, "grad_norm": 1.0928324460983276, "learning_rate": 8.818378548256747e-06, "loss": 0.18177512288093567, "rewards/accuracies": 0.90625, "rewards/chosen": 18.803363800048828, "rewards/margins": 17.173431396484375, "rewards/rejected": 1.6299686431884766, "step": 1520 }, { "epoch": 0.7872670807453416, "grad_norm": 1.5311232805252075, "learning_rate": 8.816536106664683e-06, "loss": 0.2572329044342041, "rewards/accuracies": 0.859375, "rewards/chosen": 17.448143005371094, "rewards/margins": 15.989837646484375, "rewards/rejected": 1.4634037017822266, "step": 1521 }, { "epoch": 0.7877846790890269, "grad_norm": 1.4039632081985474, "learning_rate": 8.81469242257177e-06, "loss": 0.1844344139099121, "rewards/accuracies": 0.90625, "rewards/chosen": 18.95459747314453, "rewards/margins": 17.94464874267578, "rewards/rejected": 1.010617971420288, "step": 1522 }, { "epoch": 0.7883022774327122, "grad_norm": 2.5325558185577393, "learning_rate": 8.812847496578229e-06, "loss": 0.2956421375274658, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.142169952392578, "rewards/margins": 19.372333526611328, "rewards/rejected": 1.7690610885620117, "step": 1523 }, { "epoch": 0.7888198757763976, "grad_norm": 1.1751164197921753, "learning_rate": 8.81100132928469e-06, "loss": 0.20528121292591095, "rewards/accuracies": 0.90625, "rewards/chosen": 18.260446548461914, "rewards/margins": 16.802734375, "rewards/rejected": 1.4596576690673828, "step": 1524 }, { "epoch": 0.7893374741200828, "grad_norm": 1.2797104120254517, "learning_rate": 8.809153921292189e-06, "loss": 0.17713198065757751, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.70708656311035, "rewards/margins": 16.641510009765625, "rewards/rejected": 1.0685882568359375, "step": 1525 }, { "epoch": 0.7898550724637681, "grad_norm": 1.5875478982925415, "learning_rate": 8.807305273202157e-06, "loss": 0.1641969531774521, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.422348022460938, "rewards/margins": 19.570201873779297, "rewards/rejected": 1.8553333282470703, "step": 1526 }, { "epoch": 0.7903726708074534, "grad_norm": 0.9216684103012085, "learning_rate": 8.805455385616441e-06, "loss": 0.21069109439849854, "rewards/accuracies": 0.890625, "rewards/chosen": 17.610767364501953, "rewards/margins": 16.135696411132812, "rewards/rejected": 1.4732303619384766, "step": 1527 }, { "epoch": 0.7908902691511387, "grad_norm": 1.5073857307434082, "learning_rate": 8.803604259137281e-06, "loss": 0.2219831496477127, "rewards/accuracies": 0.859375, "rewards/chosen": 19.648334503173828, "rewards/margins": 18.31548309326172, "rewards/rejected": 1.3350067138671875, "step": 1528 }, { "epoch": 0.7914078674948241, "grad_norm": 2.183314323425293, "learning_rate": 8.801751894367327e-06, "loss": 0.21121637523174286, "rewards/accuracies": 0.8828125, "rewards/chosen": 18.281070709228516, "rewards/margins": 16.997119903564453, "rewards/rejected": 1.280202865600586, "step": 1529 }, { "epoch": 0.7919254658385093, "grad_norm": 0.599686861038208, "learning_rate": 8.799898291909629e-06, "loss": 0.19991794228553772, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.61957550048828, "rewards/margins": 19.933887481689453, "rewards/rejected": 1.6838990449905396, "step": 1530 }, { "epoch": 0.7924430641821946, "grad_norm": 0.892876386642456, "learning_rate": 8.79804345236764e-06, "loss": 0.24459302425384521, "rewards/accuracies": 0.8828125, "rewards/chosen": 14.710707664489746, "rewards/margins": 13.719199180603027, "rewards/rejected": 0.9944181442260742, "step": 1531 }, { "epoch": 0.7929606625258799, "grad_norm": 0.6710295081138611, "learning_rate": 8.796187376345216e-06, "loss": 0.163845032453537, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.89971923828125, "rewards/margins": 19.781150817871094, "rewards/rejected": 1.1184425354003906, "step": 1532 }, { "epoch": 0.7934782608695652, "grad_norm": 0.624254584312439, "learning_rate": 8.794330064446617e-06, "loss": 0.1717340350151062, "rewards/accuracies": 0.9375, "rewards/chosen": 18.896564483642578, "rewards/margins": 17.41864776611328, "rewards/rejected": 1.4764938354492188, "step": 1533 }, { "epoch": 0.7939958592132506, "grad_norm": 1.0062963962554932, "learning_rate": 8.792471517276504e-06, "loss": 0.17061102390289307, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.920490264892578, "rewards/margins": 18.250041961669922, "rewards/rejected": 1.6706008911132812, "step": 1534 }, { "epoch": 0.7945134575569358, "grad_norm": 1.692072868347168, "learning_rate": 8.790611735439943e-06, "loss": 0.2142280638217926, "rewards/accuracies": 0.875, "rewards/chosen": 21.088497161865234, "rewards/margins": 19.273895263671875, "rewards/rejected": 1.8092327117919922, "step": 1535 }, { "epoch": 0.7950310559006211, "grad_norm": 0.7985432147979736, "learning_rate": 8.788750719542394e-06, "loss": 0.1418519914150238, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.250614166259766, "rewards/margins": 21.90106201171875, "rewards/rejected": 2.3481178283691406, "step": 1536 }, { "epoch": 0.7955486542443064, "grad_norm": 0.5999130010604858, "learning_rate": 8.786888470189726e-06, "loss": 0.11372684687376022, "rewards/accuracies": 0.96875, "rewards/chosen": 23.58578872680664, "rewards/margins": 21.74843978881836, "rewards/rejected": 1.8369178771972656, "step": 1537 }, { "epoch": 0.7960662525879917, "grad_norm": 1.7511684894561768, "learning_rate": 8.78502498798821e-06, "loss": 0.16914376616477966, "rewards/accuracies": 0.90625, "rewards/chosen": 25.11157989501953, "rewards/margins": 22.323387145996094, "rewards/rejected": 2.791661262512207, "step": 1538 }, { "epoch": 0.796583850931677, "grad_norm": 2.376311779022217, "learning_rate": 8.783160273544516e-06, "loss": 0.25109294056892395, "rewards/accuracies": 0.90625, "rewards/chosen": 26.27420997619629, "rewards/margins": 23.59868621826172, "rewards/rejected": 2.673739433288574, "step": 1539 }, { "epoch": 0.7971014492753623, "grad_norm": 1.0332953929901123, "learning_rate": 8.781294327465714e-06, "loss": 0.1531771868467331, "rewards/accuracies": 0.953125, "rewards/chosen": 26.823312759399414, "rewards/margins": 23.461349487304688, "rewards/rejected": 3.3542861938476562, "step": 1540 }, { "epoch": 0.7976190476190477, "grad_norm": 2.807231903076172, "learning_rate": 8.779427150359275e-06, "loss": 0.25944051146507263, "rewards/accuracies": 0.875, "rewards/chosen": 26.745220184326172, "rewards/margins": 23.981918334960938, "rewards/rejected": 2.766500473022461, "step": 1541 }, { "epoch": 0.7981366459627329, "grad_norm": 1.0058140754699707, "learning_rate": 8.777558742833074e-06, "loss": 0.16835716366767883, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.63348388671875, "rewards/margins": 22.474576950073242, "rewards/rejected": 3.156970977783203, "step": 1542 }, { "epoch": 0.7986542443064182, "grad_norm": 1.0164693593978882, "learning_rate": 8.775689105495383e-06, "loss": 0.21233408153057098, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.502120971679688, "rewards/margins": 26.640716552734375, "rewards/rejected": 3.8642959594726562, "step": 1543 }, { "epoch": 0.7991718426501035, "grad_norm": 1.4398012161254883, "learning_rate": 8.773818238954877e-06, "loss": 0.18809011578559875, "rewards/accuracies": 0.875, "rewards/chosen": 27.6527042388916, "rewards/margins": 23.908050537109375, "rewards/rejected": 3.7411956787109375, "step": 1544 }, { "epoch": 0.7996894409937888, "grad_norm": 1.371180772781372, "learning_rate": 8.77194614382063e-06, "loss": 0.20220376551151276, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.86993980407715, "rewards/margins": 23.192646026611328, "rewards/rejected": 3.682392120361328, "step": 1545 }, { "epoch": 0.8002070393374742, "grad_norm": 1.6180284023284912, "learning_rate": 8.770072820702116e-06, "loss": 0.1936064213514328, "rewards/accuracies": 0.90625, "rewards/chosen": 30.136444091796875, "rewards/margins": 24.892990112304688, "rewards/rejected": 5.243488788604736, "step": 1546 }, { "epoch": 0.8007246376811594, "grad_norm": 1.292481541633606, "learning_rate": 8.768198270209207e-06, "loss": 0.20210790634155273, "rewards/accuracies": 0.90625, "rewards/chosen": 30.09928321838379, "rewards/margins": 25.383773803710938, "rewards/rejected": 4.714897155761719, "step": 1547 }, { "epoch": 0.8012422360248447, "grad_norm": 2.8349413871765137, "learning_rate": 8.76632249295218e-06, "loss": 0.2830200791358948, "rewards/accuracies": 0.859375, "rewards/chosen": 22.563100814819336, "rewards/margins": 19.062843322753906, "rewards/rejected": 3.4976072311401367, "step": 1548 }, { "epoch": 0.80175983436853, "grad_norm": 2.1052939891815186, "learning_rate": 8.764445489541706e-06, "loss": 0.24662503600120544, "rewards/accuracies": 0.875, "rewards/chosen": 21.750459671020508, "rewards/margins": 18.6038818359375, "rewards/rejected": 3.1492080688476562, "step": 1549 }, { "epoch": 0.8022774327122153, "grad_norm": 0.5288955569267273, "learning_rate": 8.762567260588854e-06, "loss": 0.13123226165771484, "rewards/accuracies": 0.921875, "rewards/chosen": 23.12674331665039, "rewards/margins": 20.513534545898438, "rewards/rejected": 2.6155147552490234, "step": 1550 }, { "epoch": 0.8027950310559007, "grad_norm": 1.2062042951583862, "learning_rate": 8.760687806705102e-06, "loss": 0.17021287977695465, "rewards/accuracies": 0.890625, "rewards/chosen": 22.17041015625, "rewards/margins": 19.500612258911133, "rewards/rejected": 2.6690425872802734, "step": 1551 }, { "epoch": 0.8033126293995859, "grad_norm": 0.4985055923461914, "learning_rate": 8.758807128502312e-06, "loss": 0.16523227095603943, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.181896209716797, "rewards/margins": 19.141551971435547, "rewards/rejected": 3.0309505462646484, "step": 1552 }, { "epoch": 0.8038302277432712, "grad_norm": 1.1721389293670654, "learning_rate": 8.756925226592758e-06, "loss": 0.1982356607913971, "rewards/accuracies": 0.90625, "rewards/chosen": 20.05373764038086, "rewards/margins": 17.405899047851562, "rewards/rejected": 2.6443252563476562, "step": 1553 }, { "epoch": 0.8043478260869565, "grad_norm": 1.1990166902542114, "learning_rate": 8.755042101589105e-06, "loss": 0.22237972915172577, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.843481063842773, "rewards/margins": 13.970617294311523, "rewards/rejected": 1.8749139308929443, "step": 1554 }, { "epoch": 0.8048654244306418, "grad_norm": 1.6806691884994507, "learning_rate": 8.753157754104416e-06, "loss": 0.24132627248764038, "rewards/accuracies": 0.84375, "rewards/chosen": 16.436187744140625, "rewards/margins": 14.177766799926758, "rewards/rejected": 2.2601537704467773, "step": 1555 }, { "epoch": 0.8053830227743272, "grad_norm": 0.9155763387680054, "learning_rate": 8.751272184752157e-06, "loss": 0.17065384984016418, "rewards/accuracies": 0.8984375, "rewards/chosen": 15.951339721679688, "rewards/margins": 13.745243072509766, "rewards/rejected": 2.2057509422302246, "step": 1556 }, { "epoch": 0.8059006211180124, "grad_norm": 0.9765603542327881, "learning_rate": 8.749385394146186e-06, "loss": 0.16976743936538696, "rewards/accuracies": 0.9140625, "rewards/chosen": 16.99920654296875, "rewards/margins": 15.024673461914062, "rewards/rejected": 1.9735450744628906, "step": 1557 }, { "epoch": 0.8064182194616977, "grad_norm": 0.9397842288017273, "learning_rate": 8.747497382900763e-06, "loss": 0.19408513605594635, "rewards/accuracies": 0.875, "rewards/chosen": 13.782054901123047, "rewards/margins": 11.966262817382812, "rewards/rejected": 1.8182337284088135, "step": 1558 }, { "epoch": 0.806935817805383, "grad_norm": 1.4429326057434082, "learning_rate": 8.745608151630544e-06, "loss": 0.23007580637931824, "rewards/accuracies": 0.875, "rewards/chosen": 13.605562210083008, "rewards/margins": 11.480239868164062, "rewards/rejected": 2.1235504150390625, "step": 1559 }, { "epoch": 0.8074534161490683, "grad_norm": 1.1879074573516846, "learning_rate": 8.74371770095058e-06, "loss": 0.18751497566699982, "rewards/accuracies": 0.8671875, "rewards/chosen": 14.444599151611328, "rewards/margins": 12.57766342163086, "rewards/rejected": 1.8640360832214355, "step": 1560 }, { "epoch": 0.8079710144927537, "grad_norm": 0.973679780960083, "learning_rate": 8.741826031476323e-06, "loss": 0.23457759618759155, "rewards/accuracies": 0.8671875, "rewards/chosen": 11.446338653564453, "rewards/margins": 10.082206726074219, "rewards/rejected": 1.3622527122497559, "step": 1561 }, { "epoch": 0.8084886128364389, "grad_norm": 1.2959181070327759, "learning_rate": 8.739933143823619e-06, "loss": 0.20532965660095215, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.12552261352539, "rewards/margins": 12.073116302490234, "rewards/rejected": 2.053396224975586, "step": 1562 }, { "epoch": 0.8090062111801242, "grad_norm": 2.2438368797302246, "learning_rate": 8.73803903860871e-06, "loss": 0.195832759141922, "rewards/accuracies": 0.8984375, "rewards/chosen": 14.533153533935547, "rewards/margins": 12.238922119140625, "rewards/rejected": 2.295498847961426, "step": 1563 }, { "epoch": 0.8095238095238095, "grad_norm": 0.8992484211921692, "learning_rate": 8.736143716448236e-06, "loss": 0.208410382270813, "rewards/accuracies": 0.9140625, "rewards/chosen": 14.390047073364258, "rewards/margins": 12.03997802734375, "rewards/rejected": 2.3534278869628906, "step": 1564 }, { "epoch": 0.8100414078674948, "grad_norm": 0.8998428583145142, "learning_rate": 8.734247177959234e-06, "loss": 0.18718987703323364, "rewards/accuracies": 0.90625, "rewards/chosen": 14.405956268310547, "rewards/margins": 12.257568359375, "rewards/rejected": 2.152897357940674, "step": 1565 }, { "epoch": 0.8105590062111802, "grad_norm": 0.6565580368041992, "learning_rate": 8.732349423759135e-06, "loss": 0.19599711894989014, "rewards/accuracies": 0.8984375, "rewards/chosen": 10.624038696289062, "rewards/margins": 9.403961181640625, "rewards/rejected": 1.221977949142456, "step": 1566 }, { "epoch": 0.8110766045548654, "grad_norm": 1.4032444953918457, "learning_rate": 8.730450454465764e-06, "loss": 0.25487861037254333, "rewards/accuracies": 0.8828125, "rewards/chosen": 12.888347625732422, "rewards/margins": 10.995288848876953, "rewards/rejected": 1.8931455612182617, "step": 1567 }, { "epoch": 0.8115942028985508, "grad_norm": 1.4204124212265015, "learning_rate": 8.728550270697349e-06, "loss": 0.26859182119369507, "rewards/accuracies": 0.84375, "rewards/chosen": 13.956645965576172, "rewards/margins": 11.471336364746094, "rewards/rejected": 2.4841670989990234, "step": 1568 }, { "epoch": 0.812111801242236, "grad_norm": 0.9618015289306641, "learning_rate": 8.726648873072507e-06, "loss": 0.2797141969203949, "rewards/accuracies": 0.8671875, "rewards/chosen": 13.332786560058594, "rewards/margins": 11.164527893066406, "rewards/rejected": 2.173739433288574, "step": 1569 }, { "epoch": 0.8126293995859213, "grad_norm": 0.8129897713661194, "learning_rate": 8.72474626221025e-06, "loss": 0.1877897083759308, "rewards/accuracies": 0.90625, "rewards/chosen": 14.075899124145508, "rewards/margins": 11.690715789794922, "rewards/rejected": 2.3874568939208984, "step": 1570 }, { "epoch": 0.8131469979296067, "grad_norm": 0.7764129638671875, "learning_rate": 8.722842438729989e-06, "loss": 0.1917910873889923, "rewards/accuracies": 0.8984375, "rewards/chosen": 12.132233619689941, "rewards/margins": 10.576065063476562, "rewards/rejected": 1.5531482696533203, "step": 1571 }, { "epoch": 0.8136645962732919, "grad_norm": 0.6201814413070679, "learning_rate": 8.720937403251523e-06, "loss": 0.19062161445617676, "rewards/accuracies": 0.9296875, "rewards/chosen": 12.792901992797852, "rewards/margins": 10.995956420898438, "rewards/rejected": 1.7956957817077637, "step": 1572 }, { "epoch": 0.8141821946169773, "grad_norm": 0.9325059056282043, "learning_rate": 8.719031156395054e-06, "loss": 0.25654417276382446, "rewards/accuracies": 0.8671875, "rewards/chosen": 12.17853832244873, "rewards/margins": 10.571746826171875, "rewards/rejected": 1.6110939979553223, "step": 1573 }, { "epoch": 0.8146997929606625, "grad_norm": 0.896163284778595, "learning_rate": 8.717123698781176e-06, "loss": 0.23406574130058289, "rewards/accuracies": 0.859375, "rewards/chosen": 11.298896789550781, "rewards/margins": 9.934745788574219, "rewards/rejected": 1.3632001876831055, "step": 1574 }, { "epoch": 0.8152173913043478, "grad_norm": 0.6364295482635498, "learning_rate": 8.715215031030871e-06, "loss": 0.17412082850933075, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.172443389892578, "rewards/margins": 11.598175048828125, "rewards/rejected": 1.5775680541992188, "step": 1575 }, { "epoch": 0.8157349896480331, "grad_norm": 0.6829884052276611, "learning_rate": 8.713305153765522e-06, "loss": 0.20917582511901855, "rewards/accuracies": 0.9140625, "rewards/chosen": 13.929208755493164, "rewards/margins": 11.591972351074219, "rewards/rejected": 2.338780403137207, "step": 1576 }, { "epoch": 0.8162525879917184, "grad_norm": 0.6792503595352173, "learning_rate": 8.711394067606903e-06, "loss": 0.19513824582099915, "rewards/accuracies": 0.9296875, "rewards/chosen": 14.436275482177734, "rewards/margins": 12.47637939453125, "rewards/rejected": 1.9617042541503906, "step": 1577 }, { "epoch": 0.8167701863354038, "grad_norm": 0.7741829752922058, "learning_rate": 8.709481773177182e-06, "loss": 0.14487382769584656, "rewards/accuracies": 0.9375, "rewards/chosen": 15.195991516113281, "rewards/margins": 13.229148864746094, "rewards/rejected": 1.9672508239746094, "step": 1578 }, { "epoch": 0.817287784679089, "grad_norm": 0.5487321019172668, "learning_rate": 8.70756827109892e-06, "loss": 0.13538341224193573, "rewards/accuracies": 0.9609375, "rewards/chosen": 13.42213249206543, "rewards/margins": 11.72370719909668, "rewards/rejected": 1.7006454467773438, "step": 1579 }, { "epoch": 0.8178053830227743, "grad_norm": 0.7222735285758972, "learning_rate": 8.705653561995071e-06, "loss": 0.20137576758861542, "rewards/accuracies": 0.921875, "rewards/chosen": 12.903722763061523, "rewards/margins": 11.311790466308594, "rewards/rejected": 1.5923571586608887, "step": 1580 }, { "epoch": 0.8183229813664596, "grad_norm": 1.1244810819625854, "learning_rate": 8.703737646488985e-06, "loss": 0.21393197774887085, "rewards/accuracies": 0.875, "rewards/chosen": 15.403657913208008, "rewards/margins": 13.507110595703125, "rewards/rejected": 1.9010032415390015, "step": 1581 }, { "epoch": 0.8188405797101449, "grad_norm": 0.5996379852294922, "learning_rate": 8.701820525204397e-06, "loss": 0.13614393770694733, "rewards/accuracies": 0.9296875, "rewards/chosen": 17.031755447387695, "rewards/margins": 15.082984924316406, "rewards/rejected": 1.9521551132202148, "step": 1582 }, { "epoch": 0.8193581780538303, "grad_norm": 0.891806423664093, "learning_rate": 8.699902198765444e-06, "loss": 0.13945922255516052, "rewards/accuracies": 0.9453125, "rewards/chosen": 17.84337615966797, "rewards/margins": 15.820976257324219, "rewards/rejected": 2.0235214233398438, "step": 1583 }, { "epoch": 0.8198757763975155, "grad_norm": 1.2353012561798096, "learning_rate": 8.69798266779665e-06, "loss": 0.22010186314582825, "rewards/accuracies": 0.921875, "rewards/chosen": 13.381446838378906, "rewards/margins": 11.877967834472656, "rewards/rejected": 1.5041650533676147, "step": 1584 }, { "epoch": 0.8203933747412008, "grad_norm": 1.156489372253418, "learning_rate": 8.696061932922933e-06, "loss": 0.17969512939453125, "rewards/accuracies": 0.90625, "rewards/chosen": 18.891357421875, "rewards/margins": 16.597946166992188, "rewards/rejected": 2.292755126953125, "step": 1585 }, { "epoch": 0.8209109730848861, "grad_norm": 1.6078534126281738, "learning_rate": 8.694139994769598e-06, "loss": 0.18192443251609802, "rewards/accuracies": 0.921875, "rewards/chosen": 20.101451873779297, "rewards/margins": 17.656723022460938, "rewards/rejected": 2.4391117095947266, "step": 1586 }, { "epoch": 0.8214285714285714, "grad_norm": 2.409668207168579, "learning_rate": 8.69221685396235e-06, "loss": 0.20988407731056213, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.185848236083984, "rewards/margins": 17.253219604492188, "rewards/rejected": 2.9289803504943848, "step": 1587 }, { "epoch": 0.8219461697722568, "grad_norm": 1.137399673461914, "learning_rate": 8.69029251112728e-06, "loss": 0.20028789341449738, "rewards/accuracies": 0.921875, "rewards/chosen": 22.624465942382812, "rewards/margins": 19.60601806640625, "rewards/rejected": 3.013204574584961, "step": 1588 }, { "epoch": 0.822463768115942, "grad_norm": 1.0058737993240356, "learning_rate": 8.688366966890874e-06, "loss": 0.13319462537765503, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.513015747070312, "rewards/margins": 20.319385528564453, "rewards/rejected": 3.1905746459960938, "step": 1589 }, { "epoch": 0.8229813664596274, "grad_norm": 1.074078917503357, "learning_rate": 8.68644022188e-06, "loss": 0.24827823042869568, "rewards/accuracies": 0.890625, "rewards/chosen": 21.677047729492188, "rewards/margins": 18.784713745117188, "rewards/rejected": 2.896759033203125, "step": 1590 }, { "epoch": 0.8234989648033126, "grad_norm": 1.1534326076507568, "learning_rate": 8.684512276721933e-06, "loss": 0.186112642288208, "rewards/accuracies": 0.890625, "rewards/chosen": 22.509750366210938, "rewards/margins": 19.3685302734375, "rewards/rejected": 3.1388344764709473, "step": 1591 }, { "epoch": 0.8240165631469979, "grad_norm": 1.0523664951324463, "learning_rate": 8.682583132044321e-06, "loss": 0.2501497268676758, "rewards/accuracies": 0.890625, "rewards/chosen": 17.737285614013672, "rewards/margins": 15.858734130859375, "rewards/rejected": 1.87847900390625, "step": 1592 }, { "epoch": 0.8245341614906833, "grad_norm": 0.4899676442146301, "learning_rate": 8.680652788475217e-06, "loss": 0.10848671197891235, "rewards/accuracies": 0.9375, "rewards/chosen": 25.015335083007812, "rewards/margins": 21.433303833007812, "rewards/rejected": 3.5791168212890625, "step": 1593 }, { "epoch": 0.8250517598343685, "grad_norm": 1.358573079109192, "learning_rate": 8.678721246643055e-06, "loss": 0.22141291201114655, "rewards/accuracies": 0.9140625, "rewards/chosen": 18.952857971191406, "rewards/margins": 16.61219024658203, "rewards/rejected": 2.3355510234832764, "step": 1594 }, { "epoch": 0.8255693581780539, "grad_norm": 0.8278517723083496, "learning_rate": 8.676788507176663e-06, "loss": 0.1616213321685791, "rewards/accuracies": 0.921875, "rewards/chosen": 21.110759735107422, "rewards/margins": 18.213668823242188, "rewards/rejected": 2.8968124389648438, "step": 1595 }, { "epoch": 0.8260869565217391, "grad_norm": 2.138925552368164, "learning_rate": 8.674854570705258e-06, "loss": 0.20387804508209229, "rewards/accuracies": 0.8828125, "rewards/chosen": 25.358612060546875, "rewards/margins": 21.871337890625, "rewards/rejected": 3.4903128147125244, "step": 1596 }, { "epoch": 0.8266045548654244, "grad_norm": 1.1737157106399536, "learning_rate": 8.672919437858449e-06, "loss": 0.1313878893852234, "rewards/accuracies": 0.90625, "rewards/chosen": 25.60015106201172, "rewards/margins": 22.461563110351562, "rewards/rejected": 3.1362380981445312, "step": 1597 }, { "epoch": 0.8271221532091098, "grad_norm": 1.1664565801620483, "learning_rate": 8.670983109266231e-06, "loss": 0.19005286693572998, "rewards/accuracies": 0.890625, "rewards/chosen": 22.571239471435547, "rewards/margins": 19.433631896972656, "rewards/rejected": 3.1390132904052734, "step": 1598 }, { "epoch": 0.827639751552795, "grad_norm": 2.198463201522827, "learning_rate": 8.66904558555899e-06, "loss": 0.20189577341079712, "rewards/accuracies": 0.90625, "rewards/chosen": 21.19367218017578, "rewards/margins": 18.496978759765625, "rewards/rejected": 2.700113296508789, "step": 1599 }, { "epoch": 0.8281573498964804, "grad_norm": 0.6998844742774963, "learning_rate": 8.6671068673675e-06, "loss": 0.18554866313934326, "rewards/accuracies": 0.90625, "rewards/chosen": 21.91079330444336, "rewards/margins": 19.414329528808594, "rewards/rejected": 2.4925498962402344, "step": 1600 }, { "epoch": 0.8286749482401656, "grad_norm": 2.1039822101593018, "learning_rate": 8.665166955322926e-06, "loss": 0.16014264523983002, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.085819244384766, "rewards/margins": 21.284019470214844, "rewards/rejected": 2.8018722534179688, "step": 1601 }, { "epoch": 0.8291925465838509, "grad_norm": 0.594372034072876, "learning_rate": 8.663225850056818e-06, "loss": 0.13379420340061188, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.800029754638672, "rewards/margins": 20.746627807617188, "rewards/rejected": 3.0587730407714844, "step": 1602 }, { "epoch": 0.8297101449275363, "grad_norm": 0.8362223505973816, "learning_rate": 8.66128355220112e-06, "loss": 0.21275150775909424, "rewards/accuracies": 0.859375, "rewards/chosen": 19.890018463134766, "rewards/margins": 17.291656494140625, "rewards/rejected": 2.603851318359375, "step": 1603 }, { "epoch": 0.8302277432712215, "grad_norm": 2.4544942378997803, "learning_rate": 8.659340062388157e-06, "loss": 0.1351478099822998, "rewards/accuracies": 0.9375, "rewards/chosen": 24.145313262939453, "rewards/margins": 20.566192626953125, "rewards/rejected": 3.5747318267822266, "step": 1604 }, { "epoch": 0.8307453416149069, "grad_norm": 0.9102903604507446, "learning_rate": 8.657395381250652e-06, "loss": 0.23979204893112183, "rewards/accuracies": 0.90625, "rewards/chosen": 21.21251106262207, "rewards/margins": 17.645675659179688, "rewards/rejected": 3.5622005462646484, "step": 1605 }, { "epoch": 0.8312629399585921, "grad_norm": 0.9096129536628723, "learning_rate": 8.655449509421706e-06, "loss": 0.16221974790096283, "rewards/accuracies": 0.90625, "rewards/chosen": 23.999650955200195, "rewards/margins": 20.70545196533203, "rewards/rejected": 3.2999839782714844, "step": 1606 }, { "epoch": 0.8317805383022774, "grad_norm": 0.89423006772995, "learning_rate": 8.65350244753481e-06, "loss": 0.1894785761833191, "rewards/accuracies": 0.890625, "rewards/chosen": 22.781248092651367, "rewards/margins": 19.178207397460938, "rewards/rejected": 3.5921974182128906, "step": 1607 }, { "epoch": 0.8322981366459627, "grad_norm": 2.254055976867676, "learning_rate": 8.651554196223849e-06, "loss": 0.23260736465454102, "rewards/accuracies": 0.8203125, "rewards/chosen": 18.856657028198242, "rewards/margins": 16.64446258544922, "rewards/rejected": 2.2144060134887695, "step": 1608 }, { "epoch": 0.832815734989648, "grad_norm": 1.799221158027649, "learning_rate": 8.649604756123085e-06, "loss": 0.20032812654972076, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.371721267700195, "rewards/margins": 17.3165283203125, "rewards/rejected": 3.0551528930664062, "step": 1609 }, { "epoch": 0.8333333333333334, "grad_norm": 0.8217980265617371, "learning_rate": 8.647654127867175e-06, "loss": 0.20823051035404205, "rewards/accuracies": 0.890625, "rewards/chosen": 20.88031005859375, "rewards/margins": 18.00722885131836, "rewards/rejected": 2.8817062377929688, "step": 1610 }, { "epoch": 0.8338509316770186, "grad_norm": 2.1066734790802, "learning_rate": 8.645702312091161e-06, "loss": 0.1701355129480362, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.847091674804688, "rewards/margins": 19.54266357421875, "rewards/rejected": 2.3059206008911133, "step": 1611 }, { "epoch": 0.8343685300207039, "grad_norm": 1.1023615598678589, "learning_rate": 8.643749309430469e-06, "loss": 0.15005265176296234, "rewards/accuracies": 0.9375, "rewards/chosen": 19.725008010864258, "rewards/margins": 17.578399658203125, "rewards/rejected": 2.1486339569091797, "step": 1612 }, { "epoch": 0.8348861283643892, "grad_norm": 1.347899317741394, "learning_rate": 8.641795120520915e-06, "loss": 0.16061024367809296, "rewards/accuracies": 0.90625, "rewards/chosen": 23.209396362304688, "rewards/margins": 20.905914306640625, "rewards/rejected": 2.305694580078125, "step": 1613 }, { "epoch": 0.8354037267080745, "grad_norm": 1.7655454874038696, "learning_rate": 8.639839745998698e-06, "loss": 0.30321258306503296, "rewards/accuracies": 0.8359375, "rewards/chosen": 16.6575984954834, "rewards/margins": 14.95206069946289, "rewards/rejected": 1.7036705017089844, "step": 1614 }, { "epoch": 0.8359213250517599, "grad_norm": 1.0264554023742676, "learning_rate": 8.637883186500401e-06, "loss": 0.21001553535461426, "rewards/accuracies": 0.90625, "rewards/chosen": 18.028915405273438, "rewards/margins": 16.21027374267578, "rewards/rejected": 1.8192834854125977, "step": 1615 }, { "epoch": 0.8364389233954451, "grad_norm": 0.9383524656295776, "learning_rate": 8.635925442663003e-06, "loss": 0.1514473855495453, "rewards/accuracies": 0.90625, "rewards/chosen": 22.78510093688965, "rewards/margins": 20.078903198242188, "rewards/rejected": 2.7026944160461426, "step": 1616 }, { "epoch": 0.8369565217391305, "grad_norm": 1.109527349472046, "learning_rate": 8.633966515123855e-06, "loss": 0.2024383544921875, "rewards/accuracies": 0.90625, "rewards/chosen": 20.721843719482422, "rewards/margins": 18.65185546875, "rewards/rejected": 2.0654096603393555, "step": 1617 }, { "epoch": 0.8374741200828157, "grad_norm": 1.1398729085922241, "learning_rate": 8.632006404520706e-06, "loss": 0.2001267969608307, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.524429321289062, "rewards/margins": 18.29547882080078, "rewards/rejected": 2.233226776123047, "step": 1618 }, { "epoch": 0.837991718426501, "grad_norm": 0.9639199376106262, "learning_rate": 8.630045111491679e-06, "loss": 0.22494098544120789, "rewards/accuracies": 0.890625, "rewards/chosen": 20.0115909576416, "rewards/margins": 18.091529846191406, "rewards/rejected": 1.924966812133789, "step": 1619 }, { "epoch": 0.8385093167701864, "grad_norm": 1.3550630807876587, "learning_rate": 8.628082636675288e-06, "loss": 0.16878741979599, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.660083770751953, "rewards/margins": 19.517791748046875, "rewards/rejected": 2.146484375, "step": 1620 }, { "epoch": 0.8390269151138716, "grad_norm": 1.7378078699111938, "learning_rate": 8.626118980710435e-06, "loss": 0.2207174450159073, "rewards/accuracies": 0.890625, "rewards/chosen": 21.548246383666992, "rewards/margins": 19.1846981048584, "rewards/rejected": 2.365971088409424, "step": 1621 }, { "epoch": 0.839544513457557, "grad_norm": 1.3119096755981445, "learning_rate": 8.6241541442364e-06, "loss": 0.27851369976997375, "rewards/accuracies": 0.8671875, "rewards/chosen": 19.83109474182129, "rewards/margins": 18.300628662109375, "rewards/rejected": 1.5271368026733398, "step": 1622 }, { "epoch": 0.8400621118012422, "grad_norm": 0.6867672204971313, "learning_rate": 8.622188127892845e-06, "loss": 0.16810201108455658, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.629314422607422, "rewards/margins": 21.725929260253906, "rewards/rejected": 1.9071969985961914, "step": 1623 }, { "epoch": 0.8405797101449275, "grad_norm": 0.6727194786071777, "learning_rate": 8.620220932319826e-06, "loss": 0.1803305298089981, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.209388732910156, "rewards/margins": 19.316312789916992, "rewards/rejected": 1.8910675048828125, "step": 1624 }, { "epoch": 0.8410973084886129, "grad_norm": 0.641450047492981, "learning_rate": 8.618252558157779e-06, "loss": 0.17820899188518524, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.149770736694336, "rewards/margins": 20.087570190429688, "rewards/rejected": 2.0547332763671875, "step": 1625 }, { "epoch": 0.8416149068322981, "grad_norm": 0.6147380471229553, "learning_rate": 8.616283006047516e-06, "loss": 0.16242945194244385, "rewards/accuracies": 0.921875, "rewards/chosen": 19.748849868774414, "rewards/margins": 18.102264404296875, "rewards/rejected": 1.651863694190979, "step": 1626 }, { "epoch": 0.8421325051759835, "grad_norm": 1.2677569389343262, "learning_rate": 8.614312276630245e-06, "loss": 0.1585741490125656, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.43750762939453, "rewards/margins": 18.968708038330078, "rewards/rejected": 1.4670696258544922, "step": 1627 }, { "epoch": 0.8426501035196687, "grad_norm": 0.8489712476730347, "learning_rate": 8.61234037054755e-06, "loss": 0.1724843680858612, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.804668426513672, "rewards/margins": 19.21253204345703, "rewards/rejected": 1.5875072479248047, "step": 1628 }, { "epoch": 0.843167701863354, "grad_norm": 1.5668233633041382, "learning_rate": 8.610367288441396e-06, "loss": 0.16600826382637024, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.766326904296875, "rewards/margins": 20.007522583007812, "rewards/rejected": 1.7620611190795898, "step": 1629 }, { "epoch": 0.8436853002070394, "grad_norm": 1.4312282800674438, "learning_rate": 8.60839303095414e-06, "loss": 0.20691843330860138, "rewards/accuracies": 0.90625, "rewards/chosen": 22.395109176635742, "rewards/margins": 20.481292724609375, "rewards/rejected": 1.9120025634765625, "step": 1630 }, { "epoch": 0.8442028985507246, "grad_norm": 0.838746964931488, "learning_rate": 8.606417598728508e-06, "loss": 0.12788988649845123, "rewards/accuracies": 0.953125, "rewards/chosen": 24.466533660888672, "rewards/margins": 22.25189208984375, "rewards/rejected": 2.210186004638672, "step": 1631 }, { "epoch": 0.84472049689441, "grad_norm": 1.1211193799972534, "learning_rate": 8.604440992407623e-06, "loss": 0.2475622147321701, "rewards/accuracies": 0.875, "rewards/chosen": 18.296234130859375, "rewards/margins": 16.93902587890625, "rewards/rejected": 1.3551902770996094, "step": 1632 }, { "epoch": 0.8452380952380952, "grad_norm": 2.2041854858398438, "learning_rate": 8.60246321263498e-06, "loss": 0.2220609486103058, "rewards/accuracies": 0.90625, "rewards/chosen": 24.536075592041016, "rewards/margins": 21.793678283691406, "rewards/rejected": 2.743302345275879, "step": 1633 }, { "epoch": 0.8457556935817805, "grad_norm": 1.0408904552459717, "learning_rate": 8.600484260054458e-06, "loss": 0.20596745610237122, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.637962341308594, "rewards/margins": 19.625228881835938, "rewards/rejected": 2.009880542755127, "step": 1634 }, { "epoch": 0.8462732919254659, "grad_norm": 1.2816827297210693, "learning_rate": 8.598504135310323e-06, "loss": 0.17756807804107666, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.951366424560547, "rewards/margins": 21.669776916503906, "rewards/rejected": 2.2877025604248047, "step": 1635 }, { "epoch": 0.8467908902691511, "grad_norm": 1.1569017171859741, "learning_rate": 8.596522839047219e-06, "loss": 0.17479847371578217, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.751672744750977, "rewards/margins": 20.251083374023438, "rewards/rejected": 2.5049896240234375, "step": 1636 }, { "epoch": 0.8473084886128365, "grad_norm": 0.9406325817108154, "learning_rate": 8.59454037191017e-06, "loss": 0.16158095002174377, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.648876190185547, "rewards/margins": 21.313026428222656, "rewards/rejected": 2.3364124298095703, "step": 1637 }, { "epoch": 0.8478260869565217, "grad_norm": 0.9745233654975891, "learning_rate": 8.59255673454458e-06, "loss": 0.16216664016246796, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.4641170501709, "rewards/margins": 23.40557098388672, "rewards/rejected": 4.051624298095703, "step": 1638 }, { "epoch": 0.848343685300207, "grad_norm": 0.6827297210693359, "learning_rate": 8.590571927596239e-06, "loss": 0.15413177013397217, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.587757110595703, "rewards/margins": 21.62523651123047, "rewards/rejected": 2.9636988639831543, "step": 1639 }, { "epoch": 0.8488612836438924, "grad_norm": 1.0091794729232788, "learning_rate": 8.588585951711315e-06, "loss": 0.1509200930595398, "rewards/accuracies": 0.890625, "rewards/chosen": 26.2763671875, "rewards/margins": 22.38367462158203, "rewards/rejected": 3.8901004791259766, "step": 1640 }, { "epoch": 0.8493788819875776, "grad_norm": 1.2798421382904053, "learning_rate": 8.586598807536356e-06, "loss": 0.22308143973350525, "rewards/accuracies": 0.921875, "rewards/chosen": 27.940231323242188, "rewards/margins": 22.30282211303711, "rewards/rejected": 5.635467529296875, "step": 1641 }, { "epoch": 0.849896480331263, "grad_norm": 0.6183639168739319, "learning_rate": 8.584610495718294e-06, "loss": 0.10760553181171417, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.375347137451172, "rewards/margins": 25.036991119384766, "rewards/rejected": 5.336907386779785, "step": 1642 }, { "epoch": 0.8504140786749482, "grad_norm": 1.091686725616455, "learning_rate": 8.582621016904434e-06, "loss": 0.23229311406612396, "rewards/accuracies": 0.90625, "rewards/chosen": 26.55547523498535, "rewards/margins": 22.05951690673828, "rewards/rejected": 4.492465972900391, "step": 1643 }, { "epoch": 0.8509316770186336, "grad_norm": 1.3835378885269165, "learning_rate": 8.580630371742467e-06, "loss": 0.21763336658477783, "rewards/accuracies": 0.890625, "rewards/chosen": 26.923656463623047, "rewards/margins": 22.122055053710938, "rewards/rejected": 4.801265716552734, "step": 1644 }, { "epoch": 0.8514492753623188, "grad_norm": 1.032395839691162, "learning_rate": 8.57863856088046e-06, "loss": 0.19881445169448853, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.666671752929688, "rewards/margins": 21.446937561035156, "rewards/rejected": 6.222209930419922, "step": 1645 }, { "epoch": 0.8519668737060041, "grad_norm": 1.2085641622543335, "learning_rate": 8.576645584966867e-06, "loss": 0.15839342772960663, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.20348358154297, "rewards/margins": 25.30157470703125, "rewards/rejected": 6.898914337158203, "step": 1646 }, { "epoch": 0.8524844720496895, "grad_norm": 1.0914521217346191, "learning_rate": 8.574651444650512e-06, "loss": 0.22414414584636688, "rewards/accuracies": 0.8671875, "rewards/chosen": 29.323514938354492, "rewards/margins": 21.81126594543457, "rewards/rejected": 7.512913703918457, "step": 1647 }, { "epoch": 0.8530020703933747, "grad_norm": 1.0697356462478638, "learning_rate": 8.5726561405806e-06, "loss": 0.18129035830497742, "rewards/accuracies": 0.90625, "rewards/chosen": 32.03163146972656, "rewards/margins": 25.653228759765625, "rewards/rejected": 6.377355575561523, "step": 1648 }, { "epoch": 0.8535196687370601, "grad_norm": 1.6482189893722534, "learning_rate": 8.570659673406721e-06, "loss": 0.1724534034729004, "rewards/accuracies": 0.921875, "rewards/chosen": 30.86144256591797, "rewards/margins": 25.492603302001953, "rewards/rejected": 5.369365692138672, "step": 1649 }, { "epoch": 0.8540372670807453, "grad_norm": 0.9193621277809143, "learning_rate": 8.568662043778834e-06, "loss": 0.1320641189813614, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.72513198852539, "rewards/margins": 24.62419891357422, "rewards/rejected": 8.102584838867188, "step": 1650 }, { "epoch": 0.8545548654244306, "grad_norm": 0.892285168170929, "learning_rate": 8.566663252347284e-06, "loss": 0.17746621370315552, "rewards/accuracies": 0.890625, "rewards/chosen": 30.99068832397461, "rewards/margins": 25.381362915039062, "rewards/rejected": 5.612091064453125, "step": 1651 }, { "epoch": 0.855072463768116, "grad_norm": 0.6690723896026611, "learning_rate": 8.564663299762795e-06, "loss": 0.187713161110878, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.370201110839844, "rewards/margins": 27.227706909179688, "rewards/rejected": 5.145042419433594, "step": 1652 }, { "epoch": 0.8555900621118012, "grad_norm": 1.8831369876861572, "learning_rate": 8.562662186676463e-06, "loss": 0.160366952419281, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.62053298950195, "rewards/margins": 28.409439086914062, "rewards/rejected": 6.209673881530762, "step": 1653 }, { "epoch": 0.8561076604554866, "grad_norm": 1.3915011882781982, "learning_rate": 8.560659913739764e-06, "loss": 0.1502961814403534, "rewards/accuracies": 0.921875, "rewards/chosen": 41.458213806152344, "rewards/margins": 33.63825225830078, "rewards/rejected": 7.816013336181641, "step": 1654 }, { "epoch": 0.8566252587991718, "grad_norm": 0.6188753843307495, "learning_rate": 8.558656481604554e-06, "loss": 0.1782710999250412, "rewards/accuracies": 0.8671875, "rewards/chosen": 32.613502502441406, "rewards/margins": 27.181045532226562, "rewards/rejected": 5.431072235107422, "step": 1655 }, { "epoch": 0.8571428571428571, "grad_norm": 0.921746015548706, "learning_rate": 8.556651890923064e-06, "loss": 0.15349248051643372, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.075138092041016, "rewards/margins": 27.115524291992188, "rewards/rejected": 4.960151672363281, "step": 1656 }, { "epoch": 0.8576604554865425, "grad_norm": 0.7860938310623169, "learning_rate": 8.554646142347904e-06, "loss": 0.1700747013092041, "rewards/accuracies": 0.8984375, "rewards/chosen": 37.40166091918945, "rewards/margins": 31.895660400390625, "rewards/rejected": 5.504241943359375, "step": 1657 }, { "epoch": 0.8581780538302277, "grad_norm": 0.7438122630119324, "learning_rate": 8.552639236532062e-06, "loss": 0.19142936170101166, "rewards/accuracies": 0.90625, "rewards/chosen": 30.490493774414062, "rewards/margins": 24.838302612304688, "rewards/rejected": 5.651698589324951, "step": 1658 }, { "epoch": 0.8586956521739131, "grad_norm": 1.614024043083191, "learning_rate": 8.550631174128894e-06, "loss": 0.1796068251132965, "rewards/accuracies": 0.90625, "rewards/chosen": 35.571556091308594, "rewards/margins": 30.465866088867188, "rewards/rejected": 5.110206604003906, "step": 1659 }, { "epoch": 0.8592132505175983, "grad_norm": 1.4916528463363647, "learning_rate": 8.548621955792144e-06, "loss": 0.25220221281051636, "rewards/accuracies": 0.8828125, "rewards/chosen": 34.17869567871094, "rewards/margins": 28.990478515625, "rewards/rejected": 5.193994522094727, "step": 1660 }, { "epoch": 0.8597308488612836, "grad_norm": 1.1632356643676758, "learning_rate": 8.546611582175927e-06, "loss": 0.13544732332229614, "rewards/accuracies": 0.9296875, "rewards/chosen": 44.45710372924805, "rewards/margins": 37.734405517578125, "rewards/rejected": 6.730712890625, "step": 1661 }, { "epoch": 0.860248447204969, "grad_norm": 2.053767204284668, "learning_rate": 8.544600053934734e-06, "loss": 0.18279330432415009, "rewards/accuracies": 0.90625, "rewards/chosen": 37.61469650268555, "rewards/margins": 31.99506378173828, "rewards/rejected": 5.619411468505859, "step": 1662 }, { "epoch": 0.8607660455486542, "grad_norm": 0.4976567029953003, "learning_rate": 8.542587371723432e-06, "loss": 0.1370028704404831, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.822967529296875, "rewards/margins": 29.846221923828125, "rewards/rejected": 6.969215393066406, "step": 1663 }, { "epoch": 0.8612836438923396, "grad_norm": 2.071079969406128, "learning_rate": 8.540573536197267e-06, "loss": 0.2781716287136078, "rewards/accuracies": 0.890625, "rewards/chosen": 33.49395751953125, "rewards/margins": 28.47637939453125, "rewards/rejected": 5.0255279541015625, "step": 1664 }, { "epoch": 0.8618012422360248, "grad_norm": 0.7204386591911316, "learning_rate": 8.538558548011855e-06, "loss": 0.15119794011116028, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.519569396972656, "rewards/margins": 30.768295288085938, "rewards/rejected": 5.753852844238281, "step": 1665 }, { "epoch": 0.8623188405797102, "grad_norm": 2.424302816390991, "learning_rate": 8.53654240782319e-06, "loss": 0.24741005897521973, "rewards/accuracies": 0.8671875, "rewards/chosen": 31.18239974975586, "rewards/margins": 26.812950134277344, "rewards/rejected": 4.369447708129883, "step": 1666 }, { "epoch": 0.8628364389233955, "grad_norm": 1.5861791372299194, "learning_rate": 8.534525116287642e-06, "loss": 0.19921952486038208, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.59528732299805, "rewards/margins": 30.88690185546875, "rewards/rejected": 5.705633163452148, "step": 1667 }, { "epoch": 0.8633540372670807, "grad_norm": 1.4566091299057007, "learning_rate": 8.532506674061953e-06, "loss": 0.23556695878505707, "rewards/accuracies": 0.8828125, "rewards/chosen": 33.38920211791992, "rewards/margins": 29.388206481933594, "rewards/rejected": 3.995532989501953, "step": 1668 }, { "epoch": 0.8638716356107661, "grad_norm": 0.9869768023490906, "learning_rate": 8.530487081803246e-06, "loss": 0.18195413053035736, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.20469665527344, "rewards/margins": 29.484046936035156, "rewards/rejected": 4.723604202270508, "step": 1669 }, { "epoch": 0.8643892339544513, "grad_norm": 0.8246453404426575, "learning_rate": 8.528466340169008e-06, "loss": 0.18454155325889587, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.26673889160156, "rewards/margins": 28.95859146118164, "rewards/rejected": 5.298007965087891, "step": 1670 }, { "epoch": 0.8649068322981367, "grad_norm": 1.1123638153076172, "learning_rate": 8.52644444981711e-06, "loss": 0.15492179989814758, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.39474868774414, "rewards/margins": 29.42437744140625, "rewards/rejected": 4.979500770568848, "step": 1671 }, { "epoch": 0.865424430641822, "grad_norm": 0.7795559167861938, "learning_rate": 8.52442141140579e-06, "loss": 0.16999319195747375, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.62824249267578, "rewards/margins": 23.572715759277344, "rewards/rejected": 5.051889419555664, "step": 1672 }, { "epoch": 0.8659420289855072, "grad_norm": 0.8135629892349243, "learning_rate": 8.522397225593666e-06, "loss": 0.15829059481620789, "rewards/accuracies": 0.890625, "rewards/chosen": 36.26565170288086, "rewards/margins": 30.392953872680664, "rewards/rejected": 5.871443748474121, "step": 1673 }, { "epoch": 0.8664596273291926, "grad_norm": 0.48384958505630493, "learning_rate": 8.520371893039724e-06, "loss": 0.12652987241744995, "rewards/accuracies": 0.921875, "rewards/chosen": 36.43693542480469, "rewards/margins": 30.648956298828125, "rewards/rejected": 5.786231994628906, "step": 1674 }, { "epoch": 0.8669772256728778, "grad_norm": 1.0900084972381592, "learning_rate": 8.518345414403327e-06, "loss": 0.21489346027374268, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.64517593383789, "rewards/margins": 29.900238037109375, "rewards/rejected": 5.749912261962891, "step": 1675 }, { "epoch": 0.8674948240165632, "grad_norm": 0.7848390340805054, "learning_rate": 8.516317790344208e-06, "loss": 0.20791682600975037, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.716903686523438, "rewards/margins": 25.771804809570312, "rewards/rejected": 3.940309524536133, "step": 1676 }, { "epoch": 0.8680124223602484, "grad_norm": 1.0557316541671753, "learning_rate": 8.514289021522474e-06, "loss": 0.20946240425109863, "rewards/accuracies": 0.8671875, "rewards/chosen": 32.14514923095703, "rewards/margins": 28.128875732421875, "rewards/rejected": 4.011628150939941, "step": 1677 }, { "epoch": 0.8685300207039337, "grad_norm": 0.8049050569534302, "learning_rate": 8.51225910859861e-06, "loss": 0.2070900797843933, "rewards/accuracies": 0.8828125, "rewards/chosen": 30.945316314697266, "rewards/margins": 27.89892578125, "rewards/rejected": 3.0478391647338867, "step": 1678 }, { "epoch": 0.8690476190476191, "grad_norm": 0.9568613171577454, "learning_rate": 8.510228052233464e-06, "loss": 0.1949855536222458, "rewards/accuracies": 0.8984375, "rewards/chosen": 33.663753509521484, "rewards/margins": 28.44367218017578, "rewards/rejected": 5.229765892028809, "step": 1679 }, { "epoch": 0.8695652173913043, "grad_norm": 0.7656842470169067, "learning_rate": 8.508195853088264e-06, "loss": 0.1804179549217224, "rewards/accuracies": 0.9375, "rewards/chosen": 30.40650177001953, "rewards/margins": 26.780853271484375, "rewards/rejected": 3.6170425415039062, "step": 1680 }, { "epoch": 0.8700828157349897, "grad_norm": 1.039067268371582, "learning_rate": 8.506162511824606e-06, "loss": 0.25104522705078125, "rewards/accuracies": 0.8515625, "rewards/chosen": 31.008468627929688, "rewards/margins": 28.75469970703125, "rewards/rejected": 2.246753692626953, "step": 1681 }, { "epoch": 0.870600414078675, "grad_norm": 0.6760348081588745, "learning_rate": 8.504128029104459e-06, "loss": 0.15380065143108368, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.16545867919922, "rewards/margins": 31.055580139160156, "rewards/rejected": 3.1115550994873047, "step": 1682 }, { "epoch": 0.8711180124223602, "grad_norm": 1.0295367240905762, "learning_rate": 8.502092405590162e-06, "loss": 0.17887620627880096, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.681440353393555, "rewards/margins": 27.254852294921875, "rewards/rejected": 2.435544490814209, "step": 1683 }, { "epoch": 0.8716356107660456, "grad_norm": 1.123365044593811, "learning_rate": 8.50005564194443e-06, "loss": 0.19119514524936676, "rewards/accuracies": 0.890625, "rewards/chosen": 29.302223205566406, "rewards/margins": 27.040077209472656, "rewards/rejected": 2.2728500366210938, "step": 1684 }, { "epoch": 0.8721532091097308, "grad_norm": 0.863304853439331, "learning_rate": 8.498017738830342e-06, "loss": 0.1710519790649414, "rewards/accuracies": 0.8828125, "rewards/chosen": 33.094539642333984, "rewards/margins": 29.719322204589844, "rewards/rejected": 3.3688559532165527, "step": 1685 }, { "epoch": 0.8726708074534162, "grad_norm": 0.9100701808929443, "learning_rate": 8.495978696911356e-06, "loss": 0.19760452210903168, "rewards/accuracies": 0.8671875, "rewards/chosen": 32.11152648925781, "rewards/margins": 29.325603485107422, "rewards/rejected": 2.7876014709472656, "step": 1686 }, { "epoch": 0.8731884057971014, "grad_norm": 0.9143128395080566, "learning_rate": 8.493938516851294e-06, "loss": 0.15607362985610962, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.88446044921875, "rewards/margins": 30.398086547851562, "rewards/rejected": 2.492673397064209, "step": 1687 }, { "epoch": 0.8737060041407867, "grad_norm": 1.1087923049926758, "learning_rate": 8.491897199314355e-06, "loss": 0.2552397847175598, "rewards/accuracies": 0.8671875, "rewards/chosen": 32.68135070800781, "rewards/margins": 29.725326538085938, "rewards/rejected": 2.955455780029297, "step": 1688 }, { "epoch": 0.8742236024844721, "grad_norm": 1.7613918781280518, "learning_rate": 8.4898547449651e-06, "loss": 0.25709110498428345, "rewards/accuracies": 0.8984375, "rewards/chosen": 31.96225357055664, "rewards/margins": 28.877479553222656, "rewards/rejected": 3.084207534790039, "step": 1689 }, { "epoch": 0.8747412008281573, "grad_norm": 0.880817174911499, "learning_rate": 8.487811154468466e-06, "loss": 0.16620858013629913, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.104347229003906, "rewards/margins": 29.651260375976562, "rewards/rejected": 2.4622955322265625, "step": 1690 }, { "epoch": 0.8752587991718427, "grad_norm": 0.7809582352638245, "learning_rate": 8.485766428489761e-06, "loss": 0.17045342922210693, "rewards/accuracies": 0.921875, "rewards/chosen": 26.731046676635742, "rewards/margins": 24.921356201171875, "rewards/rejected": 1.810638427734375, "step": 1691 }, { "epoch": 0.8757763975155279, "grad_norm": 0.7856304049491882, "learning_rate": 8.48372056769466e-06, "loss": 0.13040541112422943, "rewards/accuracies": 0.90625, "rewards/chosen": 31.903154373168945, "rewards/margins": 29.53327178955078, "rewards/rejected": 2.3708419799804688, "step": 1692 }, { "epoch": 0.8762939958592133, "grad_norm": 0.8502088189125061, "learning_rate": 8.481673572749205e-06, "loss": 0.18032699823379517, "rewards/accuracies": 0.890625, "rewards/chosen": 25.437061309814453, "rewards/margins": 23.777053833007812, "rewards/rejected": 1.6597704887390137, "step": 1693 }, { "epoch": 0.8768115942028986, "grad_norm": 1.2146244049072266, "learning_rate": 8.47962544431981e-06, "loss": 0.18345697224140167, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.43207550048828, "rewards/margins": 23.942230224609375, "rewards/rejected": 2.4867706298828125, "step": 1694 }, { "epoch": 0.8773291925465838, "grad_norm": 0.7242618799209595, "learning_rate": 8.477576183073262e-06, "loss": 0.11240901052951813, "rewards/accuracies": 0.96875, "rewards/chosen": 25.525043487548828, "rewards/margins": 23.90753173828125, "rewards/rejected": 1.6221332550048828, "step": 1695 }, { "epoch": 0.8778467908902692, "grad_norm": 1.3064326047897339, "learning_rate": 8.47552578967671e-06, "loss": 0.21631844341754913, "rewards/accuracies": 0.8984375, "rewards/chosen": 21.661954879760742, "rewards/margins": 19.357162475585938, "rewards/rejected": 2.3073959350585938, "step": 1696 }, { "epoch": 0.8783643892339544, "grad_norm": 1.0342828035354614, "learning_rate": 8.473474264797675e-06, "loss": 0.16609108448028564, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.62203598022461, "rewards/margins": 23.438209533691406, "rewards/rejected": 2.1799089908599854, "step": 1697 }, { "epoch": 0.8788819875776398, "grad_norm": 0.8014616370201111, "learning_rate": 8.471421609104044e-06, "loss": 0.1705198884010315, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.458351135253906, "rewards/margins": 21.667556762695312, "rewards/rejected": 1.7891807556152344, "step": 1698 }, { "epoch": 0.879399585921325, "grad_norm": 1.1263818740844727, "learning_rate": 8.469367823264075e-06, "loss": 0.2515159845352173, "rewards/accuracies": 0.828125, "rewards/chosen": 17.299663543701172, "rewards/margins": 15.731035232543945, "rewards/rejected": 1.5695948600769043, "step": 1699 }, { "epoch": 0.8799171842650103, "grad_norm": 0.8458340764045715, "learning_rate": 8.467312907946394e-06, "loss": 0.15288209915161133, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.67737579345703, "rewards/margins": 17.911102294921875, "rewards/rejected": 1.7595999240875244, "step": 1700 }, { "epoch": 0.8804347826086957, "grad_norm": 1.2634588479995728, "learning_rate": 8.465256863819993e-06, "loss": 0.24373924732208252, "rewards/accuracies": 0.875, "rewards/chosen": 17.203025817871094, "rewards/margins": 15.482131958007812, "rewards/rejected": 1.71928071975708, "step": 1701 }, { "epoch": 0.8809523809523809, "grad_norm": 0.8711602091789246, "learning_rate": 8.46319969155423e-06, "loss": 0.2208799123764038, "rewards/accuracies": 0.8671875, "rewards/chosen": 18.066123962402344, "rewards/margins": 16.371200561523438, "rewards/rejected": 1.6939563751220703, "step": 1702 }, { "epoch": 0.8814699792960663, "grad_norm": 0.6713356971740723, "learning_rate": 8.461141391818835e-06, "loss": 0.16322486102581024, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.316165924072266, "rewards/margins": 19.233963012695312, "rewards/rejected": 2.0817832946777344, "step": 1703 }, { "epoch": 0.8819875776397516, "grad_norm": 0.6972991228103638, "learning_rate": 8.4590819652839e-06, "loss": 0.13102561235427856, "rewards/accuracies": 0.9375, "rewards/chosen": 21.234519958496094, "rewards/margins": 18.957977294921875, "rewards/rejected": 2.2785415649414062, "step": 1704 }, { "epoch": 0.8825051759834368, "grad_norm": 1.117783546447754, "learning_rate": 8.457021412619887e-06, "loss": 0.18414098024368286, "rewards/accuracies": 0.90625, "rewards/chosen": 18.62537956237793, "rewards/margins": 15.99432373046875, "rewards/rejected": 2.627063274383545, "step": 1705 }, { "epoch": 0.8830227743271222, "grad_norm": 0.8461037874221802, "learning_rate": 8.454959734497626e-06, "loss": 0.22541150450706482, "rewards/accuracies": 0.8828125, "rewards/chosen": 19.90511703491211, "rewards/margins": 17.346405029296875, "rewards/rejected": 2.554769515991211, "step": 1706 }, { "epoch": 0.8835403726708074, "grad_norm": 0.7437970042228699, "learning_rate": 8.452896931588308e-06, "loss": 0.19937032461166382, "rewards/accuracies": 0.921875, "rewards/chosen": 19.174232482910156, "rewards/margins": 16.979488372802734, "rewards/rejected": 2.200765609741211, "step": 1707 }, { "epoch": 0.8840579710144928, "grad_norm": 0.9163292050361633, "learning_rate": 8.450833004563494e-06, "loss": 0.16203254461288452, "rewards/accuracies": 0.9375, "rewards/chosen": 18.775463104248047, "rewards/margins": 16.580841064453125, "rewards/rejected": 2.194678783416748, "step": 1708 }, { "epoch": 0.884575569358178, "grad_norm": 0.5392158031463623, "learning_rate": 8.448767954095112e-06, "loss": 0.1923491358757019, "rewards/accuracies": 0.8515625, "rewards/chosen": 21.56658172607422, "rewards/margins": 18.745376586914062, "rewards/rejected": 2.817035675048828, "step": 1709 }, { "epoch": 0.8850931677018633, "grad_norm": 1.1545448303222656, "learning_rate": 8.446701780855452e-06, "loss": 0.25886693596839905, "rewards/accuracies": 0.8671875, "rewards/chosen": 20.127639770507812, "rewards/margins": 17.246726989746094, "rewards/rejected": 2.8800201416015625, "step": 1710 }, { "epoch": 0.8856107660455487, "grad_norm": 0.8735032677650452, "learning_rate": 8.444634485517173e-06, "loss": 0.17552447319030762, "rewards/accuracies": 0.921875, "rewards/chosen": 20.861183166503906, "rewards/margins": 17.989341735839844, "rewards/rejected": 2.8701133728027344, "step": 1711 }, { "epoch": 0.8861283643892339, "grad_norm": 1.032297968864441, "learning_rate": 8.442566068753296e-06, "loss": 0.17324015498161316, "rewards/accuracies": 0.8828125, "rewards/chosen": 18.427467346191406, "rewards/margins": 16.257436752319336, "rewards/rejected": 2.1714816093444824, "step": 1712 }, { "epoch": 0.8866459627329193, "grad_norm": 0.7177382111549377, "learning_rate": 8.440496531237212e-06, "loss": 0.22181347012519836, "rewards/accuracies": 0.890625, "rewards/chosen": 16.925701141357422, "rewards/margins": 15.142059326171875, "rewards/rejected": 1.7808055877685547, "step": 1713 }, { "epoch": 0.8871635610766045, "grad_norm": 0.6801588535308838, "learning_rate": 8.438425873642672e-06, "loss": 0.13358142971992493, "rewards/accuracies": 0.9375, "rewards/chosen": 23.458101272583008, "rewards/margins": 20.21478271484375, "rewards/rejected": 3.2453269958496094, "step": 1714 }, { "epoch": 0.8876811594202898, "grad_norm": 0.980193555355072, "learning_rate": 8.436354096643795e-06, "loss": 0.201555535197258, "rewards/accuracies": 0.90625, "rewards/chosen": 22.67981719970703, "rewards/margins": 19.303421020507812, "rewards/rejected": 3.3767383098602295, "step": 1715 }, { "epoch": 0.8881987577639752, "grad_norm": 0.9251440167427063, "learning_rate": 8.43428120091506e-06, "loss": 0.2095840573310852, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.89788055419922, "rewards/margins": 17.38855743408203, "rewards/rejected": 2.5068206787109375, "step": 1716 }, { "epoch": 0.8887163561076604, "grad_norm": 1.458829641342163, "learning_rate": 8.432207187131316e-06, "loss": 0.20476047694683075, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.136945724487305, "rewards/margins": 17.70379638671875, "rewards/rejected": 2.4261951446533203, "step": 1717 }, { "epoch": 0.8892339544513458, "grad_norm": 0.85158371925354, "learning_rate": 8.430132055967773e-06, "loss": 0.13737767934799194, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.681304931640625, "rewards/margins": 22.663055419921875, "rewards/rejected": 3.015714645385742, "step": 1718 }, { "epoch": 0.889751552795031, "grad_norm": 1.718395709991455, "learning_rate": 8.428055808100001e-06, "loss": 0.17876699566841125, "rewards/accuracies": 0.921875, "rewards/chosen": 25.777952194213867, "rewards/margins": 21.772079467773438, "rewards/rejected": 4.006343841552734, "step": 1719 }, { "epoch": 0.8902691511387164, "grad_norm": 0.9580543041229248, "learning_rate": 8.425978444203944e-06, "loss": 0.19871526956558228, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.708030700683594, "rewards/margins": 16.832534790039062, "rewards/rejected": 2.8767638206481934, "step": 1720 }, { "epoch": 0.8907867494824017, "grad_norm": 0.7568016052246094, "learning_rate": 8.423899964955896e-06, "loss": 0.19977666437625885, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.266719818115234, "rewards/margins": 20.812652587890625, "rewards/rejected": 3.46120285987854, "step": 1721 }, { "epoch": 0.8913043478260869, "grad_norm": 0.7516465187072754, "learning_rate": 8.421820371032526e-06, "loss": 0.16002345085144043, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.25286293029785, "rewards/margins": 21.912755966186523, "rewards/rejected": 3.343709945678711, "step": 1722 }, { "epoch": 0.8918219461697723, "grad_norm": 1.8646583557128906, "learning_rate": 8.41973966311086e-06, "loss": 0.18112167716026306, "rewards/accuracies": 0.9375, "rewards/chosen": 31.840848922729492, "rewards/margins": 27.746936798095703, "rewards/rejected": 4.102598190307617, "step": 1723 }, { "epoch": 0.8923395445134575, "grad_norm": 3.052971839904785, "learning_rate": 8.417657841868284e-06, "loss": 0.28551143407821655, "rewards/accuracies": 0.875, "rewards/chosen": 26.377365112304688, "rewards/margins": 23.16790008544922, "rewards/rejected": 3.2109298706054688, "step": 1724 }, { "epoch": 0.8928571428571429, "grad_norm": 1.0459758043289185, "learning_rate": 8.415574907982553e-06, "loss": 0.23782840371131897, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.7757568359375, "rewards/margins": 19.940505981445312, "rewards/rejected": 2.8276844024658203, "step": 1725 }, { "epoch": 0.8933747412008282, "grad_norm": 1.7753965854644775, "learning_rate": 8.41349086213178e-06, "loss": 0.28772294521331787, "rewards/accuracies": 0.859375, "rewards/chosen": 29.211181640625, "rewards/margins": 24.724700927734375, "rewards/rejected": 4.479737281799316, "step": 1726 }, { "epoch": 0.8938923395445134, "grad_norm": 1.52382230758667, "learning_rate": 8.411405704994442e-06, "loss": 0.19655746221542358, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.25771713256836, "rewards/margins": 23.18811798095703, "rewards/rejected": 4.069713592529297, "step": 1727 }, { "epoch": 0.8944099378881988, "grad_norm": 0.8357822895050049, "learning_rate": 8.409319437249378e-06, "loss": 0.20842453837394714, "rewards/accuracies": 0.875, "rewards/chosen": 28.309354782104492, "rewards/margins": 24.02463150024414, "rewards/rejected": 4.280941009521484, "step": 1728 }, { "epoch": 0.894927536231884, "grad_norm": 1.3008215427398682, "learning_rate": 8.407232059575785e-06, "loss": 0.18420879542827606, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.8800048828125, "rewards/margins": 25.698104858398438, "rewards/rejected": 4.180373668670654, "step": 1729 }, { "epoch": 0.8954451345755694, "grad_norm": 0.6968556046485901, "learning_rate": 8.405143572653225e-06, "loss": 0.17520147562026978, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.184803009033203, "rewards/margins": 23.510997772216797, "rewards/rejected": 2.6772029399871826, "step": 1730 }, { "epoch": 0.8959627329192547, "grad_norm": 0.6293640732765198, "learning_rate": 8.40305397716162e-06, "loss": 0.17566025257110596, "rewards/accuracies": 0.90625, "rewards/chosen": 26.855819702148438, "rewards/margins": 22.961822509765625, "rewards/rejected": 3.894948959350586, "step": 1731 }, { "epoch": 0.8964803312629399, "grad_norm": 0.7234556078910828, "learning_rate": 8.400963273781252e-06, "loss": 0.180648535490036, "rewards/accuracies": 0.875, "rewards/chosen": 27.607059478759766, "rewards/margins": 24.2652587890625, "rewards/rejected": 3.3434982299804688, "step": 1732 }, { "epoch": 0.8969979296066253, "grad_norm": 0.8152795433998108, "learning_rate": 8.398871463192767e-06, "loss": 0.20365984737873077, "rewards/accuracies": 0.875, "rewards/chosen": 26.795677185058594, "rewards/margins": 23.379348754882812, "rewards/rejected": 3.4129161834716797, "step": 1733 }, { "epoch": 0.8975155279503105, "grad_norm": 0.8535913825035095, "learning_rate": 8.396778546077166e-06, "loss": 0.171764075756073, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.42245101928711, "rewards/margins": 23.68456268310547, "rewards/rejected": 2.732685089111328, "step": 1734 }, { "epoch": 0.8980331262939959, "grad_norm": 1.0699540376663208, "learning_rate": 8.394684523115814e-06, "loss": 0.21617862582206726, "rewards/accuracies": 0.875, "rewards/chosen": 31.62430763244629, "rewards/margins": 27.905731201171875, "rewards/rejected": 3.7135534286499023, "step": 1735 }, { "epoch": 0.8985507246376812, "grad_norm": 1.1702460050582886, "learning_rate": 8.392589394990438e-06, "loss": 0.17431868612766266, "rewards/accuracies": 0.921875, "rewards/chosen": 26.66204071044922, "rewards/margins": 23.593101501464844, "rewards/rejected": 3.069652557373047, "step": 1736 }, { "epoch": 0.8990683229813664, "grad_norm": 0.8392894268035889, "learning_rate": 8.390493162383117e-06, "loss": 0.1821904182434082, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.91983985900879, "rewards/margins": 27.06097412109375, "rewards/rejected": 2.859889507293701, "step": 1737 }, { "epoch": 0.8995859213250518, "grad_norm": 0.5446137189865112, "learning_rate": 8.388395825976302e-06, "loss": 0.18772205710411072, "rewards/accuracies": 0.90625, "rewards/chosen": 27.755657196044922, "rewards/margins": 24.546546936035156, "rewards/rejected": 3.20841646194458, "step": 1738 }, { "epoch": 0.900103519668737, "grad_norm": 0.556792676448822, "learning_rate": 8.386297386452787e-06, "loss": 0.1679026186466217, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.25937271118164, "rewards/margins": 24.840286254882812, "rewards/rejected": 2.411680221557617, "step": 1739 }, { "epoch": 0.9006211180124224, "grad_norm": 0.8856478929519653, "learning_rate": 8.384197844495743e-06, "loss": 0.19278544187545776, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.527528762817383, "rewards/margins": 21.36357879638672, "rewards/rejected": 2.1662254333496094, "step": 1740 }, { "epoch": 0.9011387163561076, "grad_norm": 0.4991209805011749, "learning_rate": 8.382097200788683e-06, "loss": 0.182017520070076, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.890609741210938, "rewards/margins": 25.626739501953125, "rewards/rejected": 2.2639474868774414, "step": 1741 }, { "epoch": 0.901656314699793, "grad_norm": 0.9952691793441772, "learning_rate": 8.379995456015492e-06, "loss": 0.19588083028793335, "rewards/accuracies": 0.90625, "rewards/chosen": 30.34248924255371, "rewards/margins": 26.856956481933594, "rewards/rejected": 3.4880852699279785, "step": 1742 }, { "epoch": 0.9021739130434783, "grad_norm": 1.0034044981002808, "learning_rate": 8.377892610860407e-06, "loss": 0.1748698353767395, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.22516632080078, "rewards/margins": 29.21672821044922, "rewards/rejected": 3.016152858734131, "step": 1743 }, { "epoch": 0.9026915113871635, "grad_norm": 1.172337293624878, "learning_rate": 8.375788666008024e-06, "loss": 0.17946197092533112, "rewards/accuracies": 0.890625, "rewards/chosen": 30.4442081451416, "rewards/margins": 28.153244018554688, "rewards/rejected": 2.2928333282470703, "step": 1744 }, { "epoch": 0.9032091097308489, "grad_norm": 1.0982670783996582, "learning_rate": 8.373683622143299e-06, "loss": 0.17472775280475616, "rewards/accuracies": 0.921875, "rewards/chosen": 32.87318420410156, "rewards/margins": 30.689849853515625, "rewards/rejected": 2.185466766357422, "step": 1745 }, { "epoch": 0.9037267080745341, "grad_norm": 0.9487172961235046, "learning_rate": 8.37157747995154e-06, "loss": 0.1817486584186554, "rewards/accuracies": 0.90625, "rewards/chosen": 30.631418228149414, "rewards/margins": 28.616676330566406, "rewards/rejected": 2.0058112144470215, "step": 1746 }, { "epoch": 0.9042443064182195, "grad_norm": 0.816520094871521, "learning_rate": 8.36947024011842e-06, "loss": 0.2390596568584442, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.236900329589844, "rewards/margins": 26.413330078125, "rewards/rejected": 2.8224687576293945, "step": 1747 }, { "epoch": 0.9047619047619048, "grad_norm": 1.0673038959503174, "learning_rate": 8.367361903329968e-06, "loss": 0.18368250131607056, "rewards/accuracies": 0.90625, "rewards/chosen": 35.56916046142578, "rewards/margins": 33.125404357910156, "rewards/rejected": 2.440487861633301, "step": 1748 }, { "epoch": 0.90527950310559, "grad_norm": 0.8885737657546997, "learning_rate": 8.365252470272562e-06, "loss": 0.16422009468078613, "rewards/accuracies": 0.9375, "rewards/chosen": 33.776885986328125, "rewards/margins": 31.1690673828125, "rewards/rejected": 2.6019325256347656, "step": 1749 }, { "epoch": 0.9057971014492754, "grad_norm": 1.3996126651763916, "learning_rate": 8.363141941632948e-06, "loss": 0.16177216172218323, "rewards/accuracies": 0.9375, "rewards/chosen": 34.15642547607422, "rewards/margins": 31.016586303710938, "rewards/rejected": 3.1515352725982666, "step": 1750 }, { "epoch": 0.9063146997929606, "grad_norm": 0.8321415185928345, "learning_rate": 8.361030318098225e-06, "loss": 0.20072831213474274, "rewards/accuracies": 0.859375, "rewards/chosen": 36.124176025390625, "rewards/margins": 33.77198791503906, "rewards/rejected": 2.3571319580078125, "step": 1751 }, { "epoch": 0.906832298136646, "grad_norm": 1.0096040964126587, "learning_rate": 8.358917600355842e-06, "loss": 0.20670342445373535, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.77568817138672, "rewards/margins": 36.22858810424805, "rewards/rejected": 2.551004409790039, "step": 1752 }, { "epoch": 0.9073498964803313, "grad_norm": 0.9600409269332886, "learning_rate": 8.356803789093614e-06, "loss": 0.20530974864959717, "rewards/accuracies": 0.921875, "rewards/chosen": 37.03830337524414, "rewards/margins": 34.428504943847656, "rewards/rejected": 2.596855640411377, "step": 1753 }, { "epoch": 0.9078674948240165, "grad_norm": 1.1000218391418457, "learning_rate": 8.354688884999707e-06, "loss": 0.24271175265312195, "rewards/accuracies": 0.90625, "rewards/chosen": 35.875160217285156, "rewards/margins": 32.72577667236328, "rewards/rejected": 3.15517520904541, "step": 1754 }, { "epoch": 0.9083850931677019, "grad_norm": 1.2638520002365112, "learning_rate": 8.35257288876264e-06, "loss": 0.21785777807235718, "rewards/accuracies": 0.890625, "rewards/chosen": 35.0983772277832, "rewards/margins": 32.45478057861328, "rewards/rejected": 2.637965202331543, "step": 1755 }, { "epoch": 0.9089026915113871, "grad_norm": 0.739005446434021, "learning_rate": 8.350455801071296e-06, "loss": 0.12796616554260254, "rewards/accuracies": 0.953125, "rewards/chosen": 40.9267578125, "rewards/margins": 37.38872146606445, "rewards/rejected": 3.540961265563965, "step": 1756 }, { "epoch": 0.9094202898550725, "grad_norm": 2.187533140182495, "learning_rate": 8.348337622614904e-06, "loss": 0.2319571077823639, "rewards/accuracies": 0.8515625, "rewards/chosen": 39.2133903503418, "rewards/margins": 36.329315185546875, "rewards/rejected": 2.886706829071045, "step": 1757 }, { "epoch": 0.9099378881987578, "grad_norm": 0.6536272764205933, "learning_rate": 8.346218354083053e-06, "loss": 0.16590763628482819, "rewards/accuracies": 0.8984375, "rewards/chosen": 39.24463653564453, "rewards/margins": 36.02261734008789, "rewards/rejected": 3.2323646545410156, "step": 1758 }, { "epoch": 0.910455486542443, "grad_norm": 0.7275141477584839, "learning_rate": 8.344097996165688e-06, "loss": 0.17486195266246796, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.28425598144531, "rewards/margins": 32.87040328979492, "rewards/rejected": 3.398759365081787, "step": 1759 }, { "epoch": 0.9109730848861284, "grad_norm": 1.3641712665557861, "learning_rate": 8.341976549553104e-06, "loss": 0.15415628254413605, "rewards/accuracies": 0.9296875, "rewards/chosen": 46.29551696777344, "rewards/margins": 42.42900085449219, "rewards/rejected": 3.852325439453125, "step": 1760 }, { "epoch": 0.9114906832298136, "grad_norm": 1.0285954475402832, "learning_rate": 8.339854014935956e-06, "loss": 0.2368355542421341, "rewards/accuracies": 0.84375, "rewards/chosen": 39.206424713134766, "rewards/margins": 35.96405029296875, "rewards/rejected": 3.23358154296875, "step": 1761 }, { "epoch": 0.912008281573499, "grad_norm": 1.667582392692566, "learning_rate": 8.337730393005248e-06, "loss": 0.32438039779663086, "rewards/accuracies": 0.859375, "rewards/chosen": 37.400146484375, "rewards/margins": 34.13450241088867, "rewards/rejected": 3.281867265701294, "step": 1762 }, { "epoch": 0.9125258799171843, "grad_norm": 2.9537453651428223, "learning_rate": 8.335605684452343e-06, "loss": 0.21066994965076447, "rewards/accuracies": 0.890625, "rewards/chosen": 39.27242660522461, "rewards/margins": 34.64459991455078, "rewards/rejected": 4.62101936340332, "step": 1763 }, { "epoch": 0.9130434782608695, "grad_norm": 0.5380900502204895, "learning_rate": 8.333479889968954e-06, "loss": 0.1794080138206482, "rewards/accuracies": 0.9140625, "rewards/chosen": 43.1767578125, "rewards/margins": 39.269798278808594, "rewards/rejected": 3.8983922004699707, "step": 1764 }, { "epoch": 0.9135610766045549, "grad_norm": 0.7918962836265564, "learning_rate": 8.331353010247148e-06, "loss": 0.2567102313041687, "rewards/accuracies": 0.890625, "rewards/chosen": 37.23146438598633, "rewards/margins": 34.551979064941406, "rewards/rejected": 2.689293146133423, "step": 1765 }, { "epoch": 0.9140786749482401, "grad_norm": 1.202311396598816, "learning_rate": 8.329225045979345e-06, "loss": 0.13085684180259705, "rewards/accuracies": 0.9609375, "rewards/chosen": 36.215057373046875, "rewards/margins": 33.03106689453125, "rewards/rejected": 3.1849260330200195, "step": 1766 }, { "epoch": 0.9145962732919255, "grad_norm": 0.7191990613937378, "learning_rate": 8.327095997858324e-06, "loss": 0.15555927157402039, "rewards/accuracies": 0.90625, "rewards/chosen": 40.08296585083008, "rewards/margins": 36.603057861328125, "rewards/rejected": 3.47550630569458, "step": 1767 }, { "epoch": 0.9151138716356108, "grad_norm": 0.938032329082489, "learning_rate": 8.324965866577206e-06, "loss": 0.2202555537223816, "rewards/accuracies": 0.890625, "rewards/chosen": 28.973154067993164, "rewards/margins": 27.772747039794922, "rewards/rejected": 1.1933586597442627, "step": 1768 }, { "epoch": 0.9156314699792961, "grad_norm": 0.7227593064308167, "learning_rate": 8.322834652829474e-06, "loss": 0.2075486183166504, "rewards/accuracies": 0.890625, "rewards/chosen": 36.54920959472656, "rewards/margins": 34.23053741455078, "rewards/rejected": 2.310969352722168, "step": 1769 }, { "epoch": 0.9161490683229814, "grad_norm": 0.7907344102859497, "learning_rate": 8.32070235730896e-06, "loss": 0.1513414978981018, "rewards/accuracies": 0.9375, "rewards/chosen": 35.19864273071289, "rewards/margins": 32.859642028808594, "rewards/rejected": 2.3427963256835938, "step": 1770 }, { "epoch": 0.9166666666666666, "grad_norm": 0.5727226734161377, "learning_rate": 8.318568980709848e-06, "loss": 0.15379302203655243, "rewards/accuracies": 0.921875, "rewards/chosen": 34.14933776855469, "rewards/margins": 31.666183471679688, "rewards/rejected": 2.4862489700317383, "step": 1771 }, { "epoch": 0.917184265010352, "grad_norm": 1.5800524950027466, "learning_rate": 8.316434523726672e-06, "loss": 0.21215319633483887, "rewards/accuracies": 0.8671875, "rewards/chosen": 30.3309326171875, "rewards/margins": 27.82514190673828, "rewards/rejected": 2.511760711669922, "step": 1772 }, { "epoch": 0.9177018633540373, "grad_norm": 0.5396172404289246, "learning_rate": 8.314298987054323e-06, "loss": 0.11130864173173904, "rewards/accuracies": 0.9375, "rewards/chosen": 37.653289794921875, "rewards/margins": 34.71189880371094, "rewards/rejected": 2.940869092941284, "step": 1773 }, { "epoch": 0.9182194616977226, "grad_norm": 0.8547749519348145, "learning_rate": 8.312162371388037e-06, "loss": 0.1847030073404312, "rewards/accuracies": 0.890625, "rewards/chosen": 35.63056182861328, "rewards/margins": 32.100486755371094, "rewards/rejected": 3.5315704345703125, "step": 1774 }, { "epoch": 0.9187370600414079, "grad_norm": 0.874280571937561, "learning_rate": 8.310024677423408e-06, "loss": 0.22362901270389557, "rewards/accuracies": 0.90625, "rewards/chosen": 30.922412872314453, "rewards/margins": 27.61572265625, "rewards/rejected": 3.305492877960205, "step": 1775 }, { "epoch": 0.9192546583850931, "grad_norm": 1.7944471836090088, "learning_rate": 8.307885905856376e-06, "loss": 0.2212931513786316, "rewards/accuracies": 0.859375, "rewards/chosen": 31.313461303710938, "rewards/margins": 28.425411224365234, "rewards/rejected": 2.882534980773926, "step": 1776 }, { "epoch": 0.9197722567287785, "grad_norm": 1.158393144607544, "learning_rate": 8.305746057383233e-06, "loss": 0.22400090098381042, "rewards/accuracies": 0.8671875, "rewards/chosen": 25.467973709106445, "rewards/margins": 23.413162231445312, "rewards/rejected": 2.0517120361328125, "step": 1777 }, { "epoch": 0.9202898550724637, "grad_norm": 1.9791643619537354, "learning_rate": 8.303605132700623e-06, "loss": 0.17883998155593872, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.56555938720703, "rewards/margins": 27.869815826416016, "rewards/rejected": 2.700927734375, "step": 1778 }, { "epoch": 0.9208074534161491, "grad_norm": 2.090447425842285, "learning_rate": 8.30146313250554e-06, "loss": 0.1426074504852295, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.13141632080078, "rewards/margins": 31.964385986328125, "rewards/rejected": 2.1545791625976562, "step": 1779 }, { "epoch": 0.9213250517598344, "grad_norm": 0.5787361860275269, "learning_rate": 8.299320057495326e-06, "loss": 0.16026350855827332, "rewards/accuracies": 0.90625, "rewards/chosen": 33.56062316894531, "rewards/margins": 30.756099700927734, "rewards/rejected": 2.7984390258789062, "step": 1780 }, { "epoch": 0.9218426501035196, "grad_norm": 0.5887353420257568, "learning_rate": 8.297175908367676e-06, "loss": 0.1393968015909195, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.929290771484375, "rewards/margins": 31.222240447998047, "rewards/rejected": 2.7103660106658936, "step": 1781 }, { "epoch": 0.922360248447205, "grad_norm": 1.1081064939498901, "learning_rate": 8.295030685820632e-06, "loss": 0.15789617598056793, "rewards/accuracies": 0.921875, "rewards/chosen": 36.27117919921875, "rewards/margins": 33.57670593261719, "rewards/rejected": 2.6971702575683594, "step": 1782 }, { "epoch": 0.9228778467908902, "grad_norm": 1.076614260673523, "learning_rate": 8.292884390552588e-06, "loss": 0.24285158514976501, "rewards/accuracies": 0.90625, "rewards/chosen": 34.093746185302734, "rewards/margins": 30.972763061523438, "rewards/rejected": 3.1326398849487305, "step": 1783 }, { "epoch": 0.9233954451345756, "grad_norm": 0.7032462358474731, "learning_rate": 8.290737023262286e-06, "loss": 0.1503113955259323, "rewards/accuracies": 0.9375, "rewards/chosen": 32.99647521972656, "rewards/margins": 30.526031494140625, "rewards/rejected": 2.4824371337890625, "step": 1784 }, { "epoch": 0.9239130434782609, "grad_norm": 0.7868230938911438, "learning_rate": 8.288588584648813e-06, "loss": 0.1939258873462677, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.588363647460938, "rewards/margins": 28.791397094726562, "rewards/rejected": 1.7915916442871094, "step": 1785 }, { "epoch": 0.9244306418219461, "grad_norm": 2.068950891494751, "learning_rate": 8.286439075411613e-06, "loss": 0.1981777399778366, "rewards/accuracies": 0.8828125, "rewards/chosen": 30.961179733276367, "rewards/margins": 29.499893188476562, "rewards/rejected": 1.4661407470703125, "step": 1786 }, { "epoch": 0.9249482401656315, "grad_norm": 1.1367098093032837, "learning_rate": 8.284288496250475e-06, "loss": 0.18105608224868774, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.06330108642578, "rewards/margins": 26.550865173339844, "rewards/rejected": 2.513298988342285, "step": 1787 }, { "epoch": 0.9254658385093167, "grad_norm": 0.9683094024658203, "learning_rate": 8.282136847865532e-06, "loss": 0.16124102473258972, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.157520294189453, "rewards/margins": 28.83623504638672, "rewards/rejected": 1.3219575881958008, "step": 1788 }, { "epoch": 0.9259834368530021, "grad_norm": 0.9072991013526917, "learning_rate": 8.27998413095727e-06, "loss": 0.18756096065044403, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.63028907775879, "rewards/margins": 25.55990982055664, "rewards/rejected": 2.0715932846069336, "step": 1789 }, { "epoch": 0.9265010351966874, "grad_norm": 0.8461597561836243, "learning_rate": 8.277830346226522e-06, "loss": 0.16402684152126312, "rewards/accuracies": 0.90625, "rewards/chosen": 29.778446197509766, "rewards/margins": 27.78003692626953, "rewards/rejected": 1.9986553192138672, "step": 1790 }, { "epoch": 0.9270186335403726, "grad_norm": 0.5348219275474548, "learning_rate": 8.275675494374469e-06, "loss": 0.13015107810497284, "rewards/accuracies": 0.96875, "rewards/chosen": 33.04402542114258, "rewards/margins": 30.782852172851562, "rewards/rejected": 2.263543128967285, "step": 1791 }, { "epoch": 0.927536231884058, "grad_norm": 1.079889178276062, "learning_rate": 8.273519576102635e-06, "loss": 0.18885162472724915, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.354001998901367, "rewards/margins": 27.303359985351562, "rewards/rejected": 2.0465869903564453, "step": 1792 }, { "epoch": 0.9280538302277432, "grad_norm": 2.0932118892669678, "learning_rate": 8.2713625921129e-06, "loss": 0.16207468509674072, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.822172164916992, "rewards/margins": 28.292770385742188, "rewards/rejected": 1.5398178100585938, "step": 1793 }, { "epoch": 0.9285714285714286, "grad_norm": 1.1719629764556885, "learning_rate": 8.269204543107481e-06, "loss": 0.23032882809638977, "rewards/accuracies": 0.890625, "rewards/chosen": 29.242216110229492, "rewards/margins": 26.898113250732422, "rewards/rejected": 2.3423118591308594, "step": 1794 }, { "epoch": 0.9290890269151139, "grad_norm": 0.9864116311073303, "learning_rate": 8.267045429788948e-06, "loss": 0.19551372528076172, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.280834197998047, "rewards/margins": 27.766250610351562, "rewards/rejected": 2.5198707580566406, "step": 1795 }, { "epoch": 0.9296066252587992, "grad_norm": 1.033394694328308, "learning_rate": 8.264885252860216e-06, "loss": 0.17498914897441864, "rewards/accuracies": 0.90625, "rewards/chosen": 36.286624908447266, "rewards/margins": 33.71085739135742, "rewards/rejected": 2.5792436599731445, "step": 1796 }, { "epoch": 0.9301242236024845, "grad_norm": 1.3704549074172974, "learning_rate": 8.262724013024548e-06, "loss": 0.2524619698524475, "rewards/accuracies": 0.8671875, "rewards/chosen": 29.39556121826172, "rewards/margins": 26.959857940673828, "rewards/rejected": 2.443450927734375, "step": 1797 }, { "epoch": 0.9306418219461697, "grad_norm": 1.8521296977996826, "learning_rate": 8.260561710985548e-06, "loss": 0.1694856435060501, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.42848205566406, "rewards/margins": 32.892791748046875, "rewards/rejected": 2.5482006072998047, "step": 1798 }, { "epoch": 0.9311594202898551, "grad_norm": 0.5852417349815369, "learning_rate": 8.258398347447172e-06, "loss": 0.19975882768630981, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.24221420288086, "rewards/margins": 25.30181884765625, "rewards/rejected": 2.9455623626708984, "step": 1799 }, { "epoch": 0.9316770186335404, "grad_norm": 0.6589427590370178, "learning_rate": 8.256233923113715e-06, "loss": 0.20445968210697174, "rewards/accuracies": 0.890625, "rewards/chosen": 32.299827575683594, "rewards/margins": 30.006744384765625, "rewards/rejected": 2.3000259399414062, "step": 1800 }, { "epoch": 0.9321946169772257, "grad_norm": 0.6278084516525269, "learning_rate": 8.254068438689826e-06, "loss": 0.1996869146823883, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.35877227783203, "rewards/margins": 30.785438537597656, "rewards/rejected": 3.577411651611328, "step": 1801 }, { "epoch": 0.932712215320911, "grad_norm": 0.6899591088294983, "learning_rate": 8.251901894880489e-06, "loss": 0.166069895029068, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.263397216796875, "rewards/margins": 32.63512420654297, "rewards/rejected": 4.618932723999023, "step": 1802 }, { "epoch": 0.9332298136645962, "grad_norm": 1.1412142515182495, "learning_rate": 8.24973429239104e-06, "loss": 0.21278303861618042, "rewards/accuracies": 0.9375, "rewards/chosen": 35.55363082885742, "rewards/margins": 31.82852554321289, "rewards/rejected": 3.7152743339538574, "step": 1803 }, { "epoch": 0.9337474120082816, "grad_norm": 2.3753695487976074, "learning_rate": 8.24756563192716e-06, "loss": 0.19876675307750702, "rewards/accuracies": 0.921875, "rewards/chosen": 36.01068878173828, "rewards/margins": 32.53363037109375, "rewards/rejected": 3.4814000129699707, "step": 1804 }, { "epoch": 0.9342650103519669, "grad_norm": 1.2842680215835571, "learning_rate": 8.245395914194868e-06, "loss": 0.14971080422401428, "rewards/accuracies": 0.90625, "rewards/chosen": 34.76643753051758, "rewards/margins": 31.960952758789062, "rewards/rejected": 2.8069534301757812, "step": 1805 }, { "epoch": 0.9347826086956522, "grad_norm": 1.6513830423355103, "learning_rate": 8.243225139900533e-06, "loss": 0.19608816504478455, "rewards/accuracies": 0.921875, "rewards/chosen": 31.527685165405273, "rewards/margins": 29.003849029541016, "rewards/rejected": 2.5233192443847656, "step": 1806 }, { "epoch": 0.9353002070393375, "grad_norm": 2.5383262634277344, "learning_rate": 8.241053309750868e-06, "loss": 0.27214786410331726, "rewards/accuracies": 0.875, "rewards/chosen": 28.047521591186523, "rewards/margins": 26.113449096679688, "rewards/rejected": 1.9397644996643066, "step": 1807 }, { "epoch": 0.9358178053830227, "grad_norm": 1.4654618501663208, "learning_rate": 8.238880424452925e-06, "loss": 0.21060211956501007, "rewards/accuracies": 0.859375, "rewards/chosen": 32.125640869140625, "rewards/margins": 28.72742462158203, "rewards/rejected": 3.4081039428710938, "step": 1808 }, { "epoch": 0.9363354037267081, "grad_norm": 1.155383825302124, "learning_rate": 8.236706484714103e-06, "loss": 0.2355058789253235, "rewards/accuracies": 0.8515625, "rewards/chosen": 30.010210037231445, "rewards/margins": 27.37004852294922, "rewards/rejected": 2.649616241455078, "step": 1809 }, { "epoch": 0.9368530020703933, "grad_norm": 0.9094429612159729, "learning_rate": 8.234531491242145e-06, "loss": 0.168870747089386, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.31657409667969, "rewards/margins": 30.320194244384766, "rewards/rejected": 3.997762680053711, "step": 1810 }, { "epoch": 0.9373706004140787, "grad_norm": 0.8365866541862488, "learning_rate": 8.232355444745136e-06, "loss": 0.23077121376991272, "rewards/accuracies": 0.8828125, "rewards/chosen": 33.096351623535156, "rewards/margins": 30.039928436279297, "rewards/rejected": 3.0602869987487793, "step": 1811 }, { "epoch": 0.937888198757764, "grad_norm": 0.6862980127334595, "learning_rate": 8.230178345931504e-06, "loss": 0.15453383326530457, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.49225997924805, "rewards/margins": 31.075546264648438, "rewards/rejected": 3.4158401489257812, "step": 1812 }, { "epoch": 0.9384057971014492, "grad_norm": 0.777923583984375, "learning_rate": 8.228000195510017e-06, "loss": 0.19903187453746796, "rewards/accuracies": 0.875, "rewards/chosen": 30.563243865966797, "rewards/margins": 27.43938446044922, "rewards/rejected": 3.126857280731201, "step": 1813 }, { "epoch": 0.9389233954451346, "grad_norm": 1.3204724788665771, "learning_rate": 8.225820994189789e-06, "loss": 0.19185250997543335, "rewards/accuracies": 0.890625, "rewards/chosen": 35.51801300048828, "rewards/margins": 32.51884460449219, "rewards/rejected": 2.995269775390625, "step": 1814 }, { "epoch": 0.9394409937888198, "grad_norm": 1.073704719543457, "learning_rate": 8.223640742680275e-06, "loss": 0.22411125898361206, "rewards/accuracies": 0.859375, "rewards/chosen": 30.143653869628906, "rewards/margins": 26.960647583007812, "rewards/rejected": 3.1834936141967773, "step": 1815 }, { "epoch": 0.9399585921325052, "grad_norm": 0.5152602791786194, "learning_rate": 8.221459441691268e-06, "loss": 0.20270389318466187, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.184782028198242, "rewards/margins": 25.545562744140625, "rewards/rejected": 3.639728546142578, "step": 1816 }, { "epoch": 0.9404761904761905, "grad_norm": 0.6668833494186401, "learning_rate": 8.219277091932914e-06, "loss": 0.15710334479808807, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.81081199645996, "rewards/margins": 26.055335998535156, "rewards/rejected": 2.7455501556396484, "step": 1817 }, { "epoch": 0.9409937888198758, "grad_norm": 0.720728874206543, "learning_rate": 8.217093694115684e-06, "loss": 0.17957854270935059, "rewards/accuracies": 0.90625, "rewards/chosen": 32.24443054199219, "rewards/margins": 29.728225708007812, "rewards/rejected": 2.526317596435547, "step": 1818 }, { "epoch": 0.9415113871635611, "grad_norm": 0.7021108865737915, "learning_rate": 8.214909248950404e-06, "loss": 0.148418128490448, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.932308197021484, "rewards/margins": 30.9017333984375, "rewards/rejected": 4.034570693969727, "step": 1819 }, { "epoch": 0.9420289855072463, "grad_norm": 1.166974663734436, "learning_rate": 8.212723757148235e-06, "loss": 0.1755075454711914, "rewards/accuracies": 0.9375, "rewards/chosen": 33.607086181640625, "rewards/margins": 29.90459442138672, "rewards/rejected": 3.697469711303711, "step": 1820 }, { "epoch": 0.9425465838509317, "grad_norm": 1.1173312664031982, "learning_rate": 8.210537219420681e-06, "loss": 0.1759839802980423, "rewards/accuracies": 0.890625, "rewards/chosen": 32.72624588012695, "rewards/margins": 28.864498138427734, "rewards/rejected": 3.858388900756836, "step": 1821 }, { "epoch": 0.943064182194617, "grad_norm": 0.734448254108429, "learning_rate": 8.208349636479582e-06, "loss": 0.1964939832687378, "rewards/accuracies": 0.875, "rewards/chosen": 35.87202835083008, "rewards/margins": 31.625450134277344, "rewards/rejected": 4.257354736328125, "step": 1822 }, { "epoch": 0.9435817805383023, "grad_norm": 2.4275312423706055, "learning_rate": 8.206161009037123e-06, "loss": 0.21214693784713745, "rewards/accuracies": 0.90625, "rewards/chosen": 34.32569122314453, "rewards/margins": 30.532608032226562, "rewards/rejected": 3.792816162109375, "step": 1823 }, { "epoch": 0.9440993788819876, "grad_norm": 4.381614685058594, "learning_rate": 8.203971337805827e-06, "loss": 0.30968207120895386, "rewards/accuracies": 0.84375, "rewards/chosen": 29.104618072509766, "rewards/margins": 26.075885772705078, "rewards/rejected": 3.025376319885254, "step": 1824 }, { "epoch": 0.9446169772256728, "grad_norm": 0.9305154085159302, "learning_rate": 8.201780623498559e-06, "loss": 0.21204861998558044, "rewards/accuracies": 0.8671875, "rewards/chosen": 32.95484161376953, "rewards/margins": 28.582290649414062, "rewards/rejected": 4.376824378967285, "step": 1825 }, { "epoch": 0.9451345755693582, "grad_norm": 1.8601526021957397, "learning_rate": 8.199588866828521e-06, "loss": 0.19954922795295715, "rewards/accuracies": 0.921875, "rewards/chosen": 35.82323455810547, "rewards/margins": 32.03990173339844, "rewards/rejected": 3.7813720703125, "step": 1826 }, { "epoch": 0.9456521739130435, "grad_norm": 1.1175510883331299, "learning_rate": 8.197396068509254e-06, "loss": 0.20999786257743835, "rewards/accuracies": 0.890625, "rewards/chosen": 35.73298263549805, "rewards/margins": 31.337738037109375, "rewards/rejected": 4.401139259338379, "step": 1827 }, { "epoch": 0.9461697722567288, "grad_norm": 0.697384774684906, "learning_rate": 8.195202229254637e-06, "loss": 0.14818421006202698, "rewards/accuracies": 0.90625, "rewards/chosen": 34.83209991455078, "rewards/margins": 30.710037231445312, "rewards/rejected": 4.116574287414551, "step": 1828 }, { "epoch": 0.9466873706004141, "grad_norm": 0.6456895470619202, "learning_rate": 8.193007349778896e-06, "loss": 0.17218539118766785, "rewards/accuracies": 0.890625, "rewards/chosen": 38.5451545715332, "rewards/margins": 33.000946044921875, "rewards/rejected": 5.542137145996094, "step": 1829 }, { "epoch": 0.9472049689440993, "grad_norm": 0.794218897819519, "learning_rate": 8.190811430796588e-06, "loss": 0.19630903005599976, "rewards/accuracies": 0.890625, "rewards/chosen": 29.010635375976562, "rewards/margins": 25.647552490234375, "rewards/rejected": 3.3610363006591797, "step": 1830 }, { "epoch": 0.9477225672877847, "grad_norm": 1.4345372915267944, "learning_rate": 8.188614473022607e-06, "loss": 0.1354345679283142, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.20156478881836, "rewards/margins": 32.856605529785156, "rewards/rejected": 4.339405059814453, "step": 1831 }, { "epoch": 0.94824016563147, "grad_norm": 1.2393293380737305, "learning_rate": 8.186416477172193e-06, "loss": 0.2448095828294754, "rewards/accuracies": 0.90625, "rewards/chosen": 34.177616119384766, "rewards/margins": 30.8504638671875, "rewards/rejected": 3.3199095726013184, "step": 1832 }, { "epoch": 0.9487577639751553, "grad_norm": 1.7140429019927979, "learning_rate": 8.184217443960915e-06, "loss": 0.2062952220439911, "rewards/accuracies": 0.90625, "rewards/chosen": 35.69087600708008, "rewards/margins": 31.779273986816406, "rewards/rejected": 3.910520553588867, "step": 1833 }, { "epoch": 0.9492753623188406, "grad_norm": 1.3780848979949951, "learning_rate": 8.182017374104687e-06, "loss": 0.23110570013523102, "rewards/accuracies": 0.8828125, "rewards/chosen": 35.30015182495117, "rewards/margins": 31.90105438232422, "rewards/rejected": 3.407665967941284, "step": 1834 }, { "epoch": 0.9497929606625258, "grad_norm": 1.0590943098068237, "learning_rate": 8.179816268319758e-06, "loss": 0.14295867085456848, "rewards/accuracies": 0.9375, "rewards/chosen": 33.257728576660156, "rewards/margins": 30.179977416992188, "rewards/rejected": 3.0865020751953125, "step": 1835 }, { "epoch": 0.9503105590062112, "grad_norm": 1.798452377319336, "learning_rate": 8.17761412732271e-06, "loss": 0.22283175587654114, "rewards/accuracies": 0.875, "rewards/chosen": 36.98000717163086, "rewards/margins": 34.59971618652344, "rewards/rejected": 2.3751978874206543, "step": 1836 }, { "epoch": 0.9508281573498965, "grad_norm": 1.006158471107483, "learning_rate": 8.175410951830471e-06, "loss": 0.16691215336322784, "rewards/accuracies": 0.8984375, "rewards/chosen": 37.67566680908203, "rewards/margins": 33.689788818359375, "rewards/rejected": 3.9701642990112305, "step": 1837 }, { "epoch": 0.9513457556935818, "grad_norm": 1.1144523620605469, "learning_rate": 8.173206742560298e-06, "loss": 0.23430196940898895, "rewards/accuracies": 0.890625, "rewards/chosen": 30.82847023010254, "rewards/margins": 27.609397888183594, "rewards/rejected": 3.2232742309570312, "step": 1838 }, { "epoch": 0.9518633540372671, "grad_norm": 0.9464515447616577, "learning_rate": 8.171001500229784e-06, "loss": 0.23161065578460693, "rewards/accuracies": 0.875, "rewards/chosen": 29.025352478027344, "rewards/margins": 26.8284912109375, "rewards/rejected": 2.1928138732910156, "step": 1839 }, { "epoch": 0.9523809523809523, "grad_norm": 0.9903033971786499, "learning_rate": 8.16879522555687e-06, "loss": 0.13008932769298553, "rewards/accuracies": 0.953125, "rewards/chosen": 36.1993522644043, "rewards/margins": 33.75431823730469, "rewards/rejected": 2.453885078430176, "step": 1840 }, { "epoch": 0.9528985507246377, "grad_norm": 0.6640124320983887, "learning_rate": 8.166587919259816e-06, "loss": 0.15091869235038757, "rewards/accuracies": 0.921875, "rewards/chosen": 34.31755828857422, "rewards/margins": 32.461578369140625, "rewards/rejected": 1.8478279113769531, "step": 1841 }, { "epoch": 0.953416149068323, "grad_norm": 0.8775652647018433, "learning_rate": 8.164379582057233e-06, "loss": 0.17389419674873352, "rewards/accuracies": 0.921875, "rewards/chosen": 28.760536193847656, "rewards/margins": 26.39453125, "rewards/rejected": 2.3689022064208984, "step": 1842 }, { "epoch": 0.9539337474120083, "grad_norm": 0.598686695098877, "learning_rate": 8.162170214668057e-06, "loss": 0.17130175232887268, "rewards/accuracies": 0.90625, "rewards/chosen": 34.56675720214844, "rewards/margins": 32.1214599609375, "rewards/rejected": 2.4474658966064453, "step": 1843 }, { "epoch": 0.9544513457556936, "grad_norm": 0.9206702709197998, "learning_rate": 8.159959817811565e-06, "loss": 0.17602381110191345, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.457624435424805, "rewards/margins": 29.909637451171875, "rewards/rejected": 1.5448112487792969, "step": 1844 }, { "epoch": 0.9549689440993789, "grad_norm": 0.7991102933883667, "learning_rate": 8.157748392207367e-06, "loss": 0.2053273618221283, "rewards/accuracies": 0.890625, "rewards/chosen": 33.043460845947266, "rewards/margins": 30.759784698486328, "rewards/rejected": 2.274595260620117, "step": 1845 }, { "epoch": 0.9554865424430642, "grad_norm": 1.1051589250564575, "learning_rate": 8.155535938575409e-06, "loss": 0.16104966402053833, "rewards/accuracies": 0.9375, "rewards/chosen": 29.759571075439453, "rewards/margins": 27.394515991210938, "rewards/rejected": 2.3572921752929688, "step": 1846 }, { "epoch": 0.9560041407867494, "grad_norm": 1.9084161520004272, "learning_rate": 8.153322457635971e-06, "loss": 0.20949053764343262, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.226905822753906, "rewards/margins": 27.440658569335938, "rewards/rejected": 1.7966394424438477, "step": 1847 }, { "epoch": 0.9565217391304348, "grad_norm": 0.6008750200271606, "learning_rate": 8.151107950109669e-06, "loss": 0.2066432386636734, "rewards/accuracies": 0.921875, "rewards/chosen": 32.887474060058594, "rewards/margins": 30.193496704101562, "rewards/rejected": 2.7071876525878906, "step": 1848 }, { "epoch": 0.9570393374741201, "grad_norm": 0.7790365815162659, "learning_rate": 8.14889241671745e-06, "loss": 0.1771983504295349, "rewards/accuracies": 0.890625, "rewards/chosen": 40.58651351928711, "rewards/margins": 36.38456726074219, "rewards/rejected": 4.195896148681641, "step": 1849 }, { "epoch": 0.9575569358178054, "grad_norm": 1.035225510597229, "learning_rate": 8.146675858180596e-06, "loss": 0.24805565178394318, "rewards/accuracies": 0.8671875, "rewards/chosen": 35.82491683959961, "rewards/margins": 31.95262908935547, "rewards/rejected": 3.878429889678955, "step": 1850 }, { "epoch": 0.9580745341614907, "grad_norm": 0.6303760409355164, "learning_rate": 8.144458275220725e-06, "loss": 0.15268269181251526, "rewards/accuracies": 0.921875, "rewards/chosen": 37.064903259277344, "rewards/margins": 34.20648956298828, "rewards/rejected": 2.84796142578125, "step": 1851 }, { "epoch": 0.9585921325051759, "grad_norm": 1.6351605653762817, "learning_rate": 8.14223966855979e-06, "loss": 0.24725614488124847, "rewards/accuracies": 0.8828125, "rewards/chosen": 40.48683547973633, "rewards/margins": 35.515159606933594, "rewards/rejected": 4.970163345336914, "step": 1852 }, { "epoch": 0.9591097308488613, "grad_norm": 1.0073310136795044, "learning_rate": 8.140020038920071e-06, "loss": 0.16318616271018982, "rewards/accuracies": 0.921875, "rewards/chosen": 40.97657012939453, "rewards/margins": 36.328392028808594, "rewards/rejected": 4.6403350830078125, "step": 1853 }, { "epoch": 0.9596273291925466, "grad_norm": 0.7394909262657166, "learning_rate": 8.137799387024184e-06, "loss": 0.17902231216430664, "rewards/accuracies": 0.875, "rewards/chosen": 38.30071258544922, "rewards/margins": 34.359947204589844, "rewards/rejected": 3.948221206665039, "step": 1854 }, { "epoch": 0.9601449275362319, "grad_norm": 1.0779021978378296, "learning_rate": 8.135577713595081e-06, "loss": 0.16420061886310577, "rewards/accuracies": 0.9375, "rewards/chosen": 39.18962478637695, "rewards/margins": 34.63518142700195, "rewards/rejected": 4.560558319091797, "step": 1855 }, { "epoch": 0.9606625258799172, "grad_norm": 0.839261531829834, "learning_rate": 8.133355019356042e-06, "loss": 0.15714675188064575, "rewards/accuracies": 0.875, "rewards/chosen": 44.65923309326172, "rewards/margins": 39.60687255859375, "rewards/rejected": 5.042520523071289, "step": 1856 }, { "epoch": 0.9611801242236024, "grad_norm": 0.7322729229927063, "learning_rate": 8.13113130503068e-06, "loss": 0.15685051679611206, "rewards/accuracies": 0.890625, "rewards/chosen": 40.83941650390625, "rewards/margins": 34.05317687988281, "rewards/rejected": 6.801267623901367, "step": 1857 }, { "epoch": 0.9616977225672878, "grad_norm": 0.5893954634666443, "learning_rate": 8.128906571342945e-06, "loss": 0.1987810730934143, "rewards/accuracies": 0.90625, "rewards/chosen": 39.26976776123047, "rewards/margins": 34.08258056640625, "rewards/rejected": 5.178047180175781, "step": 1858 }, { "epoch": 0.9622153209109731, "grad_norm": 1.1567400693893433, "learning_rate": 8.12668081901711e-06, "loss": 0.20020286738872528, "rewards/accuracies": 0.8984375, "rewards/chosen": 40.70914840698242, "rewards/margins": 35.105751037597656, "rewards/rejected": 5.614892959594727, "step": 1859 }, { "epoch": 0.9627329192546584, "grad_norm": 0.7402109503746033, "learning_rate": 8.124454048777787e-06, "loss": 0.14976046979427338, "rewards/accuracies": 0.9375, "rewards/chosen": 44.380943298339844, "rewards/margins": 37.02777099609375, "rewards/rejected": 7.355524063110352, "step": 1860 }, { "epoch": 0.9632505175983437, "grad_norm": 1.9492244720458984, "learning_rate": 8.12222626134992e-06, "loss": 0.17954346537590027, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.56085968017578, "rewards/margins": 32.898780822753906, "rewards/rejected": 5.665931701660156, "step": 1861 }, { "epoch": 0.9637681159420289, "grad_norm": 2.697079658508301, "learning_rate": 8.119997457458777e-06, "loss": 0.2103833556175232, "rewards/accuracies": 0.890625, "rewards/chosen": 41.78877258300781, "rewards/margins": 34.30955123901367, "rewards/rejected": 7.479572296142578, "step": 1862 }, { "epoch": 0.9642857142857143, "grad_norm": 1.470829963684082, "learning_rate": 8.117767637829963e-06, "loss": 0.17277899384498596, "rewards/accuracies": 0.8984375, "rewards/chosen": 44.70098114013672, "rewards/margins": 36.773345947265625, "rewards/rejected": 7.932243347167969, "step": 1863 }, { "epoch": 0.9648033126293996, "grad_norm": 1.8425205945968628, "learning_rate": 8.115536803189413e-06, "loss": 0.2597222924232483, "rewards/accuracies": 0.9140625, "rewards/chosen": 42.52051544189453, "rewards/margins": 34.39790344238281, "rewards/rejected": 8.128985404968262, "step": 1864 }, { "epoch": 0.9653209109730849, "grad_norm": 0.7898126840591431, "learning_rate": 8.11330495426339e-06, "loss": 0.21755008399486542, "rewards/accuracies": 0.90625, "rewards/chosen": 41.54601287841797, "rewards/margins": 34.43064880371094, "rewards/rejected": 7.113304138183594, "step": 1865 }, { "epoch": 0.9658385093167702, "grad_norm": 1.5433262586593628, "learning_rate": 8.11107209177849e-06, "loss": 0.17198719084262848, "rewards/accuracies": 0.9140625, "rewards/chosen": 40.0862922668457, "rewards/margins": 33.44513702392578, "rewards/rejected": 6.63137674331665, "step": 1866 }, { "epoch": 0.9663561076604554, "grad_norm": 1.5350797176361084, "learning_rate": 8.108838216461631e-06, "loss": 0.22421786189079285, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.58637619018555, "rewards/margins": 33.67948913574219, "rewards/rejected": 4.902277946472168, "step": 1867 }, { "epoch": 0.9668737060041408, "grad_norm": 0.562946617603302, "learning_rate": 8.106603329040077e-06, "loss": 0.1446518898010254, "rewards/accuracies": 0.953125, "rewards/chosen": 41.50600814819336, "rewards/margins": 36.300262451171875, "rewards/rejected": 5.197710037231445, "step": 1868 }, { "epoch": 0.967391304347826, "grad_norm": 1.1784180402755737, "learning_rate": 8.104367430241405e-06, "loss": 0.20047394931316376, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.43716812133789, "rewards/margins": 31.5347900390625, "rewards/rejected": 4.910518646240234, "step": 1869 }, { "epoch": 0.9679089026915114, "grad_norm": 0.541698694229126, "learning_rate": 8.102130520793532e-06, "loss": 0.1789490282535553, "rewards/accuracies": 0.921875, "rewards/chosen": 46.58872604370117, "rewards/margins": 40.154571533203125, "rewards/rejected": 6.434947967529297, "step": 1870 }, { "epoch": 0.9684265010351967, "grad_norm": 0.9989179968833923, "learning_rate": 8.099892601424697e-06, "loss": 0.11028190702199936, "rewards/accuracies": 0.9453125, "rewards/chosen": 44.09623336791992, "rewards/margins": 39.111572265625, "rewards/rejected": 4.9658708572387695, "step": 1871 }, { "epoch": 0.968944099378882, "grad_norm": 0.9435621500015259, "learning_rate": 8.09765367286347e-06, "loss": 0.20006713271141052, "rewards/accuracies": 0.90625, "rewards/chosen": 39.81044387817383, "rewards/margins": 35.41957092285156, "rewards/rejected": 4.396297454833984, "step": 1872 }, { "epoch": 0.9694616977225673, "grad_norm": 0.6944519281387329, "learning_rate": 8.095413735838756e-06, "loss": 0.1554848700761795, "rewards/accuracies": 0.9140625, "rewards/chosen": 40.7139892578125, "rewards/margins": 35.33860778808594, "rewards/rejected": 5.385124206542969, "step": 1873 }, { "epoch": 0.9699792960662525, "grad_norm": 0.9387738108634949, "learning_rate": 8.093172791079774e-06, "loss": 0.14175769686698914, "rewards/accuracies": 0.9375, "rewards/chosen": 45.896995544433594, "rewards/margins": 42.11669921875, "rewards/rejected": 3.7782211303710938, "step": 1874 }, { "epoch": 0.9704968944099379, "grad_norm": 1.4655585289001465, "learning_rate": 8.090930839316089e-06, "loss": 0.16694989800453186, "rewards/accuracies": 0.90625, "rewards/chosen": 38.78452682495117, "rewards/margins": 34.472999572753906, "rewards/rejected": 4.3157501220703125, "step": 1875 }, { "epoch": 0.9710144927536232, "grad_norm": 0.6836339831352234, "learning_rate": 8.088687881277577e-06, "loss": 0.1482854187488556, "rewards/accuracies": 0.9296875, "rewards/chosen": 41.14496612548828, "rewards/margins": 36.31867980957031, "rewards/rejected": 4.828418731689453, "step": 1876 }, { "epoch": 0.9715320910973085, "grad_norm": 1.4294294118881226, "learning_rate": 8.086443917694454e-06, "loss": 0.23222891986370087, "rewards/accuracies": 0.90625, "rewards/chosen": 46.90340042114258, "rewards/margins": 41.22772216796875, "rewards/rejected": 5.669561386108398, "step": 1877 }, { "epoch": 0.9720496894409938, "grad_norm": 5.253259658813477, "learning_rate": 8.084198949297255e-06, "loss": 0.1825132668018341, "rewards/accuracies": 0.8984375, "rewards/chosen": 42.93160629272461, "rewards/margins": 38.843597412109375, "rewards/rejected": 4.096151351928711, "step": 1878 }, { "epoch": 0.972567287784679, "grad_norm": 1.0733650922775269, "learning_rate": 8.081952976816848e-06, "loss": 0.17207203805446625, "rewards/accuracies": 0.9140625, "rewards/chosen": 43.328712463378906, "rewards/margins": 38.71855163574219, "rewards/rejected": 4.602884292602539, "step": 1879 }, { "epoch": 0.9730848861283644, "grad_norm": 0.9998316764831543, "learning_rate": 8.079706000984426e-06, "loss": 0.1964259147644043, "rewards/accuracies": 0.875, "rewards/chosen": 39.79330062866211, "rewards/margins": 34.54322814941406, "rewards/rejected": 5.240345001220703, "step": 1880 }, { "epoch": 0.9736024844720497, "grad_norm": 1.2574093341827393, "learning_rate": 8.077458022531507e-06, "loss": 0.16009588539600372, "rewards/accuracies": 0.9140625, "rewards/chosen": 45.81298828125, "rewards/margins": 40.65245056152344, "rewards/rejected": 5.163124084472656, "step": 1881 }, { "epoch": 0.974120082815735, "grad_norm": 3.027106761932373, "learning_rate": 8.075209042189935e-06, "loss": 0.22725805640220642, "rewards/accuracies": 0.859375, "rewards/chosen": 42.947994232177734, "rewards/margins": 37.673675537109375, "rewards/rejected": 5.273319244384766, "step": 1882 }, { "epoch": 0.9746376811594203, "grad_norm": 1.3726645708084106, "learning_rate": 8.072959060691886e-06, "loss": 0.18624013662338257, "rewards/accuracies": 0.8984375, "rewards/chosen": 40.78111267089844, "rewards/margins": 36.822547912597656, "rewards/rejected": 3.9588661193847656, "step": 1883 }, { "epoch": 0.9751552795031055, "grad_norm": 1.3298555612564087, "learning_rate": 8.070708078769855e-06, "loss": 0.20205740630626678, "rewards/accuracies": 0.890625, "rewards/chosen": 45.851715087890625, "rewards/margins": 38.1728515625, "rewards/rejected": 7.679756164550781, "step": 1884 }, { "epoch": 0.9756728778467909, "grad_norm": 2.132230281829834, "learning_rate": 8.068456097156663e-06, "loss": 0.24556531012058258, "rewards/accuracies": 0.8515625, "rewards/chosen": 41.543983459472656, "rewards/margins": 36.91361999511719, "rewards/rejected": 4.631129264831543, "step": 1885 }, { "epoch": 0.9761904761904762, "grad_norm": 1.8485630750656128, "learning_rate": 8.066203116585465e-06, "loss": 0.2323819398880005, "rewards/accuracies": 0.8671875, "rewards/chosen": 44.475311279296875, "rewards/margins": 38.64068603515625, "rewards/rejected": 5.834419250488281, "step": 1886 }, { "epoch": 0.9767080745341615, "grad_norm": 0.8935233950614929, "learning_rate": 8.063949137789728e-06, "loss": 0.14403358101844788, "rewards/accuracies": 0.9453125, "rewards/chosen": 44.54601287841797, "rewards/margins": 38.369850158691406, "rewards/rejected": 6.18556022644043, "step": 1887 }, { "epoch": 0.9772256728778468, "grad_norm": 0.7646844983100891, "learning_rate": 8.061694161503257e-06, "loss": 0.15109241008758545, "rewards/accuracies": 0.9140625, "rewards/chosen": 44.25777816772461, "rewards/margins": 38.70751953125, "rewards/rejected": 5.565797805786133, "step": 1888 }, { "epoch": 0.977743271221532, "grad_norm": 3.9231302738189697, "learning_rate": 8.059438188460174e-06, "loss": 0.19284556806087494, "rewards/accuracies": 0.8984375, "rewards/chosen": 39.709449768066406, "rewards/margins": 34.349910736083984, "rewards/rejected": 5.365849018096924, "step": 1889 }, { "epoch": 0.9782608695652174, "grad_norm": 1.1036455631256104, "learning_rate": 8.057181219394924e-06, "loss": 0.19180026650428772, "rewards/accuracies": 0.8984375, "rewards/chosen": 40.97907638549805, "rewards/margins": 36.294036865234375, "rewards/rejected": 4.679889678955078, "step": 1890 }, { "epoch": 0.9787784679089027, "grad_norm": 0.8787549734115601, "learning_rate": 8.054923255042284e-06, "loss": 0.1714659035205841, "rewards/accuracies": 0.90625, "rewards/chosen": 40.642372131347656, "rewards/margins": 35.146453857421875, "rewards/rejected": 5.4856462478637695, "step": 1891 }, { "epoch": 0.979296066252588, "grad_norm": 1.1365739107131958, "learning_rate": 8.05266429613735e-06, "loss": 0.19335120916366577, "rewards/accuracies": 0.9140625, "rewards/chosen": 45.438720703125, "rewards/margins": 39.884010314941406, "rewards/rejected": 5.544441223144531, "step": 1892 }, { "epoch": 0.9798136645962733, "grad_norm": 0.8975664377212524, "learning_rate": 8.050404343415538e-06, "loss": 0.15072089433670044, "rewards/accuracies": 0.9453125, "rewards/chosen": 43.717811584472656, "rewards/margins": 37.85298156738281, "rewards/rejected": 5.86328125, "step": 1893 }, { "epoch": 0.9803312629399586, "grad_norm": 0.6498885154724121, "learning_rate": 8.048143397612598e-06, "loss": 0.19068396091461182, "rewards/accuracies": 0.8984375, "rewards/chosen": 45.767234802246094, "rewards/margins": 38.22726821899414, "rewards/rejected": 7.541278839111328, "step": 1894 }, { "epoch": 0.9808488612836439, "grad_norm": 1.5547313690185547, "learning_rate": 8.045881459464593e-06, "loss": 0.16803856194019318, "rewards/accuracies": 0.8984375, "rewards/chosen": 48.560001373291016, "rewards/margins": 42.302154541015625, "rewards/rejected": 6.2676849365234375, "step": 1895 }, { "epoch": 0.9813664596273292, "grad_norm": 1.296007513999939, "learning_rate": 8.043618529707913e-06, "loss": 0.2005031257867813, "rewards/accuracies": 0.890625, "rewards/chosen": 42.55022430419922, "rewards/margins": 36.369956970214844, "rewards/rejected": 6.16496467590332, "step": 1896 }, { "epoch": 0.9818840579710145, "grad_norm": 0.6594840288162231, "learning_rate": 8.041354609079274e-06, "loss": 0.17261503636837006, "rewards/accuracies": 0.9140625, "rewards/chosen": 45.85466766357422, "rewards/margins": 38.779563903808594, "rewards/rejected": 7.085363388061523, "step": 1897 }, { "epoch": 0.9824016563146998, "grad_norm": 0.681624710559845, "learning_rate": 8.039089698315712e-06, "loss": 0.14904864132404327, "rewards/accuracies": 0.953125, "rewards/chosen": 42.02220153808594, "rewards/margins": 33.998138427734375, "rewards/rejected": 8.03215503692627, "step": 1898 }, { "epoch": 0.9829192546583851, "grad_norm": 1.1191513538360596, "learning_rate": 8.036823798154581e-06, "loss": 0.1752745509147644, "rewards/accuracies": 0.9140625, "rewards/chosen": 45.20621109008789, "rewards/margins": 36.50467300415039, "rewards/rejected": 8.711030960083008, "step": 1899 }, { "epoch": 0.9834368530020704, "grad_norm": 0.6387892961502075, "learning_rate": 8.034556909333565e-06, "loss": 0.15876248478889465, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.4735107421875, "rewards/margins": 29.892074584960938, "rewards/rejected": 9.580224990844727, "step": 1900 }, { "epoch": 0.9839544513457557, "grad_norm": 0.9945669174194336, "learning_rate": 8.032289032590663e-06, "loss": 0.177427738904953, "rewards/accuracies": 0.9140625, "rewards/chosen": 42.76451873779297, "rewards/margins": 33.35661315917969, "rewards/rejected": 9.409032821655273, "step": 1901 }, { "epoch": 0.984472049689441, "grad_norm": 1.5901585817337036, "learning_rate": 8.030020168664201e-06, "loss": 0.23074260354042053, "rewards/accuracies": 0.8671875, "rewards/chosen": 46.789398193359375, "rewards/margins": 36.902976989746094, "rewards/rejected": 9.885148048400879, "step": 1902 }, { "epoch": 0.9849896480331263, "grad_norm": 2.6226916313171387, "learning_rate": 8.027750318292824e-06, "loss": 0.21774840354919434, "rewards/accuracies": 0.9140625, "rewards/chosen": 49.38475799560547, "rewards/margins": 38.41322326660156, "rewards/rejected": 10.976181030273438, "step": 1903 }, { "epoch": 0.9855072463768116, "grad_norm": 1.1067135334014893, "learning_rate": 8.025479482215497e-06, "loss": 0.20779754221439362, "rewards/accuracies": 0.9140625, "rewards/chosen": 45.23387145996094, "rewards/margins": 33.693119049072266, "rewards/rejected": 11.550395965576172, "step": 1904 }, { "epoch": 0.9860248447204969, "grad_norm": 1.2514090538024902, "learning_rate": 8.023207661171507e-06, "loss": 0.2067473828792572, "rewards/accuracies": 0.8671875, "rewards/chosen": 42.55086135864258, "rewards/margins": 33.3212890625, "rewards/rejected": 9.233699798583984, "step": 1905 }, { "epoch": 0.9865424430641822, "grad_norm": 0.9663172960281372, "learning_rate": 8.020934855900462e-06, "loss": 0.13278383016586304, "rewards/accuracies": 0.9453125, "rewards/chosen": 44.8416748046875, "rewards/margins": 35.093505859375, "rewards/rejected": 9.746374130249023, "step": 1906 }, { "epoch": 0.9870600414078675, "grad_norm": 0.7604719996452332, "learning_rate": 8.01866106714229e-06, "loss": 0.13211137056350708, "rewards/accuracies": 0.9375, "rewards/chosen": 46.192649841308594, "rewards/margins": 36.28569793701172, "rewards/rejected": 9.903417587280273, "step": 1907 }, { "epoch": 0.9875776397515528, "grad_norm": 1.489307165145874, "learning_rate": 8.01638629563724e-06, "loss": 0.1286032497882843, "rewards/accuracies": 0.9375, "rewards/chosen": 46.73981475830078, "rewards/margins": 35.08368682861328, "rewards/rejected": 11.647407531738281, "step": 1908 }, { "epoch": 0.9880952380952381, "grad_norm": 1.1786242723464966, "learning_rate": 8.01411054212588e-06, "loss": 0.14720231294631958, "rewards/accuracies": 0.921875, "rewards/chosen": 46.0098762512207, "rewards/margins": 36.674896240234375, "rewards/rejected": 9.327987670898438, "step": 1909 }, { "epoch": 0.9886128364389234, "grad_norm": 1.857369303703308, "learning_rate": 8.011833807349097e-06, "loss": 0.18206726014614105, "rewards/accuracies": 0.9140625, "rewards/chosen": 41.92466735839844, "rewards/margins": 33.9561767578125, "rewards/rejected": 7.966884613037109, "step": 1910 }, { "epoch": 0.9891304347826086, "grad_norm": 1.509301781654358, "learning_rate": 8.009556092048099e-06, "loss": 0.22572444379329681, "rewards/accuracies": 0.90625, "rewards/chosen": 37.86273956298828, "rewards/margins": 29.373550415039062, "rewards/rejected": 8.499222755432129, "step": 1911 }, { "epoch": 0.989648033126294, "grad_norm": 1.2516940832138062, "learning_rate": 8.007277396964414e-06, "loss": 0.1479770541191101, "rewards/accuracies": 0.9140625, "rewards/chosen": 41.247135162353516, "rewards/margins": 32.95466613769531, "rewards/rejected": 8.292214393615723, "step": 1912 }, { "epoch": 0.9901656314699793, "grad_norm": 0.8565351963043213, "learning_rate": 8.004997722839886e-06, "loss": 0.13188916444778442, "rewards/accuracies": 0.90625, "rewards/chosen": 38.921485900878906, "rewards/margins": 29.24791717529297, "rewards/rejected": 9.677558898925781, "step": 1913 }, { "epoch": 0.9906832298136646, "grad_norm": 1.7148568630218506, "learning_rate": 8.00271707041668e-06, "loss": 0.275778591632843, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.787662506103516, "rewards/margins": 31.1236572265625, "rewards/rejected": 7.6656341552734375, "step": 1914 }, { "epoch": 0.9912008281573499, "grad_norm": 0.9224007725715637, "learning_rate": 8.000435440437278e-06, "loss": 0.20922395586967468, "rewards/accuracies": 0.875, "rewards/chosen": 36.45595169067383, "rewards/margins": 29.78112030029297, "rewards/rejected": 6.666934967041016, "step": 1915 }, { "epoch": 0.9917184265010351, "grad_norm": 1.2757744789123535, "learning_rate": 7.998152833644483e-06, "loss": 0.22759412229061127, "rewards/accuracies": 0.875, "rewards/chosen": 34.181068420410156, "rewards/margins": 29.23419189453125, "rewards/rejected": 4.946967124938965, "step": 1916 }, { "epoch": 0.9922360248447205, "grad_norm": 0.9869281053543091, "learning_rate": 7.995869250781412e-06, "loss": 0.1672132909297943, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.33689880371094, "rewards/margins": 29.673965454101562, "rewards/rejected": 6.6572418212890625, "step": 1917 }, { "epoch": 0.9927536231884058, "grad_norm": 0.8435537815093994, "learning_rate": 7.993584692591503e-06, "loss": 0.22596532106399536, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.295551300048828, "rewards/margins": 23.192642211914062, "rewards/rejected": 5.097953796386719, "step": 1918 }, { "epoch": 0.9932712215320911, "grad_norm": 1.207390546798706, "learning_rate": 7.99129915981851e-06, "loss": 0.18640360236167908, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.66136932373047, "rewards/margins": 28.844730377197266, "rewards/rejected": 5.8182830810546875, "step": 1919 }, { "epoch": 0.9937888198757764, "grad_norm": 0.7699341177940369, "learning_rate": 7.989012653206505e-06, "loss": 0.17296984791755676, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.068340301513672, "rewards/margins": 25.24005889892578, "rewards/rejected": 4.82774543762207, "step": 1920 }, { "epoch": 0.9943064182194618, "grad_norm": 1.61282479763031, "learning_rate": 7.986725173499875e-06, "loss": 0.2402791976928711, "rewards/accuracies": 0.890625, "rewards/chosen": 30.437538146972656, "rewards/margins": 25.051963806152344, "rewards/rejected": 5.3768463134765625, "step": 1921 }, { "epoch": 0.994824016563147, "grad_norm": 0.8332760334014893, "learning_rate": 7.98443672144333e-06, "loss": 0.16618777811527252, "rewards/accuracies": 0.9375, "rewards/chosen": 33.06856155395508, "rewards/margins": 27.482463836669922, "rewards/rejected": 5.595150947570801, "step": 1922 }, { "epoch": 0.9953416149068323, "grad_norm": 0.7818955779075623, "learning_rate": 7.982147297781888e-06, "loss": 0.16487543284893036, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.19477844238281, "rewards/margins": 26.43389892578125, "rewards/rejected": 5.754673004150391, "step": 1923 }, { "epoch": 0.9958592132505176, "grad_norm": 1.5290082693099976, "learning_rate": 7.979856903260888e-06, "loss": 0.2387295812368393, "rewards/accuracies": 0.8984375, "rewards/chosen": 31.648056030273438, "rewards/margins": 25.30031967163086, "rewards/rejected": 6.353094100952148, "step": 1924 }, { "epoch": 0.9963768115942029, "grad_norm": 0.6706212162971497, "learning_rate": 7.977565538625985e-06, "loss": 0.16625964641571045, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.684734344482422, "rewards/margins": 25.425262451171875, "rewards/rejected": 5.2600603103637695, "step": 1925 }, { "epoch": 0.9968944099378882, "grad_norm": 0.7951241135597229, "learning_rate": 7.975273204623149e-06, "loss": 0.14418460428714752, "rewards/accuracies": 0.921875, "rewards/chosen": 30.460893630981445, "rewards/margins": 23.951202392578125, "rewards/rejected": 6.509988784790039, "step": 1926 }, { "epoch": 0.9974120082815735, "grad_norm": 0.6895461082458496, "learning_rate": 7.972979901998666e-06, "loss": 0.17918498814105988, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.48994064331055, "rewards/margins": 26.544601440429688, "rewards/rejected": 6.942475318908691, "step": 1927 }, { "epoch": 0.9979296066252588, "grad_norm": 1.0115910768508911, "learning_rate": 7.970685631499138e-06, "loss": 0.19369226694107056, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.60407257080078, "rewards/margins": 23.409435272216797, "rewards/rejected": 5.193903923034668, "step": 1928 }, { "epoch": 0.9984472049689441, "grad_norm": 0.5088260173797607, "learning_rate": 7.96839039387148e-06, "loss": 0.15051868557929993, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.008970260620117, "rewards/margins": 24.144454956054688, "rewards/rejected": 4.859020709991455, "step": 1929 }, { "epoch": 0.9989648033126294, "grad_norm": 1.314374566078186, "learning_rate": 7.966094189862925e-06, "loss": 0.1261398047208786, "rewards/accuracies": 0.9375, "rewards/chosen": 33.757022857666016, "rewards/margins": 27.253969192504883, "rewards/rejected": 6.499626636505127, "step": 1930 }, { "epoch": 0.9994824016563147, "grad_norm": 1.4061182737350464, "learning_rate": 7.963797020221016e-06, "loss": 0.18230602145195007, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.197675704956055, "rewards/margins": 22.812110900878906, "rewards/rejected": 6.387054443359375, "step": 1931 }, { "epoch": 1.0, "grad_norm": 2.058760404586792, "learning_rate": 7.961498885693614e-06, "loss": 0.2642630934715271, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.018455505371094, "rewards/margins": 22.379981994628906, "rewards/rejected": 4.642081260681152, "step": 1932 }, { "epoch": 1.0005175983436854, "grad_norm": 0.6948747634887695, "learning_rate": 7.959199787028895e-06, "loss": 0.1422756314277649, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.007572174072266, "rewards/margins": 27.86231231689453, "rewards/rejected": 5.140989303588867, "step": 1933 }, { "epoch": 1.0010351966873705, "grad_norm": 0.7137649655342102, "learning_rate": 7.956899724975346e-06, "loss": 0.12816110253334045, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.905912399291992, "rewards/margins": 24.634033203125, "rewards/rejected": 6.269369602203369, "step": 1934 }, { "epoch": 1.0015527950310559, "grad_norm": 0.7304075956344604, "learning_rate": 7.95459870028177e-06, "loss": 0.19804596900939941, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.63726043701172, "rewards/margins": 19.771881103515625, "rewards/rejected": 5.864536285400391, "step": 1935 }, { "epoch": 1.0020703933747412, "grad_norm": 0.6951687932014465, "learning_rate": 7.95229671369728e-06, "loss": 0.14096170663833618, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.607139587402344, "rewards/margins": 24.705894470214844, "rewards/rejected": 5.909499168395996, "step": 1936 }, { "epoch": 1.0025879917184266, "grad_norm": 0.5913236141204834, "learning_rate": 7.949993765971307e-06, "loss": 0.11670366674661636, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.536041259765625, "rewards/margins": 23.670120239257812, "rewards/rejected": 5.864893913269043, "step": 1937 }, { "epoch": 1.0031055900621118, "grad_norm": 1.029911994934082, "learning_rate": 7.947689857853588e-06, "loss": 0.13188660144805908, "rewards/accuracies": 0.9375, "rewards/chosen": 35.66395568847656, "rewards/margins": 27.54376983642578, "rewards/rejected": 8.112092018127441, "step": 1938 }, { "epoch": 1.0036231884057971, "grad_norm": 0.5970345735549927, "learning_rate": 7.945384990094181e-06, "loss": 0.11092644184827805, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.23616409301758, "rewards/margins": 26.431224822998047, "rewards/rejected": 6.809645652770996, "step": 1939 }, { "epoch": 1.0041407867494825, "grad_norm": 1.1544243097305298, "learning_rate": 7.94307916344345e-06, "loss": 0.13528935611248016, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.69492721557617, "rewards/margins": 27.495376586914062, "rewards/rejected": 6.193870544433594, "step": 1940 }, { "epoch": 1.0046583850931676, "grad_norm": 0.5886800289154053, "learning_rate": 7.940772378652075e-06, "loss": 0.10174128413200378, "rewards/accuracies": 0.9375, "rewards/chosen": 31.729228973388672, "rewards/margins": 24.531497955322266, "rewards/rejected": 7.194755554199219, "step": 1941 }, { "epoch": 1.005175983436853, "grad_norm": 0.7241038680076599, "learning_rate": 7.938464636471046e-06, "loss": 0.14005222916603088, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.56371307373047, "rewards/margins": 26.111289978027344, "rewards/rejected": 5.4542717933654785, "step": 1942 }, { "epoch": 1.0056935817805384, "grad_norm": 0.9558950662612915, "learning_rate": 7.936155937651667e-06, "loss": 0.11798214912414551, "rewards/accuracies": 0.96875, "rewards/chosen": 36.33123779296875, "rewards/margins": 29.315567016601562, "rewards/rejected": 7.010700225830078, "step": 1943 }, { "epoch": 1.0062111801242235, "grad_norm": 0.8363360166549683, "learning_rate": 7.933846282945545e-06, "loss": 0.10331451147794724, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.55426025390625, "rewards/margins": 28.843719482421875, "rewards/rejected": 6.706057548522949, "step": 1944 }, { "epoch": 1.0067287784679089, "grad_norm": 0.8545397520065308, "learning_rate": 7.93153567310461e-06, "loss": 0.14007225632667542, "rewards/accuracies": 0.921875, "rewards/chosen": 32.53675079345703, "rewards/margins": 25.242294311523438, "rewards/rejected": 7.300636291503906, "step": 1945 }, { "epoch": 1.0072463768115942, "grad_norm": 1.4073646068572998, "learning_rate": 7.929224108881099e-06, "loss": 0.19144314527511597, "rewards/accuracies": 0.890625, "rewards/chosen": 33.286746978759766, "rewards/margins": 26.236358642578125, "rewards/rejected": 7.056537628173828, "step": 1946 }, { "epoch": 1.0077639751552796, "grad_norm": 2.207453727722168, "learning_rate": 7.926911591027554e-06, "loss": 0.1826864778995514, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.54697036743164, "rewards/margins": 27.90808868408203, "rewards/rejected": 8.638495445251465, "step": 1947 }, { "epoch": 1.0082815734989647, "grad_norm": 0.9864372611045837, "learning_rate": 7.924598120296839e-06, "loss": 0.07157861441373825, "rewards/accuracies": 0.96875, "rewards/chosen": 38.253387451171875, "rewards/margins": 30.3931884765625, "rewards/rejected": 7.853126525878906, "step": 1948 }, { "epoch": 1.00879917184265, "grad_norm": 1.281865119934082, "learning_rate": 7.92228369744211e-06, "loss": 0.10171972215175629, "rewards/accuracies": 0.96875, "rewards/chosen": 38.34099197387695, "rewards/margins": 31.998046875, "rewards/rejected": 6.346342086791992, "step": 1949 }, { "epoch": 1.0093167701863355, "grad_norm": 1.1773194074630737, "learning_rate": 7.919968323216854e-06, "loss": 0.08425000309944153, "rewards/accuracies": 0.953125, "rewards/chosen": 36.10847091674805, "rewards/margins": 28.305496215820312, "rewards/rejected": 7.799034118652344, "step": 1950 }, { "epoch": 1.0098343685300206, "grad_norm": 2.0164718627929688, "learning_rate": 7.917651998374853e-06, "loss": 0.11520113050937653, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.99945068359375, "rewards/margins": 30.3544921875, "rewards/rejected": 6.642435073852539, "step": 1951 }, { "epoch": 1.010351966873706, "grad_norm": 4.150607585906982, "learning_rate": 7.915334723670205e-06, "loss": 0.20153126120567322, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.205528259277344, "rewards/margins": 27.21111297607422, "rewards/rejected": 6.9963226318359375, "step": 1952 }, { "epoch": 1.0108695652173914, "grad_norm": 0.924797534942627, "learning_rate": 7.913016499857313e-06, "loss": 0.12084692716598511, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.85137939453125, "rewards/margins": 31.605621337890625, "rewards/rejected": 6.245672225952148, "step": 1953 }, { "epoch": 1.0113871635610765, "grad_norm": 1.4221118688583374, "learning_rate": 7.910697327690895e-06, "loss": 0.16433319449424744, "rewards/accuracies": 0.921875, "rewards/chosen": 39.41644287109375, "rewards/margins": 32.021331787109375, "rewards/rejected": 7.397920608520508, "step": 1954 }, { "epoch": 1.0119047619047619, "grad_norm": 2.0149099826812744, "learning_rate": 7.908377207925973e-06, "loss": 0.14879998564720154, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.84459686279297, "rewards/margins": 26.763580322265625, "rewards/rejected": 6.0756049156188965, "step": 1955 }, { "epoch": 1.0124223602484472, "grad_norm": 3.2683820724487305, "learning_rate": 7.906056141317874e-06, "loss": 0.23080062866210938, "rewards/accuracies": 0.890625, "rewards/chosen": 32.081504821777344, "rewards/margins": 27.522682189941406, "rewards/rejected": 4.560422897338867, "step": 1956 }, { "epoch": 1.0129399585921326, "grad_norm": 1.1173640489578247, "learning_rate": 7.903734128622248e-06, "loss": 0.09506350755691528, "rewards/accuracies": 0.953125, "rewards/chosen": 37.23027038574219, "rewards/margins": 31.05419921875, "rewards/rejected": 6.176155090332031, "step": 1957 }, { "epoch": 1.0134575569358177, "grad_norm": 1.54923415184021, "learning_rate": 7.901411170595035e-06, "loss": 0.09484352171421051, "rewards/accuracies": 0.9765625, "rewards/chosen": 31.537185668945312, "rewards/margins": 27.100738525390625, "rewards/rejected": 4.439714431762695, "step": 1958 }, { "epoch": 1.013975155279503, "grad_norm": 1.5492132902145386, "learning_rate": 7.899087267992491e-06, "loss": 0.12199818342924118, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.69629669189453, "rewards/margins": 27.823394775390625, "rewards/rejected": 5.883630752563477, "step": 1959 }, { "epoch": 1.0144927536231885, "grad_norm": 3.4220893383026123, "learning_rate": 7.896762421571183e-06, "loss": 0.10331089794635773, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.5670166015625, "rewards/margins": 30.624267578125, "rewards/rejected": 4.947834014892578, "step": 1960 }, { "epoch": 1.0150103519668736, "grad_norm": 2.1924750804901123, "learning_rate": 7.894436632087978e-06, "loss": 0.11136471480131149, "rewards/accuracies": 0.96875, "rewards/chosen": 33.81610107421875, "rewards/margins": 28.701583862304688, "rewards/rejected": 5.105485916137695, "step": 1961 }, { "epoch": 1.015527950310559, "grad_norm": 1.4678514003753662, "learning_rate": 7.892109900300057e-06, "loss": 0.12360061705112457, "rewards/accuracies": 0.953125, "rewards/chosen": 32.618370056152344, "rewards/margins": 27.141830444335938, "rewards/rejected": 5.478494644165039, "step": 1962 }, { "epoch": 1.0160455486542443, "grad_norm": 1.6496751308441162, "learning_rate": 7.889782226964902e-06, "loss": 0.11322780698537827, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.43769836425781, "rewards/margins": 30.270233154296875, "rewards/rejected": 4.164575576782227, "step": 1963 }, { "epoch": 1.0165631469979297, "grad_norm": 3.125396251678467, "learning_rate": 7.887453612840303e-06, "loss": 0.2405250072479248, "rewards/accuracies": 0.875, "rewards/chosen": 30.17717933654785, "rewards/margins": 24.923526763916016, "rewards/rejected": 5.256095886230469, "step": 1964 }, { "epoch": 1.0170807453416149, "grad_norm": 2.143758773803711, "learning_rate": 7.885124058684361e-06, "loss": 0.1669934242963791, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.812271118164062, "rewards/margins": 27.86871337890625, "rewards/rejected": 3.9462175369262695, "step": 1965 }, { "epoch": 1.0175983436853002, "grad_norm": 1.719114065170288, "learning_rate": 7.882793565255475e-06, "loss": 0.12307453155517578, "rewards/accuracies": 0.953125, "rewards/chosen": 30.993595123291016, "rewards/margins": 27.78692626953125, "rewards/rejected": 3.1991214752197266, "step": 1966 }, { "epoch": 1.0181159420289856, "grad_norm": 1.1651270389556885, "learning_rate": 7.880462133312355e-06, "loss": 0.14047950506210327, "rewards/accuracies": 0.953125, "rewards/chosen": 33.75669860839844, "rewards/margins": 29.371246337890625, "rewards/rejected": 4.382951736450195, "step": 1967 }, { "epoch": 1.0186335403726707, "grad_norm": 1.3242191076278687, "learning_rate": 7.878129763614018e-06, "loss": 0.199191153049469, "rewards/accuracies": 0.90625, "rewards/chosen": 27.389251708984375, "rewards/margins": 24.57269287109375, "rewards/rejected": 2.807342529296875, "step": 1968 }, { "epoch": 1.019151138716356, "grad_norm": 1.477551817893982, "learning_rate": 7.87579645691978e-06, "loss": 0.23839478194713593, "rewards/accuracies": 0.8828125, "rewards/chosen": 26.138717651367188, "rewards/margins": 22.509544372558594, "rewards/rejected": 3.629077911376953, "step": 1969 }, { "epoch": 1.0196687370600415, "grad_norm": 1.07233726978302, "learning_rate": 7.873462213989269e-06, "loss": 0.1251341998577118, "rewards/accuracies": 0.9375, "rewards/chosen": 29.524398803710938, "rewards/margins": 26.59442138671875, "rewards/rejected": 2.9346961975097656, "step": 1970 }, { "epoch": 1.0201863354037266, "grad_norm": 0.8624070286750793, "learning_rate": 7.871127035582413e-06, "loss": 0.14350667595863342, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.438629150390625, "rewards/margins": 29.106285095214844, "rewards/rejected": 4.334904670715332, "step": 1971 }, { "epoch": 1.020703933747412, "grad_norm": 0.7915343046188354, "learning_rate": 7.868790922459447e-06, "loss": 0.1123565286397934, "rewards/accuracies": 0.9375, "rewards/chosen": 31.93958282470703, "rewards/margins": 28.479736328125, "rewards/rejected": 3.4604015350341797, "step": 1972 }, { "epoch": 1.0212215320910973, "grad_norm": 1.964246153831482, "learning_rate": 7.866453875380907e-06, "loss": 0.163269504904747, "rewards/accuracies": 0.9375, "rewards/chosen": 29.113582611083984, "rewards/margins": 26.23639678955078, "rewards/rejected": 2.8685264587402344, "step": 1973 }, { "epoch": 1.0217391304347827, "grad_norm": 0.7447354793548584, "learning_rate": 7.864115895107638e-06, "loss": 0.10846257209777832, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.015995025634766, "rewards/margins": 25.5528564453125, "rewards/rejected": 1.4659881591796875, "step": 1974 }, { "epoch": 1.0222567287784678, "grad_norm": 0.739332914352417, "learning_rate": 7.861776982400787e-06, "loss": 0.1051291972398758, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.87648391723633, "rewards/margins": 32.523799896240234, "rewards/rejected": 3.353212356567383, "step": 1975 }, { "epoch": 1.0227743271221532, "grad_norm": 1.1471476554870605, "learning_rate": 7.859437138021802e-06, "loss": 0.12727384269237518, "rewards/accuracies": 0.953125, "rewards/chosen": 26.47170639038086, "rewards/margins": 23.833297729492188, "rewards/rejected": 2.641416549682617, "step": 1976 }, { "epoch": 1.0232919254658386, "grad_norm": 1.0722347497940063, "learning_rate": 7.857096362732438e-06, "loss": 0.12628191709518433, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.08744430541992, "rewards/margins": 29.043441772460938, "rewards/rejected": 3.036466598510742, "step": 1977 }, { "epoch": 1.0238095238095237, "grad_norm": 1.3338344097137451, "learning_rate": 7.854754657294747e-06, "loss": 0.13263729214668274, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.78672409057617, "rewards/margins": 29.78099822998047, "rewards/rejected": 3.0106101036071777, "step": 1978 }, { "epoch": 1.024327122153209, "grad_norm": 1.0894449949264526, "learning_rate": 7.852412022471094e-06, "loss": 0.1130685955286026, "rewards/accuracies": 0.9609375, "rewards/chosen": 29.734207153320312, "rewards/margins": 27.710723876953125, "rewards/rejected": 2.0164175033569336, "step": 1979 }, { "epoch": 1.0248447204968945, "grad_norm": 0.8501716256141663, "learning_rate": 7.850068459024136e-06, "loss": 0.17245632410049438, "rewards/accuracies": 0.90625, "rewards/chosen": 31.151445388793945, "rewards/margins": 28.83454132080078, "rewards/rejected": 2.318009853363037, "step": 1980 }, { "epoch": 1.0253623188405796, "grad_norm": 1.1134530305862427, "learning_rate": 7.84772396771684e-06, "loss": 0.11571705341339111, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.230133056640625, "rewards/margins": 25.576904296875, "rewards/rejected": 2.651421546936035, "step": 1981 }, { "epoch": 1.025879917184265, "grad_norm": 1.140215516090393, "learning_rate": 7.845378549312472e-06, "loss": 0.19061140716075897, "rewards/accuracies": 0.90625, "rewards/chosen": 32.34916305541992, "rewards/margins": 29.605270385742188, "rewards/rejected": 2.7492332458496094, "step": 1982 }, { "epoch": 1.0263975155279503, "grad_norm": 2.2605783939361572, "learning_rate": 7.843032204574596e-06, "loss": 0.15990209579467773, "rewards/accuracies": 0.921875, "rewards/chosen": 29.091238021850586, "rewards/margins": 26.1717529296875, "rewards/rejected": 2.9146728515625, "step": 1983 }, { "epoch": 1.0269151138716357, "grad_norm": 1.9275875091552734, "learning_rate": 7.840684934267087e-06, "loss": 0.13663998246192932, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.430179595947266, "rewards/margins": 31.634490966796875, "rewards/rejected": 3.8026089668273926, "step": 1984 }, { "epoch": 1.0274327122153208, "grad_norm": 0.8772211670875549, "learning_rate": 7.838336739154112e-06, "loss": 0.13046693801879883, "rewards/accuracies": 0.9375, "rewards/chosen": 31.575733184814453, "rewards/margins": 28.66918182373047, "rewards/rejected": 2.9123401641845703, "step": 1985 }, { "epoch": 1.0279503105590062, "grad_norm": 1.0282490253448486, "learning_rate": 7.835987620000145e-06, "loss": 0.1353335976600647, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.85932159423828, "rewards/margins": 27.271820068359375, "rewards/rejected": 1.5898513793945312, "step": 1986 }, { "epoch": 1.0284679089026916, "grad_norm": 3.7125957012176514, "learning_rate": 7.833637577569954e-06, "loss": 0.16641990840435028, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.04053497314453, "rewards/margins": 30.153518676757812, "rewards/rejected": 2.8885555267333984, "step": 1987 }, { "epoch": 1.0289855072463767, "grad_norm": 1.0733445882797241, "learning_rate": 7.83128661262862e-06, "loss": 0.10495270788669586, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.07949447631836, "rewards/margins": 31.21485137939453, "rewards/rejected": 3.861574172973633, "step": 1988 }, { "epoch": 1.029503105590062, "grad_norm": 1.077483057975769, "learning_rate": 7.82893472594151e-06, "loss": 0.20532381534576416, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.979209899902344, "rewards/margins": 26.659469604492188, "rewards/rejected": 2.3208231925964355, "step": 1989 }, { "epoch": 1.0300207039337475, "grad_norm": 1.1321353912353516, "learning_rate": 7.826581918274302e-06, "loss": 0.13234028220176697, "rewards/accuracies": 0.953125, "rewards/chosen": 37.5328369140625, "rewards/margins": 32.65309143066406, "rewards/rejected": 4.8835906982421875, "step": 1990 }, { "epoch": 1.0305383022774328, "grad_norm": 1.1776059865951538, "learning_rate": 7.824228190392966e-06, "loss": 0.16019679605960846, "rewards/accuracies": 0.921875, "rewards/chosen": 29.344423294067383, "rewards/margins": 26.921295166015625, "rewards/rejected": 2.4179344177246094, "step": 1991 }, { "epoch": 1.031055900621118, "grad_norm": 0.9248138666152954, "learning_rate": 7.821873543063776e-06, "loss": 0.1345847100019455, "rewards/accuracies": 0.953125, "rewards/chosen": 34.41331481933594, "rewards/margins": 31.534900665283203, "rewards/rejected": 2.878742218017578, "step": 1992 }, { "epoch": 1.0315734989648033, "grad_norm": 1.1487599611282349, "learning_rate": 7.819517977053308e-06, "loss": 0.12013676762580872, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.60123062133789, "rewards/margins": 30.43781280517578, "rewards/rejected": 4.157772064208984, "step": 1993 }, { "epoch": 1.0320910973084887, "grad_norm": 1.1629359722137451, "learning_rate": 7.817161493128428e-06, "loss": 0.14551624655723572, "rewards/accuracies": 0.9375, "rewards/chosen": 41.280845642089844, "rewards/margins": 36.302825927734375, "rewards/rejected": 4.974384307861328, "step": 1994 }, { "epoch": 1.0326086956521738, "grad_norm": 1.0589268207550049, "learning_rate": 7.81480409205631e-06, "loss": 0.11982561647891998, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.42086410522461, "rewards/margins": 28.97344207763672, "rewards/rejected": 2.4448070526123047, "step": 1995 }, { "epoch": 1.0331262939958592, "grad_norm": 1.4660284519195557, "learning_rate": 7.812445774604422e-06, "loss": 0.1532566249370575, "rewards/accuracies": 0.90625, "rewards/chosen": 35.89183807373047, "rewards/margins": 32.23486328125, "rewards/rejected": 3.66119384765625, "step": 1996 }, { "epoch": 1.0336438923395446, "grad_norm": 3.5776453018188477, "learning_rate": 7.81008654154053e-06, "loss": 0.2159094363451004, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.561275482177734, "rewards/margins": 32.63671875, "rewards/rejected": 3.9215545654296875, "step": 1997 }, { "epoch": 1.0341614906832297, "grad_norm": 1.8996905088424683, "learning_rate": 7.807726393632702e-06, "loss": 0.14191500842571259, "rewards/accuracies": 0.9375, "rewards/chosen": 33.012939453125, "rewards/margins": 29.00750732421875, "rewards/rejected": 4.005056381225586, "step": 1998 }, { "epoch": 1.034679089026915, "grad_norm": 1.2437634468078613, "learning_rate": 7.8053653316493e-06, "loss": 0.1471378207206726, "rewards/accuracies": 0.953125, "rewards/chosen": 37.399845123291016, "rewards/margins": 33.19536590576172, "rewards/rejected": 4.198629379272461, "step": 1999 }, { "epoch": 1.0351966873706004, "grad_norm": 1.4107922315597534, "learning_rate": 7.803003356358981e-06, "loss": 0.1775682270526886, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.6565055847168, "rewards/margins": 33.66644287109375, "rewards/rejected": 4.989751815795898, "step": 2000 }, { "epoch": 1.0357142857142858, "grad_norm": 1.3215221166610718, "learning_rate": 7.800640468530709e-06, "loss": 0.16027861833572388, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.609031677246094, "rewards/margins": 28.687118530273438, "rewards/rejected": 3.9253482818603516, "step": 2001 }, { "epoch": 1.036231884057971, "grad_norm": 1.1357802152633667, "learning_rate": 7.798276668933737e-06, "loss": 0.1508065164089203, "rewards/accuracies": 0.9375, "rewards/chosen": 35.3282470703125, "rewards/margins": 31.462921142578125, "rewards/rejected": 3.8704776763916016, "step": 2002 }, { "epoch": 1.0367494824016563, "grad_norm": 1.1428204774856567, "learning_rate": 7.795911958337616e-06, "loss": 0.15833821892738342, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.52845001220703, "rewards/margins": 31.486101150512695, "rewards/rejected": 3.0449275970458984, "step": 2003 }, { "epoch": 1.0372670807453417, "grad_norm": 0.9550669193267822, "learning_rate": 7.793546337512194e-06, "loss": 0.14214415848255157, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.12110137939453, "rewards/margins": 29.671539306640625, "rewards/rejected": 3.4524459838867188, "step": 2004 }, { "epoch": 1.0377846790890268, "grad_norm": 1.3927781581878662, "learning_rate": 7.791179807227618e-06, "loss": 0.13271205127239227, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.72921371459961, "rewards/margins": 28.05462646484375, "rewards/rejected": 3.6854629516601562, "step": 2005 }, { "epoch": 1.0383022774327122, "grad_norm": 2.793468713760376, "learning_rate": 7.788812368254328e-06, "loss": 0.2032919079065323, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.6262264251709, "rewards/margins": 27.035171508789062, "rewards/rejected": 3.590423583984375, "step": 2006 }, { "epoch": 1.0388198757763976, "grad_norm": 0.8189600706100464, "learning_rate": 7.786444021363063e-06, "loss": 0.13299165666103363, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.449466705322266, "rewards/margins": 30.602027893066406, "rewards/rejected": 3.847637176513672, "step": 2007 }, { "epoch": 1.039337474120083, "grad_norm": 1.0308786630630493, "learning_rate": 7.784074767324851e-06, "loss": 0.165266752243042, "rewards/accuracies": 0.921875, "rewards/chosen": 31.480045318603516, "rewards/margins": 27.7908935546875, "rewards/rejected": 3.6911516189575195, "step": 2008 }, { "epoch": 1.039855072463768, "grad_norm": 0.9385057687759399, "learning_rate": 7.781704606911022e-06, "loss": 0.14534181356430054, "rewards/accuracies": 0.9375, "rewards/chosen": 30.47470474243164, "rewards/margins": 26.888031005859375, "rewards/rejected": 3.5869174003601074, "step": 2009 }, { "epoch": 1.0403726708074534, "grad_norm": 1.831478476524353, "learning_rate": 7.7793335408932e-06, "loss": 0.1286630630493164, "rewards/accuracies": 0.9375, "rewards/chosen": 36.231143951416016, "rewards/margins": 31.256938934326172, "rewards/rejected": 4.973361968994141, "step": 2010 }, { "epoch": 1.0408902691511388, "grad_norm": 0.9063906669616699, "learning_rate": 7.776961570043302e-06, "loss": 0.10943756997585297, "rewards/accuracies": 0.953125, "rewards/chosen": 34.27680206298828, "rewards/margins": 30.666610717773438, "rewards/rejected": 3.612213611602783, "step": 2011 }, { "epoch": 1.041407867494824, "grad_norm": 0.837847113609314, "learning_rate": 7.774588695133538e-06, "loss": 0.11847686767578125, "rewards/accuracies": 0.953125, "rewards/chosen": 32.86643981933594, "rewards/margins": 28.908737182617188, "rewards/rejected": 3.9678428173065186, "step": 2012 }, { "epoch": 1.0419254658385093, "grad_norm": 0.8941842913627625, "learning_rate": 7.772214916936418e-06, "loss": 0.14828220009803772, "rewards/accuracies": 0.953125, "rewards/chosen": 33.62214660644531, "rewards/margins": 29.574554443359375, "rewards/rejected": 4.0568695068359375, "step": 2013 }, { "epoch": 1.0424430641821947, "grad_norm": 1.4886969327926636, "learning_rate": 7.769840236224742e-06, "loss": 0.10487888753414154, "rewards/accuracies": 0.953125, "rewards/chosen": 36.580589294433594, "rewards/margins": 31.655975341796875, "rewards/rejected": 4.9224090576171875, "step": 2014 }, { "epoch": 1.0429606625258798, "grad_norm": 0.9090759754180908, "learning_rate": 7.7674646537716e-06, "loss": 0.10563625395298004, "rewards/accuracies": 0.953125, "rewards/chosen": 34.59434509277344, "rewards/margins": 29.547805786132812, "rewards/rejected": 5.0340423583984375, "step": 2015 }, { "epoch": 1.0434782608695652, "grad_norm": 1.177187204360962, "learning_rate": 7.765088170350386e-06, "loss": 0.13177593052387238, "rewards/accuracies": 0.953125, "rewards/chosen": 37.0767822265625, "rewards/margins": 31.328018188476562, "rewards/rejected": 5.758354187011719, "step": 2016 }, { "epoch": 1.0439958592132506, "grad_norm": 0.8721403479576111, "learning_rate": 7.762710786734778e-06, "loss": 0.07245200872421265, "rewards/accuracies": 0.96875, "rewards/chosen": 37.682464599609375, "rewards/margins": 32.048583984375, "rewards/rejected": 5.6317291259765625, "step": 2017 }, { "epoch": 1.044513457556936, "grad_norm": 1.2285737991333008, "learning_rate": 7.76033250369875e-06, "loss": 0.1590084731578827, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.01673889160156, "rewards/margins": 31.90766143798828, "rewards/rejected": 5.114530563354492, "step": 2018 }, { "epoch": 1.045031055900621, "grad_norm": 2.881957769393921, "learning_rate": 7.757953322016572e-06, "loss": 0.18787594139575958, "rewards/accuracies": 0.8984375, "rewards/chosen": 40.42296600341797, "rewards/margins": 33.59016418457031, "rewards/rejected": 6.832340240478516, "step": 2019 }, { "epoch": 1.0455486542443064, "grad_norm": 1.2341963052749634, "learning_rate": 7.7555732424628e-06, "loss": 0.12956330180168152, "rewards/accuracies": 0.9453125, "rewards/chosen": 41.609134674072266, "rewards/margins": 33.49615478515625, "rewards/rejected": 8.10903549194336, "step": 2020 }, { "epoch": 1.0460662525879918, "grad_norm": 2.4806673526763916, "learning_rate": 7.753192265812288e-06, "loss": 0.19311487674713135, "rewards/accuracies": 0.9375, "rewards/chosen": 40.44535827636719, "rewards/margins": 33.651893615722656, "rewards/rejected": 6.800628662109375, "step": 2021 }, { "epoch": 1.046583850931677, "grad_norm": 1.816672444343567, "learning_rate": 7.750810392840182e-06, "loss": 0.12140960991382599, "rewards/accuracies": 0.921875, "rewards/chosen": 34.2285041809082, "rewards/margins": 29.150466918945312, "rewards/rejected": 5.075859069824219, "step": 2022 }, { "epoch": 1.0471014492753623, "grad_norm": 1.943167805671692, "learning_rate": 7.748427624321914e-06, "loss": 0.16989666223526, "rewards/accuracies": 0.953125, "rewards/chosen": 41.71260070800781, "rewards/margins": 34.66473388671875, "rewards/rejected": 7.042708396911621, "step": 2023 }, { "epoch": 1.0476190476190477, "grad_norm": 1.97036612033844, "learning_rate": 7.746043961033215e-06, "loss": 0.17051735520362854, "rewards/accuracies": 0.9375, "rewards/chosen": 38.83993148803711, "rewards/margins": 32.82806396484375, "rewards/rejected": 6.012351989746094, "step": 2024 }, { "epoch": 1.0481366459627328, "grad_norm": 2.2821264266967773, "learning_rate": 7.743659403750104e-06, "loss": 0.24091777205467224, "rewards/accuracies": 0.8359375, "rewards/chosen": 35.946197509765625, "rewards/margins": 30.17212677001953, "rewards/rejected": 5.773092269897461, "step": 2025 }, { "epoch": 1.0486542443064182, "grad_norm": 2.8068952560424805, "learning_rate": 7.741273953248886e-06, "loss": 0.17990219593048096, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.775367736816406, "rewards/margins": 32.47498321533203, "rewards/rejected": 6.2933807373046875, "step": 2026 }, { "epoch": 1.0491718426501035, "grad_norm": 1.2285408973693848, "learning_rate": 7.738887610306167e-06, "loss": 0.12099502980709076, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.619483947753906, "rewards/margins": 32.1800537109375, "rewards/rejected": 6.437168121337891, "step": 2027 }, { "epoch": 1.049689440993789, "grad_norm": 2.3140037059783936, "learning_rate": 7.736500375698836e-06, "loss": 0.19610166549682617, "rewards/accuracies": 0.8828125, "rewards/chosen": 30.165523529052734, "rewards/margins": 24.925689697265625, "rewards/rejected": 5.233120918273926, "step": 2028 }, { "epoch": 1.050207039337474, "grad_norm": 1.3528456687927246, "learning_rate": 7.734112250204073e-06, "loss": 0.15016256272792816, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.45866394042969, "rewards/margins": 31.558074951171875, "rewards/rejected": 6.896093368530273, "step": 2029 }, { "epoch": 1.0507246376811594, "grad_norm": 0.8532382845878601, "learning_rate": 7.731723234599351e-06, "loss": 0.12624216079711914, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.762298583984375, "rewards/margins": 30.633163452148438, "rewards/rejected": 5.1332244873046875, "step": 2030 }, { "epoch": 1.0512422360248448, "grad_norm": 1.5464144945144653, "learning_rate": 7.729333329662432e-06, "loss": 0.15248309075832367, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.60894775390625, "rewards/margins": 30.139556884765625, "rewards/rejected": 6.466775894165039, "step": 2031 }, { "epoch": 1.05175983436853, "grad_norm": 1.1637729406356812, "learning_rate": 7.726942536171366e-06, "loss": 0.1029859185218811, "rewards/accuracies": 0.9609375, "rewards/chosen": 40.9222297668457, "rewards/margins": 34.23771667480469, "rewards/rejected": 6.6790571212768555, "step": 2032 }, { "epoch": 1.0522774327122153, "grad_norm": 1.1319353580474854, "learning_rate": 7.724550854904491e-06, "loss": 0.10555292665958405, "rewards/accuracies": 0.9609375, "rewards/chosen": 41.32029724121094, "rewards/margins": 33.7039794921875, "rewards/rejected": 7.616889953613281, "step": 2033 }, { "epoch": 1.0527950310559007, "grad_norm": 0.8992871642112732, "learning_rate": 7.722158286640439e-06, "loss": 0.11949899792671204, "rewards/accuracies": 0.9453125, "rewards/chosen": 41.692108154296875, "rewards/margins": 33.99003982543945, "rewards/rejected": 7.702998161315918, "step": 2034 }, { "epoch": 1.0533126293995858, "grad_norm": 1.8189704418182373, "learning_rate": 7.719764832158129e-06, "loss": 0.08344461023807526, "rewards/accuracies": 0.96875, "rewards/chosen": 43.67139434814453, "rewards/margins": 34.40577697753906, "rewards/rejected": 9.276039123535156, "step": 2035 }, { "epoch": 1.0538302277432712, "grad_norm": 1.2571231126785278, "learning_rate": 7.717370492236763e-06, "loss": 0.1514228880405426, "rewards/accuracies": 0.921875, "rewards/chosen": 39.4235725402832, "rewards/margins": 30.383392333984375, "rewards/rejected": 9.040416717529297, "step": 2036 }, { "epoch": 1.0543478260869565, "grad_norm": 1.268614411354065, "learning_rate": 7.714975267655838e-06, "loss": 0.094866544008255, "rewards/accuracies": 0.953125, "rewards/chosen": 41.950111389160156, "rewards/margins": 35.002197265625, "rewards/rejected": 6.958156585693359, "step": 2037 }, { "epoch": 1.054865424430642, "grad_norm": 1.093273401260376, "learning_rate": 7.712579159195137e-06, "loss": 0.0963093712925911, "rewards/accuracies": 0.9765625, "rewards/chosen": 43.69169616699219, "rewards/margins": 36.25679016113281, "rewards/rejected": 7.431463241577148, "step": 2038 }, { "epoch": 1.055383022774327, "grad_norm": 1.3218095302581787, "learning_rate": 7.71018216763473e-06, "loss": 0.1646643579006195, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.9207649230957, "rewards/margins": 27.0364990234375, "rewards/rejected": 7.877947807312012, "step": 2039 }, { "epoch": 1.0559006211180124, "grad_norm": 1.1897449493408203, "learning_rate": 7.707784293754976e-06, "loss": 0.11189870536327362, "rewards/accuracies": 0.9375, "rewards/chosen": 46.761940002441406, "rewards/margins": 36.25563049316406, "rewards/rejected": 10.501834869384766, "step": 2040 }, { "epoch": 1.0564182194616978, "grad_norm": 3.1701793670654297, "learning_rate": 7.705385538336516e-06, "loss": 0.2404346466064453, "rewards/accuracies": 0.921875, "rewards/chosen": 34.516334533691406, "rewards/margins": 27.510238647460938, "rewards/rejected": 7.010972499847412, "step": 2041 }, { "epoch": 1.056935817805383, "grad_norm": 1.2352231740951538, "learning_rate": 7.702985902160287e-06, "loss": 0.1392030417919159, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.02599334716797, "rewards/margins": 29.9005126953125, "rewards/rejected": 8.117238998413086, "step": 2042 }, { "epoch": 1.0574534161490683, "grad_norm": 0.9284889698028564, "learning_rate": 7.700585386007504e-06, "loss": 0.10117113590240479, "rewards/accuracies": 0.9375, "rewards/chosen": 47.798484802246094, "rewards/margins": 37.435516357421875, "rewards/rejected": 10.356369018554688, "step": 2043 }, { "epoch": 1.0579710144927537, "grad_norm": 1.3428370952606201, "learning_rate": 7.698183990659678e-06, "loss": 0.12554919719696045, "rewards/accuracies": 0.9453125, "rewards/chosen": 46.12501907348633, "rewards/margins": 38.417205810546875, "rewards/rejected": 7.708489894866943, "step": 2044 }, { "epoch": 1.058488612836439, "grad_norm": 1.5988856554031372, "learning_rate": 7.695781716898593e-06, "loss": 0.09228342026472092, "rewards/accuracies": 0.953125, "rewards/chosen": 51.40666961669922, "rewards/margins": 41.81524658203125, "rewards/rejected": 9.592164993286133, "step": 2045 }, { "epoch": 1.0590062111801242, "grad_norm": 1.2867876291275024, "learning_rate": 7.693378565506333e-06, "loss": 0.06578332930803299, "rewards/accuracies": 0.984375, "rewards/chosen": 50.804473876953125, "rewards/margins": 40.544708251953125, "rewards/rejected": 10.271278381347656, "step": 2046 }, { "epoch": 1.0595238095238095, "grad_norm": 0.8709829449653625, "learning_rate": 7.690974537265257e-06, "loss": 0.08088371157646179, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.176788330078125, "rewards/margins": 39.75041198730469, "rewards/rejected": 7.424873352050781, "step": 2047 }, { "epoch": 1.060041407867495, "grad_norm": 2.355947971343994, "learning_rate": 7.688569632958016e-06, "loss": 0.18164904415607452, "rewards/accuracies": 0.875, "rewards/chosen": 46.16814422607422, "rewards/margins": 38.34400177001953, "rewards/rejected": 7.827990531921387, "step": 2048 }, { "epoch": 1.06055900621118, "grad_norm": 1.6628321409225464, "learning_rate": 7.686163853367544e-06, "loss": 0.11891795694828033, "rewards/accuracies": 0.953125, "rewards/chosen": 43.7316780090332, "rewards/margins": 36.71430969238281, "rewards/rejected": 7.038509368896484, "step": 2049 }, { "epoch": 1.0610766045548654, "grad_norm": 1.958193302154541, "learning_rate": 7.683757199277057e-06, "loss": 0.20606963336467743, "rewards/accuracies": 0.8984375, "rewards/chosen": 44.73274230957031, "rewards/margins": 37.49346923828125, "rewards/rejected": 7.2259416580200195, "step": 2050 }, { "epoch": 1.0615942028985508, "grad_norm": 1.9330542087554932, "learning_rate": 7.681349671470063e-06, "loss": 0.14124321937561035, "rewards/accuracies": 0.953125, "rewards/chosen": 45.109153747558594, "rewards/margins": 39.108184814453125, "rewards/rejected": 6.002403259277344, "step": 2051 }, { "epoch": 1.062111801242236, "grad_norm": 0.9051594734191895, "learning_rate": 7.678941270730347e-06, "loss": 0.06745615601539612, "rewards/accuracies": 0.984375, "rewards/chosen": 48.90052795410156, "rewards/margins": 41.349578857421875, "rewards/rejected": 7.531776428222656, "step": 2052 }, { "epoch": 1.0626293995859213, "grad_norm": 1.3432663679122925, "learning_rate": 7.676531997841983e-06, "loss": 0.10290545970201492, "rewards/accuracies": 0.953125, "rewards/chosen": 49.28191375732422, "rewards/margins": 43.109107971191406, "rewards/rejected": 6.158510208129883, "step": 2053 }, { "epoch": 1.0631469979296067, "grad_norm": 1.4237017631530762, "learning_rate": 7.674121853589325e-06, "loss": 0.09933291375637054, "rewards/accuracies": 0.96875, "rewards/chosen": 43.93733596801758, "rewards/margins": 38.41082763671875, "rewards/rejected": 5.5220794677734375, "step": 2054 }, { "epoch": 1.063664596273292, "grad_norm": 2.058044195175171, "learning_rate": 7.671710838757014e-06, "loss": 0.17869862914085388, "rewards/accuracies": 0.9609375, "rewards/chosen": 39.63059997558594, "rewards/margins": 35.06322479248047, "rewards/rejected": 4.577017307281494, "step": 2055 }, { "epoch": 1.0641821946169772, "grad_norm": 3.6183786392211914, "learning_rate": 7.669298954129973e-06, "loss": 0.21187934279441833, "rewards/accuracies": 0.8984375, "rewards/chosen": 44.85777282714844, "rewards/margins": 39.208702087402344, "rewards/rejected": 5.647914886474609, "step": 2056 }, { "epoch": 1.0646997929606625, "grad_norm": 2.2856338024139404, "learning_rate": 7.666886200493411e-06, "loss": 0.12636947631835938, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.77568435668945, "rewards/margins": 41.38563537597656, "rewards/rejected": 4.394901275634766, "step": 2057 }, { "epoch": 1.065217391304348, "grad_norm": 3.04113507270813, "learning_rate": 7.664472578632814e-06, "loss": 0.18488503992557526, "rewards/accuracies": 0.9375, "rewards/chosen": 50.682777404785156, "rewards/margins": 45.752777099609375, "rewards/rejected": 4.916679382324219, "step": 2058 }, { "epoch": 1.065734989648033, "grad_norm": 1.9285001754760742, "learning_rate": 7.662058089333954e-06, "loss": 0.17374087870121002, "rewards/accuracies": 0.9375, "rewards/chosen": 41.937259674072266, "rewards/margins": 38.68040466308594, "rewards/rejected": 3.2528610229492188, "step": 2059 }, { "epoch": 1.0662525879917184, "grad_norm": 1.5720694065093994, "learning_rate": 7.659642733382888e-06, "loss": 0.08772304654121399, "rewards/accuracies": 0.953125, "rewards/chosen": 48.53448486328125, "rewards/margins": 45.300514221191406, "rewards/rejected": 3.2248878479003906, "step": 2060 }, { "epoch": 1.0667701863354038, "grad_norm": 2.7615158557891846, "learning_rate": 7.65722651156595e-06, "loss": 0.18125978112220764, "rewards/accuracies": 0.9375, "rewards/chosen": 45.00503158569336, "rewards/margins": 40.52613830566406, "rewards/rejected": 4.475793838500977, "step": 2061 }, { "epoch": 1.0672877846790891, "grad_norm": 1.1719094514846802, "learning_rate": 7.65480942466976e-06, "loss": 0.08110867440700531, "rewards/accuracies": 0.984375, "rewards/chosen": 47.89885711669922, "rewards/margins": 44.373138427734375, "rewards/rejected": 3.5232772827148438, "step": 2062 }, { "epoch": 1.0678053830227743, "grad_norm": 1.2145267724990845, "learning_rate": 7.652391473481218e-06, "loss": 0.08856813609600067, "rewards/accuracies": 0.96875, "rewards/chosen": 39.783729553222656, "rewards/margins": 38.4246826171875, "rewards/rejected": 1.3573989868164062, "step": 2063 }, { "epoch": 1.0683229813664596, "grad_norm": 1.534739375114441, "learning_rate": 7.649972658787503e-06, "loss": 0.14579851925373077, "rewards/accuracies": 0.9375, "rewards/chosen": 37.87336730957031, "rewards/margins": 34.543182373046875, "rewards/rejected": 3.3226547241210938, "step": 2064 }, { "epoch": 1.068840579710145, "grad_norm": 1.190064549446106, "learning_rate": 7.64755298137608e-06, "loss": 0.07757404446601868, "rewards/accuracies": 0.96875, "rewards/chosen": 46.737274169921875, "rewards/margins": 43.996192932128906, "rewards/rejected": 2.749073028564453, "step": 2065 }, { "epoch": 1.0693581780538302, "grad_norm": 2.156235933303833, "learning_rate": 7.645132442034696e-06, "loss": 0.2244364470243454, "rewards/accuracies": 0.921875, "rewards/chosen": 42.55158233642578, "rewards/margins": 39.66210174560547, "rewards/rejected": 2.8880460262298584, "step": 2066 }, { "epoch": 1.0698757763975155, "grad_norm": 1.0225412845611572, "learning_rate": 7.642711041551366e-06, "loss": 0.06678953766822815, "rewards/accuracies": 0.96875, "rewards/chosen": 44.706390380859375, "rewards/margins": 42.528656005859375, "rewards/rejected": 2.1661624908447266, "step": 2067 }, { "epoch": 1.0703933747412009, "grad_norm": 1.4861334562301636, "learning_rate": 7.640288780714402e-06, "loss": 0.11706286668777466, "rewards/accuracies": 0.953125, "rewards/chosen": 36.722373962402344, "rewards/margins": 35.66600036621094, "rewards/rejected": 1.0627174377441406, "step": 2068 }, { "epoch": 1.070910973084886, "grad_norm": 1.8650741577148438, "learning_rate": 7.637865660312385e-06, "loss": 0.1557174026966095, "rewards/accuracies": 0.953125, "rewards/chosen": 28.852060317993164, "rewards/margins": 27.480499267578125, "rewards/rejected": 1.3834247589111328, "step": 2069 }, { "epoch": 1.0714285714285714, "grad_norm": 1.6696606874465942, "learning_rate": 7.63544168113418e-06, "loss": 0.18668928742408752, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.37013244628906, "rewards/margins": 36.5223388671875, "rewards/rejected": 1.8584136962890625, "step": 2070 }, { "epoch": 1.0719461697722568, "grad_norm": 2.1919422149658203, "learning_rate": 7.633016843968933e-06, "loss": 0.15126410126686096, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.787532806396484, "rewards/margins": 37.384429931640625, "rewards/rejected": 3.3989830017089844, "step": 2071 }, { "epoch": 1.0724637681159421, "grad_norm": 1.4984670877456665, "learning_rate": 7.630591149606064e-06, "loss": 0.1214132308959961, "rewards/accuracies": 0.9453125, "rewards/chosen": 46.390174865722656, "rewards/margins": 45.17396545410156, "rewards/rejected": 1.2254562377929688, "step": 2072 }, { "epoch": 1.0729813664596273, "grad_norm": 1.80051851272583, "learning_rate": 7.628164598835276e-06, "loss": 0.16691218316555023, "rewards/accuracies": 0.90625, "rewards/chosen": 40.340728759765625, "rewards/margins": 38.34049987792969, "rewards/rejected": 1.9960222244262695, "step": 2073 }, { "epoch": 1.0734989648033126, "grad_norm": 1.3177765607833862, "learning_rate": 7.6257371924465515e-06, "loss": 0.08265416324138641, "rewards/accuracies": 0.96875, "rewards/chosen": 40.62372970581055, "rewards/margins": 39.02972412109375, "rewards/rejected": 1.5872774124145508, "step": 2074 }, { "epoch": 1.074016563146998, "grad_norm": 2.5924360752105713, "learning_rate": 7.623308931230149e-06, "loss": 0.1217958927154541, "rewards/accuracies": 0.96875, "rewards/chosen": 35.496002197265625, "rewards/margins": 34.52961730957031, "rewards/rejected": 0.9671001434326172, "step": 2075 }, { "epoch": 1.0745341614906831, "grad_norm": 1.986242651939392, "learning_rate": 7.620879815976608e-06, "loss": 0.17240887880325317, "rewards/accuracies": 0.90625, "rewards/chosen": 31.96607208251953, "rewards/margins": 31.00751495361328, "rewards/rejected": 0.9508476257324219, "step": 2076 }, { "epoch": 1.0750517598343685, "grad_norm": 2.048501491546631, "learning_rate": 7.618449847476742e-06, "loss": 0.07187631726264954, "rewards/accuracies": 0.96875, "rewards/chosen": 41.0743293762207, "rewards/margins": 39.3756103515625, "rewards/rejected": 1.6954612731933594, "step": 2077 }, { "epoch": 1.0755693581780539, "grad_norm": 2.06811785697937, "learning_rate": 7.616019026521648e-06, "loss": 0.11297063529491425, "rewards/accuracies": 0.9609375, "rewards/chosen": 39.91858673095703, "rewards/margins": 38.137855529785156, "rewards/rejected": 1.7824516296386719, "step": 2078 }, { "epoch": 1.0760869565217392, "grad_norm": 1.9065765142440796, "learning_rate": 7.613587353902693e-06, "loss": 0.20972251892089844, "rewards/accuracies": 0.9375, "rewards/chosen": 40.34297180175781, "rewards/margins": 38.71913146972656, "rewards/rejected": 1.6325693130493164, "step": 2079 }, { "epoch": 1.0766045548654244, "grad_norm": 2.608969211578369, "learning_rate": 7.61115483041153e-06, "loss": 0.12894399464130402, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.89161682128906, "rewards/margins": 36.92939758300781, "rewards/rejected": 1.9605903625488281, "step": 2080 }, { "epoch": 1.0771221532091098, "grad_norm": 2.5392022132873535, "learning_rate": 7.608721456840081e-06, "loss": 0.09860500693321228, "rewards/accuracies": 0.96875, "rewards/chosen": 35.274208068847656, "rewards/margins": 34.139739990234375, "rewards/rejected": 1.1487007141113281, "step": 2081 }, { "epoch": 1.0776397515527951, "grad_norm": 1.723080039024353, "learning_rate": 7.606287233980552e-06, "loss": 0.09590695798397064, "rewards/accuracies": 0.96875, "rewards/chosen": 41.191925048828125, "rewards/margins": 39.8406982421875, "rewards/rejected": 1.3554458618164062, "step": 2082 }, { "epoch": 1.0781573498964803, "grad_norm": 1.4429349899291992, "learning_rate": 7.603852162625417e-06, "loss": 0.1330755650997162, "rewards/accuracies": 0.96875, "rewards/chosen": 29.695417404174805, "rewards/margins": 28.884521484375, "rewards/rejected": 0.8105297088623047, "step": 2083 }, { "epoch": 1.0786749482401656, "grad_norm": 1.4466891288757324, "learning_rate": 7.6014162435674345e-06, "loss": 0.11311367899179459, "rewards/accuracies": 0.953125, "rewards/chosen": 38.41111755371094, "rewards/margins": 36.98626708984375, "rewards/rejected": 1.4273605346679688, "step": 2084 }, { "epoch": 1.079192546583851, "grad_norm": 2.507530689239502, "learning_rate": 7.598979477599635e-06, "loss": 0.140043243765831, "rewards/accuracies": 0.953125, "rewards/chosen": 39.53355407714844, "rewards/margins": 38.233856201171875, "rewards/rejected": 1.3019866943359375, "step": 2085 }, { "epoch": 1.0797101449275361, "grad_norm": 1.6496305465698242, "learning_rate": 7.596541865515325e-06, "loss": 0.15230543911457062, "rewards/accuracies": 0.921875, "rewards/chosen": 35.915565490722656, "rewards/margins": 34.83457946777344, "rewards/rejected": 1.0827980041503906, "step": 2086 }, { "epoch": 1.0802277432712215, "grad_norm": 1.2743405103683472, "learning_rate": 7.594103408108087e-06, "loss": 0.12358415871858597, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.35688018798828, "rewards/margins": 35.46966552734375, "rewards/rejected": 1.8960998058319092, "step": 2087 }, { "epoch": 1.0807453416149069, "grad_norm": 1.2965563535690308, "learning_rate": 7.591664106171776e-06, "loss": 0.1689475178718567, "rewards/accuracies": 0.9375, "rewards/chosen": 36.075469970703125, "rewards/margins": 34.33026123046875, "rewards/rejected": 1.7379703521728516, "step": 2088 }, { "epoch": 1.081262939958592, "grad_norm": 1.2890409231185913, "learning_rate": 7.58922396050053e-06, "loss": 0.14684076607227325, "rewards/accuracies": 0.953125, "rewards/chosen": 39.6530647277832, "rewards/margins": 37.35893249511719, "rewards/rejected": 2.2997846603393555, "step": 2089 }, { "epoch": 1.0817805383022774, "grad_norm": 1.2734203338623047, "learning_rate": 7.5867829718887486e-06, "loss": 0.16012360155582428, "rewards/accuracies": 0.9375, "rewards/chosen": 36.70035171508789, "rewards/margins": 35.195648193359375, "rewards/rejected": 1.4947280883789062, "step": 2090 }, { "epoch": 1.0822981366459627, "grad_norm": 1.2184797525405884, "learning_rate": 7.5843411411311194e-06, "loss": 0.12612000107765198, "rewards/accuracies": 0.96875, "rewards/chosen": 33.058231353759766, "rewards/margins": 31.837421417236328, "rewards/rejected": 1.2205729484558105, "step": 2091 }, { "epoch": 1.0828157349896481, "grad_norm": 1.3712358474731445, "learning_rate": 7.581898469022595e-06, "loss": 0.14064481854438782, "rewards/accuracies": 0.9375, "rewards/chosen": 39.89329528808594, "rewards/margins": 38.743255615234375, "rewards/rejected": 1.1491775512695312, "step": 2092 }, { "epoch": 1.0833333333333333, "grad_norm": 0.8400251269340515, "learning_rate": 7.5794549563584064e-06, "loss": 0.11663377285003662, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.088314056396484, "rewards/margins": 36.069671630859375, "rewards/rejected": 1.0153484344482422, "step": 2093 }, { "epoch": 1.0838509316770186, "grad_norm": 0.7274706363677979, "learning_rate": 7.577010603934054e-06, "loss": 0.10432086884975433, "rewards/accuracies": 0.9609375, "rewards/chosen": 39.091949462890625, "rewards/margins": 37.64134216308594, "rewards/rejected": 1.4480857849121094, "step": 2094 }, { "epoch": 1.084368530020704, "grad_norm": 1.1631169319152832, "learning_rate": 7.574565412545318e-06, "loss": 0.14404743909835815, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.987998962402344, "rewards/margins": 35.50153350830078, "rewards/rejected": 1.4776086807250977, "step": 2095 }, { "epoch": 1.0848861283643891, "grad_norm": 1.1358011960983276, "learning_rate": 7.572119382988245e-06, "loss": 0.1043848991394043, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.42571258544922, "rewards/margins": 39.10584259033203, "rewards/rejected": 1.3248062133789062, "step": 2096 }, { "epoch": 1.0854037267080745, "grad_norm": 1.3847335577011108, "learning_rate": 7.569672516059161e-06, "loss": 0.15497469902038574, "rewards/accuracies": 0.8984375, "rewards/chosen": 42.18611526489258, "rewards/margins": 41.225563049316406, "rewards/rejected": 0.9660742282867432, "step": 2097 }, { "epoch": 1.0859213250517599, "grad_norm": 1.1683458089828491, "learning_rate": 7.567224812554657e-06, "loss": 0.1508132815361023, "rewards/accuracies": 0.9375, "rewards/chosen": 41.30629348754883, "rewards/margins": 39.63209533691406, "rewards/rejected": 1.6715545654296875, "step": 2098 }, { "epoch": 1.0864389233954452, "grad_norm": 1.498564600944519, "learning_rate": 7.5647762732716034e-06, "loss": 0.13600096106529236, "rewards/accuracies": 0.9609375, "rewards/chosen": 40.87253189086914, "rewards/margins": 39.046234130859375, "rewards/rejected": 1.8371658325195312, "step": 2099 }, { "epoch": 1.0869565217391304, "grad_norm": 1.006980061531067, "learning_rate": 7.562326899007139e-06, "loss": 0.148942232131958, "rewards/accuracies": 0.9375, "rewards/chosen": 45.10934066772461, "rewards/margins": 43.225006103515625, "rewards/rejected": 1.866048812866211, "step": 2100 }, { "epoch": 1.0874741200828157, "grad_norm": 1.171656847000122, "learning_rate": 7.559876690558675e-06, "loss": 0.11885645240545273, "rewards/accuracies": 0.9375, "rewards/chosen": 45.57023620605469, "rewards/margins": 43.59757995605469, "rewards/rejected": 2.0014877319335938, "step": 2101 }, { "epoch": 1.087991718426501, "grad_norm": 1.6669034957885742, "learning_rate": 7.557425648723895e-06, "loss": 0.17342382669448853, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.91239929199219, "rewards/margins": 35.934844970703125, "rewards/rejected": 0.975255012512207, "step": 2102 }, { "epoch": 1.0885093167701863, "grad_norm": 1.7307021617889404, "learning_rate": 7.554973774300753e-06, "loss": 0.17231489717960358, "rewards/accuracies": 0.90625, "rewards/chosen": 36.92766571044922, "rewards/margins": 35.700218200683594, "rewards/rejected": 1.2345995903015137, "step": 2103 }, { "epoch": 1.0890269151138716, "grad_norm": 1.2018307447433472, "learning_rate": 7.552521068087475e-06, "loss": 0.0967528373003006, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.798343658447266, "rewards/margins": 46.06120300292969, "rewards/rejected": 2.7341156005859375, "step": 2104 }, { "epoch": 1.089544513457557, "grad_norm": 5.220870018005371, "learning_rate": 7.5500675308825555e-06, "loss": 0.16426700353622437, "rewards/accuracies": 0.9375, "rewards/chosen": 38.42287063598633, "rewards/margins": 36.955726623535156, "rewards/rejected": 1.465932846069336, "step": 2105 }, { "epoch": 1.0900621118012421, "grad_norm": 1.0308797359466553, "learning_rate": 7.547613163484764e-06, "loss": 0.09376212954521179, "rewards/accuracies": 0.96875, "rewards/chosen": 43.2128791809082, "rewards/margins": 41.560791015625, "rewards/rejected": 1.6545867919921875, "step": 2106 }, { "epoch": 1.0905797101449275, "grad_norm": 1.6712729930877686, "learning_rate": 7.545157966693134e-06, "loss": 0.10797114670276642, "rewards/accuracies": 0.96875, "rewards/chosen": 39.96685791015625, "rewards/margins": 37.863372802734375, "rewards/rejected": 2.098745346069336, "step": 2107 }, { "epoch": 1.0910973084886129, "grad_norm": 1.403336763381958, "learning_rate": 7.542701941306976e-06, "loss": 0.14847832918167114, "rewards/accuracies": 0.9375, "rewards/chosen": 43.25135803222656, "rewards/margins": 41.964111328125, "rewards/rejected": 1.295013427734375, "step": 2108 }, { "epoch": 1.0916149068322982, "grad_norm": 2.403282642364502, "learning_rate": 7.540245088125865e-06, "loss": 0.16238975524902344, "rewards/accuracies": 0.921875, "rewards/chosen": 42.70929718017578, "rewards/margins": 40.29328918457031, "rewards/rejected": 2.417112350463867, "step": 2109 }, { "epoch": 1.0921325051759834, "grad_norm": 1.7494852542877197, "learning_rate": 7.537787407949647e-06, "loss": 0.1770133674144745, "rewards/accuracies": 0.921875, "rewards/chosen": 40.177433013916016, "rewards/margins": 38.149505615234375, "rewards/rejected": 2.019904136657715, "step": 2110 }, { "epoch": 1.0926501035196687, "grad_norm": 1.1404838562011719, "learning_rate": 7.535328901578438e-06, "loss": 0.13762083649635315, "rewards/accuracies": 0.9375, "rewards/chosen": 38.62038040161133, "rewards/margins": 36.945919036865234, "rewards/rejected": 1.6614570617675781, "step": 2111 }, { "epoch": 1.093167701863354, "grad_norm": 2.013633966445923, "learning_rate": 7.532869569812622e-06, "loss": 0.21443134546279907, "rewards/accuracies": 0.890625, "rewards/chosen": 31.710346221923828, "rewards/margins": 30.394081115722656, "rewards/rejected": 1.3196182250976562, "step": 2112 }, { "epoch": 1.0936853002070392, "grad_norm": 1.6068565845489502, "learning_rate": 7.530409413452853e-06, "loss": 0.1214451715350151, "rewards/accuracies": 0.953125, "rewards/chosen": 39.28022003173828, "rewards/margins": 37.7847900390625, "rewards/rejected": 1.4919185638427734, "step": 2113 }, { "epoch": 1.0942028985507246, "grad_norm": 0.9631279110908508, "learning_rate": 7.527948433300053e-06, "loss": 0.10409009456634521, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.669769287109375, "rewards/margins": 33.601898193359375, "rewards/rejected": 2.071955680847168, "step": 2114 }, { "epoch": 1.09472049689441, "grad_norm": 1.0927571058273315, "learning_rate": 7.5254866301554096e-06, "loss": 0.08358526229858398, "rewards/accuracies": 0.9765625, "rewards/chosen": 42.46328353881836, "rewards/margins": 39.54154586791992, "rewards/rejected": 2.910490036010742, "step": 2115 }, { "epoch": 1.0952380952380953, "grad_norm": 2.2726073265075684, "learning_rate": 7.523024004820384e-06, "loss": 0.16282036900520325, "rewards/accuracies": 0.90625, "rewards/chosen": 28.967164993286133, "rewards/margins": 27.2913818359375, "rewards/rejected": 1.6764793395996094, "step": 2116 }, { "epoch": 1.0957556935817805, "grad_norm": 1.186522364616394, "learning_rate": 7.520560558096699e-06, "loss": 0.11237788200378418, "rewards/accuracies": 0.9375, "rewards/chosen": 37.351158142089844, "rewards/margins": 34.56898498535156, "rewards/rejected": 2.7759339809417725, "step": 2117 }, { "epoch": 1.0962732919254659, "grad_norm": 1.138319969177246, "learning_rate": 7.518096290786347e-06, "loss": 0.12996438145637512, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.490203857421875, "rewards/margins": 38.20384216308594, "rewards/rejected": 2.2853622436523438, "step": 2118 }, { "epoch": 1.0967908902691512, "grad_norm": 1.477035641670227, "learning_rate": 7.515631203691591e-06, "loss": 0.10518532246351242, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.381587982177734, "rewards/margins": 34.17625427246094, "rewards/rejected": 2.2001609802246094, "step": 2119 }, { "epoch": 1.0973084886128364, "grad_norm": 1.4940639734268188, "learning_rate": 7.513165297614955e-06, "loss": 0.11008715629577637, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.13323211669922, "rewards/margins": 34.073944091796875, "rewards/rejected": 2.0652542114257812, "step": 2120 }, { "epoch": 1.0978260869565217, "grad_norm": 2.0682971477508545, "learning_rate": 7.510698573359234e-06, "loss": 0.1877654790878296, "rewards/accuracies": 0.921875, "rewards/chosen": 36.061668395996094, "rewards/margins": 33.60401916503906, "rewards/rejected": 2.4549713134765625, "step": 2121 }, { "epoch": 1.098343685300207, "grad_norm": 1.3507988452911377, "learning_rate": 7.508231031727486e-06, "loss": 0.10174442827701569, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.47276306152344, "rewards/margins": 30.753707885742188, "rewards/rejected": 2.7164154052734375, "step": 2122 }, { "epoch": 1.0988612836438922, "grad_norm": 1.6284488439559937, "learning_rate": 7.50576267352304e-06, "loss": 0.16678740084171295, "rewards/accuracies": 0.9375, "rewards/chosen": 35.2784423828125, "rewards/margins": 31.366912841796875, "rewards/rejected": 3.9040603637695312, "step": 2123 }, { "epoch": 1.0993788819875776, "grad_norm": 2.2531192302703857, "learning_rate": 7.503293499549486e-06, "loss": 0.13497893512248993, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.83092498779297, "rewards/margins": 36.524322509765625, "rewards/rejected": 2.2972865104675293, "step": 2124 }, { "epoch": 1.099896480331263, "grad_norm": 1.558349847793579, "learning_rate": 7.500823510610682e-06, "loss": 0.09627965837717056, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.66140365600586, "rewards/margins": 32.205291748046875, "rewards/rejected": 1.4675521850585938, "step": 2125 }, { "epoch": 1.1004140786749483, "grad_norm": 1.3714120388031006, "learning_rate": 7.498352707510749e-06, "loss": 0.11755630373954773, "rewards/accuracies": 0.953125, "rewards/chosen": 34.873043060302734, "rewards/margins": 31.615402221679688, "rewards/rejected": 3.251208543777466, "step": 2126 }, { "epoch": 1.1009316770186335, "grad_norm": 1.7722989320755005, "learning_rate": 7.495881091054077e-06, "loss": 0.11090719699859619, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.97271728515625, "rewards/margins": 30.624603271484375, "rewards/rejected": 3.3504638671875, "step": 2127 }, { "epoch": 1.1014492753623188, "grad_norm": 2.1803767681121826, "learning_rate": 7.4934086620453175e-06, "loss": 0.1355745941400528, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.567359924316406, "rewards/margins": 33.39446258544922, "rewards/rejected": 3.1808290481567383, "step": 2128 }, { "epoch": 1.1019668737060042, "grad_norm": 1.7172939777374268, "learning_rate": 7.490935421289387e-06, "loss": 0.1937662810087204, "rewards/accuracies": 0.921875, "rewards/chosen": 35.10344696044922, "rewards/margins": 32.4915771484375, "rewards/rejected": 2.616058349609375, "step": 2129 }, { "epoch": 1.1024844720496894, "grad_norm": 1.2509379386901855, "learning_rate": 7.488461369591469e-06, "loss": 0.11263421177864075, "rewards/accuracies": 0.953125, "rewards/chosen": 34.929954528808594, "rewards/margins": 31.392044067382812, "rewards/rejected": 3.5404224395751953, "step": 2130 }, { "epoch": 1.1030020703933747, "grad_norm": 1.564024567604065, "learning_rate": 7.485986507757006e-06, "loss": 0.142283096909523, "rewards/accuracies": 0.90625, "rewards/chosen": 35.28633117675781, "rewards/margins": 32.18026351928711, "rewards/rejected": 3.103311538696289, "step": 2131 }, { "epoch": 1.10351966873706, "grad_norm": 1.1136995553970337, "learning_rate": 7.48351083659171e-06, "loss": 0.14440220594406128, "rewards/accuracies": 0.90625, "rewards/chosen": 37.59846115112305, "rewards/margins": 33.68736267089844, "rewards/rejected": 3.904062271118164, "step": 2132 }, { "epoch": 1.1040372670807455, "grad_norm": 1.4975225925445557, "learning_rate": 7.481034356901549e-06, "loss": 0.14716485142707825, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.566619873046875, "rewards/margins": 33.19800567626953, "rewards/rejected": 3.3754138946533203, "step": 2133 }, { "epoch": 1.1045548654244306, "grad_norm": 2.7689762115478516, "learning_rate": 7.4785570694927655e-06, "loss": 0.198167622089386, "rewards/accuracies": 0.9375, "rewards/chosen": 38.2876091003418, "rewards/margins": 35.17231750488281, "rewards/rejected": 3.1203689575195312, "step": 2134 }, { "epoch": 1.105072463768116, "grad_norm": 0.9570075273513794, "learning_rate": 7.4760789751718545e-06, "loss": 0.11152065545320511, "rewards/accuracies": 0.9375, "rewards/chosen": 39.32783508300781, "rewards/margins": 36.96253967285156, "rewards/rejected": 2.366305351257324, "step": 2135 }, { "epoch": 1.1055900621118013, "grad_norm": 1.0066182613372803, "learning_rate": 7.473600074745576e-06, "loss": 0.0923597663640976, "rewards/accuracies": 0.96875, "rewards/chosen": 35.09521484375, "rewards/margins": 32.76335144042969, "rewards/rejected": 2.328054428100586, "step": 2136 }, { "epoch": 1.1061076604554865, "grad_norm": 0.7293800711631775, "learning_rate": 7.4711203690209565e-06, "loss": 0.1342577040195465, "rewards/accuracies": 0.921875, "rewards/chosen": 37.62472915649414, "rewards/margins": 34.47381591796875, "rewards/rejected": 3.1418914794921875, "step": 2137 }, { "epoch": 1.1066252587991718, "grad_norm": 0.759143590927124, "learning_rate": 7.468639858805281e-06, "loss": 0.08791051805019379, "rewards/accuracies": 0.9765625, "rewards/chosen": 37.52008056640625, "rewards/margins": 35.757110595703125, "rewards/rejected": 1.767204761505127, "step": 2138 }, { "epoch": 1.1071428571428572, "grad_norm": 0.7487903833389282, "learning_rate": 7.466158544906098e-06, "loss": 0.11208490282297134, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.6461296081543, "rewards/margins": 35.364776611328125, "rewards/rejected": 3.2745018005371094, "step": 2139 }, { "epoch": 1.1076604554865424, "grad_norm": 1.6687568426132202, "learning_rate": 7.463676428131217e-06, "loss": 0.1225588470697403, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.27576446533203, "rewards/margins": 32.24794006347656, "rewards/rejected": 3.027761459350586, "step": 2140 }, { "epoch": 1.1081780538302277, "grad_norm": 1.5055068731307983, "learning_rate": 7.4611935092887085e-06, "loss": 0.2098010778427124, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.219581604003906, "rewards/margins": 29.885757446289062, "rewards/rejected": 2.3311078548431396, "step": 2141 }, { "epoch": 1.108695652173913, "grad_norm": 2.7797012329101562, "learning_rate": 7.4587097891869055e-06, "loss": 0.18808601796627045, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.260986328125, "rewards/margins": 33.55162048339844, "rewards/rejected": 2.7131147384643555, "step": 2142 }, { "epoch": 1.1092132505175982, "grad_norm": 0.7899831533432007, "learning_rate": 7.4562252686344e-06, "loss": 0.05354539677500725, "rewards/accuracies": 0.984375, "rewards/chosen": 42.370330810546875, "rewards/margins": 39.08296203613281, "rewards/rejected": 3.29508113861084, "step": 2143 }, { "epoch": 1.1097308488612836, "grad_norm": 1.802019476890564, "learning_rate": 7.453739948440046e-06, "loss": 0.18174371123313904, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.721649169921875, "rewards/margins": 30.774093627929688, "rewards/rejected": 1.949228286743164, "step": 2144 }, { "epoch": 1.110248447204969, "grad_norm": 1.632644534111023, "learning_rate": 7.4512538294129585e-06, "loss": 0.11881732940673828, "rewards/accuracies": 0.9375, "rewards/chosen": 33.02873992919922, "rewards/margins": 31.979698181152344, "rewards/rejected": 1.0394763946533203, "step": 2145 }, { "epoch": 1.1107660455486543, "grad_norm": 2.227755069732666, "learning_rate": 7.44876691236251e-06, "loss": 0.11226137727499008, "rewards/accuracies": 0.9609375, "rewards/chosen": 31.662992477416992, "rewards/margins": 30.107948303222656, "rewards/rejected": 1.5568828582763672, "step": 2146 }, { "epoch": 1.1112836438923395, "grad_norm": 1.5370770692825317, "learning_rate": 7.446279198098334e-06, "loss": 0.15423737466335297, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.46218490600586, "rewards/margins": 30.27789306640625, "rewards/rejected": 2.1827497482299805, "step": 2147 }, { "epoch": 1.1118012422360248, "grad_norm": 2.9829177856445312, "learning_rate": 7.4437906874303245e-06, "loss": 0.12147924304008484, "rewards/accuracies": 0.9765625, "rewards/chosen": 28.914443969726562, "rewards/margins": 28.017486572265625, "rewards/rejected": 0.9010524749755859, "step": 2148 }, { "epoch": 1.1123188405797102, "grad_norm": 1.3667981624603271, "learning_rate": 7.441301381168635e-06, "loss": 0.11686765402555466, "rewards/accuracies": 0.9375, "rewards/chosen": 28.385223388671875, "rewards/margins": 27.111404418945312, "rewards/rejected": 1.2719941139221191, "step": 2149 }, { "epoch": 1.1128364389233953, "grad_norm": 0.9666358828544617, "learning_rate": 7.438811280123677e-06, "loss": 0.11069931089878082, "rewards/accuracies": 0.953125, "rewards/chosen": 31.20867156982422, "rewards/margins": 28.698402404785156, "rewards/rejected": 2.5046310424804688, "step": 2150 }, { "epoch": 1.1133540372670807, "grad_norm": 0.9822039604187012, "learning_rate": 7.43632038510612e-06, "loss": 0.10361620038747787, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.699892044067383, "rewards/margins": 27.63768768310547, "rewards/rejected": 1.055532455444336, "step": 2151 }, { "epoch": 1.113871635610766, "grad_norm": 1.839811086654663, "learning_rate": 7.43382869692689e-06, "loss": 0.1139284074306488, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.216182708740234, "rewards/margins": 26.706390380859375, "rewards/rejected": 1.5125007629394531, "step": 2152 }, { "epoch": 1.1143892339544514, "grad_norm": 1.0588622093200684, "learning_rate": 7.43133621639718e-06, "loss": 0.12086398154497147, "rewards/accuracies": 0.921875, "rewards/chosen": 31.543960571289062, "rewards/margins": 29.596038818359375, "rewards/rejected": 1.9429287910461426, "step": 2153 }, { "epoch": 1.1149068322981366, "grad_norm": 1.4535716772079468, "learning_rate": 7.428842944328432e-06, "loss": 0.15582239627838135, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.01632308959961, "rewards/margins": 24.494216918945312, "rewards/rejected": 1.5213165283203125, "step": 2154 }, { "epoch": 1.115424430641822, "grad_norm": 2.514718770980835, "learning_rate": 7.426348881532347e-06, "loss": 0.22722385823726654, "rewards/accuracies": 0.921875, "rewards/chosen": 29.490671157836914, "rewards/margins": 27.515594482421875, "rewards/rejected": 1.9754116535186768, "step": 2155 }, { "epoch": 1.1159420289855073, "grad_norm": 5.481996536254883, "learning_rate": 7.4238540288208875e-06, "loss": 0.1770443320274353, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.785524368286133, "rewards/margins": 24.8021240234375, "rewards/rejected": 1.9837169647216797, "step": 2156 }, { "epoch": 1.1164596273291925, "grad_norm": 1.2233725786209106, "learning_rate": 7.42135838700627e-06, "loss": 0.1433422565460205, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.81010627746582, "rewards/margins": 24.822311401367188, "rewards/rejected": 0.9934844970703125, "step": 2157 }, { "epoch": 1.1169772256728778, "grad_norm": 2.071589231491089, "learning_rate": 7.41886195690097e-06, "loss": 0.18568485975265503, "rewards/accuracies": 0.90625, "rewards/chosen": 23.11260986328125, "rewards/margins": 21.732192993164062, "rewards/rejected": 1.3739395141601562, "step": 2158 }, { "epoch": 1.1174948240165632, "grad_norm": 1.3300546407699585, "learning_rate": 7.416364739317715e-06, "loss": 0.16297458112239838, "rewards/accuracies": 0.90625, "rewards/chosen": 26.003677368164062, "rewards/margins": 24.21337890625, "rewards/rejected": 1.790945053100586, "step": 2159 }, { "epoch": 1.1180124223602483, "grad_norm": 1.787427544593811, "learning_rate": 7.413866735069497e-06, "loss": 0.12594276666641235, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.803749084472656, "rewards/margins": 22.4034423828125, "rewards/rejected": 1.3931368589401245, "step": 2160 }, { "epoch": 1.1185300207039337, "grad_norm": 1.4041286706924438, "learning_rate": 7.411367944969554e-06, "loss": 0.10835648328065872, "rewards/accuracies": 0.9375, "rewards/chosen": 26.95282745361328, "rewards/margins": 25.361083984375, "rewards/rejected": 1.5892982482910156, "step": 2161 }, { "epoch": 1.119047619047619, "grad_norm": 1.073197603225708, "learning_rate": 7.40886836983139e-06, "loss": 0.13870400190353394, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.235153198242188, "rewards/margins": 21.684974670410156, "rewards/rejected": 1.5461373329162598, "step": 2162 }, { "epoch": 1.1195652173913044, "grad_norm": 3.749574899673462, "learning_rate": 7.406368010468756e-06, "loss": 0.09601311385631561, "rewards/accuracies": 0.9765625, "rewards/chosen": 29.09778594970703, "rewards/margins": 27.220733642578125, "rewards/rejected": 1.878021240234375, "step": 2163 }, { "epoch": 1.1200828157349896, "grad_norm": 1.3605953454971313, "learning_rate": 7.4038668676956645e-06, "loss": 0.11093088984489441, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.202301025390625, "rewards/margins": 25.495635986328125, "rewards/rejected": 1.706436276435852, "step": 2164 }, { "epoch": 1.120600414078675, "grad_norm": 0.8254357576370239, "learning_rate": 7.401364942326379e-06, "loss": 0.09508208185434341, "rewards/accuracies": 0.96875, "rewards/chosen": 24.496706008911133, "rewards/margins": 23.258413314819336, "rewards/rejected": 1.237234115600586, "step": 2165 }, { "epoch": 1.1211180124223603, "grad_norm": 1.3199104070663452, "learning_rate": 7.3988622351754204e-06, "loss": 0.15549291670322418, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.0931396484375, "rewards/margins": 22.783966064453125, "rewards/rejected": 1.310577392578125, "step": 2166 }, { "epoch": 1.1216356107660455, "grad_norm": 1.9582023620605469, "learning_rate": 7.396358747057561e-06, "loss": 0.17665687203407288, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.362060546875, "rewards/margins": 21.07117462158203, "rewards/rejected": 1.2914352416992188, "step": 2167 }, { "epoch": 1.1221532091097308, "grad_norm": 1.5795224905014038, "learning_rate": 7.393854478787833e-06, "loss": 0.19585475325584412, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.54651641845703, "rewards/margins": 23.114532470703125, "rewards/rejected": 1.4280776977539062, "step": 2168 }, { "epoch": 1.1226708074534162, "grad_norm": 1.9234821796417236, "learning_rate": 7.391349431181516e-06, "loss": 0.1988915205001831, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.065074920654297, "rewards/margins": 22.82830810546875, "rewards/rejected": 1.2355670928955078, "step": 2169 }, { "epoch": 1.1231884057971016, "grad_norm": 0.938127875328064, "learning_rate": 7.388843605054148e-06, "loss": 0.14746221899986267, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.074081420898438, "rewards/margins": 21.72882080078125, "rewards/rejected": 1.3432693481445312, "step": 2170 }, { "epoch": 1.1237060041407867, "grad_norm": 2.162280321121216, "learning_rate": 7.386337001221518e-06, "loss": 0.16481655836105347, "rewards/accuracies": 0.921875, "rewards/chosen": 22.125072479248047, "rewards/margins": 21.10588836669922, "rewards/rejected": 1.0217854976654053, "step": 2171 }, { "epoch": 1.124223602484472, "grad_norm": 0.8881409168243408, "learning_rate": 7.383829620499667e-06, "loss": 0.1144377738237381, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.600723266601562, "rewards/margins": 23.3333740234375, "rewards/rejected": 1.2729835510253906, "step": 2172 }, { "epoch": 1.1247412008281574, "grad_norm": 1.0714515447616577, "learning_rate": 7.381321463704896e-06, "loss": 0.11434321850538254, "rewards/accuracies": 0.953125, "rewards/chosen": 24.82697296142578, "rewards/margins": 23.660308837890625, "rewards/rejected": 1.1597042083740234, "step": 2173 }, { "epoch": 1.1252587991718426, "grad_norm": 1.9157319068908691, "learning_rate": 7.378812531653747e-06, "loss": 0.19039949774742126, "rewards/accuracies": 0.921875, "rewards/chosen": 26.290130615234375, "rewards/margins": 23.99505615234375, "rewards/rejected": 2.296481132507324, "step": 2174 }, { "epoch": 1.125776397515528, "grad_norm": 0.936836302280426, "learning_rate": 7.376302825163029e-06, "loss": 0.15919125080108643, "rewards/accuracies": 0.921875, "rewards/chosen": 25.721694946289062, "rewards/margins": 24.084686279296875, "rewards/rejected": 1.6377530097961426, "step": 2175 }, { "epoch": 1.1262939958592133, "grad_norm": 2.610823631286621, "learning_rate": 7.373792345049786e-06, "loss": 0.20171892642974854, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.120532989501953, "rewards/margins": 24.435489654541016, "rewards/rejected": 1.6861646175384521, "step": 2176 }, { "epoch": 1.1268115942028984, "grad_norm": 1.031070351600647, "learning_rate": 7.3712810921313296e-06, "loss": 0.17158058285713196, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.90349578857422, "rewards/margins": 21.46417236328125, "rewards/rejected": 1.439910888671875, "step": 2177 }, { "epoch": 1.1273291925465838, "grad_norm": 1.3948496580123901, "learning_rate": 7.3687690672252136e-06, "loss": 0.14849844574928284, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.990951538085938, "rewards/margins": 26.0968017578125, "rewards/rejected": 1.8875572681427002, "step": 2178 }, { "epoch": 1.1278467908902692, "grad_norm": 0.8853744268417358, "learning_rate": 7.366256271149246e-06, "loss": 0.14456690847873688, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.80276107788086, "rewards/margins": 23.9359130859375, "rewards/rejected": 1.8650951385498047, "step": 2179 }, { "epoch": 1.1283643892339545, "grad_norm": 0.893592119216919, "learning_rate": 7.363742704721485e-06, "loss": 0.16175058484077454, "rewards/accuracies": 0.90625, "rewards/chosen": 26.672025680541992, "rewards/margins": 24.924842834472656, "rewards/rejected": 1.7523446083068848, "step": 2180 }, { "epoch": 1.1288819875776397, "grad_norm": 0.8102287650108337, "learning_rate": 7.361228368760243e-06, "loss": 0.12616628408432007, "rewards/accuracies": 0.9375, "rewards/chosen": 27.788694381713867, "rewards/margins": 25.700927734375, "rewards/rejected": 2.0852832794189453, "step": 2181 }, { "epoch": 1.129399585921325, "grad_norm": 1.2610594034194946, "learning_rate": 7.358713264084077e-06, "loss": 0.13163018226623535, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.015666961669922, "rewards/margins": 23.631118774414062, "rewards/rejected": 2.392876625061035, "step": 2182 }, { "epoch": 1.1299171842650104, "grad_norm": 1.3692060708999634, "learning_rate": 7.356197391511799e-06, "loss": 0.20121821761131287, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.280792236328125, "rewards/margins": 17.958541870117188, "rewards/rejected": 2.319244384765625, "step": 2183 }, { "epoch": 1.1304347826086956, "grad_norm": 0.690753698348999, "learning_rate": 7.353680751862469e-06, "loss": 0.11189890652894974, "rewards/accuracies": 0.953125, "rewards/chosen": 29.12311553955078, "rewards/margins": 26.764923095703125, "rewards/rejected": 2.357736587524414, "step": 2184 }, { "epoch": 1.130952380952381, "grad_norm": 0.6874547600746155, "learning_rate": 7.351163345955398e-06, "loss": 0.10557810962200165, "rewards/accuracies": 0.96875, "rewards/chosen": 28.264556884765625, "rewards/margins": 25.650405883789062, "rewards/rejected": 2.615345001220703, "step": 2185 }, { "epoch": 1.1314699792960663, "grad_norm": 0.7024004459381104, "learning_rate": 7.348645174610145e-06, "loss": 0.1192227229475975, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.520187377929688, "rewards/margins": 27.16735076904297, "rewards/rejected": 3.3543739318847656, "step": 2186 }, { "epoch": 1.1319875776397517, "grad_norm": 2.033020257949829, "learning_rate": 7.346126238646519e-06, "loss": 0.1525549590587616, "rewards/accuracies": 0.9375, "rewards/chosen": 31.087848663330078, "rewards/margins": 27.520294189453125, "rewards/rejected": 3.5663375854492188, "step": 2187 }, { "epoch": 1.1325051759834368, "grad_norm": 1.436774730682373, "learning_rate": 7.343606538884578e-06, "loss": 0.11814284324645996, "rewards/accuracies": 0.9375, "rewards/chosen": 31.58726692199707, "rewards/margins": 28.19415283203125, "rewards/rejected": 3.3921890258789062, "step": 2188 }, { "epoch": 1.1330227743271222, "grad_norm": 1.4791855812072754, "learning_rate": 7.341086076144628e-06, "loss": 0.16586343944072723, "rewards/accuracies": 0.9375, "rewards/chosen": 34.50712585449219, "rewards/margins": 30.232078552246094, "rewards/rejected": 4.275157928466797, "step": 2189 }, { "epoch": 1.1335403726708075, "grad_norm": 1.3660792112350464, "learning_rate": 7.338564851247224e-06, "loss": 0.1764029860496521, "rewards/accuracies": 0.921875, "rewards/chosen": 36.8509521484375, "rewards/margins": 33.65032958984375, "rewards/rejected": 3.200855255126953, "step": 2190 }, { "epoch": 1.1340579710144927, "grad_norm": 1.337322473526001, "learning_rate": 7.3360428650131685e-06, "loss": 0.12788447737693787, "rewards/accuracies": 0.953125, "rewards/chosen": 36.065643310546875, "rewards/margins": 31.24420166015625, "rewards/rejected": 4.816335678100586, "step": 2191 }, { "epoch": 1.134575569358178, "grad_norm": 2.152958869934082, "learning_rate": 7.333520118263513e-06, "loss": 0.17947918176651, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.893856048583984, "rewards/margins": 31.201705932617188, "rewards/rejected": 4.692775726318359, "step": 2192 }, { "epoch": 1.1350931677018634, "grad_norm": 2.3363399505615234, "learning_rate": 7.330996611819556e-06, "loss": 0.12853887677192688, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.2507438659668, "rewards/margins": 31.265090942382812, "rewards/rejected": 4.989068984985352, "step": 2193 }, { "epoch": 1.1356107660455486, "grad_norm": 1.1021301746368408, "learning_rate": 7.328472346502843e-06, "loss": 0.08069495856761932, "rewards/accuracies": 0.953125, "rewards/chosen": 33.582374572753906, "rewards/margins": 29.590164184570312, "rewards/rejected": 3.9813642501831055, "step": 2194 }, { "epoch": 1.136128364389234, "grad_norm": 1.247732400894165, "learning_rate": 7.325947323135166e-06, "loss": 0.14937537908554077, "rewards/accuracies": 0.953125, "rewards/chosen": 31.596763610839844, "rewards/margins": 26.776596069335938, "rewards/rejected": 4.823996543884277, "step": 2195 }, { "epoch": 1.1366459627329193, "grad_norm": 1.3739997148513794, "learning_rate": 7.323421542538566e-06, "loss": 0.13120713829994202, "rewards/accuracies": 0.9375, "rewards/chosen": 34.947540283203125, "rewards/margins": 31.294998168945312, "rewards/rejected": 3.659740447998047, "step": 2196 }, { "epoch": 1.1371635610766044, "grad_norm": 1.908879280090332, "learning_rate": 7.320895005535327e-06, "loss": 0.2569414973258972, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.88043212890625, "rewards/margins": 25.69317626953125, "rewards/rejected": 4.187854766845703, "step": 2197 }, { "epoch": 1.1376811594202898, "grad_norm": 1.0514575242996216, "learning_rate": 7.318367712947984e-06, "loss": 0.1588895320892334, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.86247634887695, "rewards/margins": 28.882568359375, "rewards/rejected": 3.9790191650390625, "step": 2198 }, { "epoch": 1.1381987577639752, "grad_norm": 1.745421051979065, "learning_rate": 7.315839665599314e-06, "loss": 0.2045266479253769, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.641883850097656, "rewards/margins": 26.459030151367188, "rewards/rejected": 5.188161849975586, "step": 2199 }, { "epoch": 1.1387163561076605, "grad_norm": 1.233625054359436, "learning_rate": 7.313310864312339e-06, "loss": 0.18061240017414093, "rewards/accuracies": 0.90625, "rewards/chosen": 28.881149291992188, "rewards/margins": 25.60572052001953, "rewards/rejected": 3.26625919342041, "step": 2200 }, { "epoch": 1.1392339544513457, "grad_norm": 1.3143633604049683, "learning_rate": 7.310781309910333e-06, "loss": 0.12702926993370056, "rewards/accuracies": 0.9375, "rewards/chosen": 31.251008987426758, "rewards/margins": 27.590652465820312, "rewards/rejected": 3.663797378540039, "step": 2201 }, { "epoch": 1.139751552795031, "grad_norm": 1.0738290548324585, "learning_rate": 7.308251003216805e-06, "loss": 0.15300753712654114, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.681896209716797, "rewards/margins": 27.85448455810547, "rewards/rejected": 2.8335514068603516, "step": 2202 }, { "epoch": 1.1402691511387164, "grad_norm": 1.064988374710083, "learning_rate": 7.3057199450555205e-06, "loss": 0.20015141367912292, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.06219482421875, "rewards/margins": 25.00301742553711, "rewards/rejected": 2.062889814376831, "step": 2203 }, { "epoch": 1.1407867494824018, "grad_norm": 0.6910951733589172, "learning_rate": 7.303188136250481e-06, "loss": 0.12442474067211151, "rewards/accuracies": 0.9375, "rewards/chosen": 26.84408187866211, "rewards/margins": 24.815521240234375, "rewards/rejected": 2.0188827514648438, "step": 2204 }, { "epoch": 1.141304347826087, "grad_norm": 0.9347885847091675, "learning_rate": 7.300655577625934e-06, "loss": 0.13156674802303314, "rewards/accuracies": 0.921875, "rewards/chosen": 25.79094886779785, "rewards/margins": 22.943138122558594, "rewards/rejected": 2.8510210514068604, "step": 2205 }, { "epoch": 1.1418219461697723, "grad_norm": 1.1101967096328735, "learning_rate": 7.298122270006373e-06, "loss": 0.18872031569480896, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.96630859375, "rewards/margins": 24.343482971191406, "rewards/rejected": 3.6242923736572266, "step": 2206 }, { "epoch": 1.1423395445134576, "grad_norm": 0.9183199405670166, "learning_rate": 7.295588214216535e-06, "loss": 0.17820699512958527, "rewards/accuracies": 0.9375, "rewards/chosen": 29.19436264038086, "rewards/margins": 25.90093994140625, "rewards/rejected": 3.2878551483154297, "step": 2207 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6287526488304138, "learning_rate": 7.293053411081401e-06, "loss": 0.1015433520078659, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.560022354125977, "rewards/margins": 24.041259765625, "rewards/rejected": 2.520669937133789, "step": 2208 }, { "epoch": 1.1433747412008282, "grad_norm": 0.7408047318458557, "learning_rate": 7.2905178614261925e-06, "loss": 0.15444856882095337, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.717119216918945, "rewards/margins": 20.467811584472656, "rewards/rejected": 2.2445411682128906, "step": 2209 }, { "epoch": 1.1438923395445135, "grad_norm": 0.6241381168365479, "learning_rate": 7.2879815660763776e-06, "loss": 0.10931381583213806, "rewards/accuracies": 0.96875, "rewards/chosen": 27.862651824951172, "rewards/margins": 24.749221801757812, "rewards/rejected": 3.113247871398926, "step": 2210 }, { "epoch": 1.1444099378881987, "grad_norm": 0.8103518486022949, "learning_rate": 7.285444525857666e-06, "loss": 0.1417980194091797, "rewards/accuracies": 0.96875, "rewards/chosen": 23.561084747314453, "rewards/margins": 21.36437225341797, "rewards/rejected": 2.194528579711914, "step": 2211 }, { "epoch": 1.144927536231884, "grad_norm": 0.8572102189064026, "learning_rate": 7.282906741596005e-06, "loss": 0.12196248769760132, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.668163299560547, "rewards/margins": 22.442367553710938, "rewards/rejected": 2.222362518310547, "step": 2212 }, { "epoch": 1.1454451345755694, "grad_norm": 0.5719884037971497, "learning_rate": 7.280368214117594e-06, "loss": 0.09200868010520935, "rewards/accuracies": 0.953125, "rewards/chosen": 28.81022834777832, "rewards/margins": 24.80323028564453, "rewards/rejected": 4.001707553863525, "step": 2213 }, { "epoch": 1.1459627329192545, "grad_norm": 1.1379433870315552, "learning_rate": 7.277828944248868e-06, "loss": 0.19508123397827148, "rewards/accuracies": 0.90625, "rewards/chosen": 25.390108108520508, "rewards/margins": 22.499267578125, "rewards/rejected": 2.8900835514068604, "step": 2214 }, { "epoch": 1.14648033126294, "grad_norm": 1.2964218854904175, "learning_rate": 7.275288932816502e-06, "loss": 0.16203820705413818, "rewards/accuracies": 0.921875, "rewards/chosen": 25.480167388916016, "rewards/margins": 22.349288940429688, "rewards/rejected": 3.125213623046875, "step": 2215 }, { "epoch": 1.1469979296066253, "grad_norm": 1.2281830310821533, "learning_rate": 7.2727481806474175e-06, "loss": 0.13492459058761597, "rewards/accuracies": 0.9375, "rewards/chosen": 23.409976959228516, "rewards/margins": 20.514007568359375, "rewards/rejected": 2.891514778137207, "step": 2216 }, { "epoch": 1.1475155279503106, "grad_norm": 1.5767723321914673, "learning_rate": 7.2702066885687724e-06, "loss": 0.17158916592597961, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.103654861450195, "rewards/margins": 19.422122955322266, "rewards/rejected": 3.690561294555664, "step": 2217 }, { "epoch": 1.1480331262939958, "grad_norm": 1.0944709777832031, "learning_rate": 7.267664457407969e-06, "loss": 0.071540467441082, "rewards/accuracies": 0.9765625, "rewards/chosen": 26.14598846435547, "rewards/margins": 23.0606689453125, "rewards/rejected": 3.0876755714416504, "step": 2218 }, { "epoch": 1.1485507246376812, "grad_norm": 1.2727106809616089, "learning_rate": 7.2651214879926504e-06, "loss": 0.14360956847667694, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.558429718017578, "rewards/margins": 23.412673950195312, "rewards/rejected": 3.1538562774658203, "step": 2219 }, { "epoch": 1.1490683229813665, "grad_norm": 1.1235343217849731, "learning_rate": 7.262577781150696e-06, "loss": 0.11769416183233261, "rewards/accuracies": 0.953125, "rewards/chosen": 23.6566162109375, "rewards/margins": 21.97441864013672, "rewards/rejected": 1.681344985961914, "step": 2220 }, { "epoch": 1.1495859213250517, "grad_norm": 1.204291582107544, "learning_rate": 7.260033337710229e-06, "loss": 0.15778124332427979, "rewards/accuracies": 0.921875, "rewards/chosen": 25.105010986328125, "rewards/margins": 22.095184326171875, "rewards/rejected": 3.0094804763793945, "step": 2221 }, { "epoch": 1.150103519668737, "grad_norm": 1.38496994972229, "learning_rate": 7.257488158499611e-06, "loss": 0.08841326832771301, "rewards/accuracies": 0.9765625, "rewards/chosen": 28.09770965576172, "rewards/margins": 24.629592895507812, "rewards/rejected": 3.462512969970703, "step": 2222 }, { "epoch": 1.1506211180124224, "grad_norm": 1.1009060144424438, "learning_rate": 7.2549422443474446e-06, "loss": 0.14466460049152374, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.625499725341797, "rewards/margins": 23.0164794921875, "rewards/rejected": 1.6080303192138672, "step": 2223 }, { "epoch": 1.1511387163561078, "grad_norm": 1.5424039363861084, "learning_rate": 7.252395596082571e-06, "loss": 0.13762840628623962, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.960084915161133, "rewards/margins": 25.841629028320312, "rewards/rejected": 3.1171340942382812, "step": 2224 }, { "epoch": 1.151656314699793, "grad_norm": 1.151947259902954, "learning_rate": 7.249848214534067e-06, "loss": 0.0979817658662796, "rewards/accuracies": 0.9375, "rewards/chosen": 27.18248748779297, "rewards/margins": 24.535858154296875, "rewards/rejected": 2.6488454341888428, "step": 2225 }, { "epoch": 1.1521739130434783, "grad_norm": 2.2591469287872314, "learning_rate": 7.247300100531254e-06, "loss": 0.14366239309310913, "rewards/accuracies": 0.9375, "rewards/chosen": 24.977306365966797, "rewards/margins": 22.005584716796875, "rewards/rejected": 2.977243423461914, "step": 2226 }, { "epoch": 1.1526915113871636, "grad_norm": 1.2108547687530518, "learning_rate": 7.244751254903688e-06, "loss": 0.13466067612171173, "rewards/accuracies": 0.953125, "rewards/chosen": 28.782196044921875, "rewards/margins": 25.647483825683594, "rewards/rejected": 3.1369380950927734, "step": 2227 }, { "epoch": 1.1532091097308488, "grad_norm": 1.7043554782867432, "learning_rate": 7.242201678481162e-06, "loss": 0.13200518488883972, "rewards/accuracies": 0.9375, "rewards/chosen": 25.629165649414062, "rewards/margins": 23.68994140625, "rewards/rejected": 1.9339141845703125, "step": 2228 }, { "epoch": 1.1537267080745341, "grad_norm": 1.7725648880004883, "learning_rate": 7.239651372093712e-06, "loss": 0.18356986343860626, "rewards/accuracies": 0.90625, "rewards/chosen": 30.095596313476562, "rewards/margins": 26.36749267578125, "rewards/rejected": 3.7251205444335938, "step": 2229 }, { "epoch": 1.1542443064182195, "grad_norm": 0.9397071003913879, "learning_rate": 7.237100336571607e-06, "loss": 0.09884814918041229, "rewards/accuracies": 0.96875, "rewards/chosen": 25.916711807250977, "rewards/margins": 23.78387451171875, "rewards/rejected": 2.1362838745117188, "step": 2230 }, { "epoch": 1.1547619047619047, "grad_norm": 1.751543641090393, "learning_rate": 7.2345485727453555e-06, "loss": 0.17523270845413208, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.912372589111328, "rewards/margins": 20.483673095703125, "rewards/rejected": 2.422607421875, "step": 2231 }, { "epoch": 1.15527950310559, "grad_norm": 1.0941683053970337, "learning_rate": 7.231996081445702e-06, "loss": 0.1432904154062271, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.940773010253906, "rewards/margins": 25.099090576171875, "rewards/rejected": 3.837881088256836, "step": 2232 }, { "epoch": 1.1557971014492754, "grad_norm": 0.9868553876876831, "learning_rate": 7.22944286350363e-06, "loss": 0.08418327569961548, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.349197387695312, "rewards/margins": 23.184234619140625, "rewards/rejected": 3.164031982421875, "step": 2233 }, { "epoch": 1.1563146997929608, "grad_norm": 2.178563356399536, "learning_rate": 7.2268889197503555e-06, "loss": 0.12854145467281342, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.513004302978516, "rewards/margins": 24.30731201171875, "rewards/rejected": 4.209236145019531, "step": 2234 }, { "epoch": 1.156832298136646, "grad_norm": 1.0869098901748657, "learning_rate": 7.2243342510173365e-06, "loss": 0.121034175157547, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.818763732910156, "rewards/margins": 22.83709716796875, "rewards/rejected": 2.976165771484375, "step": 2235 }, { "epoch": 1.1573498964803313, "grad_norm": 1.0294502973556519, "learning_rate": 7.22177885813626e-06, "loss": 0.09320767223834991, "rewards/accuracies": 0.9375, "rewards/chosen": 27.50289535522461, "rewards/margins": 23.0498046875, "rewards/rejected": 4.4520416259765625, "step": 2236 }, { "epoch": 1.1578674948240166, "grad_norm": 1.23548424243927, "learning_rate": 7.219222741939056e-06, "loss": 0.1349743902683258, "rewards/accuracies": 0.953125, "rewards/chosen": 29.385770797729492, "rewards/margins": 25.51202392578125, "rewards/rejected": 3.875980854034424, "step": 2237 }, { "epoch": 1.1583850931677018, "grad_norm": 1.2521600723266602, "learning_rate": 7.216665903257885e-06, "loss": 0.0949859619140625, "rewards/accuracies": 0.96875, "rewards/chosen": 30.62374496459961, "rewards/margins": 25.874679565429688, "rewards/rejected": 4.755699157714844, "step": 2238 }, { "epoch": 1.1589026915113871, "grad_norm": 1.430374264717102, "learning_rate": 7.214108342925144e-06, "loss": 0.17292463779449463, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.882984161376953, "rewards/margins": 22.202232360839844, "rewards/rejected": 6.680498123168945, "step": 2239 }, { "epoch": 1.1594202898550725, "grad_norm": 1.184849739074707, "learning_rate": 7.211550061773467e-06, "loss": 0.1241026520729065, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.71228790283203, "rewards/margins": 24.477142333984375, "rewards/rejected": 5.2337188720703125, "step": 2240 }, { "epoch": 1.1599378881987579, "grad_norm": 1.6190389394760132, "learning_rate": 7.208991060635717e-06, "loss": 0.1730194091796875, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.223453521728516, "rewards/margins": 26.463973999023438, "rewards/rejected": 5.753871917724609, "step": 2241 }, { "epoch": 1.160455486542443, "grad_norm": 1.0078593492507935, "learning_rate": 7.206431340345001e-06, "loss": 0.12025478482246399, "rewards/accuracies": 0.953125, "rewards/chosen": 32.51433563232422, "rewards/margins": 25.67303466796875, "rewards/rejected": 6.845320701599121, "step": 2242 }, { "epoch": 1.1609730848861284, "grad_norm": 1.1825764179229736, "learning_rate": 7.203870901734649e-06, "loss": 0.1043350100517273, "rewards/accuracies": 0.953125, "rewards/chosen": 31.85509490966797, "rewards/margins": 26.151233673095703, "rewards/rejected": 5.6978759765625, "step": 2243 }, { "epoch": 1.1614906832298137, "grad_norm": 0.6512780785560608, "learning_rate": 7.201309745638233e-06, "loss": 0.06109112873673439, "rewards/accuracies": 0.96875, "rewards/chosen": 38.29066467285156, "rewards/margins": 30.978607177734375, "rewards/rejected": 7.312675476074219, "step": 2244 }, { "epoch": 1.162008281573499, "grad_norm": 1.0856977701187134, "learning_rate": 7.198747872889555e-06, "loss": 0.13596707582473755, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.357696533203125, "rewards/margins": 27.416915893554688, "rewards/rejected": 6.9422760009765625, "step": 2245 }, { "epoch": 1.1625258799171843, "grad_norm": 0.9238613843917847, "learning_rate": 7.196185284322652e-06, "loss": 0.134514719247818, "rewards/accuracies": 0.9375, "rewards/chosen": 32.21533203125, "rewards/margins": 25.76421356201172, "rewards/rejected": 6.449106216430664, "step": 2246 }, { "epoch": 1.1630434782608696, "grad_norm": 2.1069107055664062, "learning_rate": 7.193621980771793e-06, "loss": 0.14876285195350647, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.76986312866211, "rewards/margins": 24.83564567565918, "rewards/rejected": 6.937447547912598, "step": 2247 }, { "epoch": 1.1635610766045548, "grad_norm": 1.4292699098587036, "learning_rate": 7.191057963071479e-06, "loss": 0.1593664586544037, "rewards/accuracies": 0.921875, "rewards/chosen": 35.237083435058594, "rewards/margins": 26.915420532226562, "rewards/rejected": 8.324951171875, "step": 2248 }, { "epoch": 1.1640786749482401, "grad_norm": 4.171023845672607, "learning_rate": 7.188493232056446e-06, "loss": 0.20342375338077545, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.277427673339844, "rewards/margins": 25.4827880859375, "rewards/rejected": 8.79898452758789, "step": 2249 }, { "epoch": 1.1645962732919255, "grad_norm": 1.0622819662094116, "learning_rate": 7.18592778856166e-06, "loss": 0.10626475512981415, "rewards/accuracies": 0.96875, "rewards/chosen": 33.84077453613281, "rewards/margins": 26.644439697265625, "rewards/rejected": 7.18879508972168, "step": 2250 }, { "epoch": 1.1651138716356106, "grad_norm": 1.1797401905059814, "learning_rate": 7.183361633422317e-06, "loss": 0.16710591316223145, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.3685302734375, "rewards/margins": 20.872188568115234, "rewards/rejected": 4.492837429046631, "step": 2251 }, { "epoch": 1.165631469979296, "grad_norm": 1.1410191059112549, "learning_rate": 7.180794767473851e-06, "loss": 0.12524180114269257, "rewards/accuracies": 0.9375, "rewards/chosen": 29.015182495117188, "rewards/margins": 23.951812744140625, "rewards/rejected": 5.058986663818359, "step": 2252 }, { "epoch": 1.1661490683229814, "grad_norm": 1.1207937002182007, "learning_rate": 7.178227191551922e-06, "loss": 0.14949651062488556, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.3511962890625, "rewards/margins": 23.750701904296875, "rewards/rejected": 5.598876953125, "step": 2253 }, { "epoch": 1.1666666666666667, "grad_norm": 1.7227706909179688, "learning_rate": 7.175658906492424e-06, "loss": 0.1204323023557663, "rewards/accuracies": 0.953125, "rewards/chosen": 25.393280029296875, "rewards/margins": 21.48486328125, "rewards/rejected": 3.9095993041992188, "step": 2254 }, { "epoch": 1.1671842650103519, "grad_norm": 1.000255823135376, "learning_rate": 7.173089913131482e-06, "loss": 0.138499915599823, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.926416397094727, "rewards/margins": 21.21631622314453, "rewards/rejected": 3.7021090984344482, "step": 2255 }, { "epoch": 1.1677018633540373, "grad_norm": 0.9783153533935547, "learning_rate": 7.170520212305444e-06, "loss": 0.12479671835899353, "rewards/accuracies": 0.9375, "rewards/chosen": 28.78812026977539, "rewards/margins": 24.363967895507812, "rewards/rejected": 4.434898376464844, "step": 2256 }, { "epoch": 1.1682194616977226, "grad_norm": 0.7671397924423218, "learning_rate": 7.1679498048509025e-06, "loss": 0.1273425966501236, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.08701515197754, "rewards/margins": 21.16461181640625, "rewards/rejected": 3.9255142211914062, "step": 2257 }, { "epoch": 1.168737060041408, "grad_norm": 1.713484764099121, "learning_rate": 7.165378691604666e-06, "loss": 0.15749439597129822, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.088363647460938, "rewards/margins": 22.461212158203125, "rewards/rejected": 4.629411220550537, "step": 2258 }, { "epoch": 1.1692546583850931, "grad_norm": 1.0753916501998901, "learning_rate": 7.162806873403782e-06, "loss": 0.12347292900085449, "rewards/accuracies": 0.9375, "rewards/chosen": 25.25723648071289, "rewards/margins": 21.75037384033203, "rewards/rejected": 3.5008907318115234, "step": 2259 }, { "epoch": 1.1697722567287785, "grad_norm": 0.8230679631233215, "learning_rate": 7.160234351085525e-06, "loss": 0.12316729873418808, "rewards/accuracies": 0.9375, "rewards/chosen": 23.71377182006836, "rewards/margins": 20.666778564453125, "rewards/rejected": 3.049701690673828, "step": 2260 }, { "epoch": 1.1702898550724639, "grad_norm": 0.5634847283363342, "learning_rate": 7.157661125487398e-06, "loss": 0.07647988945245743, "rewards/accuracies": 0.96875, "rewards/chosen": 27.732213973999023, "rewards/margins": 23.863754272460938, "rewards/rejected": 3.8607711791992188, "step": 2261 }, { "epoch": 1.170807453416149, "grad_norm": 1.0569170713424683, "learning_rate": 7.15508719744713e-06, "loss": 0.14148661494255066, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.13996124267578, "rewards/margins": 22.225997924804688, "rewards/rejected": 3.9131031036376953, "step": 2262 }, { "epoch": 1.1713250517598344, "grad_norm": 0.9492385983467102, "learning_rate": 7.1525125678026875e-06, "loss": 0.1609281599521637, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.60128402709961, "rewards/margins": 17.965744018554688, "rewards/rejected": 2.6297531127929688, "step": 2263 }, { "epoch": 1.1718426501035197, "grad_norm": 0.9725450873374939, "learning_rate": 7.149937237392253e-06, "loss": 0.16115835309028625, "rewards/accuracies": 0.890625, "rewards/chosen": 25.001724243164062, "rewards/margins": 21.35223388671875, "rewards/rejected": 3.6583938598632812, "step": 2264 }, { "epoch": 1.1723602484472049, "grad_norm": 0.8590071201324463, "learning_rate": 7.14736120705425e-06, "loss": 0.16122972965240479, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.979087829589844, "rewards/margins": 20.970504760742188, "rewards/rejected": 3.0109901428222656, "step": 2265 }, { "epoch": 1.1728778467908902, "grad_norm": 1.6753244400024414, "learning_rate": 7.144784477627319e-06, "loss": 0.14619538187980652, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.596099853515625, "rewards/margins": 21.19781494140625, "rewards/rejected": 3.3942108154296875, "step": 2266 }, { "epoch": 1.1733954451345756, "grad_norm": 0.8808525800704956, "learning_rate": 7.142207049950336e-06, "loss": 0.11107124388217926, "rewards/accuracies": 0.953125, "rewards/chosen": 25.0344295501709, "rewards/margins": 21.61522674560547, "rewards/rejected": 3.421506404876709, "step": 2267 }, { "epoch": 1.1739130434782608, "grad_norm": 1.2472667694091797, "learning_rate": 7.1396289248624e-06, "loss": 0.14494457840919495, "rewards/accuracies": 0.953125, "rewards/chosen": 23.782093048095703, "rewards/margins": 21.027114868164062, "rewards/rejected": 2.7560958862304688, "step": 2268 }, { "epoch": 1.1744306418219461, "grad_norm": 1.3459099531173706, "learning_rate": 7.137050103202838e-06, "loss": 0.1677464246749878, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.59139633178711, "rewards/margins": 22.9149169921875, "rewards/rejected": 4.67962646484375, "step": 2269 }, { "epoch": 1.1749482401656315, "grad_norm": 3.5233330726623535, "learning_rate": 7.134470585811206e-06, "loss": 0.2327583134174347, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.178489685058594, "rewards/margins": 22.963546752929688, "rewards/rejected": 3.2149534225463867, "step": 2270 }, { "epoch": 1.1754658385093169, "grad_norm": 1.219714641571045, "learning_rate": 7.13189037352728e-06, "loss": 0.1260133981704712, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.483551025390625, "rewards/margins": 23.141677856445312, "rewards/rejected": 3.3435516357421875, "step": 2271 }, { "epoch": 1.175983436853002, "grad_norm": 1.3339368104934692, "learning_rate": 7.129309467191072e-06, "loss": 0.16864171624183655, "rewards/accuracies": 0.8828125, "rewards/chosen": 24.696456909179688, "rewards/margins": 21.737014770507812, "rewards/rejected": 2.9619216918945312, "step": 2272 }, { "epoch": 1.1765010351966874, "grad_norm": 0.8120576739311218, "learning_rate": 7.126727867642811e-06, "loss": 0.1177683025598526, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.39373016357422, "rewards/margins": 21.922470092773438, "rewards/rejected": 2.4743566513061523, "step": 2273 }, { "epoch": 1.1770186335403727, "grad_norm": 1.2016119956970215, "learning_rate": 7.124145575722956e-06, "loss": 0.15679730474948883, "rewards/accuracies": 0.9375, "rewards/chosen": 23.670740127563477, "rewards/margins": 21.063568115234375, "rewards/rejected": 2.6124420166015625, "step": 2274 }, { "epoch": 1.177536231884058, "grad_norm": 1.544811725616455, "learning_rate": 7.121562592272192e-06, "loss": 0.16637668013572693, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.730342864990234, "rewards/margins": 21.862945556640625, "rewards/rejected": 2.8732223510742188, "step": 2275 }, { "epoch": 1.1780538302277432, "grad_norm": 1.334954857826233, "learning_rate": 7.118978918131428e-06, "loss": 0.18558736145496368, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.751211166381836, "rewards/margins": 19.606857299804688, "rewards/rejected": 2.1478347778320312, "step": 2276 }, { "epoch": 1.1785714285714286, "grad_norm": 1.116167426109314, "learning_rate": 7.1163945541417955e-06, "loss": 0.09461402148008347, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.64282989501953, "rewards/margins": 22.865264892578125, "rewards/rejected": 2.7815613746643066, "step": 2277 }, { "epoch": 1.179089026915114, "grad_norm": 1.112051248550415, "learning_rate": 7.1138095011446555e-06, "loss": 0.13752812147140503, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.602359771728516, "rewards/margins": 22.259830474853516, "rewards/rejected": 3.3460302352905273, "step": 2278 }, { "epoch": 1.1796066252587991, "grad_norm": 0.999359667301178, "learning_rate": 7.1112237599815885e-06, "loss": 0.13093245029449463, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.075807571411133, "rewards/margins": 21.031028747558594, "rewards/rejected": 3.0367088317871094, "step": 2279 }, { "epoch": 1.1801242236024845, "grad_norm": 2.2357895374298096, "learning_rate": 7.108637331494402e-06, "loss": 0.15044380724430084, "rewards/accuracies": 0.9375, "rewards/chosen": 23.81995391845703, "rewards/margins": 20.488174438476562, "rewards/rejected": 3.3313751220703125, "step": 2280 }, { "epoch": 1.1806418219461698, "grad_norm": 0.8139594793319702, "learning_rate": 7.106050216525127e-06, "loss": 0.14093369245529175, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.155214309692383, "rewards/margins": 22.666351318359375, "rewards/rejected": 3.4933419227600098, "step": 2281 }, { "epoch": 1.181159420289855, "grad_norm": 1.0746064186096191, "learning_rate": 7.103462415916016e-06, "loss": 0.12317131459712982, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.29340171813965, "rewards/margins": 21.347496032714844, "rewards/rejected": 2.9453201293945312, "step": 2282 }, { "epoch": 1.1816770186335404, "grad_norm": 1.2121727466583252, "learning_rate": 7.100873930509549e-06, "loss": 0.17158499360084534, "rewards/accuracies": 0.9375, "rewards/chosen": 27.530048370361328, "rewards/margins": 23.598590850830078, "rewards/rejected": 3.9357309341430664, "step": 2283 }, { "epoch": 1.1821946169772257, "grad_norm": 1.0081658363342285, "learning_rate": 7.098284761148422e-06, "loss": 0.19365978240966797, "rewards/accuracies": 0.8984375, "rewards/chosen": 20.48761749267578, "rewards/margins": 18.095046997070312, "rewards/rejected": 2.4017295837402344, "step": 2284 }, { "epoch": 1.1827122153209109, "grad_norm": 0.7054557800292969, "learning_rate": 7.095694908675562e-06, "loss": 0.09923669695854187, "rewards/accuracies": 0.9375, "rewards/chosen": 27.823623657226562, "rewards/margins": 23.850082397460938, "rewards/rejected": 3.977458953857422, "step": 2285 }, { "epoch": 1.1832298136645962, "grad_norm": 0.7859513163566589, "learning_rate": 7.093104373934111e-06, "loss": 0.12486106157302856, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.65433692932129, "rewards/margins": 20.86119842529297, "rewards/rejected": 2.785848617553711, "step": 2286 }, { "epoch": 1.1837474120082816, "grad_norm": 0.9885607361793518, "learning_rate": 7.090513157767438e-06, "loss": 0.18114185333251953, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.974502563476562, "rewards/margins": 21.229873657226562, "rewards/rejected": 2.7443089485168457, "step": 2287 }, { "epoch": 1.184265010351967, "grad_norm": 0.9312283396720886, "learning_rate": 7.0879212610191305e-06, "loss": 0.1340234875679016, "rewards/accuracies": 0.921875, "rewards/chosen": 26.713401794433594, "rewards/margins": 23.957717895507812, "rewards/rejected": 2.7517547607421875, "step": 2288 }, { "epoch": 1.184782608695652, "grad_norm": 1.0787936449050903, "learning_rate": 7.085328684533001e-06, "loss": 0.167147696018219, "rewards/accuracies": 0.90625, "rewards/chosen": 24.516002655029297, "rewards/margins": 22.211578369140625, "rewards/rejected": 2.303799867630005, "step": 2289 }, { "epoch": 1.1853002070393375, "grad_norm": 1.371911644935608, "learning_rate": 7.0827354291530805e-06, "loss": 0.16759926080703735, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.989547729492188, "rewards/margins": 22.035850524902344, "rewards/rejected": 2.9496726989746094, "step": 2290 }, { "epoch": 1.1858178053830228, "grad_norm": 0.9535888433456421, "learning_rate": 7.080141495723622e-06, "loss": 0.1416832059621811, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.91412353515625, "rewards/margins": 19.353065490722656, "rewards/rejected": 2.5511269569396973, "step": 2291 }, { "epoch": 1.186335403726708, "grad_norm": 0.659010112285614, "learning_rate": 7.077546885089098e-06, "loss": 0.09715131670236588, "rewards/accuracies": 0.953125, "rewards/chosen": 25.095182418823242, "rewards/margins": 22.521591186523438, "rewards/rejected": 2.5766754150390625, "step": 2292 }, { "epoch": 1.1868530020703933, "grad_norm": 1.101433515548706, "learning_rate": 7.074951598094206e-06, "loss": 0.14839698374271393, "rewards/accuracies": 0.921875, "rewards/chosen": 19.92753791809082, "rewards/margins": 18.146713256835938, "rewards/rejected": 1.7807579040527344, "step": 2293 }, { "epoch": 1.1873706004140787, "grad_norm": 1.0475643873214722, "learning_rate": 7.072355635583858e-06, "loss": 0.17872068285942078, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.05007553100586, "rewards/margins": 18.513172149658203, "rewards/rejected": 2.5366744995117188, "step": 2294 }, { "epoch": 1.187888198757764, "grad_norm": 3.181161403656006, "learning_rate": 7.069758998403189e-06, "loss": 0.2534743547439575, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.894651412963867, "rewards/margins": 22.678619384765625, "rewards/rejected": 3.2220306396484375, "step": 2295 }, { "epoch": 1.1884057971014492, "grad_norm": 0.7203473448753357, "learning_rate": 7.067161687397553e-06, "loss": 0.099469393491745, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.92998504638672, "rewards/margins": 22.872711181640625, "rewards/rejected": 3.0542831420898438, "step": 2296 }, { "epoch": 1.1889233954451346, "grad_norm": 1.164695143699646, "learning_rate": 7.0645637034125205e-06, "loss": 0.18283995985984802, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.799766540527344, "rewards/margins": 20.373947143554688, "rewards/rejected": 2.4221878051757812, "step": 2297 }, { "epoch": 1.18944099378882, "grad_norm": 0.9470036625862122, "learning_rate": 7.061965047293889e-06, "loss": 0.16348138451576233, "rewards/accuracies": 0.921875, "rewards/chosen": 24.174415588378906, "rewards/margins": 21.44605255126953, "rewards/rejected": 2.7292022705078125, "step": 2298 }, { "epoch": 1.189958592132505, "grad_norm": 0.7093616127967834, "learning_rate": 7.059365719887666e-06, "loss": 0.14293529093265533, "rewards/accuracies": 0.921875, "rewards/chosen": 27.180505752563477, "rewards/margins": 24.10986328125, "rewards/rejected": 3.0733375549316406, "step": 2299 }, { "epoch": 1.1904761904761905, "grad_norm": 1.6425896883010864, "learning_rate": 7.056765722040083e-06, "loss": 0.13234911859035492, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.033424377441406, "rewards/margins": 20.66143798828125, "rewards/rejected": 2.367203712463379, "step": 2300 }, { "epoch": 1.1909937888198758, "grad_norm": 0.8493044376373291, "learning_rate": 7.054165054597588e-06, "loss": 0.1470203995704651, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.30124282836914, "rewards/margins": 21.497406005859375, "rewards/rejected": 2.7943716049194336, "step": 2301 }, { "epoch": 1.191511387163561, "grad_norm": 1.1491825580596924, "learning_rate": 7.051563718406846e-06, "loss": 0.1544986367225647, "rewards/accuracies": 0.9375, "rewards/chosen": 27.84054946899414, "rewards/margins": 24.291610717773438, "rewards/rejected": 3.5460891723632812, "step": 2302 }, { "epoch": 1.1920289855072463, "grad_norm": 1.4129810333251953, "learning_rate": 7.048961714314742e-06, "loss": 0.13878411054611206, "rewards/accuracies": 0.9375, "rewards/chosen": 24.83385467529297, "rewards/margins": 21.903915405273438, "rewards/rejected": 2.926222801208496, "step": 2303 }, { "epoch": 1.1925465838509317, "grad_norm": 1.0329700708389282, "learning_rate": 7.046359043168376e-06, "loss": 0.09457665681838989, "rewards/accuracies": 0.953125, "rewards/chosen": 24.056884765625, "rewards/margins": 21.202674865722656, "rewards/rejected": 2.8519020080566406, "step": 2304 }, { "epoch": 1.193064182194617, "grad_norm": 0.9667677283287048, "learning_rate": 7.043755705815066e-06, "loss": 0.09819187223911285, "rewards/accuracies": 0.953125, "rewards/chosen": 23.866531372070312, "rewards/margins": 20.460113525390625, "rewards/rejected": 3.4023914337158203, "step": 2305 }, { "epoch": 1.1935817805383022, "grad_norm": 0.960441529750824, "learning_rate": 7.04115170310235e-06, "loss": 0.13152533769607544, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.248929977416992, "rewards/margins": 24.094879150390625, "rewards/rejected": 3.160038948059082, "step": 2306 }, { "epoch": 1.1940993788819876, "grad_norm": 1.3571878671646118, "learning_rate": 7.038547035877976e-06, "loss": 0.11761821806430817, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.52688217163086, "rewards/margins": 19.291152954101562, "rewards/rejected": 3.230318069458008, "step": 2307 }, { "epoch": 1.194616977225673, "grad_norm": 2.8814170360565186, "learning_rate": 7.035941704989916e-06, "loss": 0.15841172635555267, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.644256591796875, "rewards/margins": 21.67828369140625, "rewards/rejected": 3.963214874267578, "step": 2308 }, { "epoch": 1.195134575569358, "grad_norm": 0.9678093791007996, "learning_rate": 7.033335711286352e-06, "loss": 0.1429714858531952, "rewards/accuracies": 0.9375, "rewards/chosen": 28.840530395507812, "rewards/margins": 25.1668701171875, "rewards/rejected": 3.686891555786133, "step": 2309 }, { "epoch": 1.1956521739130435, "grad_norm": 2.0601680278778076, "learning_rate": 7.030729055615684e-06, "loss": 0.1505962610244751, "rewards/accuracies": 0.9375, "rewards/chosen": 23.21934700012207, "rewards/margins": 20.841949462890625, "rewards/rejected": 2.3758983612060547, "step": 2310 }, { "epoch": 1.1961697722567288, "grad_norm": 0.8591235280036926, "learning_rate": 7.0281217388265275e-06, "loss": 0.09347057342529297, "rewards/accuracies": 0.96875, "rewards/chosen": 26.224525451660156, "rewards/margins": 22.854736328125, "rewards/rejected": 3.368804931640625, "step": 2311 }, { "epoch": 1.1966873706004142, "grad_norm": 2.7423276901245117, "learning_rate": 7.0255137617677136e-06, "loss": 0.18120799958705902, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.0632381439209, "rewards/margins": 23.75049591064453, "rewards/rejected": 3.3111114501953125, "step": 2312 }, { "epoch": 1.1972049689440993, "grad_norm": 1.6325000524520874, "learning_rate": 7.022905125288291e-06, "loss": 0.12202871590852737, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.722305297851562, "rewards/margins": 22.183731079101562, "rewards/rejected": 3.5409679412841797, "step": 2313 }, { "epoch": 1.1977225672877847, "grad_norm": 1.2134592533111572, "learning_rate": 7.0202958302375155e-06, "loss": 0.11704269796609879, "rewards/accuracies": 0.9375, "rewards/chosen": 26.215787887573242, "rewards/margins": 23.463027954101562, "rewards/rejected": 2.7523345947265625, "step": 2314 }, { "epoch": 1.19824016563147, "grad_norm": 1.7304774522781372, "learning_rate": 7.017685877464864e-06, "loss": 0.18403016030788422, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.88335418701172, "rewards/margins": 25.19769287109375, "rewards/rejected": 3.6944198608398438, "step": 2315 }, { "epoch": 1.1987577639751552, "grad_norm": 0.9307501912117004, "learning_rate": 7.015075267820024e-06, "loss": 0.09191449731588364, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.69037628173828, "rewards/margins": 22.559547424316406, "rewards/rejected": 3.1342556476593018, "step": 2316 }, { "epoch": 1.1992753623188406, "grad_norm": 2.781764030456543, "learning_rate": 7.012464002152901e-06, "loss": 0.24421481788158417, "rewards/accuracies": 0.8671875, "rewards/chosen": 24.138946533203125, "rewards/margins": 21.4451904296875, "rewards/rejected": 2.6915512084960938, "step": 2317 }, { "epoch": 1.199792960662526, "grad_norm": 1.3205844163894653, "learning_rate": 7.0098520813136094e-06, "loss": 0.10584630072116852, "rewards/accuracies": 0.96875, "rewards/chosen": 27.196666717529297, "rewards/margins": 23.20452880859375, "rewards/rejected": 3.997844696044922, "step": 2318 }, { "epoch": 1.200310559006211, "grad_norm": 1.3728739023208618, "learning_rate": 7.0072395061524816e-06, "loss": 0.12914840877056122, "rewards/accuracies": 0.953125, "rewards/chosen": 29.1306209564209, "rewards/margins": 25.335586547851562, "rewards/rejected": 3.796365737915039, "step": 2319 }, { "epoch": 1.2008281573498965, "grad_norm": 0.9688580632209778, "learning_rate": 7.004626277520055e-06, "loss": 0.10382531583309174, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.718198776245117, "rewards/margins": 25.142044067382812, "rewards/rejected": 2.577850341796875, "step": 2320 }, { "epoch": 1.2013457556935818, "grad_norm": 1.6821612119674683, "learning_rate": 7.0020123962670895e-06, "loss": 0.18464358150959015, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.194530487060547, "rewards/margins": 23.49658966064453, "rewards/rejected": 2.6936416625976562, "step": 2321 }, { "epoch": 1.201863354037267, "grad_norm": 2.862234115600586, "learning_rate": 6.999397863244552e-06, "loss": 0.14230765402317047, "rewards/accuracies": 0.9375, "rewards/chosen": 27.821544647216797, "rewards/margins": 25.00152587890625, "rewards/rejected": 2.822482109069824, "step": 2322 }, { "epoch": 1.2023809523809523, "grad_norm": 1.347961187362671, "learning_rate": 6.996782679303621e-06, "loss": 0.11456086486577988, "rewards/accuracies": 0.953125, "rewards/chosen": 29.983646392822266, "rewards/margins": 26.21508026123047, "rewards/rejected": 3.775402069091797, "step": 2323 }, { "epoch": 1.2028985507246377, "grad_norm": 0.9130886197090149, "learning_rate": 6.994166845295689e-06, "loss": 0.1052369475364685, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.888694763183594, "rewards/margins": 25.3162841796875, "rewards/rejected": 3.5740604400634766, "step": 2324 }, { "epoch": 1.203416149068323, "grad_norm": 0.9838762283325195, "learning_rate": 6.9915503620723614e-06, "loss": 0.09651554375886917, "rewards/accuracies": 0.953125, "rewards/chosen": 30.982555389404297, "rewards/margins": 27.70477294921875, "rewards/rejected": 3.2760467529296875, "step": 2325 }, { "epoch": 1.2039337474120082, "grad_norm": 2.736815929412842, "learning_rate": 6.988933230485453e-06, "loss": 0.15532508492469788, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.565895080566406, "rewards/margins": 26.856666564941406, "rewards/rejected": 3.7091689109802246, "step": 2326 }, { "epoch": 1.2044513457556936, "grad_norm": 1.293771743774414, "learning_rate": 6.986315451386987e-06, "loss": 0.1136288195848465, "rewards/accuracies": 0.96875, "rewards/chosen": 30.09796142578125, "rewards/margins": 26.36798095703125, "rewards/rejected": 3.7358627319335938, "step": 2327 }, { "epoch": 1.204968944099379, "grad_norm": 1.464024305343628, "learning_rate": 6.9836970256292034e-06, "loss": 0.1930360496044159, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.15700149536133, "rewards/margins": 28.162086486816406, "rewards/rejected": 3.986469268798828, "step": 2328 }, { "epoch": 1.2054865424430643, "grad_norm": 1.6199556589126587, "learning_rate": 6.9810779540645475e-06, "loss": 0.10575146228075027, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.38745880126953, "rewards/margins": 28.975799560546875, "rewards/rejected": 4.409980773925781, "step": 2329 }, { "epoch": 1.2060041407867494, "grad_norm": 1.5155673027038574, "learning_rate": 6.978458237545679e-06, "loss": 0.10757851600646973, "rewards/accuracies": 0.953125, "rewards/chosen": 30.656097412109375, "rewards/margins": 26.952056884765625, "rewards/rejected": 3.6984920501708984, "step": 2330 }, { "epoch": 1.2065217391304348, "grad_norm": 1.3944898843765259, "learning_rate": 6.975837876925463e-06, "loss": 0.1349724382162094, "rewards/accuracies": 0.9609375, "rewards/chosen": 29.592060089111328, "rewards/margins": 24.731597900390625, "rewards/rejected": 4.859443664550781, "step": 2331 }, { "epoch": 1.2070393374741202, "grad_norm": 0.6822464466094971, "learning_rate": 6.973216873056981e-06, "loss": 0.08296267688274384, "rewards/accuracies": 0.96875, "rewards/chosen": 35.73638916015625, "rewards/margins": 31.457763671875, "rewards/rejected": 4.27911376953125, "step": 2332 }, { "epoch": 1.2075569358178053, "grad_norm": 2.1404061317443848, "learning_rate": 6.970595226793513e-06, "loss": 0.19165100157260895, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.687400817871094, "rewards/margins": 31.727615356445312, "rewards/rejected": 4.959621429443359, "step": 2333 }, { "epoch": 1.2080745341614907, "grad_norm": 1.5245062112808228, "learning_rate": 6.967972938988561e-06, "loss": 0.1249077096581459, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.96700096130371, "rewards/margins": 26.4910888671875, "rewards/rejected": 4.474028587341309, "step": 2334 }, { "epoch": 1.208592132505176, "grad_norm": 1.4243113994598389, "learning_rate": 6.965350010495825e-06, "loss": 0.11495058983564377, "rewards/accuracies": 0.9375, "rewards/chosen": 33.97217559814453, "rewards/margins": 29.71575927734375, "rewards/rejected": 4.246665954589844, "step": 2335 }, { "epoch": 1.2091097308488612, "grad_norm": 1.273297905921936, "learning_rate": 6.962726442169223e-06, "loss": 0.12424564361572266, "rewards/accuracies": 0.9375, "rewards/chosen": 32.815574645996094, "rewards/margins": 28.175827026367188, "rewards/rejected": 4.639196395874023, "step": 2336 }, { "epoch": 1.2096273291925466, "grad_norm": 1.2095621824264526, "learning_rate": 6.960102234862873e-06, "loss": 0.09840786457061768, "rewards/accuracies": 0.9375, "rewards/chosen": 34.54010772705078, "rewards/margins": 30.22381591796875, "rewards/rejected": 4.305179595947266, "step": 2337 }, { "epoch": 1.210144927536232, "grad_norm": 2.1352579593658447, "learning_rate": 6.957477389431107e-06, "loss": 0.18896308541297913, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.13433837890625, "rewards/margins": 27.301361083984375, "rewards/rejected": 4.835136413574219, "step": 2338 }, { "epoch": 1.210662525879917, "grad_norm": 1.0287647247314453, "learning_rate": 6.95485190672846e-06, "loss": 0.11713263392448425, "rewards/accuracies": 0.953125, "rewards/chosen": 33.091407775878906, "rewards/margins": 28.805877685546875, "rewards/rejected": 4.291370868682861, "step": 2339 }, { "epoch": 1.2111801242236024, "grad_norm": 1.5788887739181519, "learning_rate": 6.952225787609679e-06, "loss": 0.14362096786499023, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.32505798339844, "rewards/margins": 30.38001251220703, "rewards/rejected": 5.936470031738281, "step": 2340 }, { "epoch": 1.2116977225672878, "grad_norm": 1.2460697889328003, "learning_rate": 6.949599032929715e-06, "loss": 0.14393296837806702, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.881175994873047, "rewards/margins": 25.541351318359375, "rewards/rejected": 3.337451934814453, "step": 2341 }, { "epoch": 1.2122153209109732, "grad_norm": 1.1404672861099243, "learning_rate": 6.946971643543726e-06, "loss": 0.12091988325119019, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.88866424560547, "rewards/margins": 28.727874755859375, "rewards/rejected": 5.159492492675781, "step": 2342 }, { "epoch": 1.2127329192546583, "grad_norm": 1.1751610040664673, "learning_rate": 6.944343620307079e-06, "loss": 0.12478923797607422, "rewards/accuracies": 0.9375, "rewards/chosen": 33.33729553222656, "rewards/margins": 28.30938720703125, "rewards/rejected": 5.024269104003906, "step": 2343 }, { "epoch": 1.2132505175983437, "grad_norm": 1.8453624248504639, "learning_rate": 6.941714964075346e-06, "loss": 0.19480381906032562, "rewards/accuracies": 0.890625, "rewards/chosen": 31.048355102539062, "rewards/margins": 26.6019287109375, "rewards/rejected": 4.442588806152344, "step": 2344 }, { "epoch": 1.213768115942029, "grad_norm": 1.746930480003357, "learning_rate": 6.9390856757043045e-06, "loss": 0.1605677306652069, "rewards/accuracies": 0.921875, "rewards/chosen": 29.223758697509766, "rewards/margins": 24.517791748046875, "rewards/rejected": 4.709969520568848, "step": 2345 }, { "epoch": 1.2142857142857142, "grad_norm": 1.6424764394760132, "learning_rate": 6.936455756049936e-06, "loss": 0.08827921748161316, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.906814575195312, "rewards/margins": 25.3509521484375, "rewards/rejected": 3.5596580505371094, "step": 2346 }, { "epoch": 1.2148033126293996, "grad_norm": 0.7867241501808167, "learning_rate": 6.933825205968435e-06, "loss": 0.10978823900222778, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.150054931640625, "rewards/margins": 24.346664428710938, "rewards/rejected": 3.807872772216797, "step": 2347 }, { "epoch": 1.215320910973085, "grad_norm": 2.074399948120117, "learning_rate": 6.9311940263161906e-06, "loss": 0.17239433526992798, "rewards/accuracies": 0.921875, "rewards/chosen": 28.517593383789062, "rewards/margins": 25.159576416015625, "rewards/rejected": 3.3608970642089844, "step": 2348 }, { "epoch": 1.2158385093167703, "grad_norm": 2.7746496200561523, "learning_rate": 6.928562217949808e-06, "loss": 0.20970430970191956, "rewards/accuracies": 0.90625, "rewards/chosen": 26.875499725341797, "rewards/margins": 23.29693603515625, "rewards/rejected": 3.577953815460205, "step": 2349 }, { "epoch": 1.2163561076604554, "grad_norm": 0.9478808045387268, "learning_rate": 6.925929781726086e-06, "loss": 0.11979156732559204, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.268173217773438, "rewards/margins": 23.267318725585938, "rewards/rejected": 2.995290756225586, "step": 2350 }, { "epoch": 1.2168737060041408, "grad_norm": 1.1263269186019897, "learning_rate": 6.923296718502035e-06, "loss": 0.13554736971855164, "rewards/accuracies": 0.9375, "rewards/chosen": 27.997722625732422, "rewards/margins": 24.619171142578125, "rewards/rejected": 3.3763256072998047, "step": 2351 }, { "epoch": 1.2173913043478262, "grad_norm": 0.9488770961761475, "learning_rate": 6.920663029134869e-06, "loss": 0.13746078312397003, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.31552505493164, "rewards/margins": 20.833297729492188, "rewards/rejected": 2.4820406436920166, "step": 2352 }, { "epoch": 1.2179089026915113, "grad_norm": 1.0879889726638794, "learning_rate": 6.9180287144820035e-06, "loss": 0.15055528283119202, "rewards/accuracies": 0.921875, "rewards/chosen": 24.308242797851562, "rewards/margins": 21.311046600341797, "rewards/rejected": 3.002328872680664, "step": 2353 }, { "epoch": 1.2184265010351967, "grad_norm": 0.8190538883209229, "learning_rate": 6.915393775401058e-06, "loss": 0.12733811140060425, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.132720947265625, "rewards/margins": 20.24798583984375, "rewards/rejected": 2.8853893280029297, "step": 2354 }, { "epoch": 1.218944099378882, "grad_norm": 1.0841729640960693, "learning_rate": 6.9127582127498585e-06, "loss": 0.14046916365623474, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.252830505371094, "rewards/margins": 22.279800415039062, "rewards/rejected": 2.9742918014526367, "step": 2355 }, { "epoch": 1.2194616977225672, "grad_norm": 0.8455429673194885, "learning_rate": 6.910122027386428e-06, "loss": 0.1252654790878296, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.464359283447266, "rewards/margins": 21.796646118164062, "rewards/rejected": 2.666323661804199, "step": 2356 }, { "epoch": 1.2199792960662525, "grad_norm": 0.9599661827087402, "learning_rate": 6.907485220168998e-06, "loss": 0.12221761047840118, "rewards/accuracies": 0.96875, "rewards/chosen": 25.510549545288086, "rewards/margins": 22.67730712890625, "rewards/rejected": 2.832000732421875, "step": 2357 }, { "epoch": 1.220496894409938, "grad_norm": 1.5362318754196167, "learning_rate": 6.904847791955998e-06, "loss": 0.17303389310836792, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.32281494140625, "rewards/margins": 18.326385498046875, "rewards/rejected": 1.9990005493164062, "step": 2358 }, { "epoch": 1.2210144927536233, "grad_norm": 0.6512998342514038, "learning_rate": 6.902209743606066e-06, "loss": 0.07101944833993912, "rewards/accuracies": 0.9765625, "rewards/chosen": 24.513076782226562, "rewards/margins": 21.93719482421875, "rewards/rejected": 2.5797386169433594, "step": 2359 }, { "epoch": 1.2215320910973084, "grad_norm": 1.1269407272338867, "learning_rate": 6.8995710759780336e-06, "loss": 0.20017504692077637, "rewards/accuracies": 0.890625, "rewards/chosen": 23.37961196899414, "rewards/margins": 20.40152359008789, "rewards/rejected": 2.9674720764160156, "step": 2360 }, { "epoch": 1.2220496894409938, "grad_norm": 0.7029663324356079, "learning_rate": 6.896931789930938e-06, "loss": 0.10346285998821259, "rewards/accuracies": 0.96875, "rewards/chosen": 22.804306030273438, "rewards/margins": 20.277740478515625, "rewards/rejected": 2.5265655517578125, "step": 2361 }, { "epoch": 1.2225672877846792, "grad_norm": 1.042327880859375, "learning_rate": 6.894291886324021e-06, "loss": 0.1617206633090973, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.20671844482422, "rewards/margins": 21.39740753173828, "rewards/rejected": 2.8140838146209717, "step": 2362 }, { "epoch": 1.2230848861283643, "grad_norm": 1.5340620279312134, "learning_rate": 6.891651366016719e-06, "loss": 0.14260685443878174, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.383085250854492, "rewards/margins": 22.33001708984375, "rewards/rejected": 2.0529327392578125, "step": 2363 }, { "epoch": 1.2236024844720497, "grad_norm": 1.2349110841751099, "learning_rate": 6.889010229868676e-06, "loss": 0.1860717236995697, "rewards/accuracies": 0.890625, "rewards/chosen": 21.717391967773438, "rewards/margins": 20.0699462890625, "rewards/rejected": 1.6446266174316406, "step": 2364 }, { "epoch": 1.224120082815735, "grad_norm": 1.0898669958114624, "learning_rate": 6.8863684787397275e-06, "loss": 0.13979658484458923, "rewards/accuracies": 0.96875, "rewards/chosen": 25.91015625, "rewards/margins": 23.10113525390625, "rewards/rejected": 2.800029754638672, "step": 2365 }, { "epoch": 1.2246376811594204, "grad_norm": 3.6798810958862305, "learning_rate": 6.8837261134899205e-06, "loss": 0.1771022528409958, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.874317169189453, "rewards/margins": 21.846710205078125, "rewards/rejected": 3.033125877380371, "step": 2366 }, { "epoch": 1.2251552795031055, "grad_norm": 1.1632471084594727, "learning_rate": 6.881083134979491e-06, "loss": 0.14029401540756226, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.303585052490234, "rewards/margins": 20.4161376953125, "rewards/rejected": 1.885019302368164, "step": 2367 }, { "epoch": 1.225672877846791, "grad_norm": 1.0407488346099854, "learning_rate": 6.8784395440688845e-06, "loss": 0.13728976249694824, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.642860412597656, "rewards/margins": 23.43695068359375, "rewards/rejected": 2.205010414123535, "step": 2368 }, { "epoch": 1.2261904761904763, "grad_norm": 0.9944614768028259, "learning_rate": 6.8757953416187375e-06, "loss": 0.16356390714645386, "rewards/accuracies": 0.9375, "rewards/chosen": 23.464550018310547, "rewards/margins": 20.966129302978516, "rewards/rejected": 2.5045719146728516, "step": 2369 }, { "epoch": 1.2267080745341614, "grad_norm": 0.8578882813453674, "learning_rate": 6.873150528489891e-06, "loss": 0.08902566134929657, "rewards/accuracies": 0.96875, "rewards/chosen": 28.937240600585938, "rewards/margins": 26.040191650390625, "rewards/rejected": 2.9026756286621094, "step": 2370 }, { "epoch": 1.2272256728778468, "grad_norm": 1.1268894672393799, "learning_rate": 6.870505105543382e-06, "loss": 0.14546063542366028, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.189247131347656, "rewards/margins": 25.173492431640625, "rewards/rejected": 3.013988494873047, "step": 2371 }, { "epoch": 1.2277432712215322, "grad_norm": 2.4966237545013428, "learning_rate": 6.8678590736404475e-06, "loss": 0.11897643655538559, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.214675903320312, "rewards/margins": 24.954383850097656, "rewards/rejected": 2.263073444366455, "step": 2372 }, { "epoch": 1.2282608695652173, "grad_norm": 0.8434224128723145, "learning_rate": 6.865212433642523e-06, "loss": 0.09745865315198898, "rewards/accuracies": 0.9375, "rewards/chosen": 26.862533569335938, "rewards/margins": 24.68328857421875, "rewards/rejected": 2.17901611328125, "step": 2373 }, { "epoch": 1.2287784679089027, "grad_norm": 1.1700502634048462, "learning_rate": 6.862565186411238e-06, "loss": 0.14720237255096436, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.256526947021484, "rewards/margins": 24.0791015625, "rewards/rejected": 3.1710681915283203, "step": 2374 }, { "epoch": 1.229296066252588, "grad_norm": 6.089491844177246, "learning_rate": 6.859917332808428e-06, "loss": 0.22502346336841583, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.794326782226562, "rewards/margins": 25.305198669433594, "rewards/rejected": 2.4889144897460938, "step": 2375 }, { "epoch": 1.2298136645962732, "grad_norm": 0.8574178814888, "learning_rate": 6.857268873696116e-06, "loss": 0.09467555582523346, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.349380493164062, "rewards/margins": 25.78826904296875, "rewards/rejected": 2.5634613037109375, "step": 2376 }, { "epoch": 1.2303312629399585, "grad_norm": 1.121924877166748, "learning_rate": 6.85461980993653e-06, "loss": 0.09704913198947906, "rewards/accuracies": 0.953125, "rewards/chosen": 29.408418655395508, "rewards/margins": 26.7982177734375, "rewards/rejected": 2.6161575317382812, "step": 2377 }, { "epoch": 1.230848861283644, "grad_norm": 2.1265556812286377, "learning_rate": 6.851970142392092e-06, "loss": 0.13445179164409637, "rewards/accuracies": 0.9375, "rewards/chosen": 25.647735595703125, "rewards/margins": 23.33112335205078, "rewards/rejected": 2.3168277740478516, "step": 2378 }, { "epoch": 1.2313664596273293, "grad_norm": 0.9781214594841003, "learning_rate": 6.849319871925417e-06, "loss": 0.14947351813316345, "rewards/accuracies": 0.90625, "rewards/chosen": 30.17096519470215, "rewards/margins": 26.762786865234375, "rewards/rejected": 3.417146682739258, "step": 2379 }, { "epoch": 1.2318840579710144, "grad_norm": 2.616466522216797, "learning_rate": 6.846668999399324e-06, "loss": 0.19428125023841858, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.031448364257812, "rewards/margins": 26.214675903320312, "rewards/rejected": 2.8208694458007812, "step": 2380 }, { "epoch": 1.2324016563146998, "grad_norm": 3.443932056427002, "learning_rate": 6.844017525676821e-06, "loss": 0.18416070938110352, "rewards/accuracies": 0.921875, "rewards/chosen": 27.79058837890625, "rewards/margins": 25.228302001953125, "rewards/rejected": 2.5615997314453125, "step": 2381 }, { "epoch": 1.2329192546583851, "grad_norm": 2.5008668899536133, "learning_rate": 6.841365451621114e-06, "loss": 0.21685907244682312, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.17400360107422, "rewards/margins": 28.606048583984375, "rewards/rejected": 3.5684127807617188, "step": 2382 }, { "epoch": 1.2334368530020705, "grad_norm": 1.1677467823028564, "learning_rate": 6.838712778095608e-06, "loss": 0.13339868187904358, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.03877258300781, "rewards/margins": 28.77117919921875, "rewards/rejected": 3.2613229751586914, "step": 2383 }, { "epoch": 1.2339544513457557, "grad_norm": 1.0585670471191406, "learning_rate": 6.836059505963896e-06, "loss": 0.08131210505962372, "rewards/accuracies": 0.9375, "rewards/chosen": 33.77851867675781, "rewards/margins": 31.018829345703125, "rewards/rejected": 2.755659580230713, "step": 2384 }, { "epoch": 1.234472049689441, "grad_norm": 2.574577808380127, "learning_rate": 6.833405636089771e-06, "loss": 0.10468286275863647, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.068817138671875, "rewards/margins": 28.191070556640625, "rewards/rejected": 2.874447822570801, "step": 2385 }, { "epoch": 1.2349896480331264, "grad_norm": 2.5015029907226562, "learning_rate": 6.830751169337224e-06, "loss": 0.15241864323616028, "rewards/accuracies": 0.921875, "rewards/chosen": 26.246788024902344, "rewards/margins": 24.057861328125, "rewards/rejected": 2.190459728240967, "step": 2386 }, { "epoch": 1.2355072463768115, "grad_norm": 2.0403707027435303, "learning_rate": 6.8280961065704275e-06, "loss": 0.15020789206027985, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.19280242919922, "rewards/margins": 27.432525634765625, "rewards/rejected": 2.751575469970703, "step": 2387 }, { "epoch": 1.236024844720497, "grad_norm": 1.411555290222168, "learning_rate": 6.825440448653764e-06, "loss": 0.17431750893592834, "rewards/accuracies": 0.953125, "rewards/chosen": 29.158660888671875, "rewards/margins": 26.026611328125, "rewards/rejected": 3.131877899169922, "step": 2388 }, { "epoch": 1.2365424430641823, "grad_norm": 0.7840347290039062, "learning_rate": 6.822784196451797e-06, "loss": 0.06474867463111877, "rewards/accuracies": 0.96875, "rewards/chosen": 35.66514587402344, "rewards/margins": 31.108551025390625, "rewards/rejected": 4.551823616027832, "step": 2389 }, { "epoch": 1.2370600414078674, "grad_norm": 1.2528990507125854, "learning_rate": 6.820127350829293e-06, "loss": 0.18976253271102905, "rewards/accuracies": 0.90625, "rewards/chosen": 28.71746826171875, "rewards/margins": 26.562545776367188, "rewards/rejected": 2.161489248275757, "step": 2390 }, { "epoch": 1.2375776397515528, "grad_norm": 1.3719795942306519, "learning_rate": 6.817469912651202e-06, "loss": 0.105112224817276, "rewards/accuracies": 0.96875, "rewards/chosen": 34.93735885620117, "rewards/margins": 32.04888916015625, "rewards/rejected": 2.8888726234436035, "step": 2391 }, { "epoch": 1.2380952380952381, "grad_norm": 2.195138692855835, "learning_rate": 6.814811882782677e-06, "loss": 0.28012320399284363, "rewards/accuracies": 0.8671875, "rewards/chosen": 30.364471435546875, "rewards/margins": 26.70074462890625, "rewards/rejected": 3.656553268432617, "step": 2392 }, { "epoch": 1.2386128364389233, "grad_norm": 1.380599021911621, "learning_rate": 6.812153262089055e-06, "loss": 0.11648862063884735, "rewards/accuracies": 0.9609375, "rewards/chosen": 36.82779312133789, "rewards/margins": 32.08526611328125, "rewards/rejected": 4.738275051116943, "step": 2393 }, { "epoch": 1.2391304347826086, "grad_norm": 1.0776368379592896, "learning_rate": 6.809494051435871e-06, "loss": 0.13166852295398712, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.75337219238281, "rewards/margins": 30.50421142578125, "rewards/rejected": 3.252714157104492, "step": 2394 }, { "epoch": 1.239648033126294, "grad_norm": 2.4525222778320312, "learning_rate": 6.806834251688851e-06, "loss": 0.14813491702079773, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.48845291137695, "rewards/margins": 30.2696533203125, "rewards/rejected": 4.211696624755859, "step": 2395 }, { "epoch": 1.2401656314699794, "grad_norm": 1.260721206665039, "learning_rate": 6.80417386371391e-06, "loss": 0.17927923798561096, "rewards/accuracies": 0.9375, "rewards/chosen": 32.97215270996094, "rewards/margins": 28.699432373046875, "rewards/rejected": 4.275493621826172, "step": 2396 }, { "epoch": 1.2406832298136645, "grad_norm": 0.8557391166687012, "learning_rate": 6.801512888377158e-06, "loss": 0.15061646699905396, "rewards/accuracies": 0.953125, "rewards/chosen": 30.354999542236328, "rewards/margins": 25.80133056640625, "rewards/rejected": 4.55633544921875, "step": 2397 }, { "epoch": 1.24120082815735, "grad_norm": 0.9149178862571716, "learning_rate": 6.798851326544894e-06, "loss": 0.16859248280525208, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.367801666259766, "rewards/margins": 27.937225341796875, "rewards/rejected": 3.4223551750183105, "step": 2398 }, { "epoch": 1.2417184265010353, "grad_norm": 1.0822827816009521, "learning_rate": 6.796189179083609e-06, "loss": 0.14737984538078308, "rewards/accuracies": 0.921875, "rewards/chosen": 30.102340698242188, "rewards/margins": 26.277359008789062, "rewards/rejected": 3.8288044929504395, "step": 2399 }, { "epoch": 1.2422360248447206, "grad_norm": 0.8995713591575623, "learning_rate": 6.793526446859984e-06, "loss": 0.171156108379364, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.65704917907715, "rewards/margins": 26.033187866210938, "rewards/rejected": 3.621744155883789, "step": 2400 }, { "epoch": 1.2427536231884058, "grad_norm": 0.681989312171936, "learning_rate": 6.7908631307408925e-06, "loss": 0.1047963947057724, "rewards/accuracies": 0.953125, "rewards/chosen": 36.31891632080078, "rewards/margins": 31.562843322753906, "rewards/rejected": 4.752552032470703, "step": 2401 }, { "epoch": 1.2432712215320911, "grad_norm": 0.8193171620368958, "learning_rate": 6.7881992315933906e-06, "loss": 0.12606672942638397, "rewards/accuracies": 0.9375, "rewards/chosen": 31.873947143554688, "rewards/margins": 27.558624267578125, "rewards/rejected": 4.313348770141602, "step": 2402 }, { "epoch": 1.2437888198757765, "grad_norm": 0.6665899753570557, "learning_rate": 6.785534750284738e-06, "loss": 0.10411638021469116, "rewards/accuracies": 0.9765625, "rewards/chosen": 34.97376251220703, "rewards/margins": 30.046783447265625, "rewards/rejected": 4.9343719482421875, "step": 2403 }, { "epoch": 1.2443064182194616, "grad_norm": 0.5783935189247131, "learning_rate": 6.7828696876823695e-06, "loss": 0.09270443022251129, "rewards/accuracies": 0.96875, "rewards/chosen": 29.81634521484375, "rewards/margins": 25.79914093017578, "rewards/rejected": 4.015964031219482, "step": 2404 }, { "epoch": 1.244824016563147, "grad_norm": 1.2543150186538696, "learning_rate": 6.7802040446539185e-06, "loss": 0.2279263734817505, "rewards/accuracies": 0.90625, "rewards/chosen": 30.94780731201172, "rewards/margins": 25.910293579101562, "rewards/rejected": 5.035327911376953, "step": 2405 }, { "epoch": 1.2453416149068324, "grad_norm": 2.1645660400390625, "learning_rate": 6.777537822067203e-06, "loss": 0.22515423595905304, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.31849670410156, "rewards/margins": 27.97441864013672, "rewards/rejected": 4.343132019042969, "step": 2406 }, { "epoch": 1.2458592132505175, "grad_norm": 1.1292763948440552, "learning_rate": 6.774871020790231e-06, "loss": 0.13488614559173584, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.64128112792969, "rewards/margins": 28.23583984375, "rewards/rejected": 5.3995256423950195, "step": 2407 }, { "epoch": 1.2463768115942029, "grad_norm": 1.2759956121444702, "learning_rate": 6.772203641691201e-06, "loss": 0.1433539241552353, "rewards/accuracies": 0.921875, "rewards/chosen": 37.9867057800293, "rewards/margins": 31.17694091796875, "rewards/rejected": 6.803497314453125, "step": 2408 }, { "epoch": 1.2468944099378882, "grad_norm": 1.1117256879806519, "learning_rate": 6.769535685638494e-06, "loss": 0.08570321649312973, "rewards/accuracies": 0.96875, "rewards/chosen": 39.23114776611328, "rewards/margins": 33.1007080078125, "rewards/rejected": 6.131294250488281, "step": 2409 }, { "epoch": 1.2474120082815734, "grad_norm": 1.0282419919967651, "learning_rate": 6.766867153500685e-06, "loss": 0.15249094367027283, "rewards/accuracies": 0.875, "rewards/chosen": 36.39409637451172, "rewards/margins": 30.965194702148438, "rewards/rejected": 5.420087814331055, "step": 2410 }, { "epoch": 1.2479296066252588, "grad_norm": 0.8362100124359131, "learning_rate": 6.764198046146533e-06, "loss": 0.08070170879364014, "rewards/accuracies": 0.96875, "rewards/chosen": 33.8245849609375, "rewards/margins": 28.545822143554688, "rewards/rejected": 5.2792510986328125, "step": 2411 }, { "epoch": 1.2484472049689441, "grad_norm": 0.9189696907997131, "learning_rate": 6.761528364444985e-06, "loss": 0.12162305414676666, "rewards/accuracies": 0.96875, "rewards/chosen": 32.45647048950195, "rewards/margins": 28.73089599609375, "rewards/rejected": 3.7254371643066406, "step": 2412 }, { "epoch": 1.2489648033126295, "grad_norm": 1.0228976011276245, "learning_rate": 6.758858109265175e-06, "loss": 0.06661218404769897, "rewards/accuracies": 0.96875, "rewards/chosen": 44.9222412109375, "rewards/margins": 37.98321533203125, "rewards/rejected": 6.938682556152344, "step": 2413 }, { "epoch": 1.2494824016563146, "grad_norm": 1.374024748802185, "learning_rate": 6.756187281476425e-06, "loss": 0.15527114272117615, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.27360534667969, "rewards/margins": 32.06095886230469, "rewards/rejected": 5.21075439453125, "step": 2414 }, { "epoch": 1.25, "grad_norm": 0.6894026398658752, "learning_rate": 6.753515881948241e-06, "loss": 0.07028093934059143, "rewards/accuracies": 0.9765625, "rewards/chosen": 36.627845764160156, "rewards/margins": 32.15313720703125, "rewards/rejected": 4.485586166381836, "step": 2415 }, { "epoch": 1.2505175983436854, "grad_norm": 1.0335512161254883, "learning_rate": 6.750843911550317e-06, "loss": 0.12916980683803558, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.185646057128906, "rewards/margins": 28.796058654785156, "rewards/rejected": 4.392345905303955, "step": 2416 }, { "epoch": 1.2510351966873707, "grad_norm": 1.0950767993927002, "learning_rate": 6.74817137115253e-06, "loss": 0.1358051598072052, "rewards/accuracies": 0.953125, "rewards/chosen": 37.696258544921875, "rewards/margins": 33.317466735839844, "rewards/rejected": 4.366663455963135, "step": 2417 }, { "epoch": 1.2515527950310559, "grad_norm": 1.5810546875, "learning_rate": 6.74549826162495e-06, "loss": 0.17875097692012787, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.99410629272461, "rewards/margins": 32.614410400390625, "rewards/rejected": 5.380962371826172, "step": 2418 }, { "epoch": 1.2520703933747412, "grad_norm": 3.217998743057251, "learning_rate": 6.7428245838378214e-06, "loss": 0.27023249864578247, "rewards/accuracies": 0.890625, "rewards/chosen": 40.260887145996094, "rewards/margins": 34.07697296142578, "rewards/rejected": 6.185075759887695, "step": 2419 }, { "epoch": 1.2525879917184266, "grad_norm": 2.489668846130371, "learning_rate": 6.740150338661583e-06, "loss": 0.20254555344581604, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.04387283325195, "rewards/margins": 30.1480712890625, "rewards/rejected": 4.894405364990234, "step": 2420 }, { "epoch": 1.2531055900621118, "grad_norm": 2.4000730514526367, "learning_rate": 6.737475526966852e-06, "loss": 0.14468413591384888, "rewards/accuracies": 0.9609375, "rewards/chosen": 36.718109130859375, "rewards/margins": 30.768295288085938, "rewards/rejected": 5.94927978515625, "step": 2421 }, { "epoch": 1.2536231884057971, "grad_norm": 1.2234987020492554, "learning_rate": 6.734800149624437e-06, "loss": 0.10347853600978851, "rewards/accuracies": 0.9609375, "rewards/chosen": 34.68513488769531, "rewards/margins": 30.035919189453125, "rewards/rejected": 4.650979042053223, "step": 2422 }, { "epoch": 1.2541407867494825, "grad_norm": 1.8634248971939087, "learning_rate": 6.732124207505319e-06, "loss": 0.15509331226348877, "rewards/accuracies": 0.921875, "rewards/chosen": 36.953460693359375, "rewards/margins": 31.591827392578125, "rewards/rejected": 5.365531921386719, "step": 2423 }, { "epoch": 1.2546583850931676, "grad_norm": 0.999097466468811, "learning_rate": 6.729447701480678e-06, "loss": 0.0722661092877388, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.74864196777344, "rewards/margins": 30.340606689453125, "rewards/rejected": 5.41028356552124, "step": 2424 }, { "epoch": 1.255175983436853, "grad_norm": 1.1016381978988647, "learning_rate": 6.7267706324218655e-06, "loss": 0.13607197999954224, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.837547302246094, "rewards/margins": 24.27490234375, "rewards/rejected": 2.5672261714935303, "step": 2425 }, { "epoch": 1.2556935817805384, "grad_norm": 2.325213670730591, "learning_rate": 6.724093001200422e-06, "loss": 0.1333838552236557, "rewards/accuracies": 0.9375, "rewards/chosen": 31.15994644165039, "rewards/margins": 27.374053955078125, "rewards/rejected": 3.7818336486816406, "step": 2426 }, { "epoch": 1.2562111801242235, "grad_norm": 1.413000464439392, "learning_rate": 6.7214148086880695e-06, "loss": 0.17314854264259338, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.287841796875, "rewards/margins": 24.902679443359375, "rewards/rejected": 3.3798828125, "step": 2427 }, { "epoch": 1.2567287784679089, "grad_norm": 1.1856902837753296, "learning_rate": 6.71873605575671e-06, "loss": 0.14287200570106506, "rewards/accuracies": 0.921875, "rewards/chosen": 28.972301483154297, "rewards/margins": 24.977638244628906, "rewards/rejected": 4.003668308258057, "step": 2428 }, { "epoch": 1.2572463768115942, "grad_norm": 1.149444341659546, "learning_rate": 6.716056743278438e-06, "loss": 0.121659055352211, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.380409240722656, "rewards/margins": 25.858932495117188, "rewards/rejected": 3.5256266593933105, "step": 2429 }, { "epoch": 1.2577639751552794, "grad_norm": 1.244743824005127, "learning_rate": 6.713376872125514e-06, "loss": 0.10022406280040741, "rewards/accuracies": 0.953125, "rewards/chosen": 27.432682037353516, "rewards/margins": 24.91106414794922, "rewards/rejected": 2.521463394165039, "step": 2430 }, { "epoch": 1.2582815734989647, "grad_norm": 1.2573723793029785, "learning_rate": 6.710696443170397e-06, "loss": 0.18334266543388367, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.198848724365234, "rewards/margins": 21.695877075195312, "rewards/rejected": 2.506134033203125, "step": 2431 }, { "epoch": 1.25879917184265, "grad_norm": 0.7957891225814819, "learning_rate": 6.708015457285716e-06, "loss": 0.142514169216156, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.57373046875, "rewards/margins": 20.997146606445312, "rewards/rejected": 2.571807861328125, "step": 2432 }, { "epoch": 1.2593167701863355, "grad_norm": 0.7054295539855957, "learning_rate": 6.705333915344285e-06, "loss": 0.08954663574695587, "rewards/accuracies": 0.9765625, "rewards/chosen": 26.410594940185547, "rewards/margins": 23.4356689453125, "rewards/rejected": 2.9747352600097656, "step": 2433 }, { "epoch": 1.2598343685300206, "grad_norm": 3.3527965545654297, "learning_rate": 6.702651818219102e-06, "loss": 0.17510810494422913, "rewards/accuracies": 0.90625, "rewards/chosen": 25.694080352783203, "rewards/margins": 22.63016128540039, "rewards/rejected": 3.0615234375, "step": 2434 }, { "epoch": 1.260351966873706, "grad_norm": 1.0244452953338623, "learning_rate": 6.69996916678334e-06, "loss": 0.11877720057964325, "rewards/accuracies": 0.953125, "rewards/chosen": 26.238014221191406, "rewards/margins": 23.363210678100586, "rewards/rejected": 2.86903977394104, "step": 2435 }, { "epoch": 1.2608695652173914, "grad_norm": 1.159574270248413, "learning_rate": 6.6972859619103584e-06, "loss": 0.14952927827835083, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.843379974365234, "rewards/margins": 22.788314819335938, "rewards/rejected": 3.0606346130371094, "step": 2436 }, { "epoch": 1.2613871635610767, "grad_norm": 3.8379175662994385, "learning_rate": 6.6946022044736915e-06, "loss": 0.1475667506456375, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.213932037353516, "rewards/margins": 21.52306365966797, "rewards/rejected": 2.6931076049804688, "step": 2437 }, { "epoch": 1.2619047619047619, "grad_norm": 1.0027230978012085, "learning_rate": 6.691917895347058e-06, "loss": 0.08561447262763977, "rewards/accuracies": 0.96875, "rewards/chosen": 26.581207275390625, "rewards/margins": 23.333267211914062, "rewards/rejected": 3.2487502098083496, "step": 2438 }, { "epoch": 1.2624223602484472, "grad_norm": 0.8418440222740173, "learning_rate": 6.6892330354043514e-06, "loss": 0.1295967698097229, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.420326232910156, "rewards/margins": 19.926239013671875, "rewards/rejected": 2.4896812438964844, "step": 2439 }, { "epoch": 1.2629399585921326, "grad_norm": 4.229497909545898, "learning_rate": 6.686547625519649e-06, "loss": 0.1374962031841278, "rewards/accuracies": 0.96875, "rewards/chosen": 23.866943359375, "rewards/margins": 21.102691650390625, "rewards/rejected": 2.7678399085998535, "step": 2440 }, { "epoch": 1.2634575569358177, "grad_norm": 1.1594429016113281, "learning_rate": 6.6838616665672064e-06, "loss": 0.1280348300933838, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.590246200561523, "rewards/margins": 21.623626708984375, "rewards/rejected": 2.9601898193359375, "step": 2441 }, { "epoch": 1.263975155279503, "grad_norm": 0.9629679918289185, "learning_rate": 6.6811751594214555e-06, "loss": 0.13369864225387573, "rewards/accuracies": 0.9453125, "rewards/chosen": 19.834428787231445, "rewards/margins": 18.3402099609375, "rewards/rejected": 1.498077392578125, "step": 2442 }, { "epoch": 1.2644927536231885, "grad_norm": 3.756103515625, "learning_rate": 6.678488104957007e-06, "loss": 0.14177563786506653, "rewards/accuracies": 0.953125, "rewards/chosen": 21.974544525146484, "rewards/margins": 19.764007568359375, "rewards/rejected": 2.2093772888183594, "step": 2443 }, { "epoch": 1.2650103519668736, "grad_norm": 1.124408483505249, "learning_rate": 6.675800504048654e-06, "loss": 0.1335674375295639, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.75885009765625, "rewards/margins": 18.737274169921875, "rewards/rejected": 2.0219955444335938, "step": 2444 }, { "epoch": 1.265527950310559, "grad_norm": 2.3642311096191406, "learning_rate": 6.6731123575713605e-06, "loss": 0.22603823244571686, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.75354766845703, "rewards/margins": 18.714553833007812, "rewards/rejected": 2.0414352416992188, "step": 2445 }, { "epoch": 1.2660455486542443, "grad_norm": 1.077966332435608, "learning_rate": 6.670423666400275e-06, "loss": 0.09160935133695602, "rewards/accuracies": 0.9765625, "rewards/chosen": 20.081361770629883, "rewards/margins": 18.49755859375, "rewards/rejected": 1.5832977294921875, "step": 2446 }, { "epoch": 1.2665631469979295, "grad_norm": 2.0050339698791504, "learning_rate": 6.667734431410719e-06, "loss": 0.15802377462387085, "rewards/accuracies": 0.953125, "rewards/chosen": 21.402000427246094, "rewards/margins": 19.441429138183594, "rewards/rejected": 1.9594502449035645, "step": 2447 }, { "epoch": 1.2670807453416149, "grad_norm": 1.1725772619247437, "learning_rate": 6.665044653478193e-06, "loss": 0.07931533455848694, "rewards/accuracies": 0.96875, "rewards/chosen": 24.551666259765625, "rewards/margins": 22.767242431640625, "rewards/rejected": 1.7847137451171875, "step": 2448 }, { "epoch": 1.2675983436853002, "grad_norm": 1.5318914651870728, "learning_rate": 6.662354333478372e-06, "loss": 0.09633442759513855, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.22085952758789, "rewards/margins": 21.065017700195312, "rewards/rejected": 2.1518802642822266, "step": 2449 }, { "epoch": 1.2681159420289856, "grad_norm": 1.6332863569259644, "learning_rate": 6.659663472287112e-06, "loss": 0.13048915565013885, "rewards/accuracies": 0.9453125, "rewards/chosen": 19.330615997314453, "rewards/margins": 17.638473510742188, "rewards/rejected": 1.6976337432861328, "step": 2450 }, { "epoch": 1.2686335403726707, "grad_norm": 1.9200636148452759, "learning_rate": 6.656972070780439e-06, "loss": 0.13839277625083923, "rewards/accuracies": 0.921875, "rewards/chosen": 22.53038787841797, "rewards/margins": 20.569869995117188, "rewards/rejected": 1.9639711380004883, "step": 2451 }, { "epoch": 1.269151138716356, "grad_norm": 1.4804853200912476, "learning_rate": 6.654280129834561e-06, "loss": 0.11847871541976929, "rewards/accuracies": 0.9609375, "rewards/chosen": 20.342735290527344, "rewards/margins": 18.454269409179688, "rewards/rejected": 1.888906478881836, "step": 2452 }, { "epoch": 1.2696687370600415, "grad_norm": 1.094567894935608, "learning_rate": 6.651587650325856e-06, "loss": 0.08915618062019348, "rewards/accuracies": 0.96875, "rewards/chosen": 20.946517944335938, "rewards/margins": 19.42242431640625, "rewards/rejected": 1.5219993591308594, "step": 2453 }, { "epoch": 1.2701863354037268, "grad_norm": 1.3887706995010376, "learning_rate": 6.648894633130882e-06, "loss": 0.104413703083992, "rewards/accuracies": 0.953125, "rewards/chosen": 22.365386962890625, "rewards/margins": 20.114486694335938, "rewards/rejected": 2.258975028991699, "step": 2454 }, { "epoch": 1.270703933747412, "grad_norm": 1.82375168800354, "learning_rate": 6.646201079126371e-06, "loss": 0.17635032534599304, "rewards/accuracies": 0.921875, "rewards/chosen": 20.274742126464844, "rewards/margins": 18.207595825195312, "rewards/rejected": 2.0650224685668945, "step": 2455 }, { "epoch": 1.2712215320910973, "grad_norm": 1.3277109861373901, "learning_rate": 6.643506989189225e-06, "loss": 0.07739608734846115, "rewards/accuracies": 0.9765625, "rewards/chosen": 22.030607223510742, "rewards/margins": 20.01806640625, "rewards/rejected": 2.0141501426696777, "step": 2456 }, { "epoch": 1.2717391304347827, "grad_norm": 1.4447859525680542, "learning_rate": 6.640812364196531e-06, "loss": 0.09225621819496155, "rewards/accuracies": 0.96875, "rewards/chosen": 21.93313217163086, "rewards/margins": 19.954498291015625, "rewards/rejected": 1.9750633239746094, "step": 2457 }, { "epoch": 1.2722567287784678, "grad_norm": 1.601639747619629, "learning_rate": 6.638117205025536e-06, "loss": 0.17325210571289062, "rewards/accuracies": 0.921875, "rewards/chosen": 23.351016998291016, "rewards/margins": 21.22235107421875, "rewards/rejected": 2.13287353515625, "step": 2458 }, { "epoch": 1.2727743271221532, "grad_norm": 1.7906125783920288, "learning_rate": 6.635421512553675e-06, "loss": 0.19668865203857422, "rewards/accuracies": 0.9375, "rewards/chosen": 19.38079833984375, "rewards/margins": 17.745697021484375, "rewards/rejected": 1.639145851135254, "step": 2459 }, { "epoch": 1.2732919254658386, "grad_norm": 1.0115444660186768, "learning_rate": 6.632725287658546e-06, "loss": 0.11232055723667145, "rewards/accuracies": 0.953125, "rewards/chosen": 21.837383270263672, "rewards/margins": 19.51300048828125, "rewards/rejected": 2.323305130004883, "step": 2460 }, { "epoch": 1.2738095238095237, "grad_norm": 0.9390784502029419, "learning_rate": 6.630028531217926e-06, "loss": 0.09235556423664093, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.11517333984375, "rewards/margins": 20.153778076171875, "rewards/rejected": 0.9651679992675781, "step": 2461 }, { "epoch": 1.274327122153209, "grad_norm": 1.0160564184188843, "learning_rate": 6.627331244109762e-06, "loss": 0.0887305736541748, "rewards/accuracies": 0.9765625, "rewards/chosen": 19.57087516784668, "rewards/margins": 19.018098831176758, "rewards/rejected": 0.5461006164550781, "step": 2462 }, { "epoch": 1.2748447204968945, "grad_norm": 2.1091318130493164, "learning_rate": 6.624633427212178e-06, "loss": 0.14170292019844055, "rewards/accuracies": 0.9375, "rewards/chosen": 21.688993453979492, "rewards/margins": 19.30853271484375, "rewards/rejected": 2.3736801147460938, "step": 2463 }, { "epoch": 1.2753623188405796, "grad_norm": 0.8035165667533875, "learning_rate": 6.621935081403465e-06, "loss": 0.08628322184085846, "rewards/accuracies": 0.953125, "rewards/chosen": 23.757280349731445, "rewards/margins": 21.63677215576172, "rewards/rejected": 2.123647689819336, "step": 2464 }, { "epoch": 1.275879917184265, "grad_norm": 2.1178951263427734, "learning_rate": 6.619236207562092e-06, "loss": 0.1534302681684494, "rewards/accuracies": 0.921875, "rewards/chosen": 20.50442123413086, "rewards/margins": 18.864166259765625, "rewards/rejected": 1.6354866027832031, "step": 2465 }, { "epoch": 1.2763975155279503, "grad_norm": 5.001464366912842, "learning_rate": 6.616536806566694e-06, "loss": 0.22785122692584991, "rewards/accuracies": 0.875, "rewards/chosen": 17.91246795654297, "rewards/margins": 16.919891357421875, "rewards/rejected": 0.9883956909179688, "step": 2466 }, { "epoch": 1.2769151138716355, "grad_norm": 0.9758791923522949, "learning_rate": 6.613836879296082e-06, "loss": 0.10633877664804459, "rewards/accuracies": 0.953125, "rewards/chosen": 21.887956619262695, "rewards/margins": 19.6207275390625, "rewards/rejected": 2.2682838439941406, "step": 2467 }, { "epoch": 1.2774327122153208, "grad_norm": 3.001216173171997, "learning_rate": 6.611136426629237e-06, "loss": 0.20553727447986603, "rewards/accuracies": 0.890625, "rewards/chosen": 20.95177459716797, "rewards/margins": 18.961273193359375, "rewards/rejected": 1.991241455078125, "step": 2468 }, { "epoch": 1.2779503105590062, "grad_norm": 1.554738998413086, "learning_rate": 6.6084354494453075e-06, "loss": 0.16498178243637085, "rewards/accuracies": 0.9375, "rewards/chosen": 19.296550750732422, "rewards/margins": 17.5771484375, "rewards/rejected": 1.7238121032714844, "step": 2469 }, { "epoch": 1.2784679089026916, "grad_norm": 1.2331048250198364, "learning_rate": 6.605733948623621e-06, "loss": 0.09908721596002579, "rewards/accuracies": 0.9609375, "rewards/chosen": 22.58030128479004, "rewards/margins": 20.163925170898438, "rewards/rejected": 2.4221267700195312, "step": 2470 }, { "epoch": 1.278985507246377, "grad_norm": 1.261610984802246, "learning_rate": 6.603031925043668e-06, "loss": 0.14990657567977905, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.493896484375, "rewards/margins": 19.090225219726562, "rewards/rejected": 1.4075489044189453, "step": 2471 }, { "epoch": 1.279503105590062, "grad_norm": 1.141183614730835, "learning_rate": 6.600329379585112e-06, "loss": 0.12183669209480286, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.643096923828125, "rewards/margins": 20.662322998046875, "rewards/rejected": 1.9792490005493164, "step": 2472 }, { "epoch": 1.2800207039337475, "grad_norm": 1.2896201610565186, "learning_rate": 6.597626313127786e-06, "loss": 0.10148658603429794, "rewards/accuracies": 0.953125, "rewards/chosen": 21.906997680664062, "rewards/margins": 19.842071533203125, "rewards/rejected": 2.0652923583984375, "step": 2473 }, { "epoch": 1.2805383022774328, "grad_norm": 1.0782842636108398, "learning_rate": 6.5949227265516935e-06, "loss": 0.13627389073371887, "rewards/accuracies": 0.9296875, "rewards/chosen": 19.10943603515625, "rewards/margins": 17.502700805664062, "rewards/rejected": 1.605621337890625, "step": 2474 }, { "epoch": 1.281055900621118, "grad_norm": 1.136584758758545, "learning_rate": 6.5922186207370054e-06, "loss": 0.1572946161031723, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.97774887084961, "rewards/margins": 18.56586456298828, "rewards/rejected": 2.4078445434570312, "step": 2475 }, { "epoch": 1.2815734989648033, "grad_norm": 0.8058488965034485, "learning_rate": 6.589513996564064e-06, "loss": 0.075717031955719, "rewards/accuracies": 0.9765625, "rewards/chosen": 22.596893310546875, "rewards/margins": 20.34954833984375, "rewards/rejected": 2.2471580505371094, "step": 2476 }, { "epoch": 1.2820910973084887, "grad_norm": 0.8644176125526428, "learning_rate": 6.586808854913379e-06, "loss": 0.10941342264413834, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.43686294555664, "rewards/margins": 21.45355224609375, "rewards/rejected": 2.9864730834960938, "step": 2477 }, { "epoch": 1.2826086956521738, "grad_norm": 1.467763066291809, "learning_rate": 6.5841031966656285e-06, "loss": 0.1611127108335495, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.319766998291016, "rewards/margins": 18.848861694335938, "rewards/rejected": 2.477558135986328, "step": 2478 }, { "epoch": 1.2831262939958592, "grad_norm": 4.354780197143555, "learning_rate": 6.581397022701659e-06, "loss": 0.1901962161064148, "rewards/accuracies": 0.90625, "rewards/chosen": 20.402835845947266, "rewards/margins": 18.100021362304688, "rewards/rejected": 2.3024940490722656, "step": 2479 }, { "epoch": 1.2836438923395446, "grad_norm": 0.6927709579467773, "learning_rate": 6.578690333902485e-06, "loss": 0.0885261669754982, "rewards/accuracies": 0.96875, "rewards/chosen": 23.927146911621094, "rewards/margins": 21.328872680664062, "rewards/rejected": 2.597198486328125, "step": 2480 }, { "epoch": 1.2841614906832297, "grad_norm": 1.5439226627349854, "learning_rate": 6.575983131149288e-06, "loss": 0.11971309036016464, "rewards/accuracies": 0.9375, "rewards/chosen": 21.715499877929688, "rewards/margins": 19.340957641601562, "rewards/rejected": 2.378941535949707, "step": 2481 }, { "epoch": 1.284679089026915, "grad_norm": 1.1020492315292358, "learning_rate": 6.573275415323419e-06, "loss": 0.1423056274652481, "rewards/accuracies": 0.921875, "rewards/chosen": 22.365394592285156, "rewards/margins": 19.694854736328125, "rewards/rejected": 2.674304962158203, "step": 2482 }, { "epoch": 1.2851966873706004, "grad_norm": 1.2998218536376953, "learning_rate": 6.570567187306395e-06, "loss": 0.14978355169296265, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.739627838134766, "rewards/margins": 20.943679809570312, "rewards/rejected": 2.799163818359375, "step": 2483 }, { "epoch": 1.2857142857142856, "grad_norm": 1.3219935894012451, "learning_rate": 6.5678584479798934e-06, "loss": 0.1733728051185608, "rewards/accuracies": 0.9140625, "rewards/chosen": 17.65532684326172, "rewards/margins": 15.90911865234375, "rewards/rejected": 1.7433509826660156, "step": 2484 }, { "epoch": 1.286231884057971, "grad_norm": 0.8610063791275024, "learning_rate": 6.565149198225772e-06, "loss": 0.11635377258062363, "rewards/accuracies": 0.9609375, "rewards/chosen": 19.02996063232422, "rewards/margins": 17.391006469726562, "rewards/rejected": 1.6427650451660156, "step": 2485 }, { "epoch": 1.2867494824016563, "grad_norm": 1.0792893171310425, "learning_rate": 6.562439438926039e-06, "loss": 0.13190233707427979, "rewards/accuracies": 0.9375, "rewards/chosen": 20.352306365966797, "rewards/margins": 18.078887939453125, "rewards/rejected": 2.2703800201416016, "step": 2486 }, { "epoch": 1.2872670807453417, "grad_norm": 1.424455165863037, "learning_rate": 6.559729170962883e-06, "loss": 0.11969514191150665, "rewards/accuracies": 0.9375, "rewards/chosen": 18.687976837158203, "rewards/margins": 16.83172607421875, "rewards/rejected": 1.8581666946411133, "step": 2487 }, { "epoch": 1.287784679089027, "grad_norm": 1.0037769079208374, "learning_rate": 6.557018395218646e-06, "loss": 0.11750036478042603, "rewards/accuracies": 0.9609375, "rewards/chosen": 21.487058639526367, "rewards/margins": 19.231231689453125, "rewards/rejected": 2.2543869018554688, "step": 2488 }, { "epoch": 1.2883022774327122, "grad_norm": 0.8065380454063416, "learning_rate": 6.554307112575844e-06, "loss": 0.12279581278562546, "rewards/accuracies": 0.9375, "rewards/chosen": 20.254058837890625, "rewards/margins": 18.02325439453125, "rewards/rejected": 2.2243661880493164, "step": 2489 }, { "epoch": 1.2888198757763976, "grad_norm": 1.1855705976486206, "learning_rate": 6.551595323917151e-06, "loss": 0.1638323813676834, "rewards/accuracies": 0.9609375, "rewards/chosen": 22.412555694580078, "rewards/margins": 19.513412475585938, "rewards/rejected": 2.9040870666503906, "step": 2490 }, { "epoch": 1.289337474120083, "grad_norm": 1.0883510112762451, "learning_rate": 6.548883030125414e-06, "loss": 0.11493222415447235, "rewards/accuracies": 0.96875, "rewards/chosen": 18.462814331054688, "rewards/margins": 16.5455322265625, "rewards/rejected": 1.9215259552001953, "step": 2491 }, { "epoch": 1.289855072463768, "grad_norm": 4.960489749908447, "learning_rate": 6.546170232083635e-06, "loss": 0.12267979234457016, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.23432159423828, "rewards/margins": 20.963653564453125, "rewards/rejected": 2.271333694458008, "step": 2492 }, { "epoch": 1.2903726708074534, "grad_norm": 1.1978418827056885, "learning_rate": 6.543456930674988e-06, "loss": 0.12923239171504974, "rewards/accuracies": 0.9375, "rewards/chosen": 23.054168701171875, "rewards/margins": 20.75006103515625, "rewards/rejected": 2.3105831146240234, "step": 2493 }, { "epoch": 1.2908902691511388, "grad_norm": 1.1962037086486816, "learning_rate": 6.540743126782808e-06, "loss": 0.12907934188842773, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.019683837890625, "rewards/margins": 17.770309448242188, "rewards/rejected": 2.2536468505859375, "step": 2494 }, { "epoch": 1.291407867494824, "grad_norm": 1.9455658197402954, "learning_rate": 6.538028821290592e-06, "loss": 0.17431500554084778, "rewards/accuracies": 0.921875, "rewards/chosen": 22.294876098632812, "rewards/margins": 19.61688232421875, "rewards/rejected": 2.672731399536133, "step": 2495 }, { "epoch": 1.2919254658385093, "grad_norm": 1.470837950706482, "learning_rate": 6.5353140150820026e-06, "loss": 0.12797999382019043, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.25756072998047, "rewards/margins": 18.18999481201172, "rewards/rejected": 2.058399200439453, "step": 2496 }, { "epoch": 1.2924430641821947, "grad_norm": 1.6685278415679932, "learning_rate": 6.532598709040863e-06, "loss": 0.1157078817486763, "rewards/accuracies": 0.953125, "rewards/chosen": 24.059349060058594, "rewards/margins": 21.05120849609375, "rewards/rejected": 3.0043516159057617, "step": 2497 }, { "epoch": 1.2929606625258798, "grad_norm": 6.863430023193359, "learning_rate": 6.529882904051164e-06, "loss": 0.14654099941253662, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.257545471191406, "rewards/margins": 20.857192993164062, "rewards/rejected": 2.406444549560547, "step": 2498 }, { "epoch": 1.2934782608695652, "grad_norm": 1.1582036018371582, "learning_rate": 6.52716660099705e-06, "loss": 0.1156550794839859, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.289260864257812, "rewards/margins": 22.799453735351562, "rewards/rejected": 3.4847068786621094, "step": 2499 }, { "epoch": 1.2939958592132506, "grad_norm": 0.9017994403839111, "learning_rate": 6.5244498007628395e-06, "loss": 0.11603771150112152, "rewards/accuracies": 0.9375, "rewards/chosen": 20.421035766601562, "rewards/margins": 18.7354736328125, "rewards/rejected": 1.6869621276855469, "step": 2500 }, { "epoch": 1.2945134575569357, "grad_norm": 2.4745686054229736, "learning_rate": 6.521732504233003e-06, "loss": 0.24911284446716309, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.602386474609375, "rewards/margins": 19.516586303710938, "rewards/rejected": 2.0885720252990723, "step": 2501 }, { "epoch": 1.295031055900621, "grad_norm": 0.8520876169204712, "learning_rate": 6.519014712292175e-06, "loss": 0.09822424501180649, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.14147186279297, "rewards/margins": 20.698638916015625, "rewards/rejected": 2.4442138671875, "step": 2502 }, { "epoch": 1.2955486542443064, "grad_norm": 1.224579930305481, "learning_rate": 6.516296425825155e-06, "loss": 0.1658935248851776, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.78813934326172, "rewards/margins": 21.66729736328125, "rewards/rejected": 2.117068290710449, "step": 2503 }, { "epoch": 1.2960662525879918, "grad_norm": 0.7118335366249084, "learning_rate": 6.513577645716897e-06, "loss": 0.08770773559808731, "rewards/accuracies": 0.953125, "rewards/chosen": 24.47848892211914, "rewards/margins": 21.644710540771484, "rewards/rejected": 2.83247971534729, "step": 2504 }, { "epoch": 1.296583850931677, "grad_norm": 0.9422724843025208, "learning_rate": 6.510858372852524e-06, "loss": 0.11150460690259933, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.501174926757812, "rewards/margins": 19.708465576171875, "rewards/rejected": 2.7914390563964844, "step": 2505 }, { "epoch": 1.2971014492753623, "grad_norm": 1.0380052328109741, "learning_rate": 6.508138608117311e-06, "loss": 0.15476202964782715, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.984262466430664, "rewards/margins": 20.685562133789062, "rewards/rejected": 2.3072032928466797, "step": 2506 }, { "epoch": 1.2976190476190477, "grad_norm": 1.772977352142334, "learning_rate": 6.5054183523967e-06, "loss": 0.22238799929618835, "rewards/accuracies": 0.8828125, "rewards/chosen": 19.97395896911621, "rewards/margins": 17.810302734375, "rewards/rejected": 2.1620426177978516, "step": 2507 }, { "epoch": 1.298136645962733, "grad_norm": 2.090819835662842, "learning_rate": 6.5026976065762895e-06, "loss": 0.17761541903018951, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.70624542236328, "rewards/margins": 19.836456298828125, "rewards/rejected": 2.8637237548828125, "step": 2508 }, { "epoch": 1.2986542443064182, "grad_norm": 1.2037546634674072, "learning_rate": 6.499976371541836e-06, "loss": 0.10001415014266968, "rewards/accuracies": 0.9609375, "rewards/chosen": 24.145397186279297, "rewards/margins": 20.876480102539062, "rewards/rejected": 3.2704124450683594, "step": 2509 }, { "epoch": 1.2991718426501035, "grad_norm": 0.774726152420044, "learning_rate": 6.497254648179256e-06, "loss": 0.11542513966560364, "rewards/accuracies": 0.953125, "rewards/chosen": 22.29612159729004, "rewards/margins": 19.799407958984375, "rewards/rejected": 2.5007481575012207, "step": 2510 }, { "epoch": 1.299689440993789, "grad_norm": 1.3222512006759644, "learning_rate": 6.494532437374632e-06, "loss": 0.11556591093540192, "rewards/accuracies": 0.9375, "rewards/chosen": 25.039752960205078, "rewards/margins": 22.052978515625, "rewards/rejected": 2.9883804321289062, "step": 2511 }, { "epoch": 1.300207039337474, "grad_norm": 1.0087110996246338, "learning_rate": 6.491809740014193e-06, "loss": 0.11244121193885803, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.062885284423828, "rewards/margins": 21.11798095703125, "rewards/rejected": 2.9513092041015625, "step": 2512 }, { "epoch": 1.3007246376811594, "grad_norm": 0.9062153697013855, "learning_rate": 6.489086556984337e-06, "loss": 0.13230645656585693, "rewards/accuracies": 0.921875, "rewards/chosen": 24.096996307373047, "rewards/margins": 21.518692016601562, "rewards/rejected": 2.5776915550231934, "step": 2513 }, { "epoch": 1.3012422360248448, "grad_norm": 0.9563425183296204, "learning_rate": 6.486362889171615e-06, "loss": 0.17566365003585815, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.3704833984375, "rewards/margins": 20.577266693115234, "rewards/rejected": 2.7902145385742188, "step": 2514 }, { "epoch": 1.30175983436853, "grad_norm": 2.2383453845977783, "learning_rate": 6.483638737462734e-06, "loss": 0.12603279948234558, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.544729232788086, "rewards/margins": 21.170814514160156, "rewards/rejected": 2.3742518424987793, "step": 2515 }, { "epoch": 1.3022774327122153, "grad_norm": 0.8729609251022339, "learning_rate": 6.480914102744565e-06, "loss": 0.16134250164031982, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.077497482299805, "rewards/margins": 21.29326629638672, "rewards/rejected": 2.7855262756347656, "step": 2516 }, { "epoch": 1.3027950310559007, "grad_norm": 1.1066701412200928, "learning_rate": 6.478188985904129e-06, "loss": 0.1371104121208191, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.14250373840332, "rewards/margins": 20.061737060546875, "rewards/rejected": 2.0809097290039062, "step": 2517 }, { "epoch": 1.3033126293995858, "grad_norm": 1.5836608409881592, "learning_rate": 6.475463387828611e-06, "loss": 0.23088054358959198, "rewards/accuracies": 0.8828125, "rewards/chosen": 19.86968231201172, "rewards/margins": 17.38433837890625, "rewards/rejected": 2.4888973236083984, "step": 2518 }, { "epoch": 1.3038302277432712, "grad_norm": 0.9263399839401245, "learning_rate": 6.4727373094053455e-06, "loss": 0.1213439404964447, "rewards/accuracies": 0.953125, "rewards/chosen": 20.190536499023438, "rewards/margins": 17.955886840820312, "rewards/rejected": 2.232574462890625, "step": 2519 }, { "epoch": 1.3043478260869565, "grad_norm": 1.146087646484375, "learning_rate": 6.47001075152183e-06, "loss": 0.1313396692276001, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.831039428710938, "rewards/margins": 23.59979248046875, "rewards/rejected": 3.2334022521972656, "step": 2520 }, { "epoch": 1.3048654244306417, "grad_norm": 1.4627586603164673, "learning_rate": 6.467283715065713e-06, "loss": 0.19873040914535522, "rewards/accuracies": 0.90625, "rewards/chosen": 21.709705352783203, "rewards/margins": 19.348411560058594, "rewards/rejected": 2.3607635498046875, "step": 2521 }, { "epoch": 1.305383022774327, "grad_norm": 0.8814426064491272, "learning_rate": 6.464556200924803e-06, "loss": 0.12389511615037918, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.240825653076172, "rewards/margins": 20.847732543945312, "rewards/rejected": 2.391742706298828, "step": 2522 }, { "epoch": 1.3059006211180124, "grad_norm": 0.6379590034484863, "learning_rate": 6.46182820998706e-06, "loss": 0.1097094863653183, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.361618041992188, "rewards/margins": 22.810501098632812, "rewards/rejected": 2.5497589111328125, "step": 2523 }, { "epoch": 1.3064182194616978, "grad_norm": 1.6277108192443848, "learning_rate": 6.4590997431406025e-06, "loss": 0.1580144613981247, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.432098388671875, "rewards/margins": 20.160720825195312, "rewards/rejected": 2.2659568786621094, "step": 2524 }, { "epoch": 1.3069358178053831, "grad_norm": 0.8262049555778503, "learning_rate": 6.4563708012737e-06, "loss": 0.10724957287311554, "rewards/accuracies": 0.9375, "rewards/chosen": 25.49683380126953, "rewards/margins": 22.281326293945312, "rewards/rejected": 3.2157087326049805, "step": 2525 }, { "epoch": 1.3074534161490683, "grad_norm": 0.8614230751991272, "learning_rate": 6.453641385274783e-06, "loss": 0.09279502928256989, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.180591583251953, "rewards/margins": 20.456939697265625, "rewards/rejected": 2.7272262573242188, "step": 2526 }, { "epoch": 1.3079710144927537, "grad_norm": 1.5016120672225952, "learning_rate": 6.450911496032428e-06, "loss": 0.11225876957178116, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.353172302246094, "rewards/margins": 22.83551025390625, "rewards/rejected": 3.5158157348632812, "step": 2527 }, { "epoch": 1.308488612836439, "grad_norm": 1.3675262928009033, "learning_rate": 6.4481811344353765e-06, "loss": 0.14757949113845825, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.144245147705078, "rewards/margins": 22.3253173828125, "rewards/rejected": 1.8238372802734375, "step": 2528 }, { "epoch": 1.3090062111801242, "grad_norm": 0.8598138689994812, "learning_rate": 6.445450301372511e-06, "loss": 0.08756306767463684, "rewards/accuracies": 0.9765625, "rewards/chosen": 23.548789978027344, "rewards/margins": 20.906402587890625, "rewards/rejected": 2.643599510192871, "step": 2529 }, { "epoch": 1.3095238095238095, "grad_norm": 1.2985754013061523, "learning_rate": 6.442718997732878e-06, "loss": 0.14046326279640198, "rewards/accuracies": 0.9375, "rewards/chosen": 27.00521469116211, "rewards/margins": 22.786392211914062, "rewards/rejected": 4.212545394897461, "step": 2530 }, { "epoch": 1.310041407867495, "grad_norm": 0.9334222674369812, "learning_rate": 6.439987224405671e-06, "loss": 0.10783910751342773, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.693246841430664, "rewards/margins": 23.015594482421875, "rewards/rejected": 2.6760177612304688, "step": 2531 }, { "epoch": 1.31055900621118, "grad_norm": 2.034335136413574, "learning_rate": 6.437254982280238e-06, "loss": 0.10953807830810547, "rewards/accuracies": 0.953125, "rewards/chosen": 26.563644409179688, "rewards/margins": 23.478851318359375, "rewards/rejected": 3.0887069702148438, "step": 2532 }, { "epoch": 1.3110766045548654, "grad_norm": 0.9674311876296997, "learning_rate": 6.434522272246082e-06, "loss": 0.1501113474369049, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.32427978515625, "rewards/margins": 23.205825805664062, "rewards/rejected": 3.119781494140625, "step": 2533 }, { "epoch": 1.3115942028985508, "grad_norm": 1.7939953804016113, "learning_rate": 6.431789095192855e-06, "loss": 0.11913382261991501, "rewards/accuracies": 0.9375, "rewards/chosen": 27.152999877929688, "rewards/margins": 23.59759521484375, "rewards/rejected": 3.5557193756103516, "step": 2534 }, { "epoch": 1.312111801242236, "grad_norm": 1.625370740890503, "learning_rate": 6.429055452010364e-06, "loss": 0.24181276559829712, "rewards/accuracies": 0.890625, "rewards/chosen": 25.6489315032959, "rewards/margins": 22.01019287109375, "rewards/rejected": 3.636593818664551, "step": 2535 }, { "epoch": 1.3126293995859213, "grad_norm": 1.427219033241272, "learning_rate": 6.426321343588567e-06, "loss": 0.17722287774085999, "rewards/accuracies": 0.90625, "rewards/chosen": 26.420528411865234, "rewards/margins": 23.36419677734375, "rewards/rejected": 3.0539932250976562, "step": 2536 }, { "epoch": 1.3131469979296067, "grad_norm": 1.4374785423278809, "learning_rate": 6.42358677081757e-06, "loss": 0.11834104359149933, "rewards/accuracies": 0.953125, "rewards/chosen": 26.824172973632812, "rewards/margins": 22.580474853515625, "rewards/rejected": 4.244509696960449, "step": 2537 }, { "epoch": 1.3136645962732918, "grad_norm": 1.710304856300354, "learning_rate": 6.420851734587632e-06, "loss": 0.12026439607143402, "rewards/accuracies": 0.953125, "rewards/chosen": 26.025787353515625, "rewards/margins": 21.80255126953125, "rewards/rejected": 4.2219085693359375, "step": 2538 }, { "epoch": 1.3141821946169772, "grad_norm": 1.0387239456176758, "learning_rate": 6.418116235789169e-06, "loss": 0.12747123837471008, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.68694305419922, "rewards/margins": 22.35137939453125, "rewards/rejected": 3.333585500717163, "step": 2539 }, { "epoch": 1.3146997929606625, "grad_norm": 1.288961410522461, "learning_rate": 6.4153802753127366e-06, "loss": 0.12274790555238724, "rewards/accuracies": 0.96875, "rewards/chosen": 23.496482849121094, "rewards/margins": 20.109085083007812, "rewards/rejected": 3.3941116333007812, "step": 2540 }, { "epoch": 1.315217391304348, "grad_norm": 2.0972914695739746, "learning_rate": 6.412643854049051e-06, "loss": 0.16135051846504211, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.8121337890625, "rewards/margins": 19.432796478271484, "rewards/rejected": 4.379495620727539, "step": 2541 }, { "epoch": 1.3157349896480333, "grad_norm": 1.5392612218856812, "learning_rate": 6.409906972888971e-06, "loss": 0.20070141553878784, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.186904907226562, "rewards/margins": 18.915512084960938, "rewards/rejected": 3.2768516540527344, "step": 2542 }, { "epoch": 1.3162525879917184, "grad_norm": 1.7817054986953735, "learning_rate": 6.407169632723509e-06, "loss": 0.16156601905822754, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.99199676513672, "rewards/margins": 18.95526123046875, "rewards/rejected": 4.0324530601501465, "step": 2543 }, { "epoch": 1.3167701863354038, "grad_norm": 1.872663140296936, "learning_rate": 6.404431834443826e-06, "loss": 0.19082555174827576, "rewards/accuracies": 0.8984375, "rewards/chosen": 22.101863861083984, "rewards/margins": 17.54718017578125, "rewards/rejected": 4.558814525604248, "step": 2544 }, { "epoch": 1.3172877846790891, "grad_norm": 0.7907329797744751, "learning_rate": 6.401693578941232e-06, "loss": 0.12941710650920868, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.785846710205078, "rewards/margins": 19.507858276367188, "rewards/rejected": 4.277973175048828, "step": 2545 }, { "epoch": 1.3178053830227743, "grad_norm": 1.344236969947815, "learning_rate": 6.398954867107184e-06, "loss": 0.11258740723133087, "rewards/accuracies": 0.9375, "rewards/chosen": 25.76404571533203, "rewards/margins": 20.978363037109375, "rewards/rejected": 4.786771774291992, "step": 2546 }, { "epoch": 1.3183229813664596, "grad_norm": 2.4896936416625977, "learning_rate": 6.3962156998332925e-06, "loss": 0.23253990709781647, "rewards/accuracies": 0.90625, "rewards/chosen": 22.596763610839844, "rewards/margins": 18.37694549560547, "rewards/rejected": 4.216865062713623, "step": 2547 }, { "epoch": 1.318840579710145, "grad_norm": 1.2378137111663818, "learning_rate": 6.393476078011312e-06, "loss": 0.16158375144004822, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.11798095703125, "rewards/margins": 19.18218994140625, "rewards/rejected": 3.9417877197265625, "step": 2548 }, { "epoch": 1.3193581780538302, "grad_norm": 0.8115515112876892, "learning_rate": 6.390736002533143e-06, "loss": 0.08200277388095856, "rewards/accuracies": 0.984375, "rewards/chosen": 22.870330810546875, "rewards/margins": 19.26214599609375, "rewards/rejected": 3.6157302856445312, "step": 2549 }, { "epoch": 1.3198757763975155, "grad_norm": 0.8580837845802307, "learning_rate": 6.387995474290843e-06, "loss": 0.10318633913993835, "rewards/accuracies": 0.953125, "rewards/chosen": 20.37255859375, "rewards/margins": 16.714523315429688, "rewards/rejected": 3.6585402488708496, "step": 2550 }, { "epoch": 1.3203933747412009, "grad_norm": 2.5559544563293457, "learning_rate": 6.385254494176605e-06, "loss": 0.08457444608211517, "rewards/accuracies": 0.96875, "rewards/chosen": 23.394886016845703, "rewards/margins": 19.529266357421875, "rewards/rejected": 3.862107038497925, "step": 2551 }, { "epoch": 1.320910973084886, "grad_norm": 1.151276707649231, "learning_rate": 6.382513063082778e-06, "loss": 0.14269712567329407, "rewards/accuracies": 0.9375, "rewards/chosen": 21.160675048828125, "rewards/margins": 17.770736694335938, "rewards/rejected": 3.39617919921875, "step": 2552 }, { "epoch": 1.3214285714285714, "grad_norm": 0.931891918182373, "learning_rate": 6.379771181901853e-06, "loss": 0.11545667797327042, "rewards/accuracies": 0.9375, "rewards/chosen": 26.2518310546875, "rewards/margins": 21.501937866210938, "rewards/rejected": 4.752727508544922, "step": 2553 }, { "epoch": 1.3219461697722568, "grad_norm": 1.3553946018218994, "learning_rate": 6.377028851526472e-06, "loss": 0.1347774863243103, "rewards/accuracies": 0.921875, "rewards/chosen": 23.630537033081055, "rewards/margins": 19.120651245117188, "rewards/rejected": 4.505126953125, "step": 2554 }, { "epoch": 1.322463768115942, "grad_norm": 0.940876305103302, "learning_rate": 6.374286072849416e-06, "loss": 0.14876851439476013, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.87686538696289, "rewards/margins": 18.83354949951172, "rewards/rejected": 3.0486063957214355, "step": 2555 }, { "epoch": 1.3229813664596273, "grad_norm": 1.284490704536438, "learning_rate": 6.371542846763621e-06, "loss": 0.11685702949762344, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.1657657623291, "rewards/margins": 18.092498779296875, "rewards/rejected": 4.0753631591796875, "step": 2556 }, { "epoch": 1.3234989648033126, "grad_norm": 3.2735230922698975, "learning_rate": 6.36879917416216e-06, "loss": 0.17728963494300842, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.66619110107422, "rewards/margins": 16.33160400390625, "rewards/rejected": 4.334001541137695, "step": 2557 }, { "epoch": 1.324016563146998, "grad_norm": 1.0101752281188965, "learning_rate": 6.3660550559382584e-06, "loss": 0.12563031911849976, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.317893981933594, "rewards/margins": 18.494873046875, "rewards/rejected": 4.827442169189453, "step": 2558 }, { "epoch": 1.3245341614906831, "grad_norm": 0.8120896220207214, "learning_rate": 6.3633104929852815e-06, "loss": 0.12064041942358017, "rewards/accuracies": 0.953125, "rewards/chosen": 25.453353881835938, "rewards/margins": 19.58953857421875, "rewards/rejected": 5.863258361816406, "step": 2559 }, { "epoch": 1.3250517598343685, "grad_norm": 1.3095377683639526, "learning_rate": 6.360565486196744e-06, "loss": 0.16622966527938843, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.660236358642578, "rewards/margins": 18.470176696777344, "rewards/rejected": 4.1923828125, "step": 2560 }, { "epoch": 1.3255693581780539, "grad_norm": 0.9106674790382385, "learning_rate": 6.3578200364663e-06, "loss": 0.10070651769638062, "rewards/accuracies": 0.9609375, "rewards/chosen": 22.083335876464844, "rewards/margins": 17.723464965820312, "rewards/rejected": 4.3567047119140625, "step": 2561 }, { "epoch": 1.3260869565217392, "grad_norm": 1.4860568046569824, "learning_rate": 6.3550741446877515e-06, "loss": 0.11458345502614975, "rewards/accuracies": 0.9609375, "rewards/chosen": 21.204849243164062, "rewards/margins": 17.066261291503906, "rewards/rejected": 4.144604206085205, "step": 2562 }, { "epoch": 1.3266045548654244, "grad_norm": 0.8985610604286194, "learning_rate": 6.352327811755044e-06, "loss": 0.08777555823326111, "rewards/accuracies": 0.96875, "rewards/chosen": 21.932037353515625, "rewards/margins": 17.757736206054688, "rewards/rejected": 4.172298431396484, "step": 2563 }, { "epoch": 1.3271221532091098, "grad_norm": 1.1120750904083252, "learning_rate": 6.349581038562266e-06, "loss": 0.12536267936229706, "rewards/accuracies": 0.9375, "rewards/chosen": 21.515464782714844, "rewards/margins": 17.274391174316406, "rewards/rejected": 4.245136260986328, "step": 2564 }, { "epoch": 1.3276397515527951, "grad_norm": 1.3005478382110596, "learning_rate": 6.346833826003649e-06, "loss": 0.122205950319767, "rewards/accuracies": 0.953125, "rewards/chosen": 20.99443244934082, "rewards/margins": 16.54345703125, "rewards/rejected": 4.4493408203125, "step": 2565 }, { "epoch": 1.3281573498964803, "grad_norm": 1.9900753498077393, "learning_rate": 6.344086174973566e-06, "loss": 0.13887609541416168, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.811908721923828, "rewards/margins": 17.0469970703125, "rewards/rejected": 4.76483154296875, "step": 2566 }, { "epoch": 1.3286749482401656, "grad_norm": 0.7027868628501892, "learning_rate": 6.341338086366538e-06, "loss": 0.10761392116546631, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.323457717895508, "rewards/margins": 18.740020751953125, "rewards/rejected": 4.589302062988281, "step": 2567 }, { "epoch": 1.329192546583851, "grad_norm": 1.2597880363464355, "learning_rate": 6.338589561077222e-06, "loss": 0.18793204426765442, "rewards/accuracies": 0.921875, "rewards/chosen": 22.395286560058594, "rewards/margins": 17.933570861816406, "rewards/rejected": 4.457298278808594, "step": 2568 }, { "epoch": 1.3297101449275361, "grad_norm": 1.0990264415740967, "learning_rate": 6.335840600000425e-06, "loss": 0.12965764105319977, "rewards/accuracies": 0.9375, "rewards/chosen": 24.310325622558594, "rewards/margins": 18.768768310546875, "rewards/rejected": 5.535041809082031, "step": 2569 }, { "epoch": 1.3302277432712215, "grad_norm": 1.3967852592468262, "learning_rate": 6.333091204031085e-06, "loss": 0.1677243411540985, "rewards/accuracies": 0.921875, "rewards/chosen": 20.73497772216797, "rewards/margins": 17.48004150390625, "rewards/rejected": 3.2511444091796875, "step": 2570 }, { "epoch": 1.3307453416149069, "grad_norm": 0.6842739582061768, "learning_rate": 6.3303413740642895e-06, "loss": 0.09737922251224518, "rewards/accuracies": 0.953125, "rewards/chosen": 24.345827102661133, "rewards/margins": 20.034149169921875, "rewards/rejected": 4.307233810424805, "step": 2571 }, { "epoch": 1.331262939958592, "grad_norm": 1.279961109161377, "learning_rate": 6.327591110995268e-06, "loss": 0.11513166129589081, "rewards/accuracies": 0.9375, "rewards/chosen": 19.90802001953125, "rewards/margins": 16.884552001953125, "rewards/rejected": 3.0303001403808594, "step": 2572 }, { "epoch": 1.3317805383022774, "grad_norm": 1.6698013544082642, "learning_rate": 6.3248404157193865e-06, "loss": 0.1336832493543625, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.533172607421875, "rewards/margins": 20.461334228515625, "rewards/rejected": 5.071540832519531, "step": 2573 }, { "epoch": 1.3322981366459627, "grad_norm": 2.4327709674835205, "learning_rate": 6.322089289132154e-06, "loss": 0.24571794271469116, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.26161766052246, "rewards/margins": 18.52069091796875, "rewards/rejected": 4.742544174194336, "step": 2574 }, { "epoch": 1.3328157349896481, "grad_norm": 2.1305480003356934, "learning_rate": 6.319337732129221e-06, "loss": 0.11852876842021942, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.164073944091797, "rewards/margins": 18.445999145507812, "rewards/rejected": 4.715509414672852, "step": 2575 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0108922719955444, "learning_rate": 6.3165857456063715e-06, "loss": 0.1176973432302475, "rewards/accuracies": 0.9609375, "rewards/chosen": 24.084781646728516, "rewards/margins": 19.737823486328125, "rewards/rejected": 4.345623016357422, "step": 2576 }, { "epoch": 1.3338509316770186, "grad_norm": 2.2554450035095215, "learning_rate": 6.313833330459541e-06, "loss": 0.180510476231575, "rewards/accuracies": 0.9453125, "rewards/chosen": 20.735530853271484, "rewards/margins": 16.722503662109375, "rewards/rejected": 4.010387420654297, "step": 2577 }, { "epoch": 1.334368530020704, "grad_norm": 1.0091952085494995, "learning_rate": 6.311080487584796e-06, "loss": 0.08241380006074905, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.989810943603516, "rewards/margins": 21.245880126953125, "rewards/rejected": 4.736273765563965, "step": 2578 }, { "epoch": 1.3348861283643894, "grad_norm": 1.7238750457763672, "learning_rate": 6.308327217878343e-06, "loss": 0.2010732889175415, "rewards/accuracies": 0.8828125, "rewards/chosen": 21.685867309570312, "rewards/margins": 16.811233520507812, "rewards/rejected": 4.877374649047852, "step": 2579 }, { "epoch": 1.3354037267080745, "grad_norm": 3.5082616806030273, "learning_rate": 6.30557352223653e-06, "loss": 0.1421934962272644, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.463420867919922, "rewards/margins": 20.423980712890625, "rewards/rejected": 5.04437255859375, "step": 2580 }, { "epoch": 1.3359213250517599, "grad_norm": 0.7921781539916992, "learning_rate": 6.302819401555842e-06, "loss": 0.07317700237035751, "rewards/accuracies": 0.96875, "rewards/chosen": 26.219886779785156, "rewards/margins": 21.434478759765625, "rewards/rejected": 4.784141540527344, "step": 2581 }, { "epoch": 1.3364389233954452, "grad_norm": 1.5588316917419434, "learning_rate": 6.300064856732903e-06, "loss": 0.1761823147535324, "rewards/accuracies": 0.90625, "rewards/chosen": 22.356470108032227, "rewards/margins": 18.250091552734375, "rewards/rejected": 4.1031494140625, "step": 2582 }, { "epoch": 1.3369565217391304, "grad_norm": 1.293697714805603, "learning_rate": 6.2973098886644754e-06, "loss": 0.12848088145256042, "rewards/accuracies": 0.9375, "rewards/chosen": 25.613327026367188, "rewards/margins": 20.32831573486328, "rewards/rejected": 5.282508850097656, "step": 2583 }, { "epoch": 1.3374741200828157, "grad_norm": 1.0337402820587158, "learning_rate": 6.294554498247458e-06, "loss": 0.15405498445034027, "rewards/accuracies": 0.9375, "rewards/chosen": 22.590003967285156, "rewards/margins": 18.688949584960938, "rewards/rejected": 3.9046216011047363, "step": 2584 }, { "epoch": 1.337991718426501, "grad_norm": 1.022637128829956, "learning_rate": 6.291798686378888e-06, "loss": 0.15022088587284088, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.937223434448242, "rewards/margins": 20.63692855834961, "rewards/rejected": 4.304073333740234, "step": 2585 }, { "epoch": 1.3385093167701863, "grad_norm": 1.4438337087631226, "learning_rate": 6.289042453955941e-06, "loss": 0.16570980846881866, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.057537078857422, "rewards/margins": 18.879425048828125, "rewards/rejected": 4.181717872619629, "step": 2586 }, { "epoch": 1.3390269151138716, "grad_norm": 0.8286904692649841, "learning_rate": 6.286285801875928e-06, "loss": 0.11131566762924194, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.76103973388672, "rewards/margins": 20.672836303710938, "rewards/rejected": 3.0886154174804688, "step": 2587 }, { "epoch": 1.339544513457557, "grad_norm": 2.316070556640625, "learning_rate": 6.283528731036297e-06, "loss": 0.17409268021583557, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.22068214416504, "rewards/margins": 17.297042846679688, "rewards/rejected": 3.923215866088867, "step": 2588 }, { "epoch": 1.3400621118012421, "grad_norm": 1.7686398029327393, "learning_rate": 6.2807712423346286e-06, "loss": 0.2160637080669403, "rewards/accuracies": 0.90625, "rewards/chosen": 20.705028533935547, "rewards/margins": 17.060775756835938, "rewards/rejected": 3.6418585777282715, "step": 2589 }, { "epoch": 1.3405797101449275, "grad_norm": 0.9980313181877136, "learning_rate": 6.278013336668651e-06, "loss": 0.1424923837184906, "rewards/accuracies": 0.9375, "rewards/chosen": 20.471580505371094, "rewards/margins": 17.679092407226562, "rewards/rejected": 2.792631149291992, "step": 2590 }, { "epoch": 1.3410973084886129, "grad_norm": 0.8461111783981323, "learning_rate": 6.275255014936212e-06, "loss": 0.1769256889820099, "rewards/accuracies": 0.890625, "rewards/chosen": 21.682708740234375, "rewards/margins": 17.841064453125, "rewards/rejected": 3.8395919799804688, "step": 2591 }, { "epoch": 1.341614906832298, "grad_norm": 0.9568919539451599, "learning_rate": 6.2724962780353085e-06, "loss": 0.14112643897533417, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.345972061157227, "rewards/margins": 16.65350341796875, "rewards/rejected": 3.6938745975494385, "step": 2592 }, { "epoch": 1.3421325051759834, "grad_norm": 1.3475340604782104, "learning_rate": 6.2697371268640664e-06, "loss": 0.1535513997077942, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.916545867919922, "rewards/margins": 19.847957611083984, "rewards/rejected": 4.071018218994141, "step": 2593 }, { "epoch": 1.3426501035196687, "grad_norm": 1.4425171613693237, "learning_rate": 6.266977562320744e-06, "loss": 0.135897696018219, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.351242065429688, "rewards/margins": 18.781661987304688, "rewards/rejected": 3.56915283203125, "step": 2594 }, { "epoch": 1.343167701863354, "grad_norm": 0.8492133021354675, "learning_rate": 6.2642175853037425e-06, "loss": 0.16002514958381653, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.004375457763672, "rewards/margins": 18.234344482421875, "rewards/rejected": 3.773230791091919, "step": 2595 }, { "epoch": 1.3436853002070395, "grad_norm": 1.224431037902832, "learning_rate": 6.2614571967115876e-06, "loss": 0.17983704805374146, "rewards/accuracies": 0.9375, "rewards/chosen": 21.22344970703125, "rewards/margins": 17.21417236328125, "rewards/rejected": 4.004144668579102, "step": 2596 }, { "epoch": 1.3442028985507246, "grad_norm": 2.116462230682373, "learning_rate": 6.258696397442946e-06, "loss": 0.21738453209400177, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.258914947509766, "rewards/margins": 19.6275634765625, "rewards/rejected": 3.627781867980957, "step": 2597 }, { "epoch": 1.34472049689441, "grad_norm": 0.8001607060432434, "learning_rate": 6.255935188396615e-06, "loss": 0.132229745388031, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.155075073242188, "rewards/margins": 17.997360229492188, "rewards/rejected": 4.161569595336914, "step": 2598 }, { "epoch": 1.3452380952380953, "grad_norm": 1.2849000692367554, "learning_rate": 6.253173570471529e-06, "loss": 0.14730337262153625, "rewards/accuracies": 0.921875, "rewards/chosen": 23.417850494384766, "rewards/margins": 18.87469482421875, "rewards/rejected": 4.539487838745117, "step": 2599 }, { "epoch": 1.3457556935817805, "grad_norm": 0.7494150996208191, "learning_rate": 6.250411544566748e-06, "loss": 0.09470637142658234, "rewards/accuracies": 0.9609375, "rewards/chosen": 24.497516632080078, "rewards/margins": 19.934967041015625, "rewards/rejected": 4.5658650398254395, "step": 2600 }, { "epoch": 1.3462732919254659, "grad_norm": 1.0101674795150757, "learning_rate": 6.247649111581474e-06, "loss": 0.1570708304643631, "rewards/accuracies": 0.9375, "rewards/chosen": 21.520076751708984, "rewards/margins": 17.341598510742188, "rewards/rejected": 4.172464370727539, "step": 2601 }, { "epoch": 1.3467908902691512, "grad_norm": 1.0117895603179932, "learning_rate": 6.244886272415033e-06, "loss": 0.13424620032310486, "rewards/accuracies": 0.9453125, "rewards/chosen": 22.288618087768555, "rewards/margins": 17.121749877929688, "rewards/rejected": 5.165428161621094, "step": 2602 }, { "epoch": 1.3473084886128364, "grad_norm": 1.1793078184127808, "learning_rate": 6.2421230279668895e-06, "loss": 0.1365763396024704, "rewards/accuracies": 0.921875, "rewards/chosen": 23.065189361572266, "rewards/margins": 18.185739517211914, "rewards/rejected": 4.882478713989258, "step": 2603 }, { "epoch": 1.3478260869565217, "grad_norm": 1.5903698205947876, "learning_rate": 6.239359379136636e-06, "loss": 0.18841229379177094, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.9888916015625, "rewards/margins": 17.63836669921875, "rewards/rejected": 4.351320743560791, "step": 2604 }, { "epoch": 1.348343685300207, "grad_norm": 1.9610997438430786, "learning_rate": 6.236595326824002e-06, "loss": 0.1517704427242279, "rewards/accuracies": 0.9453125, "rewards/chosen": 23.820215225219727, "rewards/margins": 19.29949951171875, "rewards/rejected": 4.5210418701171875, "step": 2605 }, { "epoch": 1.3488612836438922, "grad_norm": 2.9432849884033203, "learning_rate": 6.23383087192884e-06, "loss": 0.2222619652748108, "rewards/accuracies": 0.90625, "rewards/chosen": 24.408037185668945, "rewards/margins": 19.391082763671875, "rewards/rejected": 5.0110931396484375, "step": 2606 }, { "epoch": 1.3493788819875776, "grad_norm": 0.6915523409843445, "learning_rate": 6.23106601535114e-06, "loss": 0.09419870376586914, "rewards/accuracies": 0.953125, "rewards/chosen": 25.703109741210938, "rewards/margins": 20.383697509765625, "rewards/rejected": 5.315830230712891, "step": 2607 }, { "epoch": 1.349896480331263, "grad_norm": 1.034719705581665, "learning_rate": 6.228300757991023e-06, "loss": 0.11158601939678192, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.264368057250977, "rewards/margins": 20.815170288085938, "rewards/rejected": 5.449980735778809, "step": 2608 }, { "epoch": 1.3504140786749481, "grad_norm": 0.8572824597358704, "learning_rate": 6.225535100748734e-06, "loss": 0.09606297314167023, "rewards/accuracies": 0.96875, "rewards/chosen": 24.378646850585938, "rewards/margins": 19.591293334960938, "rewards/rejected": 4.789249420166016, "step": 2609 }, { "epoch": 1.3509316770186335, "grad_norm": 0.9739717841148376, "learning_rate": 6.222769044524659e-06, "loss": 0.11924945563077927, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.091094970703125, "rewards/margins": 21.5477294921875, "rewards/rejected": 5.535274505615234, "step": 2610 }, { "epoch": 1.3514492753623188, "grad_norm": 1.5320534706115723, "learning_rate": 6.220002590219302e-06, "loss": 0.15295176208019257, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.885242462158203, "rewards/margins": 19.6026611328125, "rewards/rejected": 5.279722690582275, "step": 2611 }, { "epoch": 1.3519668737060042, "grad_norm": 1.2469106912612915, "learning_rate": 6.217235738733305e-06, "loss": 0.09694606065750122, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.844261169433594, "rewards/margins": 22.260154724121094, "rewards/rejected": 5.5877580642700195, "step": 2612 }, { "epoch": 1.3524844720496896, "grad_norm": 1.5255539417266846, "learning_rate": 6.214468490967435e-06, "loss": 0.12442493438720703, "rewards/accuracies": 0.9375, "rewards/chosen": 24.377155303955078, "rewards/margins": 20.271560668945312, "rewards/rejected": 4.108953475952148, "step": 2613 }, { "epoch": 1.3530020703933747, "grad_norm": 0.9766570329666138, "learning_rate": 6.2117008478225905e-06, "loss": 0.13905110955238342, "rewards/accuracies": 0.953125, "rewards/chosen": 27.155967712402344, "rewards/margins": 21.766250610351562, "rewards/rejected": 5.387546539306641, "step": 2614 }, { "epoch": 1.35351966873706, "grad_norm": 1.4190013408660889, "learning_rate": 6.208932810199796e-06, "loss": 0.11980780959129333, "rewards/accuracies": 0.9375, "rewards/chosen": 25.851886749267578, "rewards/margins": 21.852041244506836, "rewards/rejected": 4.002819061279297, "step": 2615 }, { "epoch": 1.3540372670807455, "grad_norm": 3.0212795734405518, "learning_rate": 6.206164379000209e-06, "loss": 0.12302194535732269, "rewards/accuracies": 0.953125, "rewards/chosen": 28.38173484802246, "rewards/margins": 23.463470458984375, "rewards/rejected": 4.9202880859375, "step": 2616 }, { "epoch": 1.3545548654244306, "grad_norm": 1.3155663013458252, "learning_rate": 6.203395555125108e-06, "loss": 0.11576499789953232, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.180843353271484, "rewards/margins": 22.017105102539062, "rewards/rejected": 4.163145065307617, "step": 2617 }, { "epoch": 1.355072463768116, "grad_norm": 1.3408610820770264, "learning_rate": 6.200626339475909e-06, "loss": 0.13127633929252625, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.591617584228516, "rewards/margins": 23.168685913085938, "rewards/rejected": 5.430972099304199, "step": 2618 }, { "epoch": 1.3555900621118013, "grad_norm": 1.1091489791870117, "learning_rate": 6.197856732954146e-06, "loss": 0.1093844547867775, "rewards/accuracies": 0.953125, "rewards/chosen": 30.011878967285156, "rewards/margins": 22.684226989746094, "rewards/rejected": 7.3350677490234375, "step": 2619 }, { "epoch": 1.3561076604554865, "grad_norm": 1.2825210094451904, "learning_rate": 6.195086736461487e-06, "loss": 0.14112240076065063, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.385093688964844, "rewards/margins": 21.420974731445312, "rewards/rejected": 4.96278190612793, "step": 2620 }, { "epoch": 1.3566252587991718, "grad_norm": 1.3885082006454468, "learning_rate": 6.192316350899721e-06, "loss": 0.1779721975326538, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.08584976196289, "rewards/margins": 22.926830291748047, "rewards/rejected": 5.153116226196289, "step": 2621 }, { "epoch": 1.3571428571428572, "grad_norm": 1.792312502861023, "learning_rate": 6.189545577170768e-06, "loss": 0.12749715149402618, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.987869262695312, "rewards/margins": 22.649375915527344, "rewards/rejected": 4.339653968811035, "step": 2622 }, { "epoch": 1.3576604554865424, "grad_norm": 1.259366750717163, "learning_rate": 6.186774416176678e-06, "loss": 0.12989285588264465, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.505531311035156, "rewards/margins": 21.279464721679688, "rewards/rejected": 5.2310791015625, "step": 2623 }, { "epoch": 1.3581780538302277, "grad_norm": 1.7553986310958862, "learning_rate": 6.1840028688196164e-06, "loss": 0.16562466323375702, "rewards/accuracies": 0.921875, "rewards/chosen": 27.195199966430664, "rewards/margins": 22.465484619140625, "rewards/rejected": 4.730317115783691, "step": 2624 }, { "epoch": 1.358695652173913, "grad_norm": 1.4136391878128052, "learning_rate": 6.181230936001883e-06, "loss": 0.18384762108325958, "rewards/accuracies": 0.8828125, "rewards/chosen": 25.505672454833984, "rewards/margins": 21.25213623046875, "rewards/rejected": 4.24639892578125, "step": 2625 }, { "epoch": 1.3592132505175982, "grad_norm": 1.8595072031021118, "learning_rate": 6.178458618625901e-06, "loss": 0.1794874221086502, "rewards/accuracies": 0.921875, "rewards/chosen": 25.55854034423828, "rewards/margins": 21.579055786132812, "rewards/rejected": 3.9794998168945312, "step": 2626 }, { "epoch": 1.3597308488612836, "grad_norm": 1.541717290878296, "learning_rate": 6.175685917594218e-06, "loss": 0.1698099970817566, "rewards/accuracies": 0.921875, "rewards/chosen": 26.534698486328125, "rewards/margins": 22.517379760742188, "rewards/rejected": 4.0176544189453125, "step": 2627 }, { "epoch": 1.360248447204969, "grad_norm": 1.2799931764602661, "learning_rate": 6.172912833809504e-06, "loss": 0.1443862020969391, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.931798934936523, "rewards/margins": 22.722396850585938, "rewards/rejected": 3.212204933166504, "step": 2628 }, { "epoch": 1.3607660455486543, "grad_norm": 1.1004809141159058, "learning_rate": 6.1701393681745626e-06, "loss": 0.1352200210094452, "rewards/accuracies": 0.921875, "rewards/chosen": 23.411476135253906, "rewards/margins": 20.807769775390625, "rewards/rejected": 2.6057615280151367, "step": 2629 }, { "epoch": 1.3612836438923395, "grad_norm": 0.8235941529273987, "learning_rate": 6.167365521592309e-06, "loss": 0.1021275669336319, "rewards/accuracies": 0.9609375, "rewards/chosen": 27.222013473510742, "rewards/margins": 22.965606689453125, "rewards/rejected": 4.251056671142578, "step": 2630 }, { "epoch": 1.3618012422360248, "grad_norm": 1.155796766281128, "learning_rate": 6.164591294965794e-06, "loss": 0.1577375829219818, "rewards/accuracies": 0.921875, "rewards/chosen": 26.634178161621094, "rewards/margins": 23.467105865478516, "rewards/rejected": 3.1582107543945312, "step": 2631 }, { "epoch": 1.3623188405797102, "grad_norm": 1.3444490432739258, "learning_rate": 6.161816689198183e-06, "loss": 0.16910219192504883, "rewards/accuracies": 0.921875, "rewards/chosen": 26.70655059814453, "rewards/margins": 22.991622924804688, "rewards/rejected": 3.7166481018066406, "step": 2632 }, { "epoch": 1.3628364389233956, "grad_norm": 1.6928670406341553, "learning_rate": 6.159041705192772e-06, "loss": 0.14195653796195984, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.37126922607422, "rewards/margins": 23.045806884765625, "rewards/rejected": 4.324685096740723, "step": 2633 }, { "epoch": 1.3633540372670807, "grad_norm": 1.1438064575195312, "learning_rate": 6.156266343852974e-06, "loss": 0.15081480145454407, "rewards/accuracies": 0.90625, "rewards/chosen": 25.176597595214844, "rewards/margins": 21.1776123046875, "rewards/rejected": 3.9943056106567383, "step": 2634 }, { "epoch": 1.363871635610766, "grad_norm": 1.7442162036895752, "learning_rate": 6.153490606082332e-06, "loss": 0.19406843185424805, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.736663818359375, "rewards/margins": 25.150970458984375, "rewards/rejected": 4.5850830078125, "step": 2635 }, { "epoch": 1.3643892339544514, "grad_norm": 1.4174699783325195, "learning_rate": 6.150714492784504e-06, "loss": 0.1355489194393158, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.684707641601562, "rewards/margins": 24.94942855834961, "rewards/rejected": 3.7256693840026855, "step": 2636 }, { "epoch": 1.3649068322981366, "grad_norm": 1.0283124446868896, "learning_rate": 6.147938004863275e-06, "loss": 0.15372668206691742, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.337303161621094, "rewards/margins": 23.266708374023438, "rewards/rejected": 3.0694727897644043, "step": 2637 }, { "epoch": 1.365424430641822, "grad_norm": 0.9869115948677063, "learning_rate": 6.145161143222551e-06, "loss": 0.10521968454122543, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.211387634277344, "rewards/margins": 23.483367919921875, "rewards/rejected": 3.733386993408203, "step": 2638 }, { "epoch": 1.3659420289855073, "grad_norm": 0.6685009002685547, "learning_rate": 6.1423839087663585e-06, "loss": 0.10178583860397339, "rewards/accuracies": 0.9375, "rewards/chosen": 30.195240020751953, "rewards/margins": 26.3983154296875, "rewards/rejected": 3.7907772064208984, "step": 2639 }, { "epoch": 1.3664596273291925, "grad_norm": 1.4442976713180542, "learning_rate": 6.139606302398847e-06, "loss": 0.12092826515436172, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.0399169921875, "rewards/margins": 24.429107666015625, "rewards/rejected": 3.605609893798828, "step": 2640 }, { "epoch": 1.3669772256728778, "grad_norm": 0.9719004034996033, "learning_rate": 6.136828325024286e-06, "loss": 0.20954596996307373, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.528427124023438, "rewards/margins": 20.853118896484375, "rewards/rejected": 3.679758310317993, "step": 2641 }, { "epoch": 1.3674948240165632, "grad_norm": 1.0518488883972168, "learning_rate": 6.134049977547066e-06, "loss": 0.21405449509620667, "rewards/accuracies": 0.875, "rewards/chosen": 27.561992645263672, "rewards/margins": 23.343856811523438, "rewards/rejected": 4.214273452758789, "step": 2642 }, { "epoch": 1.3680124223602483, "grad_norm": 1.8363029956817627, "learning_rate": 6.1312712608716964e-06, "loss": 0.1509324014186859, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.519512176513672, "rewards/margins": 25.53558349609375, "rewards/rejected": 3.98492431640625, "step": 2643 }, { "epoch": 1.3685300207039337, "grad_norm": 0.694006085395813, "learning_rate": 6.128492175902812e-06, "loss": 0.12171689420938492, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.002098083496094, "rewards/margins": 25.631744384765625, "rewards/rejected": 4.367136001586914, "step": 2644 }, { "epoch": 1.369047619047619, "grad_norm": 0.8520615100860596, "learning_rate": 6.125712723545161e-06, "loss": 0.08761975914239883, "rewards/accuracies": 0.953125, "rewards/chosen": 32.930782318115234, "rewards/margins": 28.443817138671875, "rewards/rejected": 4.489128112792969, "step": 2645 }, { "epoch": 1.3695652173913042, "grad_norm": 0.9718180298805237, "learning_rate": 6.122932904703616e-06, "loss": 0.12171133607625961, "rewards/accuracies": 0.953125, "rewards/chosen": 29.38296127319336, "rewards/margins": 25.048141479492188, "rewards/rejected": 4.332000732421875, "step": 2646 }, { "epoch": 1.3700828157349896, "grad_norm": 1.0008132457733154, "learning_rate": 6.1201527202831655e-06, "loss": 0.137605220079422, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.20667266845703, "rewards/margins": 26.421844482421875, "rewards/rejected": 4.783260345458984, "step": 2647 }, { "epoch": 1.370600414078675, "grad_norm": 0.8649409413337708, "learning_rate": 6.117372171188919e-06, "loss": 0.1392040252685547, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.646215438842773, "rewards/margins": 26.093582153320312, "rewards/rejected": 3.5514070987701416, "step": 2648 }, { "epoch": 1.3711180124223603, "grad_norm": 1.5611621141433716, "learning_rate": 6.1145912583261044e-06, "loss": 0.12290544807910919, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.09888458251953, "rewards/margins": 25.791160583496094, "rewards/rejected": 4.306838035583496, "step": 2649 }, { "epoch": 1.3716356107660457, "grad_norm": 0.7393243908882141, "learning_rate": 6.11180998260007e-06, "loss": 0.10438072681427002, "rewards/accuracies": 0.9375, "rewards/chosen": 33.00819778442383, "rewards/margins": 28.68377685546875, "rewards/rejected": 4.330990791320801, "step": 2650 }, { "epoch": 1.3721532091097308, "grad_norm": 1.0425034761428833, "learning_rate": 6.109028344916277e-06, "loss": 0.1662135124206543, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.171722412109375, "rewards/margins": 27.111801147460938, "rewards/rejected": 6.070932388305664, "step": 2651 }, { "epoch": 1.3726708074534162, "grad_norm": 1.2552796602249146, "learning_rate": 6.106246346180309e-06, "loss": 0.1478268802165985, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.14943313598633, "rewards/margins": 31.2342529296875, "rewards/rejected": 5.913732528686523, "step": 2652 }, { "epoch": 1.3731884057971016, "grad_norm": 1.4134215116500854, "learning_rate": 6.103463987297865e-06, "loss": 0.19177241623401642, "rewards/accuracies": 0.90625, "rewards/chosen": 32.196014404296875, "rewards/margins": 26.67900848388672, "rewards/rejected": 5.523420333862305, "step": 2653 }, { "epoch": 1.3737060041407867, "grad_norm": 1.001471996307373, "learning_rate": 6.1006812691747645e-06, "loss": 0.11439625918865204, "rewards/accuracies": 0.9375, "rewards/chosen": 35.174285888671875, "rewards/margins": 28.34795379638672, "rewards/rejected": 6.827587127685547, "step": 2654 }, { "epoch": 1.374223602484472, "grad_norm": 2.324289321899414, "learning_rate": 6.097898192716939e-06, "loss": 0.15017342567443848, "rewards/accuracies": 0.921875, "rewards/chosen": 35.22504425048828, "rewards/margins": 29.798187255859375, "rewards/rejected": 5.427783966064453, "step": 2655 }, { "epoch": 1.3747412008281574, "grad_norm": 1.6199164390563965, "learning_rate": 6.0951147588304386e-06, "loss": 0.2023836076259613, "rewards/accuracies": 0.921875, "rewards/chosen": 40.75775909423828, "rewards/margins": 32.7061767578125, "rewards/rejected": 8.04605484008789, "step": 2656 }, { "epoch": 1.3752587991718426, "grad_norm": 2.2712738513946533, "learning_rate": 6.0923309684214326e-06, "loss": 0.16786015033721924, "rewards/accuracies": 0.9375, "rewards/chosen": 34.41094970703125, "rewards/margins": 29.381622314453125, "rewards/rejected": 5.027427673339844, "step": 2657 }, { "epoch": 1.375776397515528, "grad_norm": 2.247856855392456, "learning_rate": 6.089546822396202e-06, "loss": 0.11826150119304657, "rewards/accuracies": 0.953125, "rewards/chosen": 36.19295883178711, "rewards/margins": 29.632705688476562, "rewards/rejected": 6.558260917663574, "step": 2658 }, { "epoch": 1.3762939958592133, "grad_norm": 0.6701956987380981, "learning_rate": 6.086762321661147e-06, "loss": 0.10372850298881531, "rewards/accuracies": 0.9375, "rewards/chosen": 37.833587646484375, "rewards/margins": 32.525421142578125, "rewards/rejected": 5.308197021484375, "step": 2659 }, { "epoch": 1.3768115942028984, "grad_norm": 1.2763874530792236, "learning_rate": 6.083977467122781e-06, "loss": 0.12982968986034393, "rewards/accuracies": 0.96875, "rewards/chosen": 35.89823532104492, "rewards/margins": 29.729934692382812, "rewards/rejected": 6.166967391967773, "step": 2660 }, { "epoch": 1.3773291925465838, "grad_norm": 2.2744131088256836, "learning_rate": 6.081192259687735e-06, "loss": 0.13302528858184814, "rewards/accuracies": 0.921875, "rewards/chosen": 30.428173065185547, "rewards/margins": 26.283111572265625, "rewards/rejected": 4.139976501464844, "step": 2661 }, { "epoch": 1.3778467908902692, "grad_norm": 3.53273606300354, "learning_rate": 6.078406700262751e-06, "loss": 0.19601649045944214, "rewards/accuracies": 0.890625, "rewards/chosen": 32.995140075683594, "rewards/margins": 27.979049682617188, "rewards/rejected": 5.016700744628906, "step": 2662 }, { "epoch": 1.3783643892339543, "grad_norm": 1.2839654684066772, "learning_rate": 6.075620789754689e-06, "loss": 0.1602015495300293, "rewards/accuracies": 0.90625, "rewards/chosen": 29.737720489501953, "rewards/margins": 26.263031005859375, "rewards/rejected": 3.468090057373047, "step": 2663 }, { "epoch": 1.3788819875776397, "grad_norm": 0.7002469301223755, "learning_rate": 6.072834529070524e-06, "loss": 0.10097727179527283, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.878890991210938, "rewards/margins": 27.27716064453125, "rewards/rejected": 3.6050567626953125, "step": 2664 }, { "epoch": 1.379399585921325, "grad_norm": 0.8684818148612976, "learning_rate": 6.07004791911734e-06, "loss": 0.07387156039476395, "rewards/accuracies": 0.953125, "rewards/chosen": 33.222007751464844, "rewards/margins": 29.99578857421875, "rewards/rejected": 3.225017547607422, "step": 2665 }, { "epoch": 1.3799171842650104, "grad_norm": 1.0790915489196777, "learning_rate": 6.06726096080234e-06, "loss": 0.15610918402671814, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.048994064331055, "rewards/margins": 26.81891632080078, "rewards/rejected": 3.234222412109375, "step": 2666 }, { "epoch": 1.3804347826086958, "grad_norm": 1.1886309385299683, "learning_rate": 6.064473655032839e-06, "loss": 0.14781813323497772, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.64671325683594, "rewards/margins": 29.933319091796875, "rewards/rejected": 3.7308311462402344, "step": 2667 }, { "epoch": 1.380952380952381, "grad_norm": 1.1617499589920044, "learning_rate": 6.061686002716263e-06, "loss": 0.12464277446269989, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.824005126953125, "rewards/margins": 27.42767333984375, "rewards/rejected": 3.3901290893554688, "step": 2668 }, { "epoch": 1.3814699792960663, "grad_norm": 1.128461241722107, "learning_rate": 6.058898004760152e-06, "loss": 0.133572518825531, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.472244262695312, "rewards/margins": 23.282394409179688, "rewards/rejected": 2.18951416015625, "step": 2669 }, { "epoch": 1.3819875776397517, "grad_norm": 0.8058421015739441, "learning_rate": 6.056109662072161e-06, "loss": 0.10496484488248825, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.67799949645996, "rewards/margins": 24.456161499023438, "rewards/rejected": 2.223247528076172, "step": 2670 }, { "epoch": 1.3825051759834368, "grad_norm": 0.6483848690986633, "learning_rate": 6.053320975560053e-06, "loss": 0.08525411784648895, "rewards/accuracies": 0.96875, "rewards/chosen": 29.276538848876953, "rewards/margins": 26.588455200195312, "rewards/rejected": 2.689074993133545, "step": 2671 }, { "epoch": 1.3830227743271222, "grad_norm": 2.7205305099487305, "learning_rate": 6.050531946131707e-06, "loss": 0.14265619218349457, "rewards/accuracies": 0.9375, "rewards/chosen": 28.81336212158203, "rewards/margins": 26.36328125, "rewards/rejected": 2.442291736602783, "step": 2672 }, { "epoch": 1.3835403726708075, "grad_norm": 0.7077972888946533, "learning_rate": 6.047742574695108e-06, "loss": 0.10028782486915588, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.008190155029297, "rewards/margins": 26.04265594482422, "rewards/rejected": 1.9578275680541992, "step": 2673 }, { "epoch": 1.3840579710144927, "grad_norm": 1.054222583770752, "learning_rate": 6.04495286215836e-06, "loss": 0.15532910823822021, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.208782196044922, "rewards/margins": 24.496688842773438, "rewards/rejected": 1.7102222442626953, "step": 2674 }, { "epoch": 1.384575569358178, "grad_norm": 1.2855801582336426, "learning_rate": 6.042162809429672e-06, "loss": 0.12480912357568741, "rewards/accuracies": 0.953125, "rewards/chosen": 30.690967559814453, "rewards/margins": 28.50226402282715, "rewards/rejected": 2.1976490020751953, "step": 2675 }, { "epoch": 1.3850931677018634, "grad_norm": 1.1617419719696045, "learning_rate": 6.0393724174173665e-06, "loss": 0.13294260203838348, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.757835388183594, "rewards/margins": 25.93151092529297, "rewards/rejected": 2.831191062927246, "step": 2676 }, { "epoch": 1.3856107660455486, "grad_norm": 1.4477092027664185, "learning_rate": 6.036581687029876e-06, "loss": 0.17269662022590637, "rewards/accuracies": 0.90625, "rewards/chosen": 25.92682456970215, "rewards/margins": 24.0709228515625, "rewards/rejected": 1.8614487648010254, "step": 2677 }, { "epoch": 1.386128364389234, "grad_norm": 0.9967222213745117, "learning_rate": 6.033790619175741e-06, "loss": 0.12174652516841888, "rewards/accuracies": 0.9375, "rewards/chosen": 29.68987274169922, "rewards/margins": 27.106781005859375, "rewards/rejected": 2.578245162963867, "step": 2678 }, { "epoch": 1.3866459627329193, "grad_norm": 1.5407682657241821, "learning_rate": 6.030999214763616e-06, "loss": 0.08655532449483871, "rewards/accuracies": 0.9609375, "rewards/chosen": 29.102371215820312, "rewards/margins": 26.865537643432617, "rewards/rejected": 2.236783981323242, "step": 2679 }, { "epoch": 1.3871635610766044, "grad_norm": 1.1818822622299194, "learning_rate": 6.028207474702261e-06, "loss": 0.14229196310043335, "rewards/accuracies": 0.90625, "rewards/chosen": 27.0152587890625, "rewards/margins": 24.383651733398438, "rewards/rejected": 2.624950408935547, "step": 2680 }, { "epoch": 1.3876811594202898, "grad_norm": 1.3894116878509521, "learning_rate": 6.025415399900548e-06, "loss": 0.12331540882587433, "rewards/accuracies": 0.953125, "rewards/chosen": 28.82645034790039, "rewards/margins": 26.415176391601562, "rewards/rejected": 2.409626007080078, "step": 2681 }, { "epoch": 1.3881987577639752, "grad_norm": 1.2098103761672974, "learning_rate": 6.022622991267458e-06, "loss": 0.13792608678340912, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.078266143798828, "rewards/margins": 24.713287353515625, "rewards/rejected": 2.369508743286133, "step": 2682 }, { "epoch": 1.3887163561076605, "grad_norm": 1.148193120956421, "learning_rate": 6.01983024971208e-06, "loss": 0.12199363112449646, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.818992614746094, "rewards/margins": 25.1905517578125, "rewards/rejected": 1.6233940124511719, "step": 2683 }, { "epoch": 1.3892339544513457, "grad_norm": 1.1753524541854858, "learning_rate": 6.017037176143607e-06, "loss": 0.1464712768793106, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.28601837158203, "rewards/margins": 28.247314453125, "rewards/rejected": 3.0365867614746094, "step": 2684 }, { "epoch": 1.389751552795031, "grad_norm": 1.7607125043869019, "learning_rate": 6.0142437714713506e-06, "loss": 0.1007268875837326, "rewards/accuracies": 0.96875, "rewards/chosen": 31.122385025024414, "rewards/margins": 27.954444885253906, "rewards/rejected": 3.1640090942382812, "step": 2685 }, { "epoch": 1.3902691511387164, "grad_norm": 1.0090399980545044, "learning_rate": 6.0114500366047176e-06, "loss": 0.1165417730808258, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.30554962158203, "rewards/margins": 29.487533569335938, "rewards/rejected": 2.8116989135742188, "step": 2686 }, { "epoch": 1.3907867494824018, "grad_norm": 1.0400911569595337, "learning_rate": 6.008655972453234e-06, "loss": 0.16445131599903107, "rewards/accuracies": 0.921875, "rewards/chosen": 27.210670471191406, "rewards/margins": 25.20867919921875, "rewards/rejected": 1.9978713989257812, "step": 2687 }, { "epoch": 1.391304347826087, "grad_norm": 1.0383808612823486, "learning_rate": 6.0058615799265245e-06, "loss": 0.10126958787441254, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.46977424621582, "rewards/margins": 26.66046142578125, "rewards/rejected": 3.807882308959961, "step": 2688 }, { "epoch": 1.3918219461697723, "grad_norm": 0.8606638312339783, "learning_rate": 6.003066859934323e-06, "loss": 0.09370910376310349, "rewards/accuracies": 0.9375, "rewards/chosen": 34.59418487548828, "rewards/margins": 29.878639221191406, "rewards/rejected": 4.7210469245910645, "step": 2689 }, { "epoch": 1.3923395445134576, "grad_norm": 1.455369234085083, "learning_rate": 6.0002718133864724e-06, "loss": 0.16074833273887634, "rewards/accuracies": 0.9375, "rewards/chosen": 31.22049331665039, "rewards/margins": 27.205284118652344, "rewards/rejected": 4.016109466552734, "step": 2690 }, { "epoch": 1.3928571428571428, "grad_norm": 2.107614040374756, "learning_rate": 5.9974764411929195e-06, "loss": 0.16406120359897614, "rewards/accuracies": 0.90625, "rewards/chosen": 32.34685516357422, "rewards/margins": 28.756820678710938, "rewards/rejected": 3.587209701538086, "step": 2691 }, { "epoch": 1.3933747412008282, "grad_norm": 1.3200013637542725, "learning_rate": 5.994680744263719e-06, "loss": 0.16078664362430573, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.52104187011719, "rewards/margins": 27.874481201171875, "rewards/rejected": 4.649204254150391, "step": 2692 }, { "epoch": 1.3938923395445135, "grad_norm": 1.1553430557250977, "learning_rate": 5.9918847235090295e-06, "loss": 0.10738327354192734, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.26873779296875, "rewards/margins": 30.9501953125, "rewards/rejected": 5.318576812744141, "step": 2693 }, { "epoch": 1.3944099378881987, "grad_norm": 1.6846389770507812, "learning_rate": 5.989088379839114e-06, "loss": 0.13525567948818207, "rewards/accuracies": 0.953125, "rewards/chosen": 33.59832000732422, "rewards/margins": 29.24700927734375, "rewards/rejected": 4.348978042602539, "step": 2694 }, { "epoch": 1.394927536231884, "grad_norm": 1.0365725755691528, "learning_rate": 5.986291714164346e-06, "loss": 0.13566726446151733, "rewards/accuracies": 0.921875, "rewards/chosen": 30.403827667236328, "rewards/margins": 26.198226928710938, "rewards/rejected": 4.202442169189453, "step": 2695 }, { "epoch": 1.3954451345755694, "grad_norm": 1.9128304719924927, "learning_rate": 5.983494727395197e-06, "loss": 0.19762015342712402, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.25567626953125, "rewards/margins": 28.3673095703125, "rewards/rejected": 5.8928985595703125, "step": 2696 }, { "epoch": 1.3959627329192545, "grad_norm": 1.6226180791854858, "learning_rate": 5.9806974204422475e-06, "loss": 0.18328863382339478, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.097564697265625, "rewards/margins": 28.525894165039062, "rewards/rejected": 4.572013854980469, "step": 2697 }, { "epoch": 1.39648033126294, "grad_norm": 2.8635640144348145, "learning_rate": 5.977899794216182e-06, "loss": 0.20640170574188232, "rewards/accuracies": 0.8984375, "rewards/chosen": 31.339460372924805, "rewards/margins": 27.223388671875, "rewards/rejected": 4.118677139282227, "step": 2698 }, { "epoch": 1.3969979296066253, "grad_norm": 2.3682470321655273, "learning_rate": 5.975101849627784e-06, "loss": 0.18805253505706787, "rewards/accuracies": 0.9375, "rewards/chosen": 32.53593444824219, "rewards/margins": 28.349716186523438, "rewards/rejected": 4.18498420715332, "step": 2699 }, { "epoch": 1.3975155279503104, "grad_norm": 0.8509365320205688, "learning_rate": 5.972303587587951e-06, "loss": 0.08082211017608643, "rewards/accuracies": 0.96875, "rewards/chosen": 39.67969512939453, "rewards/margins": 33.726417541503906, "rewards/rejected": 5.9583740234375, "step": 2700 }, { "epoch": 1.3980331262939958, "grad_norm": 1.1779590845108032, "learning_rate": 5.9695050090076725e-06, "loss": 0.1007005050778389, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.69635772705078, "rewards/margins": 28.03954315185547, "rewards/rejected": 5.664192199707031, "step": 2701 }, { "epoch": 1.3985507246376812, "grad_norm": 1.0413466691970825, "learning_rate": 5.9667061147980456e-06, "loss": 0.13836240768432617, "rewards/accuracies": 0.921875, "rewards/chosen": 32.527950286865234, "rewards/margins": 27.71723175048828, "rewards/rejected": 4.817883491516113, "step": 2702 }, { "epoch": 1.3990683229813665, "grad_norm": 1.0164028406143188, "learning_rate": 5.963906905870273e-06, "loss": 0.12312924116849899, "rewards/accuracies": 0.921875, "rewards/chosen": 33.9746208190918, "rewards/margins": 29.76227569580078, "rewards/rejected": 4.203567981719971, "step": 2703 }, { "epoch": 1.3995859213250519, "grad_norm": 1.1428052186965942, "learning_rate": 5.961107383135655e-06, "loss": 0.14423128962516785, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.5938720703125, "rewards/margins": 29.46588134765625, "rewards/rejected": 5.1386566162109375, "step": 2704 }, { "epoch": 1.400103519668737, "grad_norm": 0.8206638097763062, "learning_rate": 5.958307547505599e-06, "loss": 0.1467103362083435, "rewards/accuracies": 0.90625, "rewards/chosen": 29.95833969116211, "rewards/margins": 26.301475524902344, "rewards/rejected": 3.650053024291992, "step": 2705 }, { "epoch": 1.4006211180124224, "grad_norm": 0.6234909296035767, "learning_rate": 5.955507399891609e-06, "loss": 0.094652459025383, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.995574951171875, "rewards/margins": 29.907089233398438, "rewards/rejected": 5.084190368652344, "step": 2706 }, { "epoch": 1.4011387163561078, "grad_norm": 0.9467800855636597, "learning_rate": 5.9527069412052965e-06, "loss": 0.13738536834716797, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.718505859375, "rewards/margins": 28.318359375, "rewards/rejected": 4.395755290985107, "step": 2707 }, { "epoch": 1.401656314699793, "grad_norm": 2.9379303455352783, "learning_rate": 5.949906172358368e-06, "loss": 0.18816903233528137, "rewards/accuracies": 0.90625, "rewards/chosen": 33.90665817260742, "rewards/margins": 30.00677490234375, "rewards/rejected": 3.9057559967041016, "step": 2708 }, { "epoch": 1.4021739130434783, "grad_norm": 0.4792708456516266, "learning_rate": 5.9471050942626355e-06, "loss": 0.05023392662405968, "rewards/accuracies": 0.9765625, "rewards/chosen": 37.09449005126953, "rewards/margins": 33.017669677734375, "rewards/rejected": 4.082526206970215, "step": 2709 }, { "epoch": 1.4026915113871636, "grad_norm": 2.430161476135254, "learning_rate": 5.94430370783001e-06, "loss": 0.1667930781841278, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.76467514038086, "rewards/margins": 26.676422119140625, "rewards/rejected": 3.090269088745117, "step": 2710 }, { "epoch": 1.4032091097308488, "grad_norm": 1.0812134742736816, "learning_rate": 5.941502013972504e-06, "loss": 0.15745417773723602, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.30412292480469, "rewards/margins": 28.394973754882812, "rewards/rejected": 3.9005162715911865, "step": 2711 }, { "epoch": 1.4037267080745341, "grad_norm": 1.8009052276611328, "learning_rate": 5.9387000136022255e-06, "loss": 0.22400568425655365, "rewards/accuracies": 0.890625, "rewards/chosen": 30.147323608398438, "rewards/margins": 26.90948486328125, "rewards/rejected": 3.239337921142578, "step": 2712 }, { "epoch": 1.4042443064182195, "grad_norm": 1.140876293182373, "learning_rate": 5.935897707631391e-06, "loss": 0.15714819729328156, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.619956970214844, "rewards/margins": 29.380538940429688, "rewards/rejected": 3.241518020629883, "step": 2713 }, { "epoch": 1.4047619047619047, "grad_norm": 1.1738474369049072, "learning_rate": 5.9330950969723066e-06, "loss": 0.14681166410446167, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.48239517211914, "rewards/margins": 27.19506072998047, "rewards/rejected": 4.284511566162109, "step": 2714 }, { "epoch": 1.40527950310559, "grad_norm": 0.9471909403800964, "learning_rate": 5.930292182537386e-06, "loss": 0.16688713431358337, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.36931228637695, "rewards/margins": 30.911056518554688, "rewards/rejected": 5.465288162231445, "step": 2715 }, { "epoch": 1.4057971014492754, "grad_norm": 2.096632480621338, "learning_rate": 5.927488965239136e-06, "loss": 0.19714386761188507, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.5482063293457, "rewards/margins": 30.190643310546875, "rewards/rejected": 4.362277030944824, "step": 2716 }, { "epoch": 1.4063146997929605, "grad_norm": 1.0292308330535889, "learning_rate": 5.924685445990167e-06, "loss": 0.11410205066204071, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.80644226074219, "rewards/margins": 30.472938537597656, "rewards/rejected": 3.3345165252685547, "step": 2717 }, { "epoch": 1.406832298136646, "grad_norm": 1.1179664134979248, "learning_rate": 5.921881625703182e-06, "loss": 0.12827233970165253, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.29192352294922, "rewards/margins": 32.141021728515625, "rewards/rejected": 4.156227111816406, "step": 2718 }, { "epoch": 1.4073498964803313, "grad_norm": 4.09982967376709, "learning_rate": 5.919077505290985e-06, "loss": 0.23200994729995728, "rewards/accuracies": 0.921875, "rewards/chosen": 36.40536880493164, "rewards/margins": 31.452972412109375, "rewards/rejected": 4.954037666320801, "step": 2719 }, { "epoch": 1.4078674948240166, "grad_norm": 1.8266340494155884, "learning_rate": 5.9162730856664795e-06, "loss": 0.23198938369750977, "rewards/accuracies": 0.8828125, "rewards/chosen": 31.040767669677734, "rewards/margins": 25.721664428710938, "rewards/rejected": 5.312994956970215, "step": 2720 }, { "epoch": 1.408385093167702, "grad_norm": 0.8482105731964111, "learning_rate": 5.913468367742663e-06, "loss": 0.16551294922828674, "rewards/accuracies": 0.953125, "rewards/chosen": 30.625808715820312, "rewards/margins": 26.232864379882812, "rewards/rejected": 4.386436462402344, "step": 2721 }, { "epoch": 1.4089026915113871, "grad_norm": 0.7667052149772644, "learning_rate": 5.910663352432633e-06, "loss": 0.13487961888313293, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.223548889160156, "rewards/margins": 28.945945739746094, "rewards/rejected": 5.268370151519775, "step": 2722 }, { "epoch": 1.4094202898550725, "grad_norm": 0.9780980944633484, "learning_rate": 5.907858040649581e-06, "loss": 0.16532963514328003, "rewards/accuracies": 0.921875, "rewards/chosen": 28.351654052734375, "rewards/margins": 24.232467651367188, "rewards/rejected": 4.119903564453125, "step": 2723 }, { "epoch": 1.4099378881987579, "grad_norm": 0.9717848896980286, "learning_rate": 5.905052433306798e-06, "loss": 0.1032317727804184, "rewards/accuracies": 0.9609375, "rewards/chosen": 34.407684326171875, "rewards/margins": 29.05450439453125, "rewards/rejected": 5.3568267822265625, "step": 2724 }, { "epoch": 1.410455486542443, "grad_norm": 0.988693356513977, "learning_rate": 5.9022465313176665e-06, "loss": 0.13566124439239502, "rewards/accuracies": 0.9375, "rewards/chosen": 36.01171875, "rewards/margins": 31.473480224609375, "rewards/rejected": 4.544342041015625, "step": 2725 }, { "epoch": 1.4109730848861284, "grad_norm": 1.3257824182510376, "learning_rate": 5.899440335595674e-06, "loss": 0.1624460220336914, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.99945068359375, "rewards/margins": 28.165817260742188, "rewards/rejected": 5.83399772644043, "step": 2726 }, { "epoch": 1.4114906832298137, "grad_norm": 0.6728860139846802, "learning_rate": 5.8966338470543914e-06, "loss": 0.10138806700706482, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.05448913574219, "rewards/margins": 26.68560791015625, "rewards/rejected": 5.378805160522461, "step": 2727 }, { "epoch": 1.412008281573499, "grad_norm": 1.0509549379348755, "learning_rate": 5.893827066607496e-06, "loss": 0.1269656866788864, "rewards/accuracies": 0.9375, "rewards/chosen": 34.35675811767578, "rewards/margins": 29.017532348632812, "rewards/rejected": 5.347831726074219, "step": 2728 }, { "epoch": 1.4125258799171843, "grad_norm": 0.6615275144577026, "learning_rate": 5.891019995168753e-06, "loss": 0.10775552690029144, "rewards/accuracies": 0.9375, "rewards/chosen": 33.67831802368164, "rewards/margins": 28.567779541015625, "rewards/rejected": 5.1047515869140625, "step": 2729 }, { "epoch": 1.4130434782608696, "grad_norm": 0.9519184827804565, "learning_rate": 5.888212633652025e-06, "loss": 0.1660669445991516, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.60002899169922, "rewards/margins": 29.107574462890625, "rewards/rejected": 5.493481636047363, "step": 2730 }, { "epoch": 1.4135610766045548, "grad_norm": 0.6205881834030151, "learning_rate": 5.885404982971269e-06, "loss": 0.06483389437198639, "rewards/accuracies": 0.984375, "rewards/chosen": 38.84187316894531, "rewards/margins": 31.33416748046875, "rewards/rejected": 7.502074241638184, "step": 2731 }, { "epoch": 1.4140786749482401, "grad_norm": 3.391331672668457, "learning_rate": 5.8825970440405365e-06, "loss": 0.14789918065071106, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.861427307128906, "rewards/margins": 24.930465698242188, "rewards/rejected": 5.931953430175781, "step": 2732 }, { "epoch": 1.4145962732919255, "grad_norm": 1.3731602430343628, "learning_rate": 5.87978881777397e-06, "loss": 0.12358570843935013, "rewards/accuracies": 0.953125, "rewards/chosen": 36.84136962890625, "rewards/margins": 31.537811279296875, "rewards/rejected": 5.315141677856445, "step": 2733 }, { "epoch": 1.4151138716356106, "grad_norm": 1.0327948331832886, "learning_rate": 5.876980305085812e-06, "loss": 0.1289486587047577, "rewards/accuracies": 0.9375, "rewards/chosen": 36.833740234375, "rewards/margins": 29.8834228515625, "rewards/rejected": 6.943901538848877, "step": 2734 }, { "epoch": 1.415631469979296, "grad_norm": 1.1871355772018433, "learning_rate": 5.87417150689039e-06, "loss": 0.17330914735794067, "rewards/accuracies": 0.8984375, "rewards/chosen": 33.742469787597656, "rewards/margins": 27.734954833984375, "rewards/rejected": 6.014392852783203, "step": 2735 }, { "epoch": 1.4161490683229814, "grad_norm": 0.796988844871521, "learning_rate": 5.871362424102131e-06, "loss": 0.12072433531284332, "rewards/accuracies": 0.921875, "rewards/chosen": 31.187240600585938, "rewards/margins": 25.128204345703125, "rewards/rejected": 6.060742378234863, "step": 2736 }, { "epoch": 1.4166666666666667, "grad_norm": 1.4516689777374268, "learning_rate": 5.868553057635551e-06, "loss": 0.19742363691329956, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.771102905273438, "rewards/margins": 24.1536865234375, "rewards/rejected": 4.6236572265625, "step": 2737 }, { "epoch": 1.4171842650103519, "grad_norm": 1.9950714111328125, "learning_rate": 5.86574340840526e-06, "loss": 0.26091745495796204, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.53281784057617, "rewards/margins": 28.13330078125, "rewards/rejected": 4.390865325927734, "step": 2738 }, { "epoch": 1.4177018633540373, "grad_norm": 1.4974273443222046, "learning_rate": 5.862933477325961e-06, "loss": 0.15583345293998718, "rewards/accuracies": 0.921875, "rewards/chosen": 34.52928924560547, "rewards/margins": 28.74627685546875, "rewards/rejected": 5.783105850219727, "step": 2739 }, { "epoch": 1.4182194616977226, "grad_norm": 1.1897313594818115, "learning_rate": 5.860123265312442e-06, "loss": 0.12049943208694458, "rewards/accuracies": 0.96875, "rewards/chosen": 34.71984100341797, "rewards/margins": 28.843360900878906, "rewards/rejected": 5.8760223388671875, "step": 2740 }, { "epoch": 1.418737060041408, "grad_norm": 0.9038795828819275, "learning_rate": 5.857312773279596e-06, "loss": 0.07834827154874802, "rewards/accuracies": 0.953125, "rewards/chosen": 34.59552764892578, "rewards/margins": 28.356918334960938, "rewards/rejected": 6.231105804443359, "step": 2741 }, { "epoch": 1.4192546583850931, "grad_norm": 1.3978197574615479, "learning_rate": 5.85450200214239e-06, "loss": 0.07301132380962372, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.258209228515625, "rewards/margins": 28.910125732421875, "rewards/rejected": 6.344753265380859, "step": 2742 }, { "epoch": 1.4197722567287785, "grad_norm": 1.0261588096618652, "learning_rate": 5.851690952815898e-06, "loss": 0.13279035687446594, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.46128845214844, "rewards/margins": 30.35443115234375, "rewards/rejected": 6.111845016479492, "step": 2743 }, { "epoch": 1.4202898550724639, "grad_norm": 0.9591987133026123, "learning_rate": 5.848879626215274e-06, "loss": 0.09077182412147522, "rewards/accuracies": 0.953125, "rewards/chosen": 38.20872497558594, "rewards/margins": 32.5211181640625, "rewards/rejected": 5.691070556640625, "step": 2744 }, { "epoch": 1.420807453416149, "grad_norm": 1.328762412071228, "learning_rate": 5.846068023255765e-06, "loss": 0.16683124005794525, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.712806701660156, "rewards/margins": 26.487533569335938, "rewards/rejected": 4.228141784667969, "step": 2745 }, { "epoch": 1.4213250517598344, "grad_norm": 2.0382609367370605, "learning_rate": 5.843256144852711e-06, "loss": 0.16796539723873138, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.623035430908203, "rewards/margins": 24.065406799316406, "rewards/rejected": 5.552696228027344, "step": 2746 }, { "epoch": 1.4218426501035197, "grad_norm": 6.2942423820495605, "learning_rate": 5.840443991921538e-06, "loss": 0.12396835535764694, "rewards/accuracies": 0.9375, "rewards/chosen": 34.107933044433594, "rewards/margins": 28.57195281982422, "rewards/rejected": 5.536151885986328, "step": 2747 }, { "epoch": 1.4223602484472049, "grad_norm": 1.2131885290145874, "learning_rate": 5.837631565377764e-06, "loss": 0.09984298050403595, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.77881622314453, "rewards/margins": 30.313552856445312, "rewards/rejected": 5.460982322692871, "step": 2748 }, { "epoch": 1.4228778467908902, "grad_norm": 1.100590467453003, "learning_rate": 5.834818866136993e-06, "loss": 0.1073525995016098, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.22975540161133, "rewards/margins": 29.86651611328125, "rewards/rejected": 5.354222297668457, "step": 2749 }, { "epoch": 1.4233954451345756, "grad_norm": 2.0275261402130127, "learning_rate": 5.83200589511492e-06, "loss": 0.1635008454322815, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.5142822265625, "rewards/margins": 29.649871826171875, "rewards/rejected": 5.861174583435059, "step": 2750 }, { "epoch": 1.4239130434782608, "grad_norm": 2.0445151329040527, "learning_rate": 5.829192653227327e-06, "loss": 0.1293744444847107, "rewards/accuracies": 0.953125, "rewards/chosen": 37.060035705566406, "rewards/margins": 30.977752685546875, "rewards/rejected": 6.075124740600586, "step": 2751 }, { "epoch": 1.4244306418219461, "grad_norm": 1.7425230741500854, "learning_rate": 5.826379141390089e-06, "loss": 0.15594574809074402, "rewards/accuracies": 0.921875, "rewards/chosen": 31.23312759399414, "rewards/margins": 26.334518432617188, "rewards/rejected": 4.899718284606934, "step": 2752 }, { "epoch": 1.4249482401656315, "grad_norm": 1.125441074371338, "learning_rate": 5.8235653605191585e-06, "loss": 0.115500807762146, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.85252380371094, "rewards/margins": 30.49188232421875, "rewards/rejected": 7.362983703613281, "step": 2753 }, { "epoch": 1.4254658385093169, "grad_norm": 1.1861472129821777, "learning_rate": 5.82075131153059e-06, "loss": 0.15353423357009888, "rewards/accuracies": 0.90625, "rewards/chosen": 34.489837646484375, "rewards/margins": 29.609481811523438, "rewards/rejected": 4.879329681396484, "step": 2754 }, { "epoch": 1.425983436853002, "grad_norm": 1.1781024932861328, "learning_rate": 5.817936995340509e-06, "loss": 0.14531609416007996, "rewards/accuracies": 0.9375, "rewards/chosen": 34.834285736083984, "rewards/margins": 28.777587890625, "rewards/rejected": 6.064674377441406, "step": 2755 }, { "epoch": 1.4265010351966874, "grad_norm": 1.7256073951721191, "learning_rate": 5.815122412865146e-06, "loss": 0.12305813282728195, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.23245620727539, "rewards/margins": 26.843162536621094, "rewards/rejected": 5.388971328735352, "step": 2756 }, { "epoch": 1.4270186335403727, "grad_norm": 1.4042489528656006, "learning_rate": 5.8123075650208e-06, "loss": 0.11786916851997375, "rewards/accuracies": 0.953125, "rewards/chosen": 34.44457244873047, "rewards/margins": 29.162487030029297, "rewards/rejected": 5.292240142822266, "step": 2757 }, { "epoch": 1.427536231884058, "grad_norm": 1.9344758987426758, "learning_rate": 5.80949245272387e-06, "loss": 0.14553943276405334, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.993160247802734, "rewards/margins": 31.256515502929688, "rewards/rejected": 6.7340240478515625, "step": 2758 }, { "epoch": 1.4280538302277432, "grad_norm": 2.3676044940948486, "learning_rate": 5.806677076890834e-06, "loss": 0.1361396610736847, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.464019775390625, "rewards/margins": 32.265045166015625, "rewards/rejected": 7.2010498046875, "step": 2759 }, { "epoch": 1.4285714285714286, "grad_norm": 1.3392915725708008, "learning_rate": 5.803861438438259e-06, "loss": 0.11919528245925903, "rewards/accuracies": 0.96875, "rewards/chosen": 33.696563720703125, "rewards/margins": 27.546173095703125, "rewards/rejected": 6.141990661621094, "step": 2760 }, { "epoch": 1.429089026915114, "grad_norm": 0.6443365216255188, "learning_rate": 5.801045538282795e-06, "loss": 0.09083303064107895, "rewards/accuracies": 0.96875, "rewards/chosen": 40.58027267456055, "rewards/margins": 33.259796142578125, "rewards/rejected": 7.33222770690918, "step": 2761 }, { "epoch": 1.4296066252587991, "grad_norm": 0.6059219837188721, "learning_rate": 5.7982293773411815e-06, "loss": 0.10143541544675827, "rewards/accuracies": 0.9375, "rewards/chosen": 35.97392272949219, "rewards/margins": 30.4654541015625, "rewards/rejected": 5.504701614379883, "step": 2762 }, { "epoch": 1.4301242236024845, "grad_norm": 3.051377058029175, "learning_rate": 5.795412956530236e-06, "loss": 0.15113762021064758, "rewards/accuracies": 0.96875, "rewards/chosen": 36.46916198730469, "rewards/margins": 29.985504150390625, "rewards/rejected": 6.480197906494141, "step": 2763 }, { "epoch": 1.4306418219461698, "grad_norm": 0.7182292342185974, "learning_rate": 5.792596276766868e-06, "loss": 0.11597909778356552, "rewards/accuracies": 0.953125, "rewards/chosen": 36.93292236328125, "rewards/margins": 30.734405517578125, "rewards/rejected": 6.196147918701172, "step": 2764 }, { "epoch": 1.431159420289855, "grad_norm": 1.1487658023834229, "learning_rate": 5.789779338968067e-06, "loss": 0.1616610884666443, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.27788543701172, "rewards/margins": 29.795387268066406, "rewards/rejected": 4.483791351318359, "step": 2765 }, { "epoch": 1.4316770186335404, "grad_norm": 1.4984670877456665, "learning_rate": 5.786962144050905e-06, "loss": 0.1567690074443817, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.36072540283203, "rewards/margins": 32.65203857421875, "rewards/rejected": 7.708789825439453, "step": 2766 }, { "epoch": 1.4321946169772257, "grad_norm": 1.786130428314209, "learning_rate": 5.784144692932544e-06, "loss": 0.15338358283042908, "rewards/accuracies": 0.90625, "rewards/chosen": 35.6068115234375, "rewards/margins": 28.29986572265625, "rewards/rejected": 7.305739879608154, "step": 2767 }, { "epoch": 1.4327122153209109, "grad_norm": 0.8059125542640686, "learning_rate": 5.781326986530221e-06, "loss": 0.11782816052436829, "rewards/accuracies": 0.96875, "rewards/chosen": 37.959442138671875, "rewards/margins": 31.134124755859375, "rewards/rejected": 6.826038360595703, "step": 2768 }, { "epoch": 1.4332298136645962, "grad_norm": 0.7636974453926086, "learning_rate": 5.7785090257612665e-06, "loss": 0.11167973279953003, "rewards/accuracies": 0.953125, "rewards/chosen": 38.78144836425781, "rewards/margins": 31.395889282226562, "rewards/rejected": 7.383480072021484, "step": 2769 }, { "epoch": 1.4337474120082816, "grad_norm": 0.8902621865272522, "learning_rate": 5.775690811543083e-06, "loss": 0.09300416707992554, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.387916564941406, "rewards/margins": 27.587196350097656, "rewards/rejected": 5.805049896240234, "step": 2770 }, { "epoch": 1.4342650103519667, "grad_norm": 2.6542961597442627, "learning_rate": 5.772872344793162e-06, "loss": 0.12625665962696075, "rewards/accuracies": 0.9375, "rewards/chosen": 36.796356201171875, "rewards/margins": 29.208282470703125, "rewards/rejected": 7.587917327880859, "step": 2771 }, { "epoch": 1.434782608695652, "grad_norm": 1.1305586099624634, "learning_rate": 5.770053626429077e-06, "loss": 0.14701931178569794, "rewards/accuracies": 0.9375, "rewards/chosen": 33.697288513183594, "rewards/margins": 26.200119018554688, "rewards/rejected": 7.499400615692139, "step": 2772 }, { "epoch": 1.4353002070393375, "grad_norm": 1.310502052307129, "learning_rate": 5.76723465736848e-06, "loss": 0.14329802989959717, "rewards/accuracies": 0.9375, "rewards/chosen": 39.652099609375, "rewards/margins": 32.649497985839844, "rewards/rejected": 7.009559631347656, "step": 2773 }, { "epoch": 1.4358178053830228, "grad_norm": 1.4491827487945557, "learning_rate": 5.764415438529107e-06, "loss": 0.1939876824617386, "rewards/accuracies": 0.90625, "rewards/chosen": 36.3495979309082, "rewards/margins": 30.62396240234375, "rewards/rejected": 5.7259063720703125, "step": 2774 }, { "epoch": 1.4363354037267082, "grad_norm": 1.774499773979187, "learning_rate": 5.761595970828777e-06, "loss": 0.1340637356042862, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.44363784790039, "rewards/margins": 31.884353637695312, "rewards/rejected": 5.566259384155273, "step": 2775 }, { "epoch": 1.4368530020703933, "grad_norm": 1.299033284187317, "learning_rate": 5.7587762551853865e-06, "loss": 0.16466546058654785, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.6823844909668, "rewards/margins": 27.92822265625, "rewards/rejected": 6.7501678466796875, "step": 2776 }, { "epoch": 1.4373706004140787, "grad_norm": 5.056436061859131, "learning_rate": 5.7559562925169144e-06, "loss": 0.19139735400676727, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.61097717285156, "rewards/margins": 27.008941650390625, "rewards/rejected": 7.596210479736328, "step": 2777 }, { "epoch": 1.437888198757764, "grad_norm": 1.12343430519104, "learning_rate": 5.7531360837414195e-06, "loss": 0.14215606451034546, "rewards/accuracies": 0.96875, "rewards/chosen": 37.64882278442383, "rewards/margins": 31.125457763671875, "rewards/rejected": 6.534894943237305, "step": 2778 }, { "epoch": 1.4384057971014492, "grad_norm": 1.9585579633712769, "learning_rate": 5.750315629777041e-06, "loss": 0.1868165135383606, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.2955322265625, "rewards/margins": 27.95235824584961, "rewards/rejected": 6.341512680053711, "step": 2779 }, { "epoch": 1.4389233954451346, "grad_norm": 5.571842193603516, "learning_rate": 5.747494931542001e-06, "loss": 0.19938728213310242, "rewards/accuracies": 0.9375, "rewards/chosen": 32.508880615234375, "rewards/margins": 27.523269653320312, "rewards/rejected": 4.9891357421875, "step": 2780 }, { "epoch": 1.43944099378882, "grad_norm": 0.68317711353302, "learning_rate": 5.744673989954593e-06, "loss": 0.10858897119760513, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.715179443359375, "rewards/margins": 26.26036834716797, "rewards/rejected": 4.44730281829834, "step": 2781 }, { "epoch": 1.439958592132505, "grad_norm": 1.6386346817016602, "learning_rate": 5.7418528059332e-06, "loss": 0.14910736680030823, "rewards/accuracies": 0.921875, "rewards/chosen": 31.584705352783203, "rewards/margins": 27.60552978515625, "rewards/rejected": 3.9774093627929688, "step": 2782 }, { "epoch": 1.4404761904761905, "grad_norm": 1.731497049331665, "learning_rate": 5.7390313803962746e-06, "loss": 0.19764897227287292, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.978923797607422, "rewards/margins": 23.125152587890625, "rewards/rejected": 3.85673189163208, "step": 2783 }, { "epoch": 1.4409937888198758, "grad_norm": 2.562498092651367, "learning_rate": 5.7362097142623554e-06, "loss": 0.2117355763912201, "rewards/accuracies": 0.9375, "rewards/chosen": 31.062530517578125, "rewards/margins": 26.753631591796875, "rewards/rejected": 4.306190490722656, "step": 2784 }, { "epoch": 1.441511387163561, "grad_norm": 1.0070135593414307, "learning_rate": 5.733387808450054e-06, "loss": 0.11331667006015778, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.012054443359375, "rewards/margins": 28.28528594970703, "rewards/rejected": 3.7241668701171875, "step": 2785 }, { "epoch": 1.4420289855072463, "grad_norm": 1.7972131967544556, "learning_rate": 5.730565663878063e-06, "loss": 0.1791885644197464, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.863330841064453, "rewards/margins": 27.624237060546875, "rewards/rejected": 4.2361907958984375, "step": 2786 }, { "epoch": 1.4425465838509317, "grad_norm": 1.7153607606887817, "learning_rate": 5.727743281465152e-06, "loss": 0.097355917096138, "rewards/accuracies": 0.953125, "rewards/chosen": 33.30671691894531, "rewards/margins": 29.0137939453125, "rewards/rejected": 4.294914245605469, "step": 2787 }, { "epoch": 1.4430641821946169, "grad_norm": 1.8352681398391724, "learning_rate": 5.724920662130168e-06, "loss": 0.09646767377853394, "rewards/accuracies": 0.96875, "rewards/chosen": 31.780139923095703, "rewards/margins": 27.580856323242188, "rewards/rejected": 4.194547653198242, "step": 2788 }, { "epoch": 1.4435817805383022, "grad_norm": 1.1585450172424316, "learning_rate": 5.722097806792034e-06, "loss": 0.1479073166847229, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.65227508544922, "rewards/margins": 22.648529052734375, "rewards/rejected": 3.9989356994628906, "step": 2789 }, { "epoch": 1.4440993788819876, "grad_norm": 2.5004117488861084, "learning_rate": 5.7192747163697525e-06, "loss": 0.1472119390964508, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.281818389892578, "rewards/margins": 24.71319580078125, "rewards/rejected": 4.567486763000488, "step": 2790 }, { "epoch": 1.444616977225673, "grad_norm": 0.7634690403938293, "learning_rate": 5.716451391782401e-06, "loss": 0.11600102484226227, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.883346557617188, "rewards/margins": 24.18011474609375, "rewards/rejected": 3.7017059326171875, "step": 2791 }, { "epoch": 1.4451345755693583, "grad_norm": 0.694483757019043, "learning_rate": 5.713627833949131e-06, "loss": 0.13128739595413208, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.20133399963379, "rewards/margins": 24.153579711914062, "rewards/rejected": 3.052356719970703, "step": 2792 }, { "epoch": 1.4456521739130435, "grad_norm": 1.1998565196990967, "learning_rate": 5.7108040437891755e-06, "loss": 0.23142898082733154, "rewards/accuracies": 0.8828125, "rewards/chosen": 23.1456298828125, "rewards/margins": 20.18981170654297, "rewards/rejected": 2.958110809326172, "step": 2793 }, { "epoch": 1.4461697722567288, "grad_norm": 0.7113670706748962, "learning_rate": 5.707980022221835e-06, "loss": 0.13369302451610565, "rewards/accuracies": 0.9375, "rewards/chosen": 24.06554412841797, "rewards/margins": 20.897293090820312, "rewards/rejected": 3.162586212158203, "step": 2794 }, { "epoch": 1.4466873706004142, "grad_norm": 3.010552167892456, "learning_rate": 5.7051557701664965e-06, "loss": 0.17477013170719147, "rewards/accuracies": 0.96875, "rewards/chosen": 23.23054313659668, "rewards/margins": 20.40643310546875, "rewards/rejected": 2.825000762939453, "step": 2795 }, { "epoch": 1.4472049689440993, "grad_norm": 0.861724853515625, "learning_rate": 5.702331288542609e-06, "loss": 0.14892609417438507, "rewards/accuracies": 0.953125, "rewards/chosen": 25.61894989013672, "rewards/margins": 22.53131103515625, "rewards/rejected": 3.0856800079345703, "step": 2796 }, { "epoch": 1.4477225672877847, "grad_norm": 1.056962013244629, "learning_rate": 5.699506578269708e-06, "loss": 0.20314949750900269, "rewards/accuracies": 0.8671875, "rewards/chosen": 21.96663475036621, "rewards/margins": 18.752288818359375, "rewards/rejected": 3.211848258972168, "step": 2797 }, { "epoch": 1.44824016563147, "grad_norm": 0.7767897248268127, "learning_rate": 5.696681640267395e-06, "loss": 0.16702473163604736, "rewards/accuracies": 0.921875, "rewards/chosen": 25.511066436767578, "rewards/margins": 22.200958251953125, "rewards/rejected": 3.3125648498535156, "step": 2798 }, { "epoch": 1.4487577639751552, "grad_norm": 1.2896400690078735, "learning_rate": 5.69385647545535e-06, "loss": 0.17261318862438202, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.425064086914062, "rewards/margins": 19.669342041015625, "rewards/rejected": 3.7501277923583984, "step": 2799 }, { "epoch": 1.4492753623188406, "grad_norm": 1.815971851348877, "learning_rate": 5.6910310847533264e-06, "loss": 0.15788082778453827, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.35179901123047, "rewards/margins": 18.520584106445312, "rewards/rejected": 1.830678939819336, "step": 2800 }, { "epoch": 1.449792960662526, "grad_norm": 1.9836000204086304, "learning_rate": 5.68820546908115e-06, "loss": 0.20225653052330017, "rewards/accuracies": 0.90625, "rewards/chosen": 21.987607955932617, "rewards/margins": 19.583786010742188, "rewards/rejected": 2.4091644287109375, "step": 2801 }, { "epoch": 1.450310559006211, "grad_norm": 0.7698876857757568, "learning_rate": 5.6853796293587205e-06, "loss": 0.1653430163860321, "rewards/accuracies": 0.8984375, "rewards/chosen": 19.781517028808594, "rewards/margins": 18.297882080078125, "rewards/rejected": 1.4922027587890625, "step": 2802 }, { "epoch": 1.4508281573498965, "grad_norm": 0.6438347101211548, "learning_rate": 5.682553566506011e-06, "loss": 0.12064149975776672, "rewards/accuracies": 0.9609375, "rewards/chosen": 21.124164581298828, "rewards/margins": 19.23175048828125, "rewards/rejected": 1.8915081024169922, "step": 2803 }, { "epoch": 1.4513457556935818, "grad_norm": 0.8592795133590698, "learning_rate": 5.679727281443064e-06, "loss": 0.16550219058990479, "rewards/accuracies": 0.9375, "rewards/chosen": 19.848655700683594, "rewards/margins": 17.781532287597656, "rewards/rejected": 2.0698049068450928, "step": 2804 }, { "epoch": 1.451863354037267, "grad_norm": 6.3169755935668945, "learning_rate": 5.6769007750899996e-06, "loss": 0.21051479876041412, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.16799545288086, "rewards/margins": 20.075332641601562, "rewards/rejected": 3.093975067138672, "step": 2805 }, { "epoch": 1.4523809523809523, "grad_norm": 0.852482259273529, "learning_rate": 5.674074048367009e-06, "loss": 0.12468179315328598, "rewards/accuracies": 0.9609375, "rewards/chosen": 20.50836181640625, "rewards/margins": 18.679100036621094, "rewards/rejected": 1.8331298828125, "step": 2806 }, { "epoch": 1.4528985507246377, "grad_norm": 1.0392390489578247, "learning_rate": 5.6712471021943475e-06, "loss": 0.144305020570755, "rewards/accuracies": 0.9375, "rewards/chosen": 21.406726837158203, "rewards/margins": 18.947784423828125, "rewards/rejected": 2.4552478790283203, "step": 2807 }, { "epoch": 1.453416149068323, "grad_norm": 0.7961662411689758, "learning_rate": 5.6684199374923544e-06, "loss": 0.09597396850585938, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.660911560058594, "rewards/margins": 23.561676025390625, "rewards/rejected": 3.0967960357666016, "step": 2808 }, { "epoch": 1.4539337474120082, "grad_norm": 0.9393956661224365, "learning_rate": 5.6655925551814275e-06, "loss": 0.14097553491592407, "rewards/accuracies": 0.9375, "rewards/chosen": 19.978469848632812, "rewards/margins": 18.070953369140625, "rewards/rejected": 1.8983893394470215, "step": 2809 }, { "epoch": 1.4544513457556936, "grad_norm": 0.8664893507957458, "learning_rate": 5.6627649561820474e-06, "loss": 0.1541680544614792, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.762128829956055, "rewards/margins": 20.274986267089844, "rewards/rejected": 3.4828643798828125, "step": 2810 }, { "epoch": 1.454968944099379, "grad_norm": 1.0655442476272583, "learning_rate": 5.659937141414755e-06, "loss": 0.18924270570278168, "rewards/accuracies": 0.890625, "rewards/chosen": 23.557628631591797, "rewards/margins": 20.571563720703125, "rewards/rejected": 2.9882168769836426, "step": 2811 }, { "epoch": 1.4554865424430643, "grad_norm": 0.7711926102638245, "learning_rate": 5.657109111800168e-06, "loss": 0.10089859366416931, "rewards/accuracies": 0.9609375, "rewards/chosen": 24.624347686767578, "rewards/margins": 22.003387451171875, "rewards/rejected": 2.615570068359375, "step": 2812 }, { "epoch": 1.4560041407867494, "grad_norm": 0.7312519550323486, "learning_rate": 5.65428086825897e-06, "loss": 0.08350132405757904, "rewards/accuracies": 0.9609375, "rewards/chosen": 22.13336944580078, "rewards/margins": 20.24420166015625, "rewards/rejected": 1.8888859748840332, "step": 2813 }, { "epoch": 1.4565217391304348, "grad_norm": 0.9729307293891907, "learning_rate": 5.651452411711919e-06, "loss": 0.1435902863740921, "rewards/accuracies": 0.9375, "rewards/chosen": 24.184677124023438, "rewards/margins": 21.587997436523438, "rewards/rejected": 2.5976791381835938, "step": 2814 }, { "epoch": 1.4570393374741202, "grad_norm": 0.7582257986068726, "learning_rate": 5.648623743079836e-06, "loss": 0.0984683483839035, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.01214599609375, "rewards/margins": 22.309036254882812, "rewards/rejected": 2.6962890625, "step": 2815 }, { "epoch": 1.4575569358178053, "grad_norm": 1.2708654403686523, "learning_rate": 5.645794863283617e-06, "loss": 0.14756619930267334, "rewards/accuracies": 0.921875, "rewards/chosen": 25.01323699951172, "rewards/margins": 21.744686126708984, "rewards/rejected": 3.2654380798339844, "step": 2816 }, { "epoch": 1.4580745341614907, "grad_norm": 1.306374430656433, "learning_rate": 5.642965773244224e-06, "loss": 0.20219987630844116, "rewards/accuracies": 0.90625, "rewards/chosen": 23.238605499267578, "rewards/margins": 20.851226806640625, "rewards/rejected": 2.3840231895446777, "step": 2817 }, { "epoch": 1.458592132505176, "grad_norm": 3.1269025802612305, "learning_rate": 5.640136473882685e-06, "loss": 0.14623665809631348, "rewards/accuracies": 0.921875, "rewards/chosen": 25.179397583007812, "rewards/margins": 22.91925048828125, "rewards/rejected": 2.255697250366211, "step": 2818 }, { "epoch": 1.4591097308488612, "grad_norm": 1.2998199462890625, "learning_rate": 5.637306966120102e-06, "loss": 0.13165217638015747, "rewards/accuracies": 0.921875, "rewards/chosen": 25.94683837890625, "rewards/margins": 22.32244873046875, "rewards/rejected": 3.630497932434082, "step": 2819 }, { "epoch": 1.4596273291925466, "grad_norm": 1.2893524169921875, "learning_rate": 5.63447725087764e-06, "loss": 0.16124066710472107, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.578338623046875, "rewards/margins": 24.14935302734375, "rewards/rejected": 3.4257688522338867, "step": 2820 }, { "epoch": 1.460144927536232, "grad_norm": 1.1227952241897583, "learning_rate": 5.631647329076535e-06, "loss": 0.09936675429344177, "rewards/accuracies": 0.9375, "rewards/chosen": 25.083398818969727, "rewards/margins": 22.729278564453125, "rewards/rejected": 2.35455322265625, "step": 2821 }, { "epoch": 1.460662525879917, "grad_norm": 1.1580114364624023, "learning_rate": 5.628817201638086e-06, "loss": 0.13899140059947968, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.212379455566406, "rewards/margins": 24.030029296875, "rewards/rejected": 4.172309875488281, "step": 2822 }, { "epoch": 1.4611801242236024, "grad_norm": 1.8578256368637085, "learning_rate": 5.625986869483663e-06, "loss": 0.14136099815368652, "rewards/accuracies": 0.9375, "rewards/chosen": 22.891048431396484, "rewards/margins": 20.428009033203125, "rewards/rejected": 2.462961196899414, "step": 2823 }, { "epoch": 1.4616977225672878, "grad_norm": 1.045200228691101, "learning_rate": 5.6231563335347e-06, "loss": 0.1517079472541809, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.548139572143555, "rewards/margins": 21.06768798828125, "rewards/rejected": 2.480215072631836, "step": 2824 }, { "epoch": 1.462215320910973, "grad_norm": 2.255547285079956, "learning_rate": 5.6203255947127e-06, "loss": 0.21425577998161316, "rewards/accuracies": 0.8671875, "rewards/chosen": 25.0550594329834, "rewards/margins": 22.537322998046875, "rewards/rejected": 2.51678466796875, "step": 2825 }, { "epoch": 1.4627329192546583, "grad_norm": 1.066927194595337, "learning_rate": 5.617494653939228e-06, "loss": 0.14967580139636993, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.37506103515625, "rewards/margins": 24.019302368164062, "rewards/rejected": 2.3506851196289062, "step": 2826 }, { "epoch": 1.4632505175983437, "grad_norm": 1.2240077257156372, "learning_rate": 5.6146635121359196e-06, "loss": 0.17489583790302277, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.681819915771484, "rewards/margins": 22.67279052734375, "rewards/rejected": 4.0120391845703125, "step": 2827 }, { "epoch": 1.463768115942029, "grad_norm": 0.7714484333992004, "learning_rate": 5.611832170224471e-06, "loss": 0.07175023853778839, "rewards/accuracies": 0.96875, "rewards/chosen": 27.58454704284668, "rewards/margins": 24.853668212890625, "rewards/rejected": 2.730062484741211, "step": 2828 }, { "epoch": 1.4642857142857144, "grad_norm": 1.1677290201187134, "learning_rate": 5.609000629126649e-06, "loss": 0.13470858335494995, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.406021118164062, "rewards/margins": 24.970840454101562, "rewards/rejected": 2.4366455078125, "step": 2829 }, { "epoch": 1.4648033126293996, "grad_norm": 2.188206195831299, "learning_rate": 5.6061688897642794e-06, "loss": 0.14786818623542786, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.377506256103516, "rewards/margins": 24.7169189453125, "rewards/rejected": 2.659214496612549, "step": 2830 }, { "epoch": 1.465320910973085, "grad_norm": 1.8017133474349976, "learning_rate": 5.6033369530592575e-06, "loss": 0.16844812035560608, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.31352996826172, "rewards/margins": 27.25757598876953, "rewards/rejected": 3.0600814819335938, "step": 2831 }, { "epoch": 1.4658385093167703, "grad_norm": 1.2560622692108154, "learning_rate": 5.6005048199335385e-06, "loss": 0.12735900282859802, "rewards/accuracies": 0.953125, "rewards/chosen": 28.135570526123047, "rewards/margins": 25.458023071289062, "rewards/rejected": 2.680692672729492, "step": 2832 }, { "epoch": 1.4663561076604554, "grad_norm": 0.9485968351364136, "learning_rate": 5.597672491309146e-06, "loss": 0.12996506690979004, "rewards/accuracies": 0.9375, "rewards/chosen": 28.318492889404297, "rewards/margins": 25.6136474609375, "rewards/rejected": 2.704726219177246, "step": 2833 }, { "epoch": 1.4668737060041408, "grad_norm": 1.4720120429992676, "learning_rate": 5.594839968108166e-06, "loss": 0.1985396444797516, "rewards/accuracies": 0.921875, "rewards/chosen": 26.74639892578125, "rewards/margins": 24.252456665039062, "rewards/rejected": 2.491819381713867, "step": 2834 }, { "epoch": 1.4673913043478262, "grad_norm": 0.9820685386657715, "learning_rate": 5.59200725125274e-06, "loss": 0.09872827678918839, "rewards/accuracies": 0.96875, "rewards/chosen": 27.889305114746094, "rewards/margins": 24.947845458984375, "rewards/rejected": 2.9347591400146484, "step": 2835 }, { "epoch": 1.4679089026915113, "grad_norm": 1.623771071434021, "learning_rate": 5.589174341665089e-06, "loss": 0.15042054653167725, "rewards/accuracies": 0.921875, "rewards/chosen": 31.717945098876953, "rewards/margins": 27.821170806884766, "rewards/rejected": 3.8908538818359375, "step": 2836 }, { "epoch": 1.4684265010351967, "grad_norm": 1.76926851272583, "learning_rate": 5.5863412402674785e-06, "loss": 0.1431923806667328, "rewards/accuracies": 0.921875, "rewards/chosen": 27.206485748291016, "rewards/margins": 24.47216033935547, "rewards/rejected": 2.732754707336426, "step": 2837 }, { "epoch": 1.468944099378882, "grad_norm": 1.3960903882980347, "learning_rate": 5.5835079479822526e-06, "loss": 0.16284878551959991, "rewards/accuracies": 0.953125, "rewards/chosen": 27.72675323486328, "rewards/margins": 25.587539672851562, "rewards/rejected": 2.1404266357421875, "step": 2838 }, { "epoch": 1.4694616977225672, "grad_norm": 2.228745460510254, "learning_rate": 5.580674465731805e-06, "loss": 0.15673112869262695, "rewards/accuracies": 0.9375, "rewards/chosen": 28.873798370361328, "rewards/margins": 25.865509033203125, "rewards/rejected": 3.010589599609375, "step": 2839 }, { "epoch": 1.4699792960662525, "grad_norm": 1.0668892860412598, "learning_rate": 5.5778407944385985e-06, "loss": 0.1279742419719696, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.69908332824707, "rewards/margins": 25.054824829101562, "rewards/rejected": 2.6486663818359375, "step": 2840 }, { "epoch": 1.470496894409938, "grad_norm": 0.8665329217910767, "learning_rate": 5.575006935025152e-06, "loss": 0.08922906219959259, "rewards/accuracies": 0.96875, "rewards/chosen": 29.885208129882812, "rewards/margins": 27.086517333984375, "rewards/rejected": 2.795039653778076, "step": 2841 }, { "epoch": 1.471014492753623, "grad_norm": 1.0811688899993896, "learning_rate": 5.572172888414054e-06, "loss": 0.1833549439907074, "rewards/accuracies": 0.90625, "rewards/chosen": 25.075542449951172, "rewards/margins": 22.278656005859375, "rewards/rejected": 2.794544219970703, "step": 2842 }, { "epoch": 1.4715320910973084, "grad_norm": 1.6349717378616333, "learning_rate": 5.569338655527945e-06, "loss": 0.16893231868743896, "rewards/accuracies": 0.953125, "rewards/chosen": 26.445083618164062, "rewards/margins": 24.41326904296875, "rewards/rejected": 2.0287342071533203, "step": 2843 }, { "epoch": 1.4720496894409938, "grad_norm": 0.9831832647323608, "learning_rate": 5.566504237289532e-06, "loss": 0.07454006373882294, "rewards/accuracies": 0.96875, "rewards/chosen": 30.53533935546875, "rewards/margins": 27.343170166015625, "rewards/rejected": 3.195465087890625, "step": 2844 }, { "epoch": 1.4725672877846792, "grad_norm": 0.9034198522567749, "learning_rate": 5.563669634621578e-06, "loss": 0.15230897068977356, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.970474243164062, "rewards/margins": 24.3026123046875, "rewards/rejected": 3.6797561645507812, "step": 2845 }, { "epoch": 1.4730848861283645, "grad_norm": 0.9124471545219421, "learning_rate": 5.56083484844691e-06, "loss": 0.09511739015579224, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.940284729003906, "rewards/margins": 27.334686279296875, "rewards/rejected": 3.5997467041015625, "step": 2846 }, { "epoch": 1.4736024844720497, "grad_norm": 1.7266895771026611, "learning_rate": 5.557999879688415e-06, "loss": 0.18441219627857208, "rewards/accuracies": 0.90625, "rewards/chosen": 32.16044616699219, "rewards/margins": 27.5067138671875, "rewards/rejected": 4.646079063415527, "step": 2847 }, { "epoch": 1.474120082815735, "grad_norm": 1.973836898803711, "learning_rate": 5.555164729269031e-06, "loss": 0.21133987605571747, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.110288619995117, "rewards/margins": 25.4169921875, "rewards/rejected": 2.6925086975097656, "step": 2848 }, { "epoch": 1.4746376811594204, "grad_norm": 3.848414421081543, "learning_rate": 5.5523293981117705e-06, "loss": 0.14542649686336517, "rewards/accuracies": 0.9375, "rewards/chosen": 34.08021545410156, "rewards/margins": 29.0111083984375, "rewards/rejected": 5.06280517578125, "step": 2849 }, { "epoch": 1.4751552795031055, "grad_norm": 0.8344436883926392, "learning_rate": 5.549493887139689e-06, "loss": 0.1077205240726471, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.70885467529297, "rewards/margins": 24.63109588623047, "rewards/rejected": 4.071531295776367, "step": 2850 }, { "epoch": 1.475672877846791, "grad_norm": 1.1548490524291992, "learning_rate": 5.546658197275911e-06, "loss": 0.17107997834682465, "rewards/accuracies": 0.9375, "rewards/chosen": 30.75359344482422, "rewards/margins": 25.900680541992188, "rewards/rejected": 4.84906005859375, "step": 2851 }, { "epoch": 1.4761904761904763, "grad_norm": 0.8485908508300781, "learning_rate": 5.543822329443614e-06, "loss": 0.11158643662929535, "rewards/accuracies": 0.953125, "rewards/chosen": 33.02669143676758, "rewards/margins": 28.40728759765625, "rewards/rejected": 4.6147308349609375, "step": 2852 }, { "epoch": 1.4767080745341614, "grad_norm": 1.0014067888259888, "learning_rate": 5.540986284566036e-06, "loss": 0.16445764899253845, "rewards/accuracies": 0.921875, "rewards/chosen": 30.733346939086914, "rewards/margins": 26.470535278320312, "rewards/rejected": 4.26759672164917, "step": 2853 }, { "epoch": 1.4772256728778468, "grad_norm": 2.306715965270996, "learning_rate": 5.538150063566473e-06, "loss": 0.1434667408466339, "rewards/accuracies": 0.953125, "rewards/chosen": 29.751583099365234, "rewards/margins": 26.315155029296875, "rewards/rejected": 3.44036865234375, "step": 2854 }, { "epoch": 1.4777432712215322, "grad_norm": 0.9967806339263916, "learning_rate": 5.535313667368276e-06, "loss": 0.1673843264579773, "rewards/accuracies": 0.921875, "rewards/chosen": 27.656784057617188, "rewards/margins": 23.769485473632812, "rewards/rejected": 3.8928871154785156, "step": 2855 }, { "epoch": 1.4782608695652173, "grad_norm": 1.0211219787597656, "learning_rate": 5.532477096894854e-06, "loss": 0.1425885111093521, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.22044372558594, "rewards/margins": 27.738632202148438, "rewards/rejected": 5.485966682434082, "step": 2856 }, { "epoch": 1.4787784679089027, "grad_norm": 1.9576045274734497, "learning_rate": 5.529640353069674e-06, "loss": 0.16259247064590454, "rewards/accuracies": 0.9375, "rewards/chosen": 36.099082946777344, "rewards/margins": 28.843372344970703, "rewards/rejected": 7.2501678466796875, "step": 2857 }, { "epoch": 1.479296066252588, "grad_norm": 1.0997815132141113, "learning_rate": 5.526803436816256e-06, "loss": 0.11941200494766235, "rewards/accuracies": 0.9375, "rewards/chosen": 36.62668228149414, "rewards/margins": 31.018707275390625, "rewards/rejected": 5.598564147949219, "step": 2858 }, { "epoch": 1.4798136645962732, "grad_norm": 1.4097379446029663, "learning_rate": 5.523966349058182e-06, "loss": 0.17010284960269928, "rewards/accuracies": 0.921875, "rewards/chosen": 32.974090576171875, "rewards/margins": 27.471031188964844, "rewards/rejected": 5.508626937866211, "step": 2859 }, { "epoch": 1.4803312629399585, "grad_norm": 0.7020551562309265, "learning_rate": 5.521129090719084e-06, "loss": 0.13181650638580322, "rewards/accuracies": 0.921875, "rewards/chosen": 43.04024887084961, "rewards/margins": 36.44781494140625, "rewards/rejected": 6.5925750732421875, "step": 2860 }, { "epoch": 1.480848861283644, "grad_norm": 1.2988612651824951, "learning_rate": 5.518291662722653e-06, "loss": 0.11552725732326508, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.06147003173828, "rewards/margins": 28.120681762695312, "rewards/rejected": 4.944089889526367, "step": 2861 }, { "epoch": 1.4813664596273293, "grad_norm": 0.7259325385093689, "learning_rate": 5.515454065992633e-06, "loss": 0.12317880243062973, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.46924591064453, "rewards/margins": 29.32898712158203, "rewards/rejected": 6.149776458740234, "step": 2862 }, { "epoch": 1.4818840579710144, "grad_norm": 1.7680960893630981, "learning_rate": 5.512616301452825e-06, "loss": 0.13888072967529297, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.800716400146484, "rewards/margins": 28.953125, "rewards/rejected": 6.854928970336914, "step": 2863 }, { "epoch": 1.4824016563146998, "grad_norm": 1.6768712997436523, "learning_rate": 5.509778370027083e-06, "loss": 0.2228168547153473, "rewards/accuracies": 0.8671875, "rewards/chosen": 37.75156021118164, "rewards/margins": 30.494781494140625, "rewards/rejected": 7.259632110595703, "step": 2864 }, { "epoch": 1.4829192546583851, "grad_norm": 1.1291146278381348, "learning_rate": 5.506940272639319e-06, "loss": 0.13089969754219055, "rewards/accuracies": 0.9453125, "rewards/chosen": 43.0625, "rewards/margins": 33.885719299316406, "rewards/rejected": 9.184283256530762, "step": 2865 }, { "epoch": 1.4834368530020705, "grad_norm": 1.4691619873046875, "learning_rate": 5.504102010213491e-06, "loss": 0.10662740468978882, "rewards/accuracies": 0.953125, "rewards/chosen": 40.1363410949707, "rewards/margins": 30.689605712890625, "rewards/rejected": 9.451738357543945, "step": 2866 }, { "epoch": 1.4839544513457557, "grad_norm": 1.1853713989257812, "learning_rate": 5.50126358367362e-06, "loss": 0.1586003303527832, "rewards/accuracies": 0.9140625, "rewards/chosen": 39.67195510864258, "rewards/margins": 30.584014892578125, "rewards/rejected": 9.099907875061035, "step": 2867 }, { "epoch": 1.484472049689441, "grad_norm": 0.7432247996330261, "learning_rate": 5.498424993943773e-06, "loss": 0.11076438426971436, "rewards/accuracies": 0.96875, "rewards/chosen": 39.440338134765625, "rewards/margins": 30.62054443359375, "rewards/rejected": 8.825530052185059, "step": 2868 }, { "epoch": 1.4849896480331264, "grad_norm": 1.088820457458496, "learning_rate": 5.495586241948075e-06, "loss": 0.1186990886926651, "rewards/accuracies": 0.9765625, "rewards/chosen": 38.86614227294922, "rewards/margins": 29.63189697265625, "rewards/rejected": 9.235515594482422, "step": 2869 }, { "epoch": 1.4855072463768115, "grad_norm": 0.6098068356513977, "learning_rate": 5.4927473286107045e-06, "loss": 0.11755964159965515, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.972068786621094, "rewards/margins": 28.943115234375, "rewards/rejected": 7.034637451171875, "step": 2870 }, { "epoch": 1.486024844720497, "grad_norm": 1.5567679405212402, "learning_rate": 5.4899082548558855e-06, "loss": 0.1618528664112091, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.39034652709961, "rewards/margins": 26.619354248046875, "rewards/rejected": 9.768610954284668, "step": 2871 }, { "epoch": 1.4865424430641823, "grad_norm": 2.751812219619751, "learning_rate": 5.487069021607903e-06, "loss": 0.11370095610618591, "rewards/accuracies": 0.9609375, "rewards/chosen": 41.63654327392578, "rewards/margins": 31.575912475585938, "rewards/rejected": 10.05471420288086, "step": 2872 }, { "epoch": 1.4870600414078674, "grad_norm": 1.074570894241333, "learning_rate": 5.484229629791087e-06, "loss": 0.12079928070306778, "rewards/accuracies": 0.921875, "rewards/chosen": 39.85380554199219, "rewards/margins": 29.647361755371094, "rewards/rejected": 10.198949813842773, "step": 2873 }, { "epoch": 1.4875776397515528, "grad_norm": 1.490443468093872, "learning_rate": 5.481390080329823e-06, "loss": 0.13184911012649536, "rewards/accuracies": 0.9609375, "rewards/chosen": 36.736454010009766, "rewards/margins": 27.48675537109375, "rewards/rejected": 9.25346565246582, "step": 2874 }, { "epoch": 1.4880952380952381, "grad_norm": 1.12677800655365, "learning_rate": 5.478550374148549e-06, "loss": 0.15810847282409668, "rewards/accuracies": 0.9375, "rewards/chosen": 34.63962173461914, "rewards/margins": 27.433914184570312, "rewards/rejected": 7.206085205078125, "step": 2875 }, { "epoch": 1.4886128364389233, "grad_norm": 1.5560219287872314, "learning_rate": 5.475710512171747e-06, "loss": 0.1473611742258072, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.679962158203125, "rewards/margins": 26.267974853515625, "rewards/rejected": 8.406250953674316, "step": 2876 }, { "epoch": 1.4891304347826086, "grad_norm": 1.263640284538269, "learning_rate": 5.4728704953239596e-06, "loss": 0.15344029664993286, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.481170654296875, "rewards/margins": 28.760528564453125, "rewards/rejected": 8.72890853881836, "step": 2877 }, { "epoch": 1.489648033126294, "grad_norm": 0.9501291513442993, "learning_rate": 5.4700303245297704e-06, "loss": 0.11332693696022034, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.95940399169922, "rewards/margins": 27.0233154296875, "rewards/rejected": 6.940402030944824, "step": 2878 }, { "epoch": 1.4901656314699794, "grad_norm": 1.4151208400726318, "learning_rate": 5.467190000713823e-06, "loss": 0.10767970979213715, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.803382873535156, "rewards/margins": 27.745697021484375, "rewards/rejected": 7.055253505706787, "step": 2879 }, { "epoch": 1.4906832298136645, "grad_norm": 0.8292512893676758, "learning_rate": 5.4643495248007985e-06, "loss": 0.11096431314945221, "rewards/accuracies": 0.9375, "rewards/chosen": 36.481475830078125, "rewards/margins": 29.834747314453125, "rewards/rejected": 6.6393585205078125, "step": 2880 }, { "epoch": 1.49120082815735, "grad_norm": 1.2739958763122559, "learning_rate": 5.461508897715439e-06, "loss": 0.1615147739648819, "rewards/accuracies": 0.90625, "rewards/chosen": 32.02082824707031, "rewards/margins": 25.162017822265625, "rewards/rejected": 6.856121063232422, "step": 2881 }, { "epoch": 1.4917184265010353, "grad_norm": 1.658512830734253, "learning_rate": 5.45866812038253e-06, "loss": 0.12179317325353622, "rewards/accuracies": 0.953125, "rewards/chosen": 41.353607177734375, "rewards/margins": 32.471641540527344, "rewards/rejected": 8.880277633666992, "step": 2882 }, { "epoch": 1.4922360248447206, "grad_norm": 1.142114520072937, "learning_rate": 5.455827193726906e-06, "loss": 0.13947656750679016, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.18685531616211, "rewards/margins": 24.30393409729004, "rewards/rejected": 5.886788368225098, "step": 2883 }, { "epoch": 1.4927536231884058, "grad_norm": 1.6259195804595947, "learning_rate": 5.452986118673453e-06, "loss": 0.17692895233631134, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.6944694519043, "rewards/margins": 25.796966552734375, "rewards/rejected": 6.891572952270508, "step": 2884 }, { "epoch": 1.4932712215320911, "grad_norm": 1.4955443143844604, "learning_rate": 5.4501448961471025e-06, "loss": 0.16406434774398804, "rewards/accuracies": 0.921875, "rewards/chosen": 34.733360290527344, "rewards/margins": 26.985923767089844, "rewards/rejected": 7.741954803466797, "step": 2885 }, { "epoch": 1.4937888198757765, "grad_norm": 1.5133899450302124, "learning_rate": 5.447303527072835e-06, "loss": 0.11088043451309204, "rewards/accuracies": 0.953125, "rewards/chosen": 32.736106872558594, "rewards/margins": 26.99376678466797, "rewards/rejected": 5.739295959472656, "step": 2886 }, { "epoch": 1.4943064182194616, "grad_norm": 1.6460671424865723, "learning_rate": 5.4444620123756786e-06, "loss": 0.14400175213813782, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.665061950683594, "rewards/margins": 26.27606201171875, "rewards/rejected": 6.387187957763672, "step": 2887 }, { "epoch": 1.494824016563147, "grad_norm": 2.2426352500915527, "learning_rate": 5.441620352980709e-06, "loss": 0.1359085738658905, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.81233215332031, "rewards/margins": 28.50909423828125, "rewards/rejected": 6.300621032714844, "step": 2888 }, { "epoch": 1.4953416149068324, "grad_norm": 5.40345573425293, "learning_rate": 5.438778549813048e-06, "loss": 0.27629733085632324, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.709625244140625, "rewards/margins": 26.239181518554688, "rewards/rejected": 5.4658966064453125, "step": 2889 }, { "epoch": 1.4958592132505175, "grad_norm": 1.3284811973571777, "learning_rate": 5.435936603797867e-06, "loss": 0.1530107855796814, "rewards/accuracies": 0.9375, "rewards/chosen": 31.27941131591797, "rewards/margins": 25.706405639648438, "rewards/rejected": 5.580641746520996, "step": 2890 }, { "epoch": 1.4963768115942029, "grad_norm": 1.4571309089660645, "learning_rate": 5.43309451586038e-06, "loss": 0.12611187994480133, "rewards/accuracies": 0.921875, "rewards/chosen": 29.121618270874023, "rewards/margins": 23.774749755859375, "rewards/rejected": 5.347373962402344, "step": 2891 }, { "epoch": 1.4968944099378882, "grad_norm": 0.9046232104301453, "learning_rate": 5.430252286925851e-06, "loss": 0.0858321487903595, "rewards/accuracies": 0.9765625, "rewards/chosen": 28.799396514892578, "rewards/margins": 25.15069580078125, "rewards/rejected": 3.645815849304199, "step": 2892 }, { "epoch": 1.4974120082815734, "grad_norm": 0.8805223703384399, "learning_rate": 5.427409917919588e-06, "loss": 0.10283207893371582, "rewards/accuracies": 0.953125, "rewards/chosen": 28.895267486572266, "rewards/margins": 24.9442138671875, "rewards/rejected": 3.950742721557617, "step": 2893 }, { "epoch": 1.4979296066252588, "grad_norm": 1.3440581560134888, "learning_rate": 5.424567409766943e-06, "loss": 0.12701717019081116, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.70805549621582, "rewards/margins": 24.030670166015625, "rewards/rejected": 3.6736159324645996, "step": 2894 }, { "epoch": 1.4984472049689441, "grad_norm": 0.9391700625419617, "learning_rate": 5.421724763393317e-06, "loss": 0.1302577555179596, "rewards/accuracies": 0.9375, "rewards/chosen": 27.424530029296875, "rewards/margins": 23.3189697265625, "rewards/rejected": 4.104377746582031, "step": 2895 }, { "epoch": 1.4989648033126293, "grad_norm": 1.537123441696167, "learning_rate": 5.418881979724152e-06, "loss": 0.12218382954597473, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.91986083984375, "rewards/margins": 25.4827880859375, "rewards/rejected": 4.429920196533203, "step": 2896 }, { "epoch": 1.4994824016563146, "grad_norm": 0.8491235375404358, "learning_rate": 5.416039059684939e-06, "loss": 0.07612158358097076, "rewards/accuracies": 0.984375, "rewards/chosen": 27.480117797851562, "rewards/margins": 24.87310791015625, "rewards/rejected": 2.608356475830078, "step": 2897 }, { "epoch": 1.5, "grad_norm": 1.3660919666290283, "learning_rate": 5.413196004201211e-06, "loss": 0.15612903237342834, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.937496185302734, "rewards/margins": 25.243560791015625, "rewards/rejected": 4.698278427124023, "step": 2898 }, { "epoch": 1.5005175983436851, "grad_norm": 1.836323618888855, "learning_rate": 5.410352814198542e-06, "loss": 0.23784899711608887, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.1129150390625, "rewards/margins": 23.667739868164062, "rewards/rejected": 4.4492645263671875, "step": 2899 }, { "epoch": 1.5010351966873707, "grad_norm": 2.390817165374756, "learning_rate": 5.407509490602556e-06, "loss": 0.19111907482147217, "rewards/accuracies": 0.921875, "rewards/chosen": 26.7647705078125, "rewards/margins": 23.39141845703125, "rewards/rejected": 3.3718299865722656, "step": 2900 }, { "epoch": 1.5015527950310559, "grad_norm": 1.0810368061065674, "learning_rate": 5.404666034338916e-06, "loss": 0.1252955198287964, "rewards/accuracies": 0.953125, "rewards/chosen": 27.289596557617188, "rewards/margins": 23.499298095703125, "rewards/rejected": 3.7907333374023438, "step": 2901 }, { "epoch": 1.5020703933747412, "grad_norm": 2.200507879257202, "learning_rate": 5.401822446333332e-06, "loss": 0.22017912566661835, "rewards/accuracies": 0.8828125, "rewards/chosen": 20.77730369567871, "rewards/margins": 18.722625732421875, "rewards/rejected": 2.054032325744629, "step": 2902 }, { "epoch": 1.5025879917184266, "grad_norm": 1.319635272026062, "learning_rate": 5.398978727511553e-06, "loss": 0.20764769613742828, "rewards/accuracies": 0.8984375, "rewards/chosen": 25.232013702392578, "rewards/margins": 21.898422241210938, "rewards/rejected": 3.3313522338867188, "step": 2903 }, { "epoch": 1.5031055900621118, "grad_norm": 0.9924300312995911, "learning_rate": 5.396134878799369e-06, "loss": 0.16893146932125092, "rewards/accuracies": 0.9375, "rewards/chosen": 23.166400909423828, "rewards/margins": 20.514022827148438, "rewards/rejected": 2.649866819381714, "step": 2904 }, { "epoch": 1.5036231884057971, "grad_norm": 0.6998314261436462, "learning_rate": 5.393290901122622e-06, "loss": 0.14400872588157654, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.448593139648438, "rewards/margins": 18.203811645507812, "rewards/rejected": 2.245516300201416, "step": 2905 }, { "epoch": 1.5041407867494825, "grad_norm": 0.8229643106460571, "learning_rate": 5.390446795407184e-06, "loss": 0.1277112066745758, "rewards/accuracies": 0.9609375, "rewards/chosen": 21.96744728088379, "rewards/margins": 18.905792236328125, "rewards/rejected": 3.06195068359375, "step": 2906 }, { "epoch": 1.5046583850931676, "grad_norm": 3.1179206371307373, "learning_rate": 5.387602562578979e-06, "loss": 0.1687813699245453, "rewards/accuracies": 0.9140625, "rewards/chosen": 23.137245178222656, "rewards/margins": 20.40654754638672, "rewards/rejected": 2.7257747650146484, "step": 2907 }, { "epoch": 1.505175983436853, "grad_norm": 1.3891137838363647, "learning_rate": 5.384758203563962e-06, "loss": 0.16560688614845276, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.130664825439453, "rewards/margins": 17.497940063476562, "rewards/rejected": 2.6376686096191406, "step": 2908 }, { "epoch": 1.5056935817805384, "grad_norm": 1.2813560962677002, "learning_rate": 5.381913719288139e-06, "loss": 0.16406600177288055, "rewards/accuracies": 0.9375, "rewards/chosen": 19.59648895263672, "rewards/margins": 17.103439331054688, "rewards/rejected": 2.4933433532714844, "step": 2909 }, { "epoch": 1.5062111801242235, "grad_norm": 0.6337655186653137, "learning_rate": 5.379069110677549e-06, "loss": 0.12100113183259964, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.871097564697266, "rewards/margins": 21.066085815429688, "rewards/rejected": 2.8082046508789062, "step": 2910 }, { "epoch": 1.5067287784679089, "grad_norm": 1.052644968032837, "learning_rate": 5.376224378658279e-06, "loss": 0.17254924774169922, "rewards/accuracies": 0.9296875, "rewards/chosen": 20.62104034423828, "rewards/margins": 17.377426147460938, "rewards/rejected": 3.243776321411133, "step": 2911 }, { "epoch": 1.5072463768115942, "grad_norm": 0.8358239531517029, "learning_rate": 5.373379524156446e-06, "loss": 0.13053598999977112, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.434417724609375, "rewards/margins": 20.403854370117188, "rewards/rejected": 3.022596836090088, "step": 2912 }, { "epoch": 1.5077639751552794, "grad_norm": 1.2201141119003296, "learning_rate": 5.3705345480982194e-06, "loss": 0.16378478705883026, "rewards/accuracies": 0.9140625, "rewards/chosen": 20.828086853027344, "rewards/margins": 17.578462600708008, "rewards/rejected": 3.2425687313079834, "step": 2913 }, { "epoch": 1.508281573498965, "grad_norm": 0.8255484700202942, "learning_rate": 5.3676894514097965e-06, "loss": 0.10742182284593582, "rewards/accuracies": 0.953125, "rewards/chosen": 24.59044075012207, "rewards/margins": 21.065216064453125, "rewards/rejected": 3.530442237854004, "step": 2914 }, { "epoch": 1.50879917184265, "grad_norm": 3.1834371089935303, "learning_rate": 5.364844235017424e-06, "loss": 0.15400460362434387, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.240413665771484, "rewards/margins": 21.249603271484375, "rewards/rejected": 2.9829540252685547, "step": 2915 }, { "epoch": 1.5093167701863353, "grad_norm": 2.084900379180908, "learning_rate": 5.361998899847377e-06, "loss": 0.1712377667427063, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.397964477539062, "rewards/margins": 18.953216552734375, "rewards/rejected": 2.446955680847168, "step": 2916 }, { "epoch": 1.5098343685300208, "grad_norm": 0.96151202917099, "learning_rate": 5.3591534468259784e-06, "loss": 0.13733184337615967, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.61138153076172, "rewards/margins": 19.705780029296875, "rewards/rejected": 1.907257080078125, "step": 2917 }, { "epoch": 1.510351966873706, "grad_norm": 1.776771903038025, "learning_rate": 5.356307876879586e-06, "loss": 0.174674391746521, "rewards/accuracies": 0.921875, "rewards/chosen": 21.058757781982422, "rewards/margins": 18.868797302246094, "rewards/rejected": 2.1908187866210938, "step": 2918 }, { "epoch": 1.5108695652173914, "grad_norm": 5.897755146026611, "learning_rate": 5.353462190934594e-06, "loss": 0.2094716727733612, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.419109344482422, "rewards/margins": 20.101821899414062, "rewards/rejected": 3.324223518371582, "step": 2919 }, { "epoch": 1.5113871635610767, "grad_norm": 1.1747509241104126, "learning_rate": 5.350616389917438e-06, "loss": 0.15208078920841217, "rewards/accuracies": 0.953125, "rewards/chosen": 25.588668823242188, "rewards/margins": 22.199295043945312, "rewards/rejected": 3.3820114135742188, "step": 2920 }, { "epoch": 1.5119047619047619, "grad_norm": 1.0494822263717651, "learning_rate": 5.347770474754588e-06, "loss": 0.17279335856437683, "rewards/accuracies": 0.9140625, "rewards/chosen": 21.307235717773438, "rewards/margins": 19.052276611328125, "rewards/rejected": 2.2592687606811523, "step": 2921 }, { "epoch": 1.5124223602484472, "grad_norm": 0.7746115922927856, "learning_rate": 5.3449244463725516e-06, "loss": 0.11454328894615173, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.228622436523438, "rewards/margins": 21.432998657226562, "rewards/rejected": 2.787320375442505, "step": 2922 }, { "epoch": 1.5129399585921326, "grad_norm": 0.8540987968444824, "learning_rate": 5.3420783056978734e-06, "loss": 0.08270131796598434, "rewards/accuracies": 0.9765625, "rewards/chosen": 27.523677825927734, "rewards/margins": 24.298416137695312, "rewards/rejected": 3.2272682189941406, "step": 2923 }, { "epoch": 1.5134575569358177, "grad_norm": 0.9415895342826843, "learning_rate": 5.339232053657137e-06, "loss": 0.12630876898765564, "rewards/accuracies": 0.9375, "rewards/chosen": 23.89999771118164, "rewards/margins": 21.627960205078125, "rewards/rejected": 2.2739181518554688, "step": 2924 }, { "epoch": 1.513975155279503, "grad_norm": 0.8387445211410522, "learning_rate": 5.336385691176959e-06, "loss": 0.12551553547382355, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.454381942749023, "rewards/margins": 19.70458221435547, "rewards/rejected": 1.7416801452636719, "step": 2925 }, { "epoch": 1.5144927536231885, "grad_norm": 1.1908410787582397, "learning_rate": 5.333539219183995e-06, "loss": 0.1316002905368805, "rewards/accuracies": 0.9453125, "rewards/chosen": 21.346149444580078, "rewards/margins": 19.183074951171875, "rewards/rejected": 2.1590442657470703, "step": 2926 }, { "epoch": 1.5150103519668736, "grad_norm": 1.7296333312988281, "learning_rate": 5.330692638604931e-06, "loss": 0.17537112534046173, "rewards/accuracies": 0.921875, "rewards/chosen": 21.491737365722656, "rewards/margins": 19.726593017578125, "rewards/rejected": 1.7707977294921875, "step": 2927 }, { "epoch": 1.515527950310559, "grad_norm": 1.0702530145645142, "learning_rate": 5.327845950366495e-06, "loss": 0.10367940366268158, "rewards/accuracies": 0.9609375, "rewards/chosen": 24.922897338867188, "rewards/margins": 21.9833984375, "rewards/rejected": 2.946437358856201, "step": 2928 }, { "epoch": 1.5160455486542443, "grad_norm": 1.352831244468689, "learning_rate": 5.324999155395445e-06, "loss": 0.09476654976606369, "rewards/accuracies": 0.953125, "rewards/chosen": 25.29730987548828, "rewards/margins": 22.66064453125, "rewards/rejected": 2.6356048583984375, "step": 2929 }, { "epoch": 1.5165631469979295, "grad_norm": 1.5801897048950195, "learning_rate": 5.322152254618577e-06, "loss": 0.19506338238716125, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.54999542236328, "rewards/margins": 22.69610595703125, "rewards/rejected": 1.8527450561523438, "step": 2930 }, { "epoch": 1.5170807453416149, "grad_norm": 2.694157600402832, "learning_rate": 5.319305248962719e-06, "loss": 0.15506960451602936, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.575244903564453, "rewards/margins": 27.02734375, "rewards/rejected": 3.5534238815307617, "step": 2931 }, { "epoch": 1.5175983436853002, "grad_norm": 1.4762648344039917, "learning_rate": 5.3164581393547346e-06, "loss": 0.06708253175020218, "rewards/accuracies": 0.9921875, "rewards/chosen": 24.30852508544922, "rewards/margins": 22.881103515625, "rewards/rejected": 1.4284248352050781, "step": 2932 }, { "epoch": 1.5181159420289854, "grad_norm": 1.6954388618469238, "learning_rate": 5.31361092672152e-06, "loss": 0.16050365567207336, "rewards/accuracies": 0.9296875, "rewards/chosen": 21.358835220336914, "rewards/margins": 19.889663696289062, "rewards/rejected": 1.4704418182373047, "step": 2933 }, { "epoch": 1.518633540372671, "grad_norm": 1.0832325220108032, "learning_rate": 5.310763611990007e-06, "loss": 0.09551818668842316, "rewards/accuracies": 0.953125, "rewards/chosen": 24.658184051513672, "rewards/margins": 23.069610595703125, "rewards/rejected": 1.5920071601867676, "step": 2934 }, { "epoch": 1.519151138716356, "grad_norm": 0.9367673397064209, "learning_rate": 5.307916196087161e-06, "loss": 0.10899552702903748, "rewards/accuracies": 0.9375, "rewards/chosen": 25.355117797851562, "rewards/margins": 23.7156982421875, "rewards/rejected": 1.641510009765625, "step": 2935 }, { "epoch": 1.5196687370600412, "grad_norm": 1.8476665019989014, "learning_rate": 5.305068679939975e-06, "loss": 0.156165212392807, "rewards/accuracies": 0.9375, "rewards/chosen": 23.090286254882812, "rewards/margins": 21.735595703125, "rewards/rejected": 1.3592994213104248, "step": 2936 }, { "epoch": 1.5201863354037268, "grad_norm": 2.373039484024048, "learning_rate": 5.302221064475481e-06, "loss": 0.2685510516166687, "rewards/accuracies": 0.890625, "rewards/chosen": 23.423709869384766, "rewards/margins": 20.960845947265625, "rewards/rejected": 2.4575459957122803, "step": 2937 }, { "epoch": 1.520703933747412, "grad_norm": 1.609294056892395, "learning_rate": 5.29937335062074e-06, "loss": 0.14872679114341736, "rewards/accuracies": 0.90625, "rewards/chosen": 25.31402587890625, "rewards/margins": 23.031494140625, "rewards/rejected": 2.2797698974609375, "step": 2938 }, { "epoch": 1.5212215320910973, "grad_norm": 1.7412364482879639, "learning_rate": 5.296525539302847e-06, "loss": 0.1415122151374817, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.0169620513916, "rewards/margins": 23.081405639648438, "rewards/rejected": 1.939168930053711, "step": 2939 }, { "epoch": 1.5217391304347827, "grad_norm": 1.324318528175354, "learning_rate": 5.293677631448925e-06, "loss": 0.1597679853439331, "rewards/accuracies": 0.921875, "rewards/chosen": 24.786073684692383, "rewards/margins": 23.55035400390625, "rewards/rejected": 1.2438850402832031, "step": 2940 }, { "epoch": 1.5222567287784678, "grad_norm": 1.2858918905258179, "learning_rate": 5.290829627986133e-06, "loss": 0.10183901339769363, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.69268798828125, "rewards/margins": 22.956207275390625, "rewards/rejected": 1.7437894344329834, "step": 2941 }, { "epoch": 1.5227743271221532, "grad_norm": 0.9895585775375366, "learning_rate": 5.287981529841657e-06, "loss": 0.13557946681976318, "rewards/accuracies": 0.90625, "rewards/chosen": 24.04525375366211, "rewards/margins": 21.806427001953125, "rewards/rejected": 2.246593475341797, "step": 2942 }, { "epoch": 1.5232919254658386, "grad_norm": 2.059401750564575, "learning_rate": 5.2851333379427194e-06, "loss": 0.18141929805278778, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.788593292236328, "rewards/margins": 22.876800537109375, "rewards/rejected": 1.910573959350586, "step": 2943 }, { "epoch": 1.5238095238095237, "grad_norm": 1.221596121788025, "learning_rate": 5.282285053216567e-06, "loss": 0.14013616740703583, "rewards/accuracies": 0.9375, "rewards/chosen": 21.56586456298828, "rewards/margins": 21.047775268554688, "rewards/rejected": 0.518182635307312, "step": 2944 }, { "epoch": 1.524327122153209, "grad_norm": 1.5946924686431885, "learning_rate": 5.279436676590479e-06, "loss": 0.12066419422626495, "rewards/accuracies": 0.953125, "rewards/chosen": 22.327808380126953, "rewards/margins": 21.014598846435547, "rewards/rejected": 1.3082599639892578, "step": 2945 }, { "epoch": 1.5248447204968945, "grad_norm": 2.089061737060547, "learning_rate": 5.276588208991766e-06, "loss": 0.17703774571418762, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.387020111083984, "rewards/margins": 22.5802001953125, "rewards/rejected": 1.8078956604003906, "step": 2946 }, { "epoch": 1.5253623188405796, "grad_norm": 1.0869140625, "learning_rate": 5.273739651347768e-06, "loss": 0.16460928320884705, "rewards/accuracies": 0.90625, "rewards/chosen": 23.933570861816406, "rewards/margins": 22.400970458984375, "rewards/rejected": 1.5317935943603516, "step": 2947 }, { "epoch": 1.525879917184265, "grad_norm": 1.3825056552886963, "learning_rate": 5.270891004585851e-06, "loss": 0.185703307390213, "rewards/accuracies": 0.9375, "rewards/chosen": 21.854244232177734, "rewards/margins": 20.341339111328125, "rewards/rejected": 1.5110969543457031, "step": 2948 }, { "epoch": 1.5263975155279503, "grad_norm": 1.4131848812103271, "learning_rate": 5.268042269633416e-06, "loss": 0.1464276909828186, "rewards/accuracies": 0.9375, "rewards/chosen": 26.30826187133789, "rewards/margins": 23.955917358398438, "rewards/rejected": 2.357271194458008, "step": 2949 }, { "epoch": 1.5269151138716355, "grad_norm": 1.5869505405426025, "learning_rate": 5.265193447417886e-06, "loss": 0.15081891417503357, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.30612564086914, "rewards/margins": 20.7398681640625, "rewards/rejected": 1.5739898681640625, "step": 2950 }, { "epoch": 1.527432712215321, "grad_norm": 1.1892924308776855, "learning_rate": 5.262344538866716e-06, "loss": 0.10712622106075287, "rewards/accuracies": 0.953125, "rewards/chosen": 27.02165412902832, "rewards/margins": 24.845703125, "rewards/rejected": 2.1788721084594727, "step": 2951 }, { "epoch": 1.5279503105590062, "grad_norm": 1.2447668313980103, "learning_rate": 5.259495544907392e-06, "loss": 0.1491764485836029, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.276424407958984, "rewards/margins": 21.134361267089844, "rewards/rejected": 1.1472015380859375, "step": 2952 }, { "epoch": 1.5284679089026914, "grad_norm": 0.725386917591095, "learning_rate": 5.256646466467419e-06, "loss": 0.13488906621932983, "rewards/accuracies": 0.96875, "rewards/chosen": 22.786779403686523, "rewards/margins": 21.514617919921875, "rewards/rejected": 1.2723045349121094, "step": 2953 }, { "epoch": 1.528985507246377, "grad_norm": 0.9950641393661499, "learning_rate": 5.253797304474341e-06, "loss": 0.20352083444595337, "rewards/accuracies": 0.890625, "rewards/chosen": 23.558475494384766, "rewards/margins": 21.009628295898438, "rewards/rejected": 2.546567916870117, "step": 2954 }, { "epoch": 1.529503105590062, "grad_norm": 0.9533541798591614, "learning_rate": 5.250948059855718e-06, "loss": 0.1447133719921112, "rewards/accuracies": 0.9375, "rewards/chosen": 19.96526527404785, "rewards/margins": 18.682235717773438, "rewards/rejected": 1.2852516174316406, "step": 2955 }, { "epoch": 1.5300207039337475, "grad_norm": 0.7551975250244141, "learning_rate": 5.248098733539147e-06, "loss": 0.14696648716926575, "rewards/accuracies": 0.9140625, "rewards/chosen": 19.823654174804688, "rewards/margins": 18.231224060058594, "rewards/rejected": 1.590545892715454, "step": 2956 }, { "epoch": 1.5305383022774328, "grad_norm": 0.7435744404792786, "learning_rate": 5.245249326452242e-06, "loss": 0.161767840385437, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.7113037109375, "rewards/margins": 22.21173095703125, "rewards/rejected": 2.5013160705566406, "step": 2957 }, { "epoch": 1.531055900621118, "grad_norm": 0.703082263469696, "learning_rate": 5.2423998395226514e-06, "loss": 0.14013636112213135, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.04821014404297, "rewards/margins": 20.52959442138672, "rewards/rejected": 1.522085189819336, "step": 2958 }, { "epoch": 1.5315734989648033, "grad_norm": 0.7303730845451355, "learning_rate": 5.239550273678045e-06, "loss": 0.10650425404310226, "rewards/accuracies": 0.953125, "rewards/chosen": 25.823871612548828, "rewards/margins": 23.643295288085938, "rewards/rejected": 2.17718505859375, "step": 2959 }, { "epoch": 1.5320910973084887, "grad_norm": 0.5096721053123474, "learning_rate": 5.236700629846119e-06, "loss": 0.09061506390571594, "rewards/accuracies": 0.9609375, "rewards/chosen": 23.359508514404297, "rewards/margins": 21.630287170410156, "rewards/rejected": 1.725864291191101, "step": 2960 }, { "epoch": 1.5326086956521738, "grad_norm": 0.7642648220062256, "learning_rate": 5.233850908954596e-06, "loss": 0.14783495664596558, "rewards/accuracies": 0.9375, "rewards/chosen": 23.233579635620117, "rewards/margins": 20.838951110839844, "rewards/rejected": 2.390568733215332, "step": 2961 }, { "epoch": 1.5331262939958592, "grad_norm": 1.248700499534607, "learning_rate": 5.231001111931223e-06, "loss": 0.165645569562912, "rewards/accuracies": 0.921875, "rewards/chosen": 25.28268814086914, "rewards/margins": 22.626800537109375, "rewards/rejected": 2.650747060775757, "step": 2962 }, { "epoch": 1.5336438923395446, "grad_norm": 0.6341217756271362, "learning_rate": 5.2281512397037734e-06, "loss": 0.1604003757238388, "rewards/accuracies": 0.921875, "rewards/chosen": 20.548492431640625, "rewards/margins": 18.539398193359375, "rewards/rejected": 2.0057759284973145, "step": 2963 }, { "epoch": 1.5341614906832297, "grad_norm": 0.7891669869422913, "learning_rate": 5.225301293200042e-06, "loss": 0.15259355306625366, "rewards/accuracies": 0.9140625, "rewards/chosen": 24.224822998046875, "rewards/margins": 21.318893432617188, "rewards/rejected": 2.9073715209960938, "step": 2964 }, { "epoch": 1.534679089026915, "grad_norm": 0.5756915807723999, "learning_rate": 5.2224512733478495e-06, "loss": 0.1210215836763382, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.777870178222656, "rewards/margins": 24.008270263671875, "rewards/rejected": 3.769113540649414, "step": 2965 }, { "epoch": 1.5351966873706004, "grad_norm": 0.42870211601257324, "learning_rate": 5.219601181075041e-06, "loss": 0.07765312492847443, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.462831497192383, "rewards/margins": 23.40185546875, "rewards/rejected": 3.0562407970428467, "step": 2966 }, { "epoch": 1.5357142857142856, "grad_norm": 0.5362627506256104, "learning_rate": 5.216751017309486e-06, "loss": 0.09112793952226639, "rewards/accuracies": 0.9765625, "rewards/chosen": 32.254913330078125, "rewards/margins": 27.72730255126953, "rewards/rejected": 4.525265693664551, "step": 2967 }, { "epoch": 1.5362318840579712, "grad_norm": 0.6230210065841675, "learning_rate": 5.2139007829790725e-06, "loss": 0.09568273276090622, "rewards/accuracies": 0.984375, "rewards/chosen": 30.30951690673828, "rewards/margins": 25.468582153320312, "rewards/rejected": 4.835819244384766, "step": 2968 }, { "epoch": 1.5367494824016563, "grad_norm": 0.6296722292900085, "learning_rate": 5.2110504790117174e-06, "loss": 0.09067650139331818, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.848405838012695, "rewards/margins": 23.639816284179688, "rewards/rejected": 5.215082168579102, "step": 2969 }, { "epoch": 1.5372670807453415, "grad_norm": 2.8091135025024414, "learning_rate": 5.208200106335356e-06, "loss": 0.18929991126060486, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.40699005126953, "rewards/margins": 22.88104248046875, "rewards/rejected": 3.527679443359375, "step": 2970 }, { "epoch": 1.537784679089027, "grad_norm": 3.9618964195251465, "learning_rate": 5.2053496658779505e-06, "loss": 0.23668411374092102, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.863271713256836, "rewards/margins": 26.005279541015625, "rewards/rejected": 4.857641220092773, "step": 2971 }, { "epoch": 1.5383022774327122, "grad_norm": 1.335516333580017, "learning_rate": 5.202499158567478e-06, "loss": 0.17352962493896484, "rewards/accuracies": 0.90625, "rewards/chosen": 29.934349060058594, "rewards/margins": 25.043548583984375, "rewards/rejected": 4.8979692459106445, "step": 2972 }, { "epoch": 1.5388198757763976, "grad_norm": 3.002077102661133, "learning_rate": 5.199648585331946e-06, "loss": 0.18891151249408722, "rewards/accuracies": 0.921875, "rewards/chosen": 31.901050567626953, "rewards/margins": 26.848159790039062, "rewards/rejected": 5.048606872558594, "step": 2973 }, { "epoch": 1.539337474120083, "grad_norm": 1.5486695766448975, "learning_rate": 5.196797947099376e-06, "loss": 0.19973020255565643, "rewards/accuracies": 0.875, "rewards/chosen": 27.609689712524414, "rewards/margins": 23.07671356201172, "rewards/rejected": 4.537144660949707, "step": 2974 }, { "epoch": 1.539855072463768, "grad_norm": 2.644965171813965, "learning_rate": 5.1939472447978155e-06, "loss": 0.11747385561466217, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.797801971435547, "rewards/margins": 25.8287353515625, "rewards/rejected": 4.9602580070495605, "step": 2975 }, { "epoch": 1.5403726708074534, "grad_norm": 1.7732211351394653, "learning_rate": 5.191096479355332e-06, "loss": 0.22475358843803406, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.962722778320312, "rewards/margins": 24.562744140625, "rewards/rejected": 4.401524066925049, "step": 2976 }, { "epoch": 1.5408902691511388, "grad_norm": 1.1949224472045898, "learning_rate": 5.18824565170001e-06, "loss": 0.17736493051052094, "rewards/accuracies": 0.90625, "rewards/chosen": 30.153881072998047, "rewards/margins": 25.05877685546875, "rewards/rejected": 5.084880828857422, "step": 2977 }, { "epoch": 1.541407867494824, "grad_norm": 2.5858724117279053, "learning_rate": 5.18539476275996e-06, "loss": 0.16343051195144653, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.7474365234375, "rewards/margins": 21.483184814453125, "rewards/rejected": 4.2600860595703125, "step": 2978 }, { "epoch": 1.5419254658385093, "grad_norm": 0.7617681622505188, "learning_rate": 5.182543813463308e-06, "loss": 0.09297747910022736, "rewards/accuracies": 0.9765625, "rewards/chosen": 32.998085021972656, "rewards/margins": 27.25091552734375, "rewards/rejected": 5.751350402832031, "step": 2979 }, { "epoch": 1.5424430641821947, "grad_norm": 1.1316115856170654, "learning_rate": 5.179692804738201e-06, "loss": 0.13495776057243347, "rewards/accuracies": 0.9609375, "rewards/chosen": 29.456825256347656, "rewards/margins": 25.166839599609375, "rewards/rejected": 4.296104431152344, "step": 2980 }, { "epoch": 1.5429606625258798, "grad_norm": 1.3098089694976807, "learning_rate": 5.176841737512804e-06, "loss": 0.16010820865631104, "rewards/accuracies": 0.921875, "rewards/chosen": 28.83907127380371, "rewards/margins": 23.989364624023438, "rewards/rejected": 4.848384857177734, "step": 2981 }, { "epoch": 1.5434782608695652, "grad_norm": 0.6301785707473755, "learning_rate": 5.173990612715306e-06, "loss": 0.11406679451465607, "rewards/accuracies": 0.9375, "rewards/chosen": 27.77259063720703, "rewards/margins": 24.026962280273438, "rewards/rejected": 3.7435030937194824, "step": 2982 }, { "epoch": 1.5439958592132506, "grad_norm": 1.386304497718811, "learning_rate": 5.171139431273908e-06, "loss": 0.19801978766918182, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.020048141479492, "rewards/margins": 24.98253631591797, "rewards/rejected": 4.039405822753906, "step": 2983 }, { "epoch": 1.5445134575569357, "grad_norm": 1.192702054977417, "learning_rate": 5.168288194116836e-06, "loss": 0.1198723241686821, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.77417755126953, "rewards/margins": 22.056396484375, "rewards/rejected": 3.713909149169922, "step": 2984 }, { "epoch": 1.545031055900621, "grad_norm": 0.986355185508728, "learning_rate": 5.165436902172328e-06, "loss": 0.1780650019645691, "rewards/accuracies": 0.9375, "rewards/chosen": 28.537437438964844, "rewards/margins": 24.339202880859375, "rewards/rejected": 4.203205108642578, "step": 2985 }, { "epoch": 1.5455486542443064, "grad_norm": 1.3496570587158203, "learning_rate": 5.162585556368643e-06, "loss": 0.2066115438938141, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.118881225585938, "rewards/margins": 21.6976318359375, "rewards/rejected": 3.4182395935058594, "step": 2986 }, { "epoch": 1.5460662525879916, "grad_norm": 1.8543686866760254, "learning_rate": 5.1597341576340576e-06, "loss": 0.19225460290908813, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.761947631835938, "rewards/margins": 22.728439331054688, "rewards/rejected": 4.0254669189453125, "step": 2987 }, { "epoch": 1.5465838509316772, "grad_norm": 0.9259017705917358, "learning_rate": 5.156882706896867e-06, "loss": 0.13615205883979797, "rewards/accuracies": 0.9375, "rewards/chosen": 28.530052185058594, "rewards/margins": 24.50982666015625, "rewards/rejected": 4.019889831542969, "step": 2988 }, { "epoch": 1.5471014492753623, "grad_norm": 6.185041904449463, "learning_rate": 5.154031205085377e-06, "loss": 0.11440268158912659, "rewards/accuracies": 0.96875, "rewards/chosen": 29.900131225585938, "rewards/margins": 26.54339599609375, "rewards/rejected": 3.3503522872924805, "step": 2989 }, { "epoch": 1.5476190476190477, "grad_norm": 0.6545832753181458, "learning_rate": 5.151179653127919e-06, "loss": 0.13372541964054108, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.34347915649414, "rewards/margins": 22.43841552734375, "rewards/rejected": 2.9100029468536377, "step": 2990 }, { "epoch": 1.548136645962733, "grad_norm": 1.964163899421692, "learning_rate": 5.148328051952835e-06, "loss": 0.17151661217212677, "rewards/accuracies": 0.921875, "rewards/chosen": 25.274234771728516, "rewards/margins": 22.386611938476562, "rewards/rejected": 2.8825888633728027, "step": 2991 }, { "epoch": 1.5486542443064182, "grad_norm": 0.8761782050132751, "learning_rate": 5.145476402488483e-06, "loss": 0.10305439680814743, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.688493728637695, "rewards/margins": 24.604934692382812, "rewards/rejected": 3.0805435180664062, "step": 2992 }, { "epoch": 1.5491718426501035, "grad_norm": 0.8193164467811584, "learning_rate": 5.142624705663242e-06, "loss": 0.14425526559352875, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.29200553894043, "rewards/margins": 22.614654541015625, "rewards/rejected": 2.6802663803100586, "step": 2993 }, { "epoch": 1.549689440993789, "grad_norm": 0.9777394533157349, "learning_rate": 5.139772962405496e-06, "loss": 0.1179203912615776, "rewards/accuracies": 0.921875, "rewards/chosen": 29.790328979492188, "rewards/margins": 26.76910400390625, "rewards/rejected": 3.0239028930664062, "step": 2994 }, { "epoch": 1.550207039337474, "grad_norm": 3.5376179218292236, "learning_rate": 5.1369211736436565e-06, "loss": 0.2407812476158142, "rewards/accuracies": 0.8828125, "rewards/chosen": 23.554523468017578, "rewards/margins": 20.609848022460938, "rewards/rejected": 2.9424946308135986, "step": 2995 }, { "epoch": 1.5507246376811594, "grad_norm": 1.4007983207702637, "learning_rate": 5.13406934030614e-06, "loss": 0.18236367404460907, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.75, "rewards/margins": 21.49490737915039, "rewards/rejected": 1.2562193870544434, "step": 2996 }, { "epoch": 1.5512422360248448, "grad_norm": 0.7169830203056335, "learning_rate": 5.131217463321384e-06, "loss": 0.12287828326225281, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.869651794433594, "rewards/margins": 21.431365966796875, "rewards/rejected": 2.4309797286987305, "step": 2997 }, { "epoch": 1.55175983436853, "grad_norm": 0.9669277667999268, "learning_rate": 5.128365543617836e-06, "loss": 0.1772647202014923, "rewards/accuracies": 0.90625, "rewards/chosen": 24.160799026489258, "rewards/margins": 21.727798461914062, "rewards/rejected": 2.4383544921875, "step": 2998 }, { "epoch": 1.5522774327122153, "grad_norm": 1.1845238208770752, "learning_rate": 5.12551358212396e-06, "loss": 0.14636752009391785, "rewards/accuracies": 0.921875, "rewards/chosen": 26.53738784790039, "rewards/margins": 23.846141815185547, "rewards/rejected": 2.697284698486328, "step": 2999 }, { "epoch": 1.5527950310559007, "grad_norm": 0.9327467083930969, "learning_rate": 5.122661579768232e-06, "loss": 0.14088153839111328, "rewards/accuracies": 0.953125, "rewards/chosen": 23.792835235595703, "rewards/margins": 21.257240295410156, "rewards/rejected": 2.5394020080566406, "step": 3000 }, { "epoch": 1.5533126293995858, "grad_norm": 0.6897130608558655, "learning_rate": 5.119809537479143e-06, "loss": 0.10572028160095215, "rewards/accuracies": 0.9375, "rewards/chosen": 30.68108367919922, "rewards/margins": 27.4769287109375, "rewards/rejected": 3.191864013671875, "step": 3001 }, { "epoch": 1.5538302277432712, "grad_norm": 2.600597381591797, "learning_rate": 5.116957456185193e-06, "loss": 0.14036130905151367, "rewards/accuracies": 0.9375, "rewards/chosen": 26.940521240234375, "rewards/margins": 23.690216064453125, "rewards/rejected": 3.254680633544922, "step": 3002 }, { "epoch": 1.5543478260869565, "grad_norm": 0.5251803398132324, "learning_rate": 5.114105336814902e-06, "loss": 0.06621946394443512, "rewards/accuracies": 0.9765625, "rewards/chosen": 31.591148376464844, "rewards/margins": 27.55853271484375, "rewards/rejected": 4.028806686401367, "step": 3003 }, { "epoch": 1.5548654244306417, "grad_norm": 0.9504756331443787, "learning_rate": 5.111253180296795e-06, "loss": 0.11954295635223389, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.436859130859375, "rewards/margins": 23.762542724609375, "rewards/rejected": 2.666950225830078, "step": 3004 }, { "epoch": 1.5553830227743273, "grad_norm": 1.2597252130508423, "learning_rate": 5.108400987559412e-06, "loss": 0.16332465410232544, "rewards/accuracies": 0.9140625, "rewards/chosen": 25.888059616088867, "rewards/margins": 22.717391967773438, "rewards/rejected": 3.164396286010742, "step": 3005 }, { "epoch": 1.5559006211180124, "grad_norm": 1.473522424697876, "learning_rate": 5.1055487595313065e-06, "loss": 0.1613585650920868, "rewards/accuracies": 0.9375, "rewards/chosen": 30.08236312866211, "rewards/margins": 26.859939575195312, "rewards/rejected": 3.220059394836426, "step": 3006 }, { "epoch": 1.5564182194616976, "grad_norm": 0.8777885437011719, "learning_rate": 5.102696497141041e-06, "loss": 0.12234008312225342, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.672992706298828, "rewards/margins": 23.770721435546875, "rewards/rejected": 2.90850830078125, "step": 3007 }, { "epoch": 1.5569358178053831, "grad_norm": 1.1389023065567017, "learning_rate": 5.099844201317192e-06, "loss": 0.1263105273246765, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.191747665405273, "rewards/margins": 22.877044677734375, "rewards/rejected": 3.311309814453125, "step": 3008 }, { "epoch": 1.5574534161490683, "grad_norm": 0.9651073813438416, "learning_rate": 5.096991872988341e-06, "loss": 0.16458049416542053, "rewards/accuracies": 0.921875, "rewards/chosen": 26.79361915588379, "rewards/margins": 23.972442626953125, "rewards/rejected": 2.820880889892578, "step": 3009 }, { "epoch": 1.5579710144927537, "grad_norm": 1.9671645164489746, "learning_rate": 5.0941395130830886e-06, "loss": 0.20111088454723358, "rewards/accuracies": 0.8984375, "rewards/chosen": 26.96613311767578, "rewards/margins": 23.251983642578125, "rewards/rejected": 3.701505184173584, "step": 3010 }, { "epoch": 1.558488612836439, "grad_norm": 0.9218704700469971, "learning_rate": 5.0912871225300355e-06, "loss": 0.11793290078639984, "rewards/accuracies": 0.953125, "rewards/chosen": 29.197628021240234, "rewards/margins": 25.064498901367188, "rewards/rejected": 4.132858753204346, "step": 3011 }, { "epoch": 1.5590062111801242, "grad_norm": 1.1617012023925781, "learning_rate": 5.088434702257804e-06, "loss": 0.14691025018692017, "rewards/accuracies": 0.8828125, "rewards/chosen": 28.377330780029297, "rewards/margins": 24.602508544921875, "rewards/rejected": 3.769833564758301, "step": 3012 }, { "epoch": 1.5595238095238095, "grad_norm": 1.254900574684143, "learning_rate": 5.0855822531950174e-06, "loss": 0.12096868455410004, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.086944580078125, "rewards/margins": 25.388824462890625, "rewards/rejected": 4.695949554443359, "step": 3013 }, { "epoch": 1.560041407867495, "grad_norm": 1.16552734375, "learning_rate": 5.0827297762703095e-06, "loss": 0.11797209829092026, "rewards/accuracies": 0.953125, "rewards/chosen": 27.786727905273438, "rewards/margins": 24.198028564453125, "rewards/rejected": 3.5932235717773438, "step": 3014 }, { "epoch": 1.56055900621118, "grad_norm": 0.8624827861785889, "learning_rate": 5.079877272412328e-06, "loss": 0.09273920953273773, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.57673645019531, "rewards/margins": 27.94647216796875, "rewards/rejected": 4.627326011657715, "step": 3015 }, { "epoch": 1.5610766045548654, "grad_norm": 1.214845895767212, "learning_rate": 5.077024742549724e-06, "loss": 0.16966703534126282, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.32270050048828, "rewards/margins": 23.280181884765625, "rewards/rejected": 4.044039726257324, "step": 3016 }, { "epoch": 1.5615942028985508, "grad_norm": 1.425301432609558, "learning_rate": 5.074172187611159e-06, "loss": 0.1739344745874405, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.346965789794922, "rewards/margins": 24.547607421875, "rewards/rejected": 4.799077987670898, "step": 3017 }, { "epoch": 1.562111801242236, "grad_norm": 1.0801500082015991, "learning_rate": 5.071319608525304e-06, "loss": 0.14422950148582458, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.977752685546875, "rewards/margins": 24.976806640625, "rewards/rejected": 4.002349853515625, "step": 3018 }, { "epoch": 1.5626293995859213, "grad_norm": 1.308341145515442, "learning_rate": 5.0684670062208365e-06, "loss": 0.1501014530658722, "rewards/accuracies": 0.953125, "rewards/chosen": 31.314071655273438, "rewards/margins": 25.310317993164062, "rewards/rejected": 6.002655029296875, "step": 3019 }, { "epoch": 1.5631469979296067, "grad_norm": 1.0358576774597168, "learning_rate": 5.06561438162644e-06, "loss": 0.15865211188793182, "rewards/accuracies": 0.921875, "rewards/chosen": 28.196943283081055, "rewards/margins": 24.20013427734375, "rewards/rejected": 3.9953460693359375, "step": 3020 }, { "epoch": 1.5636645962732918, "grad_norm": 2.872534990310669, "learning_rate": 5.06276173567081e-06, "loss": 0.2338014841079712, "rewards/accuracies": 0.8828125, "rewards/chosen": 27.030590057373047, "rewards/margins": 22.62103271484375, "rewards/rejected": 4.418285369873047, "step": 3021 }, { "epoch": 1.5641821946169774, "grad_norm": 1.5933207273483276, "learning_rate": 5.059909069282643e-06, "loss": 0.18197014927864075, "rewards/accuracies": 0.921875, "rewards/chosen": 30.508331298828125, "rewards/margins": 24.876266479492188, "rewards/rejected": 5.635503768920898, "step": 3022 }, { "epoch": 1.5646997929606625, "grad_norm": 1.476940631866455, "learning_rate": 5.057056383390648e-06, "loss": 0.14971870183944702, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.253982543945312, "rewards/margins": 23.779342651367188, "rewards/rejected": 4.472831726074219, "step": 3023 }, { "epoch": 1.5652173913043477, "grad_norm": 1.2931042909622192, "learning_rate": 5.054203678923535e-06, "loss": 0.14451110363006592, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.04265594482422, "rewards/margins": 28.895278930664062, "rewards/rejected": 6.141992092132568, "step": 3024 }, { "epoch": 1.5657349896480333, "grad_norm": 1.049500823020935, "learning_rate": 5.0513509568100235e-06, "loss": 0.1440989077091217, "rewards/accuracies": 0.9375, "rewards/chosen": 28.310699462890625, "rewards/margins": 22.900848388671875, "rewards/rejected": 5.409820556640625, "step": 3025 }, { "epoch": 1.5662525879917184, "grad_norm": 1.3534517288208008, "learning_rate": 5.048498217978839e-06, "loss": 0.17739850282669067, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.895526885986328, "rewards/margins": 24.931663513183594, "rewards/rejected": 5.955760478973389, "step": 3026 }, { "epoch": 1.5667701863354038, "grad_norm": 1.2994941473007202, "learning_rate": 5.045645463358708e-06, "loss": 0.15470144152641296, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.72685432434082, "rewards/margins": 22.06031036376953, "rewards/rejected": 5.669475555419922, "step": 3027 }, { "epoch": 1.5672877846790891, "grad_norm": 1.235644817352295, "learning_rate": 5.042792693878368e-06, "loss": 0.13407769799232483, "rewards/accuracies": 0.953125, "rewards/chosen": 29.835403442382812, "rewards/margins": 22.985137939453125, "rewards/rejected": 6.8489532470703125, "step": 3028 }, { "epoch": 1.5678053830227743, "grad_norm": 0.7131845951080322, "learning_rate": 5.039939910466558e-06, "loss": 0.10004334896802902, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.04216003417969, "rewards/margins": 25.184967041015625, "rewards/rejected": 6.8560333251953125, "step": 3029 }, { "epoch": 1.5683229813664596, "grad_norm": 1.080107569694519, "learning_rate": 5.0370871140520215e-06, "loss": 0.15481428802013397, "rewards/accuracies": 0.90625, "rewards/chosen": 32.020606994628906, "rewards/margins": 24.413909912109375, "rewards/rejected": 7.611745834350586, "step": 3030 }, { "epoch": 1.568840579710145, "grad_norm": 0.8558033108711243, "learning_rate": 5.0342343055635076e-06, "loss": 0.10648935288190842, "rewards/accuracies": 0.9609375, "rewards/chosen": 29.402130126953125, "rewards/margins": 23.011825561523438, "rewards/rejected": 6.393857955932617, "step": 3031 }, { "epoch": 1.5693581780538302, "grad_norm": 0.8183034062385559, "learning_rate": 5.03138148592977e-06, "loss": 0.1351553499698639, "rewards/accuracies": 0.9375, "rewards/chosen": 31.4540958404541, "rewards/margins": 24.680442810058594, "rewards/rejected": 6.7661590576171875, "step": 3032 }, { "epoch": 1.5698757763975155, "grad_norm": 1.4135265350341797, "learning_rate": 5.028528656079562e-06, "loss": 0.21102002263069153, "rewards/accuracies": 0.890625, "rewards/chosen": 29.574905395507812, "rewards/margins": 23.29461669921875, "rewards/rejected": 6.277935028076172, "step": 3033 }, { "epoch": 1.5703933747412009, "grad_norm": 2.567511558532715, "learning_rate": 5.025675816941646e-06, "loss": 0.1323082447052002, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.313858032226562, "rewards/margins": 21.456573486328125, "rewards/rejected": 5.855457305908203, "step": 3034 }, { "epoch": 1.570910973084886, "grad_norm": 1.0975779294967651, "learning_rate": 5.022822969444781e-06, "loss": 0.13335978984832764, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.497772216796875, "rewards/margins": 23.19586181640625, "rewards/rejected": 7.301105499267578, "step": 3035 }, { "epoch": 1.5714285714285714, "grad_norm": 1.655703067779541, "learning_rate": 5.0199701145177345e-06, "loss": 0.18442119657993317, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.623310089111328, "rewards/margins": 20.317535400390625, "rewards/rejected": 7.3065032958984375, "step": 3036 }, { "epoch": 1.5719461697722568, "grad_norm": 1.1473034620285034, "learning_rate": 5.017117253089272e-06, "loss": 0.15629643201828003, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.61257553100586, "rewards/margins": 23.1727294921875, "rewards/rejected": 7.444316864013672, "step": 3037 }, { "epoch": 1.572463768115942, "grad_norm": 1.441741704940796, "learning_rate": 5.014264386088166e-06, "loss": 0.181766539812088, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.4407958984375, "rewards/margins": 21.80462646484375, "rewards/rejected": 6.632922649383545, "step": 3038 }, { "epoch": 1.5729813664596275, "grad_norm": 2.108638048171997, "learning_rate": 5.0114115144431855e-06, "loss": 0.16003993153572083, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.446807861328125, "rewards/margins": 20.529266357421875, "rewards/rejected": 5.916259765625, "step": 3039 }, { "epoch": 1.5734989648033126, "grad_norm": 0.7095693349838257, "learning_rate": 5.008558639083104e-06, "loss": 0.0925484448671341, "rewards/accuracies": 0.9765625, "rewards/chosen": 30.672088623046875, "rewards/margins": 23.192138671875, "rewards/rejected": 7.475994110107422, "step": 3040 }, { "epoch": 1.5740165631469978, "grad_norm": 1.4121711254119873, "learning_rate": 5.005705760936696e-06, "loss": 0.17792634665966034, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.794418334960938, "rewards/margins": 23.77947998046875, "rewards/rejected": 6.018459320068359, "step": 3041 }, { "epoch": 1.5745341614906834, "grad_norm": 0.6276180744171143, "learning_rate": 5.002852880932736e-06, "loss": 0.1321464627981186, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.42706871032715, "rewards/margins": 22.312149047851562, "rewards/rejected": 6.116260528564453, "step": 3042 }, { "epoch": 1.5750517598343685, "grad_norm": 0.7579807043075562, "learning_rate": 5e-06, "loss": 0.11140227317810059, "rewards/accuracies": 0.953125, "rewards/chosen": 29.898134231567383, "rewards/margins": 22.771026611328125, "rewards/rejected": 7.132408142089844, "step": 3043 }, { "epoch": 1.5755693581780539, "grad_norm": 0.6829304099082947, "learning_rate": 4.997147119067265e-06, "loss": 0.10962985455989838, "rewards/accuracies": 0.9609375, "rewards/chosen": 25.12671661376953, "rewards/margins": 19.9959716796875, "rewards/rejected": 5.133075714111328, "step": 3044 }, { "epoch": 1.5760869565217392, "grad_norm": 0.5666177868843079, "learning_rate": 4.994294239063306e-06, "loss": 0.13161376118659973, "rewards/accuracies": 0.953125, "rewards/chosen": 24.95508575439453, "rewards/margins": 20.168853759765625, "rewards/rejected": 4.783607482910156, "step": 3045 }, { "epoch": 1.5766045548654244, "grad_norm": 0.6169793009757996, "learning_rate": 4.991441360916897e-06, "loss": 0.10885762423276901, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.477386474609375, "rewards/margins": 21.355438232421875, "rewards/rejected": 5.117977142333984, "step": 3046 }, { "epoch": 1.5771221532091098, "grad_norm": 3.2867825031280518, "learning_rate": 4.988588485556815e-06, "loss": 0.22449496388435364, "rewards/accuracies": 0.9140625, "rewards/chosen": 22.539932250976562, "rewards/margins": 18.246482849121094, "rewards/rejected": 4.291820526123047, "step": 3047 }, { "epoch": 1.5776397515527951, "grad_norm": 0.634263813495636, "learning_rate": 4.985735613911836e-06, "loss": 0.12168046832084656, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.28419303894043, "rewards/margins": 21.332366943359375, "rewards/rejected": 4.953195571899414, "step": 3048 }, { "epoch": 1.5781573498964803, "grad_norm": 0.90415358543396, "learning_rate": 4.982882746910729e-06, "loss": 0.1530924141407013, "rewards/accuracies": 0.90625, "rewards/chosen": 25.057571411132812, "rewards/margins": 20.015228271484375, "rewards/rejected": 5.045323848724365, "step": 3049 }, { "epoch": 1.5786749482401656, "grad_norm": 0.8250503540039062, "learning_rate": 4.980029885482266e-06, "loss": 0.08809757977724075, "rewards/accuracies": 0.96875, "rewards/chosen": 27.09942626953125, "rewards/margins": 22.68011474609375, "rewards/rejected": 4.417724609375, "step": 3050 }, { "epoch": 1.579192546583851, "grad_norm": 0.8586633801460266, "learning_rate": 4.977177030555219e-06, "loss": 0.13272574543952942, "rewards/accuracies": 0.96875, "rewards/chosen": 24.994903564453125, "rewards/margins": 20.379608154296875, "rewards/rejected": 4.615383148193359, "step": 3051 }, { "epoch": 1.5797101449275361, "grad_norm": 1.017385482788086, "learning_rate": 4.9743241830583565e-06, "loss": 0.136419415473938, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.89935302734375, "rewards/margins": 21.422805786132812, "rewards/rejected": 4.476815223693848, "step": 3052 }, { "epoch": 1.5802277432712215, "grad_norm": 1.011629343032837, "learning_rate": 4.971471343920439e-06, "loss": 0.09675171971321106, "rewards/accuracies": 0.96875, "rewards/chosen": 27.543731689453125, "rewards/margins": 22.94183349609375, "rewards/rejected": 4.59814453125, "step": 3053 }, { "epoch": 1.5807453416149069, "grad_norm": 1.0909844636917114, "learning_rate": 4.96861851407023e-06, "loss": 0.12510600686073303, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.194210052490234, "rewards/margins": 20.940826416015625, "rewards/rejected": 5.254001617431641, "step": 3054 }, { "epoch": 1.581262939958592, "grad_norm": 1.8639434576034546, "learning_rate": 4.965765694436493e-06, "loss": 0.11505915224552155, "rewards/accuracies": 0.953125, "rewards/chosen": 23.6976261138916, "rewards/margins": 19.737701416015625, "rewards/rejected": 3.9569473266601562, "step": 3055 }, { "epoch": 1.5817805383022774, "grad_norm": 1.957654595375061, "learning_rate": 4.962912885947979e-06, "loss": 0.1842125654220581, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.755050659179688, "rewards/margins": 21.81158447265625, "rewards/rejected": 4.943809509277344, "step": 3056 }, { "epoch": 1.5822981366459627, "grad_norm": 0.6743485331535339, "learning_rate": 4.960060089533443e-06, "loss": 0.078855000436306, "rewards/accuracies": 0.953125, "rewards/chosen": 27.979461669921875, "rewards/margins": 23.505096435546875, "rewards/rejected": 4.472285270690918, "step": 3057 }, { "epoch": 1.582815734989648, "grad_norm": 0.9699082970619202, "learning_rate": 4.9572073061216345e-06, "loss": 0.11766710877418518, "rewards/accuracies": 0.9453125, "rewards/chosen": 25.08698272705078, "rewards/margins": 20.872299194335938, "rewards/rejected": 4.213327407836914, "step": 3058 }, { "epoch": 1.5833333333333335, "grad_norm": 1.3058021068572998, "learning_rate": 4.9543545366412935e-06, "loss": 0.08578820526599884, "rewards/accuracies": 0.96875, "rewards/chosen": 25.997323989868164, "rewards/margins": 22.303634643554688, "rewards/rejected": 3.69134521484375, "step": 3059 }, { "epoch": 1.5838509316770186, "grad_norm": 0.8471659421920776, "learning_rate": 4.951501782021163e-06, "loss": 0.09375563263893127, "rewards/accuracies": 0.953125, "rewards/chosen": 25.191238403320312, "rewards/margins": 21.670135498046875, "rewards/rejected": 3.5245933532714844, "step": 3060 }, { "epoch": 1.5843685300207038, "grad_norm": 1.8058148622512817, "learning_rate": 4.948649043189977e-06, "loss": 0.11953409761190414, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.6868896484375, "rewards/margins": 22.48089599609375, "rewards/rejected": 4.201960563659668, "step": 3061 }, { "epoch": 1.5848861283643894, "grad_norm": 1.0135949850082397, "learning_rate": 4.945796321076467e-06, "loss": 0.08475211262702942, "rewards/accuracies": 0.96875, "rewards/chosen": 28.642526626586914, "rewards/margins": 24.556472778320312, "rewards/rejected": 4.086902141571045, "step": 3062 }, { "epoch": 1.5854037267080745, "grad_norm": 1.456483006477356, "learning_rate": 4.942943616609353e-06, "loss": 0.1656474471092224, "rewards/accuracies": 0.921875, "rewards/chosen": 23.765960693359375, "rewards/margins": 20.63238525390625, "rewards/rejected": 3.1356942653656006, "step": 3063 }, { "epoch": 1.5859213250517599, "grad_norm": 2.9034600257873535, "learning_rate": 4.9400909307173576e-06, "loss": 0.17608945071697235, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.22142791748047, "rewards/margins": 23.51300048828125, "rewards/rejected": 3.7104568481445312, "step": 3064 }, { "epoch": 1.5864389233954452, "grad_norm": 0.7696862816810608, "learning_rate": 4.9372382643291915e-06, "loss": 0.0862932875752449, "rewards/accuracies": 0.96875, "rewards/chosen": 26.605270385742188, "rewards/margins": 22.5062255859375, "rewards/rejected": 4.091960906982422, "step": 3065 }, { "epoch": 1.5869565217391304, "grad_norm": 2.8804194927215576, "learning_rate": 4.934385618373561e-06, "loss": 0.18218542635440826, "rewards/accuracies": 0.90625, "rewards/chosen": 29.481060028076172, "rewards/margins": 25.34552001953125, "rewards/rejected": 4.137091636657715, "step": 3066 }, { "epoch": 1.5874741200828157, "grad_norm": 2.0097832679748535, "learning_rate": 4.931532993779164e-06, "loss": 0.17933547496795654, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.095195770263672, "rewards/margins": 24.50286865234375, "rewards/rejected": 4.593160629272461, "step": 3067 }, { "epoch": 1.587991718426501, "grad_norm": 1.5218331813812256, "learning_rate": 4.928680391474697e-06, "loss": 0.1755332350730896, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.630203247070312, "rewards/margins": 21.299522399902344, "rewards/rejected": 4.328008651733398, "step": 3068 }, { "epoch": 1.5885093167701863, "grad_norm": 0.9664497971534729, "learning_rate": 4.925827812388842e-06, "loss": 0.14338535070419312, "rewards/accuracies": 0.90625, "rewards/chosen": 25.536365509033203, "rewards/margins": 21.37896728515625, "rewards/rejected": 4.153460502624512, "step": 3069 }, { "epoch": 1.5890269151138716, "grad_norm": 1.5235272645950317, "learning_rate": 4.922975257450277e-06, "loss": 0.13169969618320465, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.950672149658203, "rewards/margins": 24.70208740234375, "rewards/rejected": 5.243618965148926, "step": 3070 }, { "epoch": 1.589544513457557, "grad_norm": 0.9995257258415222, "learning_rate": 4.9201227275876745e-06, "loss": 0.12053501605987549, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.789260864257812, "rewards/margins": 24.720611572265625, "rewards/rejected": 5.071282386779785, "step": 3071 }, { "epoch": 1.5900621118012421, "grad_norm": 1.3847109079360962, "learning_rate": 4.917270223729691e-06, "loss": 0.12167972326278687, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.27200698852539, "rewards/margins": 22.906768798828125, "rewards/rejected": 4.3593597412109375, "step": 3072 }, { "epoch": 1.5905797101449275, "grad_norm": 1.036260724067688, "learning_rate": 4.914417746804984e-06, "loss": 0.12833768129348755, "rewards/accuracies": 0.953125, "rewards/chosen": 26.676937103271484, "rewards/margins": 21.830032348632812, "rewards/rejected": 4.844026565551758, "step": 3073 }, { "epoch": 1.5910973084886129, "grad_norm": 1.1429407596588135, "learning_rate": 4.911565297742197e-06, "loss": 0.12136131525039673, "rewards/accuracies": 0.953125, "rewards/chosen": 25.287826538085938, "rewards/margins": 20.77166748046875, "rewards/rejected": 4.511158466339111, "step": 3074 }, { "epoch": 1.591614906832298, "grad_norm": 1.0149096250534058, "learning_rate": 4.908712877469966e-06, "loss": 0.14414210617542267, "rewards/accuracies": 0.953125, "rewards/chosen": 27.2548828125, "rewards/margins": 23.309600830078125, "rewards/rejected": 3.948118209838867, "step": 3075 }, { "epoch": 1.5921325051759836, "grad_norm": 1.0930246114730835, "learning_rate": 4.905860486916914e-06, "loss": 0.1155334860086441, "rewards/accuracies": 0.9375, "rewards/chosen": 28.12133026123047, "rewards/margins": 23.8939208984375, "rewards/rejected": 4.235834121704102, "step": 3076 }, { "epoch": 1.5926501035196687, "grad_norm": 1.0178446769714355, "learning_rate": 4.90300812701166e-06, "loss": 0.12583407759666443, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.14898681640625, "rewards/margins": 25.704925537109375, "rewards/rejected": 5.4514007568359375, "step": 3077 }, { "epoch": 1.5931677018633539, "grad_norm": 3.1416709423065186, "learning_rate": 4.900155798682811e-06, "loss": 0.13375051319599152, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.21975326538086, "rewards/margins": 22.521827697753906, "rewards/rejected": 3.698740243911743, "step": 3078 }, { "epoch": 1.5936853002070395, "grad_norm": 1.590018630027771, "learning_rate": 4.89730350285896e-06, "loss": 0.09859489649534225, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.830215454101562, "rewards/margins": 24.057281494140625, "rewards/rejected": 3.7604827880859375, "step": 3079 }, { "epoch": 1.5942028985507246, "grad_norm": 1.4945281744003296, "learning_rate": 4.894451240468694e-06, "loss": 0.18508300185203552, "rewards/accuracies": 0.90625, "rewards/chosen": 26.09193229675293, "rewards/margins": 20.978561401367188, "rewards/rejected": 5.118282318115234, "step": 3080 }, { "epoch": 1.59472049689441, "grad_norm": 1.7973158359527588, "learning_rate": 4.891599012440589e-06, "loss": 0.1406826376914978, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.01058578491211, "rewards/margins": 21.882568359375, "rewards/rejected": 4.1272735595703125, "step": 3081 }, { "epoch": 1.5952380952380953, "grad_norm": 1.2709012031555176, "learning_rate": 4.888746819703207e-06, "loss": 0.14279961585998535, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.874778747558594, "rewards/margins": 25.508056640625, "rewards/rejected": 5.357501983642578, "step": 3082 }, { "epoch": 1.5957556935817805, "grad_norm": 1.2027539014816284, "learning_rate": 4.8858946631851e-06, "loss": 0.08472096174955368, "rewards/accuracies": 0.9609375, "rewards/chosen": 31.21238136291504, "rewards/margins": 25.211944580078125, "rewards/rejected": 6.000923156738281, "step": 3083 }, { "epoch": 1.5962732919254659, "grad_norm": 2.7773265838623047, "learning_rate": 4.883042543814806e-06, "loss": 0.11399289965629578, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.68118667602539, "rewards/margins": 28.650222778320312, "rewards/rejected": 6.030364990234375, "step": 3084 }, { "epoch": 1.5967908902691512, "grad_norm": 2.4671528339385986, "learning_rate": 4.880190462520859e-06, "loss": 0.23451641201972961, "rewards/accuracies": 0.90625, "rewards/chosen": 29.6806640625, "rewards/margins": 24.15386962890625, "rewards/rejected": 5.520668029785156, "step": 3085 }, { "epoch": 1.5973084886128364, "grad_norm": 1.573524832725525, "learning_rate": 4.877338420231769e-06, "loss": 0.12568898499011993, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.99555206298828, "rewards/margins": 23.71454620361328, "rewards/rejected": 5.279088973999023, "step": 3086 }, { "epoch": 1.5978260869565217, "grad_norm": 2.064876079559326, "learning_rate": 4.874486417876041e-06, "loss": 0.11949630081653595, "rewards/accuracies": 0.953125, "rewards/chosen": 28.670440673828125, "rewards/margins": 24.14959716796875, "rewards/rejected": 4.531351089477539, "step": 3087 }, { "epoch": 1.598343685300207, "grad_norm": 1.0010334253311157, "learning_rate": 4.871634456382166e-06, "loss": 0.11939732730388641, "rewards/accuracies": 0.953125, "rewards/chosen": 26.937259674072266, "rewards/margins": 22.620819091796875, "rewards/rejected": 4.31767463684082, "step": 3088 }, { "epoch": 1.5988612836438922, "grad_norm": 1.0776617527008057, "learning_rate": 4.8687825366786174e-06, "loss": 0.20000720024108887, "rewards/accuracies": 0.921875, "rewards/chosen": 26.714149475097656, "rewards/margins": 22.24994659423828, "rewards/rejected": 4.4619140625, "step": 3089 }, { "epoch": 1.5993788819875776, "grad_norm": 1.162179708480835, "learning_rate": 4.865930659693861e-06, "loss": 0.12370960414409637, "rewards/accuracies": 0.9375, "rewards/chosen": 28.294357299804688, "rewards/margins": 23.333030700683594, "rewards/rejected": 4.951290607452393, "step": 3090 }, { "epoch": 1.599896480331263, "grad_norm": 1.2297098636627197, "learning_rate": 4.863078826356346e-06, "loss": 0.15397858619689941, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.994300842285156, "rewards/margins": 22.94415283203125, "rewards/rejected": 4.051977157592773, "step": 3091 }, { "epoch": 1.6004140786749481, "grad_norm": 1.4686119556427002, "learning_rate": 4.860227037594506e-06, "loss": 0.1767377257347107, "rewards/accuracies": 0.8984375, "rewards/chosen": 24.97118377685547, "rewards/margins": 20.514373779296875, "rewards/rejected": 4.456668853759766, "step": 3092 }, { "epoch": 1.6009316770186337, "grad_norm": 1.23064124584198, "learning_rate": 4.857375294336761e-06, "loss": 0.18466031551361084, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.94314193725586, "rewards/margins": 23.406341552734375, "rewards/rejected": 4.530429840087891, "step": 3093 }, { "epoch": 1.6014492753623188, "grad_norm": 1.191622257232666, "learning_rate": 4.854523597511518e-06, "loss": 0.16450946033000946, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.855022430419922, "rewards/margins": 21.547332763671875, "rewards/rejected": 4.306581497192383, "step": 3094 }, { "epoch": 1.601966873706004, "grad_norm": 1.2093069553375244, "learning_rate": 4.851671948047167e-06, "loss": 0.19368577003479004, "rewards/accuracies": 0.90625, "rewards/chosen": 27.281475067138672, "rewards/margins": 23.283981323242188, "rewards/rejected": 3.9958267211914062, "step": 3095 }, { "epoch": 1.6024844720496896, "grad_norm": 0.913468599319458, "learning_rate": 4.848820346872082e-06, "loss": 0.13363993167877197, "rewards/accuracies": 0.9375, "rewards/chosen": 27.21356964111328, "rewards/margins": 23.417808532714844, "rewards/rejected": 3.800631046295166, "step": 3096 }, { "epoch": 1.6030020703933747, "grad_norm": 0.9674185514450073, "learning_rate": 4.845968794914623e-06, "loss": 0.1394561529159546, "rewards/accuracies": 0.953125, "rewards/chosen": 28.356822967529297, "rewards/margins": 23.981735229492188, "rewards/rejected": 4.374041557312012, "step": 3097 }, { "epoch": 1.60351966873706, "grad_norm": 1.185851812362671, "learning_rate": 4.843117293103136e-06, "loss": 0.17612090706825256, "rewards/accuracies": 0.90625, "rewards/chosen": 25.38522720336914, "rewards/margins": 20.94875717163086, "rewards/rejected": 4.42991828918457, "step": 3098 }, { "epoch": 1.6040372670807455, "grad_norm": 0.8809114694595337, "learning_rate": 4.840265842365944e-06, "loss": 0.10708260536193848, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.064788818359375, "rewards/margins": 23.464569091796875, "rewards/rejected": 4.59819221496582, "step": 3099 }, { "epoch": 1.6045548654244306, "grad_norm": 1.4424554109573364, "learning_rate": 4.8374144436313585e-06, "loss": 0.14842693507671356, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.27173614501953, "rewards/margins": 24.304290771484375, "rewards/rejected": 3.959712505340576, "step": 3100 }, { "epoch": 1.605072463768116, "grad_norm": 1.092394232749939, "learning_rate": 4.8345630978276746e-06, "loss": 0.14552482962608337, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.224140167236328, "rewards/margins": 21.556427001953125, "rewards/rejected": 4.666046142578125, "step": 3101 }, { "epoch": 1.6055900621118013, "grad_norm": 0.6640380620956421, "learning_rate": 4.831711805883165e-06, "loss": 0.11995107680559158, "rewards/accuracies": 0.953125, "rewards/chosen": 25.57166290283203, "rewards/margins": 21.665451049804688, "rewards/rejected": 3.910342216491699, "step": 3102 }, { "epoch": 1.6061076604554865, "grad_norm": 1.0650436878204346, "learning_rate": 4.828860568726092e-06, "loss": 0.12209326028823853, "rewards/accuracies": 0.953125, "rewards/chosen": 30.435623168945312, "rewards/margins": 25.223052978515625, "rewards/rejected": 5.215760231018066, "step": 3103 }, { "epoch": 1.6066252587991718, "grad_norm": 0.6567133665084839, "learning_rate": 4.826009387284696e-06, "loss": 0.14073535799980164, "rewards/accuracies": 0.921875, "rewards/chosen": 31.2185115814209, "rewards/margins": 26.256866455078125, "rewards/rejected": 4.962425231933594, "step": 3104 }, { "epoch": 1.6071428571428572, "grad_norm": 0.9745316505432129, "learning_rate": 4.8231582624871975e-06, "loss": 0.13458237051963806, "rewards/accuracies": 0.921875, "rewards/chosen": 27.540687561035156, "rewards/margins": 23.189178466796875, "rewards/rejected": 4.357158660888672, "step": 3105 }, { "epoch": 1.6076604554865424, "grad_norm": 0.5936928391456604, "learning_rate": 4.8203071952618e-06, "loss": 0.10547538101673126, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.569812774658203, "rewards/margins": 23.699798583984375, "rewards/rejected": 4.867649078369141, "step": 3106 }, { "epoch": 1.6081780538302277, "grad_norm": 1.00244140625, "learning_rate": 4.8174561865366955e-06, "loss": 0.12310466915369034, "rewards/accuracies": 0.9609375, "rewards/chosen": 31.524131774902344, "rewards/margins": 26.48736572265625, "rewards/rejected": 5.039154052734375, "step": 3107 }, { "epoch": 1.608695652173913, "grad_norm": 1.3347737789154053, "learning_rate": 4.814605237240042e-06, "loss": 0.1801636666059494, "rewards/accuracies": 0.90625, "rewards/chosen": 33.6417236328125, "rewards/margins": 28.54168701171875, "rewards/rejected": 5.104461669921875, "step": 3108 }, { "epoch": 1.6092132505175982, "grad_norm": 0.638820469379425, "learning_rate": 4.811754348299991e-06, "loss": 0.07456320524215698, "rewards/accuracies": 0.96875, "rewards/chosen": 40.04856872558594, "rewards/margins": 32.62965393066406, "rewards/rejected": 7.416526794433594, "step": 3109 }, { "epoch": 1.6097308488612836, "grad_norm": 0.8948941826820374, "learning_rate": 4.808903520644669e-06, "loss": 0.12149378657341003, "rewards/accuracies": 0.953125, "rewards/chosen": 35.077369689941406, "rewards/margins": 28.849578857421875, "rewards/rejected": 6.219486236572266, "step": 3110 }, { "epoch": 1.610248447204969, "grad_norm": 0.7547049522399902, "learning_rate": 4.806052755202185e-06, "loss": 0.0973397046327591, "rewards/accuracies": 0.9375, "rewards/chosen": 30.83660125732422, "rewards/margins": 25.126052856445312, "rewards/rejected": 5.70897102355957, "step": 3111 }, { "epoch": 1.610766045548654, "grad_norm": 0.8373998999595642, "learning_rate": 4.803202052900625e-06, "loss": 0.15450608730316162, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.35707092285156, "rewards/margins": 25.839462280273438, "rewards/rejected": 6.522434234619141, "step": 3112 }, { "epoch": 1.6112836438923397, "grad_norm": 0.9721027612686157, "learning_rate": 4.8003514146680565e-06, "loss": 0.162271648645401, "rewards/accuracies": 0.9375, "rewards/chosen": 33.12358856201172, "rewards/margins": 27.28289794921875, "rewards/rejected": 5.845718860626221, "step": 3113 }, { "epoch": 1.6118012422360248, "grad_norm": 0.9570343494415283, "learning_rate": 4.797500841432524e-06, "loss": 0.10899090021848679, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.039161682128906, "rewards/margins": 28.192413330078125, "rewards/rejected": 5.862850189208984, "step": 3114 }, { "epoch": 1.6123188405797102, "grad_norm": 0.855233907699585, "learning_rate": 4.794650334122052e-06, "loss": 0.14191380143165588, "rewards/accuracies": 0.9375, "rewards/chosen": 32.47049331665039, "rewards/margins": 26.227127075195312, "rewards/rejected": 6.2439470291137695, "step": 3115 }, { "epoch": 1.6128364389233956, "grad_norm": 1.0763750076293945, "learning_rate": 4.791799893664645e-06, "loss": 0.14436836540699005, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.5357666015625, "rewards/margins": 26.926773071289062, "rewards/rejected": 6.608786582946777, "step": 3116 }, { "epoch": 1.6133540372670807, "grad_norm": 1.5169256925582886, "learning_rate": 4.788949520988285e-06, "loss": 0.12682616710662842, "rewards/accuracies": 0.9375, "rewards/chosen": 33.38330078125, "rewards/margins": 26.607406616210938, "rewards/rejected": 6.7730393409729, "step": 3117 }, { "epoch": 1.613871635610766, "grad_norm": 1.0270313024520874, "learning_rate": 4.78609921702093e-06, "loss": 0.1509781777858734, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.15787124633789, "rewards/margins": 26.791656494140625, "rewards/rejected": 6.370307922363281, "step": 3118 }, { "epoch": 1.6143892339544514, "grad_norm": 0.7426457405090332, "learning_rate": 4.783248982690515e-06, "loss": 0.09266196191310883, "rewards/accuracies": 0.9609375, "rewards/chosen": 41.851806640625, "rewards/margins": 32.8466796875, "rewards/rejected": 9.01034164428711, "step": 3119 }, { "epoch": 1.6149068322981366, "grad_norm": 2.1405415534973145, "learning_rate": 4.780398818924959e-06, "loss": 0.1348962038755417, "rewards/accuracies": 0.9375, "rewards/chosen": 34.18567657470703, "rewards/margins": 28.271636962890625, "rewards/rejected": 5.904751300811768, "step": 3120 }, { "epoch": 1.615424430641822, "grad_norm": 1.2666865587234497, "learning_rate": 4.777548726652151e-06, "loss": 0.1307527720928192, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.8413200378418, "rewards/margins": 29.79241943359375, "rewards/rejected": 8.043514251708984, "step": 3121 }, { "epoch": 1.6159420289855073, "grad_norm": 2.5124151706695557, "learning_rate": 4.7746987067999595e-06, "loss": 0.13364045321941376, "rewards/accuracies": 0.953125, "rewards/chosen": 37.95310592651367, "rewards/margins": 28.5390625, "rewards/rejected": 9.406835556030273, "step": 3122 }, { "epoch": 1.6164596273291925, "grad_norm": 2.388216018676758, "learning_rate": 4.771848760296227e-06, "loss": 0.22812330722808838, "rewards/accuracies": 0.8828125, "rewards/chosen": 32.662841796875, "rewards/margins": 26.519485473632812, "rewards/rejected": 6.142477035522461, "step": 3123 }, { "epoch": 1.6169772256728778, "grad_norm": 1.4481338262557983, "learning_rate": 4.768998888068778e-06, "loss": 0.14071273803710938, "rewards/accuracies": 0.90625, "rewards/chosen": 37.2615966796875, "rewards/margins": 29.3017578125, "rewards/rejected": 7.962390899658203, "step": 3124 }, { "epoch": 1.6174948240165632, "grad_norm": 2.737950563430786, "learning_rate": 4.7661490910454055e-06, "loss": 0.14397946000099182, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.32326889038086, "rewards/margins": 29.470863342285156, "rewards/rejected": 7.8531646728515625, "step": 3125 }, { "epoch": 1.6180124223602483, "grad_norm": 1.9596118927001953, "learning_rate": 4.763299370153883e-06, "loss": 0.1711270809173584, "rewards/accuracies": 0.90625, "rewards/chosen": 33.95707702636719, "rewards/margins": 27.511322021484375, "rewards/rejected": 6.438259124755859, "step": 3126 }, { "epoch": 1.6185300207039337, "grad_norm": 1.2538000345230103, "learning_rate": 4.760449726321958e-06, "loss": 0.1837916076183319, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.5064697265625, "rewards/margins": 31.786224365234375, "rewards/rejected": 8.726951599121094, "step": 3127 }, { "epoch": 1.619047619047619, "grad_norm": 1.5280019044876099, "learning_rate": 4.75760016047735e-06, "loss": 0.14863185584545135, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.23149108886719, "rewards/margins": 28.738067626953125, "rewards/rejected": 6.4998321533203125, "step": 3128 }, { "epoch": 1.6195652173913042, "grad_norm": 0.8062055110931396, "learning_rate": 4.754750673547759e-06, "loss": 0.14575347304344177, "rewards/accuracies": 0.90625, "rewards/chosen": 36.968360900878906, "rewards/margins": 30.463348388671875, "rewards/rejected": 6.5018310546875, "step": 3129 }, { "epoch": 1.6200828157349898, "grad_norm": 0.82364422082901, "learning_rate": 4.751901266460856e-06, "loss": 0.0926104262471199, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.95558166503906, "rewards/margins": 30.136367797851562, "rewards/rejected": 6.821266174316406, "step": 3130 }, { "epoch": 1.620600414078675, "grad_norm": 0.6177923679351807, "learning_rate": 4.749051940144283e-06, "loss": 0.0934077650308609, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.21931457519531, "rewards/margins": 27.674835205078125, "rewards/rejected": 7.539691925048828, "step": 3131 }, { "epoch": 1.62111801242236, "grad_norm": 1.3640060424804688, "learning_rate": 4.746202695525661e-06, "loss": 0.20309999585151672, "rewards/accuracies": 0.890625, "rewards/chosen": 34.16636657714844, "rewards/margins": 27.341781616210938, "rewards/rejected": 6.822551727294922, "step": 3132 }, { "epoch": 1.6216356107660457, "grad_norm": 0.9935219287872314, "learning_rate": 4.7433535335325805e-06, "loss": 0.17427018284797668, "rewards/accuracies": 0.9375, "rewards/chosen": 35.94050216674805, "rewards/margins": 28.836044311523438, "rewards/rejected": 7.102195739746094, "step": 3133 }, { "epoch": 1.6221532091097308, "grad_norm": 0.6877574324607849, "learning_rate": 4.740504455092611e-06, "loss": 0.10943819582462311, "rewards/accuracies": 0.953125, "rewards/chosen": 34.299034118652344, "rewards/margins": 27.352081298828125, "rewards/rejected": 6.944263458251953, "step": 3134 }, { "epoch": 1.6226708074534162, "grad_norm": 2.7643542289733887, "learning_rate": 4.737655461133285e-06, "loss": 0.213884174823761, "rewards/accuracies": 0.921875, "rewards/chosen": 41.69098663330078, "rewards/margins": 33.8787841796875, "rewards/rejected": 7.808940887451172, "step": 3135 }, { "epoch": 1.6231884057971016, "grad_norm": 1.334741234779358, "learning_rate": 4.734806552582115e-06, "loss": 0.15725219249725342, "rewards/accuracies": 0.9375, "rewards/chosen": 36.18650817871094, "rewards/margins": 29.697200775146484, "rewards/rejected": 6.490774154663086, "step": 3136 }, { "epoch": 1.6237060041407867, "grad_norm": 0.713755190372467, "learning_rate": 4.731957730366588e-06, "loss": 0.10410870611667633, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.931304931640625, "rewards/margins": 26.740219116210938, "rewards/rejected": 6.1871795654296875, "step": 3137 }, { "epoch": 1.624223602484472, "grad_norm": 1.7895033359527588, "learning_rate": 4.72910899541415e-06, "loss": 0.13850682973861694, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.30044937133789, "rewards/margins": 27.087631225585938, "rewards/rejected": 6.216453552246094, "step": 3138 }, { "epoch": 1.6247412008281574, "grad_norm": 2.0562381744384766, "learning_rate": 4.726260348652234e-06, "loss": 0.22187121212482452, "rewards/accuracies": 0.90625, "rewards/chosen": 30.286258697509766, "rewards/margins": 26.06988525390625, "rewards/rejected": 4.2126359939575195, "step": 3139 }, { "epoch": 1.6252587991718426, "grad_norm": 1.4633978605270386, "learning_rate": 4.723411791008236e-06, "loss": 0.12815099954605103, "rewards/accuracies": 0.953125, "rewards/chosen": 31.704673767089844, "rewards/margins": 26.0693359375, "rewards/rejected": 5.638020992279053, "step": 3140 }, { "epoch": 1.625776397515528, "grad_norm": 0.9013000726699829, "learning_rate": 4.720563323409523e-06, "loss": 0.1363769769668579, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.013580322265625, "rewards/margins": 24.7821044921875, "rewards/rejected": 4.227173805236816, "step": 3141 }, { "epoch": 1.6262939958592133, "grad_norm": 1.0413252115249634, "learning_rate": 4.717714946783435e-06, "loss": 0.15407133102416992, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.587961196899414, "rewards/margins": 25.996780395507812, "rewards/rejected": 3.587144136428833, "step": 3142 }, { "epoch": 1.6268115942028984, "grad_norm": 0.8989199995994568, "learning_rate": 4.714866662057282e-06, "loss": 0.16159185767173767, "rewards/accuracies": 0.921875, "rewards/chosen": 32.44017028808594, "rewards/margins": 27.213172912597656, "rewards/rejected": 5.220420837402344, "step": 3143 }, { "epoch": 1.6273291925465838, "grad_norm": 1.5765501260757446, "learning_rate": 4.712018470158345e-06, "loss": 0.0857817679643631, "rewards/accuracies": 0.9765625, "rewards/chosen": 32.05244445800781, "rewards/margins": 27.463027954101562, "rewards/rejected": 4.597068786621094, "step": 3144 }, { "epoch": 1.6278467908902692, "grad_norm": 0.9095469117164612, "learning_rate": 4.709170372013868e-06, "loss": 0.17885394394397736, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.546234130859375, "rewards/margins": 23.559661865234375, "rewards/rejected": 2.9852218627929688, "step": 3145 }, { "epoch": 1.6283643892339543, "grad_norm": 0.9212998151779175, "learning_rate": 4.706322368551076e-06, "loss": 0.14511221647262573, "rewards/accuracies": 0.921875, "rewards/chosen": 28.778160095214844, "rewards/margins": 25.6112060546875, "rewards/rejected": 3.165203094482422, "step": 3146 }, { "epoch": 1.62888198757764, "grad_norm": 1.413196086883545, "learning_rate": 4.703474460697156e-06, "loss": 0.1736820638179779, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.608692169189453, "rewards/margins": 26.326263427734375, "rewards/rejected": 4.281511306762695, "step": 3147 }, { "epoch": 1.629399585921325, "grad_norm": 0.6203754544258118, "learning_rate": 4.7006266493792615e-06, "loss": 0.12089402973651886, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.672361373901367, "rewards/margins": 25.200122833251953, "rewards/rejected": 3.4744529724121094, "step": 3148 }, { "epoch": 1.6299171842650102, "grad_norm": 0.7467247247695923, "learning_rate": 4.69777893552452e-06, "loss": 0.13552318513393402, "rewards/accuracies": 0.921875, "rewards/chosen": 25.86301612854004, "rewards/margins": 22.62397003173828, "rewards/rejected": 3.2427616119384766, "step": 3149 }, { "epoch": 1.6304347826086958, "grad_norm": 1.0808974504470825, "learning_rate": 4.694931320060027e-06, "loss": 0.12377355247735977, "rewards/accuracies": 0.96875, "rewards/chosen": 28.473506927490234, "rewards/margins": 24.443023681640625, "rewards/rejected": 4.036003112792969, "step": 3150 }, { "epoch": 1.630952380952381, "grad_norm": 0.7672107219696045, "learning_rate": 4.692083803912842e-06, "loss": 0.11851707845926285, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.240760803222656, "rewards/margins": 27.345932006835938, "rewards/rejected": 2.893150806427002, "step": 3151 }, { "epoch": 1.6314699792960663, "grad_norm": 1.4999107122421265, "learning_rate": 4.6892363880099935e-06, "loss": 0.12905935943126678, "rewards/accuracies": 0.953125, "rewards/chosen": 27.945552825927734, "rewards/margins": 24.52984619140625, "rewards/rejected": 3.4160823822021484, "step": 3152 }, { "epoch": 1.6319875776397517, "grad_norm": 1.1073098182678223, "learning_rate": 4.686389073278483e-06, "loss": 0.16591432690620422, "rewards/accuracies": 0.9375, "rewards/chosen": 28.785545349121094, "rewards/margins": 25.398529052734375, "rewards/rejected": 3.3864097595214844, "step": 3153 }, { "epoch": 1.6325051759834368, "grad_norm": 0.7490625381469727, "learning_rate": 4.683541860645268e-06, "loss": 0.10878744721412659, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.250385284423828, "rewards/margins": 27.468536376953125, "rewards/rejected": 3.7833290100097656, "step": 3154 }, { "epoch": 1.6330227743271222, "grad_norm": 1.6978323459625244, "learning_rate": 4.680694751037283e-06, "loss": 0.1554105281829834, "rewards/accuracies": 0.953125, "rewards/chosen": 27.248634338378906, "rewards/margins": 24.635604858398438, "rewards/rejected": 2.6178741455078125, "step": 3155 }, { "epoch": 1.6335403726708075, "grad_norm": 3.3150458335876465, "learning_rate": 4.677847745381425e-06, "loss": 0.19071868062019348, "rewards/accuracies": 0.9375, "rewards/chosen": 30.114830017089844, "rewards/margins": 26.809486389160156, "rewards/rejected": 3.3071327209472656, "step": 3156 }, { "epoch": 1.6340579710144927, "grad_norm": 2.2170541286468506, "learning_rate": 4.675000844604557e-06, "loss": 0.13333754241466522, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.10006332397461, "rewards/margins": 25.015594482421875, "rewards/rejected": 3.078136444091797, "step": 3157 }, { "epoch": 1.634575569358178, "grad_norm": 0.6091580390930176, "learning_rate": 4.6721540496335065e-06, "loss": 0.1224307268857956, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.004592895507812, "rewards/margins": 25.552169799804688, "rewards/rejected": 3.4605674743652344, "step": 3158 }, { "epoch": 1.6350931677018634, "grad_norm": 0.9540680050849915, "learning_rate": 4.66930736139507e-06, "loss": 0.16905559599399567, "rewards/accuracies": 0.921875, "rewards/chosen": 25.83941650390625, "rewards/margins": 23.186500549316406, "rewards/rejected": 2.645965814590454, "step": 3159 }, { "epoch": 1.6356107660455486, "grad_norm": 1.5996310710906982, "learning_rate": 4.6664607808160076e-06, "loss": 0.16394838690757751, "rewards/accuracies": 0.9375, "rewards/chosen": 25.02867317199707, "rewards/margins": 22.841796875, "rewards/rejected": 2.195413589477539, "step": 3160 }, { "epoch": 1.636128364389234, "grad_norm": 1.2252323627471924, "learning_rate": 4.663614308823042e-06, "loss": 0.13917337357997894, "rewards/accuracies": 0.953125, "rewards/chosen": 29.493560791015625, "rewards/margins": 26.21567153930664, "rewards/rejected": 3.277872085571289, "step": 3161 }, { "epoch": 1.6366459627329193, "grad_norm": 1.533652663230896, "learning_rate": 4.660767946342864e-06, "loss": 0.14851386845111847, "rewards/accuracies": 0.9375, "rewards/chosen": 30.650243759155273, "rewards/margins": 26.07445526123047, "rewards/rejected": 4.577547073364258, "step": 3162 }, { "epoch": 1.6371635610766044, "grad_norm": 1.4698944091796875, "learning_rate": 4.657921694302129e-06, "loss": 0.14898592233657837, "rewards/accuracies": 0.953125, "rewards/chosen": 28.77312469482422, "rewards/margins": 25.868263244628906, "rewards/rejected": 2.915376663208008, "step": 3163 }, { "epoch": 1.6376811594202898, "grad_norm": 1.2317273616790771, "learning_rate": 4.655075553627451e-06, "loss": 0.1389954388141632, "rewards/accuracies": 0.9375, "rewards/chosen": 27.398590087890625, "rewards/margins": 24.919479370117188, "rewards/rejected": 2.481536865234375, "step": 3164 }, { "epoch": 1.6381987577639752, "grad_norm": 1.5783424377441406, "learning_rate": 4.652229525245414e-06, "loss": 0.20054787397384644, "rewards/accuracies": 0.9296875, "rewards/chosen": 22.119422912597656, "rewards/margins": 20.405441284179688, "rewards/rejected": 1.7067041397094727, "step": 3165 }, { "epoch": 1.6387163561076603, "grad_norm": 0.8160331845283508, "learning_rate": 4.649383610082562e-06, "loss": 0.14313474297523499, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.389999389648438, "rewards/margins": 27.766616821289062, "rewards/rejected": 3.6282730102539062, "step": 3166 }, { "epoch": 1.639233954451346, "grad_norm": 0.559288740158081, "learning_rate": 4.646537809065408e-06, "loss": 0.07823999971151352, "rewards/accuracies": 0.96875, "rewards/chosen": 33.24626541137695, "rewards/margins": 29.458419799804688, "rewards/rejected": 3.7821578979492188, "step": 3167 }, { "epoch": 1.639751552795031, "grad_norm": 1.929231882095337, "learning_rate": 4.6436921231204155e-06, "loss": 0.17917941510677338, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.62755584716797, "rewards/margins": 27.354629516601562, "rewards/rejected": 3.268446922302246, "step": 3168 }, { "epoch": 1.6402691511387164, "grad_norm": 1.2024558782577515, "learning_rate": 4.640846553174022e-06, "loss": 0.11082753539085388, "rewards/accuracies": 0.921875, "rewards/chosen": 33.42140197753906, "rewards/margins": 29.337936401367188, "rewards/rejected": 4.084228515625, "step": 3169 }, { "epoch": 1.6407867494824018, "grad_norm": 1.0930463075637817, "learning_rate": 4.638001100152626e-06, "loss": 0.09375929832458496, "rewards/accuracies": 0.96875, "rewards/chosen": 32.65777587890625, "rewards/margins": 28.537094116210938, "rewards/rejected": 4.124517440795898, "step": 3170 }, { "epoch": 1.641304347826087, "grad_norm": 1.145354986190796, "learning_rate": 4.635155764982579e-06, "loss": 0.15627506375312805, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.581295013427734, "rewards/margins": 26.746780395507812, "rewards/rejected": 2.8473024368286133, "step": 3171 }, { "epoch": 1.6418219461697723, "grad_norm": 0.6451966166496277, "learning_rate": 4.632310548590204e-06, "loss": 0.09045794606208801, "rewards/accuracies": 0.984375, "rewards/chosen": 32.82893371582031, "rewards/margins": 29.12994384765625, "rewards/rejected": 3.6826210021972656, "step": 3172 }, { "epoch": 1.6423395445134576, "grad_norm": 0.795380711555481, "learning_rate": 4.629465451901783e-06, "loss": 0.1382206380367279, "rewards/accuracies": 0.9375, "rewards/chosen": 36.28798294067383, "rewards/margins": 31.980499267578125, "rewards/rejected": 4.309471130371094, "step": 3173 }, { "epoch": 1.6428571428571428, "grad_norm": 0.809381902217865, "learning_rate": 4.626620475843555e-06, "loss": 0.14714963734149933, "rewards/accuracies": 0.921875, "rewards/chosen": 31.694015502929688, "rewards/margins": 28.26282501220703, "rewards/rejected": 3.424083709716797, "step": 3174 }, { "epoch": 1.6433747412008282, "grad_norm": 0.7606099843978882, "learning_rate": 4.623775621341723e-06, "loss": 0.10655945539474487, "rewards/accuracies": 0.953125, "rewards/chosen": 29.44525909423828, "rewards/margins": 25.718826293945312, "rewards/rejected": 3.725538730621338, "step": 3175 }, { "epoch": 1.6438923395445135, "grad_norm": 0.7956523299217224, "learning_rate": 4.620930889322453e-06, "loss": 0.10030953586101532, "rewards/accuracies": 0.9375, "rewards/chosen": 34.38190841674805, "rewards/margins": 30.518997192382812, "rewards/rejected": 3.8572874069213867, "step": 3176 }, { "epoch": 1.6444099378881987, "grad_norm": 1.1909654140472412, "learning_rate": 4.6180862807118635e-06, "loss": 0.14421632885932922, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.852046966552734, "rewards/margins": 30.470375061035156, "rewards/rejected": 5.379125595092773, "step": 3177 }, { "epoch": 1.644927536231884, "grad_norm": 0.9775883555412292, "learning_rate": 4.615241796436039e-06, "loss": 0.09498964250087738, "rewards/accuracies": 0.96875, "rewards/chosen": 34.208457946777344, "rewards/margins": 29.38299560546875, "rewards/rejected": 4.827401161193848, "step": 3178 }, { "epoch": 1.6454451345755694, "grad_norm": 1.2856940031051636, "learning_rate": 4.612397437421022e-06, "loss": 0.1022212952375412, "rewards/accuracies": 0.96875, "rewards/chosen": 38.2099723815918, "rewards/margins": 33.242095947265625, "rewards/rejected": 4.9570159912109375, "step": 3179 }, { "epoch": 1.6459627329192545, "grad_norm": 1.2013901472091675, "learning_rate": 4.609553204592817e-06, "loss": 0.1686926931142807, "rewards/accuracies": 0.921875, "rewards/chosen": 35.01091384887695, "rewards/margins": 30.66387939453125, "rewards/rejected": 4.344120502471924, "step": 3180 }, { "epoch": 1.64648033126294, "grad_norm": 1.5671038627624512, "learning_rate": 4.606709098877379e-06, "loss": 0.15160095691680908, "rewards/accuracies": 0.9375, "rewards/chosen": 34.17588806152344, "rewards/margins": 29.573516845703125, "rewards/rejected": 4.5982666015625, "step": 3181 }, { "epoch": 1.6469979296066253, "grad_norm": 1.451836109161377, "learning_rate": 4.603865121200631e-06, "loss": 0.15523649752140045, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.939388275146484, "rewards/margins": 27.179529190063477, "rewards/rejected": 3.761808395385742, "step": 3182 }, { "epoch": 1.6475155279503104, "grad_norm": 0.6980648636817932, "learning_rate": 4.60102127248845e-06, "loss": 0.11061228066682816, "rewards/accuracies": 0.953125, "rewards/chosen": 36.95700454711914, "rewards/margins": 31.50408935546875, "rewards/rejected": 5.457867622375488, "step": 3183 }, { "epoch": 1.648033126293996, "grad_norm": 0.8466196060180664, "learning_rate": 4.59817755366667e-06, "loss": 0.10547392815351486, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.296905517578125, "rewards/margins": 30.102630615234375, "rewards/rejected": 5.195748805999756, "step": 3184 }, { "epoch": 1.6485507246376812, "grad_norm": 1.6143053770065308, "learning_rate": 4.595333965661085e-06, "loss": 0.2047208845615387, "rewards/accuracies": 0.921875, "rewards/chosen": 32.04576873779297, "rewards/margins": 27.666595458984375, "rewards/rejected": 4.382647514343262, "step": 3185 }, { "epoch": 1.6490683229813663, "grad_norm": 1.6258506774902344, "learning_rate": 4.592490509397446e-06, "loss": 0.14827975630760193, "rewards/accuracies": 0.9375, "rewards/chosen": 37.63807678222656, "rewards/margins": 30.69256591796875, "rewards/rejected": 6.9579315185546875, "step": 3186 }, { "epoch": 1.6495859213250519, "grad_norm": 1.1493014097213745, "learning_rate": 4.5896471858014605e-06, "loss": 0.18489491939544678, "rewards/accuracies": 0.90625, "rewards/chosen": 37.85978698730469, "rewards/margins": 32.175567626953125, "rewards/rejected": 5.691511154174805, "step": 3187 }, { "epoch": 1.650103519668737, "grad_norm": 1.4106807708740234, "learning_rate": 4.586803995798791e-06, "loss": 0.11301363259553909, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.83620834350586, "rewards/margins": 33.71155548095703, "rewards/rejected": 6.12711238861084, "step": 3188 }, { "epoch": 1.6506211180124224, "grad_norm": 0.9623539447784424, "learning_rate": 4.583960940315061e-06, "loss": 0.09801804274320602, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.468502044677734, "rewards/margins": 34.49613952636719, "rewards/rejected": 5.969585418701172, "step": 3189 }, { "epoch": 1.6511387163561078, "grad_norm": 0.8246381878852844, "learning_rate": 4.581118020275849e-06, "loss": 0.13844075798988342, "rewards/accuracies": 0.9375, "rewards/chosen": 33.56245803833008, "rewards/margins": 27.730606079101562, "rewards/rejected": 5.834037780761719, "step": 3190 }, { "epoch": 1.651656314699793, "grad_norm": 1.3065918684005737, "learning_rate": 4.578275236606684e-06, "loss": 0.15053290128707886, "rewards/accuracies": 0.9140625, "rewards/chosen": 39.46415710449219, "rewards/margins": 33.650848388671875, "rewards/rejected": 5.811393737792969, "step": 3191 }, { "epoch": 1.6521739130434783, "grad_norm": 1.0075719356536865, "learning_rate": 4.575432590233057e-06, "loss": 0.11730226874351501, "rewards/accuracies": 0.9375, "rewards/chosen": 35.69987487792969, "rewards/margins": 29.99261474609375, "rewards/rejected": 5.7039642333984375, "step": 3192 }, { "epoch": 1.6526915113871636, "grad_norm": 1.0617414712905884, "learning_rate": 4.572590082080415e-06, "loss": 0.191715806722641, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.31706619262695, "rewards/margins": 28.455322265625, "rewards/rejected": 5.8664751052856445, "step": 3193 }, { "epoch": 1.6532091097308488, "grad_norm": 2.9717438220977783, "learning_rate": 4.5697477130741495e-06, "loss": 0.21281211078166962, "rewards/accuracies": 0.8984375, "rewards/chosen": 33.11906433105469, "rewards/margins": 27.270172119140625, "rewards/rejected": 5.845425128936768, "step": 3194 }, { "epoch": 1.6537267080745341, "grad_norm": 0.7541780471801758, "learning_rate": 4.566905484139621e-06, "loss": 0.14545273780822754, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.56547927856445, "rewards/margins": 31.210845947265625, "rewards/rejected": 6.3620147705078125, "step": 3195 }, { "epoch": 1.6542443064182195, "grad_norm": 0.7215334177017212, "learning_rate": 4.564063396202135e-06, "loss": 0.14751453697681427, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.867382049560547, "rewards/margins": 26.338470458984375, "rewards/rejected": 5.527142524719238, "step": 3196 }, { "epoch": 1.6547619047619047, "grad_norm": 1.0831655263900757, "learning_rate": 4.561221450186954e-06, "loss": 0.17261666059494019, "rewards/accuracies": 0.921875, "rewards/chosen": 31.73082733154297, "rewards/margins": 26.464256286621094, "rewards/rejected": 5.268599510192871, "step": 3197 }, { "epoch": 1.65527950310559, "grad_norm": 0.9676757454872131, "learning_rate": 4.558379647019293e-06, "loss": 0.16296064853668213, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.82331466674805, "rewards/margins": 28.663352966308594, "rewards/rejected": 5.170875549316406, "step": 3198 }, { "epoch": 1.6557971014492754, "grad_norm": 1.0135976076126099, "learning_rate": 4.555537987624324e-06, "loss": 0.13233555853366852, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.811973571777344, "rewards/margins": 29.29132080078125, "rewards/rejected": 5.522647857666016, "step": 3199 }, { "epoch": 1.6563146997929605, "grad_norm": 0.7304520606994629, "learning_rate": 4.552696472927167e-06, "loss": 0.07117539644241333, "rewards/accuracies": 0.953125, "rewards/chosen": 44.827972412109375, "rewards/margins": 37.97991943359375, "rewards/rejected": 6.8514862060546875, "step": 3200 }, { "epoch": 1.6568322981366461, "grad_norm": 0.7033883929252625, "learning_rate": 4.549855103852898e-06, "loss": 0.10342765599489212, "rewards/accuracies": 0.9609375, "rewards/chosen": 31.739381790161133, "rewards/margins": 27.353858947753906, "rewards/rejected": 4.382851600646973, "step": 3201 }, { "epoch": 1.6573498964803313, "grad_norm": 1.2453863620758057, "learning_rate": 4.547013881326548e-06, "loss": 0.10514786839485168, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.97510528564453, "rewards/margins": 35.07928466796875, "rewards/rejected": 5.899910926818848, "step": 3202 }, { "epoch": 1.6578674948240164, "grad_norm": 0.8981425166130066, "learning_rate": 4.544172806273095e-06, "loss": 0.13039568066596985, "rewards/accuracies": 0.921875, "rewards/chosen": 36.87517547607422, "rewards/margins": 30.961151123046875, "rewards/rejected": 5.9146270751953125, "step": 3203 }, { "epoch": 1.658385093167702, "grad_norm": 0.8393381834030151, "learning_rate": 4.541331879617472e-06, "loss": 0.16713468730449677, "rewards/accuracies": 0.921875, "rewards/chosen": 33.047607421875, "rewards/margins": 27.621292114257812, "rewards/rejected": 5.419548988342285, "step": 3204 }, { "epoch": 1.6589026915113871, "grad_norm": 0.7763472199440002, "learning_rate": 4.5384911022845615e-06, "loss": 0.18008148670196533, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.673282623291016, "rewards/margins": 30.40087890625, "rewards/rejected": 6.269407272338867, "step": 3205 }, { "epoch": 1.6594202898550725, "grad_norm": 0.8388953804969788, "learning_rate": 4.535650475199204e-06, "loss": 0.1443411111831665, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.933143615722656, "rewards/margins": 32.059783935546875, "rewards/rejected": 6.8756279945373535, "step": 3206 }, { "epoch": 1.6599378881987579, "grad_norm": 0.8075873255729675, "learning_rate": 4.53280999928618e-06, "loss": 0.14111675322055817, "rewards/accuracies": 0.921875, "rewards/chosen": 39.91682434082031, "rewards/margins": 32.136871337890625, "rewards/rejected": 7.781917572021484, "step": 3207 }, { "epoch": 1.660455486542443, "grad_norm": 1.0328289270401, "learning_rate": 4.5299696754702295e-06, "loss": 0.1078731119632721, "rewards/accuracies": 0.953125, "rewards/chosen": 41.79365539550781, "rewards/margins": 35.30213928222656, "rewards/rejected": 6.491418838500977, "step": 3208 }, { "epoch": 1.6609730848861284, "grad_norm": 1.2622826099395752, "learning_rate": 4.527129504676043e-06, "loss": 0.18613804876804352, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.009490966796875, "rewards/margins": 26.54094696044922, "rewards/rejected": 5.469348430633545, "step": 3209 }, { "epoch": 1.6614906832298137, "grad_norm": 1.0075538158416748, "learning_rate": 4.524289487828255e-06, "loss": 0.17053329944610596, "rewards/accuracies": 0.9375, "rewards/chosen": 40.08502960205078, "rewards/margins": 33.83320617675781, "rewards/rejected": 6.25722074508667, "step": 3210 }, { "epoch": 1.662008281573499, "grad_norm": 0.9419707655906677, "learning_rate": 4.521449625851453e-06, "loss": 0.15076181292533875, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.48066711425781, "rewards/margins": 29.78839874267578, "rewards/rejected": 6.691398620605469, "step": 3211 }, { "epoch": 1.6625258799171843, "grad_norm": 1.0942739248275757, "learning_rate": 4.518609919670178e-06, "loss": 0.12825822830200195, "rewards/accuracies": 0.953125, "rewards/chosen": 42.27836227416992, "rewards/margins": 33.4188232421875, "rewards/rejected": 8.8524169921875, "step": 3212 }, { "epoch": 1.6630434782608696, "grad_norm": 2.4684112071990967, "learning_rate": 4.515770370208915e-06, "loss": 0.10159768164157867, "rewards/accuracies": 0.9296875, "rewards/chosen": 45.00262451171875, "rewards/margins": 36.198760986328125, "rewards/rejected": 8.79379653930664, "step": 3213 }, { "epoch": 1.6635610766045548, "grad_norm": 2.628263235092163, "learning_rate": 4.512930978392098e-06, "loss": 0.1907665729522705, "rewards/accuracies": 0.90625, "rewards/chosen": 41.96438980102539, "rewards/margins": 31.798187255859375, "rewards/rejected": 10.176815032958984, "step": 3214 }, { "epoch": 1.6640786749482401, "grad_norm": 2.0014450550079346, "learning_rate": 4.510091745144115e-06, "loss": 0.1282048523426056, "rewards/accuracies": 0.9296875, "rewards/chosen": 42.631107330322266, "rewards/margins": 33.67578125, "rewards/rejected": 8.962567329406738, "step": 3215 }, { "epoch": 1.6645962732919255, "grad_norm": 4.078100681304932, "learning_rate": 4.507252671389298e-06, "loss": 0.17216840386390686, "rewards/accuracies": 0.921875, "rewards/chosen": 41.771148681640625, "rewards/margins": 33.582366943359375, "rewards/rejected": 8.185457229614258, "step": 3216 }, { "epoch": 1.6651138716356106, "grad_norm": 0.859321117401123, "learning_rate": 4.504413758051926e-06, "loss": 0.12344091385602951, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.75367736816406, "rewards/margins": 33.02623748779297, "rewards/rejected": 6.7341508865356445, "step": 3217 }, { "epoch": 1.6656314699792962, "grad_norm": 1.35267972946167, "learning_rate": 4.501575006056227e-06, "loss": 0.1865701675415039, "rewards/accuracies": 0.90625, "rewards/chosen": 34.97691345214844, "rewards/margins": 27.823898315429688, "rewards/rejected": 7.1512451171875, "step": 3218 }, { "epoch": 1.6661490683229814, "grad_norm": 0.8170291185379028, "learning_rate": 4.498736416326384e-06, "loss": 0.09292779117822647, "rewards/accuracies": 0.96875, "rewards/chosen": 46.484893798828125, "rewards/margins": 37.81275939941406, "rewards/rejected": 8.673641204833984, "step": 3219 }, { "epoch": 1.6666666666666665, "grad_norm": 1.395537257194519, "learning_rate": 4.4958979897865106e-06, "loss": 0.16177603602409363, "rewards/accuracies": 0.9140625, "rewards/chosen": 44.32112121582031, "rewards/margins": 36.15399169921875, "rewards/rejected": 8.187644958496094, "step": 3220 }, { "epoch": 1.667184265010352, "grad_norm": 0.7055225372314453, "learning_rate": 4.493059727360684e-06, "loss": 0.11905913054943085, "rewards/accuracies": 0.9375, "rewards/chosen": 36.99756622314453, "rewards/margins": 30.458633422851562, "rewards/rejected": 6.535654067993164, "step": 3221 }, { "epoch": 1.6677018633540373, "grad_norm": 1.1449726819992065, "learning_rate": 4.490221629972918e-06, "loss": 0.12704062461853027, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.39146423339844, "rewards/margins": 29.226425170898438, "rewards/rejected": 6.154121398925781, "step": 3222 }, { "epoch": 1.6682194616977226, "grad_norm": 1.3970527648925781, "learning_rate": 4.487383698547177e-06, "loss": 0.1755758672952652, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.27734375, "rewards/margins": 30.521453857421875, "rewards/rejected": 4.749486923217773, "step": 3223 }, { "epoch": 1.668737060041408, "grad_norm": 1.3884978294372559, "learning_rate": 4.484545934007368e-06, "loss": 0.1434311866760254, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.270912170410156, "rewards/margins": 29.552833557128906, "rewards/rejected": 6.729030609130859, "step": 3224 }, { "epoch": 1.6692546583850931, "grad_norm": 0.8806460499763489, "learning_rate": 4.481708337277348e-06, "loss": 0.13700160384178162, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.77120590209961, "rewards/margins": 31.040740966796875, "rewards/rejected": 4.732123374938965, "step": 3225 }, { "epoch": 1.6697722567287785, "grad_norm": 1.411752462387085, "learning_rate": 4.478870909280918e-06, "loss": 0.16899150609970093, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.24342346191406, "rewards/margins": 26.406021118164062, "rewards/rejected": 5.844322204589844, "step": 3226 }, { "epoch": 1.6702898550724639, "grad_norm": 0.6503130793571472, "learning_rate": 4.47603365094182e-06, "loss": 0.11038491129875183, "rewards/accuracies": 0.96875, "rewards/chosen": 29.84527587890625, "rewards/margins": 25.23687744140625, "rewards/rejected": 4.601308822631836, "step": 3227 }, { "epoch": 1.670807453416149, "grad_norm": 1.7928767204284668, "learning_rate": 4.473196563183744e-06, "loss": 0.19798481464385986, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.7637939453125, "rewards/margins": 24.80926513671875, "rewards/rejected": 3.9587650299072266, "step": 3228 }, { "epoch": 1.6713250517598344, "grad_norm": 1.0530421733856201, "learning_rate": 4.4703596469303276e-06, "loss": 0.1140860840678215, "rewards/accuracies": 0.953125, "rewards/chosen": 33.176490783691406, "rewards/margins": 28.50164794921875, "rewards/rejected": 4.675127983093262, "step": 3229 }, { "epoch": 1.6718426501035197, "grad_norm": 3.4830868244171143, "learning_rate": 4.467522903105148e-06, "loss": 0.13083326816558838, "rewards/accuracies": 0.953125, "rewards/chosen": 32.76365661621094, "rewards/margins": 28.58995819091797, "rewards/rejected": 4.174967288970947, "step": 3230 }, { "epoch": 1.6723602484472049, "grad_norm": 3.904597520828247, "learning_rate": 4.464686332631724e-06, "loss": 0.19080746173858643, "rewards/accuracies": 0.921875, "rewards/chosen": 34.02863311767578, "rewards/margins": 28.704727172851562, "rewards/rejected": 5.328132629394531, "step": 3231 }, { "epoch": 1.6728778467908902, "grad_norm": 0.906238853931427, "learning_rate": 4.4618499364335285e-06, "loss": 0.1093386560678482, "rewards/accuracies": 0.953125, "rewards/chosen": 35.59262466430664, "rewards/margins": 29.16064453125, "rewards/rejected": 6.434806823730469, "step": 3232 }, { "epoch": 1.6733954451345756, "grad_norm": 1.6392085552215576, "learning_rate": 4.4590137154339645e-06, "loss": 0.23943567276000977, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.539268493652344, "rewards/margins": 25.551849365234375, "rewards/rejected": 4.991291046142578, "step": 3233 }, { "epoch": 1.6739130434782608, "grad_norm": 1.122668981552124, "learning_rate": 4.456177670556387e-06, "loss": 0.1375066488981247, "rewards/accuracies": 0.90625, "rewards/chosen": 32.340763092041016, "rewards/margins": 27.838253021240234, "rewards/rejected": 4.510497093200684, "step": 3234 }, { "epoch": 1.6744306418219461, "grad_norm": 0.7645796537399292, "learning_rate": 4.453341802724091e-06, "loss": 0.10532031953334808, "rewards/accuracies": 0.953125, "rewards/chosen": 30.24224090576172, "rewards/margins": 27.023807525634766, "rewards/rejected": 3.2261836528778076, "step": 3235 }, { "epoch": 1.6749482401656315, "grad_norm": 1.1355985403060913, "learning_rate": 4.450506112860314e-06, "loss": 0.14883486926555634, "rewards/accuracies": 0.921875, "rewards/chosen": 31.151535034179688, "rewards/margins": 26.448959350585938, "rewards/rejected": 4.701639175415039, "step": 3236 }, { "epoch": 1.6754658385093166, "grad_norm": 1.59456205368042, "learning_rate": 4.447670601888231e-06, "loss": 0.1396428644657135, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.50973129272461, "rewards/margins": 27.290634155273438, "rewards/rejected": 4.223712921142578, "step": 3237 }, { "epoch": 1.6759834368530022, "grad_norm": 2.906184434890747, "learning_rate": 4.444835270730969e-06, "loss": 0.1381860375404358, "rewards/accuracies": 0.9375, "rewards/chosen": 32.42323303222656, "rewards/margins": 27.809951782226562, "rewards/rejected": 4.613983154296875, "step": 3238 }, { "epoch": 1.6765010351966874, "grad_norm": 1.6939289569854736, "learning_rate": 4.442000120311588e-06, "loss": 0.20509783923625946, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.3335018157959, "rewards/margins": 27.436721801757812, "rewards/rejected": 3.90216064453125, "step": 3239 }, { "epoch": 1.6770186335403725, "grad_norm": 0.5557120442390442, "learning_rate": 4.4391651515530914e-06, "loss": 0.07765619456768036, "rewards/accuracies": 0.953125, "rewards/chosen": 36.912628173828125, "rewards/margins": 32.816619873046875, "rewards/rejected": 4.095623016357422, "step": 3240 }, { "epoch": 1.677536231884058, "grad_norm": 1.595342755317688, "learning_rate": 4.436330365378423e-06, "loss": 0.09736096858978271, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.65666961669922, "rewards/margins": 30.217559814453125, "rewards/rejected": 4.437450408935547, "step": 3241 }, { "epoch": 1.6780538302277432, "grad_norm": 0.9928335547447205, "learning_rate": 4.4334957627104706e-06, "loss": 0.15674754977226257, "rewards/accuracies": 0.921875, "rewards/chosen": 29.211036682128906, "rewards/margins": 24.98224639892578, "rewards/rejected": 4.227689743041992, "step": 3242 }, { "epoch": 1.6785714285714286, "grad_norm": 1.0598974227905273, "learning_rate": 4.430661344472057e-06, "loss": 0.12743979692459106, "rewards/accuracies": 0.953125, "rewards/chosen": 30.57543182373047, "rewards/margins": 26.435943603515625, "rewards/rejected": 4.146696090698242, "step": 3243 }, { "epoch": 1.679089026915114, "grad_norm": 1.0676465034484863, "learning_rate": 4.427827111585947e-06, "loss": 0.1439763605594635, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.19939422607422, "rewards/margins": 27.28467559814453, "rewards/rejected": 3.9235353469848633, "step": 3244 }, { "epoch": 1.6796066252587991, "grad_norm": 1.7927576303482056, "learning_rate": 4.42499306497485e-06, "loss": 0.11974117159843445, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.09270477294922, "rewards/margins": 29.028106689453125, "rewards/rejected": 4.067378044128418, "step": 3245 }, { "epoch": 1.6801242236024845, "grad_norm": 2.0619661808013916, "learning_rate": 4.422159205561404e-06, "loss": 0.17771115899085999, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.654502868652344, "rewards/margins": 27.538330078125, "rewards/rejected": 4.120323181152344, "step": 3246 }, { "epoch": 1.6806418219461698, "grad_norm": 1.647985816001892, "learning_rate": 4.419325534268196e-06, "loss": 0.13167008757591248, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.037139892578125, "rewards/margins": 28.154571533203125, "rewards/rejected": 2.885995864868164, "step": 3247 }, { "epoch": 1.681159420289855, "grad_norm": 0.7504402995109558, "learning_rate": 4.416492052017747e-06, "loss": 0.12398706376552582, "rewards/accuracies": 0.9375, "rewards/chosen": 33.06382369995117, "rewards/margins": 29.011932373046875, "rewards/rejected": 4.050870418548584, "step": 3248 }, { "epoch": 1.6816770186335404, "grad_norm": 1.1897706985473633, "learning_rate": 4.413658759732522e-06, "loss": 0.1839030683040619, "rewards/accuracies": 0.8984375, "rewards/chosen": 34.70747375488281, "rewards/margins": 30.246002197265625, "rewards/rejected": 4.4637298583984375, "step": 3249 }, { "epoch": 1.6821946169772257, "grad_norm": 0.7522043585777283, "learning_rate": 4.410825658334913e-06, "loss": 0.10741577297449112, "rewards/accuracies": 0.953125, "rewards/chosen": 32.74059295654297, "rewards/margins": 28.98566436767578, "rewards/rejected": 3.7488532066345215, "step": 3250 }, { "epoch": 1.6827122153209109, "grad_norm": 1.3771579265594482, "learning_rate": 4.40799274874726e-06, "loss": 0.21758046746253967, "rewards/accuracies": 0.8828125, "rewards/chosen": 29.753829956054688, "rewards/margins": 26.383148193359375, "rewards/rejected": 3.3697309494018555, "step": 3251 }, { "epoch": 1.6832298136645962, "grad_norm": 0.8884789943695068, "learning_rate": 4.405160031891838e-06, "loss": 0.13872836530208588, "rewards/accuracies": 0.921875, "rewards/chosen": 31.29189682006836, "rewards/margins": 26.463119506835938, "rewards/rejected": 4.827568054199219, "step": 3252 }, { "epoch": 1.6837474120082816, "grad_norm": 0.9549441933631897, "learning_rate": 4.402327508690856e-06, "loss": 0.14359676837921143, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.068010330200195, "rewards/margins": 24.0562744140625, "rewards/rejected": 3.0076980590820312, "step": 3253 }, { "epoch": 1.6842650103519667, "grad_norm": 1.2120734453201294, "learning_rate": 4.3994951800664614e-06, "loss": 0.12444732338190079, "rewards/accuracies": 0.921875, "rewards/chosen": 29.685054779052734, "rewards/margins": 25.525070190429688, "rewards/rejected": 4.1554412841796875, "step": 3254 }, { "epoch": 1.6847826086956523, "grad_norm": 2.762019157409668, "learning_rate": 4.396663046940744e-06, "loss": 0.2214873731136322, "rewards/accuracies": 0.90625, "rewards/chosen": 29.53946304321289, "rewards/margins": 25.380538940429688, "rewards/rejected": 4.160980224609375, "step": 3255 }, { "epoch": 1.6853002070393375, "grad_norm": 0.8503789901733398, "learning_rate": 4.393831110235722e-06, "loss": 0.1268330067396164, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.198719024658203, "rewards/margins": 25.92620849609375, "rewards/rejected": 3.2802562713623047, "step": 3256 }, { "epoch": 1.6858178053830226, "grad_norm": 1.600590705871582, "learning_rate": 4.390999370873353e-06, "loss": 0.1420835703611374, "rewards/accuracies": 0.9375, "rewards/chosen": 32.67978286743164, "rewards/margins": 28.63653564453125, "rewards/rejected": 4.0374298095703125, "step": 3257 }, { "epoch": 1.6863354037267082, "grad_norm": 1.0340256690979004, "learning_rate": 4.388167829775531e-06, "loss": 0.127366304397583, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.519142150878906, "rewards/margins": 26.682220458984375, "rewards/rejected": 3.840333938598633, "step": 3258 }, { "epoch": 1.6868530020703933, "grad_norm": 0.9912286400794983, "learning_rate": 4.385336487864082e-06, "loss": 0.1429770439863205, "rewards/accuracies": 0.9296875, "rewards/chosen": 23.478103637695312, "rewards/margins": 21.288841247558594, "rewards/rejected": 2.197734832763672, "step": 3259 }, { "epoch": 1.6873706004140787, "grad_norm": 1.6499333381652832, "learning_rate": 4.3825053460607734e-06, "loss": 0.11839787662029266, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.82794952392578, "rewards/margins": 27.566757202148438, "rewards/rejected": 4.266239166259766, "step": 3260 }, { "epoch": 1.687888198757764, "grad_norm": 1.6292614936828613, "learning_rate": 4.379674405287301e-06, "loss": 0.1927652657032013, "rewards/accuracies": 0.921875, "rewards/chosen": 31.510604858398438, "rewards/margins": 27.199188232421875, "rewards/rejected": 4.312015533447266, "step": 3261 }, { "epoch": 1.6884057971014492, "grad_norm": 0.9050797820091248, "learning_rate": 4.376843666465302e-06, "loss": 0.09796589612960815, "rewards/accuracies": 0.96875, "rewards/chosen": 33.846797943115234, "rewards/margins": 29.37548065185547, "rewards/rejected": 4.476513862609863, "step": 3262 }, { "epoch": 1.6889233954451346, "grad_norm": 2.0042200088500977, "learning_rate": 4.374013130516338e-06, "loss": 0.18457993865013123, "rewards/accuracies": 0.90625, "rewards/chosen": 30.749107360839844, "rewards/margins": 27.114601135253906, "rewards/rejected": 3.6409130096435547, "step": 3263 }, { "epoch": 1.68944099378882, "grad_norm": 2.048145294189453, "learning_rate": 4.3711827983619146e-06, "loss": 0.183292955160141, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.621620178222656, "rewards/margins": 27.409042358398438, "rewards/rejected": 5.209400177001953, "step": 3264 }, { "epoch": 1.689958592132505, "grad_norm": 0.9685450196266174, "learning_rate": 4.368352670923467e-06, "loss": 0.1280979961156845, "rewards/accuracies": 0.9375, "rewards/chosen": 32.32123565673828, "rewards/margins": 28.37188720703125, "rewards/rejected": 3.947503089904785, "step": 3265 }, { "epoch": 1.6904761904761905, "grad_norm": 1.4834274053573608, "learning_rate": 4.365522749122361e-06, "loss": 0.10718896985054016, "rewards/accuracies": 0.953125, "rewards/chosen": 33.09303283691406, "rewards/margins": 29.236770629882812, "rewards/rejected": 3.8591880798339844, "step": 3266 }, { "epoch": 1.6909937888198758, "grad_norm": 1.4955101013183594, "learning_rate": 4.362693033879898e-06, "loss": 0.17633017897605896, "rewards/accuracies": 0.90625, "rewards/chosen": 31.31143569946289, "rewards/margins": 27.393585205078125, "rewards/rejected": 3.911346435546875, "step": 3267 }, { "epoch": 1.691511387163561, "grad_norm": 1.2870019674301147, "learning_rate": 4.359863526117316e-06, "loss": 0.1319267451763153, "rewards/accuracies": 0.921875, "rewards/chosen": 36.8421745300293, "rewards/margins": 30.829132080078125, "rewards/rejected": 6.00602912902832, "step": 3268 }, { "epoch": 1.6920289855072463, "grad_norm": 1.4728868007659912, "learning_rate": 4.357034226755779e-06, "loss": 0.19531039893627167, "rewards/accuracies": 0.90625, "rewards/chosen": 30.169479370117188, "rewards/margins": 26.486419677734375, "rewards/rejected": 3.6811580657958984, "step": 3269 }, { "epoch": 1.6925465838509317, "grad_norm": 2.0812642574310303, "learning_rate": 4.354205136716385e-06, "loss": 0.23506924510002136, "rewards/accuracies": 0.875, "rewards/chosen": 26.541969299316406, "rewards/margins": 23.649169921875, "rewards/rejected": 2.894301176071167, "step": 3270 }, { "epoch": 1.6930641821946169, "grad_norm": 2.048285722732544, "learning_rate": 4.351376256920164e-06, "loss": 0.15575185418128967, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.950305938720703, "rewards/margins": 25.671051025390625, "rewards/rejected": 3.279571533203125, "step": 3271 }, { "epoch": 1.6935817805383024, "grad_norm": 1.5333058834075928, "learning_rate": 4.348547588288083e-06, "loss": 0.1595304012298584, "rewards/accuracies": 0.921875, "rewards/chosen": 28.900917053222656, "rewards/margins": 25.043701171875, "rewards/rejected": 3.860157012939453, "step": 3272 }, { "epoch": 1.6940993788819876, "grad_norm": 1.8606462478637695, "learning_rate": 4.345719131741031e-06, "loss": 0.11306184530258179, "rewards/accuracies": 0.953125, "rewards/chosen": 34.91651916503906, "rewards/margins": 30.386383056640625, "rewards/rejected": 4.52476692199707, "step": 3273 }, { "epoch": 1.6946169772256727, "grad_norm": 0.8711947202682495, "learning_rate": 4.342890888199832e-06, "loss": 0.16170227527618408, "rewards/accuracies": 0.90625, "rewards/chosen": 29.318988800048828, "rewards/margins": 25.458343505859375, "rewards/rejected": 3.856689453125, "step": 3274 }, { "epoch": 1.6951345755693583, "grad_norm": 1.0141527652740479, "learning_rate": 4.340062858585247e-06, "loss": 0.1364278793334961, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.55673599243164, "rewards/margins": 27.801803588867188, "rewards/rejected": 3.7583389282226562, "step": 3275 }, { "epoch": 1.6956521739130435, "grad_norm": 0.8168319463729858, "learning_rate": 4.337235043817954e-06, "loss": 0.12467099726200104, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.103721618652344, "rewards/margins": 26.049362182617188, "rewards/rejected": 3.050325393676758, "step": 3276 }, { "epoch": 1.6961697722567288, "grad_norm": 1.4935574531555176, "learning_rate": 4.3344074448185725e-06, "loss": 0.10258271545171738, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.73210906982422, "rewards/margins": 26.529312133789062, "rewards/rejected": 4.200016021728516, "step": 3277 }, { "epoch": 1.6966873706004142, "grad_norm": 0.8909978866577148, "learning_rate": 4.331580062507649e-06, "loss": 0.1266268491744995, "rewards/accuracies": 0.9375, "rewards/chosen": 32.271419525146484, "rewards/margins": 28.602890014648438, "rewards/rejected": 3.6663780212402344, "step": 3278 }, { "epoch": 1.6972049689440993, "grad_norm": 1.0918221473693848, "learning_rate": 4.328752897805654e-06, "loss": 0.15502242743968964, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.605667114257812, "rewards/margins": 23.42414093017578, "rewards/rejected": 3.181412696838379, "step": 3279 }, { "epoch": 1.6977225672877847, "grad_norm": 4.030947685241699, "learning_rate": 4.325925951632993e-06, "loss": 0.1989399939775467, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.157079696655273, "rewards/margins": 23.709640502929688, "rewards/rejected": 2.449496269226074, "step": 3280 }, { "epoch": 1.69824016563147, "grad_norm": 10.413344383239746, "learning_rate": 4.323099224910001e-06, "loss": 0.1440587043762207, "rewards/accuracies": 0.9375, "rewards/chosen": 28.090362548828125, "rewards/margins": 25.183486938476562, "rewards/rejected": 2.9001235961914062, "step": 3281 }, { "epoch": 1.6987577639751552, "grad_norm": 0.8470947742462158, "learning_rate": 4.320272718556937e-06, "loss": 0.08942500501871109, "rewards/accuracies": 0.96875, "rewards/chosen": 32.430755615234375, "rewards/margins": 27.686630249023438, "rewards/rejected": 4.7402262687683105, "step": 3282 }, { "epoch": 1.6992753623188406, "grad_norm": 2.4612481594085693, "learning_rate": 4.317446433493991e-06, "loss": 0.1646803617477417, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.65985107421875, "rewards/margins": 25.936691284179688, "rewards/rejected": 3.722627639770508, "step": 3283 }, { "epoch": 1.699792960662526, "grad_norm": 1.2872204780578613, "learning_rate": 4.3146203706412795e-06, "loss": 0.20366039872169495, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.05609893798828, "rewards/margins": 24.55921173095703, "rewards/rejected": 3.4968936443328857, "step": 3284 }, { "epoch": 1.700310559006211, "grad_norm": 1.293777346611023, "learning_rate": 4.311794530918851e-06, "loss": 0.16378484666347504, "rewards/accuracies": 0.9453125, "rewards/chosen": 27.25599479675293, "rewards/margins": 24.61029052734375, "rewards/rejected": 2.6399950981140137, "step": 3285 }, { "epoch": 1.7008281573498965, "grad_norm": 0.7087265849113464, "learning_rate": 4.308968915246674e-06, "loss": 0.11658155173063278, "rewards/accuracies": 0.9375, "rewards/chosen": 28.6685791015625, "rewards/margins": 25.2579345703125, "rewards/rejected": 3.4088363647460938, "step": 3286 }, { "epoch": 1.7013457556935818, "grad_norm": 2.610471248626709, "learning_rate": 4.306143524544651e-06, "loss": 0.19225600361824036, "rewards/accuracies": 0.875, "rewards/chosen": 29.583696365356445, "rewards/margins": 26.069320678710938, "rewards/rejected": 3.518932342529297, "step": 3287 }, { "epoch": 1.701863354037267, "grad_norm": 1.9269790649414062, "learning_rate": 4.3033183597326074e-06, "loss": 0.18105478584766388, "rewards/accuracies": 0.90625, "rewards/chosen": 30.314132690429688, "rewards/margins": 27.16058349609375, "rewards/rejected": 3.150491714477539, "step": 3288 }, { "epoch": 1.7023809523809523, "grad_norm": 1.2217377424240112, "learning_rate": 4.3004934217302935e-06, "loss": 0.18475696444511414, "rewards/accuracies": 0.90625, "rewards/chosen": 28.19137954711914, "rewards/margins": 24.69426727294922, "rewards/rejected": 3.4989795684814453, "step": 3289 }, { "epoch": 1.7028985507246377, "grad_norm": 0.8807148933410645, "learning_rate": 4.297668711457391e-06, "loss": 0.08869484066963196, "rewards/accuracies": 0.96875, "rewards/chosen": 31.290695190429688, "rewards/margins": 27.758819580078125, "rewards/rejected": 3.5302886962890625, "step": 3290 }, { "epoch": 1.7034161490683228, "grad_norm": 0.7805227041244507, "learning_rate": 4.294844229833506e-06, "loss": 0.1618657410144806, "rewards/accuracies": 0.921875, "rewards/chosen": 26.116147994995117, "rewards/margins": 24.01940155029297, "rewards/rejected": 2.099681854248047, "step": 3291 }, { "epoch": 1.7039337474120084, "grad_norm": 0.6096301078796387, "learning_rate": 4.2920199777781655e-06, "loss": 0.0970199778676033, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.896820068359375, "rewards/margins": 28.349609375, "rewards/rejected": 3.5536460876464844, "step": 3292 }, { "epoch": 1.7044513457556936, "grad_norm": 0.9725375771522522, "learning_rate": 4.289195956210826e-06, "loss": 0.16041892766952515, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.990482330322266, "rewards/margins": 25.171173095703125, "rewards/rejected": 2.8186793327331543, "step": 3293 }, { "epoch": 1.704968944099379, "grad_norm": 0.8010034561157227, "learning_rate": 4.28637216605087e-06, "loss": 0.13718558847904205, "rewards/accuracies": 0.9375, "rewards/chosen": 27.48174476623535, "rewards/margins": 24.524459838867188, "rewards/rejected": 2.9597339630126953, "step": 3294 }, { "epoch": 1.7054865424430643, "grad_norm": 1.185971736907959, "learning_rate": 4.283548608217601e-06, "loss": 0.1974383294582367, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.07434844970703, "rewards/margins": 24.17926025390625, "rewards/rejected": 2.891002655029297, "step": 3295 }, { "epoch": 1.7060041407867494, "grad_norm": 1.0873847007751465, "learning_rate": 4.280725283630249e-06, "loss": 0.1697329580783844, "rewards/accuracies": 0.90625, "rewards/chosen": 27.873140335083008, "rewards/margins": 25.219844818115234, "rewards/rejected": 2.6605072021484375, "step": 3296 }, { "epoch": 1.7065217391304348, "grad_norm": 1.5162162780761719, "learning_rate": 4.277902193207966e-06, "loss": 0.15861470997333527, "rewards/accuracies": 0.9375, "rewards/chosen": 27.967910766601562, "rewards/margins": 25.823509216308594, "rewards/rejected": 2.137404203414917, "step": 3297 }, { "epoch": 1.7070393374741202, "grad_norm": 0.7599896192550659, "learning_rate": 4.275079337869834e-06, "loss": 0.10782139003276825, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.164310455322266, "rewards/margins": 27.588661193847656, "rewards/rejected": 2.5722084045410156, "step": 3298 }, { "epoch": 1.7075569358178053, "grad_norm": 1.118015170097351, "learning_rate": 4.272256718534849e-06, "loss": 0.17163066565990448, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.10218048095703, "rewards/margins": 25.041290283203125, "rewards/rejected": 2.0597915649414062, "step": 3299 }, { "epoch": 1.7080745341614907, "grad_norm": 0.8146102428436279, "learning_rate": 4.269434336121939e-06, "loss": 0.1359635889530182, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.318893432617188, "rewards/margins": 24.0006103515625, "rewards/rejected": 2.3151464462280273, "step": 3300 }, { "epoch": 1.708592132505176, "grad_norm": 0.771848738193512, "learning_rate": 4.266612191549949e-06, "loss": 0.10416175425052643, "rewards/accuracies": 0.9765625, "rewards/chosen": 23.201026916503906, "rewards/margins": 21.715744018554688, "rewards/rejected": 1.4914970397949219, "step": 3301 }, { "epoch": 1.7091097308488612, "grad_norm": 0.8629195094108582, "learning_rate": 4.263790285737646e-06, "loss": 0.15100562572479248, "rewards/accuracies": 0.9296875, "rewards/chosen": 27.084808349609375, "rewards/margins": 24.771957397460938, "rewards/rejected": 2.3135251998901367, "step": 3302 }, { "epoch": 1.7096273291925466, "grad_norm": 1.7615220546722412, "learning_rate": 4.260968619603726e-06, "loss": 0.15905976295471191, "rewards/accuracies": 0.921875, "rewards/chosen": 26.814661026000977, "rewards/margins": 24.097747802734375, "rewards/rejected": 2.7205042839050293, "step": 3303 }, { "epoch": 1.710144927536232, "grad_norm": 2.041485071182251, "learning_rate": 4.258147194066802e-06, "loss": 0.18331629037857056, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.676027297973633, "rewards/margins": 25.33135986328125, "rewards/rejected": 2.354001045227051, "step": 3304 }, { "epoch": 1.710662525879917, "grad_norm": 0.8415717482566833, "learning_rate": 4.2553260100454084e-06, "loss": 0.12537524104118347, "rewards/accuracies": 0.9375, "rewards/chosen": 26.837467193603516, "rewards/margins": 24.831260681152344, "rewards/rejected": 2.009397506713867, "step": 3305 }, { "epoch": 1.7111801242236024, "grad_norm": 1.3596724271774292, "learning_rate": 4.252505068458e-06, "loss": 0.15239261090755463, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.133880615234375, "rewards/margins": 23.5067138671875, "rewards/rejected": 2.6241989135742188, "step": 3306 }, { "epoch": 1.7116977225672878, "grad_norm": 0.8129474520683289, "learning_rate": 4.2496843702229586e-06, "loss": 0.08210936933755875, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.385040283203125, "rewards/margins": 26.77880859375, "rewards/rejected": 2.60172176361084, "step": 3307 }, { "epoch": 1.712215320910973, "grad_norm": 1.1771453619003296, "learning_rate": 4.246863916258582e-06, "loss": 0.11584891378879547, "rewards/accuracies": 0.953125, "rewards/chosen": 27.592365264892578, "rewards/margins": 25.522003173828125, "rewards/rejected": 2.068056583404541, "step": 3308 }, { "epoch": 1.7127329192546585, "grad_norm": 0.8755922913551331, "learning_rate": 4.244043707483087e-06, "loss": 0.14486613869667053, "rewards/accuracies": 0.953125, "rewards/chosen": 25.054994583129883, "rewards/margins": 23.138412475585938, "rewards/rejected": 1.9109382629394531, "step": 3309 }, { "epoch": 1.7132505175983437, "grad_norm": 0.7088596224784851, "learning_rate": 4.2412237448146135e-06, "loss": 0.11112125217914581, "rewards/accuracies": 0.953125, "rewards/chosen": 27.697059631347656, "rewards/margins": 26.187713623046875, "rewards/rejected": 1.5189361572265625, "step": 3310 }, { "epoch": 1.7137681159420288, "grad_norm": 0.943252444267273, "learning_rate": 4.238404029171224e-06, "loss": 0.1278078854084015, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.069564819335938, "rewards/margins": 27.231109619140625, "rewards/rejected": 2.8388938903808594, "step": 3311 }, { "epoch": 1.7142857142857144, "grad_norm": 1.8879072666168213, "learning_rate": 4.2355845614708935e-06, "loss": 0.14881959557533264, "rewards/accuracies": 0.9375, "rewards/chosen": 22.712852478027344, "rewards/margins": 21.033950805664062, "rewards/rejected": 1.6799390316009521, "step": 3312 }, { "epoch": 1.7148033126293996, "grad_norm": 1.7417773008346558, "learning_rate": 4.232765342631521e-06, "loss": 0.14189866185188293, "rewards/accuracies": 0.9375, "rewards/chosen": 29.10354995727539, "rewards/margins": 26.690460205078125, "rewards/rejected": 2.4206199645996094, "step": 3313 }, { "epoch": 1.715320910973085, "grad_norm": 0.8402034044265747, "learning_rate": 4.229946373570926e-06, "loss": 0.1538686454296112, "rewards/accuracies": 0.9296875, "rewards/chosen": 24.781532287597656, "rewards/margins": 22.60333251953125, "rewards/rejected": 2.1768741607666016, "step": 3314 }, { "epoch": 1.7158385093167703, "grad_norm": 0.5907915234565735, "learning_rate": 4.2271276552068384e-06, "loss": 0.11772974580526352, "rewards/accuracies": 0.984375, "rewards/chosen": 27.898395538330078, "rewards/margins": 25.44940185546875, "rewards/rejected": 2.453643798828125, "step": 3315 }, { "epoch": 1.7163561076604554, "grad_norm": 0.6061124801635742, "learning_rate": 4.224309188456918e-06, "loss": 0.07139386236667633, "rewards/accuracies": 0.953125, "rewards/chosen": 31.247955322265625, "rewards/margins": 29.238800048828125, "rewards/rejected": 2.013866424560547, "step": 3316 }, { "epoch": 1.7168737060041408, "grad_norm": 1.4312056303024292, "learning_rate": 4.221490974238736e-06, "loss": 0.18895825743675232, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.918338775634766, "rewards/margins": 24.989639282226562, "rewards/rejected": 1.9153766632080078, "step": 3317 }, { "epoch": 1.7173913043478262, "grad_norm": 1.6672927141189575, "learning_rate": 4.21867301346978e-06, "loss": 0.09587084501981735, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.928260803222656, "rewards/margins": 28.248336791992188, "rewards/rejected": 2.6785888671875, "step": 3318 }, { "epoch": 1.7179089026915113, "grad_norm": 0.7251763343811035, "learning_rate": 4.215855307067457e-06, "loss": 0.07015535980463028, "rewards/accuracies": 0.984375, "rewards/chosen": 31.78522491455078, "rewards/margins": 28.76470947265625, "rewards/rejected": 3.019639492034912, "step": 3319 }, { "epoch": 1.7184265010351967, "grad_norm": 1.8728327751159668, "learning_rate": 4.213037855949096e-06, "loss": 0.13558214902877808, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.603557586669922, "rewards/margins": 28.358779907226562, "rewards/rejected": 2.253908157348633, "step": 3320 }, { "epoch": 1.718944099378882, "grad_norm": 0.9366040825843811, "learning_rate": 4.210220661031936e-06, "loss": 0.11250129342079163, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.09764862060547, "rewards/margins": 26.5, "rewards/rejected": 1.5959701538085938, "step": 3321 }, { "epoch": 1.7194616977225672, "grad_norm": 1.2782353162765503, "learning_rate": 4.207403723233134e-06, "loss": 0.09059473127126694, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.752552032470703, "rewards/margins": 26.363616943359375, "rewards/rejected": 2.38836669921875, "step": 3322 }, { "epoch": 1.7199792960662525, "grad_norm": 1.4328534603118896, "learning_rate": 4.204587043469764e-06, "loss": 0.24379375576972961, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.776208877563477, "rewards/margins": 25.750946044921875, "rewards/rejected": 2.0243377685546875, "step": 3323 }, { "epoch": 1.720496894409938, "grad_norm": 1.0165961980819702, "learning_rate": 4.20177062265882e-06, "loss": 0.06971515715122223, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.13050842285156, "rewards/margins": 29.883338928222656, "rewards/rejected": 3.2486495971679688, "step": 3324 }, { "epoch": 1.721014492753623, "grad_norm": 1.9731584787368774, "learning_rate": 4.1989544617172055e-06, "loss": 0.10759719461202621, "rewards/accuracies": 0.9375, "rewards/chosen": 33.84538269042969, "rewards/margins": 30.886688232421875, "rewards/rejected": 2.9567718505859375, "step": 3325 }, { "epoch": 1.7215320910973086, "grad_norm": 1.326359748840332, "learning_rate": 4.196138561561742e-06, "loss": 0.1520427167415619, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.81178283691406, "rewards/margins": 29.9949951171875, "rewards/rejected": 2.822805881500244, "step": 3326 }, { "epoch": 1.7220496894409938, "grad_norm": 0.820682942867279, "learning_rate": 4.1933229231091686e-06, "loss": 0.059494227170944214, "rewards/accuracies": 0.984375, "rewards/chosen": 34.15199661254883, "rewards/margins": 30.238250732421875, "rewards/rejected": 3.910879135131836, "step": 3327 }, { "epoch": 1.722567287784679, "grad_norm": 1.597348690032959, "learning_rate": 4.190507547276131e-06, "loss": 0.22463852167129517, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.460304260253906, "rewards/margins": 25.976043701171875, "rewards/rejected": 2.4855966567993164, "step": 3328 }, { "epoch": 1.7230848861283645, "grad_norm": 1.0624182224273682, "learning_rate": 4.187692434979201e-06, "loss": 0.15488174557685852, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.02129364013672, "rewards/margins": 28.1417236328125, "rewards/rejected": 1.8799591064453125, "step": 3329 }, { "epoch": 1.7236024844720497, "grad_norm": 1.3154921531677246, "learning_rate": 4.184877587134856e-06, "loss": 0.1568962186574936, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.90375900268555, "rewards/margins": 31.852401733398438, "rewards/rejected": 4.047149658203125, "step": 3330 }, { "epoch": 1.724120082815735, "grad_norm": 0.7747669816017151, "learning_rate": 4.182063004659492e-06, "loss": 0.07622243463993073, "rewards/accuracies": 0.96875, "rewards/chosen": 35.3651123046875, "rewards/margins": 31.212112426757812, "rewards/rejected": 4.158418655395508, "step": 3331 }, { "epoch": 1.7246376811594204, "grad_norm": 0.9636688232421875, "learning_rate": 4.179248688469411e-06, "loss": 0.12523654103279114, "rewards/accuracies": 0.9375, "rewards/chosen": 32.48931884765625, "rewards/margins": 28.98870849609375, "rewards/rejected": 3.5049819946289062, "step": 3332 }, { "epoch": 1.7251552795031055, "grad_norm": 1.844786286354065, "learning_rate": 4.1764346394808415e-06, "loss": 0.17040389776229858, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.49348449707031, "rewards/margins": 32.863983154296875, "rewards/rejected": 4.628269195556641, "step": 3333 }, { "epoch": 1.725672877846791, "grad_norm": 0.8687573671340942, "learning_rate": 4.173620858609914e-06, "loss": 0.12059667706489563, "rewards/accuracies": 0.9609375, "rewards/chosen": 33.780643463134766, "rewards/margins": 29.600738525390625, "rewards/rejected": 4.181390285491943, "step": 3334 }, { "epoch": 1.7261904761904763, "grad_norm": 0.9886729121208191, "learning_rate": 4.170807346772674e-06, "loss": 0.16318903863430023, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.23794937133789, "rewards/margins": 31.499603271484375, "rewards/rejected": 3.733583450317383, "step": 3335 }, { "epoch": 1.7267080745341614, "grad_norm": 1.2128099203109741, "learning_rate": 4.167994104885081e-06, "loss": 0.18907953798770905, "rewards/accuracies": 0.921875, "rewards/chosen": 31.98984146118164, "rewards/margins": 27.89599609375, "rewards/rejected": 4.09307861328125, "step": 3336 }, { "epoch": 1.7272256728778468, "grad_norm": 2.1340253353118896, "learning_rate": 4.1651811338630084e-06, "loss": 0.16483548283576965, "rewards/accuracies": 0.921875, "rewards/chosen": 34.129817962646484, "rewards/margins": 29.872894287109375, "rewards/rejected": 4.263431549072266, "step": 3337 }, { "epoch": 1.7277432712215322, "grad_norm": 1.2731584310531616, "learning_rate": 4.162368434622238e-06, "loss": 0.16156375408172607, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.67005157470703, "rewards/margins": 32.432586669921875, "rewards/rejected": 4.239383697509766, "step": 3338 }, { "epoch": 1.7282608695652173, "grad_norm": 0.8517305850982666, "learning_rate": 4.159556008078463e-06, "loss": 0.10098541527986526, "rewards/accuracies": 0.9765625, "rewards/chosen": 37.261993408203125, "rewards/margins": 33.02191162109375, "rewards/rejected": 4.241238594055176, "step": 3339 }, { "epoch": 1.7287784679089027, "grad_norm": 0.8979634642601013, "learning_rate": 4.1567438551472905e-06, "loss": 0.12508335709571838, "rewards/accuracies": 0.9375, "rewards/chosen": 37.72929382324219, "rewards/margins": 32.5908203125, "rewards/rejected": 5.136589050292969, "step": 3340 }, { "epoch": 1.729296066252588, "grad_norm": 0.6162602305412292, "learning_rate": 4.1539319767442354e-06, "loss": 0.11109261214733124, "rewards/accuracies": 0.953125, "rewards/chosen": 38.719932556152344, "rewards/margins": 33.654693603515625, "rewards/rejected": 5.059925079345703, "step": 3341 }, { "epoch": 1.7298136645962732, "grad_norm": 0.6311054825782776, "learning_rate": 4.151120373784728e-06, "loss": 0.1299179494380951, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.07891082763672, "rewards/margins": 32.01170349121094, "rewards/rejected": 5.065878391265869, "step": 3342 }, { "epoch": 1.7303312629399588, "grad_norm": 0.7476167678833008, "learning_rate": 4.1483090471841035e-06, "loss": 0.14574284851551056, "rewards/accuracies": 0.921875, "rewards/chosen": 36.68400573730469, "rewards/margins": 31.566932678222656, "rewards/rejected": 5.1215057373046875, "step": 3343 }, { "epoch": 1.730848861283644, "grad_norm": 1.7761294841766357, "learning_rate": 4.145497997857611e-06, "loss": 0.16506975889205933, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.66516876220703, "rewards/margins": 32.859580993652344, "rewards/rejected": 4.8050079345703125, "step": 3344 }, { "epoch": 1.731366459627329, "grad_norm": 0.7006192803382874, "learning_rate": 4.142687226720407e-06, "loss": 0.11734689027070999, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.05339431762695, "rewards/margins": 34.88873291015625, "rewards/rejected": 4.162040710449219, "step": 3345 }, { "epoch": 1.7318840579710146, "grad_norm": 3.766350507736206, "learning_rate": 4.139876734687558e-06, "loss": 0.15112946927547455, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.847084045410156, "rewards/margins": 33.241668701171875, "rewards/rejected": 5.602219104766846, "step": 3346 }, { "epoch": 1.7324016563146998, "grad_norm": 0.6556831002235413, "learning_rate": 4.137066522674042e-06, "loss": 0.12253260612487793, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.83647537231445, "rewards/margins": 33.15924072265625, "rewards/rejected": 4.677036285400391, "step": 3347 }, { "epoch": 1.7329192546583851, "grad_norm": 1.1625880002975464, "learning_rate": 4.134256591594741e-06, "loss": 0.20198094844818115, "rewards/accuracies": 0.875, "rewards/chosen": 36.67658996582031, "rewards/margins": 31.532562255859375, "rewards/rejected": 5.148952484130859, "step": 3348 }, { "epoch": 1.7334368530020705, "grad_norm": 0.9235314130783081, "learning_rate": 4.13144694236445e-06, "loss": 0.12870144844055176, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.32965087890625, "rewards/margins": 28.170257568359375, "rewards/rejected": 5.164190292358398, "step": 3349 }, { "epoch": 1.7339544513457557, "grad_norm": 0.9000226259231567, "learning_rate": 4.12863757589787e-06, "loss": 0.11516504734754562, "rewards/accuracies": 0.9375, "rewards/chosen": 34.321563720703125, "rewards/margins": 30.2850341796875, "rewards/rejected": 4.032057762145996, "step": 3350 }, { "epoch": 1.734472049689441, "grad_norm": 1.2097431421279907, "learning_rate": 4.125828493109611e-06, "loss": 0.11190805584192276, "rewards/accuracies": 0.9375, "rewards/chosen": 38.72538375854492, "rewards/margins": 32.52714538574219, "rewards/rejected": 6.204054832458496, "step": 3351 }, { "epoch": 1.7349896480331264, "grad_norm": 1.998270034790039, "learning_rate": 4.12301969491419e-06, "loss": 0.20816150307655334, "rewards/accuracies": 0.9140625, "rewards/chosen": 40.19062805175781, "rewards/margins": 34.402435302734375, "rewards/rejected": 5.783565998077393, "step": 3352 }, { "epoch": 1.7355072463768115, "grad_norm": 0.9269552826881409, "learning_rate": 4.120211182226029e-06, "loss": 0.16814886033535004, "rewards/accuracies": 0.90625, "rewards/chosen": 35.77318572998047, "rewards/margins": 30.985198974609375, "rewards/rejected": 4.781264305114746, "step": 3353 }, { "epoch": 1.736024844720497, "grad_norm": 1.2405794858932495, "learning_rate": 4.117402955959465e-06, "loss": 0.20035970211029053, "rewards/accuracies": 0.8984375, "rewards/chosen": 39.08760452270508, "rewards/margins": 33.73187255859375, "rewards/rejected": 5.340877532958984, "step": 3354 }, { "epoch": 1.7365424430641823, "grad_norm": 0.8922745585441589, "learning_rate": 4.114595017028733e-06, "loss": 0.12737374007701874, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.9757194519043, "rewards/margins": 33.474639892578125, "rewards/rejected": 7.512119293212891, "step": 3355 }, { "epoch": 1.7370600414078674, "grad_norm": 1.064640760421753, "learning_rate": 4.111787366347977e-06, "loss": 0.11556670814752579, "rewards/accuracies": 0.9609375, "rewards/chosen": 39.442848205566406, "rewards/margins": 35.001068115234375, "rewards/rejected": 4.441729545593262, "step": 3356 }, { "epoch": 1.7375776397515528, "grad_norm": 1.865789771080017, "learning_rate": 4.108980004831249e-06, "loss": 0.1773415207862854, "rewards/accuracies": 0.9140625, "rewards/chosen": 36.029937744140625, "rewards/margins": 31.914886474609375, "rewards/rejected": 4.104911804199219, "step": 3357 }, { "epoch": 1.7380952380952381, "grad_norm": 0.862920343875885, "learning_rate": 4.106172933392505e-06, "loss": 0.1041266918182373, "rewards/accuracies": 0.953125, "rewards/chosen": 37.413818359375, "rewards/margins": 32.347198486328125, "rewards/rejected": 5.057773590087891, "step": 3358 }, { "epoch": 1.7386128364389233, "grad_norm": 0.9856584668159485, "learning_rate": 4.103366152945609e-06, "loss": 0.15407153964042664, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.295528411865234, "rewards/margins": 29.426895141601562, "rewards/rejected": 4.876209259033203, "step": 3359 }, { "epoch": 1.7391304347826086, "grad_norm": 1.0934034585952759, "learning_rate": 4.100559664404329e-06, "loss": 0.14313726127147675, "rewards/accuracies": 0.9375, "rewards/chosen": 33.986907958984375, "rewards/margins": 28.90768814086914, "rewards/rejected": 5.085060119628906, "step": 3360 }, { "epoch": 1.739648033126294, "grad_norm": 0.9058281779289246, "learning_rate": 4.097753468682334e-06, "loss": 0.14704188704490662, "rewards/accuracies": 0.9375, "rewards/chosen": 38.497005462646484, "rewards/margins": 33.079376220703125, "rewards/rejected": 5.4185590744018555, "step": 3361 }, { "epoch": 1.7401656314699792, "grad_norm": 1.5681750774383545, "learning_rate": 4.094947566693203e-06, "loss": 0.16237077116966248, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.83655548095703, "rewards/margins": 33.188232421875, "rewards/rejected": 4.644767761230469, "step": 3362 }, { "epoch": 1.7406832298136647, "grad_norm": 0.9870904088020325, "learning_rate": 4.0921419593504205e-06, "loss": 0.10239558666944504, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.89775085449219, "rewards/margins": 28.26690673828125, "rewards/rejected": 4.639659881591797, "step": 3363 }, { "epoch": 1.74120082815735, "grad_norm": 0.8474346995353699, "learning_rate": 4.089336647567369e-06, "loss": 0.14188390970230103, "rewards/accuracies": 0.921875, "rewards/chosen": 30.531082153320312, "rewards/margins": 26.98931884765625, "rewards/rejected": 3.533224105834961, "step": 3364 }, { "epoch": 1.741718426501035, "grad_norm": 0.7262114882469177, "learning_rate": 4.086531632257338e-06, "loss": 0.10114897787570953, "rewards/accuracies": 0.96875, "rewards/chosen": 35.44951248168945, "rewards/margins": 30.97332763671875, "rewards/rejected": 4.469478607177734, "step": 3365 }, { "epoch": 1.7422360248447206, "grad_norm": 0.93047034740448, "learning_rate": 4.083726914333521e-06, "loss": 0.09118562936782837, "rewards/accuracies": 0.953125, "rewards/chosen": 38.60982894897461, "rewards/margins": 32.423744201660156, "rewards/rejected": 6.177341461181641, "step": 3366 }, { "epoch": 1.7427536231884058, "grad_norm": 0.7422524094581604, "learning_rate": 4.080922494709016e-06, "loss": 0.09232616424560547, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.42488098144531, "rewards/margins": 33.65672302246094, "rewards/rejected": 5.770162582397461, "step": 3367 }, { "epoch": 1.7432712215320911, "grad_norm": 1.2352895736694336, "learning_rate": 4.07811837429682e-06, "loss": 0.13023290038108826, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.35414505004883, "rewards/margins": 33.10664367675781, "rewards/rejected": 5.257571220397949, "step": 3368 }, { "epoch": 1.7437888198757765, "grad_norm": 1.3464148044586182, "learning_rate": 4.075314554009835e-06, "loss": 0.13967686891555786, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.98767852783203, "rewards/margins": 29.72705078125, "rewards/rejected": 4.248640060424805, "step": 3369 }, { "epoch": 1.7443064182194616, "grad_norm": 1.1238189935684204, "learning_rate": 4.0725110347608655e-06, "loss": 0.12315268814563751, "rewards/accuracies": 0.921875, "rewards/chosen": 32.102455139160156, "rewards/margins": 27.744308471679688, "rewards/rejected": 4.358299255371094, "step": 3370 }, { "epoch": 1.744824016563147, "grad_norm": 1.9703199863433838, "learning_rate": 4.069707817462615e-06, "loss": 0.17510643601417542, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.41851806640625, "rewards/margins": 27.292510986328125, "rewards/rejected": 2.134549140930176, "step": 3371 }, { "epoch": 1.7453416149068324, "grad_norm": 0.7188339829444885, "learning_rate": 4.066904903027694e-06, "loss": 0.12008048593997955, "rewards/accuracies": 0.9375, "rewards/chosen": 33.68736267089844, "rewards/margins": 30.549667358398438, "rewards/rejected": 3.1352195739746094, "step": 3372 }, { "epoch": 1.7458592132505175, "grad_norm": 0.5839366912841797, "learning_rate": 4.064102292368612e-06, "loss": 0.10783897340297699, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.95891189575195, "rewards/margins": 34.124385833740234, "rewards/rejected": 4.831611633300781, "step": 3373 }, { "epoch": 1.7463768115942029, "grad_norm": 1.0853947401046753, "learning_rate": 4.061299986397776e-06, "loss": 0.11368462443351746, "rewards/accuracies": 0.953125, "rewards/chosen": 35.44457244873047, "rewards/margins": 31.21783447265625, "rewards/rejected": 4.2237091064453125, "step": 3374 }, { "epoch": 1.7468944099378882, "grad_norm": 0.7235104441642761, "learning_rate": 4.0584979860274975e-06, "loss": 0.10348859429359436, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.29667663574219, "rewards/margins": 32.55115509033203, "rewards/rejected": 4.7566962242126465, "step": 3375 }, { "epoch": 1.7474120082815734, "grad_norm": 1.2185367345809937, "learning_rate": 4.055696292169991e-06, "loss": 0.15229161083698273, "rewards/accuracies": 0.90625, "rewards/chosen": 34.587127685546875, "rewards/margins": 29.995311737060547, "rewards/rejected": 4.591556549072266, "step": 3376 }, { "epoch": 1.7479296066252588, "grad_norm": 2.7794957160949707, "learning_rate": 4.052894905737365e-06, "loss": 0.22697260975837708, "rewards/accuracies": 0.890625, "rewards/chosen": 34.368682861328125, "rewards/margins": 29.96367645263672, "rewards/rejected": 4.397902488708496, "step": 3377 }, { "epoch": 1.7484472049689441, "grad_norm": 1.05363929271698, "learning_rate": 4.0500938276416335e-06, "loss": 0.1120983436703682, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.39080810546875, "rewards/margins": 28.19305419921875, "rewards/rejected": 5.198590278625488, "step": 3378 }, { "epoch": 1.7489648033126293, "grad_norm": 0.669009804725647, "learning_rate": 4.047293058794704e-06, "loss": 0.11191315948963165, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.42857360839844, "rewards/margins": 33.01969909667969, "rewards/rejected": 3.4149322509765625, "step": 3379 }, { "epoch": 1.7494824016563149, "grad_norm": 1.2746057510375977, "learning_rate": 4.044492600108392e-06, "loss": 0.12898270785808563, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.74028396606445, "rewards/margins": 33.63768768310547, "rewards/rejected": 5.106317520141602, "step": 3380 }, { "epoch": 1.75, "grad_norm": 1.755170226097107, "learning_rate": 4.041692452494403e-06, "loss": 0.154526025056839, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.01824951171875, "rewards/margins": 30.7454833984375, "rewards/rejected": 6.278604507446289, "step": 3381 }, { "epoch": 1.7505175983436851, "grad_norm": 1.4830034971237183, "learning_rate": 4.038892616864346e-06, "loss": 0.13077640533447266, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.502052307128906, "rewards/margins": 34.319549560546875, "rewards/rejected": 5.175212860107422, "step": 3382 }, { "epoch": 1.7510351966873707, "grad_norm": 0.9216446876525879, "learning_rate": 4.036093094129731e-06, "loss": 0.06303967535495758, "rewards/accuracies": 0.96875, "rewards/chosen": 41.649200439453125, "rewards/margins": 37.48193359375, "rewards/rejected": 4.1751556396484375, "step": 3383 }, { "epoch": 1.7515527950310559, "grad_norm": 1.047454595565796, "learning_rate": 4.033293885201956e-06, "loss": 0.15685242414474487, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.39442443847656, "rewards/margins": 33.896942138671875, "rewards/rejected": 4.504587173461914, "step": 3384 }, { "epoch": 1.7520703933747412, "grad_norm": 2.0621769428253174, "learning_rate": 4.03049499099233e-06, "loss": 0.17193709313869476, "rewards/accuracies": 0.921875, "rewards/chosen": 33.03131103515625, "rewards/margins": 29.32360076904297, "rewards/rejected": 3.7172088623046875, "step": 3385 }, { "epoch": 1.7525879917184266, "grad_norm": 0.6873382329940796, "learning_rate": 4.0276964124120516e-06, "loss": 0.07167237997055054, "rewards/accuracies": 0.9765625, "rewards/chosen": 33.62098693847656, "rewards/margins": 29.930267333984375, "rewards/rejected": 3.6950130462646484, "step": 3386 }, { "epoch": 1.7531055900621118, "grad_norm": 1.2117425203323364, "learning_rate": 4.024898150372216e-06, "loss": 0.17574435472488403, "rewards/accuracies": 0.90625, "rewards/chosen": 34.21875, "rewards/margins": 30.285003662109375, "rewards/rejected": 3.9295902252197266, "step": 3387 }, { "epoch": 1.7536231884057971, "grad_norm": 1.0526305437088013, "learning_rate": 4.022100205783819e-06, "loss": 0.13431864976882935, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.12563705444336, "rewards/margins": 33.6383056640625, "rewards/rejected": 4.483455657958984, "step": 3388 }, { "epoch": 1.7541407867494825, "grad_norm": 1.847638726234436, "learning_rate": 4.0193025795577525e-06, "loss": 0.1986205279827118, "rewards/accuracies": 0.8828125, "rewards/chosen": 32.32939147949219, "rewards/margins": 29.076370239257812, "rewards/rejected": 3.2574539184570312, "step": 3389 }, { "epoch": 1.7546583850931676, "grad_norm": 1.8226172924041748, "learning_rate": 4.0165052726048045e-06, "loss": 0.17005836963653564, "rewards/accuracies": 0.90625, "rewards/chosen": 34.13135528564453, "rewards/margins": 29.201828002929688, "rewards/rejected": 4.931626319885254, "step": 3390 }, { "epoch": 1.755175983436853, "grad_norm": 1.5506393909454346, "learning_rate": 4.013708285835656e-06, "loss": 0.12148517370223999, "rewards/accuracies": 0.9375, "rewards/chosen": 33.37006378173828, "rewards/margins": 30.163482666015625, "rewards/rejected": 3.2093048095703125, "step": 3391 }, { "epoch": 1.7556935817805384, "grad_norm": 1.1625336408615112, "learning_rate": 4.010911620160886e-06, "loss": 0.11183912307024002, "rewards/accuracies": 0.9453125, "rewards/chosen": 42.02625274658203, "rewards/margins": 36.900482177734375, "rewards/rejected": 5.130267143249512, "step": 3392 }, { "epoch": 1.7562111801242235, "grad_norm": 7.348789691925049, "learning_rate": 4.008115276490972e-06, "loss": 0.14896593987941742, "rewards/accuracies": 0.921875, "rewards/chosen": 34.063316345214844, "rewards/margins": 30.63470458984375, "rewards/rejected": 3.4363832473754883, "step": 3393 }, { "epoch": 1.7567287784679089, "grad_norm": 1.0938841104507446, "learning_rate": 4.005319255736282e-06, "loss": 0.14081430435180664, "rewards/accuracies": 0.953125, "rewards/chosen": 34.407684326171875, "rewards/margins": 31.238937377929688, "rewards/rejected": 3.164539337158203, "step": 3394 }, { "epoch": 1.7572463768115942, "grad_norm": 2.547666549682617, "learning_rate": 4.002523558807081e-06, "loss": 0.14085137844085693, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.56182861328125, "rewards/margins": 26.486907958984375, "rewards/rejected": 2.0708541870117188, "step": 3395 }, { "epoch": 1.7577639751552794, "grad_norm": 1.2827564477920532, "learning_rate": 3.99972818661353e-06, "loss": 0.11610794812440872, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.79573059082031, "rewards/margins": 32.90234375, "rewards/rejected": 3.8889923095703125, "step": 3396 }, { "epoch": 1.758281573498965, "grad_norm": 1.681966781616211, "learning_rate": 3.996933140065678e-06, "loss": 0.12070062011480331, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.80604934692383, "rewards/margins": 31.276565551757812, "rewards/rejected": 2.528752326965332, "step": 3397 }, { "epoch": 1.75879917184265, "grad_norm": 1.4184932708740234, "learning_rate": 3.994138420073478e-06, "loss": 0.1315295398235321, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.49057388305664, "rewards/margins": 32.095428466796875, "rewards/rejected": 3.3898067474365234, "step": 3398 }, { "epoch": 1.7593167701863353, "grad_norm": 1.5130436420440674, "learning_rate": 3.991344027546767e-06, "loss": 0.11863633245229721, "rewards/accuracies": 0.953125, "rewards/chosen": 37.076534271240234, "rewards/margins": 33.33870315551758, "rewards/rejected": 3.750457763671875, "step": 3399 }, { "epoch": 1.7598343685300208, "grad_norm": 2.0772252082824707, "learning_rate": 3.988549963395284e-06, "loss": 0.196616992354393, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.111473083496094, "rewards/margins": 31.630416870117188, "rewards/rejected": 3.4779882431030273, "step": 3400 }, { "epoch": 1.760351966873706, "grad_norm": 0.810174286365509, "learning_rate": 3.985756228528651e-06, "loss": 0.16789056360721588, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.49637222290039, "rewards/margins": 29.69525146484375, "rewards/rejected": 2.8025197982788086, "step": 3401 }, { "epoch": 1.7608695652173914, "grad_norm": 0.8914860486984253, "learning_rate": 3.982962823856393e-06, "loss": 0.15060986578464508, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.64476776123047, "rewards/margins": 28.88994598388672, "rewards/rejected": 2.754606246948242, "step": 3402 }, { "epoch": 1.7613871635610767, "grad_norm": 0.954773485660553, "learning_rate": 3.980169750287923e-06, "loss": 0.11070314049720764, "rewards/accuracies": 0.9375, "rewards/chosen": 34.230018615722656, "rewards/margins": 30.99072265625, "rewards/rejected": 3.2364044189453125, "step": 3403 }, { "epoch": 1.7619047619047619, "grad_norm": 0.5529863834381104, "learning_rate": 3.977377008732544e-06, "loss": 0.06787434220314026, "rewards/accuracies": 0.9765625, "rewards/chosen": 38.763526916503906, "rewards/margins": 34.71614074707031, "rewards/rejected": 4.049537658691406, "step": 3404 }, { "epoch": 1.7624223602484472, "grad_norm": 2.351100206375122, "learning_rate": 3.974584600099452e-06, "loss": 0.18713748455047607, "rewards/accuracies": 0.8828125, "rewards/chosen": 36.119911193847656, "rewards/margins": 32.76031494140625, "rewards/rejected": 3.369034767150879, "step": 3405 }, { "epoch": 1.7629399585921326, "grad_norm": 0.9322534203529358, "learning_rate": 3.971792525297741e-06, "loss": 0.1430920660495758, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.094276428222656, "rewards/margins": 28.02289581298828, "rewards/rejected": 2.065129518508911, "step": 3406 }, { "epoch": 1.7634575569358177, "grad_norm": 0.9548754096031189, "learning_rate": 3.969000785236386e-06, "loss": 0.07654280960559845, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.033409118652344, "rewards/margins": 32.105804443359375, "rewards/rejected": 2.917276382446289, "step": 3407 }, { "epoch": 1.763975155279503, "grad_norm": 1.8321545124053955, "learning_rate": 3.9662093808242604e-06, "loss": 0.18222805857658386, "rewards/accuracies": 0.921875, "rewards/chosen": 34.002349853515625, "rewards/margins": 31.005645751953125, "rewards/rejected": 2.9959230422973633, "step": 3408 }, { "epoch": 1.7644927536231885, "grad_norm": 2.280031442642212, "learning_rate": 3.9634183129701276e-06, "loss": 0.12674172222614288, "rewards/accuracies": 0.9375, "rewards/chosen": 34.98561477661133, "rewards/margins": 31.951446533203125, "rewards/rejected": 3.0215237140655518, "step": 3409 }, { "epoch": 1.7650103519668736, "grad_norm": 0.8036179542541504, "learning_rate": 3.960627582582634e-06, "loss": 0.09538112580776215, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.02674865722656, "rewards/margins": 30.754684448242188, "rewards/rejected": 2.2760963439941406, "step": 3410 }, { "epoch": 1.765527950310559, "grad_norm": 1.1976317167282104, "learning_rate": 3.957837190570329e-06, "loss": 0.14283344149589539, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.02189636230469, "rewards/margins": 30.098876953125, "rewards/rejected": 2.922170639038086, "step": 3411 }, { "epoch": 1.7660455486542443, "grad_norm": 0.959320068359375, "learning_rate": 3.955047137841641e-06, "loss": 0.14871735870838165, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.37859344482422, "rewards/margins": 30.303955078125, "rewards/rejected": 3.0760555267333984, "step": 3412 }, { "epoch": 1.7665631469979295, "grad_norm": 0.9245523810386658, "learning_rate": 3.952257425304894e-06, "loss": 0.13828222453594208, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.94908142089844, "rewards/margins": 33.786163330078125, "rewards/rejected": 3.1760940551757812, "step": 3413 }, { "epoch": 1.7670807453416149, "grad_norm": 0.8347885608673096, "learning_rate": 3.949468053868295e-06, "loss": 0.11837805807590485, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.285888671875, "rewards/margins": 35.55828857421875, "rewards/rejected": 3.7312097549438477, "step": 3414 }, { "epoch": 1.7675983436853002, "grad_norm": 1.3787745237350464, "learning_rate": 3.946679024439948e-06, "loss": 0.14718320965766907, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.53831481933594, "rewards/margins": 34.27445983886719, "rewards/rejected": 3.2573318481445312, "step": 3415 }, { "epoch": 1.7681159420289854, "grad_norm": 1.2182592153549194, "learning_rate": 3.943890337927841e-06, "loss": 0.11413347721099854, "rewards/accuracies": 0.921875, "rewards/chosen": 37.197601318359375, "rewards/margins": 32.87998962402344, "rewards/rejected": 4.3104095458984375, "step": 3416 }, { "epoch": 1.768633540372671, "grad_norm": 1.9723763465881348, "learning_rate": 3.941101995239849e-06, "loss": 0.119423508644104, "rewards/accuracies": 0.9375, "rewards/chosen": 34.62335968017578, "rewards/margins": 30.625167846679688, "rewards/rejected": 4.001470565795898, "step": 3417 }, { "epoch": 1.769151138716356, "grad_norm": 1.5111364126205444, "learning_rate": 3.938313997283738e-06, "loss": 0.1231442540884018, "rewards/accuracies": 0.921875, "rewards/chosen": 32.251617431640625, "rewards/margins": 28.261611938476562, "rewards/rejected": 3.9820456504821777, "step": 3418 }, { "epoch": 1.7696687370600412, "grad_norm": 0.9619354009628296, "learning_rate": 3.935526344967164e-06, "loss": 0.16429467499256134, "rewards/accuracies": 0.921875, "rewards/chosen": 32.17044448852539, "rewards/margins": 28.812095642089844, "rewards/rejected": 3.3536529541015625, "step": 3419 }, { "epoch": 1.7701863354037268, "grad_norm": 1.6788729429244995, "learning_rate": 3.932739039197661e-06, "loss": 0.11843886226415634, "rewards/accuracies": 0.953125, "rewards/chosen": 37.69355773925781, "rewards/margins": 33.501739501953125, "rewards/rejected": 4.183711528778076, "step": 3420 }, { "epoch": 1.770703933747412, "grad_norm": 1.5828256607055664, "learning_rate": 3.929952080882661e-06, "loss": 0.08928488194942474, "rewards/accuracies": 0.9609375, "rewards/chosen": 36.653724670410156, "rewards/margins": 33.37335205078125, "rewards/rejected": 3.284579277038574, "step": 3421 }, { "epoch": 1.7712215320910973, "grad_norm": 0.6476175785064697, "learning_rate": 3.9271654709294795e-06, "loss": 0.09409927576780319, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.413448333740234, "rewards/margins": 34.203033447265625, "rewards/rejected": 3.226858139038086, "step": 3422 }, { "epoch": 1.7717391304347827, "grad_norm": 0.9764599204063416, "learning_rate": 3.9243792102453124e-06, "loss": 0.13476043939590454, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.07733154296875, "rewards/margins": 30.89226531982422, "rewards/rejected": 3.1889164447784424, "step": 3423 }, { "epoch": 1.7722567287784678, "grad_norm": 1.6362870931625366, "learning_rate": 3.92159329973725e-06, "loss": 0.1886727213859558, "rewards/accuracies": 0.890625, "rewards/chosen": 35.09659957885742, "rewards/margins": 31.854217529296875, "rewards/rejected": 3.2489089965820312, "step": 3424 }, { "epoch": 1.7727743271221532, "grad_norm": 0.7573040127754211, "learning_rate": 3.918807740312267e-06, "loss": 0.11366390436887741, "rewards/accuracies": 0.953125, "rewards/chosen": 38.336639404296875, "rewards/margins": 33.561370849609375, "rewards/rejected": 4.769264221191406, "step": 3425 }, { "epoch": 1.7732919254658386, "grad_norm": 0.6514630913734436, "learning_rate": 3.9160225328772215e-06, "loss": 0.11257994920015335, "rewards/accuracies": 0.921875, "rewards/chosen": 36.14384460449219, "rewards/margins": 33.884033203125, "rewards/rejected": 2.262571334838867, "step": 3426 }, { "epoch": 1.7738095238095237, "grad_norm": 0.912986695766449, "learning_rate": 3.9132376783388544e-06, "loss": 0.1172524243593216, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.27301025390625, "rewards/margins": 33.99906921386719, "rewards/rejected": 3.2648048400878906, "step": 3427 }, { "epoch": 1.774327122153209, "grad_norm": 1.4168568849563599, "learning_rate": 3.910453177603799e-06, "loss": 0.15476927161216736, "rewards/accuracies": 0.921875, "rewards/chosen": 35.06818389892578, "rewards/margins": 31.764358520507812, "rewards/rejected": 3.294849395751953, "step": 3428 }, { "epoch": 1.7748447204968945, "grad_norm": 1.0037713050842285, "learning_rate": 3.90766903157857e-06, "loss": 0.16671720147132874, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.472232818603516, "rewards/margins": 31.98244857788086, "rewards/rejected": 4.498918533325195, "step": 3429 }, { "epoch": 1.7753623188405796, "grad_norm": 1.9762946367263794, "learning_rate": 3.904885241169564e-06, "loss": 0.09963171929121017, "rewards/accuracies": 0.9765625, "rewards/chosen": 34.05066680908203, "rewards/margins": 30.68792724609375, "rewards/rejected": 3.3661539554595947, "step": 3430 }, { "epoch": 1.775879917184265, "grad_norm": 1.0840362310409546, "learning_rate": 3.902101807283062e-06, "loss": 0.2177806794643402, "rewards/accuracies": 0.90625, "rewards/chosen": 32.430450439453125, "rewards/margins": 29.090591430664062, "rewards/rejected": 3.332489013671875, "step": 3431 }, { "epoch": 1.7763975155279503, "grad_norm": 1.7742618322372437, "learning_rate": 3.899318730825239e-06, "loss": 0.14625361561775208, "rewards/accuracies": 0.921875, "rewards/chosen": 37.010101318359375, "rewards/margins": 32.56391906738281, "rewards/rejected": 4.437810897827148, "step": 3432 }, { "epoch": 1.7769151138716355, "grad_norm": 0.8666725158691406, "learning_rate": 3.896536012702137e-06, "loss": 0.14514566957950592, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.73365020751953, "rewards/margins": 31.995712280273438, "rewards/rejected": 2.7321853637695312, "step": 3433 }, { "epoch": 1.777432712215321, "grad_norm": 4.19521951675415, "learning_rate": 3.893753653819693e-06, "loss": 0.20495513081550598, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.42048645019531, "rewards/margins": 33.80149841308594, "rewards/rejected": 4.619010925292969, "step": 3434 }, { "epoch": 1.7779503105590062, "grad_norm": 2.954071283340454, "learning_rate": 3.890971655083724e-06, "loss": 0.13773223757743835, "rewards/accuracies": 0.9609375, "rewards/chosen": 42.68268585205078, "rewards/margins": 37.568634033203125, "rewards/rejected": 5.115123748779297, "step": 3435 }, { "epoch": 1.7784679089026914, "grad_norm": 1.2043836116790771, "learning_rate": 3.888190017399932e-06, "loss": 0.10225240886211395, "rewards/accuracies": 0.953125, "rewards/chosen": 39.02166748046875, "rewards/margins": 35.64787292480469, "rewards/rejected": 3.378795623779297, "step": 3436 }, { "epoch": 1.778985507246377, "grad_norm": 3.3377504348754883, "learning_rate": 3.885408741673896e-06, "loss": 0.11508287489414215, "rewards/accuracies": 0.953125, "rewards/chosen": 37.621036529541016, "rewards/margins": 34.29936599731445, "rewards/rejected": 3.3239002227783203, "step": 3437 }, { "epoch": 1.779503105590062, "grad_norm": 1.0232247114181519, "learning_rate": 3.882627828811083e-06, "loss": 0.13724136352539062, "rewards/accuracies": 0.9296875, "rewards/chosen": 43.556522369384766, "rewards/margins": 39.324798583984375, "rewards/rejected": 4.234283447265625, "step": 3438 }, { "epoch": 1.7800207039337475, "grad_norm": 0.8573025465011597, "learning_rate": 3.879847279716837e-06, "loss": 0.13399598002433777, "rewards/accuracies": 0.953125, "rewards/chosen": 39.00482177734375, "rewards/margins": 34.04246520996094, "rewards/rejected": 4.964130401611328, "step": 3439 }, { "epoch": 1.7805383022774328, "grad_norm": 1.2876336574554443, "learning_rate": 3.877067095296386e-06, "loss": 0.12943479418754578, "rewards/accuracies": 0.953125, "rewards/chosen": 32.44670867919922, "rewards/margins": 30.1019287109375, "rewards/rejected": 2.3401308059692383, "step": 3440 }, { "epoch": 1.781055900621118, "grad_norm": 1.032165765762329, "learning_rate": 3.87428727645484e-06, "loss": 0.10573548078536987, "rewards/accuracies": 0.953125, "rewards/chosen": 37.106597900390625, "rewards/margins": 33.27081298828125, "rewards/rejected": 3.840311050415039, "step": 3441 }, { "epoch": 1.7815734989648033, "grad_norm": 2.109611749649048, "learning_rate": 3.871507824097191e-06, "loss": 0.1738424450159073, "rewards/accuracies": 0.953125, "rewards/chosen": 37.21598815917969, "rewards/margins": 34.71343994140625, "rewards/rejected": 2.497955322265625, "step": 3442 }, { "epoch": 1.7820910973084887, "grad_norm": 1.7078437805175781, "learning_rate": 3.868728739128305e-06, "loss": 0.16757777333259583, "rewards/accuracies": 0.9375, "rewards/chosen": 37.58872985839844, "rewards/margins": 33.634521484375, "rewards/rejected": 3.9487247467041016, "step": 3443 }, { "epoch": 1.7826086956521738, "grad_norm": 1.1574777364730835, "learning_rate": 3.865950022452936e-06, "loss": 0.1437995284795761, "rewards/accuracies": 0.9453125, "rewards/chosen": 41.1311149597168, "rewards/margins": 36.81834411621094, "rewards/rejected": 4.314711570739746, "step": 3444 }, { "epoch": 1.7831262939958592, "grad_norm": 1.2496531009674072, "learning_rate": 3.863171674975717e-06, "loss": 0.12651965022087097, "rewards/accuracies": 0.953125, "rewards/chosen": 34.50639343261719, "rewards/margins": 30.192596435546875, "rewards/rejected": 4.312164306640625, "step": 3445 }, { "epoch": 1.7836438923395446, "grad_norm": 0.9811268448829651, "learning_rate": 3.860393697601154e-06, "loss": 0.14879325032234192, "rewards/accuracies": 0.921875, "rewards/chosen": 47.06049346923828, "rewards/margins": 41.451568603515625, "rewards/rejected": 5.602802276611328, "step": 3446 }, { "epoch": 1.7841614906832297, "grad_norm": 1.2212090492248535, "learning_rate": 3.857616091233642e-06, "loss": 0.16555911302566528, "rewards/accuracies": 0.890625, "rewards/chosen": 35.27576446533203, "rewards/margins": 31.364700317382812, "rewards/rejected": 3.912731170654297, "step": 3447 }, { "epoch": 1.784679089026915, "grad_norm": 3.1602964401245117, "learning_rate": 3.8548388567774495e-06, "loss": 0.12787467241287231, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.65443801879883, "rewards/margins": 34.64045715332031, "rewards/rejected": 4.021595001220703, "step": 3448 }, { "epoch": 1.7851966873706004, "grad_norm": 0.807637095451355, "learning_rate": 3.8520619951367275e-06, "loss": 0.09114834666252136, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.51152038574219, "rewards/margins": 34.32844543457031, "rewards/rejected": 3.1762847900390625, "step": 3449 }, { "epoch": 1.7857142857142856, "grad_norm": 1.2059108018875122, "learning_rate": 3.849285507215497e-06, "loss": 0.1752786934375763, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.272613525390625, "rewards/margins": 32.362037658691406, "rewards/rejected": 3.9215545654296875, "step": 3450 }, { "epoch": 1.7862318840579712, "grad_norm": 3.015075922012329, "learning_rate": 3.846509393917669e-06, "loss": 0.12937216460704803, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.19557189941406, "rewards/margins": 36.53623962402344, "rewards/rejected": 3.661914825439453, "step": 3451 }, { "epoch": 1.7867494824016563, "grad_norm": 1.1635992527008057, "learning_rate": 3.843733656147027e-06, "loss": 0.1721891462802887, "rewards/accuracies": 0.9375, "rewards/chosen": 39.96544647216797, "rewards/margins": 35.653076171875, "rewards/rejected": 4.3120269775390625, "step": 3452 }, { "epoch": 1.7872670807453415, "grad_norm": 1.2604879140853882, "learning_rate": 3.840958294807229e-06, "loss": 0.1456245481967926, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.44467544555664, "rewards/margins": 36.32060241699219, "rewards/rejected": 4.137246608734131, "step": 3453 }, { "epoch": 1.787784679089027, "grad_norm": 1.7942748069763184, "learning_rate": 3.838183310801818e-06, "loss": 0.16416534781455994, "rewards/accuracies": 0.9375, "rewards/chosen": 40.28966522216797, "rewards/margins": 35.99525451660156, "rewards/rejected": 4.28228759765625, "step": 3454 }, { "epoch": 1.7883022774327122, "grad_norm": 0.9042367935180664, "learning_rate": 3.8354087050342086e-06, "loss": 0.16798453032970428, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.26263427734375, "rewards/margins": 34.57451629638672, "rewards/rejected": 2.6911964416503906, "step": 3455 }, { "epoch": 1.7888198757763976, "grad_norm": 4.18600606918335, "learning_rate": 3.8326344784076925e-06, "loss": 0.14497335255146027, "rewards/accuracies": 0.9609375, "rewards/chosen": 42.65885925292969, "rewards/margins": 37.97515869140625, "rewards/rejected": 4.678595542907715, "step": 3456 }, { "epoch": 1.789337474120083, "grad_norm": 2.573812246322632, "learning_rate": 3.829860631825438e-06, "loss": 0.18867796659469604, "rewards/accuracies": 0.9375, "rewards/chosen": 41.83106231689453, "rewards/margins": 37.827728271484375, "rewards/rejected": 4.009012699127197, "step": 3457 }, { "epoch": 1.789855072463768, "grad_norm": 0.951276957988739, "learning_rate": 3.827087166190495e-06, "loss": 0.15284210443496704, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.777469635009766, "rewards/margins": 36.08161926269531, "rewards/rejected": 2.6909379959106445, "step": 3458 }, { "epoch": 1.7903726708074534, "grad_norm": 1.1128615140914917, "learning_rate": 3.824314082405785e-06, "loss": 0.14073187112808228, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.91679382324219, "rewards/margins": 34.50978088378906, "rewards/rejected": 4.400201797485352, "step": 3459 }, { "epoch": 1.7908902691511388, "grad_norm": 1.1991856098175049, "learning_rate": 3.8215413813741e-06, "loss": 0.155767023563385, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.567623138427734, "rewards/margins": 37.002037048339844, "rewards/rejected": 2.559821128845215, "step": 3460 }, { "epoch": 1.791407867494824, "grad_norm": 1.6698826551437378, "learning_rate": 3.818769063998117e-06, "loss": 0.14632737636566162, "rewards/accuracies": 0.9375, "rewards/chosen": 35.15053176879883, "rewards/margins": 32.70688247680664, "rewards/rejected": 2.433187246322632, "step": 3461 }, { "epoch": 1.7919254658385093, "grad_norm": 1.5432066917419434, "learning_rate": 3.815997131180386e-06, "loss": 0.18066348135471344, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.79779052734375, "rewards/margins": 32.617645263671875, "rewards/rejected": 2.1775150299072266, "step": 3462 }, { "epoch": 1.7924430641821947, "grad_norm": 0.8558304905891418, "learning_rate": 3.813225583823324e-06, "loss": 0.08790284395217896, "rewards/accuracies": 0.9609375, "rewards/chosen": 42.457008361816406, "rewards/margins": 38.883750915527344, "rewards/rejected": 3.570859432220459, "step": 3463 }, { "epoch": 1.7929606625258798, "grad_norm": 0.8296774625778198, "learning_rate": 3.8104544228292324e-06, "loss": 0.1359008550643921, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.17839813232422, "rewards/margins": 32.606689453125, "rewards/rejected": 1.567317247390747, "step": 3464 }, { "epoch": 1.7934782608695652, "grad_norm": 0.6741753220558167, "learning_rate": 3.807683649100282e-06, "loss": 0.11039768159389496, "rewards/accuracies": 0.953125, "rewards/chosen": 35.46877670288086, "rewards/margins": 33.975006103515625, "rewards/rejected": 1.4903450012207031, "step": 3465 }, { "epoch": 1.7939958592132506, "grad_norm": 1.187504768371582, "learning_rate": 3.8049132635385157e-06, "loss": 0.1659807562828064, "rewards/accuracies": 0.9375, "rewards/chosen": 38.401100158691406, "rewards/margins": 34.38386535644531, "rewards/rejected": 4.017301559448242, "step": 3466 }, { "epoch": 1.7945134575569357, "grad_norm": 3.5280628204345703, "learning_rate": 3.8021432670458553e-06, "loss": 0.2490755319595337, "rewards/accuracies": 0.9296875, "rewards/chosen": 37.86076354980469, "rewards/margins": 35.439369201660156, "rewards/rejected": 2.421018600463867, "step": 3467 }, { "epoch": 1.795031055900621, "grad_norm": 1.091814398765564, "learning_rate": 3.7993736605240933e-06, "loss": 0.09524238109588623, "rewards/accuracies": 0.9609375, "rewards/chosen": 34.31207275390625, "rewards/margins": 32.798736572265625, "rewards/rejected": 1.5153350830078125, "step": 3468 }, { "epoch": 1.7955486542443064, "grad_norm": 1.1620591878890991, "learning_rate": 3.796604444874893e-06, "loss": 0.15009045600891113, "rewards/accuracies": 0.921875, "rewards/chosen": 37.377052307128906, "rewards/margins": 35.34577941894531, "rewards/rejected": 2.0345096588134766, "step": 3469 }, { "epoch": 1.7960662525879916, "grad_norm": 0.7876095175743103, "learning_rate": 3.793835620999793e-06, "loss": 0.10883168131113052, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.988643646240234, "rewards/margins": 33.49711608886719, "rewards/rejected": 1.4939746856689453, "step": 3470 }, { "epoch": 1.7965838509316772, "grad_norm": 1.6352444887161255, "learning_rate": 3.791067189800205e-06, "loss": 0.1433459222316742, "rewards/accuracies": 0.9140625, "rewards/chosen": 41.127384185791016, "rewards/margins": 37.478607177734375, "rewards/rejected": 3.659792900085449, "step": 3471 }, { "epoch": 1.7971014492753623, "grad_norm": 1.7143436670303345, "learning_rate": 3.788299152177412e-06, "loss": 0.12390929460525513, "rewards/accuracies": 0.9375, "rewards/chosen": 34.071746826171875, "rewards/margins": 32.5975341796875, "rewards/rejected": 1.471731185913086, "step": 3472 }, { "epoch": 1.7976190476190477, "grad_norm": 0.8724899291992188, "learning_rate": 3.7855315090325674e-06, "loss": 0.0882592648267746, "rewards/accuracies": 0.9765625, "rewards/chosen": 36.171104431152344, "rewards/margins": 34.483154296875, "rewards/rejected": 1.6917896270751953, "step": 3473 }, { "epoch": 1.798136645962733, "grad_norm": 1.111922264099121, "learning_rate": 3.782764261266696e-06, "loss": 0.13583730161190033, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.736778259277344, "rewards/margins": 34.833351135253906, "rewards/rejected": 1.8985824584960938, "step": 3474 }, { "epoch": 1.7986542443064182, "grad_norm": 2.2682316303253174, "learning_rate": 3.7799974097807e-06, "loss": 0.19598616659641266, "rewards/accuracies": 0.921875, "rewards/chosen": 37.092674255371094, "rewards/margins": 33.994781494140625, "rewards/rejected": 3.103527069091797, "step": 3475 }, { "epoch": 1.7991718426501035, "grad_norm": 0.7347557544708252, "learning_rate": 3.777230955475343e-06, "loss": 0.13614149391651154, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.345062255859375, "rewards/margins": 38.406471252441406, "rewards/rejected": 1.9335484504699707, "step": 3476 }, { "epoch": 1.799689440993789, "grad_norm": 0.9320292472839355, "learning_rate": 3.7744648992512654e-06, "loss": 0.11228647828102112, "rewards/accuracies": 0.96875, "rewards/chosen": 34.41094970703125, "rewards/margins": 32.633140563964844, "rewards/rejected": 1.77651047706604, "step": 3477 }, { "epoch": 1.800207039337474, "grad_norm": 1.0572034120559692, "learning_rate": 3.7716992420089792e-06, "loss": 0.16847321391105652, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.580467224121094, "rewards/margins": 32.32684326171875, "rewards/rejected": 1.2662067413330078, "step": 3478 }, { "epoch": 1.8007246376811594, "grad_norm": 1.3871268033981323, "learning_rate": 3.7689339846488616e-06, "loss": 0.12615270912647247, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.086822509765625, "rewards/margins": 34.15342330932617, "rewards/rejected": 1.934661865234375, "step": 3479 }, { "epoch": 1.8012422360248448, "grad_norm": 2.166269063949585, "learning_rate": 3.766169128071161e-06, "loss": 0.14540702104568481, "rewards/accuracies": 0.90625, "rewards/chosen": 34.25453186035156, "rewards/margins": 32.438018798828125, "rewards/rejected": 1.8161563873291016, "step": 3480 }, { "epoch": 1.80175983436853, "grad_norm": 1.1129615306854248, "learning_rate": 3.763404673176e-06, "loss": 0.10746422410011292, "rewards/accuracies": 0.9375, "rewards/chosen": 34.79640579223633, "rewards/margins": 32.77467346191406, "rewards/rejected": 2.0227813720703125, "step": 3481 }, { "epoch": 1.8022774327122153, "grad_norm": 0.7384742498397827, "learning_rate": 3.760640620863365e-06, "loss": 0.0873388797044754, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.69181823730469, "rewards/margins": 35.084388732910156, "rewards/rejected": 0.6035919189453125, "step": 3482 }, { "epoch": 1.8027950310559007, "grad_norm": 1.1452195644378662, "learning_rate": 3.7578769720331113e-06, "loss": 0.13585850596427917, "rewards/accuracies": 0.921875, "rewards/chosen": 37.97538757324219, "rewards/margins": 35.19976806640625, "rewards/rejected": 2.7754178047180176, "step": 3483 }, { "epoch": 1.8033126293995858, "grad_norm": 0.9206100106239319, "learning_rate": 3.755113727584968e-06, "loss": 0.12283775210380554, "rewards/accuracies": 0.921875, "rewards/chosen": 38.294761657714844, "rewards/margins": 36.017974853515625, "rewards/rejected": 2.2784957885742188, "step": 3484 }, { "epoch": 1.8038302277432712, "grad_norm": 1.0076420307159424, "learning_rate": 3.7523508884185283e-06, "loss": 0.09760183095932007, "rewards/accuracies": 0.953125, "rewards/chosen": 37.77738952636719, "rewards/margins": 35.9945068359375, "rewards/rejected": 1.779129981994629, "step": 3485 }, { "epoch": 1.8043478260869565, "grad_norm": 5.0548624992370605, "learning_rate": 3.749588455433253e-06, "loss": 0.17491650581359863, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.890106201171875, "rewards/margins": 37.42823028564453, "rewards/rejected": 3.4484596252441406, "step": 3486 }, { "epoch": 1.8048654244306417, "grad_norm": 1.2709169387817383, "learning_rate": 3.746826429528472e-06, "loss": 0.11069737374782562, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.785369873046875, "rewards/margins": 36.193809509277344, "rewards/rejected": 2.5955147743225098, "step": 3487 }, { "epoch": 1.8053830227743273, "grad_norm": 3.6793463230133057, "learning_rate": 3.7440648116033863e-06, "loss": 0.18842032551765442, "rewards/accuracies": 0.890625, "rewards/chosen": 40.30986785888672, "rewards/margins": 36.77473449707031, "rewards/rejected": 3.5371131896972656, "step": 3488 }, { "epoch": 1.8059006211180124, "grad_norm": 1.4710627794265747, "learning_rate": 3.741303602557055e-06, "loss": 0.15567883849143982, "rewards/accuracies": 0.90625, "rewards/chosen": 34.76750564575195, "rewards/margins": 33.441375732421875, "rewards/rejected": 1.3277397155761719, "step": 3489 }, { "epoch": 1.8064182194616976, "grad_norm": 1.6214101314544678, "learning_rate": 3.738542803288414e-06, "loss": 0.15768399834632874, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.558433532714844, "rewards/margins": 33.326690673828125, "rewards/rejected": 2.236128807067871, "step": 3490 }, { "epoch": 1.8069358178053831, "grad_norm": 0.8878612518310547, "learning_rate": 3.7357824146962605e-06, "loss": 0.14048507809638977, "rewards/accuracies": 0.890625, "rewards/chosen": 38.11090087890625, "rewards/margins": 35.926025390625, "rewards/rejected": 2.180299758911133, "step": 3491 }, { "epoch": 1.8074534161490683, "grad_norm": 0.9391139149665833, "learning_rate": 3.733022437679258e-06, "loss": 0.132765531539917, "rewards/accuracies": 0.9375, "rewards/chosen": 38.207252502441406, "rewards/margins": 35.63319396972656, "rewards/rejected": 2.5660457611083984, "step": 3492 }, { "epoch": 1.8079710144927537, "grad_norm": 2.6609272956848145, "learning_rate": 3.7302628731359357e-06, "loss": 0.12823417782783508, "rewards/accuracies": 0.9453125, "rewards/chosen": 42.241058349609375, "rewards/margins": 38.83891296386719, "rewards/rejected": 3.396984100341797, "step": 3493 }, { "epoch": 1.808488612836439, "grad_norm": 3.0025508403778076, "learning_rate": 3.7275037219646927e-06, "loss": 0.20974507927894592, "rewards/accuracies": 0.8984375, "rewards/chosen": 36.780094146728516, "rewards/margins": 34.008079528808594, "rewards/rejected": 2.768756866455078, "step": 3494 }, { "epoch": 1.8090062111801242, "grad_norm": 2.872256278991699, "learning_rate": 3.7247449850637898e-06, "loss": 0.16607336699962616, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.88772964477539, "rewards/margins": 32.53662109375, "rewards/rejected": 2.332172393798828, "step": 3495 }, { "epoch": 1.8095238095238095, "grad_norm": 1.351343035697937, "learning_rate": 3.7219866633313505e-06, "loss": 0.1392885446548462, "rewards/accuracies": 0.9375, "rewards/chosen": 36.10643768310547, "rewards/margins": 32.550140380859375, "rewards/rejected": 3.542710304260254, "step": 3496 }, { "epoch": 1.810041407867495, "grad_norm": 3.3842391967773438, "learning_rate": 3.7192287576653706e-06, "loss": 0.18515567481517792, "rewards/accuracies": 0.9140625, "rewards/chosen": 32.54317855834961, "rewards/margins": 30.232086181640625, "rewards/rejected": 2.3112735748291016, "step": 3497 }, { "epoch": 1.81055900621118, "grad_norm": 1.1045836210250854, "learning_rate": 3.716471268963706e-06, "loss": 0.1495460867881775, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.80702590942383, "rewards/margins": 33.46825408935547, "rewards/rejected": 2.3329696655273438, "step": 3498 }, { "epoch": 1.8110766045548654, "grad_norm": 1.0102492570877075, "learning_rate": 3.7137141981240736e-06, "loss": 0.1305023580789566, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.80535888671875, "rewards/margins": 31.453414916992188, "rewards/rejected": 2.357635498046875, "step": 3499 }, { "epoch": 1.8115942028985508, "grad_norm": 1.0978026390075684, "learning_rate": 3.710957546044059e-06, "loss": 0.15953953564167023, "rewards/accuracies": 0.921875, "rewards/chosen": 31.73761558532715, "rewards/margins": 29.1578369140625, "rewards/rejected": 2.5838117599487305, "step": 3500 }, { "epoch": 1.812111801242236, "grad_norm": 1.3625199794769287, "learning_rate": 3.7082013136211135e-06, "loss": 0.13629873096942902, "rewards/accuracies": 0.953125, "rewards/chosen": 36.63056945800781, "rewards/margins": 33.42872619628906, "rewards/rejected": 3.202116012573242, "step": 3501 }, { "epoch": 1.8126293995859213, "grad_norm": 0.9335705041885376, "learning_rate": 3.705445501752543e-06, "loss": 0.1239398792386055, "rewards/accuracies": 0.9375, "rewards/chosen": 33.01052474975586, "rewards/margins": 30.82361602783203, "rewards/rejected": 2.1795148849487305, "step": 3502 }, { "epoch": 1.8131469979296067, "grad_norm": 1.4112292528152466, "learning_rate": 3.702690111335526e-06, "loss": 0.10941925644874573, "rewards/accuracies": 0.9375, "rewards/chosen": 32.5302734375, "rewards/margins": 30.908187866210938, "rewards/rejected": 1.6251373291015625, "step": 3503 }, { "epoch": 1.8136645962732918, "grad_norm": 0.8234710693359375, "learning_rate": 3.6999351432670966e-06, "loss": 0.11096266657114029, "rewards/accuracies": 0.9765625, "rewards/chosen": 33.0333251953125, "rewards/margins": 30.611099243164062, "rewards/rejected": 2.415127754211426, "step": 3504 }, { "epoch": 1.8141821946169774, "grad_norm": 1.082127332687378, "learning_rate": 3.69718059844416e-06, "loss": 0.1641106903553009, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.852792739868164, "rewards/margins": 27.621143341064453, "rewards/rejected": 2.240936279296875, "step": 3505 }, { "epoch": 1.8146997929606625, "grad_norm": 1.2202603816986084, "learning_rate": 3.694426477763471e-06, "loss": 0.1562444120645523, "rewards/accuracies": 0.921875, "rewards/chosen": 30.96476936340332, "rewards/margins": 28.65911865234375, "rewards/rejected": 2.3038368225097656, "step": 3506 }, { "epoch": 1.8152173913043477, "grad_norm": 1.0252960920333862, "learning_rate": 3.691672782121658e-06, "loss": 0.1301504373550415, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.530677795410156, "rewards/margins": 32.754486083984375, "rewards/rejected": 2.7825355529785156, "step": 3507 }, { "epoch": 1.8157349896480333, "grad_norm": 1.6536647081375122, "learning_rate": 3.6889195124152054e-06, "loss": 0.19294431805610657, "rewards/accuracies": 0.921875, "rewards/chosen": 31.588239669799805, "rewards/margins": 29.456581115722656, "rewards/rejected": 2.128040313720703, "step": 3508 }, { "epoch": 1.8162525879917184, "grad_norm": 0.9253339767456055, "learning_rate": 3.68616666954046e-06, "loss": 0.13604050874710083, "rewards/accuracies": 0.9375, "rewards/chosen": 35.38698196411133, "rewards/margins": 33.0462646484375, "rewards/rejected": 2.336817502975464, "step": 3509 }, { "epoch": 1.8167701863354038, "grad_norm": 0.7895592451095581, "learning_rate": 3.6834142543936285e-06, "loss": 0.11570778489112854, "rewards/accuracies": 0.9609375, "rewards/chosen": 34.43439483642578, "rewards/margins": 31.44898223876953, "rewards/rejected": 2.989875316619873, "step": 3510 }, { "epoch": 1.8172877846790891, "grad_norm": 1.0477925539016724, "learning_rate": 3.6806622678707827e-06, "loss": 0.1666618436574936, "rewards/accuracies": 0.90625, "rewards/chosen": 36.41783905029297, "rewards/margins": 33.394866943359375, "rewards/rejected": 3.0442028045654297, "step": 3511 }, { "epoch": 1.8178053830227743, "grad_norm": 0.7311462759971619, "learning_rate": 3.677910710867848e-06, "loss": 0.11839501559734344, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.167579650878906, "rewards/margins": 34.509429931640625, "rewards/rejected": 2.658374786376953, "step": 3512 }, { "epoch": 1.8183229813664596, "grad_norm": 1.002298355102539, "learning_rate": 3.6751595842806143e-06, "loss": 0.15151898562908173, "rewards/accuracies": 0.921875, "rewards/chosen": 34.16111755371094, "rewards/margins": 31.48443603515625, "rewards/rejected": 2.6834282875061035, "step": 3513 }, { "epoch": 1.818840579710145, "grad_norm": 0.8968423008918762, "learning_rate": 3.672408889004735e-06, "loss": 0.09977398812770844, "rewards/accuracies": 0.953125, "rewards/chosen": 35.31946563720703, "rewards/margins": 33.26536560058594, "rewards/rejected": 2.052793502807617, "step": 3514 }, { "epoch": 1.8193581780538302, "grad_norm": 0.76719069480896, "learning_rate": 3.6696586259357113e-06, "loss": 0.10872921347618103, "rewards/accuracies": 0.96875, "rewards/chosen": 36.489349365234375, "rewards/margins": 33.33538818359375, "rewards/rejected": 3.140216827392578, "step": 3515 }, { "epoch": 1.8198757763975155, "grad_norm": 3.1807284355163574, "learning_rate": 3.6669087959689174e-06, "loss": 0.17882958054542542, "rewards/accuracies": 0.921875, "rewards/chosen": 33.365875244140625, "rewards/margins": 30.914398193359375, "rewards/rejected": 2.453639030456543, "step": 3516 }, { "epoch": 1.8203933747412009, "grad_norm": 0.8294086456298828, "learning_rate": 3.664159399999576e-06, "loss": 0.12378998100757599, "rewards/accuracies": 0.9375, "rewards/chosen": 36.921661376953125, "rewards/margins": 33.9254150390625, "rewards/rejected": 2.993898391723633, "step": 3517 }, { "epoch": 1.820910973084886, "grad_norm": 1.474840760231018, "learning_rate": 3.661410438922779e-06, "loss": 0.21407896280288696, "rewards/accuracies": 0.90625, "rewards/chosen": 30.24293327331543, "rewards/margins": 27.284896850585938, "rewards/rejected": 2.9521141052246094, "step": 3518 }, { "epoch": 1.8214285714285714, "grad_norm": 2.5347139835357666, "learning_rate": 3.658661913633463e-06, "loss": 0.15284496545791626, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.82536697387695, "rewards/margins": 33.17982482910156, "rewards/rejected": 2.6418800354003906, "step": 3519 }, { "epoch": 1.8219461697722568, "grad_norm": 0.9851306080818176, "learning_rate": 3.6559138250264343e-06, "loss": 0.12624157965183258, "rewards/accuracies": 0.9375, "rewards/chosen": 31.031269073486328, "rewards/margins": 28.233901977539062, "rewards/rejected": 2.7902297973632812, "step": 3520 }, { "epoch": 1.822463768115942, "grad_norm": 0.864142894744873, "learning_rate": 3.653166173996353e-06, "loss": 0.11303985863924026, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.924312591552734, "rewards/margins": 33.142120361328125, "rewards/rejected": 2.7822208404541016, "step": 3521 }, { "epoch": 1.8229813664596275, "grad_norm": 1.360439658164978, "learning_rate": 3.650418961437736e-06, "loss": 0.18655309081077576, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.09766387939453, "rewards/margins": 29.884735107421875, "rewards/rejected": 2.2139058113098145, "step": 3522 }, { "epoch": 1.8234989648033126, "grad_norm": 0.7521038055419922, "learning_rate": 3.647672188244956e-06, "loss": 0.14595352113246918, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.52751922607422, "rewards/margins": 31.889968872070312, "rewards/rejected": 1.6394500732421875, "step": 3523 }, { "epoch": 1.8240165631469978, "grad_norm": 1.0822736024856567, "learning_rate": 3.64492585531225e-06, "loss": 0.17251019179821014, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.16655731201172, "rewards/margins": 26.791889190673828, "rewards/rejected": 1.373906135559082, "step": 3524 }, { "epoch": 1.8245341614906834, "grad_norm": 0.5532451868057251, "learning_rate": 3.642179963533702e-06, "loss": 0.09065846353769302, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.620121002197266, "rewards/margins": 28.861068725585938, "rewards/rejected": 1.7521313428878784, "step": 3525 }, { "epoch": 1.8250517598343685, "grad_norm": 3.4283976554870605, "learning_rate": 3.639434513803257e-06, "loss": 0.1593528538942337, "rewards/accuracies": 0.953125, "rewards/chosen": 35.416107177734375, "rewards/margins": 32.747962951660156, "rewards/rejected": 2.6676883697509766, "step": 3526 }, { "epoch": 1.8255693581780539, "grad_norm": 1.1547926664352417, "learning_rate": 3.6366895070147214e-06, "loss": 0.1924057900905609, "rewards/accuracies": 0.890625, "rewards/chosen": 34.035980224609375, "rewards/margins": 31.384963989257812, "rewards/rejected": 2.638385772705078, "step": 3527 }, { "epoch": 1.8260869565217392, "grad_norm": 1.4838588237762451, "learning_rate": 3.6339449440617437e-06, "loss": 0.22096017003059387, "rewards/accuracies": 0.859375, "rewards/chosen": 29.693328857421875, "rewards/margins": 27.358169555664062, "rewards/rejected": 2.327646255493164, "step": 3528 }, { "epoch": 1.8266045548654244, "grad_norm": 0.887730062007904, "learning_rate": 3.6312008258378414e-06, "loss": 0.13810324668884277, "rewards/accuracies": 0.921875, "rewards/chosen": 30.10392189025879, "rewards/margins": 28.45916748046875, "rewards/rejected": 1.6400878429412842, "step": 3529 }, { "epoch": 1.8271221532091098, "grad_norm": 1.007409930229187, "learning_rate": 3.6284571532363798e-06, "loss": 0.1078108474612236, "rewards/accuracies": 0.953125, "rewards/chosen": 33.5980224609375, "rewards/margins": 31.246429443359375, "rewards/rejected": 2.349956512451172, "step": 3530 }, { "epoch": 1.8276397515527951, "grad_norm": 0.6550233960151672, "learning_rate": 3.625713927150586e-06, "loss": 0.09080716967582703, "rewards/accuracies": 0.984375, "rewards/chosen": 34.630165100097656, "rewards/margins": 32.471038818359375, "rewards/rejected": 2.164374351501465, "step": 3531 }, { "epoch": 1.8281573498964803, "grad_norm": 0.7690591812133789, "learning_rate": 3.6229711484735298e-06, "loss": 0.12165556102991104, "rewards/accuracies": 0.953125, "rewards/chosen": 31.05914306640625, "rewards/margins": 29.43341827392578, "rewards/rejected": 1.6213722229003906, "step": 3532 }, { "epoch": 1.8286749482401656, "grad_norm": 1.6367952823638916, "learning_rate": 3.620228818098147e-06, "loss": 0.1252313107252121, "rewards/accuracies": 0.9609375, "rewards/chosen": 28.9459171295166, "rewards/margins": 27.5435791015625, "rewards/rejected": 1.4037952423095703, "step": 3533 }, { "epoch": 1.829192546583851, "grad_norm": 0.9061165452003479, "learning_rate": 3.6174869369172237e-06, "loss": 0.1942528635263443, "rewards/accuracies": 0.90625, "rewards/chosen": 31.77219581604004, "rewards/margins": 29.627403259277344, "rewards/rejected": 2.1444263458251953, "step": 3534 }, { "epoch": 1.8297101449275361, "grad_norm": 1.056735873222351, "learning_rate": 3.614745505823397e-06, "loss": 0.13300833106040955, "rewards/accuracies": 0.921875, "rewards/chosen": 31.556245803833008, "rewards/margins": 30.184223175048828, "rewards/rejected": 1.3651599884033203, "step": 3535 }, { "epoch": 1.8302277432712215, "grad_norm": 0.8768357634544373, "learning_rate": 3.6120045257091585e-06, "loss": 0.16030870378017426, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.5704402923584, "rewards/margins": 28.108322143554688, "rewards/rejected": 1.4679317474365234, "step": 3536 }, { "epoch": 1.8307453416149069, "grad_norm": 1.863956332206726, "learning_rate": 3.6092639974668575e-06, "loss": 0.13060641288757324, "rewards/accuracies": 0.953125, "rewards/chosen": 31.862241744995117, "rewards/margins": 30.033348083496094, "rewards/rejected": 1.8225326538085938, "step": 3537 }, { "epoch": 1.831262939958592, "grad_norm": 1.3742802143096924, "learning_rate": 3.6065239219886907e-06, "loss": 0.13941726088523865, "rewards/accuracies": 0.9375, "rewards/chosen": 28.289417266845703, "rewards/margins": 27.305496215820312, "rewards/rejected": 0.9867286682128906, "step": 3538 }, { "epoch": 1.8317805383022774, "grad_norm": 0.7120242714881897, "learning_rate": 3.6037843001667096e-06, "loss": 0.1613524705171585, "rewards/accuracies": 0.890625, "rewards/chosen": 29.263029098510742, "rewards/margins": 27.704345703125, "rewards/rejected": 1.5628199577331543, "step": 3539 }, { "epoch": 1.8322981366459627, "grad_norm": 0.8886904120445251, "learning_rate": 3.6010451328928165e-06, "loss": 0.1487399935722351, "rewards/accuracies": 0.9375, "rewards/chosen": 31.185657501220703, "rewards/margins": 29.244430541992188, "rewards/rejected": 1.9338054656982422, "step": 3540 }, { "epoch": 1.832815734989648, "grad_norm": 0.4921579360961914, "learning_rate": 3.5983064210587703e-06, "loss": 0.09377336502075195, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.960121154785156, "rewards/margins": 30.76361083984375, "rewards/rejected": 2.1947596073150635, "step": 3541 }, { "epoch": 1.8333333333333335, "grad_norm": 0.7554442286491394, "learning_rate": 3.5955681655561758e-06, "loss": 0.16581127047538757, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.76093292236328, "rewards/margins": 29.495285034179688, "rewards/rejected": 2.2623634338378906, "step": 3542 }, { "epoch": 1.8338509316770186, "grad_norm": 3.045837879180908, "learning_rate": 3.5928303672764913e-06, "loss": 0.20253697037696838, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.374229431152344, "rewards/margins": 30.16478729248047, "rewards/rejected": 2.218812942504883, "step": 3543 }, { "epoch": 1.8343685300207038, "grad_norm": 1.6326714754104614, "learning_rate": 3.590093027111031e-06, "loss": 0.12901295721530914, "rewards/accuracies": 0.9453125, "rewards/chosen": 29.74453353881836, "rewards/margins": 28.278778076171875, "rewards/rejected": 1.4663333892822266, "step": 3544 }, { "epoch": 1.8348861283643894, "grad_norm": 1.1002211570739746, "learning_rate": 3.5873561459509506e-06, "loss": 0.16083434224128723, "rewards/accuracies": 0.9375, "rewards/chosen": 33.33634948730469, "rewards/margins": 31.384769439697266, "rewards/rejected": 1.956979751586914, "step": 3545 }, { "epoch": 1.8354037267080745, "grad_norm": 1.9205968379974365, "learning_rate": 3.584619724687264e-06, "loss": 0.1861743927001953, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.482582092285156, "rewards/margins": 25.461456298828125, "rewards/rejected": 1.024627685546875, "step": 3546 }, { "epoch": 1.8359213250517599, "grad_norm": 1.9257007837295532, "learning_rate": 3.581883764210834e-06, "loss": 0.2094782143831253, "rewards/accuracies": 0.90625, "rewards/chosen": 28.8411865234375, "rewards/margins": 26.910064697265625, "rewards/rejected": 1.9228897094726562, "step": 3547 }, { "epoch": 1.8364389233954452, "grad_norm": 0.5779696106910706, "learning_rate": 3.579148265412369e-06, "loss": 0.08205601572990417, "rewards/accuracies": 0.96875, "rewards/chosen": 33.64567565917969, "rewards/margins": 31.43695068359375, "rewards/rejected": 2.200784683227539, "step": 3548 }, { "epoch": 1.8369565217391304, "grad_norm": 0.9151245951652527, "learning_rate": 3.5764132291824316e-06, "loss": 0.16406528651714325, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.818201065063477, "rewards/margins": 28.864952087402344, "rewards/rejected": 1.9544899463653564, "step": 3549 }, { "epoch": 1.8374741200828157, "grad_norm": 1.3379746675491333, "learning_rate": 3.5736786564114355e-06, "loss": 0.201704204082489, "rewards/accuracies": 0.8671875, "rewards/chosen": 30.98746109008789, "rewards/margins": 28.133926391601562, "rewards/rejected": 2.8526363372802734, "step": 3550 }, { "epoch": 1.837991718426501, "grad_norm": 1.1845754384994507, "learning_rate": 3.5709445479896363e-06, "loss": 0.11096694320440292, "rewards/accuracies": 0.953125, "rewards/chosen": 35.13591003417969, "rewards/margins": 32.13470458984375, "rewards/rejected": 2.9941673278808594, "step": 3551 }, { "epoch": 1.8385093167701863, "grad_norm": 1.8353497982025146, "learning_rate": 3.5682109048071455e-06, "loss": 0.1349278837442398, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.797508239746094, "rewards/margins": 29.939781188964844, "rewards/rejected": 2.8501968383789062, "step": 3552 }, { "epoch": 1.8390269151138716, "grad_norm": 1.8844465017318726, "learning_rate": 3.565477727753919e-06, "loss": 0.12299161404371262, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.596336364746094, "rewards/margins": 30.30063247680664, "rewards/rejected": 2.298534393310547, "step": 3553 }, { "epoch": 1.839544513457557, "grad_norm": 0.7336382865905762, "learning_rate": 3.5627450177197636e-06, "loss": 0.1456339955329895, "rewards/accuracies": 0.9375, "rewards/chosen": 33.106117248535156, "rewards/margins": 30.577194213867188, "rewards/rejected": 2.5329513549804688, "step": 3554 }, { "epoch": 1.8400621118012421, "grad_norm": 1.363694667816162, "learning_rate": 3.5600127755943313e-06, "loss": 0.1963958591222763, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.01609420776367, "rewards/margins": 30.770057678222656, "rewards/rejected": 3.243804931640625, "step": 3555 }, { "epoch": 1.8405797101449275, "grad_norm": 2.6547017097473145, "learning_rate": 3.5572810022671234e-06, "loss": 0.2534794509410858, "rewards/accuracies": 0.8828125, "rewards/chosen": 27.602413177490234, "rewards/margins": 25.852935791015625, "rewards/rejected": 1.734994888305664, "step": 3556 }, { "epoch": 1.8410973084886129, "grad_norm": 1.1351896524429321, "learning_rate": 3.5545496986274914e-06, "loss": 0.11881465464830399, "rewards/accuracies": 0.9375, "rewards/chosen": 36.31237030029297, "rewards/margins": 33.0789794921875, "rewards/rejected": 3.234114408493042, "step": 3557 }, { "epoch": 1.841614906832298, "grad_norm": 0.9243746399879456, "learning_rate": 3.551818865564625e-06, "loss": 0.12845519185066223, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.002338409423828, "rewards/margins": 28.37354278564453, "rewards/rejected": 2.619504451751709, "step": 3558 }, { "epoch": 1.8421325051759836, "grad_norm": 0.766723096370697, "learning_rate": 3.5490885039675716e-06, "loss": 0.1526510864496231, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.784645080566406, "rewards/margins": 31.971717834472656, "rewards/rejected": 2.8199691772460938, "step": 3559 }, { "epoch": 1.8426501035196687, "grad_norm": 0.827928364276886, "learning_rate": 3.5463586147252195e-06, "loss": 0.08520446717739105, "rewards/accuracies": 0.96875, "rewards/chosen": 36.22611999511719, "rewards/margins": 33.72663879394531, "rewards/rejected": 2.4885470867156982, "step": 3560 }, { "epoch": 1.8431677018633539, "grad_norm": 1.468743085861206, "learning_rate": 3.5436291987263016e-06, "loss": 0.16860288381576538, "rewards/accuracies": 0.90625, "rewards/chosen": 34.2993049621582, "rewards/margins": 30.467239379882812, "rewards/rejected": 3.8270082473754883, "step": 3561 }, { "epoch": 1.8436853002070395, "grad_norm": 0.837400496006012, "learning_rate": 3.5409002568593987e-06, "loss": 0.14752259850502014, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.590301513671875, "rewards/margins": 29.328765869140625, "rewards/rejected": 2.266000747680664, "step": 3562 }, { "epoch": 1.8442028985507246, "grad_norm": 0.7768846154212952, "learning_rate": 3.5381717900129408e-06, "loss": 0.10170790553092957, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.904685974121094, "rewards/margins": 32.28806686401367, "rewards/rejected": 3.616138458251953, "step": 3563 }, { "epoch": 1.84472049689441, "grad_norm": 0.7183854579925537, "learning_rate": 3.535443799075199e-06, "loss": 0.10913874208927155, "rewards/accuracies": 0.96875, "rewards/chosen": 30.86590576171875, "rewards/margins": 28.661300659179688, "rewards/rejected": 2.1955642700195312, "step": 3564 }, { "epoch": 1.8452380952380953, "grad_norm": 0.938026487827301, "learning_rate": 3.5327162849342878e-06, "loss": 0.08085788786411285, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.809608459472656, "rewards/margins": 35.03205871582031, "rewards/rejected": 3.7799625396728516, "step": 3565 }, { "epoch": 1.8457556935817805, "grad_norm": 0.9768950343132019, "learning_rate": 3.5299892484781705e-06, "loss": 0.10687126219272614, "rewards/accuracies": 0.9609375, "rewards/chosen": 39.17835235595703, "rewards/margins": 34.44395446777344, "rewards/rejected": 4.731202125549316, "step": 3566 }, { "epoch": 1.8462732919254659, "grad_norm": 0.9371535778045654, "learning_rate": 3.5272626905946557e-06, "loss": 0.12977199256420135, "rewards/accuracies": 0.9296875, "rewards/chosen": 33.977813720703125, "rewards/margins": 31.08576202392578, "rewards/rejected": 2.8953380584716797, "step": 3567 }, { "epoch": 1.8467908902691512, "grad_norm": 0.7843624353408813, "learning_rate": 3.524536612171391e-06, "loss": 0.14010506868362427, "rewards/accuracies": 0.953125, "rewards/chosen": 34.166778564453125, "rewards/margins": 31.092025756835938, "rewards/rejected": 3.0852785110473633, "step": 3568 }, { "epoch": 1.8473084886128364, "grad_norm": 1.8323535919189453, "learning_rate": 3.5218110140958717e-06, "loss": 0.1330750286579132, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.70507049560547, "rewards/margins": 31.43008041381836, "rewards/rejected": 3.2694549560546875, "step": 3569 }, { "epoch": 1.8478260869565217, "grad_norm": 1.1311228275299072, "learning_rate": 3.5190858972554375e-06, "loss": 0.1356091946363449, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.04615783691406, "rewards/margins": 31.603164672851562, "rewards/rejected": 3.4355807304382324, "step": 3570 }, { "epoch": 1.848343685300207, "grad_norm": 0.8104742765426636, "learning_rate": 3.516361262537267e-06, "loss": 0.15116196870803833, "rewards/accuracies": 0.9375, "rewards/chosen": 31.457435607910156, "rewards/margins": 28.681594848632812, "rewards/rejected": 2.7781524658203125, "step": 3571 }, { "epoch": 1.8488612836438922, "grad_norm": 1.0065515041351318, "learning_rate": 3.5136371108283866e-06, "loss": 0.11002752184867859, "rewards/accuracies": 0.953125, "rewards/chosen": 40.471519470214844, "rewards/margins": 36.947540283203125, "rewards/rejected": 3.5172009468078613, "step": 3572 }, { "epoch": 1.8493788819875776, "grad_norm": 2.153533697128296, "learning_rate": 3.5109134430156645e-06, "loss": 0.1585131138563156, "rewards/accuracies": 0.921875, "rewards/chosen": 35.35736846923828, "rewards/margins": 31.858505249023438, "rewards/rejected": 3.505760669708252, "step": 3573 }, { "epoch": 1.849896480331263, "grad_norm": 1.4263460636138916, "learning_rate": 3.508190259985808e-06, "loss": 0.1884397566318512, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.410953521728516, "rewards/margins": 31.298599243164062, "rewards/rejected": 3.114058494567871, "step": 3574 }, { "epoch": 1.8504140786749481, "grad_norm": 1.8188457489013672, "learning_rate": 3.5054675626253687e-06, "loss": 0.1681041568517685, "rewards/accuracies": 0.9375, "rewards/chosen": 33.372108459472656, "rewards/margins": 30.63720703125, "rewards/rejected": 2.7392845153808594, "step": 3575 }, { "epoch": 1.8509316770186337, "grad_norm": 0.8601242899894714, "learning_rate": 3.5027453518207434e-06, "loss": 0.0860561653971672, "rewards/accuracies": 0.953125, "rewards/chosen": 38.273956298828125, "rewards/margins": 34.34246826171875, "rewards/rejected": 3.9173583984375, "step": 3576 }, { "epoch": 1.8514492753623188, "grad_norm": 4.942264080047607, "learning_rate": 3.500023628458166e-06, "loss": 0.18030893802642822, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.973052978515625, "rewards/margins": 33.474853515625, "rewards/rejected": 3.5026087760925293, "step": 3577 }, { "epoch": 1.851966873706004, "grad_norm": 1.4517322778701782, "learning_rate": 3.4973023934237126e-06, "loss": 0.1161859929561615, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.9849739074707, "rewards/margins": 31.718826293945312, "rewards/rejected": 3.259166717529297, "step": 3578 }, { "epoch": 1.8524844720496896, "grad_norm": 2.3226168155670166, "learning_rate": 3.4945816476032996e-06, "loss": 0.13029354810714722, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.54126739501953, "rewards/margins": 30.883821487426758, "rewards/rejected": 4.656047821044922, "step": 3579 }, { "epoch": 1.8530020703933747, "grad_norm": 3.373162269592285, "learning_rate": 3.4918613918826894e-06, "loss": 0.17969980835914612, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.16879653930664, "rewards/margins": 28.52923583984375, "rewards/rejected": 2.6345224380493164, "step": 3580 }, { "epoch": 1.85351966873706, "grad_norm": 1.2996059656143188, "learning_rate": 3.4891416271474774e-06, "loss": 0.1192466989159584, "rewards/accuracies": 0.953125, "rewards/chosen": 28.572101593017578, "rewards/margins": 26.626708984375, "rewards/rejected": 1.944936752319336, "step": 3581 }, { "epoch": 1.8540372670807455, "grad_norm": 0.8202539086341858, "learning_rate": 3.486422354283103e-06, "loss": 0.1325203776359558, "rewards/accuracies": 0.921875, "rewards/chosen": 30.335111618041992, "rewards/margins": 26.578628540039062, "rewards/rejected": 3.7621726989746094, "step": 3582 }, { "epoch": 1.8545548654244306, "grad_norm": 1.0838398933410645, "learning_rate": 3.483703574174848e-06, "loss": 0.15784306824207306, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.812868118286133, "rewards/margins": 27.338653564453125, "rewards/rejected": 3.4715538024902344, "step": 3583 }, { "epoch": 1.855072463768116, "grad_norm": 1.0659511089324951, "learning_rate": 3.480985287707826e-06, "loss": 0.09538425505161285, "rewards/accuracies": 0.953125, "rewards/chosen": 35.39308547973633, "rewards/margins": 30.974578857421875, "rewards/rejected": 4.4085235595703125, "step": 3584 }, { "epoch": 1.8555900621118013, "grad_norm": 1.0443003177642822, "learning_rate": 3.4782674957669982e-06, "loss": 0.09881944954395294, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.638357162475586, "rewards/margins": 26.743194580078125, "rewards/rejected": 3.894803524017334, "step": 3585 }, { "epoch": 1.8561076604554865, "grad_norm": 1.0882253646850586, "learning_rate": 3.47555019923716e-06, "loss": 0.09043832868337631, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.993194580078125, "rewards/margins": 32.89349365234375, "rewards/rejected": 3.1020379066467285, "step": 3586 }, { "epoch": 1.8566252587991718, "grad_norm": 1.1054660081863403, "learning_rate": 3.4728333990029504e-06, "loss": 0.13861194252967834, "rewards/accuracies": 0.953125, "rewards/chosen": 29.83441925048828, "rewards/margins": 26.714874267578125, "rewards/rejected": 3.1171722412109375, "step": 3587 }, { "epoch": 1.8571428571428572, "grad_norm": 1.9965040683746338, "learning_rate": 3.4701170959488377e-06, "loss": 0.1269717663526535, "rewards/accuracies": 0.953125, "rewards/chosen": 32.912841796875, "rewards/margins": 29.37200927734375, "rewards/rejected": 3.5405778884887695, "step": 3588 }, { "epoch": 1.8576604554865424, "grad_norm": 2.567570447921753, "learning_rate": 3.467401290959137e-06, "loss": 0.17089520394802094, "rewards/accuracies": 0.9375, "rewards/chosen": 32.23162841796875, "rewards/margins": 28.044570922851562, "rewards/rejected": 4.1819353103637695, "step": 3589 }, { "epoch": 1.8581780538302277, "grad_norm": 1.5878714323043823, "learning_rate": 3.464685984917999e-06, "loss": 0.16266074776649475, "rewards/accuracies": 0.921875, "rewards/chosen": 30.738128662109375, "rewards/margins": 27.366111755371094, "rewards/rejected": 3.3713388442993164, "step": 3590 }, { "epoch": 1.858695652173913, "grad_norm": 3.486677408218384, "learning_rate": 3.4619711787094103e-06, "loss": 0.2106146365404129, "rewards/accuracies": 0.90625, "rewards/chosen": 30.670656204223633, "rewards/margins": 27.14604949951172, "rewards/rejected": 3.5278210639953613, "step": 3591 }, { "epoch": 1.8592132505175982, "grad_norm": 1.3795727491378784, "learning_rate": 3.4592568732171926e-06, "loss": 0.14661505818367004, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.29756164550781, "rewards/margins": 31.2509765625, "rewards/rejected": 3.0499515533447266, "step": 3592 }, { "epoch": 1.8597308488612836, "grad_norm": 3.6584365367889404, "learning_rate": 3.456543069325013e-06, "loss": 0.2168194204568863, "rewards/accuracies": 0.890625, "rewards/chosen": 33.47399139404297, "rewards/margins": 29.15064239501953, "rewards/rejected": 4.323846817016602, "step": 3593 }, { "epoch": 1.860248447204969, "grad_norm": 1.5360184907913208, "learning_rate": 3.4538297679163664e-06, "loss": 0.15386545658111572, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.04132080078125, "rewards/margins": 27.765789031982422, "rewards/rejected": 3.2667369842529297, "step": 3594 }, { "epoch": 1.860766045548654, "grad_norm": 1.0044306516647339, "learning_rate": 3.4511169698745885e-06, "loss": 0.09555007517337799, "rewards/accuracies": 0.953125, "rewards/chosen": 30.43109893798828, "rewards/margins": 27.434524536132812, "rewards/rejected": 2.996304512023926, "step": 3595 }, { "epoch": 1.8612836438923397, "grad_norm": 1.2602300643920898, "learning_rate": 3.4484046760828503e-06, "loss": 0.16434533894062042, "rewards/accuracies": 0.9140625, "rewards/chosen": 28.015729904174805, "rewards/margins": 25.03075408935547, "rewards/rejected": 2.979236602783203, "step": 3596 }, { "epoch": 1.8618012422360248, "grad_norm": 1.9851648807525635, "learning_rate": 3.445692887424158e-06, "loss": 0.16156792640686035, "rewards/accuracies": 0.90625, "rewards/chosen": 28.427871704101562, "rewards/margins": 25.015892028808594, "rewards/rejected": 3.4116759300231934, "step": 3597 }, { "epoch": 1.8623188405797102, "grad_norm": 1.1312389373779297, "learning_rate": 3.4429816047813545e-06, "loss": 0.13105523586273193, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.524993896484375, "rewards/margins": 28.92718505859375, "rewards/rejected": 3.598785400390625, "step": 3598 }, { "epoch": 1.8628364389233956, "grad_norm": 1.4616718292236328, "learning_rate": 3.4402708290371185e-06, "loss": 0.1506873369216919, "rewards/accuracies": 0.9375, "rewards/chosen": 29.891578674316406, "rewards/margins": 26.40209197998047, "rewards/rejected": 3.488950729370117, "step": 3599 }, { "epoch": 1.8633540372670807, "grad_norm": 0.9992113709449768, "learning_rate": 3.4375605610739615e-06, "loss": 0.16136567294597626, "rewards/accuracies": 0.8984375, "rewards/chosen": 29.21339225769043, "rewards/margins": 25.84100341796875, "rewards/rejected": 3.3726019859313965, "step": 3600 }, { "epoch": 1.863871635610766, "grad_norm": 0.8011426329612732, "learning_rate": 3.43485080177423e-06, "loss": 0.11009912192821503, "rewards/accuracies": 0.953125, "rewards/chosen": 28.292377471923828, "rewards/margins": 25.449691772460938, "rewards/rejected": 2.8395471572875977, "step": 3601 }, { "epoch": 1.8643892339544514, "grad_norm": 1.5819164514541626, "learning_rate": 3.432141552020106e-06, "loss": 0.13278910517692566, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.561328887939453, "rewards/margins": 26.286102294921875, "rewards/rejected": 3.2793941497802734, "step": 3602 }, { "epoch": 1.8649068322981366, "grad_norm": 2.314851760864258, "learning_rate": 3.4294328126936083e-06, "loss": 0.23426644504070282, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.55998992919922, "rewards/margins": 27.38250732421875, "rewards/rejected": 4.175115585327148, "step": 3603 }, { "epoch": 1.865424430641822, "grad_norm": 1.1359480619430542, "learning_rate": 3.426724584676582e-06, "loss": 0.1278858482837677, "rewards/accuracies": 0.953125, "rewards/chosen": 27.438451766967773, "rewards/margins": 23.929885864257812, "rewards/rejected": 3.510110855102539, "step": 3604 }, { "epoch": 1.8659420289855073, "grad_norm": 1.3991590738296509, "learning_rate": 3.4240168688507116e-06, "loss": 0.14159977436065674, "rewards/accuracies": 0.9140625, "rewards/chosen": 26.60547637939453, "rewards/margins": 23.90972900390625, "rewards/rejected": 2.6930770874023438, "step": 3605 }, { "epoch": 1.8664596273291925, "grad_norm": 0.7618063688278198, "learning_rate": 3.4213096660975167e-06, "loss": 0.11129531264305115, "rewards/accuracies": 0.9375, "rewards/chosen": 35.143672943115234, "rewards/margins": 30.611968994140625, "rewards/rejected": 4.534886360168457, "step": 3606 }, { "epoch": 1.8669772256728778, "grad_norm": 1.3273180723190308, "learning_rate": 3.418602977298342e-06, "loss": 0.2461434304714203, "rewards/accuracies": 0.8671875, "rewards/chosen": 27.330245971679688, "rewards/margins": 22.80272674560547, "rewards/rejected": 4.526837348937988, "step": 3607 }, { "epoch": 1.8674948240165632, "grad_norm": 1.0004481077194214, "learning_rate": 3.415896803334373e-06, "loss": 0.11837559193372726, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.370765686035156, "rewards/margins": 23.455902099609375, "rewards/rejected": 2.9106101989746094, "step": 3608 }, { "epoch": 1.8680124223602483, "grad_norm": 0.7519561648368835, "learning_rate": 3.413191145086621e-06, "loss": 0.13939541578292847, "rewards/accuracies": 0.9140625, "rewards/chosen": 30.782394409179688, "rewards/margins": 27.062423706054688, "rewards/rejected": 3.7254791259765625, "step": 3609 }, { "epoch": 1.8685300207039337, "grad_norm": 1.7676373720169067, "learning_rate": 3.4104860034359365e-06, "loss": 0.1342867910861969, "rewards/accuracies": 0.953125, "rewards/chosen": 27.445106506347656, "rewards/margins": 24.012619018554688, "rewards/rejected": 3.4342479705810547, "step": 3610 }, { "epoch": 1.869047619047619, "grad_norm": 0.9698432683944702, "learning_rate": 3.4077813792629954e-06, "loss": 0.1079038754105568, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.48436737060547, "rewards/margins": 26.587905883789062, "rewards/rejected": 3.8986778259277344, "step": 3611 }, { "epoch": 1.8695652173913042, "grad_norm": 0.8363842368125916, "learning_rate": 3.4050772734483077e-06, "loss": 0.12287209928035736, "rewards/accuracies": 0.9453125, "rewards/chosen": 26.47875213623047, "rewards/margins": 23.058639526367188, "rewards/rejected": 3.4244961738586426, "step": 3612 }, { "epoch": 1.8700828157349898, "grad_norm": 1.3175288438796997, "learning_rate": 3.4023736868722155e-06, "loss": 0.1357003003358841, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.207828521728516, "rewards/margins": 24.123504638671875, "rewards/rejected": 4.082876682281494, "step": 3613 }, { "epoch": 1.870600414078675, "grad_norm": 1.0718997716903687, "learning_rate": 3.399670620414889e-06, "loss": 0.17297548055648804, "rewards/accuracies": 0.921875, "rewards/chosen": 22.79217529296875, "rewards/margins": 20.145675659179688, "rewards/rejected": 2.647550582885742, "step": 3614 }, { "epoch": 1.87111801242236, "grad_norm": 1.1629083156585693, "learning_rate": 3.3969680749563327e-06, "loss": 0.15158817172050476, "rewards/accuracies": 0.90625, "rewards/chosen": 29.707149505615234, "rewards/margins": 26.300884246826172, "rewards/rejected": 3.406416654586792, "step": 3615 }, { "epoch": 1.8716356107660457, "grad_norm": 0.7791795134544373, "learning_rate": 3.3942660513763804e-06, "loss": 0.1289486289024353, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.031002044677734, "rewards/margins": 26.583465576171875, "rewards/rejected": 3.4412269592285156, "step": 3616 }, { "epoch": 1.8721532091097308, "grad_norm": 0.8621843457221985, "learning_rate": 3.3915645505546933e-06, "loss": 0.1347942352294922, "rewards/accuracies": 0.9296875, "rewards/chosen": 25.884536743164062, "rewards/margins": 22.751358032226562, "rewards/rejected": 3.1319985389709473, "step": 3617 }, { "epoch": 1.8726708074534162, "grad_norm": 0.8972392678260803, "learning_rate": 3.3888635733707647e-06, "loss": 0.19767296314239502, "rewards/accuracies": 0.9140625, "rewards/chosen": 27.894195556640625, "rewards/margins": 24.117935180664062, "rewards/rejected": 3.777526378631592, "step": 3618 }, { "epoch": 1.8731884057971016, "grad_norm": 1.1650243997573853, "learning_rate": 3.38616312070392e-06, "loss": 0.13450437784194946, "rewards/accuracies": 0.9609375, "rewards/chosen": 26.5245361328125, "rewards/margins": 23.599288940429688, "rewards/rejected": 2.9262619018554688, "step": 3619 }, { "epoch": 1.8737060041407867, "grad_norm": 0.9333469271659851, "learning_rate": 3.3834631934333074e-06, "loss": 0.14047203958034515, "rewards/accuracies": 0.9453125, "rewards/chosen": 28.64090347290039, "rewards/margins": 25.23998260498047, "rewards/rejected": 3.3980064392089844, "step": 3620 }, { "epoch": 1.874223602484472, "grad_norm": 0.9233554005622864, "learning_rate": 3.3807637924379095e-06, "loss": 0.15111351013183594, "rewards/accuracies": 0.921875, "rewards/chosen": 27.434646606445312, "rewards/margins": 23.84305191040039, "rewards/rejected": 3.595050811767578, "step": 3621 }, { "epoch": 1.8747412008281574, "grad_norm": 0.7350742220878601, "learning_rate": 3.3780649185965343e-06, "loss": 0.1150529682636261, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.191158294677734, "rewards/margins": 26.6490478515625, "rewards/rejected": 3.533946990966797, "step": 3622 }, { "epoch": 1.8752587991718426, "grad_norm": 1.7989174127578735, "learning_rate": 3.375366572787824e-06, "loss": 0.1895160675048828, "rewards/accuracies": 0.9375, "rewards/chosen": 29.16946029663086, "rewards/margins": 24.689064025878906, "rewards/rejected": 4.474017143249512, "step": 3623 }, { "epoch": 1.875776397515528, "grad_norm": 0.7351971864700317, "learning_rate": 3.3726687558902394e-06, "loss": 0.12876248359680176, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.284793853759766, "rewards/margins": 27.534713745117188, "rewards/rejected": 3.7491188049316406, "step": 3624 }, { "epoch": 1.8762939958592133, "grad_norm": 2.9792375564575195, "learning_rate": 3.3699714687820763e-06, "loss": 0.17225846648216248, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.323089599609375, "rewards/margins": 27.81182861328125, "rewards/rejected": 4.521965026855469, "step": 3625 }, { "epoch": 1.8768115942028984, "grad_norm": 1.2265108823776245, "learning_rate": 3.367274712341457e-06, "loss": 0.16772040724754333, "rewards/accuracies": 0.90625, "rewards/chosen": 30.387718200683594, "rewards/margins": 26.57061004638672, "rewards/rejected": 3.815427780151367, "step": 3626 }, { "epoch": 1.8773291925465838, "grad_norm": 0.7738518714904785, "learning_rate": 3.3645784874463266e-06, "loss": 0.08865436166524887, "rewards/accuracies": 0.9765625, "rewards/chosen": 33.14672088623047, "rewards/margins": 28.755409240722656, "rewards/rejected": 4.398458480834961, "step": 3627 }, { "epoch": 1.8778467908902692, "grad_norm": 2.056729555130005, "learning_rate": 3.3618827949744644e-06, "loss": 0.17377564311027527, "rewards/accuracies": 0.9375, "rewards/chosen": 28.817378997802734, "rewards/margins": 25.789138793945312, "rewards/rejected": 3.0193843841552734, "step": 3628 }, { "epoch": 1.8783643892339543, "grad_norm": 1.0683472156524658, "learning_rate": 3.3591876358034714e-06, "loss": 0.15936392545700073, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.093910217285156, "rewards/margins": 29.889312744140625, "rewards/rejected": 5.198602676391602, "step": 3629 }, { "epoch": 1.87888198757764, "grad_norm": 0.551020622253418, "learning_rate": 3.356493010810775e-06, "loss": 0.07498284429311752, "rewards/accuracies": 0.96875, "rewards/chosen": 37.829429626464844, "rewards/margins": 32.25091552734375, "rewards/rejected": 5.579527854919434, "step": 3630 }, { "epoch": 1.879399585921325, "grad_norm": 1.028972864151001, "learning_rate": 3.35379892087363e-06, "loss": 0.16557490825653076, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.23326110839844, "rewards/margins": 30.27978515625, "rewards/rejected": 4.958381652832031, "step": 3631 }, { "epoch": 1.8799171842650102, "grad_norm": 1.5118062496185303, "learning_rate": 3.351105366869119e-06, "loss": 0.165683776140213, "rewards/accuracies": 0.921875, "rewards/chosen": 30.790966033935547, "rewards/margins": 25.909500122070312, "rewards/rejected": 4.884149551391602, "step": 3632 }, { "epoch": 1.8804347826086958, "grad_norm": 1.105589747428894, "learning_rate": 3.348412349674145e-06, "loss": 0.13805750012397766, "rewards/accuracies": 0.921875, "rewards/chosen": 31.22732925415039, "rewards/margins": 27.242965698242188, "rewards/rejected": 3.974522113800049, "step": 3633 }, { "epoch": 1.880952380952381, "grad_norm": 1.3092347383499146, "learning_rate": 3.345719870165441e-06, "loss": 0.1522415578365326, "rewards/accuracies": 0.921875, "rewards/chosen": 32.915000915527344, "rewards/margins": 27.68572235107422, "rewards/rejected": 5.22636604309082, "step": 3634 }, { "epoch": 1.8814699792960663, "grad_norm": 1.0718504190444946, "learning_rate": 3.343027929219561e-06, "loss": 0.1585076004266739, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.37625503540039, "rewards/margins": 29.286026000976562, "rewards/rejected": 5.096015930175781, "step": 3635 }, { "epoch": 1.8819875776397517, "grad_norm": 0.829586923122406, "learning_rate": 3.3403365277128897e-06, "loss": 0.15111902356147766, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.580188751220703, "rewards/margins": 27.064453125, "rewards/rejected": 4.510072708129883, "step": 3636 }, { "epoch": 1.8825051759834368, "grad_norm": 1.1978874206542969, "learning_rate": 3.337645666521628e-06, "loss": 0.16803616285324097, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.665863037109375, "rewards/margins": 28.39544677734375, "rewards/rejected": 6.273952484130859, "step": 3637 }, { "epoch": 1.8830227743271222, "grad_norm": 0.7491787075996399, "learning_rate": 3.334955346521808e-06, "loss": 0.1430794596672058, "rewards/accuracies": 0.921875, "rewards/chosen": 34.30216979980469, "rewards/margins": 29.5186767578125, "rewards/rejected": 4.779430389404297, "step": 3638 }, { "epoch": 1.8835403726708075, "grad_norm": 0.8118775486946106, "learning_rate": 3.332265568589283e-06, "loss": 0.1055920422077179, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.80754089355469, "rewards/margins": 28.3126220703125, "rewards/rejected": 5.4934282302856445, "step": 3639 }, { "epoch": 1.8840579710144927, "grad_norm": 0.9907692670822144, "learning_rate": 3.3295763335997255e-06, "loss": 0.1867678463459015, "rewards/accuracies": 0.90625, "rewards/chosen": 40.06605529785156, "rewards/margins": 32.82749938964844, "rewards/rejected": 7.229349136352539, "step": 3640 }, { "epoch": 1.884575569358178, "grad_norm": 1.020138144493103, "learning_rate": 3.32688764242864e-06, "loss": 0.14230045676231384, "rewards/accuracies": 0.953125, "rewards/chosen": 31.213050842285156, "rewards/margins": 27.07103729248047, "rewards/rejected": 4.146602630615234, "step": 3641 }, { "epoch": 1.8850931677018634, "grad_norm": 2.929556131362915, "learning_rate": 3.3241994959513487e-06, "loss": 0.13759534060955048, "rewards/accuracies": 0.9375, "rewards/chosen": 35.393638610839844, "rewards/margins": 29.637710571289062, "rewards/rejected": 5.757656097412109, "step": 3642 }, { "epoch": 1.8856107660455486, "grad_norm": 1.6433857679367065, "learning_rate": 3.321511895042994e-06, "loss": 0.14900363981723785, "rewards/accuracies": 0.921875, "rewards/chosen": 38.86703109741211, "rewards/margins": 32.447723388671875, "rewards/rejected": 6.421955108642578, "step": 3643 }, { "epoch": 1.886128364389234, "grad_norm": 0.7657766938209534, "learning_rate": 3.3188248405785457e-06, "loss": 0.10656879097223282, "rewards/accuracies": 0.953125, "rewards/chosen": 40.09964370727539, "rewards/margins": 33.686614990234375, "rewards/rejected": 6.423011779785156, "step": 3644 }, { "epoch": 1.8866459627329193, "grad_norm": 2.5666637420654297, "learning_rate": 3.3161383334327944e-06, "loss": 0.14878618717193604, "rewards/accuracies": 0.921875, "rewards/chosen": 34.75372314453125, "rewards/margins": 28.54840087890625, "rewards/rejected": 6.197142601013184, "step": 3645 }, { "epoch": 1.8871635610766044, "grad_norm": 1.6457678079605103, "learning_rate": 3.313452374480352e-06, "loss": 0.1726399064064026, "rewards/accuracies": 0.921875, "rewards/chosen": 38.61245346069336, "rewards/margins": 32.62376403808594, "rewards/rejected": 5.989287376403809, "step": 3646 }, { "epoch": 1.8876811594202898, "grad_norm": 1.0776299238204956, "learning_rate": 3.3107669645956502e-06, "loss": 0.1904943287372589, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.88256072998047, "rewards/margins": 29.9462890625, "rewards/rejected": 5.93369197845459, "step": 3647 }, { "epoch": 1.8881987577639752, "grad_norm": 0.8866724967956543, "learning_rate": 3.3080821046529434e-06, "loss": 0.10561701655387878, "rewards/accuracies": 0.953125, "rewards/chosen": 37.60626983642578, "rewards/margins": 32.067161560058594, "rewards/rejected": 5.534296989440918, "step": 3648 }, { "epoch": 1.8887163561076603, "grad_norm": 1.0827444791793823, "learning_rate": 3.30539779552631e-06, "loss": 0.12116098403930664, "rewards/accuracies": 0.9453125, "rewards/chosen": 34.95555114746094, "rewards/margins": 29.76861572265625, "rewards/rejected": 5.1873779296875, "step": 3649 }, { "epoch": 1.889233954451346, "grad_norm": 0.7947303652763367, "learning_rate": 3.3027140380896437e-06, "loss": 0.17968177795410156, "rewards/accuracies": 0.8828125, "rewards/chosen": 36.88402557373047, "rewards/margins": 30.164154052734375, "rewards/rejected": 6.7260894775390625, "step": 3650 }, { "epoch": 1.889751552795031, "grad_norm": 1.100715160369873, "learning_rate": 3.300030833216661e-06, "loss": 0.12555059790611267, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.887428283691406, "rewards/margins": 31.625308990478516, "rewards/rejected": 6.258052825927734, "step": 3651 }, { "epoch": 1.8902691511387164, "grad_norm": 1.0116000175476074, "learning_rate": 3.2973481817809004e-06, "loss": 0.1710990071296692, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.43108367919922, "rewards/margins": 32.65708923339844, "rewards/rejected": 5.778177261352539, "step": 3652 }, { "epoch": 1.8907867494824018, "grad_norm": 0.9920939803123474, "learning_rate": 3.294666084655716e-06, "loss": 0.13261882960796356, "rewards/accuracies": 0.9140625, "rewards/chosen": 42.81804656982422, "rewards/margins": 35.783607482910156, "rewards/rejected": 7.0347900390625, "step": 3653 }, { "epoch": 1.891304347826087, "grad_norm": 0.8219850659370422, "learning_rate": 3.291984542714286e-06, "loss": 0.1585700511932373, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.15768051147461, "rewards/margins": 29.656417846679688, "rewards/rejected": 5.49207878112793, "step": 3654 }, { "epoch": 1.8918219461697723, "grad_norm": 1.6703423261642456, "learning_rate": 3.2893035568296054e-06, "loss": 0.2064538300037384, "rewards/accuracies": 0.8828125, "rewards/chosen": 39.86871337890625, "rewards/margins": 34.48131561279297, "rewards/rejected": 5.374626159667969, "step": 3655 }, { "epoch": 1.8923395445134576, "grad_norm": 0.801059901714325, "learning_rate": 3.2866231278744876e-06, "loss": 0.1295090913772583, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.90839385986328, "rewards/margins": 33.60333251953125, "rewards/rejected": 6.302299499511719, "step": 3656 }, { "epoch": 1.8928571428571428, "grad_norm": 1.0243287086486816, "learning_rate": 3.283943256721564e-06, "loss": 0.15342368185520172, "rewards/accuracies": 0.9140625, "rewards/chosen": 40.69922637939453, "rewards/margins": 35.187164306640625, "rewards/rejected": 5.5215959548950195, "step": 3657 }, { "epoch": 1.8933747412008282, "grad_norm": 1.6219178438186646, "learning_rate": 3.2812639442432894e-06, "loss": 0.13515245914459229, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.38629913330078, "rewards/margins": 32.358184814453125, "rewards/rejected": 6.02730655670166, "step": 3658 }, { "epoch": 1.8938923395445135, "grad_norm": 0.8948614001274109, "learning_rate": 3.2785851913119326e-06, "loss": 0.1306169331073761, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.196537017822266, "rewards/margins": 34.591339111328125, "rewards/rejected": 5.6042022705078125, "step": 3659 }, { "epoch": 1.8944099378881987, "grad_norm": 1.156904697418213, "learning_rate": 3.2759069987995796e-06, "loss": 0.11072557419538498, "rewards/accuracies": 0.96875, "rewards/chosen": 35.617393493652344, "rewards/margins": 30.957855224609375, "rewards/rejected": 4.661827087402344, "step": 3660 }, { "epoch": 1.894927536231884, "grad_norm": 1.0455435514450073, "learning_rate": 3.2732293675781345e-06, "loss": 0.16815686225891113, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.543563842773438, "rewards/margins": 25.704341888427734, "rewards/rejected": 4.8505401611328125, "step": 3661 }, { "epoch": 1.8954451345755694, "grad_norm": 0.8815142512321472, "learning_rate": 3.270552298519323e-06, "loss": 0.12430371344089508, "rewards/accuracies": 0.9140625, "rewards/chosen": 39.044681549072266, "rewards/margins": 33.70398712158203, "rewards/rejected": 5.346200942993164, "step": 3662 }, { "epoch": 1.8959627329192545, "grad_norm": 0.8496831059455872, "learning_rate": 3.267875792494681e-06, "loss": 0.13618090748786926, "rewards/accuracies": 0.9453125, "rewards/chosen": 35.66075897216797, "rewards/margins": 31.373680114746094, "rewards/rejected": 4.291473388671875, "step": 3663 }, { "epoch": 1.89648033126294, "grad_norm": 0.9279394149780273, "learning_rate": 3.2651998503755657e-06, "loss": 0.18626049160957336, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.50315856933594, "rewards/margins": 31.251113891601562, "rewards/rejected": 4.248613357543945, "step": 3664 }, { "epoch": 1.8969979296066253, "grad_norm": 1.1986302137374878, "learning_rate": 3.2625244730331497e-06, "loss": 0.15838758647441864, "rewards/accuracies": 0.90625, "rewards/chosen": 35.47175216674805, "rewards/margins": 31.389129638671875, "rewards/rejected": 4.081897735595703, "step": 3665 }, { "epoch": 1.8975155279503104, "grad_norm": 1.0496543645858765, "learning_rate": 3.259849661338418e-06, "loss": 0.20357343554496765, "rewards/accuracies": 0.890625, "rewards/chosen": 29.06939697265625, "rewards/margins": 26.427698135375977, "rewards/rejected": 2.639446258544922, "step": 3666 }, { "epoch": 1.898033126293996, "grad_norm": 1.3687034845352173, "learning_rate": 3.2571754161621794e-06, "loss": 0.10440612584352493, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.507484436035156, "rewards/margins": 33.04252624511719, "rewards/rejected": 3.46816349029541, "step": 3667 }, { "epoch": 1.8985507246376812, "grad_norm": 0.8408504128456116, "learning_rate": 3.2545017383750523e-06, "loss": 0.08337980508804321, "rewards/accuracies": 0.984375, "rewards/chosen": 39.49664306640625, "rewards/margins": 34.33140563964844, "rewards/rejected": 5.164154052734375, "step": 3668 }, { "epoch": 1.8990683229813663, "grad_norm": 1.4905180931091309, "learning_rate": 3.251828628847471e-06, "loss": 0.13861185312271118, "rewards/accuracies": 0.90625, "rewards/chosen": 33.90033721923828, "rewards/margins": 29.76763916015625, "rewards/rejected": 4.125804901123047, "step": 3669 }, { "epoch": 1.8995859213250519, "grad_norm": 0.9135248064994812, "learning_rate": 3.249156088449685e-06, "loss": 0.09280893206596375, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.75649642944336, "rewards/margins": 34.24424743652344, "rewards/rejected": 4.513265132904053, "step": 3670 }, { "epoch": 1.900103519668737, "grad_norm": 0.6807959675788879, "learning_rate": 3.2464841180517604e-06, "loss": 0.1456458866596222, "rewards/accuracies": 0.90625, "rewards/chosen": 35.60517120361328, "rewards/margins": 31.033966064453125, "rewards/rejected": 4.579387664794922, "step": 3671 }, { "epoch": 1.9006211180124224, "grad_norm": 1.0248788595199585, "learning_rate": 3.243812718523577e-06, "loss": 0.16798032820224762, "rewards/accuracies": 0.921875, "rewards/chosen": 28.02151870727539, "rewards/margins": 25.722549438476562, "rewards/rejected": 2.2876739501953125, "step": 3672 }, { "epoch": 1.9011387163561078, "grad_norm": 0.9200320243835449, "learning_rate": 3.241141890734827e-06, "loss": 0.13240474462509155, "rewards/accuracies": 0.9375, "rewards/chosen": 36.9007568359375, "rewards/margins": 32.354278564453125, "rewards/rejected": 4.554039001464844, "step": 3673 }, { "epoch": 1.901656314699793, "grad_norm": 1.3816652297973633, "learning_rate": 3.238471635555016e-06, "loss": 0.1660163700580597, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.518775939941406, "rewards/margins": 29.44073486328125, "rewards/rejected": 5.080596923828125, "step": 3674 }, { "epoch": 1.9021739130434783, "grad_norm": 1.2834904193878174, "learning_rate": 3.235801953853469e-06, "loss": 0.14247310161590576, "rewards/accuracies": 0.921875, "rewards/chosen": 39.86695098876953, "rewards/margins": 35.07614517211914, "rewards/rejected": 4.778480529785156, "step": 3675 }, { "epoch": 1.9026915113871636, "grad_norm": 1.6632661819458008, "learning_rate": 3.2331328464993165e-06, "loss": 0.19386324286460876, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.116180419921875, "rewards/margins": 32.041259765625, "rewards/rejected": 5.068122863769531, "step": 3676 }, { "epoch": 1.9032091097308488, "grad_norm": 1.1847175359725952, "learning_rate": 3.230464314361508e-06, "loss": 0.17056305706501007, "rewards/accuracies": 0.90625, "rewards/chosen": 35.957786560058594, "rewards/margins": 31.75848388671875, "rewards/rejected": 4.1841230392456055, "step": 3677 }, { "epoch": 1.9037267080745341, "grad_norm": 1.4523080587387085, "learning_rate": 3.2277963583088023e-06, "loss": 0.14375433325767517, "rewards/accuracies": 0.9375, "rewards/chosen": 33.856422424316406, "rewards/margins": 30.206619262695312, "rewards/rejected": 3.6535024642944336, "step": 3678 }, { "epoch": 1.9042443064182195, "grad_norm": 2.723205089569092, "learning_rate": 3.2251289792097694e-06, "loss": 0.12075777351856232, "rewards/accuracies": 0.921875, "rewards/chosen": 43.70134735107422, "rewards/margins": 38.232452392578125, "rewards/rejected": 5.460775375366211, "step": 3679 }, { "epoch": 1.9047619047619047, "grad_norm": 1.0193965435028076, "learning_rate": 3.222462177932798e-06, "loss": 0.12407030165195465, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.379878997802734, "rewards/margins": 32.748695373535156, "rewards/rejected": 5.629085540771484, "step": 3680 }, { "epoch": 1.90527950310559, "grad_norm": 1.1035778522491455, "learning_rate": 3.2197959553460827e-06, "loss": 0.11303359270095825, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.6556396484375, "rewards/margins": 33.92980194091797, "rewards/rejected": 4.732871055603027, "step": 3681 }, { "epoch": 1.9057971014492754, "grad_norm": 1.4425941705703735, "learning_rate": 3.217130312317632e-06, "loss": 0.13844048976898193, "rewards/accuracies": 0.8984375, "rewards/chosen": 38.90248107910156, "rewards/margins": 34.236907958984375, "rewards/rejected": 4.659599304199219, "step": 3682 }, { "epoch": 1.9063146997929605, "grad_norm": 1.8493269681930542, "learning_rate": 3.2144652497152633e-06, "loss": 0.2503182888031006, "rewards/accuracies": 0.875, "rewards/chosen": 37.358394622802734, "rewards/margins": 33.32999038696289, "rewards/rejected": 4.040275573730469, "step": 3683 }, { "epoch": 1.9068322981366461, "grad_norm": 1.2595818042755127, "learning_rate": 3.211800768406609e-06, "loss": 0.13888885080814362, "rewards/accuracies": 0.921875, "rewards/chosen": 37.46968460083008, "rewards/margins": 33.13740539550781, "rewards/rejected": 4.3374810218811035, "step": 3684 }, { "epoch": 1.9073498964803313, "grad_norm": 0.8572182059288025, "learning_rate": 3.209136869259111e-06, "loss": 0.08695253729820251, "rewards/accuracies": 0.953125, "rewards/chosen": 43.41215515136719, "rewards/margins": 38.43115234375, "rewards/rejected": 4.992877960205078, "step": 3685 }, { "epoch": 1.9078674948240164, "grad_norm": 1.7415114641189575, "learning_rate": 3.2064735531400177e-06, "loss": 0.13680505752563477, "rewards/accuracies": 0.9375, "rewards/chosen": 32.26513671875, "rewards/margins": 29.03789520263672, "rewards/rejected": 3.229656219482422, "step": 3686 }, { "epoch": 1.908385093167702, "grad_norm": 1.3829246759414673, "learning_rate": 3.2038108209163914e-06, "loss": 0.13944576680660248, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.59002685546875, "rewards/margins": 34.40178680419922, "rewards/rejected": 4.187742710113525, "step": 3687 }, { "epoch": 1.9089026915113871, "grad_norm": 0.9009994864463806, "learning_rate": 3.201148673455108e-06, "loss": 0.11604934930801392, "rewards/accuracies": 0.953125, "rewards/chosen": 32.798641204833984, "rewards/margins": 28.623573303222656, "rewards/rejected": 4.166933059692383, "step": 3688 }, { "epoch": 1.9094202898550725, "grad_norm": 1.3297984600067139, "learning_rate": 3.198487111622843e-06, "loss": 0.16387009620666504, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.36671447753906, "rewards/margins": 30.731792449951172, "rewards/rejected": 3.6357269287109375, "step": 3689 }, { "epoch": 1.9099378881987579, "grad_norm": 0.7089162468910217, "learning_rate": 3.1958261362860916e-06, "loss": 0.07410440593957901, "rewards/accuracies": 0.9921875, "rewards/chosen": 35.50604248046875, "rewards/margins": 31.925872802734375, "rewards/rejected": 3.58217716217041, "step": 3690 }, { "epoch": 1.910455486542443, "grad_norm": 1.1722079515457153, "learning_rate": 3.19316574831115e-06, "loss": 0.12260222434997559, "rewards/accuracies": 0.921875, "rewards/chosen": 37.793243408203125, "rewards/margins": 33.00660705566406, "rewards/rejected": 4.78251314163208, "step": 3691 }, { "epoch": 1.9109730848861284, "grad_norm": 1.7443962097167969, "learning_rate": 3.19050594856413e-06, "loss": 0.16371843218803406, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.26435470581055, "rewards/margins": 31.513015747070312, "rewards/rejected": 3.7565994262695312, "step": 3692 }, { "epoch": 1.9114906832298137, "grad_norm": 2.2547168731689453, "learning_rate": 3.187846737910947e-06, "loss": 0.17788070440292358, "rewards/accuracies": 0.90625, "rewards/chosen": 39.15534973144531, "rewards/margins": 33.855709075927734, "rewards/rejected": 5.294609069824219, "step": 3693 }, { "epoch": 1.912008281573499, "grad_norm": 0.8266968131065369, "learning_rate": 3.185188117217325e-06, "loss": 0.11469919979572296, "rewards/accuracies": 0.953125, "rewards/chosen": 34.96448516845703, "rewards/margins": 29.52741241455078, "rewards/rejected": 5.431361198425293, "step": 3694 }, { "epoch": 1.9125258799171843, "grad_norm": 2.0712296962738037, "learning_rate": 3.1825300873487997e-06, "loss": 0.1355063021183014, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.551025390625, "rewards/margins": 34.503013610839844, "rewards/rejected": 5.046579360961914, "step": 3695 }, { "epoch": 1.9130434782608696, "grad_norm": 2.0669870376586914, "learning_rate": 3.179872649170709e-06, "loss": 0.13092976808547974, "rewards/accuracies": 0.921875, "rewards/chosen": 38.405784606933594, "rewards/margins": 33.9637451171875, "rewards/rejected": 4.448856353759766, "step": 3696 }, { "epoch": 1.9135610766045548, "grad_norm": 2.190480947494507, "learning_rate": 3.1772158035482028e-06, "loss": 0.1663593053817749, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.581581115722656, "rewards/margins": 30.840606689453125, "rewards/rejected": 4.753345489501953, "step": 3697 }, { "epoch": 1.9140786749482401, "grad_norm": 0.9877079725265503, "learning_rate": 3.174559551346238e-06, "loss": 0.10360995680093765, "rewards/accuracies": 0.9375, "rewards/chosen": 39.242347717285156, "rewards/margins": 33.92506408691406, "rewards/rejected": 5.313721656799316, "step": 3698 }, { "epoch": 1.9145962732919255, "grad_norm": 0.7650741338729858, "learning_rate": 3.1719038934295733e-06, "loss": 0.14933031797409058, "rewards/accuracies": 0.9375, "rewards/chosen": 37.22648239135742, "rewards/margins": 32.355743408203125, "rewards/rejected": 4.87123441696167, "step": 3699 }, { "epoch": 1.9151138716356106, "grad_norm": 0.5351163744926453, "learning_rate": 3.169248830662778e-06, "loss": 0.09401489794254303, "rewards/accuracies": 0.96875, "rewards/chosen": 39.202186584472656, "rewards/margins": 34.144927978515625, "rewards/rejected": 5.059602737426758, "step": 3700 }, { "epoch": 1.9156314699792962, "grad_norm": 1.4148263931274414, "learning_rate": 3.166594363910229e-06, "loss": 0.1606214940547943, "rewards/accuracies": 0.921875, "rewards/chosen": 43.17263412475586, "rewards/margins": 37.398048400878906, "rewards/rejected": 5.7678680419921875, "step": 3701 }, { "epoch": 1.9161490683229814, "grad_norm": 1.1149743795394897, "learning_rate": 3.1639404940361052e-06, "loss": 0.1350449025630951, "rewards/accuracies": 0.9140625, "rewards/chosen": 44.56146240234375, "rewards/margins": 38.19180679321289, "rewards/rejected": 6.381360054016113, "step": 3702 }, { "epoch": 1.9166666666666665, "grad_norm": 0.9910213947296143, "learning_rate": 3.1612872219043943e-06, "loss": 0.13820907473564148, "rewards/accuracies": 0.953125, "rewards/chosen": 38.40562438964844, "rewards/margins": 32.64300537109375, "rewards/rejected": 5.772640228271484, "step": 3703 }, { "epoch": 1.917184265010352, "grad_norm": 1.5308669805526733, "learning_rate": 3.158634548378886e-06, "loss": 0.14514383673667908, "rewards/accuracies": 0.9296875, "rewards/chosen": 45.14915084838867, "rewards/margins": 38.37457275390625, "rewards/rejected": 6.780248641967773, "step": 3704 }, { "epoch": 1.9177018633540373, "grad_norm": 0.7192935347557068, "learning_rate": 3.1559824743231804e-06, "loss": 0.0822700783610344, "rewards/accuracies": 0.953125, "rewards/chosen": 44.727210998535156, "rewards/margins": 38.684844970703125, "rewards/rejected": 6.033054351806641, "step": 3705 }, { "epoch": 1.9182194616977226, "grad_norm": 1.5635786056518555, "learning_rate": 3.153331000600678e-06, "loss": 0.12906025350093842, "rewards/accuracies": 0.9375, "rewards/chosen": 43.09571075439453, "rewards/margins": 36.755210876464844, "rewards/rejected": 6.3318328857421875, "step": 3706 }, { "epoch": 1.918737060041408, "grad_norm": 0.9140008687973022, "learning_rate": 3.1506801280745835e-06, "loss": 0.1093241423368454, "rewards/accuracies": 0.9375, "rewards/chosen": 44.08230209350586, "rewards/margins": 38.101287841796875, "rewards/rejected": 5.990001678466797, "step": 3707 }, { "epoch": 1.9192546583850931, "grad_norm": 0.9920974969863892, "learning_rate": 3.148029857607911e-06, "loss": 0.15776658058166504, "rewards/accuracies": 0.9375, "rewards/chosen": 43.272544860839844, "rewards/margins": 36.472347259521484, "rewards/rejected": 6.783577919006348, "step": 3708 }, { "epoch": 1.9197722567287785, "grad_norm": 0.85445237159729, "learning_rate": 3.1453801900634706e-06, "loss": 0.1527414619922638, "rewards/accuracies": 0.921875, "rewards/chosen": 37.91859436035156, "rewards/margins": 33.15330505371094, "rewards/rejected": 4.762828826904297, "step": 3709 }, { "epoch": 1.9202898550724639, "grad_norm": 3.318847417831421, "learning_rate": 3.142731126303885e-06, "loss": 0.16086842119693756, "rewards/accuracies": 0.9296875, "rewards/chosen": 44.33299255371094, "rewards/margins": 37.21934509277344, "rewards/rejected": 7.119574546813965, "step": 3710 }, { "epoch": 1.920807453416149, "grad_norm": 2.083648204803467, "learning_rate": 3.1400826671915753e-06, "loss": 0.14544682204723358, "rewards/accuracies": 0.9296875, "rewards/chosen": 38.717552185058594, "rewards/margins": 32.555580139160156, "rewards/rejected": 6.171435356140137, "step": 3711 }, { "epoch": 1.9213250517598344, "grad_norm": 1.6784292459487915, "learning_rate": 3.1374348135887635e-06, "loss": 0.18029645085334778, "rewards/accuracies": 0.8984375, "rewards/chosen": 41.322784423828125, "rewards/margins": 35.01678466796875, "rewards/rejected": 6.315071105957031, "step": 3712 }, { "epoch": 1.9218426501035197, "grad_norm": 0.7896307110786438, "learning_rate": 3.134787566357479e-06, "loss": 0.09644867479801178, "rewards/accuracies": 0.9609375, "rewards/chosen": 40.372802734375, "rewards/margins": 35.408294677734375, "rewards/rejected": 4.960740089416504, "step": 3713 }, { "epoch": 1.9223602484472049, "grad_norm": 0.9474044442176819, "learning_rate": 3.1321409263595537e-06, "loss": 0.14031848311424255, "rewards/accuracies": 0.9375, "rewards/chosen": 43.62882614135742, "rewards/margins": 36.285400390625, "rewards/rejected": 7.355447769165039, "step": 3714 }, { "epoch": 1.9228778467908902, "grad_norm": 14.26228141784668, "learning_rate": 3.1294948944566195e-06, "loss": 0.2076939195394516, "rewards/accuracies": 0.9140625, "rewards/chosen": 43.60185241699219, "rewards/margins": 36.15087890625, "rewards/rejected": 7.461271286010742, "step": 3715 }, { "epoch": 1.9233954451345756, "grad_norm": 1.1524471044540405, "learning_rate": 3.1268494715101106e-06, "loss": 0.16397380828857422, "rewards/accuracies": 0.9140625, "rewards/chosen": 39.37355041503906, "rewards/margins": 33.11137390136719, "rewards/rejected": 6.25013542175293, "step": 3716 }, { "epoch": 1.9239130434782608, "grad_norm": 1.3761264085769653, "learning_rate": 3.1242046583812625e-06, "loss": 0.1351756900548935, "rewards/accuracies": 0.9296875, "rewards/chosen": 42.01460266113281, "rewards/margins": 35.34294128417969, "rewards/rejected": 6.691873550415039, "step": 3717 }, { "epoch": 1.9244306418219461, "grad_norm": 1.484177589416504, "learning_rate": 3.1215604559311168e-06, "loss": 0.12087448686361313, "rewards/accuracies": 0.921875, "rewards/chosen": 40.48371124267578, "rewards/margins": 33.82585144042969, "rewards/rejected": 6.656303405761719, "step": 3718 }, { "epoch": 1.9249482401656315, "grad_norm": 1.2199935913085938, "learning_rate": 3.118916865020509e-06, "loss": 0.12033569812774658, "rewards/accuracies": 0.953125, "rewards/chosen": 38.07734680175781, "rewards/margins": 33.313690185546875, "rewards/rejected": 4.761592864990234, "step": 3719 }, { "epoch": 1.9254658385093166, "grad_norm": 0.5737310647964478, "learning_rate": 3.1162738865100816e-06, "loss": 0.06004190072417259, "rewards/accuracies": 0.9765625, "rewards/chosen": 47.3492431640625, "rewards/margins": 40.25132751464844, "rewards/rejected": 7.100218772888184, "step": 3720 }, { "epoch": 1.9259834368530022, "grad_norm": 2.719052314758301, "learning_rate": 3.113631521260274e-06, "loss": 0.17692197859287262, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.85240173339844, "rewards/margins": 30.887908935546875, "rewards/rejected": 5.966526031494141, "step": 3721 }, { "epoch": 1.9265010351966874, "grad_norm": 2.7015058994293213, "learning_rate": 3.1109897701313264e-06, "loss": 0.15710270404815674, "rewards/accuracies": 0.90625, "rewards/chosen": 41.83790588378906, "rewards/margins": 35.386688232421875, "rewards/rejected": 6.4518585205078125, "step": 3722 }, { "epoch": 1.9270186335403725, "grad_norm": 1.0336467027664185, "learning_rate": 3.1083486339832812e-06, "loss": 0.10096888989210129, "rewards/accuracies": 0.921875, "rewards/chosen": 43.59081268310547, "rewards/margins": 37.01824188232422, "rewards/rejected": 6.579647064208984, "step": 3723 }, { "epoch": 1.927536231884058, "grad_norm": 3.384424924850464, "learning_rate": 3.1057081136759813e-06, "loss": 0.17637518048286438, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.972076416015625, "rewards/margins": 32.69660949707031, "rewards/rejected": 6.272964000701904, "step": 3724 }, { "epoch": 1.9280538302277432, "grad_norm": 0.8858316540718079, "learning_rate": 3.1030682100690633e-06, "loss": 0.14947494864463806, "rewards/accuracies": 0.921875, "rewards/chosen": 41.76732635498047, "rewards/margins": 34.13568115234375, "rewards/rejected": 7.630695343017578, "step": 3725 }, { "epoch": 1.9285714285714286, "grad_norm": 1.8139437437057495, "learning_rate": 3.1004289240219677e-06, "loss": 0.12838837504386902, "rewards/accuracies": 0.953125, "rewards/chosen": 39.930213928222656, "rewards/margins": 34.04511260986328, "rewards/rejected": 5.884969234466553, "step": 3726 }, { "epoch": 1.929089026915114, "grad_norm": 1.513822317123413, "learning_rate": 3.0977902563939347e-06, "loss": 0.16414865851402283, "rewards/accuracies": 0.921875, "rewards/chosen": 40.360618591308594, "rewards/margins": 31.78253936767578, "rewards/rejected": 8.574630737304688, "step": 3727 }, { "epoch": 1.9296066252587991, "grad_norm": 1.1162934303283691, "learning_rate": 3.095152208044002e-06, "loss": 0.1296418011188507, "rewards/accuracies": 0.9375, "rewards/chosen": 38.17549133300781, "rewards/margins": 32.93504333496094, "rewards/rejected": 5.224910736083984, "step": 3728 }, { "epoch": 1.9301242236024845, "grad_norm": 1.6290473937988281, "learning_rate": 3.092514779831004e-06, "loss": 0.10125795006752014, "rewards/accuracies": 0.953125, "rewards/chosen": 40.66217041015625, "rewards/margins": 33.111236572265625, "rewards/rejected": 7.546720504760742, "step": 3729 }, { "epoch": 1.9306418219461698, "grad_norm": 2.2463927268981934, "learning_rate": 3.0898779726135724e-06, "loss": 0.15274491906166077, "rewards/accuracies": 0.9375, "rewards/chosen": 39.33106231689453, "rewards/margins": 33.545562744140625, "rewards/rejected": 5.784449577331543, "step": 3730 }, { "epoch": 1.931159420289855, "grad_norm": 0.8451343774795532, "learning_rate": 3.0872417872501436e-06, "loss": 0.0853293314576149, "rewards/accuracies": 0.96875, "rewards/chosen": 37.04383850097656, "rewards/margins": 31.47528076171875, "rewards/rejected": 5.57073974609375, "step": 3731 }, { "epoch": 1.9316770186335404, "grad_norm": 1.3667277097702026, "learning_rate": 3.0846062245989427e-06, "loss": 0.17234548926353455, "rewards/accuracies": 0.9296875, "rewards/chosen": 36.01387023925781, "rewards/margins": 29.62286376953125, "rewards/rejected": 6.391674041748047, "step": 3732 }, { "epoch": 1.9321946169772257, "grad_norm": 2.262768268585205, "learning_rate": 3.0819712855179986e-06, "loss": 0.24341940879821777, "rewards/accuracies": 0.9140625, "rewards/chosen": 38.280479431152344, "rewards/margins": 31.537704467773438, "rewards/rejected": 6.735324859619141, "step": 3733 }, { "epoch": 1.9327122153209109, "grad_norm": 1.9495779275894165, "learning_rate": 3.079336970865133e-06, "loss": 0.18941959738731384, "rewards/accuracies": 0.9375, "rewards/chosen": 38.41994094848633, "rewards/margins": 31.388229370117188, "rewards/rejected": 7.037965774536133, "step": 3734 }, { "epoch": 1.9332298136645962, "grad_norm": 1.1689125299453735, "learning_rate": 3.0767032814979656e-06, "loss": 0.18314433097839355, "rewards/accuracies": 0.9375, "rewards/chosen": 37.97107696533203, "rewards/margins": 32.87461853027344, "rewards/rejected": 5.103118896484375, "step": 3735 }, { "epoch": 1.9337474120082816, "grad_norm": 1.237006425857544, "learning_rate": 3.0740702182739157e-06, "loss": 0.17453864216804504, "rewards/accuracies": 0.9375, "rewards/chosen": 35.96347427368164, "rewards/margins": 29.409652709960938, "rewards/rejected": 6.546073913574219, "step": 3736 }, { "epoch": 1.9342650103519667, "grad_norm": 1.6332333087921143, "learning_rate": 3.071437782050195e-06, "loss": 0.2287566214799881, "rewards/accuracies": 0.9296875, "rewards/chosen": 42.68395233154297, "rewards/margins": 35.30976867675781, "rewards/rejected": 7.381500244140625, "step": 3737 }, { "epoch": 1.9347826086956523, "grad_norm": 1.605789065361023, "learning_rate": 3.0688059736838107e-06, "loss": 0.13767224550247192, "rewards/accuracies": 0.953125, "rewards/chosen": 38.356937408447266, "rewards/margins": 31.97027587890625, "rewards/rejected": 6.380977630615234, "step": 3738 }, { "epoch": 1.9353002070393375, "grad_norm": 1.3441879749298096, "learning_rate": 3.066174794031566e-06, "loss": 0.07612499594688416, "rewards/accuracies": 0.96875, "rewards/chosen": 41.43547058105469, "rewards/margins": 34.99462890625, "rewards/rejected": 6.4424285888671875, "step": 3739 }, { "epoch": 1.9358178053830226, "grad_norm": 1.1741434335708618, "learning_rate": 3.063544243950064e-06, "loss": 0.1302797496318817, "rewards/accuracies": 0.921875, "rewards/chosen": 37.46898651123047, "rewards/margins": 32.028900146484375, "rewards/rejected": 5.433097839355469, "step": 3740 }, { "epoch": 1.9363354037267082, "grad_norm": 1.379888653755188, "learning_rate": 3.0609143242956984e-06, "loss": 0.16253024339675903, "rewards/accuracies": 0.921875, "rewards/chosen": 32.98657989501953, "rewards/margins": 27.700912475585938, "rewards/rejected": 5.278953552246094, "step": 3741 }, { "epoch": 1.9368530020703933, "grad_norm": 0.6739371418952942, "learning_rate": 3.0582850359246564e-06, "loss": 0.11113407462835312, "rewards/accuracies": 0.9375, "rewards/chosen": 41.299293518066406, "rewards/margins": 35.30809020996094, "rewards/rejected": 5.996273040771484, "step": 3742 }, { "epoch": 1.9373706004140787, "grad_norm": 1.2000508308410645, "learning_rate": 3.0556563796929217e-06, "loss": 0.09675769507884979, "rewards/accuracies": 0.9765625, "rewards/chosen": 43.56183624267578, "rewards/margins": 36.217926025390625, "rewards/rejected": 7.336158752441406, "step": 3743 }, { "epoch": 1.937888198757764, "grad_norm": 0.6848348379135132, "learning_rate": 3.053028356456277e-06, "loss": 0.11618462204933167, "rewards/accuracies": 0.953125, "rewards/chosen": 35.20245361328125, "rewards/margins": 29.778106689453125, "rewards/rejected": 5.427734375, "step": 3744 }, { "epoch": 1.9384057971014492, "grad_norm": 2.105396032333374, "learning_rate": 3.050400967070287e-06, "loss": 0.23402252793312073, "rewards/accuracies": 0.9296875, "rewards/chosen": 40.38737106323242, "rewards/margins": 34.521026611328125, "rewards/rejected": 5.867889404296875, "step": 3745 }, { "epoch": 1.9389233954451346, "grad_norm": 1.7981750965118408, "learning_rate": 3.0477742123903224e-06, "loss": 0.20071126520633698, "rewards/accuracies": 0.921875, "rewards/chosen": 36.858306884765625, "rewards/margins": 32.064735412597656, "rewards/rejected": 4.804817199707031, "step": 3746 }, { "epoch": 1.93944099378882, "grad_norm": 1.3080172538757324, "learning_rate": 3.045148093271542e-06, "loss": 0.12588781118392944, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.445228576660156, "rewards/margins": 32.74269104003906, "rewards/rejected": 4.698024749755859, "step": 3747 }, { "epoch": 1.939958592132505, "grad_norm": 1.5521950721740723, "learning_rate": 3.042522610568894e-06, "loss": 0.1923268586397171, "rewards/accuracies": 0.8984375, "rewards/chosen": 31.46883773803711, "rewards/margins": 27.411163330078125, "rewards/rejected": 4.063873767852783, "step": 3748 }, { "epoch": 1.9404761904761905, "grad_norm": 1.1356232166290283, "learning_rate": 3.0398977651371276e-06, "loss": 0.14537781476974487, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.17411422729492, "rewards/margins": 27.373374938964844, "rewards/rejected": 4.8067755699157715, "step": 3749 }, { "epoch": 1.9409937888198758, "grad_norm": 1.1767312288284302, "learning_rate": 3.037273557830778e-06, "loss": 0.1637938767671585, "rewards/accuracies": 0.8984375, "rewards/chosen": 32.55833435058594, "rewards/margins": 27.397735595703125, "rewards/rejected": 5.168615341186523, "step": 3750 }, { "epoch": 1.941511387163561, "grad_norm": 1.1617382764816284, "learning_rate": 3.0346499895041766e-06, "loss": 0.18404202163219452, "rewards/accuracies": 0.921875, "rewards/chosen": 28.617542266845703, "rewards/margins": 24.0653076171875, "rewards/rejected": 4.543906211853027, "step": 3751 }, { "epoch": 1.9420289855072463, "grad_norm": 1.357268214225769, "learning_rate": 3.032027061011441e-06, "loss": 0.19565922021865845, "rewards/accuracies": 0.921875, "rewards/chosen": 29.037948608398438, "rewards/margins": 23.93286895751953, "rewards/rejected": 5.106632232666016, "step": 3752 }, { "epoch": 1.9425465838509317, "grad_norm": 1.2824015617370605, "learning_rate": 3.029404773206488e-06, "loss": 0.16720999777317047, "rewards/accuracies": 0.890625, "rewards/chosen": 32.04151916503906, "rewards/margins": 27.183181762695312, "rewards/rejected": 4.860889434814453, "step": 3753 }, { "epoch": 1.9430641821946169, "grad_norm": 1.095118522644043, "learning_rate": 3.0267831269430225e-06, "loss": 0.20008620619773865, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.753984451293945, "rewards/margins": 25.418289184570312, "rewards/rejected": 4.340642929077148, "step": 3754 }, { "epoch": 1.9435817805383024, "grad_norm": 1.0267747640609741, "learning_rate": 3.024162123074538e-06, "loss": 0.14209923148155212, "rewards/accuracies": 0.9609375, "rewards/chosen": 31.849365234375, "rewards/margins": 26.836822509765625, "rewards/rejected": 5.008247375488281, "step": 3755 }, { "epoch": 1.9440993788819876, "grad_norm": 1.4909194707870483, "learning_rate": 3.021541762454322e-06, "loss": 0.16135311126708984, "rewards/accuracies": 0.9453125, "rewards/chosen": 30.836332321166992, "rewards/margins": 26.591964721679688, "rewards/rejected": 4.2349853515625, "step": 3756 }, { "epoch": 1.9446169772256727, "grad_norm": 1.0614279508590698, "learning_rate": 3.0189220459354545e-06, "loss": 0.19635358452796936, "rewards/accuracies": 0.921875, "rewards/chosen": 31.96963119506836, "rewards/margins": 26.715805053710938, "rewards/rejected": 5.254055500030518, "step": 3757 }, { "epoch": 1.9451345755693583, "grad_norm": 1.1150873899459839, "learning_rate": 3.0163029743707982e-06, "loss": 0.20688104629516602, "rewards/accuracies": 0.921875, "rewards/chosen": 25.152328491210938, "rewards/margins": 21.43115234375, "rewards/rejected": 3.722980499267578, "step": 3758 }, { "epoch": 1.9456521739130435, "grad_norm": 0.9700758457183838, "learning_rate": 3.013684548613014e-06, "loss": 0.13946083188056946, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.561004638671875, "rewards/margins": 24.868392944335938, "rewards/rejected": 4.688013076782227, "step": 3759 }, { "epoch": 1.9461697722567288, "grad_norm": 1.7395857572555542, "learning_rate": 3.01106676951455e-06, "loss": 0.19184677302837372, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.23362731933594, "rewards/margins": 27.77911376953125, "rewards/rejected": 4.4582061767578125, "step": 3760 }, { "epoch": 1.9466873706004142, "grad_norm": 1.411056399345398, "learning_rate": 3.0084496379276394e-06, "loss": 0.1568385511636734, "rewards/accuracies": 0.9375, "rewards/chosen": 26.374958038330078, "rewards/margins": 22.174148559570312, "rewards/rejected": 4.207511901855469, "step": 3761 }, { "epoch": 1.9472049689440993, "grad_norm": 1.7927944660186768, "learning_rate": 3.005833154704312e-06, "loss": 0.13537055253982544, "rewards/accuracies": 0.921875, "rewards/chosen": 26.86089515686035, "rewards/margins": 23.7081298828125, "rewards/rejected": 3.144481658935547, "step": 3762 }, { "epoch": 1.9477225672877847, "grad_norm": 0.9280999302864075, "learning_rate": 3.003217320696381e-06, "loss": 0.1805427074432373, "rewards/accuracies": 0.9140625, "rewards/chosen": 29.80807113647461, "rewards/margins": 24.992610931396484, "rewards/rejected": 4.8177947998046875, "step": 3763 }, { "epoch": 1.94824016563147, "grad_norm": 0.8859320878982544, "learning_rate": 3.0006021367554516e-06, "loss": 0.1373979151248932, "rewards/accuracies": 0.9375, "rewards/chosen": 29.27252960205078, "rewards/margins": 24.304519653320312, "rewards/rejected": 4.967487335205078, "step": 3764 }, { "epoch": 1.9487577639751552, "grad_norm": 0.712495744228363, "learning_rate": 2.997987603732912e-06, "loss": 0.11614199727773666, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.386011123657227, "rewards/margins": 25.027986526489258, "rewards/rejected": 5.351638317108154, "step": 3765 }, { "epoch": 1.9492753623188406, "grad_norm": 0.9585431814193726, "learning_rate": 2.995373722479946e-06, "loss": 0.19035270810127258, "rewards/accuracies": 0.8984375, "rewards/chosen": 28.562225341796875, "rewards/margins": 23.739151000976562, "rewards/rejected": 4.821086883544922, "step": 3766 }, { "epoch": 1.949792960662526, "grad_norm": 0.851960301399231, "learning_rate": 2.9927604938475214e-06, "loss": 0.16473974287509918, "rewards/accuracies": 0.9296875, "rewards/chosen": 26.858219146728516, "rewards/margins": 22.811668395996094, "rewards/rejected": 4.047039031982422, "step": 3767 }, { "epoch": 1.950310559006211, "grad_norm": 1.1290345191955566, "learning_rate": 2.9901479186863914e-06, "loss": 0.13237303495407104, "rewards/accuracies": 0.9375, "rewards/chosen": 29.57352066040039, "rewards/margins": 24.65869140625, "rewards/rejected": 4.914898872375488, "step": 3768 }, { "epoch": 1.9508281573498965, "grad_norm": 0.7558345198631287, "learning_rate": 2.9875359978470992e-06, "loss": 0.1160474345088005, "rewards/accuracies": 0.9453125, "rewards/chosen": 24.83185386657715, "rewards/margins": 21.18140411376953, "rewards/rejected": 3.6476545333862305, "step": 3769 }, { "epoch": 1.9513457556935818, "grad_norm": 0.7940961122512817, "learning_rate": 2.984924732179978e-06, "loss": 0.12792308628559113, "rewards/accuracies": 0.921875, "rewards/chosen": 28.93250274658203, "rewards/margins": 24.683242797851562, "rewards/rejected": 4.249629974365234, "step": 3770 }, { "epoch": 1.951863354037267, "grad_norm": 1.0386137962341309, "learning_rate": 2.982314122535138e-06, "loss": 0.10699906200170517, "rewards/accuracies": 0.953125, "rewards/chosen": 30.874229431152344, "rewards/margins": 26.3253173828125, "rewards/rejected": 4.547582626342773, "step": 3771 }, { "epoch": 1.9523809523809523, "grad_norm": 0.5946998000144958, "learning_rate": 2.979704169762486e-06, "loss": 0.10260625928640366, "rewards/accuracies": 0.9375, "rewards/chosen": 31.36671257019043, "rewards/margins": 26.510696411132812, "rewards/rejected": 4.860727310180664, "step": 3772 }, { "epoch": 1.9528985507246377, "grad_norm": 0.7564528584480286, "learning_rate": 2.97709487471171e-06, "loss": 0.14308494329452515, "rewards/accuracies": 0.9296875, "rewards/chosen": 30.92461395263672, "rewards/margins": 25.594863891601562, "rewards/rejected": 5.331484794616699, "step": 3773 }, { "epoch": 1.9534161490683228, "grad_norm": 1.0993949174880981, "learning_rate": 2.974486238232287e-06, "loss": 0.16050168871879578, "rewards/accuracies": 0.8984375, "rewards/chosen": 23.177413940429688, "rewards/margins": 19.34423828125, "rewards/rejected": 3.8333606719970703, "step": 3774 }, { "epoch": 1.9539337474120084, "grad_norm": 0.7643247842788696, "learning_rate": 2.9718782611734733e-06, "loss": 0.14583775401115417, "rewards/accuracies": 0.90625, "rewards/chosen": 29.956356048583984, "rewards/margins": 24.01702117919922, "rewards/rejected": 5.940742492675781, "step": 3775 }, { "epoch": 1.9544513457556936, "grad_norm": 2.008013963699341, "learning_rate": 2.9692709443843176e-06, "loss": 0.1821734458208084, "rewards/accuracies": 0.8984375, "rewards/chosen": 27.315067291259766, "rewards/margins": 22.500015258789062, "rewards/rejected": 4.818458557128906, "step": 3776 }, { "epoch": 1.954968944099379, "grad_norm": 0.9806235432624817, "learning_rate": 2.966664288713651e-06, "loss": 0.12893857061862946, "rewards/accuracies": 0.9609375, "rewards/chosen": 30.396377563476562, "rewards/margins": 25.07574462890625, "rewards/rejected": 5.323036193847656, "step": 3777 }, { "epoch": 1.9554865424430643, "grad_norm": 1.2916462421417236, "learning_rate": 2.964058295010085e-06, "loss": 0.15880292654037476, "rewards/accuracies": 0.8984375, "rewards/chosen": 30.677719116210938, "rewards/margins": 24.784103393554688, "rewards/rejected": 5.892829895019531, "step": 3778 }, { "epoch": 1.9560041407867494, "grad_norm": 0.6918794512748718, "learning_rate": 2.961452964122025e-06, "loss": 0.13231311738491058, "rewards/accuracies": 0.953125, "rewards/chosen": 32.85285186767578, "rewards/margins": 26.67095947265625, "rewards/rejected": 6.191089630126953, "step": 3779 }, { "epoch": 1.9565217391304348, "grad_norm": 1.1738101243972778, "learning_rate": 2.9588482968976523e-06, "loss": 0.1702381670475006, "rewards/accuracies": 0.921875, "rewards/chosen": 30.947660446166992, "rewards/margins": 24.446075439453125, "rewards/rejected": 6.505104064941406, "step": 3780 }, { "epoch": 1.9570393374741202, "grad_norm": 1.2633543014526367, "learning_rate": 2.956244294184936e-06, "loss": 0.1401127129793167, "rewards/accuracies": 0.9296875, "rewards/chosen": 28.295520782470703, "rewards/margins": 23.75738525390625, "rewards/rejected": 4.542842864990234, "step": 3781 }, { "epoch": 1.9575569358178053, "grad_norm": 0.9224025011062622, "learning_rate": 2.953640956831625e-06, "loss": 0.13714690506458282, "rewards/accuracies": 0.9453125, "rewards/chosen": 31.53137969970703, "rewards/margins": 25.26306915283203, "rewards/rejected": 6.267974853515625, "step": 3782 }, { "epoch": 1.9580745341614907, "grad_norm": 0.8953583836555481, "learning_rate": 2.951038285685261e-06, "loss": 0.11521682143211365, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.113643646240234, "rewards/margins": 25.827362060546875, "rewards/rejected": 6.286266326904297, "step": 3783 }, { "epoch": 1.958592132505176, "grad_norm": 1.7880734205245972, "learning_rate": 2.9484362815931554e-06, "loss": 0.15260766446590424, "rewards/accuracies": 0.921875, "rewards/chosen": 33.969669342041016, "rewards/margins": 27.40087890625, "rewards/rejected": 6.568511962890625, "step": 3784 }, { "epoch": 1.9591097308488612, "grad_norm": 1.7493497133255005, "learning_rate": 2.9458349454024138e-06, "loss": 0.1321205496788025, "rewards/accuracies": 0.953125, "rewards/chosen": 32.48766326904297, "rewards/margins": 26.507400512695312, "rewards/rejected": 5.978855133056641, "step": 3785 }, { "epoch": 1.9596273291925466, "grad_norm": 1.6519877910614014, "learning_rate": 2.9432342779599164e-06, "loss": 0.14288471639156342, "rewards/accuracies": 0.90625, "rewards/chosen": 36.97080993652344, "rewards/margins": 29.220458984375, "rewards/rejected": 7.753612518310547, "step": 3786 }, { "epoch": 1.960144927536232, "grad_norm": 1.8095979690551758, "learning_rate": 2.9406342801123357e-06, "loss": 0.18098561465740204, "rewards/accuracies": 0.8984375, "rewards/chosen": 35.923133850097656, "rewards/margins": 29.376556396484375, "rewards/rejected": 6.550983428955078, "step": 3787 }, { "epoch": 1.960662525879917, "grad_norm": 1.7078325748443604, "learning_rate": 2.9380349527061123e-06, "loss": 0.14768244326114655, "rewards/accuracies": 0.90625, "rewards/chosen": 34.44151306152344, "rewards/margins": 28.613555908203125, "rewards/rejected": 5.822608947753906, "step": 3788 }, { "epoch": 1.9611801242236024, "grad_norm": 1.042021632194519, "learning_rate": 2.93543629658748e-06, "loss": 0.17934978008270264, "rewards/accuracies": 0.9375, "rewards/chosen": 33.51905822753906, "rewards/margins": 26.318588256835938, "rewards/rejected": 7.209283828735352, "step": 3789 }, { "epoch": 1.9616977225672878, "grad_norm": 0.9023581147193909, "learning_rate": 2.9328383126024507e-06, "loss": 0.14987778663635254, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.78038787841797, "rewards/margins": 24.555816650390625, "rewards/rejected": 5.222952365875244, "step": 3790 }, { "epoch": 1.962215320910973, "grad_norm": 0.7748648524284363, "learning_rate": 2.9302410015968125e-06, "loss": 0.08493957668542862, "rewards/accuracies": 0.96875, "rewards/chosen": 34.20156478881836, "rewards/margins": 27.324722290039062, "rewards/rejected": 6.875038146972656, "step": 3791 }, { "epoch": 1.9627329192546585, "grad_norm": 1.5043392181396484, "learning_rate": 2.9276443644161436e-06, "loss": 0.08961233496665955, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.930763244628906, "rewards/margins": 28.514549255371094, "rewards/rejected": 7.409284591674805, "step": 3792 }, { "epoch": 1.9632505175983437, "grad_norm": 0.9403648376464844, "learning_rate": 2.9250484019057955e-06, "loss": 0.10875816643238068, "rewards/accuracies": 0.9375, "rewards/chosen": 34.002159118652344, "rewards/margins": 28.018753051757812, "rewards/rejected": 5.98725700378418, "step": 3793 }, { "epoch": 1.9637681159420288, "grad_norm": 1.7632865905761719, "learning_rate": 2.922453114910903e-06, "loss": 0.16945791244506836, "rewards/accuracies": 0.8828125, "rewards/chosen": 32.20818328857422, "rewards/margins": 25.878204345703125, "rewards/rejected": 6.338128566741943, "step": 3794 }, { "epoch": 1.9642857142857144, "grad_norm": 1.0033328533172607, "learning_rate": 2.9198585042763792e-06, "loss": 0.15653096139431, "rewards/accuracies": 0.9375, "rewards/chosen": 32.44464111328125, "rewards/margins": 26.005081176757812, "rewards/rejected": 6.445978164672852, "step": 3795 }, { "epoch": 1.9648033126293996, "grad_norm": 0.9791389107704163, "learning_rate": 2.9172645708469216e-06, "loss": 0.1553126871585846, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.3044319152832, "rewards/margins": 26.17534637451172, "rewards/rejected": 6.125117301940918, "step": 3796 }, { "epoch": 1.965320910973085, "grad_norm": 1.269128441810608, "learning_rate": 2.914671315467001e-06, "loss": 0.11661940068006516, "rewards/accuracies": 0.9375, "rewards/chosen": 35.29505920410156, "rewards/margins": 27.652446746826172, "rewards/rejected": 7.648382186889648, "step": 3797 }, { "epoch": 1.9658385093167703, "grad_norm": 1.3012831211090088, "learning_rate": 2.9120787389808716e-06, "loss": 0.11770883202552795, "rewards/accuracies": 0.953125, "rewards/chosen": 35.99016571044922, "rewards/margins": 29.36395263671875, "rewards/rejected": 6.631071090698242, "step": 3798 }, { "epoch": 1.9663561076604554, "grad_norm": 1.9623818397521973, "learning_rate": 2.909486842232564e-06, "loss": 0.160779669880867, "rewards/accuracies": 0.9140625, "rewards/chosen": 34.546470642089844, "rewards/margins": 26.285018920898438, "rewards/rejected": 8.268147468566895, "step": 3799 }, { "epoch": 1.9668737060041408, "grad_norm": 2.6410117149353027, "learning_rate": 2.9068956260658904e-06, "loss": 0.22499027848243713, "rewards/accuracies": 0.875, "rewards/chosen": 28.887189865112305, "rewards/margins": 23.492691040039062, "rewards/rejected": 5.3953857421875, "step": 3800 }, { "epoch": 1.9673913043478262, "grad_norm": 1.6461979150772095, "learning_rate": 2.9043050913244397e-06, "loss": 0.1620086431503296, "rewards/accuracies": 0.90625, "rewards/chosen": 32.51079559326172, "rewards/margins": 26.18653106689453, "rewards/rejected": 6.331505298614502, "step": 3801 }, { "epoch": 1.9679089026915113, "grad_norm": 0.9648314714431763, "learning_rate": 2.9017152388515787e-06, "loss": 0.1326318383216858, "rewards/accuracies": 0.9296875, "rewards/chosen": 31.20396614074707, "rewards/margins": 25.525794982910156, "rewards/rejected": 5.677396774291992, "step": 3802 }, { "epoch": 1.9684265010351967, "grad_norm": 1.4318944215774536, "learning_rate": 2.8991260694904523e-06, "loss": 0.13347309827804565, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.945343017578125, "rewards/margins": 27.512710571289062, "rewards/rejected": 7.4317474365234375, "step": 3803 }, { "epoch": 1.968944099378882, "grad_norm": 1.049210786819458, "learning_rate": 2.8965375840839843e-06, "loss": 0.1523820459842682, "rewards/accuracies": 0.9140625, "rewards/chosen": 35.15907287597656, "rewards/margins": 28.00836181640625, "rewards/rejected": 7.151556968688965, "step": 3804 }, { "epoch": 1.9694616977225672, "grad_norm": 1.0359902381896973, "learning_rate": 2.8939497834748744e-06, "loss": 0.06823009252548218, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.92101287841797, "rewards/margins": 31.43408203125, "rewards/rejected": 7.481697082519531, "step": 3805 }, { "epoch": 1.9699792960662525, "grad_norm": 1.9162628650665283, "learning_rate": 2.891362668505601e-06, "loss": 0.11812859773635864, "rewards/accuracies": 0.9375, "rewards/chosen": 37.292259216308594, "rewards/margins": 29.669166564941406, "rewards/rejected": 7.618191242218018, "step": 3806 }, { "epoch": 1.970496894409938, "grad_norm": 1.3795766830444336, "learning_rate": 2.8887762400184128e-06, "loss": 0.16102907061576843, "rewards/accuracies": 0.9296875, "rewards/chosen": 39.59635925292969, "rewards/margins": 32.325355529785156, "rewards/rejected": 7.265180587768555, "step": 3807 }, { "epoch": 1.971014492753623, "grad_norm": 0.5732273459434509, "learning_rate": 2.8861904988553458e-06, "loss": 0.10059545934200287, "rewards/accuracies": 0.953125, "rewards/chosen": 37.42510986328125, "rewards/margins": 30.16533660888672, "rewards/rejected": 7.248294830322266, "step": 3808 }, { "epoch": 1.9715320910973086, "grad_norm": 1.0247846841812134, "learning_rate": 2.8836054458582053e-06, "loss": 0.1084122434258461, "rewards/accuracies": 0.953125, "rewards/chosen": 33.15125274658203, "rewards/margins": 26.405929565429688, "rewards/rejected": 6.738943099975586, "step": 3809 }, { "epoch": 1.9720496894409938, "grad_norm": 2.0229156017303467, "learning_rate": 2.881021081868575e-06, "loss": 0.1592113971710205, "rewards/accuracies": 0.921875, "rewards/chosen": 33.69819641113281, "rewards/margins": 28.099700927734375, "rewards/rejected": 5.591838836669922, "step": 3810 }, { "epoch": 1.972567287784679, "grad_norm": 3.261390209197998, "learning_rate": 2.8784374077278077e-06, "loss": 0.21828846633434296, "rewards/accuracies": 0.8828125, "rewards/chosen": 30.9451904296875, "rewards/margins": 25.181884765625, "rewards/rejected": 5.7647247314453125, "step": 3811 }, { "epoch": 1.9730848861283645, "grad_norm": 1.90110445022583, "learning_rate": 2.875854424277044e-06, "loss": 0.12502658367156982, "rewards/accuracies": 0.9375, "rewards/chosen": 37.12580871582031, "rewards/margins": 30.09539031982422, "rewards/rejected": 7.029212951660156, "step": 3812 }, { "epoch": 1.9736024844720497, "grad_norm": 0.5934797525405884, "learning_rate": 2.8732721323571915e-06, "loss": 0.07384863495826721, "rewards/accuracies": 0.9765625, "rewards/chosen": 35.815155029296875, "rewards/margins": 29.544944763183594, "rewards/rejected": 6.285392761230469, "step": 3813 }, { "epoch": 1.974120082815735, "grad_norm": 0.8282322883605957, "learning_rate": 2.8706905328089307e-06, "loss": 0.1145201176404953, "rewards/accuracies": 0.921875, "rewards/chosen": 34.783843994140625, "rewards/margins": 27.363082885742188, "rewards/rejected": 7.415639877319336, "step": 3814 }, { "epoch": 1.9746376811594204, "grad_norm": 1.370377779006958, "learning_rate": 2.8681096264727194e-06, "loss": 0.14639876782894135, "rewards/accuracies": 0.9296875, "rewards/chosen": 29.356075286865234, "rewards/margins": 23.559127807617188, "rewards/rejected": 5.7907633781433105, "step": 3815 }, { "epoch": 1.9751552795031055, "grad_norm": 1.789651870727539, "learning_rate": 2.865529414188797e-06, "loss": 0.16220906376838684, "rewards/accuracies": 0.921875, "rewards/chosen": 37.02043533325195, "rewards/margins": 29.28173828125, "rewards/rejected": 7.732433319091797, "step": 3816 }, { "epoch": 1.975672877846791, "grad_norm": 2.170646905899048, "learning_rate": 2.8629498967971638e-06, "loss": 0.12167835980653763, "rewards/accuracies": 0.9765625, "rewards/chosen": 36.73316192626953, "rewards/margins": 29.978271484375, "rewards/rejected": 6.7608642578125, "step": 3817 }, { "epoch": 1.9761904761904763, "grad_norm": 0.8808158040046692, "learning_rate": 2.8603710751376017e-06, "loss": 0.12201353907585144, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.42420196533203, "rewards/margins": 30.439117431640625, "rewards/rejected": 6.97819709777832, "step": 3818 }, { "epoch": 1.9767080745341614, "grad_norm": 0.8003095388412476, "learning_rate": 2.8577929500496637e-06, "loss": 0.10155417025089264, "rewards/accuracies": 0.9375, "rewards/chosen": 34.09653091430664, "rewards/margins": 28.25665283203125, "rewards/rejected": 5.844902038574219, "step": 3819 }, { "epoch": 1.9772256728778468, "grad_norm": 2.24839448928833, "learning_rate": 2.8552155223726826e-06, "loss": 0.14724640548229218, "rewards/accuracies": 0.9375, "rewards/chosen": 32.53497314453125, "rewards/margins": 25.692489624023438, "rewards/rejected": 6.835798740386963, "step": 3820 }, { "epoch": 1.9777432712215322, "grad_norm": 1.5319408178329468, "learning_rate": 2.8526387929457522e-06, "loss": 0.22151993215084076, "rewards/accuracies": 0.9296875, "rewards/chosen": 32.27708435058594, "rewards/margins": 24.988418579101562, "rewards/rejected": 7.291217803955078, "step": 3821 }, { "epoch": 1.9782608695652173, "grad_norm": 0.9357215166091919, "learning_rate": 2.850062762607748e-06, "loss": 0.08769935369491577, "rewards/accuracies": 0.96875, "rewards/chosen": 37.00269317626953, "rewards/margins": 29.684982299804688, "rewards/rejected": 7.3138885498046875, "step": 3822 }, { "epoch": 1.9787784679089027, "grad_norm": 3.6948065757751465, "learning_rate": 2.847487432197315e-06, "loss": 0.1387205421924591, "rewards/accuracies": 0.921875, "rewards/chosen": 34.62354278564453, "rewards/margins": 28.17462158203125, "rewards/rejected": 6.44813346862793, "step": 3823 }, { "epoch": 1.979296066252588, "grad_norm": 0.7978171110153198, "learning_rate": 2.8449128025528704e-06, "loss": 0.11599452793598175, "rewards/accuracies": 0.921875, "rewards/chosen": 32.0098876953125, "rewards/margins": 25.194732666015625, "rewards/rejected": 6.8198699951171875, "step": 3824 }, { "epoch": 1.9798136645962732, "grad_norm": 1.4140304327011108, "learning_rate": 2.8423388745126037e-06, "loss": 0.09732039272785187, "rewards/accuracies": 0.9609375, "rewards/chosen": 35.98088073730469, "rewards/margins": 29.4591064453125, "rewards/rejected": 6.528217315673828, "step": 3825 }, { "epoch": 1.9803312629399588, "grad_norm": 2.302144765853882, "learning_rate": 2.839765648914475e-06, "loss": 0.1980016678571701, "rewards/accuracies": 0.9140625, "rewards/chosen": 31.554359436035156, "rewards/margins": 26.351806640625, "rewards/rejected": 5.205722808837891, "step": 3826 }, { "epoch": 1.980848861283644, "grad_norm": 1.0411168336868286, "learning_rate": 2.8371931265962184e-06, "loss": 0.11417485773563385, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.652069091796875, "rewards/margins": 29.64324951171875, "rewards/rejected": 7.015874862670898, "step": 3827 }, { "epoch": 1.981366459627329, "grad_norm": 0.7103300094604492, "learning_rate": 2.834621308395335e-06, "loss": 0.12436296790838242, "rewards/accuracies": 0.9375, "rewards/chosen": 34.080543518066406, "rewards/margins": 27.472129821777344, "rewards/rejected": 6.611017227172852, "step": 3828 }, { "epoch": 1.9818840579710146, "grad_norm": 1.9021368026733398, "learning_rate": 2.8320501951490996e-06, "loss": 0.13173246383666992, "rewards/accuracies": 0.9609375, "rewards/chosen": 34.01013946533203, "rewards/margins": 27.408432006835938, "rewards/rejected": 6.604637145996094, "step": 3829 }, { "epoch": 1.9824016563146998, "grad_norm": 1.2448415756225586, "learning_rate": 2.8294797876945567e-06, "loss": 0.20977550745010376, "rewards/accuracies": 0.90625, "rewards/chosen": 29.39240264892578, "rewards/margins": 23.123779296875, "rewards/rejected": 6.26945686340332, "step": 3830 }, { "epoch": 1.9829192546583851, "grad_norm": 0.9449001550674438, "learning_rate": 2.826910086868521e-06, "loss": 0.12440574169158936, "rewards/accuracies": 0.9453125, "rewards/chosen": 32.47297668457031, "rewards/margins": 27.254119873046875, "rewards/rejected": 5.224510192871094, "step": 3831 }, { "epoch": 1.9834368530020705, "grad_norm": 1.3820617198944092, "learning_rate": 2.8243410935075765e-06, "loss": 0.15401843190193176, "rewards/accuracies": 0.921875, "rewards/chosen": 36.39848327636719, "rewards/margins": 28.840301513671875, "rewards/rejected": 7.557044982910156, "step": 3832 }, { "epoch": 1.9839544513457557, "grad_norm": 1.0987212657928467, "learning_rate": 2.8217728084480777e-06, "loss": 0.18014171719551086, "rewards/accuracies": 0.890625, "rewards/chosen": 33.543312072753906, "rewards/margins": 27.816354751586914, "rewards/rejected": 5.731647968292236, "step": 3833 }, { "epoch": 1.984472049689441, "grad_norm": 1.4595277309417725, "learning_rate": 2.8192052325261497e-06, "loss": 0.11753907799720764, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.298824310302734, "rewards/margins": 30.068401336669922, "rewards/rejected": 7.224295616149902, "step": 3834 }, { "epoch": 1.9849896480331264, "grad_norm": 0.9501888155937195, "learning_rate": 2.8166383665776836e-06, "loss": 0.1352929025888443, "rewards/accuracies": 0.921875, "rewards/chosen": 31.650630950927734, "rewards/margins": 26.186843872070312, "rewards/rejected": 5.4620513916015625, "step": 3835 }, { "epoch": 1.9855072463768115, "grad_norm": 1.5684173107147217, "learning_rate": 2.814072211438344e-06, "loss": 0.15041294693946838, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.32091522216797, "rewards/margins": 31.063339233398438, "rewards/rejected": 6.2588582038879395, "step": 3836 }, { "epoch": 1.986024844720497, "grad_norm": 1.6602932214736938, "learning_rate": 2.8115067679435547e-06, "loss": 0.15616971254348755, "rewards/accuracies": 0.90625, "rewards/chosen": 32.766475677490234, "rewards/margins": 26.678543090820312, "rewards/rejected": 6.085172653198242, "step": 3837 }, { "epoch": 1.9865424430641823, "grad_norm": 0.7301107048988342, "learning_rate": 2.8089420369285213e-06, "loss": 0.1079646572470665, "rewards/accuracies": 0.9609375, "rewards/chosen": 32.74565887451172, "rewards/margins": 27.198486328125, "rewards/rejected": 5.540248870849609, "step": 3838 }, { "epoch": 1.9870600414078674, "grad_norm": 0.9407005310058594, "learning_rate": 2.8063780192282097e-06, "loss": 0.12071309983730316, "rewards/accuracies": 0.9375, "rewards/chosen": 34.700286865234375, "rewards/margins": 29.004974365234375, "rewards/rejected": 5.696020126342773, "step": 3839 }, { "epoch": 1.9875776397515528, "grad_norm": 0.8207988142967224, "learning_rate": 2.8038147156773497e-06, "loss": 0.08863706886768341, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.426883697509766, "rewards/margins": 29.3121337890625, "rewards/rejected": 7.118183135986328, "step": 3840 }, { "epoch": 1.9880952380952381, "grad_norm": 0.6883071064949036, "learning_rate": 2.801252127110445e-06, "loss": 0.08319315314292908, "rewards/accuracies": 0.96875, "rewards/chosen": 35.62861633300781, "rewards/margins": 29.420822143554688, "rewards/rejected": 6.2058258056640625, "step": 3841 }, { "epoch": 1.9886128364389233, "grad_norm": 1.2528589963912964, "learning_rate": 2.798690254361769e-06, "loss": 0.13294264674186707, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.168949127197266, "rewards/margins": 28.387229919433594, "rewards/rejected": 4.78138542175293, "step": 3842 }, { "epoch": 1.9891304347826086, "grad_norm": 0.5799947381019592, "learning_rate": 2.796129098265353e-06, "loss": 0.09853062033653259, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.94810485839844, "rewards/margins": 32.48186492919922, "rewards/rejected": 7.467813491821289, "step": 3843 }, { "epoch": 1.989648033126294, "grad_norm": 0.8417988419532776, "learning_rate": 2.7935686596550014e-06, "loss": 0.12018201500177383, "rewards/accuracies": 0.9296875, "rewards/chosen": 34.176300048828125, "rewards/margins": 29.06146240234375, "rewards/rejected": 5.112127304077148, "step": 3844 }, { "epoch": 1.9901656314699792, "grad_norm": 0.9099281430244446, "learning_rate": 2.7910089393642826e-06, "loss": 0.07751991599798203, "rewards/accuracies": 0.9765625, "rewards/chosen": 36.79144287109375, "rewards/margins": 31.03668212890625, "rewards/rejected": 5.757774353027344, "step": 3845 }, { "epoch": 1.9906832298136647, "grad_norm": 1.2643002271652222, "learning_rate": 2.7884499382265364e-06, "loss": 0.17226363718509674, "rewards/accuracies": 0.9375, "rewards/chosen": 38.02367401123047, "rewards/margins": 30.7218017578125, "rewards/rejected": 7.306182384490967, "step": 3846 }, { "epoch": 1.99120082815735, "grad_norm": 0.7677261233329773, "learning_rate": 2.785891657074858e-06, "loss": 0.11863785237073898, "rewards/accuracies": 0.9453125, "rewards/chosen": 36.91864013671875, "rewards/margins": 30.4036865234375, "rewards/rejected": 6.513521194458008, "step": 3847 }, { "epoch": 1.991718426501035, "grad_norm": 0.9531682133674622, "learning_rate": 2.7833340967421174e-06, "loss": 0.15772227942943573, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.743247985839844, "rewards/margins": 32.128257751464844, "rewards/rejected": 5.617763519287109, "step": 3848 }, { "epoch": 1.9922360248447206, "grad_norm": 1.104067325592041, "learning_rate": 2.780777258060946e-06, "loss": 0.12924550473690033, "rewards/accuracies": 0.9375, "rewards/chosen": 36.706607818603516, "rewards/margins": 30.274368286132812, "rewards/rejected": 6.4379730224609375, "step": 3849 }, { "epoch": 1.9927536231884058, "grad_norm": 1.742668628692627, "learning_rate": 2.778221141863742e-06, "loss": 0.19030368328094482, "rewards/accuracies": 0.8828125, "rewards/chosen": 35.45610046386719, "rewards/margins": 29.184371948242188, "rewards/rejected": 6.270149230957031, "step": 3850 }, { "epoch": 1.9932712215320911, "grad_norm": 0.8099491596221924, "learning_rate": 2.7756657489826656e-06, "loss": 0.11848746240139008, "rewards/accuracies": 0.9375, "rewards/chosen": 39.99861526489258, "rewards/margins": 33.35198974609375, "rewards/rejected": 6.63934326171875, "step": 3851 }, { "epoch": 1.9937888198757765, "grad_norm": 1.630937099456787, "learning_rate": 2.7731110802496453e-06, "loss": 0.21528688073158264, "rewards/accuracies": 0.8984375, "rewards/chosen": 33.851932525634766, "rewards/margins": 28.11053466796875, "rewards/rejected": 5.737295150756836, "step": 3852 }, { "epoch": 1.9943064182194616, "grad_norm": 0.8915739059448242, "learning_rate": 2.7705571364963716e-06, "loss": 0.1273833066225052, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.745811462402344, "rewards/margins": 30.460739135742188, "rewards/rejected": 7.292209625244141, "step": 3853 }, { "epoch": 1.994824016563147, "grad_norm": 0.8576055765151978, "learning_rate": 2.768003918554299e-06, "loss": 0.14277152717113495, "rewards/accuracies": 0.9296875, "rewards/chosen": 35.959739685058594, "rewards/margins": 29.14714813232422, "rewards/rejected": 6.809295654296875, "step": 3854 }, { "epoch": 1.9953416149068324, "grad_norm": 1.3366403579711914, "learning_rate": 2.7654514272546458e-06, "loss": 0.11496759206056595, "rewards/accuracies": 0.9453125, "rewards/chosen": 40.98469161987305, "rewards/margins": 33.39410400390625, "rewards/rejected": 7.585034370422363, "step": 3855 }, { "epoch": 1.9958592132505175, "grad_norm": 0.8825064301490784, "learning_rate": 2.7628996634283937e-06, "loss": 0.13211363554000854, "rewards/accuracies": 0.9375, "rewards/chosen": 36.180694580078125, "rewards/margins": 29.844558715820312, "rewards/rejected": 6.324573516845703, "step": 3856 }, { "epoch": 1.9963768115942029, "grad_norm": 1.6881349086761475, "learning_rate": 2.7603486279062884e-06, "loss": 0.12204644829034805, "rewards/accuracies": 0.9453125, "rewards/chosen": 38.07575988769531, "rewards/margins": 32.58451843261719, "rewards/rejected": 5.489510536193848, "step": 3857 }, { "epoch": 1.9968944099378882, "grad_norm": 2.603529691696167, "learning_rate": 2.757798321518839e-06, "loss": 0.17866453528404236, "rewards/accuracies": 0.9140625, "rewards/chosen": 33.87554931640625, "rewards/margins": 27.858673095703125, "rewards/rejected": 6.017555236816406, "step": 3858 }, { "epoch": 1.9974120082815734, "grad_norm": 1.818960189819336, "learning_rate": 2.755248745096313e-06, "loss": 0.08197580277919769, "rewards/accuracies": 0.953125, "rewards/chosen": 38.126220703125, "rewards/margins": 32.221282958984375, "rewards/rejected": 5.90203857421875, "step": 3859 }, { "epoch": 1.9979296066252588, "grad_norm": 1.0140609741210938, "learning_rate": 2.752699899468746e-06, "loss": 0.1460154950618744, "rewards/accuracies": 0.90625, "rewards/chosen": 35.18709182739258, "rewards/margins": 29.798011779785156, "rewards/rejected": 5.389095306396484, "step": 3860 }, { "epoch": 1.9984472049689441, "grad_norm": 1.686501383781433, "learning_rate": 2.750151785465933e-06, "loss": 0.1904926300048828, "rewards/accuracies": 0.890625, "rewards/chosen": 36.10875701904297, "rewards/margins": 30.132186889648438, "rewards/rejected": 5.96988582611084, "step": 3861 }, { "epoch": 1.9989648033126293, "grad_norm": 0.9468633532524109, "learning_rate": 2.7476044039174317e-06, "loss": 0.1420857012271881, "rewards/accuracies": 0.9140625, "rewards/chosen": 37.91312026977539, "rewards/margins": 31.973121643066406, "rewards/rejected": 5.93077278137207, "step": 3862 }, { "epoch": 1.9994824016563149, "grad_norm": 1.0703308582305908, "learning_rate": 2.745057755652555e-06, "loss": 0.12772265076637268, "rewards/accuracies": 0.9453125, "rewards/chosen": 33.65774917602539, "rewards/margins": 28.556259155273438, "rewards/rejected": 5.106136322021484, "step": 3863 }, { "epoch": 2.0, "grad_norm": 1.0283572673797607, "learning_rate": 2.7425118415003886e-06, "loss": 0.1091877892613411, "rewards/accuracies": 0.953125, "rewards/chosen": 42.779232025146484, "rewards/margins": 36.41496276855469, "rewards/rejected": 6.3664164543151855, "step": 3864 }, { "epoch": 2.000517598343685, "grad_norm": 0.5266503691673279, "learning_rate": 2.7399666622897738e-06, "loss": 0.09142997860908508, "rewards/accuracies": 0.9609375, "rewards/chosen": 38.31092071533203, "rewards/margins": 31.726638793945312, "rewards/rejected": 6.58939266204834, "step": 3865 }, { "epoch": 2.0010351966873707, "grad_norm": 0.7140711545944214, "learning_rate": 2.737422218849307e-06, "loss": 0.10334475338459015, "rewards/accuracies": 0.9765625, "rewards/chosen": 36.98686599731445, "rewards/margins": 30.25885009765625, "rewards/rejected": 6.721277236938477, "step": 3866 }, { "epoch": 2.001552795031056, "grad_norm": 0.9100161790847778, "learning_rate": 2.734878512007351e-06, "loss": 0.10411512851715088, "rewards/accuracies": 0.9609375, "rewards/chosen": 37.56876754760742, "rewards/margins": 31.52301025390625, "rewards/rejected": 6.038902282714844, "step": 3867 }, { "epoch": 2.002070393374741, "grad_norm": 0.598502516746521, "learning_rate": 2.732335542592031e-06, "loss": 0.1177356094121933, "rewards/accuracies": 0.9453125, "rewards/chosen": 42.54646682739258, "rewards/margins": 34.962059020996094, "rewards/rejected": 7.590326309204102, "step": 3868 }, { "epoch": 2.0025879917184266, "grad_norm": 1.2255016565322876, "learning_rate": 2.72979331143123e-06, "loss": 0.12205130606889725, "rewards/accuracies": 0.9453125, "rewards/chosen": 37.05937194824219, "rewards/margins": 30.9906005859375, "rewards/rejected": 6.065309524536133, "step": 3869 }, { "epoch": 2.0031055900621118, "grad_norm": 1.0639580488204956, "learning_rate": 2.7272518193525855e-06, "loss": 0.0908719152212143, "rewards/accuracies": 0.96875, "rewards/chosen": 39.1112174987793, "rewards/margins": 32.572021484375, "rewards/rejected": 6.548954010009766, "step": 3870 }, { "epoch": 2.003623188405797, "grad_norm": 0.8782691955566406, "learning_rate": 2.7247110671834974e-06, "loss": 0.10074497759342194, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.22859191894531, "rewards/margins": 33.82830810546875, "rewards/rejected": 5.40312385559082, "step": 3871 }, { "epoch": 2.0041407867494825, "grad_norm": 1.4576011896133423, "learning_rate": 2.7221710557511337e-06, "loss": 0.07701990008354187, "rewards/accuracies": 0.9609375, "rewards/chosen": 43.32061767578125, "rewards/margins": 35.948516845703125, "rewards/rejected": 7.374961853027344, "step": 3872 }, { "epoch": 2.0046583850931676, "grad_norm": 1.3173671960830688, "learning_rate": 2.719631785882406e-06, "loss": 0.07463642954826355, "rewards/accuracies": 0.96875, "rewards/chosen": 42.895294189453125, "rewards/margins": 35.25933837890625, "rewards/rejected": 7.634906768798828, "step": 3873 }, { "epoch": 2.005175983436853, "grad_norm": 0.799344539642334, "learning_rate": 2.7170932584039954e-06, "loss": 0.058527588844299316, "rewards/accuracies": 0.984375, "rewards/chosen": 41.86354064941406, "rewards/margins": 33.93721008300781, "rewards/rejected": 7.926243782043457, "step": 3874 }, { "epoch": 2.0056935817805384, "grad_norm": 0.5363348126411438, "learning_rate": 2.714555474142336e-06, "loss": 0.0689062848687172, "rewards/accuracies": 0.9609375, "rewards/chosen": 44.144065856933594, "rewards/margins": 37.38257598876953, "rewards/rejected": 6.762409210205078, "step": 3875 }, { "epoch": 2.0062111801242235, "grad_norm": 0.5511012673377991, "learning_rate": 2.7120184339236232e-06, "loss": 0.06284074485301971, "rewards/accuracies": 0.9765625, "rewards/chosen": 42.53984069824219, "rewards/margins": 35.24896240234375, "rewards/rejected": 7.298465728759766, "step": 3876 }, { "epoch": 2.006728778467909, "grad_norm": 1.0653809309005737, "learning_rate": 2.709482138573808e-06, "loss": 0.07733552157878876, "rewards/accuracies": 0.984375, "rewards/chosen": 46.52095413208008, "rewards/margins": 37.611175537109375, "rewards/rejected": 8.90842056274414, "step": 3877 }, { "epoch": 2.0072463768115942, "grad_norm": 0.49622225761413574, "learning_rate": 2.7069465889186003e-06, "loss": 0.05078909546136856, "rewards/accuracies": 0.9921875, "rewards/chosen": 43.286102294921875, "rewards/margins": 34.25128173828125, "rewards/rejected": 9.035648345947266, "step": 3878 }, { "epoch": 2.0077639751552794, "grad_norm": 0.4276379644870758, "learning_rate": 2.704411785783466e-06, "loss": 0.05362403765320778, "rewards/accuracies": 0.96875, "rewards/chosen": 45.954254150390625, "rewards/margins": 37.43635559082031, "rewards/rejected": 8.525613784790039, "step": 3879 }, { "epoch": 2.008281573498965, "grad_norm": 0.6971765160560608, "learning_rate": 2.7018777299936284e-06, "loss": 0.11685596406459808, "rewards/accuracies": 0.9453125, "rewards/chosen": 39.41582107543945, "rewards/margins": 31.902145385742188, "rewards/rejected": 7.518320083618164, "step": 3880 }, { "epoch": 2.00879917184265, "grad_norm": 0.6471461057662964, "learning_rate": 2.6993444223740677e-06, "loss": 0.09922587126493454, "rewards/accuracies": 0.953125, "rewards/chosen": 47.903160095214844, "rewards/margins": 37.25782775878906, "rewards/rejected": 10.651422500610352, "step": 3881 }, { "epoch": 2.0093167701863353, "grad_norm": 0.6683308482170105, "learning_rate": 2.696811863749521e-06, "loss": 0.09174641966819763, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.07915496826172, "rewards/margins": 36.31201934814453, "rewards/rejected": 8.77157211303711, "step": 3882 }, { "epoch": 2.009834368530021, "grad_norm": 2.5287322998046875, "learning_rate": 2.6942800549444803e-06, "loss": 0.1130371019244194, "rewards/accuracies": 0.96875, "rewards/chosen": 41.544403076171875, "rewards/margins": 31.97186279296875, "rewards/rejected": 9.57215690612793, "step": 3883 }, { "epoch": 2.010351966873706, "grad_norm": 0.7197549939155579, "learning_rate": 2.691748996783195e-06, "loss": 0.046881407499313354, "rewards/accuracies": 0.984375, "rewards/chosen": 54.54691696166992, "rewards/margins": 42.209930419921875, "rewards/rejected": 12.334173202514648, "step": 3884 }, { "epoch": 2.010869565217391, "grad_norm": 0.8972259759902954, "learning_rate": 2.689218690089671e-06, "loss": 0.06269651651382446, "rewards/accuracies": 0.9765625, "rewards/chosen": 45.67182159423828, "rewards/margins": 35.402679443359375, "rewards/rejected": 10.269721984863281, "step": 3885 }, { "epoch": 2.0113871635610767, "grad_norm": 0.6320930123329163, "learning_rate": 2.6866891356876617e-06, "loss": 0.04433745890855789, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.32145690917969, "rewards/margins": 37.397796630859375, "rewards/rejected": 11.92544174194336, "step": 3886 }, { "epoch": 2.011904761904762, "grad_norm": 0.995725154876709, "learning_rate": 2.6841603344006876e-06, "loss": 0.053351111710071564, "rewards/accuracies": 0.96875, "rewards/chosen": 52.516265869140625, "rewards/margins": 39.770782470703125, "rewards/rejected": 12.75058364868164, "step": 3887 }, { "epoch": 2.012422360248447, "grad_norm": 1.0156385898590088, "learning_rate": 2.681632287052019e-06, "loss": 0.05289935693144798, "rewards/accuracies": 0.984375, "rewards/chosen": 42.386497497558594, "rewards/margins": 34.410980224609375, "rewards/rejected": 7.976912975311279, "step": 3888 }, { "epoch": 2.0129399585921326, "grad_norm": 1.6727913618087769, "learning_rate": 2.679104994464673e-06, "loss": 0.10791397094726562, "rewards/accuracies": 0.9375, "rewards/chosen": 40.13770294189453, "rewards/margins": 31.0325927734375, "rewards/rejected": 9.107110977172852, "step": 3889 }, { "epoch": 2.0134575569358177, "grad_norm": 0.6400967240333557, "learning_rate": 2.676578457461435e-06, "loss": 0.04577195644378662, "rewards/accuracies": 0.984375, "rewards/chosen": 54.040218353271484, "rewards/margins": 38.971229553222656, "rewards/rejected": 15.06496810913086, "step": 3890 }, { "epoch": 2.0139751552795033, "grad_norm": 0.7451190948486328, "learning_rate": 2.6740526768648344e-06, "loss": 0.059007517993450165, "rewards/accuracies": 0.984375, "rewards/chosen": 44.53958511352539, "rewards/margins": 33.8226318359375, "rewards/rejected": 10.712188720703125, "step": 3891 }, { "epoch": 2.0144927536231885, "grad_norm": 1.854663372039795, "learning_rate": 2.6715276534971596e-06, "loss": 0.11828917264938354, "rewards/accuracies": 0.96875, "rewards/chosen": 57.107627868652344, "rewards/margins": 41.77790832519531, "rewards/rejected": 15.327945709228516, "step": 3892 }, { "epoch": 2.0150103519668736, "grad_norm": 0.876960277557373, "learning_rate": 2.6690033881804445e-06, "loss": 0.06711423397064209, "rewards/accuracies": 0.96875, "rewards/chosen": 53.994110107421875, "rewards/margins": 39.45416259765625, "rewards/rejected": 14.541084289550781, "step": 3893 }, { "epoch": 2.015527950310559, "grad_norm": 1.1932746171951294, "learning_rate": 2.6664798817364867e-06, "loss": 0.09864351153373718, "rewards/accuracies": 0.953125, "rewards/chosen": 49.6993408203125, "rewards/margins": 36.78753662109375, "rewards/rejected": 12.894781112670898, "step": 3894 }, { "epoch": 2.0160455486542443, "grad_norm": 1.4676342010498047, "learning_rate": 2.663957134986833e-06, "loss": 0.0780605673789978, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.19861602783203, "rewards/margins": 42.24737548828125, "rewards/rejected": 14.958293914794922, "step": 3895 }, { "epoch": 2.0165631469979295, "grad_norm": 0.8961125612258911, "learning_rate": 2.6614351487527777e-06, "loss": 0.07828479260206223, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.23389434814453, "rewards/margins": 35.74601745605469, "rewards/rejected": 11.490028381347656, "step": 3896 }, { "epoch": 2.017080745341615, "grad_norm": 0.789277970790863, "learning_rate": 2.658913923855372e-06, "loss": 0.04671042785048485, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.66444778442383, "rewards/margins": 40.913330078125, "rewards/rejected": 13.770427703857422, "step": 3897 }, { "epoch": 2.0175983436853, "grad_norm": 1.4508981704711914, "learning_rate": 2.656393461115424e-06, "loss": 0.053201593458652496, "rewards/accuracies": 0.984375, "rewards/chosen": 48.433780670166016, "rewards/margins": 36.01774597167969, "rewards/rejected": 12.407261848449707, "step": 3898 }, { "epoch": 2.0181159420289854, "grad_norm": 1.403012752532959, "learning_rate": 2.6538737613534826e-06, "loss": 0.05789349973201752, "rewards/accuracies": 0.984375, "rewards/chosen": 52.51703643798828, "rewards/margins": 38.13145446777344, "rewards/rejected": 14.380731582641602, "step": 3899 }, { "epoch": 2.018633540372671, "grad_norm": 0.7425548434257507, "learning_rate": 2.6513548253898567e-06, "loss": 0.034408215433359146, "rewards/accuracies": 0.984375, "rewards/chosen": 55.56303405761719, "rewards/margins": 39.138458251953125, "rewards/rejected": 16.436458587646484, "step": 3900 }, { "epoch": 2.019151138716356, "grad_norm": 0.8594541549682617, "learning_rate": 2.648836654044602e-06, "loss": 0.050147730857133865, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.37603759765625, "rewards/margins": 38.67333984375, "rewards/rejected": 13.696950912475586, "step": 3901 }, { "epoch": 2.0196687370600412, "grad_norm": 1.4074102640151978, "learning_rate": 2.6463192481375322e-06, "loss": 0.12045350670814514, "rewards/accuracies": 0.9375, "rewards/chosen": 50.70503234863281, "rewards/margins": 34.11040496826172, "rewards/rejected": 16.585399627685547, "step": 3902 }, { "epoch": 2.020186335403727, "grad_norm": 1.4204894304275513, "learning_rate": 2.643802608488203e-06, "loss": 0.06396907567977905, "rewards/accuracies": 0.96875, "rewards/chosen": 52.233245849609375, "rewards/margins": 38.12477111816406, "rewards/rejected": 14.10371208190918, "step": 3903 }, { "epoch": 2.020703933747412, "grad_norm": 1.6621588468551636, "learning_rate": 2.641286735915925e-06, "loss": 0.1096406951546669, "rewards/accuracies": 0.953125, "rewards/chosen": 56.1248893737793, "rewards/margins": 39.70491027832031, "rewards/rejected": 16.438440322875977, "step": 3904 }, { "epoch": 2.021221532091097, "grad_norm": 0.7370873689651489, "learning_rate": 2.638771631239759e-06, "loss": 0.05278654396533966, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.1866455078125, "rewards/margins": 38.632354736328125, "rewards/rejected": 15.567634582519531, "step": 3905 }, { "epoch": 2.0217391304347827, "grad_norm": 1.0787702798843384, "learning_rate": 2.636257295278516e-06, "loss": 0.0792759358882904, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.29820251464844, "rewards/margins": 36.168548583984375, "rewards/rejected": 15.11587142944336, "step": 3906 }, { "epoch": 2.022256728778468, "grad_norm": 1.2701830863952637, "learning_rate": 2.633743728850756e-06, "loss": 0.06864187121391296, "rewards/accuracies": 0.96875, "rewards/chosen": 61.508338928222656, "rewards/margins": 43.774169921875, "rewards/rejected": 17.73910903930664, "step": 3907 }, { "epoch": 2.022774327122153, "grad_norm": 4.784791469573975, "learning_rate": 2.631230932774788e-06, "loss": 0.1302993893623352, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.40763854980469, "rewards/margins": 39.14239501953125, "rewards/rejected": 13.273229598999023, "step": 3908 }, { "epoch": 2.0232919254658386, "grad_norm": 0.7439733743667603, "learning_rate": 2.628718907868672e-06, "loss": 0.04948553070425987, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.52916717529297, "rewards/margins": 35.731536865234375, "rewards/rejected": 15.798927307128906, "step": 3909 }, { "epoch": 2.0238095238095237, "grad_norm": 1.2501379251480103, "learning_rate": 2.6262076549502147e-06, "loss": 0.05620922893285751, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.90229034423828, "rewards/margins": 42.078025817871094, "rewards/rejected": 15.819271087646484, "step": 3910 }, { "epoch": 2.0243271221532093, "grad_norm": 0.5059860348701477, "learning_rate": 2.6236971748369754e-06, "loss": 0.03715889900922775, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.96015930175781, "rewards/margins": 43.049713134765625, "rewards/rejected": 13.899372100830078, "step": 3911 }, { "epoch": 2.0248447204968945, "grad_norm": 1.4108229875564575, "learning_rate": 2.6211874683462525e-06, "loss": 0.05033773183822632, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.891990661621094, "rewards/margins": 36.568145751953125, "rewards/rejected": 14.335097312927246, "step": 3912 }, { "epoch": 2.0253623188405796, "grad_norm": 1.1369396448135376, "learning_rate": 2.618678536295105e-06, "loss": 0.07294318079948425, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.78132629394531, "rewards/margins": 42.87467956542969, "rewards/rejected": 16.900440216064453, "step": 3913 }, { "epoch": 2.025879917184265, "grad_norm": 1.3567159175872803, "learning_rate": 2.6161703795003325e-06, "loss": 0.1131109744310379, "rewards/accuracies": 0.953125, "rewards/chosen": 45.26441192626953, "rewards/margins": 32.516510009765625, "rewards/rejected": 12.750381469726562, "step": 3914 }, { "epoch": 2.0263975155279503, "grad_norm": 1.7469477653503418, "learning_rate": 2.613662998778485e-06, "loss": 0.11877540498971939, "rewards/accuracies": 0.9375, "rewards/chosen": 56.36406707763672, "rewards/margins": 37.3565673828125, "rewards/rejected": 19.001983642578125, "step": 3915 }, { "epoch": 2.0269151138716355, "grad_norm": 1.3208979368209839, "learning_rate": 2.611156394945853e-06, "loss": 0.1264510303735733, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.199310302734375, "rewards/margins": 37.14872741699219, "rewards/rejected": 16.061386108398438, "step": 3916 }, { "epoch": 2.027432712215321, "grad_norm": 1.7115803956985474, "learning_rate": 2.6086505688184836e-06, "loss": 0.08010430634021759, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.87892532348633, "rewards/margins": 35.56077575683594, "rewards/rejected": 15.30449104309082, "step": 3917 }, { "epoch": 2.027950310559006, "grad_norm": 2.5410025119781494, "learning_rate": 2.606145521212169e-06, "loss": 0.08515428006649017, "rewards/accuracies": 0.96875, "rewards/chosen": 54.469234466552734, "rewards/margins": 38.83856964111328, "rewards/rejected": 15.617589950561523, "step": 3918 }, { "epoch": 2.0284679089026914, "grad_norm": 1.0397108793258667, "learning_rate": 2.603641252942438e-06, "loss": 0.06659909337759018, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.90950012207031, "rewards/margins": 42.07911682128906, "rewards/rejected": 16.828702926635742, "step": 3919 }, { "epoch": 2.028985507246377, "grad_norm": 1.9031809568405151, "learning_rate": 2.60113776482458e-06, "loss": 0.12871688604354858, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.09191131591797, "rewards/margins": 38.54032897949219, "rewards/rejected": 18.563459396362305, "step": 3920 }, { "epoch": 2.029503105590062, "grad_norm": 1.8467071056365967, "learning_rate": 2.598635057673623e-06, "loss": 0.08429579436779022, "rewards/accuracies": 0.984375, "rewards/chosen": 51.82147979736328, "rewards/margins": 37.86065673828125, "rewards/rejected": 13.965841293334961, "step": 3921 }, { "epoch": 2.0300207039337472, "grad_norm": 1.4015097618103027, "learning_rate": 2.5961331323043376e-06, "loss": 0.08801499754190445, "rewards/accuracies": 0.96875, "rewards/chosen": 57.333770751953125, "rewards/margins": 40.381744384765625, "rewards/rejected": 16.947425842285156, "step": 3922 }, { "epoch": 2.030538302277433, "grad_norm": 1.2880252599716187, "learning_rate": 2.5936319895312433e-06, "loss": 0.06760279834270477, "rewards/accuracies": 0.96875, "rewards/chosen": 49.580623626708984, "rewards/margins": 34.695594787597656, "rewards/rejected": 14.881109237670898, "step": 3923 }, { "epoch": 2.031055900621118, "grad_norm": 3.3299548625946045, "learning_rate": 2.59113163016861e-06, "loss": 0.10880982130765915, "rewards/accuracies": 0.96875, "rewards/chosen": 57.715049743652344, "rewards/margins": 40.094451904296875, "rewards/rejected": 17.611324310302734, "step": 3924 }, { "epoch": 2.031573498964803, "grad_norm": 1.8383692502975464, "learning_rate": 2.588632055030447e-06, "loss": 0.1235952153801918, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.17991638183594, "rewards/margins": 36.3363037109375, "rewards/rejected": 17.854738235473633, "step": 3925 }, { "epoch": 2.0320910973084887, "grad_norm": 0.9268056154251099, "learning_rate": 2.5861332649305056e-06, "loss": 0.07318257540464401, "rewards/accuracies": 0.953125, "rewards/chosen": 53.945823669433594, "rewards/margins": 38.951988220214844, "rewards/rejected": 14.983421325683594, "step": 3926 }, { "epoch": 2.032608695652174, "grad_norm": 0.9318631887435913, "learning_rate": 2.583635260682284e-06, "loss": 0.07408007979393005, "rewards/accuracies": 0.984375, "rewards/chosen": 47.00503158569336, "rewards/margins": 33.281036376953125, "rewards/rejected": 13.716102600097656, "step": 3927 }, { "epoch": 2.0331262939958594, "grad_norm": 0.7649117708206177, "learning_rate": 2.581138043099032e-06, "loss": 0.07923334091901779, "rewards/accuracies": 0.96875, "rewards/chosen": 49.67039489746094, "rewards/margins": 34.00691223144531, "rewards/rejected": 15.6680326461792, "step": 3928 }, { "epoch": 2.0336438923395446, "grad_norm": 1.9369945526123047, "learning_rate": 2.5786416129937314e-06, "loss": 0.101462721824646, "rewards/accuracies": 0.953125, "rewards/chosen": 49.31640625, "rewards/margins": 35.467071533203125, "rewards/rejected": 13.850288391113281, "step": 3929 }, { "epoch": 2.0341614906832297, "grad_norm": 0.5809380412101746, "learning_rate": 2.576145971179114e-06, "loss": 0.04654476046562195, "rewards/accuracies": 0.984375, "rewards/chosen": 44.51960754394531, "rewards/margins": 32.600006103515625, "rewards/rejected": 11.914644241333008, "step": 3930 }, { "epoch": 2.0346790890269153, "grad_norm": 0.9625077843666077, "learning_rate": 2.573651118467655e-06, "loss": 0.06925509870052338, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.2694091796875, "rewards/margins": 37.542266845703125, "rewards/rejected": 12.746764183044434, "step": 3931 }, { "epoch": 2.0351966873706004, "grad_norm": 0.7828927040100098, "learning_rate": 2.5711570556715703e-06, "loss": 0.08630190789699554, "rewards/accuracies": 0.9453125, "rewards/chosen": 47.15899658203125, "rewards/margins": 34.834381103515625, "rewards/rejected": 12.32475471496582, "step": 3932 }, { "epoch": 2.0357142857142856, "grad_norm": 0.6629186272621155, "learning_rate": 2.5686637836028216e-06, "loss": 0.06428688019514084, "rewards/accuracies": 0.984375, "rewards/chosen": 52.85803985595703, "rewards/margins": 39.22523498535156, "rewards/rejected": 13.634883880615234, "step": 3933 }, { "epoch": 2.036231884057971, "grad_norm": 0.9661129117012024, "learning_rate": 2.5661713030731107e-06, "loss": 0.06936459243297577, "rewards/accuracies": 0.9921875, "rewards/chosen": 50.8323974609375, "rewards/margins": 37.454498291015625, "rewards/rejected": 13.384663581848145, "step": 3934 }, { "epoch": 2.0367494824016563, "grad_norm": 0.7029432654380798, "learning_rate": 2.5636796148938824e-06, "loss": 0.0749032273888588, "rewards/accuracies": 0.96875, "rewards/chosen": 45.867340087890625, "rewards/margins": 34.73706817626953, "rewards/rejected": 11.139551162719727, "step": 3935 }, { "epoch": 2.0372670807453415, "grad_norm": 6.522998809814453, "learning_rate": 2.5611887198763246e-06, "loss": 0.14542162418365479, "rewards/accuracies": 0.953125, "rewards/chosen": 48.640357971191406, "rewards/margins": 35.24613952636719, "rewards/rejected": 13.3939208984375, "step": 3936 }, { "epoch": 2.037784679089027, "grad_norm": 0.7861965894699097, "learning_rate": 2.5586986188313654e-06, "loss": 0.08365863561630249, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.0977783203125, "rewards/margins": 40.856414794921875, "rewards/rejected": 16.233882904052734, "step": 3937 }, { "epoch": 2.038302277432712, "grad_norm": 0.835364818572998, "learning_rate": 2.556209312569676e-06, "loss": 0.07454323768615723, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.68555450439453, "rewards/margins": 39.319576263427734, "rewards/rejected": 14.364938735961914, "step": 3938 }, { "epoch": 2.0388198757763973, "grad_norm": 0.6524861454963684, "learning_rate": 2.5537208019016668e-06, "loss": 0.05385447293519974, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.06391143798828, "rewards/margins": 37.51561737060547, "rewards/rejected": 13.547103881835938, "step": 3939 }, { "epoch": 2.039337474120083, "grad_norm": 1.5867702960968018, "learning_rate": 2.5512330876374915e-06, "loss": 0.10754317790269852, "rewards/accuracies": 0.953125, "rewards/chosen": 45.868560791015625, "rewards/margins": 33.1622314453125, "rewards/rejected": 12.710861206054688, "step": 3940 }, { "epoch": 2.039855072463768, "grad_norm": 1.6119076013565063, "learning_rate": 2.548746170587044e-06, "loss": 0.11266590654850006, "rewards/accuracies": 0.953125, "rewards/chosen": 47.09827423095703, "rewards/margins": 36.2491455078125, "rewards/rejected": 10.847835540771484, "step": 3941 }, { "epoch": 2.040372670807453, "grad_norm": 1.0992618799209595, "learning_rate": 2.5462600515599545e-06, "loss": 0.06611986458301544, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.927223205566406, "rewards/margins": 39.25421142578125, "rewards/rejected": 12.67824935913086, "step": 3942 }, { "epoch": 2.040890269151139, "grad_norm": 0.9010891318321228, "learning_rate": 2.5437747313656e-06, "loss": 0.05483151227235794, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.76991653442383, "rewards/margins": 41.42103576660156, "rewards/rejected": 12.332305908203125, "step": 3943 }, { "epoch": 2.041407867494824, "grad_norm": 0.8640077114105225, "learning_rate": 2.541290210813097e-06, "loss": 0.08755111694335938, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.66761779785156, "rewards/margins": 37.01344299316406, "rewards/rejected": 10.645334243774414, "step": 3944 }, { "epoch": 2.0419254658385095, "grad_norm": 0.8529109954833984, "learning_rate": 2.5388064907112936e-06, "loss": 0.04992762953042984, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.89519119262695, "rewards/margins": 42.266357421875, "rewards/rejected": 12.619022369384766, "step": 3945 }, { "epoch": 2.0424430641821947, "grad_norm": 0.7990944981575012, "learning_rate": 2.5363235718687833e-06, "loss": 0.06512236595153809, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.55772399902344, "rewards/margins": 39.014556884765625, "rewards/rejected": 11.531144142150879, "step": 3946 }, { "epoch": 2.04296066252588, "grad_norm": 0.8354443311691284, "learning_rate": 2.5338414550939037e-06, "loss": 0.0777832418680191, "rewards/accuracies": 0.9453125, "rewards/chosen": 44.10279846191406, "rewards/margins": 33.91332244873047, "rewards/rejected": 10.192276954650879, "step": 3947 }, { "epoch": 2.0434782608695654, "grad_norm": 1.0339202880859375, "learning_rate": 2.5313601411947208e-06, "loss": 0.06720875948667526, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.875091552734375, "rewards/margins": 38.78038024902344, "rewards/rejected": 11.097076416015625, "step": 3948 }, { "epoch": 2.0439958592132506, "grad_norm": 0.9607865214347839, "learning_rate": 2.5288796309790435e-06, "loss": 0.08218139410018921, "rewards/accuracies": 0.953125, "rewards/chosen": 54.92901611328125, "rewards/margins": 42.14862060546875, "rewards/rejected": 12.776084899902344, "step": 3949 }, { "epoch": 2.0445134575569357, "grad_norm": 1.7815392017364502, "learning_rate": 2.5263999252544242e-06, "loss": 0.07651236653327942, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.039955139160156, "rewards/margins": 36.28800964355469, "rewards/rejected": 8.762496948242188, "step": 3950 }, { "epoch": 2.0450310559006213, "grad_norm": 1.4390287399291992, "learning_rate": 2.523921024828149e-06, "loss": 0.07625175267457962, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.34697723388672, "rewards/margins": 39.634315490722656, "rewards/rejected": 11.708442687988281, "step": 3951 }, { "epoch": 2.0455486542443064, "grad_norm": 1.0727328062057495, "learning_rate": 2.521442930507236e-06, "loss": 0.07164604961872101, "rewards/accuracies": 0.96875, "rewards/chosen": 46.73814010620117, "rewards/margins": 36.698699951171875, "rewards/rejected": 10.047945976257324, "step": 3952 }, { "epoch": 2.0460662525879916, "grad_norm": 0.9345002770423889, "learning_rate": 2.5189656430984497e-06, "loss": 0.07616101950407028, "rewards/accuracies": 0.9765625, "rewards/chosen": 44.69359588623047, "rewards/margins": 35.76165771484375, "rewards/rejected": 8.936164855957031, "step": 3953 }, { "epoch": 2.046583850931677, "grad_norm": 1.1353859901428223, "learning_rate": 2.5164891634082933e-06, "loss": 0.08536037057638168, "rewards/accuracies": 0.96875, "rewards/chosen": 51.13312530517578, "rewards/margins": 40.791290283203125, "rewards/rejected": 10.341835021972656, "step": 3954 }, { "epoch": 2.0471014492753623, "grad_norm": 0.857133150100708, "learning_rate": 2.5140134922429958e-06, "loss": 0.05908006429672241, "rewards/accuracies": 0.9765625, "rewards/chosen": 44.417938232421875, "rewards/margins": 35.153411865234375, "rewards/rejected": 9.240205764770508, "step": 3955 }, { "epoch": 2.0476190476190474, "grad_norm": 0.7682403922080994, "learning_rate": 2.511538630408533e-06, "loss": 0.05822708085179329, "rewards/accuracies": 0.96875, "rewards/chosen": 55.64813232421875, "rewards/margins": 42.492706298828125, "rewards/rejected": 13.14499282836914, "step": 3956 }, { "epoch": 2.048136645962733, "grad_norm": 1.183744192123413, "learning_rate": 2.5090645787106148e-06, "loss": 0.073357492685318, "rewards/accuracies": 0.96875, "rewards/chosen": 46.471343994140625, "rewards/margins": 36.62950134277344, "rewards/rejected": 9.830963134765625, "step": 3957 }, { "epoch": 2.048654244306418, "grad_norm": 1.0380538702011108, "learning_rate": 2.5065913379546846e-06, "loss": 0.07987166941165924, "rewards/accuracies": 0.953125, "rewards/chosen": 48.26171875, "rewards/margins": 39.9461669921875, "rewards/rejected": 8.31854248046875, "step": 3958 }, { "epoch": 2.0491718426501033, "grad_norm": 0.7669700980186462, "learning_rate": 2.504118908945925e-06, "loss": 0.0505702868103981, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.67267608642578, "rewards/margins": 38.793914794921875, "rewards/rejected": 10.877961158752441, "step": 3959 }, { "epoch": 2.049689440993789, "grad_norm": 1.3576414585113525, "learning_rate": 2.501647292489252e-06, "loss": 0.08365264534950256, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.18272399902344, "rewards/margins": 40.3291015625, "rewards/rejected": 13.854584693908691, "step": 3960 }, { "epoch": 2.050207039337474, "grad_norm": 1.0192204713821411, "learning_rate": 2.49917648938932e-06, "loss": 0.0673367828130722, "rewards/accuracies": 0.96875, "rewards/chosen": 48.36918640136719, "rewards/margins": 38.68408203125, "rewards/rejected": 9.675298690795898, "step": 3961 }, { "epoch": 2.050724637681159, "grad_norm": 1.2759150266647339, "learning_rate": 2.4967065004505153e-06, "loss": 0.07119433581829071, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.73561096191406, "rewards/margins": 41.4503173828125, "rewards/rejected": 10.285743713378906, "step": 3962 }, { "epoch": 2.051242236024845, "grad_norm": 1.1421583890914917, "learning_rate": 2.4942373264769607e-06, "loss": 0.06966345012187958, "rewards/accuracies": 0.953125, "rewards/chosen": 54.74610137939453, "rewards/margins": 42.13238525390625, "rewards/rejected": 12.620981216430664, "step": 3963 }, { "epoch": 2.05175983436853, "grad_norm": 1.004620909690857, "learning_rate": 2.4917689682725143e-06, "loss": 0.06229644641280174, "rewards/accuracies": 0.96875, "rewards/chosen": 54.26908493041992, "rewards/margins": 40.166595458984375, "rewards/rejected": 14.099502563476562, "step": 3964 }, { "epoch": 2.0522774327122155, "grad_norm": 0.8161779046058655, "learning_rate": 2.489301426640767e-06, "loss": 0.06372882425785065, "rewards/accuracies": 0.984375, "rewards/chosen": 50.25328063964844, "rewards/margins": 38.3968505859375, "rewards/rejected": 11.850954055786133, "step": 3965 }, { "epoch": 2.0527950310559007, "grad_norm": 1.1986337900161743, "learning_rate": 2.486834702385046e-06, "loss": 0.07715342938899994, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.788700103759766, "rewards/margins": 37.313446044921875, "rewards/rejected": 12.464111328125, "step": 3966 }, { "epoch": 2.053312629399586, "grad_norm": 1.78836190700531, "learning_rate": 2.4843687963084117e-06, "loss": 0.13506077229976654, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.594329833984375, "rewards/margins": 39.49469757080078, "rewards/rejected": 11.09385871887207, "step": 3967 }, { "epoch": 2.0538302277432714, "grad_norm": 2.2050888538360596, "learning_rate": 2.4819037092136527e-06, "loss": 0.11460394412279129, "rewards/accuracies": 0.9375, "rewards/chosen": 45.955780029296875, "rewards/margins": 35.19647216796875, "rewards/rejected": 10.742950439453125, "step": 3968 }, { "epoch": 2.0543478260869565, "grad_norm": 1.20005202293396, "learning_rate": 2.479439441903302e-06, "loss": 0.07523014396429062, "rewards/accuracies": 0.96875, "rewards/chosen": 55.85981369018555, "rewards/margins": 42.89820098876953, "rewards/rejected": 12.958145141601562, "step": 3969 }, { "epoch": 2.0548654244306417, "grad_norm": 0.9715726971626282, "learning_rate": 2.4769759951796184e-06, "loss": 0.07387430965900421, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.41007995605469, "rewards/margins": 40.572265625, "rewards/rejected": 11.840784072875977, "step": 3970 }, { "epoch": 2.0553830227743273, "grad_norm": 1.148349404335022, "learning_rate": 2.4745133698445917e-06, "loss": 0.065416119992733, "rewards/accuracies": 0.984375, "rewards/chosen": 54.8689079284668, "rewards/margins": 41.273468017578125, "rewards/rejected": 13.60220718383789, "step": 3971 }, { "epoch": 2.0559006211180124, "grad_norm": 1.3539636135101318, "learning_rate": 2.4720515666999472e-06, "loss": 0.10313482582569122, "rewards/accuracies": 0.953125, "rewards/chosen": 46.529869079589844, "rewards/margins": 35.733123779296875, "rewards/rejected": 10.799123764038086, "step": 3972 }, { "epoch": 2.0564182194616976, "grad_norm": 1.2797114849090576, "learning_rate": 2.469590586547147e-06, "loss": 0.06610377132892609, "rewards/accuracies": 0.96875, "rewards/chosen": 51.95751953125, "rewards/margins": 40.09596252441406, "rewards/rejected": 11.873739242553711, "step": 3973 }, { "epoch": 2.056935817805383, "grad_norm": 1.187504768371582, "learning_rate": 2.46713043018738e-06, "loss": 0.09986642003059387, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.322296142578125, "rewards/margins": 35.822509765625, "rewards/rejected": 11.491632461547852, "step": 3974 }, { "epoch": 2.0574534161490683, "grad_norm": 2.150763750076294, "learning_rate": 2.4646710984215643e-06, "loss": 0.08489462733268738, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.037384033203125, "rewards/margins": 39.283782958984375, "rewards/rejected": 13.75732421875, "step": 3975 }, { "epoch": 2.0579710144927534, "grad_norm": 2.749314308166504, "learning_rate": 2.462212592050354e-06, "loss": 0.11234159022569656, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.872093200683594, "rewards/margins": 41.325653076171875, "rewards/rejected": 15.560541152954102, "step": 3976 }, { "epoch": 2.058488612836439, "grad_norm": 1.1391546726226807, "learning_rate": 2.4597549118741378e-06, "loss": 0.047395236790180206, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.33203125, "rewards/margins": 38.862457275390625, "rewards/rejected": 11.480281829833984, "step": 3977 }, { "epoch": 2.059006211180124, "grad_norm": 1.0158649682998657, "learning_rate": 2.4572980586930264e-06, "loss": 0.05936478450894356, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.58790588378906, "rewards/margins": 44.43048095703125, "rewards/rejected": 15.159614562988281, "step": 3978 }, { "epoch": 2.0595238095238093, "grad_norm": 0.8377394080162048, "learning_rate": 2.4548420333068657e-06, "loss": 0.055073678493499756, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.18205261230469, "rewards/margins": 41.9566650390625, "rewards/rejected": 15.219486236572266, "step": 3979 }, { "epoch": 2.060041407867495, "grad_norm": 4.214540958404541, "learning_rate": 2.452386836515238e-06, "loss": 0.10644285380840302, "rewards/accuracies": 0.9375, "rewards/chosen": 52.26521301269531, "rewards/margins": 40.36488342285156, "rewards/rejected": 11.892044067382812, "step": 3980 }, { "epoch": 2.06055900621118, "grad_norm": 0.8008572459220886, "learning_rate": 2.4499324691174458e-06, "loss": 0.05454140529036522, "rewards/accuracies": 0.96875, "rewards/chosen": 50.61607360839844, "rewards/margins": 36.69598388671875, "rewards/rejected": 13.92702865600586, "step": 3981 }, { "epoch": 2.0610766045548656, "grad_norm": 1.4179738759994507, "learning_rate": 2.447478931912527e-06, "loss": 0.08926523476839066, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.221466064453125, "rewards/margins": 35.24871826171875, "rewards/rejected": 9.982345581054688, "step": 3982 }, { "epoch": 2.0615942028985508, "grad_norm": 1.5237576961517334, "learning_rate": 2.445026225699247e-06, "loss": 0.09691384434700012, "rewards/accuracies": 0.953125, "rewards/chosen": 55.42628479003906, "rewards/margins": 40.63311767578125, "rewards/rejected": 14.799858093261719, "step": 3983 }, { "epoch": 2.062111801242236, "grad_norm": 0.8464018106460571, "learning_rate": 2.4425743512761064e-06, "loss": 0.048479773104190826, "rewards/accuracies": 0.984375, "rewards/chosen": 48.101139068603516, "rewards/margins": 36.8134765625, "rewards/rejected": 11.284591674804688, "step": 3984 }, { "epoch": 2.0626293995859215, "grad_norm": 1.9999072551727295, "learning_rate": 2.440123309441326e-06, "loss": 0.06328701972961426, "rewards/accuracies": 0.96875, "rewards/chosen": 56.48639678955078, "rewards/margins": 42.568878173828125, "rewards/rejected": 13.914451599121094, "step": 3985 }, { "epoch": 2.0631469979296067, "grad_norm": 1.5486539602279663, "learning_rate": 2.4376731009928626e-06, "loss": 0.13091936707496643, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.003726959228516, "rewards/margins": 35.053985595703125, "rewards/rejected": 10.946516036987305, "step": 3986 }, { "epoch": 2.063664596273292, "grad_norm": 0.7525392770767212, "learning_rate": 2.435223726728398e-06, "loss": 0.07177618145942688, "rewards/accuracies": 0.96875, "rewards/chosen": 54.599853515625, "rewards/margins": 41.317169189453125, "rewards/rejected": 13.287662506103516, "step": 3987 }, { "epoch": 2.0641821946169774, "grad_norm": 1.229983925819397, "learning_rate": 2.432775187445344e-06, "loss": 0.08481934666633606, "rewards/accuracies": 0.953125, "rewards/chosen": 52.160919189453125, "rewards/margins": 39.65673828125, "rewards/rejected": 12.501077651977539, "step": 3988 }, { "epoch": 2.0646997929606625, "grad_norm": 1.1613065004348755, "learning_rate": 2.4303274839408407e-06, "loss": 0.09039778262376785, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.28077697753906, "rewards/margins": 40.32688903808594, "rewards/rejected": 10.942901611328125, "step": 3989 }, { "epoch": 2.0652173913043477, "grad_norm": 1.3222333192825317, "learning_rate": 2.427880617011756e-06, "loss": 0.10159920901060104, "rewards/accuracies": 0.953125, "rewards/chosen": 52.74285888671875, "rewards/margins": 39.34181213378906, "rewards/rejected": 13.412975311279297, "step": 3990 }, { "epoch": 2.0657349896480333, "grad_norm": 1.0105191469192505, "learning_rate": 2.425434587454683e-06, "loss": 0.06661079823970795, "rewards/accuracies": 0.96875, "rewards/chosen": 60.43647766113281, "rewards/margins": 44.68659973144531, "rewards/rejected": 15.750057220458984, "step": 3991 }, { "epoch": 2.0662525879917184, "grad_norm": 1.3093600273132324, "learning_rate": 2.422989396065946e-06, "loss": 0.11332139372825623, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.16939163208008, "rewards/margins": 35.25849151611328, "rewards/rejected": 9.91897201538086, "step": 3992 }, { "epoch": 2.0667701863354035, "grad_norm": 1.8168436288833618, "learning_rate": 2.4205450436415965e-06, "loss": 0.12343655526638031, "rewards/accuracies": 0.9375, "rewards/chosen": 53.83440399169922, "rewards/margins": 42.048919677734375, "rewards/rejected": 11.783016204833984, "step": 3993 }, { "epoch": 2.067287784679089, "grad_norm": 0.9977017641067505, "learning_rate": 2.4181015309774053e-06, "loss": 0.07401585578918457, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.179588317871094, "rewards/margins": 40.865936279296875, "rewards/rejected": 13.319480895996094, "step": 3994 }, { "epoch": 2.0678053830227743, "grad_norm": 1.2898240089416504, "learning_rate": 2.4156588588688814e-06, "loss": 0.0552632138133049, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.37776565551758, "rewards/margins": 42.56037902832031, "rewards/rejected": 12.813721656799316, "step": 3995 }, { "epoch": 2.0683229813664594, "grad_norm": 0.6713663935661316, "learning_rate": 2.413217028111251e-06, "loss": 0.05923168361186981, "rewards/accuracies": 0.96875, "rewards/chosen": 59.65815734863281, "rewards/margins": 44.987945556640625, "rewards/rejected": 14.665733337402344, "step": 3996 }, { "epoch": 2.068840579710145, "grad_norm": 0.8355859518051147, "learning_rate": 2.410776039499473e-06, "loss": 0.06233428046107292, "rewards/accuracies": 0.96875, "rewards/chosen": 49.107139587402344, "rewards/margins": 38.143646240234375, "rewards/rejected": 10.959144592285156, "step": 3997 }, { "epoch": 2.06935817805383, "grad_norm": 1.8330358266830444, "learning_rate": 2.4083358938282233e-06, "loss": 0.10223543643951416, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.2723388671875, "rewards/margins": 39.658355712890625, "rewards/rejected": 12.61723804473877, "step": 3998 }, { "epoch": 2.0698757763975157, "grad_norm": 1.4334909915924072, "learning_rate": 2.4058965918919136e-06, "loss": 0.054251596331596375, "rewards/accuracies": 0.96875, "rewards/chosen": 52.2115592956543, "rewards/margins": 40.1737060546875, "rewards/rejected": 12.03532600402832, "step": 3999 }, { "epoch": 2.070393374741201, "grad_norm": 1.232141375541687, "learning_rate": 2.403458134484677e-06, "loss": 0.086062490940094, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.89140319824219, "rewards/margins": 42.846439361572266, "rewards/rejected": 15.054195404052734, "step": 4000 }, { "epoch": 2.070910973084886, "grad_norm": 0.9535442590713501, "learning_rate": 2.4010205224003667e-06, "loss": 0.06453865021467209, "rewards/accuracies": 0.96875, "rewards/chosen": 48.984161376953125, "rewards/margins": 37.801422119140625, "rewards/rejected": 11.192802429199219, "step": 4001 }, { "epoch": 2.0714285714285716, "grad_norm": 1.0941458940505981, "learning_rate": 2.3985837564325655e-06, "loss": 0.06363930553197861, "rewards/accuracies": 0.953125, "rewards/chosen": 54.03118896484375, "rewards/margins": 41.27392578125, "rewards/rejected": 12.755199432373047, "step": 4002 }, { "epoch": 2.0719461697722568, "grad_norm": 1.222588062286377, "learning_rate": 2.396147837374585e-06, "loss": 0.0969790518283844, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.111244201660156, "rewards/margins": 41.408355712890625, "rewards/rejected": 12.697389602661133, "step": 4003 }, { "epoch": 2.072463768115942, "grad_norm": 0.5188474655151367, "learning_rate": 2.3937127660194513e-06, "loss": 0.022272951900959015, "rewards/accuracies": 0.9921875, "rewards/chosen": 55.248382568359375, "rewards/margins": 40.253387451171875, "rewards/rejected": 15.00346565246582, "step": 4004 }, { "epoch": 2.0729813664596275, "grad_norm": 0.7109526991844177, "learning_rate": 2.3912785431599207e-06, "loss": 0.0526227131485939, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.837440490722656, "rewards/margins": 40.7449951171875, "rewards/rejected": 16.087678909301758, "step": 4005 }, { "epoch": 2.0734989648033126, "grad_norm": 2.566074848175049, "learning_rate": 2.388845169588471e-06, "loss": 0.09776975214481354, "rewards/accuracies": 0.953125, "rewards/chosen": 58.45006561279297, "rewards/margins": 41.596405029296875, "rewards/rejected": 16.854488372802734, "step": 4006 }, { "epoch": 2.074016563146998, "grad_norm": 1.1367319822311401, "learning_rate": 2.3864126460973085e-06, "loss": 0.09625086188316345, "rewards/accuracies": 0.953125, "rewards/chosen": 49.111915588378906, "rewards/margins": 35.03131103515625, "rewards/rejected": 14.07107925415039, "step": 4007 }, { "epoch": 2.0745341614906834, "grad_norm": 0.7010244131088257, "learning_rate": 2.383980973478354e-06, "loss": 0.06047955900430679, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.81252670288086, "rewards/margins": 37.712005615234375, "rewards/rejected": 12.096277236938477, "step": 4008 }, { "epoch": 2.0750517598343685, "grad_norm": 0.7701948285102844, "learning_rate": 2.381550152523257e-06, "loss": 0.06301780045032501, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.587764739990234, "rewards/margins": 41.38581085205078, "rewards/rejected": 14.210098266601562, "step": 4009 }, { "epoch": 2.0755693581780537, "grad_norm": 2.0915863513946533, "learning_rate": 2.3791201840233935e-06, "loss": 0.06818562746047974, "rewards/accuracies": 0.96875, "rewards/chosen": 54.607120513916016, "rewards/margins": 39.1549072265625, "rewards/rejected": 15.462124824523926, "step": 4010 }, { "epoch": 2.0760869565217392, "grad_norm": 0.8481863737106323, "learning_rate": 2.376691068769852e-06, "loss": 0.06317681074142456, "rewards/accuracies": 0.984375, "rewards/chosen": 49.3382568359375, "rewards/margins": 33.44239807128906, "rewards/rejected": 15.9000244140625, "step": 4011 }, { "epoch": 2.0766045548654244, "grad_norm": 0.76646488904953, "learning_rate": 2.37426280755345e-06, "loss": 0.05082516372203827, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.7852783203125, "rewards/margins": 40.6214599609375, "rewards/rejected": 16.167190551757812, "step": 4012 }, { "epoch": 2.0771221532091095, "grad_norm": 1.3593000173568726, "learning_rate": 2.3718354011647253e-06, "loss": 0.08158305287361145, "rewards/accuracies": 0.96875, "rewards/chosen": 60.07538986206055, "rewards/margins": 44.013916015625, "rewards/rejected": 16.055702209472656, "step": 4013 }, { "epoch": 2.077639751552795, "grad_norm": 1.6173155307769775, "learning_rate": 2.369408850393938e-06, "loss": 0.10814942419528961, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.22999572753906, "rewards/margins": 36.30046081542969, "rewards/rejected": 12.945770263671875, "step": 4014 }, { "epoch": 2.0781573498964803, "grad_norm": 1.4949398040771484, "learning_rate": 2.366983156031069e-06, "loss": 0.09015452861785889, "rewards/accuracies": 0.953125, "rewards/chosen": 59.78042984008789, "rewards/margins": 44.089447021484375, "rewards/rejected": 15.676299095153809, "step": 4015 }, { "epoch": 2.078674948240166, "grad_norm": 1.176168441772461, "learning_rate": 2.3645583188658205e-06, "loss": 0.08318182826042175, "rewards/accuracies": 0.96875, "rewards/chosen": 52.594276428222656, "rewards/margins": 36.78050231933594, "rewards/rejected": 15.814538955688477, "step": 4016 }, { "epoch": 2.079192546583851, "grad_norm": 0.7618045210838318, "learning_rate": 2.3621343396876162e-06, "loss": 0.07207372784614563, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.30594253540039, "rewards/margins": 43.55464172363281, "rewards/rejected": 18.74142837524414, "step": 4017 }, { "epoch": 2.079710144927536, "grad_norm": 0.7322865128517151, "learning_rate": 2.3597112192856e-06, "loss": 0.055721890181303024, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.708099365234375, "rewards/margins": 40.486358642578125, "rewards/rejected": 15.21998405456543, "step": 4018 }, { "epoch": 2.0802277432712217, "grad_norm": 1.1876866817474365, "learning_rate": 2.357288958448635e-06, "loss": 0.0662793293595314, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.037567138671875, "rewards/margins": 37.27838134765625, "rewards/rejected": 14.768080711364746, "step": 4019 }, { "epoch": 2.080745341614907, "grad_norm": 0.42266836762428284, "learning_rate": 2.3548675579653067e-06, "loss": 0.029840193688869476, "rewards/accuracies": 0.9921875, "rewards/chosen": 51.9132080078125, "rewards/margins": 37.816986083984375, "rewards/rejected": 14.106292724609375, "step": 4020 }, { "epoch": 2.081262939958592, "grad_norm": 0.8577542901039124, "learning_rate": 2.35244701862392e-06, "loss": 0.05273286998271942, "rewards/accuracies": 0.96875, "rewards/chosen": 66.59721374511719, "rewards/margins": 45.82830810546875, "rewards/rejected": 20.76913833618164, "step": 4021 }, { "epoch": 2.0817805383022776, "grad_norm": 0.9998043775558472, "learning_rate": 2.350027341212498e-06, "loss": 0.071182981133461, "rewards/accuracies": 0.953125, "rewards/chosen": 60.45503234863281, "rewards/margins": 41.3292236328125, "rewards/rejected": 19.138141632080078, "step": 4022 }, { "epoch": 2.0822981366459627, "grad_norm": 1.5852514505386353, "learning_rate": 2.347608526518786e-06, "loss": 0.10444428771734238, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.23399353027344, "rewards/margins": 39.46376037597656, "rewards/rejected": 19.775848388671875, "step": 4023 }, { "epoch": 2.082815734989648, "grad_norm": 0.7737975716590881, "learning_rate": 2.345190575330241e-06, "loss": 0.03420911729335785, "rewards/accuracies": 0.9921875, "rewards/chosen": 53.56145477294922, "rewards/margins": 36.84136962890625, "rewards/rejected": 16.720577239990234, "step": 4024 }, { "epoch": 2.0833333333333335, "grad_norm": 3.170877456665039, "learning_rate": 2.3427734884340512e-06, "loss": 0.13089707493782043, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.79205322265625, "rewards/margins": 39.090972900390625, "rewards/rejected": 19.704280853271484, "step": 4025 }, { "epoch": 2.0838509316770186, "grad_norm": 2.305288553237915, "learning_rate": 2.3403572666171154e-06, "loss": 0.10384424030780792, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.489227294921875, "rewards/margins": 41.52911376953125, "rewards/rejected": 18.95706558227539, "step": 4026 }, { "epoch": 2.0843685300207038, "grad_norm": 1.6896330118179321, "learning_rate": 2.3379419106660486e-06, "loss": 0.10047075152397156, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.308135986328125, "rewards/margins": 35.32769775390625, "rewards/rejected": 13.973838806152344, "step": 4027 }, { "epoch": 2.0848861283643894, "grad_norm": 1.2042962312698364, "learning_rate": 2.3355274213671873e-06, "loss": 0.07713818550109863, "rewards/accuracies": 0.96875, "rewards/chosen": 58.522613525390625, "rewards/margins": 41.34276580810547, "rewards/rejected": 17.178630828857422, "step": 4028 }, { "epoch": 2.0854037267080745, "grad_norm": 0.9282635450363159, "learning_rate": 2.33311379950659e-06, "loss": 0.048113562166690826, "rewards/accuracies": 0.984375, "rewards/chosen": 59.823829650878906, "rewards/margins": 42.5906982421875, "rewards/rejected": 17.232643127441406, "step": 4029 }, { "epoch": 2.0859213250517596, "grad_norm": 4.1782941818237305, "learning_rate": 2.3307010458700285e-06, "loss": 0.12691426277160645, "rewards/accuracies": 0.953125, "rewards/chosen": 58.470428466796875, "rewards/margins": 38.47108459472656, "rewards/rejected": 19.99383544921875, "step": 4030 }, { "epoch": 2.0864389233954452, "grad_norm": 1.071466088294983, "learning_rate": 2.328289161242988e-06, "loss": 0.06132874637842178, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.12294006347656, "rewards/margins": 39.837066650390625, "rewards/rejected": 18.283592224121094, "step": 4031 }, { "epoch": 2.0869565217391304, "grad_norm": 6.019320011138916, "learning_rate": 2.3258781464106756e-06, "loss": 0.055901095271110535, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.06748962402344, "rewards/margins": 40.110107421875, "rewards/rejected": 19.950714111328125, "step": 4032 }, { "epoch": 2.0874741200828155, "grad_norm": 0.6502854228019714, "learning_rate": 2.3234680021580195e-06, "loss": 0.031757012009620667, "rewards/accuracies": 0.984375, "rewards/chosen": 61.85838317871094, "rewards/margins": 43.906280517578125, "rewards/rejected": 17.966232299804688, "step": 4033 }, { "epoch": 2.087991718426501, "grad_norm": 1.711639642715454, "learning_rate": 2.3210587292696544e-06, "loss": 0.1256466507911682, "rewards/accuracies": 0.9375, "rewards/chosen": 52.78177261352539, "rewards/margins": 37.372802734375, "rewards/rejected": 15.409379959106445, "step": 4034 }, { "epoch": 2.0885093167701863, "grad_norm": 1.6614750623703003, "learning_rate": 2.3186503285299387e-06, "loss": 0.06788591295480728, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.750030517578125, "rewards/margins": 36.662200927734375, "rewards/rejected": 14.097278594970703, "step": 4035 }, { "epoch": 2.089026915113872, "grad_norm": 0.9066343903541565, "learning_rate": 2.3162428007229437e-06, "loss": 0.06315869092941284, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.632164001464844, "rewards/margins": 45.92173767089844, "rewards/rejected": 14.713523864746094, "step": 4036 }, { "epoch": 2.089544513457557, "grad_norm": 2.1411871910095215, "learning_rate": 2.313836146632458e-06, "loss": 0.08023396134376526, "rewards/accuracies": 0.96875, "rewards/chosen": 55.680938720703125, "rewards/margins": 41.54847717285156, "rewards/rejected": 14.132095336914062, "step": 4037 }, { "epoch": 2.090062111801242, "grad_norm": 1.1682233810424805, "learning_rate": 2.311430367041986e-06, "loss": 0.082039475440979, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.90791320800781, "rewards/margins": 44.603782653808594, "rewards/rejected": 16.30758285522461, "step": 4038 }, { "epoch": 2.0905797101449277, "grad_norm": 1.3586177825927734, "learning_rate": 2.309025462734744e-06, "loss": 0.06365293264389038, "rewards/accuracies": 0.96875, "rewards/chosen": 58.955047607421875, "rewards/margins": 43.2681884765625, "rewards/rejected": 15.673782348632812, "step": 4039 }, { "epoch": 2.091097308488613, "grad_norm": 0.7690982818603516, "learning_rate": 2.3066214344936687e-06, "loss": 0.0644538551568985, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.56590270996094, "rewards/margins": 45.073944091796875, "rewards/rejected": 14.489046096801758, "step": 4040 }, { "epoch": 2.091614906832298, "grad_norm": 0.9037038683891296, "learning_rate": 2.3042182831014075e-06, "loss": 0.08082397282123566, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.77253723144531, "rewards/margins": 41.6614990234375, "rewards/rejected": 13.112442016601562, "step": 4041 }, { "epoch": 2.0921325051759836, "grad_norm": 0.9184926748275757, "learning_rate": 2.301816009340324e-06, "loss": 0.09076624363660812, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.28059005737305, "rewards/margins": 37.93327331542969, "rewards/rejected": 12.347602844238281, "step": 4042 }, { "epoch": 2.0926501035196687, "grad_norm": 0.8538848161697388, "learning_rate": 2.299414613992496e-06, "loss": 0.051550619304180145, "rewards/accuracies": 0.96875, "rewards/chosen": 63.846702575683594, "rewards/margins": 50.63739013671875, "rewards/rejected": 13.216346740722656, "step": 4043 }, { "epoch": 2.093167701863354, "grad_norm": 1.601312518119812, "learning_rate": 2.297014097839714e-06, "loss": 0.06562404334545135, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.78511047363281, "rewards/margins": 39.595924377441406, "rewards/rejected": 14.184333801269531, "step": 4044 }, { "epoch": 2.0936853002070395, "grad_norm": 0.5752555131912231, "learning_rate": 2.2946144616634846e-06, "loss": 0.037716180086135864, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.736488342285156, "rewards/margins": 43.0814208984375, "rewards/rejected": 14.66450309753418, "step": 4045 }, { "epoch": 2.0942028985507246, "grad_norm": 1.215718150138855, "learning_rate": 2.292215706245026e-06, "loss": 0.10632136464118958, "rewards/accuracies": 0.9375, "rewards/chosen": 51.17628479003906, "rewards/margins": 40.515472412109375, "rewards/rejected": 10.6632080078125, "step": 4046 }, { "epoch": 2.0947204968944098, "grad_norm": 1.3613098859786987, "learning_rate": 2.289817832365271e-06, "loss": 0.07556294649839401, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.56200408935547, "rewards/margins": 41.11712646484375, "rewards/rejected": 12.448383331298828, "step": 4047 }, { "epoch": 2.0952380952380953, "grad_norm": 2.664504051208496, "learning_rate": 2.2874208408048635e-06, "loss": 0.08915002644062042, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.0033073425293, "rewards/margins": 41.579864501953125, "rewards/rejected": 11.418075561523438, "step": 4048 }, { "epoch": 2.0957556935817805, "grad_norm": 1.5301628112792969, "learning_rate": 2.2850247323441644e-06, "loss": 0.10953755676746368, "rewards/accuracies": 0.9375, "rewards/chosen": 53.076080322265625, "rewards/margins": 40.356536865234375, "rewards/rejected": 12.722965240478516, "step": 4049 }, { "epoch": 2.0962732919254656, "grad_norm": 0.5644577741622925, "learning_rate": 2.2826295077632376e-06, "loss": 0.03568009287118912, "rewards/accuracies": 0.9921875, "rewards/chosen": 46.17629623413086, "rewards/margins": 36.36058044433594, "rewards/rejected": 9.814638137817383, "step": 4050 }, { "epoch": 2.096790890269151, "grad_norm": 0.9953485727310181, "learning_rate": 2.280235167841872e-06, "loss": 0.08169041574001312, "rewards/accuracies": 0.96875, "rewards/chosen": 53.887481689453125, "rewards/margins": 41.769287109375, "rewards/rejected": 12.129764556884766, "step": 4051 }, { "epoch": 2.0973084886128364, "grad_norm": 2.1201910972595215, "learning_rate": 2.277841713359562e-06, "loss": 0.05391208082437515, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.94055938720703, "rewards/margins": 45.62986755371094, "rewards/rejected": 12.30156135559082, "step": 4052 }, { "epoch": 2.097826086956522, "grad_norm": 1.0173594951629639, "learning_rate": 2.2754491450955107e-06, "loss": 0.08718761056661606, "rewards/accuracies": 0.9375, "rewards/chosen": 51.56782913208008, "rewards/margins": 41.6214599609375, "rewards/rejected": 9.948661804199219, "step": 4053 }, { "epoch": 2.098343685300207, "grad_norm": 1.4560054540634155, "learning_rate": 2.2730574638286352e-06, "loss": 0.10330091416835785, "rewards/accuracies": 0.953125, "rewards/chosen": 51.54798889160156, "rewards/margins": 39.517974853515625, "rewards/rejected": 12.019424438476562, "step": 4054 }, { "epoch": 2.0988612836438922, "grad_norm": 1.1669423580169678, "learning_rate": 2.2706666703375686e-06, "loss": 0.0742899477481842, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.634246826171875, "rewards/margins": 36.6546630859375, "rewards/rejected": 8.979484558105469, "step": 4055 }, { "epoch": 2.099378881987578, "grad_norm": 3.4829816818237305, "learning_rate": 2.2682767654006507e-06, "loss": 0.10280479490756989, "rewards/accuracies": 0.9375, "rewards/chosen": 56.111244201660156, "rewards/margins": 43.34645080566406, "rewards/rejected": 12.77438735961914, "step": 4056 }, { "epoch": 2.099896480331263, "grad_norm": 1.9042294025421143, "learning_rate": 2.2658877497959286e-06, "loss": 0.0765172615647316, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.37876510620117, "rewards/margins": 47.121337890625, "rewards/rejected": 16.25482940673828, "step": 4057 }, { "epoch": 2.100414078674948, "grad_norm": 1.0445619821548462, "learning_rate": 2.263499624301164e-06, "loss": 0.05458441376686096, "rewards/accuracies": 0.984375, "rewards/chosen": 60.804840087890625, "rewards/margins": 46.36688232421875, "rewards/rejected": 14.425821304321289, "step": 4058 }, { "epoch": 2.1009316770186337, "grad_norm": 0.8584612607955933, "learning_rate": 2.261112389693834e-06, "loss": 0.058413319289684296, "rewards/accuracies": 0.96875, "rewards/chosen": 54.81111145019531, "rewards/margins": 42.00836181640625, "rewards/rejected": 12.798564910888672, "step": 4059 }, { "epoch": 2.101449275362319, "grad_norm": 1.6647546291351318, "learning_rate": 2.2587260467511144e-06, "loss": 0.10372453182935715, "rewards/accuracies": 0.953125, "rewards/chosen": 53.171897888183594, "rewards/margins": 40.892578125, "rewards/rejected": 12.277318954467773, "step": 4060 }, { "epoch": 2.101966873706004, "grad_norm": 0.6845551133155823, "learning_rate": 2.2563405962498978e-06, "loss": 0.04871109873056412, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.96372985839844, "rewards/margins": 42.8876953125, "rewards/rejected": 10.069942474365234, "step": 4061 }, { "epoch": 2.1024844720496896, "grad_norm": 1.0829185247421265, "learning_rate": 2.253956038966785e-06, "loss": 0.06300318241119385, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.024200439453125, "rewards/margins": 46.80853271484375, "rewards/rejected": 11.229423522949219, "step": 4062 }, { "epoch": 2.1030020703933747, "grad_norm": 1.4183217287063599, "learning_rate": 2.251572375678086e-06, "loss": 0.08195526897907257, "rewards/accuracies": 0.96875, "rewards/chosen": 56.53049087524414, "rewards/margins": 46.072357177734375, "rewards/rejected": 10.460405349731445, "step": 4063 }, { "epoch": 2.10351966873706, "grad_norm": 0.8552923202514648, "learning_rate": 2.249189607159819e-06, "loss": 0.05852237716317177, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.186248779296875, "rewards/margins": 45.7958984375, "rewards/rejected": 12.395000457763672, "step": 4064 }, { "epoch": 2.1040372670807455, "grad_norm": 1.2200675010681152, "learning_rate": 2.246807734187713e-06, "loss": 0.05272959545254707, "rewards/accuracies": 0.984375, "rewards/chosen": 49.990966796875, "rewards/margins": 40.98115539550781, "rewards/rejected": 9.024406433105469, "step": 4065 }, { "epoch": 2.1045548654244306, "grad_norm": 1.0653659105300903, "learning_rate": 2.2444267575372015e-06, "loss": 0.08635153621435165, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.461029052734375, "rewards/margins": 42.33244323730469, "rewards/rejected": 9.13577938079834, "step": 4066 }, { "epoch": 2.1050724637681157, "grad_norm": 1.0523532629013062, "learning_rate": 2.2420466779834304e-06, "loss": 0.11716429889202118, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.47096252441406, "rewards/margins": 37.90745544433594, "rewards/rejected": 11.561186790466309, "step": 4067 }, { "epoch": 2.1055900621118013, "grad_norm": 1.7551124095916748, "learning_rate": 2.2396674963012517e-06, "loss": 0.09336075186729431, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.4114875793457, "rewards/margins": 38.47174072265625, "rewards/rejected": 9.951972961425781, "step": 4068 }, { "epoch": 2.1061076604554865, "grad_norm": 0.8144323229789734, "learning_rate": 2.237289213265224e-06, "loss": 0.07409397512674332, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.803375244140625, "rewards/margins": 41.56402587890625, "rewards/rejected": 12.244277954101562, "step": 4069 }, { "epoch": 2.1066252587991716, "grad_norm": 0.6587883234024048, "learning_rate": 2.234911829649616e-06, "loss": 0.05764596909284592, "rewards/accuracies": 0.984375, "rewards/chosen": 54.259613037109375, "rewards/margins": 41.853118896484375, "rewards/rejected": 12.401077270507812, "step": 4070 }, { "epoch": 2.107142857142857, "grad_norm": 0.9426724314689636, "learning_rate": 2.2325353462284006e-06, "loss": 0.0809832215309143, "rewards/accuracies": 0.96875, "rewards/chosen": 50.42630386352539, "rewards/margins": 39.56256103515625, "rewards/rejected": 10.882682800292969, "step": 4071 }, { "epoch": 2.1076604554865424, "grad_norm": 0.5373110175132751, "learning_rate": 2.23015976377526e-06, "loss": 0.043882254511117935, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.289703369140625, "rewards/margins": 42.922698974609375, "rewards/rejected": 12.374053955078125, "step": 4072 }, { "epoch": 2.108178053830228, "grad_norm": 1.1327158212661743, "learning_rate": 2.227785083063583e-06, "loss": 0.08581819385290146, "rewards/accuracies": 0.953125, "rewards/chosen": 52.29829406738281, "rewards/margins": 39.54290771484375, "rewards/rejected": 12.750823974609375, "step": 4073 }, { "epoch": 2.108695652173913, "grad_norm": 3.2429375648498535, "learning_rate": 2.225411304866462e-06, "loss": 0.18417692184448242, "rewards/accuracies": 0.9296875, "rewards/chosen": 50.0726318359375, "rewards/margins": 38.542144775390625, "rewards/rejected": 11.522979736328125, "step": 4074 }, { "epoch": 2.1092132505175982, "grad_norm": 1.0483043193817139, "learning_rate": 2.223038429956701e-06, "loss": 0.0903688594698906, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.21788787841797, "rewards/margins": 37.669921875, "rewards/rejected": 11.54486083984375, "step": 4075 }, { "epoch": 2.109730848861284, "grad_norm": 1.4423197507858276, "learning_rate": 2.2206664591068e-06, "loss": 0.13072164356708527, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.41801834106445, "rewards/margins": 40.897857666015625, "rewards/rejected": 11.511085510253906, "step": 4076 }, { "epoch": 2.110248447204969, "grad_norm": 0.8617547154426575, "learning_rate": 2.218295393088978e-06, "loss": 0.06288744509220123, "rewards/accuracies": 0.96875, "rewards/chosen": 55.19624328613281, "rewards/margins": 44.097412109375, "rewards/rejected": 11.10323715209961, "step": 4077 }, { "epoch": 2.110766045548654, "grad_norm": 1.129530668258667, "learning_rate": 2.2159252326751495e-06, "loss": 0.1198115199804306, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.358238220214844, "rewards/margins": 37.386322021484375, "rewards/rejected": 10.976919174194336, "step": 4078 }, { "epoch": 2.1112836438923397, "grad_norm": 1.4278838634490967, "learning_rate": 2.2135559786369405e-06, "loss": 0.1171209067106247, "rewards/accuracies": 0.953125, "rewards/chosen": 55.482749938964844, "rewards/margins": 41.967498779296875, "rewards/rejected": 13.507474899291992, "step": 4079 }, { "epoch": 2.111801242236025, "grad_norm": 0.8712340593338013, "learning_rate": 2.2111876317456717e-06, "loss": 0.10438293218612671, "rewards/accuracies": 0.953125, "rewards/chosen": 53.97937774658203, "rewards/margins": 43.123046875, "rewards/rejected": 10.845252990722656, "step": 4080 }, { "epoch": 2.11231884057971, "grad_norm": 0.7836917042732239, "learning_rate": 2.2088201927723817e-06, "loss": 0.0686120092868805, "rewards/accuracies": 0.96875, "rewards/chosen": 45.006195068359375, "rewards/margins": 37.040679931640625, "rewards/rejected": 7.962158203125, "step": 4081 }, { "epoch": 2.1128364389233956, "grad_norm": 0.6645591855049133, "learning_rate": 2.206453662487808e-06, "loss": 0.05817181617021561, "rewards/accuracies": 0.96875, "rewards/chosen": 52.62371826171875, "rewards/margins": 41.58305358886719, "rewards/rejected": 11.048149108886719, "step": 4082 }, { "epoch": 2.1133540372670807, "grad_norm": 0.8609780073165894, "learning_rate": 2.2040880416623865e-06, "loss": 0.07351469993591309, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.93000030517578, "rewards/margins": 39.56208038330078, "rewards/rejected": 13.369638442993164, "step": 4083 }, { "epoch": 2.113871635610766, "grad_norm": 1.2498579025268555, "learning_rate": 2.201723331066264e-06, "loss": 0.09122852236032486, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.41717529296875, "rewards/margins": 42.34968566894531, "rewards/rejected": 10.075538635253906, "step": 4084 }, { "epoch": 2.1143892339544514, "grad_norm": 0.5433022975921631, "learning_rate": 2.199359531469293e-06, "loss": 0.045881323516368866, "rewards/accuracies": 0.96875, "rewards/chosen": 55.70491409301758, "rewards/margins": 44.933197021484375, "rewards/rejected": 10.77359390258789, "step": 4085 }, { "epoch": 2.1149068322981366, "grad_norm": 1.1130132675170898, "learning_rate": 2.1969966436410202e-06, "loss": 0.08176283538341522, "rewards/accuracies": 0.953125, "rewards/chosen": 54.2862434387207, "rewards/margins": 41.548370361328125, "rewards/rejected": 12.743301391601562, "step": 4086 }, { "epoch": 2.1154244306418217, "grad_norm": 0.5922015905380249, "learning_rate": 2.1946346683507035e-06, "loss": 0.061298854649066925, "rewards/accuracies": 0.953125, "rewards/chosen": 53.2742919921875, "rewards/margins": 43.64874267578125, "rewards/rejected": 9.634628295898438, "step": 4087 }, { "epoch": 2.1159420289855073, "grad_norm": 3.142245292663574, "learning_rate": 2.1922736063672984e-06, "loss": 0.1268431842327118, "rewards/accuracies": 0.953125, "rewards/chosen": 53.394203186035156, "rewards/margins": 42.25177001953125, "rewards/rejected": 11.145044326782227, "step": 4088 }, { "epoch": 2.1164596273291925, "grad_norm": 3.1469085216522217, "learning_rate": 2.1899134584594712e-06, "loss": 0.14170503616333008, "rewards/accuracies": 0.9140625, "rewards/chosen": 49.261932373046875, "rewards/margins": 38.84498596191406, "rewards/rejected": 10.413070678710938, "step": 4089 }, { "epoch": 2.116977225672878, "grad_norm": 2.3415794372558594, "learning_rate": 2.18755422539558e-06, "loss": 0.10018975287675858, "rewards/accuracies": 0.953125, "rewards/chosen": 51.36933135986328, "rewards/margins": 41.066009521484375, "rewards/rejected": 10.299758911132812, "step": 4090 }, { "epoch": 2.117494824016563, "grad_norm": 1.4239416122436523, "learning_rate": 2.185195907943691e-06, "loss": 0.1429271101951599, "rewards/accuracies": 0.9375, "rewards/chosen": 44.85064697265625, "rewards/margins": 34.849510192871094, "rewards/rejected": 9.99847412109375, "step": 4091 }, { "epoch": 2.1180124223602483, "grad_norm": 0.8410381078720093, "learning_rate": 2.182838506871573e-06, "loss": 0.08428685367107391, "rewards/accuracies": 0.953125, "rewards/chosen": 50.19775390625, "rewards/margins": 40.07135009765625, "rewards/rejected": 10.111544609069824, "step": 4092 }, { "epoch": 2.118530020703934, "grad_norm": 0.6120830178260803, "learning_rate": 2.180482022946694e-06, "loss": 0.0633908212184906, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.379852294921875, "rewards/margins": 46.0906982421875, "rewards/rejected": 13.288259506225586, "step": 4093 }, { "epoch": 2.119047619047619, "grad_norm": 0.7533301711082458, "learning_rate": 2.1781264569362244e-06, "loss": 0.07571873813867569, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.142913818359375, "rewards/margins": 42.2012939453125, "rewards/rejected": 13.93674087524414, "step": 4094 }, { "epoch": 2.119565217391304, "grad_norm": 4.368602752685547, "learning_rate": 2.175771809607035e-06, "loss": 0.10635550320148468, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.0001220703125, "rewards/margins": 40.89605712890625, "rewards/rejected": 11.102542877197266, "step": 4095 }, { "epoch": 2.12008281573499, "grad_norm": 0.4230055809020996, "learning_rate": 2.1734180817256994e-06, "loss": 0.02332688868045807, "rewards/accuracies": 1.0, "rewards/chosen": 52.6641845703125, "rewards/margins": 40.88116455078125, "rewards/rejected": 11.773277282714844, "step": 4096 }, { "epoch": 2.120600414078675, "grad_norm": 0.923204779624939, "learning_rate": 2.1710652740584905e-06, "loss": 0.1057736724615097, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.564170837402344, "rewards/margins": 41.84912109375, "rewards/rejected": 10.701934814453125, "step": 4097 }, { "epoch": 2.12111801242236, "grad_norm": 1.6059391498565674, "learning_rate": 2.1687133873713812e-06, "loss": 0.07146304100751877, "rewards/accuracies": 0.96875, "rewards/chosen": 51.34947967529297, "rewards/margins": 39.613525390625, "rewards/rejected": 11.739688873291016, "step": 4098 }, { "epoch": 2.1216356107660457, "grad_norm": 0.9011387228965759, "learning_rate": 2.1663624224300454e-06, "loss": 0.0717427060008049, "rewards/accuracies": 0.96875, "rewards/chosen": 59.17767333984375, "rewards/margins": 46.86090087890625, "rewards/rejected": 12.32042121887207, "step": 4099 }, { "epoch": 2.122153209109731, "grad_norm": 0.8976560831069946, "learning_rate": 2.164012379999857e-06, "loss": 0.05384156107902527, "rewards/accuracies": 0.984375, "rewards/chosen": 54.74628448486328, "rewards/margins": 42.30218505859375, "rewards/rejected": 12.433128356933594, "step": 4100 }, { "epoch": 2.122670807453416, "grad_norm": 0.7511704564094543, "learning_rate": 2.1616632608458887e-06, "loss": 0.05339078605175018, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.23314666748047, "rewards/margins": 42.227691650390625, "rewards/rejected": 13.009014129638672, "step": 4101 }, { "epoch": 2.1231884057971016, "grad_norm": 1.001694679260254, "learning_rate": 2.1593150657329136e-06, "loss": 0.0955730676651001, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.13127899169922, "rewards/margins": 41.382781982421875, "rewards/rejected": 11.750916481018066, "step": 4102 }, { "epoch": 2.1237060041407867, "grad_norm": 1.7527117729187012, "learning_rate": 2.1569677954254037e-06, "loss": 0.10028166323900223, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.113807678222656, "rewards/margins": 41.019317626953125, "rewards/rejected": 8.096839904785156, "step": 4103 }, { "epoch": 2.124223602484472, "grad_norm": 0.6953075528144836, "learning_rate": 2.1546214506875296e-06, "loss": 0.0727476254105568, "rewards/accuracies": 0.96875, "rewards/chosen": 56.47581481933594, "rewards/margins": 44.46124267578125, "rewards/rejected": 12.011428833007812, "step": 4104 }, { "epoch": 2.1247412008281574, "grad_norm": 0.9992334842681885, "learning_rate": 2.1522760322831623e-06, "loss": 0.07179930061101913, "rewards/accuracies": 0.96875, "rewards/chosen": 47.6546630859375, "rewards/margins": 38.647125244140625, "rewards/rejected": 9.026123046875, "step": 4105 }, { "epoch": 2.1252587991718426, "grad_norm": 0.7771419882774353, "learning_rate": 2.149931540975864e-06, "loss": 0.08883985877037048, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.96803283691406, "rewards/margins": 43.673431396484375, "rewards/rejected": 11.292793273925781, "step": 4106 }, { "epoch": 2.125776397515528, "grad_norm": 1.0122014284133911, "learning_rate": 2.147587977528907e-06, "loss": 0.09914206713438034, "rewards/accuracies": 0.953125, "rewards/chosen": 49.413230895996094, "rewards/margins": 39.184600830078125, "rewards/rejected": 10.239387512207031, "step": 4107 }, { "epoch": 2.1262939958592133, "grad_norm": 1.1942932605743408, "learning_rate": 2.145245342705255e-06, "loss": 0.05283292382955551, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.39680480957031, "rewards/margins": 45.94586181640625, "rewards/rejected": 11.446998596191406, "step": 4108 }, { "epoch": 2.1268115942028984, "grad_norm": 0.7967694997787476, "learning_rate": 2.142903637267566e-06, "loss": 0.06921282410621643, "rewards/accuracies": 0.953125, "rewards/chosen": 49.68621826171875, "rewards/margins": 40.057861328125, "rewards/rejected": 9.630271911621094, "step": 4109 }, { "epoch": 2.127329192546584, "grad_norm": 0.8142881989479065, "learning_rate": 2.1405628619781987e-06, "loss": 0.050749506801366806, "rewards/accuracies": 0.96875, "rewards/chosen": 55.37387466430664, "rewards/margins": 43.74627685546875, "rewards/rejected": 11.632911682128906, "step": 4110 }, { "epoch": 2.127846790890269, "grad_norm": 1.651679515838623, "learning_rate": 2.1382230175992136e-06, "loss": 0.05857127532362938, "rewards/accuracies": 0.96875, "rewards/chosen": 50.42406463623047, "rewards/margins": 41.901397705078125, "rewards/rejected": 8.519744873046875, "step": 4111 }, { "epoch": 2.1283643892339543, "grad_norm": 1.153259038925171, "learning_rate": 2.135884104892364e-06, "loss": 0.1222514659166336, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.4849853515625, "rewards/margins": 38.089019775390625, "rewards/rejected": 11.397834777832031, "step": 4112 }, { "epoch": 2.12888198757764, "grad_norm": 1.0968767404556274, "learning_rate": 2.133546124619095e-06, "loss": 0.10503534972667694, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.22792053222656, "rewards/margins": 39.9644775390625, "rewards/rejected": 10.272378921508789, "step": 4113 }, { "epoch": 2.129399585921325, "grad_norm": 0.9413313269615173, "learning_rate": 2.1312090775405542e-06, "loss": 0.07064048945903778, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.271263122558594, "rewards/margins": 42.062591552734375, "rewards/rejected": 13.207077026367188, "step": 4114 }, { "epoch": 2.12991718426501, "grad_norm": 0.701288104057312, "learning_rate": 2.128872964417589e-06, "loss": 0.07556921243667603, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.36222839355469, "rewards/margins": 42.753173828125, "rewards/rejected": 11.611326217651367, "step": 4115 }, { "epoch": 2.130434782608696, "grad_norm": 0.7907811403274536, "learning_rate": 2.1265377860107327e-06, "loss": 0.0681469514966011, "rewards/accuracies": 0.96875, "rewards/chosen": 52.9954833984375, "rewards/margins": 42.768890380859375, "rewards/rejected": 10.232732772827148, "step": 4116 }, { "epoch": 2.130952380952381, "grad_norm": 1.1154097318649292, "learning_rate": 2.124203543080221e-06, "loss": 0.10830839723348618, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.36246109008789, "rewards/margins": 40.96307373046875, "rewards/rejected": 11.392242431640625, "step": 4117 }, { "epoch": 2.131469979296066, "grad_norm": 2.0310142040252686, "learning_rate": 2.121870236385984e-06, "loss": 0.12642008066177368, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.284217834472656, "rewards/margins": 39.22882080078125, "rewards/rejected": 10.062118530273438, "step": 4118 }, { "epoch": 2.1319875776397517, "grad_norm": 0.8773250579833984, "learning_rate": 2.119537866687646e-06, "loss": 0.09832149744033813, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.073692321777344, "rewards/margins": 40.28813934326172, "rewards/rejected": 9.779145240783691, "step": 4119 }, { "epoch": 2.132505175983437, "grad_norm": 0.5738805532455444, "learning_rate": 2.1172064347445264e-06, "loss": 0.052819836884737015, "rewards/accuracies": 0.96875, "rewards/chosen": 49.847381591796875, "rewards/margins": 40.82942199707031, "rewards/rejected": 9.016266822814941, "step": 4120 }, { "epoch": 2.133022774327122, "grad_norm": 1.015618920326233, "learning_rate": 2.1148759413156406e-06, "loss": 0.10126754641532898, "rewards/accuracies": 0.953125, "rewards/chosen": 54.142303466796875, "rewards/margins": 43.564300537109375, "rewards/rejected": 10.580585479736328, "step": 4121 }, { "epoch": 2.1335403726708075, "grad_norm": 1.4575505256652832, "learning_rate": 2.1125463871596974e-06, "loss": 0.07838109135627747, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.34290313720703, "rewards/margins": 42.75152587890625, "rewards/rejected": 11.58975601196289, "step": 4122 }, { "epoch": 2.1340579710144927, "grad_norm": 0.6589928269386292, "learning_rate": 2.1102177730351e-06, "loss": 0.03500992804765701, "rewards/accuracies": 0.9921875, "rewards/chosen": 51.04205322265625, "rewards/margins": 42.400604248046875, "rewards/rejected": 8.636033058166504, "step": 4123 }, { "epoch": 2.1345755693581783, "grad_norm": 0.7182097434997559, "learning_rate": 2.1078900996999448e-06, "loss": 0.07810264825820923, "rewards/accuracies": 0.953125, "rewards/chosen": 49.553131103515625, "rewards/margins": 40.28884506225586, "rewards/rejected": 9.266864776611328, "step": 4124 }, { "epoch": 2.1350931677018634, "grad_norm": 1.4326386451721191, "learning_rate": 2.1055633679120224e-06, "loss": 0.07222975045442581, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.650634765625, "rewards/margins": 40.08903503417969, "rewards/rejected": 9.553657531738281, "step": 4125 }, { "epoch": 2.1356107660455486, "grad_norm": 0.7688694000244141, "learning_rate": 2.103237578428819e-06, "loss": 0.0666951984167099, "rewards/accuracies": 0.96875, "rewards/chosen": 57.56226348876953, "rewards/margins": 45.17628479003906, "rewards/rejected": 12.387687683105469, "step": 4126 }, { "epoch": 2.136128364389234, "grad_norm": 2.814493179321289, "learning_rate": 2.1009127320075103e-06, "loss": 0.05130431428551674, "rewards/accuracies": 0.984375, "rewards/chosen": 50.94000244140625, "rewards/margins": 39.77171325683594, "rewards/rejected": 11.173133850097656, "step": 4127 }, { "epoch": 2.1366459627329193, "grad_norm": 1.072901725769043, "learning_rate": 2.098588829404967e-06, "loss": 0.0588759183883667, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.89259338378906, "rewards/margins": 40.44212341308594, "rewards/rejected": 10.461654663085938, "step": 4128 }, { "epoch": 2.1371635610766044, "grad_norm": 1.4404217004776, "learning_rate": 2.096265871377754e-06, "loss": 0.1101016104221344, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.607574462890625, "rewards/margins": 40.165679931640625, "rewards/rejected": 11.454110145568848, "step": 4129 }, { "epoch": 2.13768115942029, "grad_norm": 0.9248368144035339, "learning_rate": 2.093943858682125e-06, "loss": 0.07677901536226273, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.552276611328125, "rewards/margins": 40.19677734375, "rewards/rejected": 9.349418640136719, "step": 4130 }, { "epoch": 2.138198757763975, "grad_norm": 0.8565154075622559, "learning_rate": 2.0916227920740308e-06, "loss": 0.09043185412883759, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.5081787109375, "rewards/margins": 40.37409973144531, "rewards/rejected": 8.132078170776367, "step": 4131 }, { "epoch": 2.1387163561076603, "grad_norm": 0.7292227745056152, "learning_rate": 2.089302672309106e-06, "loss": 0.07203125208616257, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.129180908203125, "rewards/margins": 44.0982666015625, "rewards/rejected": 12.03272819519043, "step": 4132 }, { "epoch": 2.139233954451346, "grad_norm": 0.4163387715816498, "learning_rate": 2.0869835001426873e-06, "loss": 0.024274365976452827, "rewards/accuracies": 0.984375, "rewards/chosen": 56.50225830078125, "rewards/margins": 43.82098388671875, "rewards/rejected": 12.674530982971191, "step": 4133 }, { "epoch": 2.139751552795031, "grad_norm": 0.5776531100273132, "learning_rate": 2.0846652763297965e-06, "loss": 0.05261075496673584, "rewards/accuracies": 0.9921875, "rewards/chosen": 50.39984130859375, "rewards/margins": 37.728759765625, "rewards/rejected": 12.669588088989258, "step": 4134 }, { "epoch": 2.140269151138716, "grad_norm": 1.3150296211242676, "learning_rate": 2.0823480016251496e-06, "loss": 0.046638377010822296, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.417877197265625, "rewards/margins": 44.292205810546875, "rewards/rejected": 12.125694274902344, "step": 4135 }, { "epoch": 2.1407867494824018, "grad_norm": 0.8028373122215271, "learning_rate": 2.0800316767831465e-06, "loss": 0.06800264865159988, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.841278076171875, "rewards/margins": 43.5811767578125, "rewards/rejected": 14.250423431396484, "step": 4136 }, { "epoch": 2.141304347826087, "grad_norm": 2.3947646617889404, "learning_rate": 2.0777163025578894e-06, "loss": 0.09962992370128632, "rewards/accuracies": 0.953125, "rewards/chosen": 51.842342376708984, "rewards/margins": 42.1539306640625, "rewards/rejected": 9.677848815917969, "step": 4137 }, { "epoch": 2.141821946169772, "grad_norm": 3.9854273796081543, "learning_rate": 2.075401879703165e-06, "loss": 0.11571578681468964, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.96307373046875, "rewards/margins": 42.83428955078125, "rewards/rejected": 14.136360168457031, "step": 4138 }, { "epoch": 2.1423395445134576, "grad_norm": 1.7295536994934082, "learning_rate": 2.0730884089724463e-06, "loss": 0.11749202013015747, "rewards/accuracies": 0.953125, "rewards/chosen": 53.72189712524414, "rewards/margins": 40.480987548828125, "rewards/rejected": 13.240196228027344, "step": 4139 }, { "epoch": 2.142857142857143, "grad_norm": 1.1196013689041138, "learning_rate": 2.0707758911189006e-06, "loss": 0.07836121320724487, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.71873474121094, "rewards/margins": 41.7679443359375, "rewards/rejected": 11.939666748046875, "step": 4140 }, { "epoch": 2.1433747412008284, "grad_norm": 1.560855746269226, "learning_rate": 2.0684643268953906e-06, "loss": 0.096308633685112, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.03636932373047, "rewards/margins": 48.33319091796875, "rewards/rejected": 12.696415901184082, "step": 4141 }, { "epoch": 2.1438923395445135, "grad_norm": 1.0324171781539917, "learning_rate": 2.066153717054457e-06, "loss": 0.07767529040575027, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.92369079589844, "rewards/margins": 40.495635986328125, "rewards/rejected": 15.423873901367188, "step": 4142 }, { "epoch": 2.1444099378881987, "grad_norm": 0.967266857624054, "learning_rate": 2.063844062348337e-06, "loss": 0.09655874967575073, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.200645446777344, "rewards/margins": 41.169677734375, "rewards/rejected": 13.02328872680664, "step": 4143 }, { "epoch": 2.1449275362318843, "grad_norm": 1.5453839302062988, "learning_rate": 2.0615353635289553e-06, "loss": 0.07578206062316895, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.31425476074219, "rewards/margins": 39.680419921875, "rewards/rejected": 15.648311614990234, "step": 4144 }, { "epoch": 2.1454451345755694, "grad_norm": 1.2670520544052124, "learning_rate": 2.0592276213479262e-06, "loss": 0.09373173117637634, "rewards/accuracies": 0.953125, "rewards/chosen": 52.367279052734375, "rewards/margins": 40.712249755859375, "rewards/rejected": 11.64306640625, "step": 4145 }, { "epoch": 2.1459627329192545, "grad_norm": 0.7620844841003418, "learning_rate": 2.0569208365565506e-06, "loss": 0.05864046514034271, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.98432922363281, "rewards/margins": 45.34466552734375, "rewards/rejected": 13.665664672851562, "step": 4146 }, { "epoch": 2.14648033126294, "grad_norm": 0.9378508925437927, "learning_rate": 2.0546150099058203e-06, "loss": 0.05949748679995537, "rewards/accuracies": 0.96875, "rewards/chosen": 53.19960021972656, "rewards/margins": 40.685089111328125, "rewards/rejected": 12.506683349609375, "step": 4147 }, { "epoch": 2.1469979296066253, "grad_norm": 1.6861546039581299, "learning_rate": 2.052310142146413e-06, "loss": 0.07070805132389069, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.256256103515625, "rewards/margins": 44.109466552734375, "rewards/rejected": 15.156440734863281, "step": 4148 }, { "epoch": 2.1475155279503104, "grad_norm": 1.9837497472763062, "learning_rate": 2.050006234028695e-06, "loss": 0.072959765791893, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.682037353515625, "rewards/margins": 46.318641662597656, "rewards/rejected": 11.356454849243164, "step": 4149 }, { "epoch": 2.148033126293996, "grad_norm": 1.32504141330719, "learning_rate": 2.047703286302721e-06, "loss": 0.11801940947771072, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.89591979980469, "rewards/margins": 44.53564453125, "rewards/rejected": 12.35361099243164, "step": 4150 }, { "epoch": 2.148550724637681, "grad_norm": 2.4217238426208496, "learning_rate": 2.045401299718231e-06, "loss": 0.0638166218996048, "rewards/accuracies": 0.96875, "rewards/chosen": 55.12884521484375, "rewards/margins": 42.980560302734375, "rewards/rejected": 12.136734008789062, "step": 4151 }, { "epoch": 2.1490683229813663, "grad_norm": 0.8912543654441833, "learning_rate": 2.0431002750246544e-06, "loss": 0.05466146767139435, "rewards/accuracies": 0.9921875, "rewards/chosen": 53.543312072753906, "rewards/margins": 42.440826416015625, "rewards/rejected": 11.084924697875977, "step": 4152 }, { "epoch": 2.149585921325052, "grad_norm": 1.3879557847976685, "learning_rate": 2.040800212971105e-06, "loss": 0.07976509630680084, "rewards/accuracies": 0.953125, "rewards/chosen": 53.876705169677734, "rewards/margins": 42.35968017578125, "rewards/rejected": 11.516075134277344, "step": 4153 }, { "epoch": 2.150103519668737, "grad_norm": 4.0715413093566895, "learning_rate": 2.0385011143063867e-06, "loss": 0.1216023787856102, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.690895080566406, "rewards/margins": 41.784454345703125, "rewards/rejected": 9.90890884399414, "step": 4154 }, { "epoch": 2.150621118012422, "grad_norm": 0.9694628119468689, "learning_rate": 2.0362029797789857e-06, "loss": 0.08996506035327911, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.862491607666016, "rewards/margins": 38.10601806640625, "rewards/rejected": 10.767890930175781, "step": 4155 }, { "epoch": 2.1511387163561078, "grad_norm": 1.7186839580535889, "learning_rate": 2.033905810137077e-06, "loss": 0.11025600135326385, "rewards/accuracies": 0.953125, "rewards/chosen": 46.55353927612305, "rewards/margins": 36.77748107910156, "rewards/rejected": 9.771116256713867, "step": 4156 }, { "epoch": 2.151656314699793, "grad_norm": 0.5911874771118164, "learning_rate": 2.031609606128523e-06, "loss": 0.047854937613010406, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.966835021972656, "rewards/margins": 44.7056884765625, "rewards/rejected": 10.261817932128906, "step": 4157 }, { "epoch": 2.1521739130434785, "grad_norm": 1.1149320602416992, "learning_rate": 2.029314368500863e-06, "loss": 0.05541699007153511, "rewards/accuracies": 0.9921875, "rewards/chosen": 44.670127868652344, "rewards/margins": 35.416412353515625, "rewards/rejected": 9.248489379882812, "step": 4158 }, { "epoch": 2.1526915113871636, "grad_norm": 0.9561939835548401, "learning_rate": 2.027020098001334e-06, "loss": 0.0877484679222107, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.39414978027344, "rewards/margins": 42.9537353515625, "rewards/rejected": 10.434381484985352, "step": 4159 }, { "epoch": 2.153209109730849, "grad_norm": 2.2343909740448, "learning_rate": 2.024726795376852e-06, "loss": 0.07189935445785522, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.474891662597656, "rewards/margins": 44.17437744140625, "rewards/rejected": 12.29765796661377, "step": 4160 }, { "epoch": 2.1537267080745344, "grad_norm": 0.7990787625312805, "learning_rate": 2.022434461374018e-06, "loss": 0.05446810647845268, "rewards/accuracies": 0.96875, "rewards/chosen": 57.106231689453125, "rewards/margins": 46.59306335449219, "rewards/rejected": 10.500738143920898, "step": 4161 }, { "epoch": 2.1542443064182195, "grad_norm": 0.9941034913063049, "learning_rate": 2.020143096739113e-06, "loss": 0.09375990927219391, "rewards/accuracies": 0.96875, "rewards/chosen": 58.35747528076172, "rewards/margins": 45.1124267578125, "rewards/rejected": 13.25834846496582, "step": 4162 }, { "epoch": 2.1547619047619047, "grad_norm": 1.8510936498641968, "learning_rate": 2.0178527022181126e-06, "loss": 0.13198621571063995, "rewards/accuracies": 0.953125, "rewards/chosen": 49.67524719238281, "rewards/margins": 41.8492431640625, "rewards/rejected": 7.832691192626953, "step": 4163 }, { "epoch": 2.1552795031055902, "grad_norm": 0.8403435349464417, "learning_rate": 2.015563278556672e-06, "loss": 0.09883804619312286, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.33473205566406, "rewards/margins": 43.94219970703125, "rewards/rejected": 12.414020538330078, "step": 4164 }, { "epoch": 2.1557971014492754, "grad_norm": 1.3749074935913086, "learning_rate": 2.0132748265001257e-06, "loss": 0.15940120816230774, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.64530944824219, "rewards/margins": 41.95411682128906, "rewards/rejected": 10.70086669921875, "step": 4165 }, { "epoch": 2.1563146997929605, "grad_norm": 1.2344214916229248, "learning_rate": 2.0109873467934953e-06, "loss": 0.06432110071182251, "rewards/accuracies": 0.96875, "rewards/chosen": 56.630733489990234, "rewards/margins": 46.3612060546875, "rewards/rejected": 10.275531768798828, "step": 4166 }, { "epoch": 2.156832298136646, "grad_norm": 0.5511334538459778, "learning_rate": 2.0087008401814916e-06, "loss": 0.07351825386285782, "rewards/accuracies": 0.953125, "rewards/chosen": 51.40812683105469, "rewards/margins": 40.82945251464844, "rewards/rejected": 10.590160369873047, "step": 4167 }, { "epoch": 2.1573498964803313, "grad_norm": 2.741764545440674, "learning_rate": 2.006415307408499e-06, "loss": 0.12456884980201721, "rewards/accuracies": 0.953125, "rewards/chosen": 53.56703186035156, "rewards/margins": 44.99394226074219, "rewards/rejected": 8.575729370117188, "step": 4168 }, { "epoch": 2.1578674948240164, "grad_norm": 1.0100420713424683, "learning_rate": 2.0041307492185896e-06, "loss": 0.08275867253541946, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.41382598876953, "rewards/margins": 41.167144775390625, "rewards/rejected": 10.249824523925781, "step": 4169 }, { "epoch": 2.158385093167702, "grad_norm": 0.7937766909599304, "learning_rate": 2.0018471663555173e-06, "loss": 0.06040192022919655, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.43382263183594, "rewards/margins": 44.36859130859375, "rewards/rejected": 9.082420349121094, "step": 4170 }, { "epoch": 2.158902691511387, "grad_norm": 3.2130539417266846, "learning_rate": 1.999564559562723e-06, "loss": 0.13015106320381165, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.242549896240234, "rewards/margins": 46.024749755859375, "rewards/rejected": 8.203804016113281, "step": 4171 }, { "epoch": 2.1594202898550723, "grad_norm": 1.9119937419891357, "learning_rate": 1.9972829295833217e-06, "loss": 0.09910772740840912, "rewards/accuracies": 0.953125, "rewards/chosen": 48.753639221191406, "rewards/margins": 40.162139892578125, "rewards/rejected": 8.58469295501709, "step": 4172 }, { "epoch": 2.159937888198758, "grad_norm": 0.8076615929603577, "learning_rate": 1.9950022771601155e-06, "loss": 0.05495251342654228, "rewards/accuracies": 0.984375, "rewards/chosen": 53.5235481262207, "rewards/margins": 45.59718322753906, "rewards/rejected": 7.9277191162109375, "step": 4173 }, { "epoch": 2.160455486542443, "grad_norm": 0.9301470518112183, "learning_rate": 1.992722603035587e-06, "loss": 0.08240363001823425, "rewards/accuracies": 0.96875, "rewards/chosen": 54.885650634765625, "rewards/margins": 45.13140869140625, "rewards/rejected": 9.751335144042969, "step": 4174 }, { "epoch": 2.160973084886128, "grad_norm": 1.1378583908081055, "learning_rate": 1.990443907951902e-06, "loss": 0.10173305124044418, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.31373977661133, "rewards/margins": 45.95086669921875, "rewards/rejected": 8.361162185668945, "step": 4175 }, { "epoch": 2.1614906832298137, "grad_norm": 0.9036394357681274, "learning_rate": 1.988166192650905e-06, "loss": 0.07453852891921997, "rewards/accuracies": 0.96875, "rewards/chosen": 45.23883819580078, "rewards/margins": 37.806365966796875, "rewards/rejected": 7.420879364013672, "step": 4176 }, { "epoch": 2.162008281573499, "grad_norm": 0.729743242263794, "learning_rate": 1.9858894578741224e-06, "loss": 0.05867677181959152, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.31707763671875, "rewards/margins": 44.082611083984375, "rewards/rejected": 8.228148460388184, "step": 4177 }, { "epoch": 2.162525879917184, "grad_norm": 1.3281430006027222, "learning_rate": 1.983613704362762e-06, "loss": 0.09192050993442535, "rewards/accuracies": 0.953125, "rewards/chosen": 50.61577606201172, "rewards/margins": 44.429664611816406, "rewards/rejected": 6.178963661193848, "step": 4178 }, { "epoch": 2.1630434782608696, "grad_norm": 0.6631596684455872, "learning_rate": 1.9813389328577117e-06, "loss": 0.05866381153464317, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.18518829345703, "rewards/margins": 48.22113037109375, "rewards/rejected": 7.980449676513672, "step": 4179 }, { "epoch": 2.1635610766045548, "grad_norm": 1.4156146049499512, "learning_rate": 1.979065144099541e-06, "loss": 0.08908209204673767, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.19813537597656, "rewards/margins": 48.28094482421875, "rewards/rejected": 8.908638000488281, "step": 4180 }, { "epoch": 2.1640786749482404, "grad_norm": 1.488569736480713, "learning_rate": 1.976792338828495e-06, "loss": 0.0812772661447525, "rewards/accuracies": 0.96875, "rewards/chosen": 55.81150436401367, "rewards/margins": 45.702301025390625, "rewards/rejected": 10.110992431640625, "step": 4181 }, { "epoch": 2.1645962732919255, "grad_norm": 2.3372740745544434, "learning_rate": 1.9745205177845047e-06, "loss": 0.0943659096956253, "rewards/accuracies": 0.953125, "rewards/chosen": 55.044761657714844, "rewards/margins": 45.24229431152344, "rewards/rejected": 9.803634643554688, "step": 4182 }, { "epoch": 2.1651138716356106, "grad_norm": 1.48590087890625, "learning_rate": 1.972249681707177e-06, "loss": 0.14391815662384033, "rewards/accuracies": 0.9296875, "rewards/chosen": 41.95854949951172, "rewards/margins": 35.386871337890625, "rewards/rejected": 6.558935165405273, "step": 4183 }, { "epoch": 2.1656314699792962, "grad_norm": 0.7063678503036499, "learning_rate": 1.9699798313357994e-06, "loss": 0.05920197069644928, "rewards/accuracies": 0.96875, "rewards/chosen": 51.99604034423828, "rewards/margins": 41.84880065917969, "rewards/rejected": 10.153680801391602, "step": 4184 }, { "epoch": 2.1661490683229814, "grad_norm": 1.1413406133651733, "learning_rate": 1.967710967409337e-06, "loss": 0.08438047766685486, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.218902587890625, "rewards/margins": 49.8236083984375, "rewards/rejected": 12.410089492797852, "step": 4185 }, { "epoch": 2.1666666666666665, "grad_norm": 3.0783519744873047, "learning_rate": 1.9654430906664363e-06, "loss": 0.12428805232048035, "rewards/accuracies": 0.9375, "rewards/chosen": 45.56475830078125, "rewards/margins": 39.087005615234375, "rewards/rejected": 6.475881576538086, "step": 4186 }, { "epoch": 2.167184265010352, "grad_norm": 0.9477304220199585, "learning_rate": 1.9631762018454215e-06, "loss": 0.07274868339300156, "rewards/accuracies": 0.96875, "rewards/chosen": 51.25254821777344, "rewards/margins": 45.16975402832031, "rewards/rejected": 6.0684123039245605, "step": 4187 }, { "epoch": 2.1677018633540373, "grad_norm": 0.9951829314231873, "learning_rate": 1.9609103016842897e-06, "loss": 0.05327742174267769, "rewards/accuracies": 0.984375, "rewards/chosen": 47.161399841308594, "rewards/margins": 39.69744873046875, "rewards/rejected": 7.4653778076171875, "step": 4188 }, { "epoch": 2.1682194616977224, "grad_norm": 0.6810786128044128, "learning_rate": 1.958645390920726e-06, "loss": 0.05660471320152283, "rewards/accuracies": 0.984375, "rewards/chosen": 47.281288146972656, "rewards/margins": 40.173248291015625, "rewards/rejected": 7.092733383178711, "step": 4189 }, { "epoch": 2.168737060041408, "grad_norm": 1.2780542373657227, "learning_rate": 1.9563814702920887e-06, "loss": 0.09824018180370331, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.68830871582031, "rewards/margins": 45.66755676269531, "rewards/rejected": 8.01462459564209, "step": 4190 }, { "epoch": 2.169254658385093, "grad_norm": 2.2252674102783203, "learning_rate": 1.9541185405354103e-06, "loss": 0.08727389574050903, "rewards/accuracies": 0.953125, "rewards/chosen": 51.722198486328125, "rewards/margins": 44.160980224609375, "rewards/rejected": 7.56360387802124, "step": 4191 }, { "epoch": 2.1697722567287783, "grad_norm": 0.8907583951950073, "learning_rate": 1.951856602387403e-06, "loss": 0.07758364081382751, "rewards/accuracies": 0.96875, "rewards/chosen": 47.92396545410156, "rewards/margins": 41.81854248046875, "rewards/rejected": 6.11041259765625, "step": 4192 }, { "epoch": 2.170289855072464, "grad_norm": 1.204695463180542, "learning_rate": 1.949595656584462e-06, "loss": 0.10246157646179199, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.46971893310547, "rewards/margins": 41.54718017578125, "rewards/rejected": 5.9320526123046875, "step": 4193 }, { "epoch": 2.170807453416149, "grad_norm": 0.7936440110206604, "learning_rate": 1.9473357038626535e-06, "loss": 0.090856172144413, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.29393768310547, "rewards/margins": 40.54194641113281, "rewards/rejected": 4.750205993652344, "step": 4194 }, { "epoch": 2.171325051759834, "grad_norm": 1.1266828775405884, "learning_rate": 1.9450767449577174e-06, "loss": 0.09422081708908081, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.27428436279297, "rewards/margins": 42.249664306640625, "rewards/rejected": 6.017604827880859, "step": 4195 }, { "epoch": 2.1718426501035197, "grad_norm": 0.7720064520835876, "learning_rate": 1.9428187806050752e-06, "loss": 0.07395625114440918, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.13341522216797, "rewards/margins": 44.80474853515625, "rewards/rejected": 7.326812744140625, "step": 4196 }, { "epoch": 2.172360248447205, "grad_norm": 0.6977128982543945, "learning_rate": 1.9405618115398282e-06, "loss": 0.06436674296855927, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.96875762939453, "rewards/margins": 40.20024871826172, "rewards/rejected": 6.770782470703125, "step": 4197 }, { "epoch": 2.1728778467908905, "grad_norm": 1.3780288696289062, "learning_rate": 1.938305838496744e-06, "loss": 0.17005807161331177, "rewards/accuracies": 0.921875, "rewards/chosen": 42.090396881103516, "rewards/margins": 35.147003173828125, "rewards/rejected": 6.951045989990234, "step": 4198 }, { "epoch": 2.1733954451345756, "grad_norm": 1.4878522157669067, "learning_rate": 1.9360508622102724e-06, "loss": 0.09514513611793518, "rewards/accuracies": 0.953125, "rewards/chosen": 45.70611572265625, "rewards/margins": 39.28692626953125, "rewards/rejected": 6.420158386230469, "step": 4199 }, { "epoch": 2.1739130434782608, "grad_norm": 0.8105286359786987, "learning_rate": 1.933796883414537e-06, "loss": 0.0640001893043518, "rewards/accuracies": 0.96875, "rewards/chosen": 51.46820831298828, "rewards/margins": 44.04139709472656, "rewards/rejected": 7.428766250610352, "step": 4200 }, { "epoch": 2.1744306418219463, "grad_norm": 1.08537757396698, "learning_rate": 1.9315439028433376e-06, "loss": 0.08435659110546112, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.61376190185547, "rewards/margins": 45.4246826171875, "rewards/rejected": 9.181955337524414, "step": 4201 }, { "epoch": 2.1749482401656315, "grad_norm": 0.9876143932342529, "learning_rate": 1.929291921230147e-06, "loss": 0.08956200629472733, "rewards/accuracies": 0.9375, "rewards/chosen": 50.00408935546875, "rewards/margins": 43.30525207519531, "rewards/rejected": 6.6923065185546875, "step": 4202 }, { "epoch": 2.1754658385093166, "grad_norm": 1.2604022026062012, "learning_rate": 1.9270409393081153e-06, "loss": 0.06429028511047363, "rewards/accuracies": 0.96875, "rewards/chosen": 51.63201904296875, "rewards/margins": 42.83062744140625, "rewards/rejected": 8.786186218261719, "step": 4203 }, { "epoch": 2.175983436853002, "grad_norm": 0.8171917200088501, "learning_rate": 1.924790957810065e-06, "loss": 0.06839217245578766, "rewards/accuracies": 0.984375, "rewards/chosen": 51.177879333496094, "rewards/margins": 42.528541564941406, "rewards/rejected": 8.65447998046875, "step": 4204 }, { "epoch": 2.1765010351966874, "grad_norm": 1.524465799331665, "learning_rate": 1.9225419774684943e-06, "loss": 0.14798268675804138, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.91090774536133, "rewards/margins": 43.572265625, "rewards/rejected": 7.352001190185547, "step": 4205 }, { "epoch": 2.1770186335403725, "grad_norm": 0.8322408199310303, "learning_rate": 1.920293999015575e-06, "loss": 0.060745906084775925, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.37391662597656, "rewards/margins": 48.67039489746094, "rewards/rejected": 7.711757659912109, "step": 4206 }, { "epoch": 2.177536231884058, "grad_norm": 0.49873337149620056, "learning_rate": 1.9180470231831523e-06, "loss": 0.03670060634613037, "rewards/accuracies": 0.9921875, "rewards/chosen": 55.790245056152344, "rewards/margins": 47.892333984375, "rewards/rejected": 7.900733947753906, "step": 4207 }, { "epoch": 2.1780538302277432, "grad_norm": 0.7633572816848755, "learning_rate": 1.9158010507027457e-06, "loss": 0.03541126847267151, "rewards/accuracies": 0.9921875, "rewards/chosen": 49.634803771972656, "rewards/margins": 43.99237060546875, "rewards/rejected": 5.6410722732543945, "step": 4208 }, { "epoch": 2.1785714285714284, "grad_norm": 1.5171489715576172, "learning_rate": 1.9135560823055476e-06, "loss": 0.08023547381162643, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.43452835083008, "rewards/margins": 38.81829833984375, "rewards/rejected": 6.6158294677734375, "step": 4209 }, { "epoch": 2.179089026915114, "grad_norm": 1.8872652053833008, "learning_rate": 1.911312118722426e-06, "loss": 0.10562735795974731, "rewards/accuracies": 0.96875, "rewards/chosen": 47.48529052734375, "rewards/margins": 40.00079345703125, "rewards/rejected": 7.487183570861816, "step": 4210 }, { "epoch": 2.179606625258799, "grad_norm": 0.8050914406776428, "learning_rate": 1.909069160683913e-06, "loss": 0.08866693079471588, "rewards/accuracies": 0.953125, "rewards/chosen": 52.46533203125, "rewards/margins": 45.791473388671875, "rewards/rejected": 6.6782379150390625, "step": 4211 }, { "epoch": 2.1801242236024843, "grad_norm": 0.9064075350761414, "learning_rate": 1.906827208920226e-06, "loss": 0.08190742880105972, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.121826171875, "rewards/margins": 41.058074951171875, "rewards/rejected": 5.071994781494141, "step": 4212 }, { "epoch": 2.18064182194617, "grad_norm": 1.2545627355575562, "learning_rate": 1.9045862641612483e-06, "loss": 0.11235310882329941, "rewards/accuracies": 0.96875, "rewards/chosen": 47.1729736328125, "rewards/margins": 39.0631103515625, "rewards/rejected": 8.101106643676758, "step": 4213 }, { "epoch": 2.181159420289855, "grad_norm": 0.5824921727180481, "learning_rate": 1.90234632713653e-06, "loss": 0.07331988215446472, "rewards/accuracies": 0.96875, "rewards/chosen": 48.773643493652344, "rewards/margins": 41.50201416015625, "rewards/rejected": 7.274463653564453, "step": 4214 }, { "epoch": 2.1816770186335406, "grad_norm": 1.0666371583938599, "learning_rate": 1.9001073985753043e-06, "loss": 0.10039035975933075, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.74537658691406, "rewards/margins": 43.11248779296875, "rewards/rejected": 6.6410813331604, "step": 4215 }, { "epoch": 2.1821946169772257, "grad_norm": 0.9106183052062988, "learning_rate": 1.897869479206469e-06, "loss": 0.07746507972478867, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.44512939453125, "rewards/margins": 44.23004150390625, "rewards/rejected": 7.204361915588379, "step": 4216 }, { "epoch": 2.182712215320911, "grad_norm": 1.904935359954834, "learning_rate": 1.8956325697585963e-06, "loss": 0.10875792801380157, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.46943664550781, "rewards/margins": 42.28955078125, "rewards/rejected": 7.1897430419921875, "step": 4217 }, { "epoch": 2.1832298136645965, "grad_norm": 0.6484460234642029, "learning_rate": 1.8933966709599233e-06, "loss": 0.05822651833295822, "rewards/accuracies": 0.984375, "rewards/chosen": 54.1685791015625, "rewards/margins": 45.163360595703125, "rewards/rejected": 8.989940643310547, "step": 4218 }, { "epoch": 2.1837474120082816, "grad_norm": 1.4797800779342651, "learning_rate": 1.8911617835383677e-06, "loss": 0.08238236606121063, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.372127532958984, "rewards/margins": 43.26353454589844, "rewards/rejected": 7.1048431396484375, "step": 4219 }, { "epoch": 2.1842650103519667, "grad_norm": 1.4390597343444824, "learning_rate": 1.888927908221514e-06, "loss": 0.1299443542957306, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.74628448486328, "rewards/margins": 43.91888427734375, "rewards/rejected": 5.831722259521484, "step": 4220 }, { "epoch": 2.1847826086956523, "grad_norm": 3.1634016036987305, "learning_rate": 1.8866950457366118e-06, "loss": 0.08228912949562073, "rewards/accuracies": 0.96875, "rewards/chosen": 49.81087112426758, "rewards/margins": 42.4254150390625, "rewards/rejected": 7.388206481933594, "step": 4221 }, { "epoch": 2.1853002070393375, "grad_norm": 5.36331033706665, "learning_rate": 1.8844631968105864e-06, "loss": 0.11222672462463379, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.73146057128906, "rewards/margins": 45.75616455078125, "rewards/rejected": 8.973820686340332, "step": 4222 }, { "epoch": 2.1858178053830226, "grad_norm": 1.3899365663528442, "learning_rate": 1.8822323621700379e-06, "loss": 0.09421572089195251, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.27338790893555, "rewards/margins": 44.68072509765625, "rewards/rejected": 8.598121643066406, "step": 4223 }, { "epoch": 2.186335403726708, "grad_norm": 1.2563049793243408, "learning_rate": 1.8800025425412243e-06, "loss": 0.0895390585064888, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.826560974121094, "rewards/margins": 44.52154541015625, "rewards/rejected": 7.2998480796813965, "step": 4224 }, { "epoch": 2.1868530020703933, "grad_norm": 2.351161241531372, "learning_rate": 1.8777737386500822e-06, "loss": 0.04707148298621178, "rewards/accuracies": 0.984375, "rewards/chosen": 54.37645721435547, "rewards/margins": 45.58624267578125, "rewards/rejected": 8.789108276367188, "step": 4225 }, { "epoch": 2.1873706004140785, "grad_norm": 1.0461161136627197, "learning_rate": 1.8755459512222136e-06, "loss": 0.09548503160476685, "rewards/accuracies": 0.953125, "rewards/chosen": 49.171180725097656, "rewards/margins": 41.2080078125, "rewards/rejected": 7.976348876953125, "step": 4226 }, { "epoch": 2.187888198757764, "grad_norm": 1.065604567527771, "learning_rate": 1.8733191809828922e-06, "loss": 0.06320740282535553, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.942108154296875, "rewards/margins": 44.984893798828125, "rewards/rejected": 7.946075439453125, "step": 4227 }, { "epoch": 2.1884057971014492, "grad_norm": 0.6377863883972168, "learning_rate": 1.8710934286570581e-06, "loss": 0.05434265732765198, "rewards/accuracies": 0.9765625, "rewards/chosen": 47.995765686035156, "rewards/margins": 40.865081787109375, "rewards/rejected": 7.1265106201171875, "step": 4228 }, { "epoch": 2.1889233954451344, "grad_norm": 0.8891904950141907, "learning_rate": 1.8688686949693214e-06, "loss": 0.08936070650815964, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.65050506591797, "rewards/margins": 42.55908203125, "rewards/rejected": 7.1034698486328125, "step": 4229 }, { "epoch": 2.18944099378882, "grad_norm": 2.9249184131622314, "learning_rate": 1.8666449806439601e-06, "loss": 0.09576190263032913, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.44879150390625, "rewards/margins": 45.95037841796875, "rewards/rejected": 7.487659454345703, "step": 4230 }, { "epoch": 2.189958592132505, "grad_norm": 0.7791441082954407, "learning_rate": 1.8644222864049205e-06, "loss": 0.05313833802938461, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.60211181640625, "rewards/margins": 45.286285400390625, "rewards/rejected": 8.342720031738281, "step": 4231 }, { "epoch": 2.1904761904761907, "grad_norm": 0.5066660046577454, "learning_rate": 1.8622006129758163e-06, "loss": 0.04185584560036659, "rewards/accuracies": 0.984375, "rewards/chosen": 54.32902526855469, "rewards/margins": 46.383636474609375, "rewards/rejected": 7.953531265258789, "step": 4232 }, { "epoch": 2.190993788819876, "grad_norm": 1.5575236082077026, "learning_rate": 1.85997996107993e-06, "loss": 0.0651225820183754, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.069976806640625, "rewards/margins": 40.7598876953125, "rewards/rejected": 7.308601379394531, "step": 4233 }, { "epoch": 2.191511387163561, "grad_norm": 1.4222462177276611, "learning_rate": 1.8577603314402105e-06, "loss": 0.07471615821123123, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.24699401855469, "rewards/margins": 45.108978271484375, "rewards/rejected": 6.136819839477539, "step": 4234 }, { "epoch": 2.1920289855072466, "grad_norm": 2.343090295791626, "learning_rate": 1.8555417247792746e-06, "loss": 0.08623509854078293, "rewards/accuracies": 0.953125, "rewards/chosen": 52.485877990722656, "rewards/margins": 44.65008544921875, "rewards/rejected": 7.825506210327148, "step": 4235 }, { "epoch": 2.1925465838509317, "grad_norm": 0.8384562730789185, "learning_rate": 1.8533241418194065e-06, "loss": 0.0786500871181488, "rewards/accuracies": 0.96875, "rewards/chosen": 47.979461669921875, "rewards/margins": 41.07830810546875, "rewards/rejected": 6.901393890380859, "step": 4236 }, { "epoch": 2.193064182194617, "grad_norm": 0.7189925312995911, "learning_rate": 1.851107583282552e-06, "loss": 0.05925972759723663, "rewards/accuracies": 0.9765625, "rewards/chosen": 47.46997833251953, "rewards/margins": 41.922271728515625, "rewards/rejected": 5.557014465332031, "step": 4237 }, { "epoch": 2.1935817805383024, "grad_norm": 0.7126664519309998, "learning_rate": 1.848892049890333e-06, "loss": 0.07675057649612427, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.271785736083984, "rewards/margins": 42.48236083984375, "rewards/rejected": 6.7921905517578125, "step": 4238 }, { "epoch": 2.1940993788819876, "grad_norm": 0.8727985620498657, "learning_rate": 1.8466775423640299e-06, "loss": 0.051465295255184174, "rewards/accuracies": 0.9921875, "rewards/chosen": 46.02293395996094, "rewards/margins": 39.61029052734375, "rewards/rejected": 6.425450801849365, "step": 4239 }, { "epoch": 2.1946169772256727, "grad_norm": 2.003305435180664, "learning_rate": 1.8444640614245935e-06, "loss": 0.08827032148838043, "rewards/accuracies": 0.9765625, "rewards/chosen": 46.142669677734375, "rewards/margins": 39.524200439453125, "rewards/rejected": 6.618356704711914, "step": 4240 }, { "epoch": 2.1951345755693583, "grad_norm": 0.5981862545013428, "learning_rate": 1.842251607792634e-06, "loss": 0.05513569340109825, "rewards/accuracies": 0.9765625, "rewards/chosen": 47.89262390136719, "rewards/margins": 41.954254150390625, "rewards/rejected": 5.923137664794922, "step": 4241 }, { "epoch": 2.1956521739130435, "grad_norm": 1.3781818151474, "learning_rate": 1.8400401821884356e-06, "loss": 0.1542540043592453, "rewards/accuracies": 0.921875, "rewards/chosen": 52.35972595214844, "rewards/margins": 43.41139221191406, "rewards/rejected": 8.951007843017578, "step": 4242 }, { "epoch": 2.1961697722567286, "grad_norm": 0.6574764847755432, "learning_rate": 1.837829785331945e-06, "loss": 0.03267645090818405, "rewards/accuracies": 1.0, "rewards/chosen": 45.669857025146484, "rewards/margins": 40.006378173828125, "rewards/rejected": 5.672267913818359, "step": 4243 }, { "epoch": 2.196687370600414, "grad_norm": 0.6887134909629822, "learning_rate": 1.8356204179427673e-06, "loss": 0.04933150112628937, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.6884880065918, "rewards/margins": 43.072906494140625, "rewards/rejected": 8.607421875, "step": 4244 }, { "epoch": 2.1972049689440993, "grad_norm": 0.8640777468681335, "learning_rate": 1.8334120807401828e-06, "loss": 0.06962345540523529, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.385406494140625, "rewards/margins": 40.94744873046875, "rewards/rejected": 6.4581146240234375, "step": 4245 }, { "epoch": 2.1977225672877845, "grad_norm": 1.0026097297668457, "learning_rate": 1.8312047744431322e-06, "loss": 0.11528456211090088, "rewards/accuracies": 0.9375, "rewards/chosen": 45.60027313232422, "rewards/margins": 38.66021728515625, "rewards/rejected": 6.934993743896484, "step": 4246 }, { "epoch": 2.19824016563147, "grad_norm": 0.9241675734519958, "learning_rate": 1.8289984997702159e-06, "loss": 0.10043986886739731, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.262672424316406, "rewards/margins": 38.95890808105469, "rewards/rejected": 6.304808616638184, "step": 4247 }, { "epoch": 2.198757763975155, "grad_norm": 1.2410398721694946, "learning_rate": 1.8267932574397035e-06, "loss": 0.12585753202438354, "rewards/accuracies": 0.953125, "rewards/chosen": 45.28197479248047, "rewards/margins": 39.2525634765625, "rewards/rejected": 6.019417762756348, "step": 4248 }, { "epoch": 2.199275362318841, "grad_norm": 0.8291754126548767, "learning_rate": 1.8245890481695317e-06, "loss": 0.05798613280057907, "rewards/accuracies": 0.9765625, "rewards/chosen": 48.71238708496094, "rewards/margins": 43.22113037109375, "rewards/rejected": 5.4935302734375, "step": 4249 }, { "epoch": 2.199792960662526, "grad_norm": 0.7246075868606567, "learning_rate": 1.8223858726772915e-06, "loss": 0.038647741079330444, "rewards/accuracies": 0.984375, "rewards/chosen": 57.349517822265625, "rewards/margins": 48.0533447265625, "rewards/rejected": 9.28846549987793, "step": 4250 }, { "epoch": 2.200310559006211, "grad_norm": 0.8428727984428406, "learning_rate": 1.820183731680245e-06, "loss": 0.08836615085601807, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.45936965942383, "rewards/margins": 42.85798645019531, "rewards/rejected": 5.614021301269531, "step": 4251 }, { "epoch": 2.2008281573498967, "grad_norm": 1.3656723499298096, "learning_rate": 1.8179826258953132e-06, "loss": 0.12819042801856995, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.97536849975586, "rewards/margins": 44.626007080078125, "rewards/rejected": 6.3651123046875, "step": 4252 }, { "epoch": 2.201345755693582, "grad_norm": 0.9789686799049377, "learning_rate": 1.8157825560390868e-06, "loss": 0.11072321236133575, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.26350402832031, "rewards/margins": 43.26953125, "rewards/rejected": 5.980571746826172, "step": 4253 }, { "epoch": 2.201863354037267, "grad_norm": 2.321519613265991, "learning_rate": 1.8135835228278098e-06, "loss": 0.12309595942497253, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.66693115234375, "rewards/margins": 40.49803161621094, "rewards/rejected": 5.16351318359375, "step": 4254 }, { "epoch": 2.2023809523809526, "grad_norm": 1.3973723649978638, "learning_rate": 1.8113855269773938e-06, "loss": 0.07388461381196976, "rewards/accuracies": 0.96875, "rewards/chosen": 47.60588073730469, "rewards/margins": 42.21734619140625, "rewards/rejected": 5.397712707519531, "step": 4255 }, { "epoch": 2.2028985507246377, "grad_norm": 0.968555748462677, "learning_rate": 1.8091885692034138e-06, "loss": 0.07682754844427109, "rewards/accuracies": 0.96875, "rewards/chosen": 52.20191955566406, "rewards/margins": 46.71929931640625, "rewards/rejected": 5.482959747314453, "step": 4256 }, { "epoch": 2.203416149068323, "grad_norm": 0.9635357856750488, "learning_rate": 1.8069926502211044e-06, "loss": 0.08979463577270508, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.67982482910156, "rewards/margins": 45.19685363769531, "rewards/rejected": 6.4973464012146, "step": 4257 }, { "epoch": 2.2039337474120084, "grad_norm": 0.8302039504051208, "learning_rate": 1.8047977707453634e-06, "loss": 0.03857841342687607, "rewards/accuracies": 0.984375, "rewards/chosen": 47.22254180908203, "rewards/margins": 43.122894287109375, "rewards/rejected": 4.108542442321777, "step": 4258 }, { "epoch": 2.2044513457556936, "grad_norm": 1.1443889141082764, "learning_rate": 1.8026039314907483e-06, "loss": 0.06294796615839005, "rewards/accuracies": 0.96875, "rewards/chosen": 55.23902893066406, "rewards/margins": 46.57965087890625, "rewards/rejected": 8.651077270507812, "step": 4259 }, { "epoch": 2.2049689440993787, "grad_norm": 0.7317896485328674, "learning_rate": 1.8004111331714808e-06, "loss": 0.06408125907182693, "rewards/accuracies": 0.96875, "rewards/chosen": 51.559844970703125, "rewards/margins": 45.64939880371094, "rewards/rejected": 5.913949966430664, "step": 4260 }, { "epoch": 2.2054865424430643, "grad_norm": 1.2637271881103516, "learning_rate": 1.7982193765014416e-06, "loss": 0.08523713052272797, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.7716064453125, "rewards/margins": 45.071136474609375, "rewards/rejected": 6.704929351806641, "step": 4261 }, { "epoch": 2.2060041407867494, "grad_norm": 2.8975307941436768, "learning_rate": 1.7960286621941747e-06, "loss": 0.06799719482660294, "rewards/accuracies": 0.96875, "rewards/chosen": 47.132347106933594, "rewards/margins": 41.14898681640625, "rewards/rejected": 5.978743553161621, "step": 4262 }, { "epoch": 2.2065217391304346, "grad_norm": 1.4462602138519287, "learning_rate": 1.7938389909628773e-06, "loss": 0.09081050008535385, "rewards/accuracies": 0.953125, "rewards/chosen": 47.70428466796875, "rewards/margins": 43.378692626953125, "rewards/rejected": 4.3345489501953125, "step": 4263 }, { "epoch": 2.20703933747412, "grad_norm": 0.6218185424804688, "learning_rate": 1.7916503635204185e-06, "loss": 0.05883316323161125, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.202178955078125, "rewards/margins": 45.67633056640625, "rewards/rejected": 8.517650604248047, "step": 4264 }, { "epoch": 2.2075569358178053, "grad_norm": 0.6168248653411865, "learning_rate": 1.7894627805793202e-06, "loss": 0.0489894337952137, "rewards/accuracies": 0.9765625, "rewards/chosen": 47.73186492919922, "rewards/margins": 43.31195068359375, "rewards/rejected": 4.428339004516602, "step": 4265 }, { "epoch": 2.208074534161491, "grad_norm": 1.105016827583313, "learning_rate": 1.7872762428517664e-06, "loss": 0.05296209827065468, "rewards/accuracies": 0.984375, "rewards/chosen": 50.15091323852539, "rewards/margins": 42.43605041503906, "rewards/rejected": 7.7144317626953125, "step": 4266 }, { "epoch": 2.208592132505176, "grad_norm": 0.8820621967315674, "learning_rate": 1.7850907510495958e-06, "loss": 0.0778626799583435, "rewards/accuracies": 0.9609375, "rewards/chosen": 45.346923828125, "rewards/margins": 39.7127685546875, "rewards/rejected": 5.6350603103637695, "step": 4267 }, { "epoch": 2.209109730848861, "grad_norm": 1.2836408615112305, "learning_rate": 1.7829063058843166e-06, "loss": 0.07027452439069748, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.57292175292969, "rewards/margins": 43.46641540527344, "rewards/rejected": 6.106898784637451, "step": 4268 }, { "epoch": 2.209627329192547, "grad_norm": 0.7301075458526611, "learning_rate": 1.7807229080670895e-06, "loss": 0.03877463936805725, "rewards/accuracies": 0.984375, "rewards/chosen": 52.893798828125, "rewards/margins": 45.66023254394531, "rewards/rejected": 7.216190338134766, "step": 4269 }, { "epoch": 2.210144927536232, "grad_norm": 1.8595986366271973, "learning_rate": 1.7785405583087329e-06, "loss": 0.10927671194076538, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.77458572387695, "rewards/margins": 42.44046401977539, "rewards/rejected": 6.329902648925781, "step": 4270 }, { "epoch": 2.210662525879917, "grad_norm": 1.3654512166976929, "learning_rate": 1.7763592573197263e-06, "loss": 0.11995261907577515, "rewards/accuracies": 0.953125, "rewards/chosen": 44.8776741027832, "rewards/margins": 38.4827880859375, "rewards/rejected": 6.396553039550781, "step": 4271 }, { "epoch": 2.2111801242236027, "grad_norm": 1.0762561559677124, "learning_rate": 1.7741790058102131e-06, "loss": 0.06438177824020386, "rewards/accuracies": 0.96875, "rewards/chosen": 52.960609436035156, "rewards/margins": 46.950958251953125, "rewards/rejected": 6.023329257965088, "step": 4272 }, { "epoch": 2.211697722567288, "grad_norm": 0.8888765573501587, "learning_rate": 1.771999804489985e-06, "loss": 0.07237623631954193, "rewards/accuracies": 0.96875, "rewards/chosen": 46.992462158203125, "rewards/margins": 41.81646728515625, "rewards/rejected": 5.1825408935546875, "step": 4273 }, { "epoch": 2.212215320910973, "grad_norm": 1.0096495151519775, "learning_rate": 1.7698216540684966e-06, "loss": 0.08335286378860474, "rewards/accuracies": 0.96875, "rewards/chosen": 48.54356384277344, "rewards/margins": 42.7109375, "rewards/rejected": 5.837856292724609, "step": 4274 }, { "epoch": 2.2127329192546585, "grad_norm": 0.726422905921936, "learning_rate": 1.7676445552548633e-06, "loss": 0.06905604898929596, "rewards/accuracies": 0.953125, "rewards/chosen": 55.27091598510742, "rewards/margins": 47.128379821777344, "rewards/rejected": 8.138124465942383, "step": 4275 }, { "epoch": 2.2132505175983437, "grad_norm": 2.38535213470459, "learning_rate": 1.7654685087578554e-06, "loss": 0.1374465376138687, "rewards/accuracies": 0.953125, "rewards/chosen": 49.12400817871094, "rewards/margins": 43.065673828125, "rewards/rejected": 6.061695098876953, "step": 4276 }, { "epoch": 2.213768115942029, "grad_norm": 1.2232351303100586, "learning_rate": 1.7632935152858975e-06, "loss": 0.10203897953033447, "rewards/accuracies": 0.953125, "rewards/chosen": 46.890525817871094, "rewards/margins": 41.056396484375, "rewards/rejected": 5.825717926025391, "step": 4277 }, { "epoch": 2.2142857142857144, "grad_norm": 1.6790883541107178, "learning_rate": 1.761119575547075e-06, "loss": 0.14507681131362915, "rewards/accuracies": 0.9296875, "rewards/chosen": 48.03543472290039, "rewards/margins": 43.45994567871094, "rewards/rejected": 4.572450160980225, "step": 4278 }, { "epoch": 2.2148033126293996, "grad_norm": 1.8188201189041138, "learning_rate": 1.7589466902491337e-06, "loss": 0.09894362837076187, "rewards/accuracies": 0.9375, "rewards/chosen": 51.70476150512695, "rewards/margins": 44.5086669921875, "rewards/rejected": 7.1864013671875, "step": 4279 }, { "epoch": 2.2153209109730847, "grad_norm": 0.7723519206047058, "learning_rate": 1.7567748600994676e-06, "loss": 0.050889819860458374, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.88874816894531, "rewards/margins": 44.88812255859375, "rewards/rejected": 7.0102996826171875, "step": 4280 }, { "epoch": 2.2158385093167703, "grad_norm": 0.7404299974441528, "learning_rate": 1.7546040858051334e-06, "loss": 0.05411914736032486, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.39363098144531, "rewards/margins": 49.28668212890625, "rewards/rejected": 7.100939750671387, "step": 4281 }, { "epoch": 2.2163561076604554, "grad_norm": 4.680325031280518, "learning_rate": 1.7524343680728418e-06, "loss": 0.14764292538166046, "rewards/accuracies": 0.96875, "rewards/chosen": 49.29228210449219, "rewards/margins": 42.1019287109375, "rewards/rejected": 7.17762565612793, "step": 4282 }, { "epoch": 2.216873706004141, "grad_norm": 1.4243208169937134, "learning_rate": 1.7502657076089602e-06, "loss": 0.08096137642860413, "rewards/accuracies": 0.96875, "rewards/chosen": 51.5684928894043, "rewards/margins": 46.182373046875, "rewards/rejected": 5.3641204833984375, "step": 4283 }, { "epoch": 2.217391304347826, "grad_norm": 3.3256213665008545, "learning_rate": 1.7480981051195124e-06, "loss": 0.09538544714450836, "rewards/accuracies": 0.96875, "rewards/chosen": 51.499794006347656, "rewards/margins": 44.789405822753906, "rewards/rejected": 6.714630126953125, "step": 4284 }, { "epoch": 2.2179089026915113, "grad_norm": 1.5210062265396118, "learning_rate": 1.7459315613101758e-06, "loss": 0.11841659247875214, "rewards/accuracies": 0.9375, "rewards/chosen": 49.61254119873047, "rewards/margins": 43.6326904296875, "rewards/rejected": 5.990695953369141, "step": 4285 }, { "epoch": 2.2184265010351965, "grad_norm": 1.1546931266784668, "learning_rate": 1.7437660768862852e-06, "loss": 0.11097751557826996, "rewards/accuracies": 0.9453125, "rewards/chosen": 47.67168426513672, "rewards/margins": 41.293670654296875, "rewards/rejected": 6.381056785583496, "step": 4286 }, { "epoch": 2.218944099378882, "grad_norm": 1.5600847005844116, "learning_rate": 1.7416016525528294e-06, "loss": 0.0695614293217659, "rewards/accuracies": 0.96875, "rewards/chosen": 51.576751708984375, "rewards/margins": 45.85107421875, "rewards/rejected": 5.710613250732422, "step": 4287 }, { "epoch": 2.219461697722567, "grad_norm": 0.5591873526573181, "learning_rate": 1.7394382890144528e-06, "loss": 0.025795873254537582, "rewards/accuracies": 0.9921875, "rewards/chosen": 55.562294006347656, "rewards/margins": 47.02937698364258, "rewards/rejected": 8.550872802734375, "step": 4288 }, { "epoch": 2.2199792960662528, "grad_norm": 1.5287247896194458, "learning_rate": 1.7372759869754535e-06, "loss": 0.11777051538228989, "rewards/accuracies": 0.9453125, "rewards/chosen": 46.19929885864258, "rewards/margins": 39.77294921875, "rewards/rejected": 6.417545318603516, "step": 4289 }, { "epoch": 2.220496894409938, "grad_norm": 1.2563085556030273, "learning_rate": 1.7351147471397845e-06, "loss": 0.11245697736740112, "rewards/accuracies": 0.9453125, "rewards/chosen": 47.777435302734375, "rewards/margins": 42.08039093017578, "rewards/rejected": 5.690773010253906, "step": 4290 }, { "epoch": 2.221014492753623, "grad_norm": 1.3087410926818848, "learning_rate": 1.7329545702110534e-06, "loss": 0.10965146869421005, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.73870849609375, "rewards/margins": 43.25299072265625, "rewards/rejected": 7.496429443359375, "step": 4291 }, { "epoch": 2.2215320910973086, "grad_norm": 0.6133151650428772, "learning_rate": 1.7307954568925223e-06, "loss": 0.044918209314346313, "rewards/accuracies": 0.96875, "rewards/chosen": 54.46846008300781, "rewards/margins": 45.180816650390625, "rewards/rejected": 9.291088104248047, "step": 4292 }, { "epoch": 2.222049689440994, "grad_norm": 1.1472084522247314, "learning_rate": 1.7286374078871022e-06, "loss": 0.099820077419281, "rewards/accuracies": 0.9375, "rewards/chosen": 48.014434814453125, "rewards/margins": 42.17781066894531, "rewards/rejected": 5.834203720092773, "step": 4293 }, { "epoch": 2.222567287784679, "grad_norm": 0.7591630220413208, "learning_rate": 1.7264804238973653e-06, "loss": 0.05450144410133362, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.81053161621094, "rewards/margins": 43.66192626953125, "rewards/rejected": 7.148017883300781, "step": 4294 }, { "epoch": 2.2230848861283645, "grad_norm": 0.6976730227470398, "learning_rate": 1.7243245056255343e-06, "loss": 0.03067350946366787, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.204505920410156, "rewards/margins": 48.06590270996094, "rewards/rejected": 9.141815185546875, "step": 4295 }, { "epoch": 2.2236024844720497, "grad_norm": 1.592292070388794, "learning_rate": 1.72216965377348e-06, "loss": 0.07581198960542679, "rewards/accuracies": 0.953125, "rewards/chosen": 50.0687255859375, "rewards/margins": 42.3709716796875, "rewards/rejected": 7.694740295410156, "step": 4296 }, { "epoch": 2.224120082815735, "grad_norm": 2.63116192817688, "learning_rate": 1.7200158690427304e-06, "loss": 0.0611257441341877, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.99787902832031, "rewards/margins": 41.930267333984375, "rewards/rejected": 9.072321891784668, "step": 4297 }, { "epoch": 2.2246376811594204, "grad_norm": 1.4250684976577759, "learning_rate": 1.7178631521344684e-06, "loss": 0.08960245549678802, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.15924835205078, "rewards/margins": 41.966796875, "rewards/rejected": 7.198545455932617, "step": 4298 }, { "epoch": 2.2251552795031055, "grad_norm": 0.5128678679466248, "learning_rate": 1.715711503749527e-06, "loss": 0.030066346749663353, "rewards/accuracies": 0.984375, "rewards/chosen": 55.992530822753906, "rewards/margins": 48.10845947265625, "rewards/rejected": 7.878265380859375, "step": 4299 }, { "epoch": 2.2256728778467907, "grad_norm": 1.8536587953567505, "learning_rate": 1.7135609245883877e-06, "loss": 0.17019815742969513, "rewards/accuracies": 0.921875, "rewards/chosen": 47.59783935546875, "rewards/margins": 41.180450439453125, "rewards/rejected": 6.4102325439453125, "step": 4300 }, { "epoch": 2.2261904761904763, "grad_norm": 2.2712061405181885, "learning_rate": 1.711411415351187e-06, "loss": 0.061673201620578766, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.236083984375, "rewards/margins": 43.3778076171875, "rewards/rejected": 8.855751037597656, "step": 4301 }, { "epoch": 2.2267080745341614, "grad_norm": 1.2602483034133911, "learning_rate": 1.7092629767377172e-06, "loss": 0.10469093918800354, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.69230651855469, "rewards/margins": 48.68510437011719, "rewards/rejected": 8.996650695800781, "step": 4302 }, { "epoch": 2.2272256728778466, "grad_norm": 1.6327961683273315, "learning_rate": 1.707115609447414e-06, "loss": 0.10196184366941452, "rewards/accuracies": 0.9375, "rewards/chosen": 47.48912048339844, "rewards/margins": 39.161376953125, "rewards/rejected": 8.328470230102539, "step": 4303 }, { "epoch": 2.227743271221532, "grad_norm": 0.5094853639602661, "learning_rate": 1.7049693141793677e-06, "loss": 0.029306558892130852, "rewards/accuracies": 0.984375, "rewards/chosen": 53.52632141113281, "rewards/margins": 45.3731689453125, "rewards/rejected": 8.153373718261719, "step": 4304 }, { "epoch": 2.2282608695652173, "grad_norm": 1.8676211833953857, "learning_rate": 1.7028240916323258e-06, "loss": 0.07657556235790253, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.921722412109375, "rewards/margins": 47.36328125, "rewards/rejected": 7.562217712402344, "step": 4305 }, { "epoch": 2.228778467908903, "grad_norm": 0.4851403832435608, "learning_rate": 1.7006799425046749e-06, "loss": 0.03747865557670593, "rewards/accuracies": 0.9921875, "rewards/chosen": 50.680747985839844, "rewards/margins": 44.230621337890625, "rewards/rejected": 6.449053764343262, "step": 4306 }, { "epoch": 2.229296066252588, "grad_norm": 1.1319916248321533, "learning_rate": 1.6985368674944613e-06, "loss": 0.10075746476650238, "rewards/accuracies": 0.9375, "rewards/chosen": 52.366058349609375, "rewards/margins": 43.653564453125, "rewards/rejected": 8.708904266357422, "step": 4307 }, { "epoch": 2.229813664596273, "grad_norm": 1.3803472518920898, "learning_rate": 1.696394867299378e-06, "loss": 0.11603261530399323, "rewards/accuracies": 0.9453125, "rewards/chosen": 47.13236999511719, "rewards/margins": 40.52783203125, "rewards/rejected": 6.612007141113281, "step": 4308 }, { "epoch": 2.2303312629399588, "grad_norm": 2.2587785720825195, "learning_rate": 1.6942539426167681e-06, "loss": 0.10783575475215912, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.127105712890625, "rewards/margins": 39.9715576171875, "rewards/rejected": 5.152400016784668, "step": 4309 }, { "epoch": 2.230848861283644, "grad_norm": 1.4431573152542114, "learning_rate": 1.6921140941436254e-06, "loss": 0.10422952473163605, "rewards/accuracies": 0.9375, "rewards/chosen": 47.49577331542969, "rewards/margins": 41.32586669921875, "rewards/rejected": 6.181850433349609, "step": 4310 }, { "epoch": 2.231366459627329, "grad_norm": 0.731024444103241, "learning_rate": 1.689975322576593e-06, "loss": 0.05800589546561241, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.66285705566406, "rewards/margins": 43.88531494140625, "rewards/rejected": 5.767810821533203, "step": 4311 }, { "epoch": 2.2318840579710146, "grad_norm": 1.7101384401321411, "learning_rate": 1.6878376286119636e-06, "loss": 0.12557253241539001, "rewards/accuracies": 0.9140625, "rewards/chosen": 42.08452606201172, "rewards/margins": 36.39399719238281, "rewards/rejected": 5.693755149841309, "step": 4312 }, { "epoch": 2.2324016563147, "grad_norm": 1.6771832704544067, "learning_rate": 1.6857010129456785e-06, "loss": 0.07299632579088211, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.30120849609375, "rewards/margins": 42.65716552734375, "rewards/rejected": 4.6519317626953125, "step": 4313 }, { "epoch": 2.232919254658385, "grad_norm": 0.9882583618164062, "learning_rate": 1.6835654762733288e-06, "loss": 0.09462638199329376, "rewards/accuracies": 0.9375, "rewards/chosen": 49.03985595703125, "rewards/margins": 43.222381591796875, "rewards/rejected": 5.798702239990234, "step": 4314 }, { "epoch": 2.2334368530020705, "grad_norm": 1.3715721368789673, "learning_rate": 1.6814310192901533e-06, "loss": 0.09529280662536621, "rewards/accuracies": 0.953125, "rewards/chosen": 44.70305633544922, "rewards/margins": 38.967987060546875, "rewards/rejected": 5.742088317871094, "step": 4315 }, { "epoch": 2.2339544513457557, "grad_norm": 1.0406763553619385, "learning_rate": 1.6792976426910407e-06, "loss": 0.08886876702308655, "rewards/accuracies": 0.953125, "rewards/chosen": 46.82527160644531, "rewards/margins": 40.2601318359375, "rewards/rejected": 6.560488224029541, "step": 4316 }, { "epoch": 2.234472049689441, "grad_norm": 0.8248482346534729, "learning_rate": 1.6771653471705264e-06, "loss": 0.0705827921628952, "rewards/accuracies": 0.9765625, "rewards/chosen": 42.524620056152344, "rewards/margins": 35.800537109375, "rewards/rejected": 6.7258806228637695, "step": 4317 }, { "epoch": 2.2349896480331264, "grad_norm": 0.5056045651435852, "learning_rate": 1.675034133422796e-06, "loss": 0.0621279738843441, "rewards/accuracies": 0.96875, "rewards/chosen": 45.4527473449707, "rewards/margins": 39.597259521484375, "rewards/rejected": 5.862051010131836, "step": 4318 }, { "epoch": 2.2355072463768115, "grad_norm": 0.6845331788063049, "learning_rate": 1.6729040021416775e-06, "loss": 0.05005519837141037, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.12164306640625, "rewards/margins": 43.58306884765625, "rewards/rejected": 6.534780502319336, "step": 4319 }, { "epoch": 2.2360248447204967, "grad_norm": 0.6572498679161072, "learning_rate": 1.6707749540206546e-06, "loss": 0.06496661901473999, "rewards/accuracies": 0.96875, "rewards/chosen": 46.223670959472656, "rewards/margins": 42.055755615234375, "rewards/rejected": 4.1641645431518555, "step": 4320 }, { "epoch": 2.2365424430641823, "grad_norm": 0.9039541482925415, "learning_rate": 1.6686469897528534e-06, "loss": 0.09701576828956604, "rewards/accuracies": 0.953125, "rewards/chosen": 49.634376525878906, "rewards/margins": 43.02983093261719, "rewards/rejected": 6.603849411010742, "step": 4321 }, { "epoch": 2.2370600414078674, "grad_norm": 0.7058882117271423, "learning_rate": 1.666520110031049e-06, "loss": 0.06779005378484726, "rewards/accuracies": 0.9765625, "rewards/chosen": 44.88568878173828, "rewards/margins": 39.644287109375, "rewards/rejected": 5.235252380371094, "step": 4322 }, { "epoch": 2.237577639751553, "grad_norm": 0.6630402207374573, "learning_rate": 1.6643943155476578e-06, "loss": 0.06072046607732773, "rewards/accuracies": 0.984375, "rewards/chosen": 43.13286590576172, "rewards/margins": 36.9818115234375, "rewards/rejected": 6.146080017089844, "step": 4323 }, { "epoch": 2.238095238095238, "grad_norm": 0.7249823212623596, "learning_rate": 1.662269606994752e-06, "loss": 0.04821799695491791, "rewards/accuracies": 0.9921875, "rewards/chosen": 48.3029899597168, "rewards/margins": 40.82012939453125, "rewards/rejected": 7.491600036621094, "step": 4324 }, { "epoch": 2.2386128364389233, "grad_norm": 2.1528749465942383, "learning_rate": 1.6601459850640461e-06, "loss": 0.06524741649627686, "rewards/accuracies": 0.96875, "rewards/chosen": 44.53837585449219, "rewards/margins": 38.2503662109375, "rewards/rejected": 6.301002502441406, "step": 4325 }, { "epoch": 2.239130434782609, "grad_norm": 0.9544919729232788, "learning_rate": 1.6580234504468977e-06, "loss": 0.09528949856758118, "rewards/accuracies": 0.9375, "rewards/chosen": 45.600364685058594, "rewards/margins": 41.3607177734375, "rewards/rejected": 4.236507415771484, "step": 4326 }, { "epoch": 2.239648033126294, "grad_norm": 1.17919921875, "learning_rate": 1.6559020038343131e-06, "loss": 0.08227430284023285, "rewards/accuracies": 0.953125, "rewards/chosen": 50.42966079711914, "rewards/margins": 43.219329833984375, "rewards/rejected": 7.205570220947266, "step": 4327 }, { "epoch": 2.240165631469979, "grad_norm": 0.9012529850006104, "learning_rate": 1.6537816459169487e-06, "loss": 0.06950001418590546, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.15322494506836, "rewards/margins": 41.47674560546875, "rewards/rejected": 5.667991638183594, "step": 4328 }, { "epoch": 2.2406832298136647, "grad_norm": 1.2311028242111206, "learning_rate": 1.6516623773850982e-06, "loss": 0.058210793882608414, "rewards/accuracies": 0.9765625, "rewards/chosen": 46.58256149291992, "rewards/margins": 40.878631591796875, "rewards/rejected": 5.712602615356445, "step": 4329 }, { "epoch": 2.24120082815735, "grad_norm": 0.8297966122627258, "learning_rate": 1.6495441989287058e-06, "loss": 0.08059439063072205, "rewards/accuracies": 0.953125, "rewards/chosen": 51.283721923828125, "rewards/margins": 43.887939453125, "rewards/rejected": 7.397068023681641, "step": 4330 }, { "epoch": 2.241718426501035, "grad_norm": 1.214471697807312, "learning_rate": 1.6474271112373607e-06, "loss": 0.07967064529657364, "rewards/accuracies": 0.96875, "rewards/chosen": 43.88249206542969, "rewards/margins": 39.06977844238281, "rewards/rejected": 4.815512180328369, "step": 4331 }, { "epoch": 2.2422360248447206, "grad_norm": 1.2511430978775024, "learning_rate": 1.645311115000295e-06, "loss": 0.06749974936246872, "rewards/accuracies": 0.96875, "rewards/chosen": 47.51750946044922, "rewards/margins": 39.99079895019531, "rewards/rejected": 7.52397346496582, "step": 4332 }, { "epoch": 2.2427536231884058, "grad_norm": 4.201740264892578, "learning_rate": 1.643196210906387e-06, "loss": 0.13079833984375, "rewards/accuracies": 0.953125, "rewards/chosen": 53.44171142578125, "rewards/margins": 45.03369140625, "rewards/rejected": 8.395267486572266, "step": 4333 }, { "epoch": 2.243271221532091, "grad_norm": 1.1227999925613403, "learning_rate": 1.6410823996441578e-06, "loss": 0.09032399207353592, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.95575714111328, "rewards/margins": 42.992645263671875, "rewards/rejected": 4.949897766113281, "step": 4334 }, { "epoch": 2.2437888198757765, "grad_norm": 0.7219406962394714, "learning_rate": 1.6389696819017775e-06, "loss": 0.06284811347723007, "rewards/accuracies": 0.96875, "rewards/chosen": 47.45849609375, "rewards/margins": 40.64263916015625, "rewards/rejected": 6.8183135986328125, "step": 4335 }, { "epoch": 2.2443064182194616, "grad_norm": 0.8124452829360962, "learning_rate": 1.6368580583670529e-06, "loss": 0.07318289577960968, "rewards/accuracies": 0.96875, "rewards/chosen": 50.19482421875, "rewards/margins": 43.391876220703125, "rewards/rejected": 6.802284240722656, "step": 4336 }, { "epoch": 2.244824016563147, "grad_norm": 1.1966464519500732, "learning_rate": 1.6347475297274397e-06, "loss": 0.08301085978746414, "rewards/accuracies": 0.96875, "rewards/chosen": 47.864479064941406, "rewards/margins": 41.423553466796875, "rewards/rejected": 6.432853698730469, "step": 4337 }, { "epoch": 2.2453416149068324, "grad_norm": 0.8436383605003357, "learning_rate": 1.6326380966700356e-06, "loss": 0.06599501520395279, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.46349334716797, "rewards/margins": 46.239776611328125, "rewards/rejected": 7.230613708496094, "step": 4338 }, { "epoch": 2.2458592132505175, "grad_norm": 1.0480819940567017, "learning_rate": 1.630529759881581e-06, "loss": 0.051597900688648224, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.70564270019531, "rewards/margins": 41.906341552734375, "rewards/rejected": 7.79359245300293, "step": 4339 }, { "epoch": 2.246376811594203, "grad_norm": 2.846130132675171, "learning_rate": 1.6284225200484616e-06, "loss": 0.1731259524822235, "rewards/accuracies": 0.9375, "rewards/chosen": 49.10215759277344, "rewards/margins": 42.623321533203125, "rewards/rejected": 6.492084503173828, "step": 4340 }, { "epoch": 2.2468944099378882, "grad_norm": 2.2260842323303223, "learning_rate": 1.6263163778567037e-06, "loss": 0.14156599342823029, "rewards/accuracies": 0.9296875, "rewards/chosen": 45.21870422363281, "rewards/margins": 39.46232604980469, "rewards/rejected": 5.762718200683594, "step": 4341 }, { "epoch": 2.2474120082815734, "grad_norm": 0.9793403744697571, "learning_rate": 1.6242113339919768e-06, "loss": 0.05706912279129028, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.991050720214844, "rewards/margins": 44.48480224609375, "rewards/rejected": 8.500602722167969, "step": 4342 }, { "epoch": 2.247929606625259, "grad_norm": 0.9916425347328186, "learning_rate": 1.6221073891395933e-06, "loss": 0.03288775682449341, "rewards/accuracies": 0.984375, "rewards/chosen": 52.609161376953125, "rewards/margins": 45.292236328125, "rewards/rejected": 7.3179168701171875, "step": 4343 }, { "epoch": 2.248447204968944, "grad_norm": 1.5635430812835693, "learning_rate": 1.620004543984508e-06, "loss": 0.07727428525686264, "rewards/accuracies": 0.9765625, "rewards/chosen": 48.45970153808594, "rewards/margins": 42.35643005371094, "rewards/rejected": 6.095970153808594, "step": 4344 }, { "epoch": 2.2489648033126293, "grad_norm": 1.9332472085952759, "learning_rate": 1.6179027992113172e-06, "loss": 0.10175144672393799, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.12046813964844, "rewards/margins": 41.68695068359375, "rewards/rejected": 10.434150695800781, "step": 4345 }, { "epoch": 2.249482401656315, "grad_norm": 0.8687083125114441, "learning_rate": 1.6158021555042592e-06, "loss": 0.06353456526994705, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.862396240234375, "rewards/margins": 45.478553771972656, "rewards/rejected": 6.377941131591797, "step": 4346 }, { "epoch": 2.25, "grad_norm": 0.8192620873451233, "learning_rate": 1.6137026135472129e-06, "loss": 0.062171779572963715, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.772239685058594, "rewards/margins": 41.96405029296875, "rewards/rejected": 7.80699348449707, "step": 4347 }, { "epoch": 2.250517598343685, "grad_norm": 2.1675479412078857, "learning_rate": 1.6116041740237015e-06, "loss": 0.10774453729391098, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.436485290527344, "rewards/margins": 42.644439697265625, "rewards/rejected": 7.786784648895264, "step": 4348 }, { "epoch": 2.2510351966873707, "grad_norm": 1.6233912706375122, "learning_rate": 1.6095068376168822e-06, "loss": 0.13409948348999023, "rewards/accuracies": 0.953125, "rewards/chosen": 46.02557373046875, "rewards/margins": 40.2783203125, "rewards/rejected": 5.745075225830078, "step": 4349 }, { "epoch": 2.251552795031056, "grad_norm": 1.014153003692627, "learning_rate": 1.6074106050095633e-06, "loss": 0.06066878139972687, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.127418518066406, "rewards/margins": 48.20367431640625, "rewards/rejected": 7.918302536010742, "step": 4350 }, { "epoch": 2.252070393374741, "grad_norm": 1.4839879274368286, "learning_rate": 1.6053154768841878e-06, "loss": 0.11026874929666519, "rewards/accuracies": 0.953125, "rewards/chosen": 47.373443603515625, "rewards/margins": 39.29364013671875, "rewards/rejected": 8.076114654541016, "step": 4351 }, { "epoch": 2.2525879917184266, "grad_norm": 1.8082391023635864, "learning_rate": 1.6032214539228368e-06, "loss": 0.13409049808979034, "rewards/accuracies": 0.9375, "rewards/chosen": 49.83037185668945, "rewards/margins": 42.4951171875, "rewards/rejected": 7.321125030517578, "step": 4352 }, { "epoch": 2.2531055900621118, "grad_norm": 1.0455694198608398, "learning_rate": 1.6011285368072344e-06, "loss": 0.03965287655591965, "rewards/accuracies": 0.984375, "rewards/chosen": 55.88087463378906, "rewards/margins": 46.04139709472656, "rewards/rejected": 9.844510078430176, "step": 4353 }, { "epoch": 2.253623188405797, "grad_norm": 0.6543310880661011, "learning_rate": 1.5990367262187502e-06, "loss": 0.035889606922864914, "rewards/accuracies": 0.9921875, "rewards/chosen": 54.404457092285156, "rewards/margins": 44.05181884765625, "rewards/rejected": 10.359638214111328, "step": 4354 }, { "epoch": 2.2541407867494825, "grad_norm": 0.5186252593994141, "learning_rate": 1.5969460228383826e-06, "loss": 0.05078766494989395, "rewards/accuracies": 0.96875, "rewards/chosen": 50.697052001953125, "rewards/margins": 44.732452392578125, "rewards/rejected": 5.963438987731934, "step": 4355 }, { "epoch": 2.2546583850931676, "grad_norm": 0.712195873260498, "learning_rate": 1.5948564273467776e-06, "loss": 0.052309900522232056, "rewards/accuracies": 0.9765625, "rewards/chosen": 44.06463623046875, "rewards/margins": 37.09062194824219, "rewards/rejected": 6.976027488708496, "step": 4356 }, { "epoch": 2.255175983436853, "grad_norm": 1.219773530960083, "learning_rate": 1.5927679404242159e-06, "loss": 0.11964541673660278, "rewards/accuracies": 0.9296875, "rewards/chosen": 48.824302673339844, "rewards/margins": 42.353294372558594, "rewards/rejected": 6.469673156738281, "step": 4357 }, { "epoch": 2.2556935817805384, "grad_norm": 2.860874891281128, "learning_rate": 1.5906805627506244e-06, "loss": 0.09523718059062958, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.726463317871094, "rewards/margins": 37.680999755859375, "rewards/rejected": 9.037424087524414, "step": 4358 }, { "epoch": 2.2562111801242235, "grad_norm": 0.9834904074668884, "learning_rate": 1.5885942950055587e-06, "loss": 0.07807217538356781, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.64335632324219, "rewards/margins": 45.336692810058594, "rewards/rejected": 7.3044891357421875, "step": 4359 }, { "epoch": 2.256728778467909, "grad_norm": 1.9387621879577637, "learning_rate": 1.5865091378682212e-06, "loss": 0.11756287515163422, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.00872039794922, "rewards/margins": 40.5811767578125, "rewards/rejected": 9.415641784667969, "step": 4360 }, { "epoch": 2.2572463768115942, "grad_norm": 1.2906397581100464, "learning_rate": 1.5844250920174493e-06, "loss": 0.09535915404558182, "rewards/accuracies": 0.96875, "rewards/chosen": 46.050987243652344, "rewards/margins": 39.03721618652344, "rewards/rejected": 7.010042667388916, "step": 4361 }, { "epoch": 2.2577639751552794, "grad_norm": 1.993343710899353, "learning_rate": 1.5823421581317184e-06, "loss": 0.1512746810913086, "rewards/accuracies": 0.921875, "rewards/chosen": 44.31041717529297, "rewards/margins": 36.7757568359375, "rewards/rejected": 7.535213470458984, "step": 4362 }, { "epoch": 2.258281573498965, "grad_norm": 1.3077049255371094, "learning_rate": 1.5802603368891428e-06, "loss": 0.0835084542632103, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.86284255981445, "rewards/margins": 40.704811096191406, "rewards/rejected": 9.157394409179688, "step": 4363 }, { "epoch": 2.25879917184265, "grad_norm": 1.3114478588104248, "learning_rate": 1.5781796289674755e-06, "loss": 0.10085531324148178, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.12199401855469, "rewards/margins": 38.9898681640625, "rewards/rejected": 8.140819549560547, "step": 4364 }, { "epoch": 2.2593167701863353, "grad_norm": 1.274735927581787, "learning_rate": 1.5761000350441046e-06, "loss": 0.06844308227300644, "rewards/accuracies": 0.9765625, "rewards/chosen": 45.91682815551758, "rewards/margins": 37.430511474609375, "rewards/rejected": 8.485567092895508, "step": 4365 }, { "epoch": 2.259834368530021, "grad_norm": 1.0041487216949463, "learning_rate": 1.5740215557960581e-06, "loss": 0.08225741982460022, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.615562438964844, "rewards/margins": 43.85430908203125, "rewards/rejected": 7.760601043701172, "step": 4366 }, { "epoch": 2.260351966873706, "grad_norm": 1.5000454187393188, "learning_rate": 1.5719441918999994e-06, "loss": 0.14463430643081665, "rewards/accuracies": 0.9453125, "rewards/chosen": 46.249267578125, "rewards/margins": 40.3494873046875, "rewards/rejected": 5.904384613037109, "step": 4367 }, { "epoch": 2.260869565217391, "grad_norm": 1.036491870880127, "learning_rate": 1.569867944032229e-06, "loss": 0.14046965539455414, "rewards/accuracies": 0.9375, "rewards/chosen": 47.915428161621094, "rewards/margins": 39.91973114013672, "rewards/rejected": 7.993579864501953, "step": 4368 }, { "epoch": 2.2613871635610767, "grad_norm": 1.7022680044174194, "learning_rate": 1.5677928128686848e-06, "loss": 0.1019512265920639, "rewards/accuracies": 0.9609375, "rewards/chosen": 44.13570022583008, "rewards/margins": 38.17230224609375, "rewards/rejected": 5.962611198425293, "step": 4369 }, { "epoch": 2.261904761904762, "grad_norm": 1.157007098197937, "learning_rate": 1.5657187990849405e-06, "loss": 0.14438126981258392, "rewards/accuracies": 0.9296875, "rewards/chosen": 46.588714599609375, "rewards/margins": 38.741485595703125, "rewards/rejected": 7.847385406494141, "step": 4370 }, { "epoch": 2.262422360248447, "grad_norm": 1.0529050827026367, "learning_rate": 1.5636459033562056e-06, "loss": 0.07409659028053284, "rewards/accuracies": 0.953125, "rewards/chosen": 48.687469482421875, "rewards/margins": 41.294189453125, "rewards/rejected": 7.396221160888672, "step": 4371 }, { "epoch": 2.2629399585921326, "grad_norm": 1.006239891052246, "learning_rate": 1.561574126357328e-06, "loss": 0.08316505700349808, "rewards/accuracies": 0.96875, "rewards/chosen": 51.61068344116211, "rewards/margins": 44.317901611328125, "rewards/rejected": 7.271125793457031, "step": 4372 }, { "epoch": 2.2634575569358177, "grad_norm": 1.4094903469085693, "learning_rate": 1.559503468762788e-06, "loss": 0.12052053213119507, "rewards/accuracies": 0.9453125, "rewards/chosen": 41.973243713378906, "rewards/margins": 36.78875732421875, "rewards/rejected": 5.203122138977051, "step": 4373 }, { "epoch": 2.2639751552795033, "grad_norm": 1.3718010187149048, "learning_rate": 1.5574339312467052e-06, "loss": 0.086093470454216, "rewards/accuracies": 0.96875, "rewards/chosen": 52.436004638671875, "rewards/margins": 44.006256103515625, "rewards/rejected": 8.43590259552002, "step": 4374 }, { "epoch": 2.2644927536231885, "grad_norm": 1.9696143865585327, "learning_rate": 1.5553655144828279e-06, "loss": 0.12457036972045898, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.76091766357422, "rewards/margins": 41.378814697265625, "rewards/rejected": 7.3760986328125, "step": 4375 }, { "epoch": 2.2650103519668736, "grad_norm": 0.6334286332130432, "learning_rate": 1.553298219144549e-06, "loss": 0.05076894536614418, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.655303955078125, "rewards/margins": 43.0909423828125, "rewards/rejected": 8.551851272583008, "step": 4376 }, { "epoch": 2.2655279503105588, "grad_norm": 0.9099352359771729, "learning_rate": 1.5512320459048908e-06, "loss": 0.08963403105735779, "rewards/accuracies": 0.96875, "rewards/chosen": 53.89506149291992, "rewards/margins": 45.3524169921875, "rewards/rejected": 8.55678939819336, "step": 4377 }, { "epoch": 2.2660455486542443, "grad_norm": 0.7163922786712646, "learning_rate": 1.5491669954365084e-06, "loss": 0.06279445439577103, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.122711181640625, "rewards/margins": 40.412109375, "rewards/rejected": 7.712839126586914, "step": 4378 }, { "epoch": 2.2665631469979295, "grad_norm": 2.520212173461914, "learning_rate": 1.5471030684116933e-06, "loss": 0.09265085309743881, "rewards/accuracies": 0.96875, "rewards/chosen": 41.733299255371094, "rewards/margins": 35.78875732421875, "rewards/rejected": 5.944177627563477, "step": 4379 }, { "epoch": 2.267080745341615, "grad_norm": 0.5516955256462097, "learning_rate": 1.545040265502375e-06, "loss": 0.05250929668545723, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.908447265625, "rewards/margins": 42.15196228027344, "rewards/rejected": 7.772308349609375, "step": 4380 }, { "epoch": 2.2675983436853, "grad_norm": 0.7308472990989685, "learning_rate": 1.5429785873801145e-06, "loss": 0.09160727262496948, "rewards/accuracies": 0.953125, "rewards/chosen": 47.89385223388672, "rewards/margins": 40.223392486572266, "rewards/rejected": 7.669058799743652, "step": 4381 }, { "epoch": 2.2681159420289854, "grad_norm": 1.486562728881836, "learning_rate": 1.540918034716102e-06, "loss": 0.14894500374794006, "rewards/accuracies": 0.9296875, "rewards/chosen": 45.36433792114258, "rewards/margins": 38.2367057800293, "rewards/rejected": 7.129213809967041, "step": 4382 }, { "epoch": 2.268633540372671, "grad_norm": 0.9612070322036743, "learning_rate": 1.5388586081811657e-06, "loss": 0.08119787275791168, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.769554138183594, "rewards/margins": 46.45098876953125, "rewards/rejected": 7.325565338134766, "step": 4383 }, { "epoch": 2.269151138716356, "grad_norm": 1.9996393918991089, "learning_rate": 1.5368003084457717e-06, "loss": 0.171681746840477, "rewards/accuracies": 0.921875, "rewards/chosen": 45.708900451660156, "rewards/margins": 39.554412841796875, "rewards/rejected": 6.142078399658203, "step": 4384 }, { "epoch": 2.2696687370600412, "grad_norm": 0.6558118462562561, "learning_rate": 1.5347431361800096e-06, "loss": 0.08331115543842316, "rewards/accuracies": 0.953125, "rewards/chosen": 52.59601593017578, "rewards/margins": 43.52455139160156, "rewards/rejected": 9.07379150390625, "step": 4385 }, { "epoch": 2.270186335403727, "grad_norm": 1.0513641834259033, "learning_rate": 1.5326870920536074e-06, "loss": 0.12016328424215317, "rewards/accuracies": 0.953125, "rewards/chosen": 44.127655029296875, "rewards/margins": 36.564117431640625, "rewards/rejected": 7.569911956787109, "step": 4386 }, { "epoch": 2.270703933747412, "grad_norm": 0.7284976243972778, "learning_rate": 1.5306321767359262e-06, "loss": 0.06688814610242844, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.598182678222656, "rewards/margins": 43.60400390625, "rewards/rejected": 9.004554748535156, "step": 4387 }, { "epoch": 2.271221532091097, "grad_norm": 1.5220375061035156, "learning_rate": 1.5285783908959578e-06, "loss": 0.07446762919425964, "rewards/accuracies": 0.953125, "rewards/chosen": 56.28160858154297, "rewards/margins": 46.44976806640625, "rewards/rejected": 9.83249282836914, "step": 4388 }, { "epoch": 2.2717391304347827, "grad_norm": 0.7699747681617737, "learning_rate": 1.5265257352023271e-06, "loss": 0.0710785984992981, "rewards/accuracies": 0.96875, "rewards/chosen": 47.025909423828125, "rewards/margins": 41.134521484375, "rewards/rejected": 5.902902603149414, "step": 4389 }, { "epoch": 2.272256728778468, "grad_norm": 0.826869010925293, "learning_rate": 1.5244742103232913e-06, "loss": 0.09989926218986511, "rewards/accuracies": 0.953125, "rewards/chosen": 50.13175964355469, "rewards/margins": 41.182281494140625, "rewards/rejected": 8.961563110351562, "step": 4390 }, { "epoch": 2.2727743271221534, "grad_norm": 0.5118038654327393, "learning_rate": 1.5224238169267392e-06, "loss": 0.07084165513515472, "rewards/accuracies": 0.96875, "rewards/chosen": 54.86473083496094, "rewards/margins": 45.823516845703125, "rewards/rejected": 9.052379608154297, "step": 4391 }, { "epoch": 2.2732919254658386, "grad_norm": 0.8424033522605896, "learning_rate": 1.5203745556801901e-06, "loss": 0.10314102470874786, "rewards/accuracies": 0.9375, "rewards/chosen": 47.97145462036133, "rewards/margins": 42.43492889404297, "rewards/rejected": 5.527109146118164, "step": 4392 }, { "epoch": 2.2738095238095237, "grad_norm": 0.4591241478919983, "learning_rate": 1.518326427250797e-06, "loss": 0.0474623367190361, "rewards/accuracies": 0.984375, "rewards/chosen": 52.6912956237793, "rewards/margins": 44.894012451171875, "rewards/rejected": 7.7953643798828125, "step": 4393 }, { "epoch": 2.274327122153209, "grad_norm": 1.0109977722167969, "learning_rate": 1.516279432305342e-06, "loss": 0.09848455339670181, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.625091552734375, "rewards/margins": 43.025604248046875, "rewards/rejected": 8.582584381103516, "step": 4394 }, { "epoch": 2.2748447204968945, "grad_norm": 0.5568683743476868, "learning_rate": 1.5142335715102397e-06, "loss": 0.06211462989449501, "rewards/accuracies": 0.984375, "rewards/chosen": 52.092742919921875, "rewards/margins": 43.4556884765625, "rewards/rejected": 8.629863739013672, "step": 4395 }, { "epoch": 2.2753623188405796, "grad_norm": 0.5441604852676392, "learning_rate": 1.5121888455315341e-06, "loss": 0.07766614109277725, "rewards/accuracies": 0.953125, "rewards/chosen": 51.72238540649414, "rewards/margins": 44.0915641784668, "rewards/rejected": 7.632923126220703, "step": 4396 }, { "epoch": 2.275879917184265, "grad_norm": 0.8339076042175293, "learning_rate": 1.5101452550349017e-06, "loss": 0.10565663129091263, "rewards/accuracies": 0.9609375, "rewards/chosen": 43.18581771850586, "rewards/margins": 36.72999572753906, "rewards/rejected": 6.4584174156188965, "step": 4397 }, { "epoch": 2.2763975155279503, "grad_norm": 0.7241299748420715, "learning_rate": 1.5081028006856473e-06, "loss": 0.04165460541844368, "rewards/accuracies": 0.9921875, "rewards/chosen": 50.41395568847656, "rewards/margins": 42.88690185546875, "rewards/rejected": 7.526123046875, "step": 4398 }, { "epoch": 2.2769151138716355, "grad_norm": 0.8722521066665649, "learning_rate": 1.5060614831487064e-06, "loss": 0.0717267245054245, "rewards/accuracies": 0.96875, "rewards/chosen": 52.947998046875, "rewards/margins": 44.457733154296875, "rewards/rejected": 8.498950958251953, "step": 4399 }, { "epoch": 2.277432712215321, "grad_norm": 1.2213785648345947, "learning_rate": 1.5040213030886469e-06, "loss": 0.13027429580688477, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.3121337890625, "rewards/margins": 46.217529296875, "rewards/rejected": 7.104768753051758, "step": 4400 }, { "epoch": 2.277950310559006, "grad_norm": 0.9633173942565918, "learning_rate": 1.501982261169659e-06, "loss": 0.11441498249769211, "rewards/accuracies": 0.9375, "rewards/chosen": 58.06291961669922, "rewards/margins": 47.78045654296875, "rewards/rejected": 10.276605606079102, "step": 4401 }, { "epoch": 2.2784679089026914, "grad_norm": 0.911238431930542, "learning_rate": 1.4999443580555717e-06, "loss": 0.05025810748338699, "rewards/accuracies": 0.96875, "rewards/chosen": 49.88042449951172, "rewards/margins": 41.327972412109375, "rewards/rejected": 8.561767578125, "step": 4402 }, { "epoch": 2.278985507246377, "grad_norm": 0.6036351919174194, "learning_rate": 1.4979075944098387e-06, "loss": 0.059826500713825226, "rewards/accuracies": 0.96875, "rewards/chosen": 51.865272521972656, "rewards/margins": 42.7662353515625, "rewards/rejected": 9.097930908203125, "step": 4403 }, { "epoch": 2.279503105590062, "grad_norm": 2.4280247688293457, "learning_rate": 1.495871970895544e-06, "loss": 0.11203649640083313, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.2269401550293, "rewards/margins": 41.788330078125, "rewards/rejected": 8.442329406738281, "step": 4404 }, { "epoch": 2.2800207039337472, "grad_norm": 0.8389788269996643, "learning_rate": 1.4938374881753943e-06, "loss": 0.07049816846847534, "rewards/accuracies": 0.96875, "rewards/chosen": 56.853660583496094, "rewards/margins": 47.794586181640625, "rewards/rejected": 9.054588317871094, "step": 4405 }, { "epoch": 2.280538302277433, "grad_norm": 1.110195279121399, "learning_rate": 1.4918041469117355e-06, "loss": 0.10897856950759888, "rewards/accuracies": 0.953125, "rewards/chosen": 51.16736602783203, "rewards/margins": 44.1090087890625, "rewards/rejected": 7.073230743408203, "step": 4406 }, { "epoch": 2.281055900621118, "grad_norm": 0.6362870931625366, "learning_rate": 1.4897719477665368e-06, "loss": 0.07566368579864502, "rewards/accuracies": 0.953125, "rewards/chosen": 50.0040168762207, "rewards/margins": 41.72239685058594, "rewards/rejected": 8.298494338989258, "step": 4407 }, { "epoch": 2.2815734989648035, "grad_norm": 0.8460480570793152, "learning_rate": 1.4877408914013913e-06, "loss": 0.07220853120088577, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.62636184692383, "rewards/margins": 42.01673889160156, "rewards/rejected": 6.6133575439453125, "step": 4408 }, { "epoch": 2.2820910973084887, "grad_norm": 1.0873829126358032, "learning_rate": 1.4857109784775248e-06, "loss": 0.04615603759884834, "rewards/accuracies": 0.984375, "rewards/chosen": 48.661808013916016, "rewards/margins": 40.099700927734375, "rewards/rejected": 8.581398010253906, "step": 4409 }, { "epoch": 2.282608695652174, "grad_norm": 1.7158745527267456, "learning_rate": 1.4836822096557945e-06, "loss": 0.09936140477657318, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.05792999267578, "rewards/margins": 45.631622314453125, "rewards/rejected": 10.454730987548828, "step": 4410 }, { "epoch": 2.283126293995859, "grad_norm": 1.4745432138442993, "learning_rate": 1.4816545855966757e-06, "loss": 0.08062884211540222, "rewards/accuracies": 0.953125, "rewards/chosen": 54.97229766845703, "rewards/margins": 44.900787353515625, "rewards/rejected": 10.068305969238281, "step": 4411 }, { "epoch": 2.2836438923395446, "grad_norm": 0.6326128244400024, "learning_rate": 1.479628106960278e-06, "loss": 0.06650780141353607, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.47046661376953, "rewards/margins": 45.213775634765625, "rewards/rejected": 9.269218444824219, "step": 4412 }, { "epoch": 2.2841614906832297, "grad_norm": 1.519791603088379, "learning_rate": 1.4776027744063358e-06, "loss": 0.11331082135438919, "rewards/accuracies": 0.9375, "rewards/chosen": 51.242462158203125, "rewards/margins": 42.986427307128906, "rewards/rejected": 8.253768920898438, "step": 4413 }, { "epoch": 2.2846790890269153, "grad_norm": 2.3557510375976562, "learning_rate": 1.4755785885942113e-06, "loss": 0.11525501310825348, "rewards/accuracies": 0.953125, "rewards/chosen": 51.11619567871094, "rewards/margins": 42.528411865234375, "rewards/rejected": 8.587301254272461, "step": 4414 }, { "epoch": 2.2851966873706004, "grad_norm": 1.5702084302902222, "learning_rate": 1.4735555501828925e-06, "loss": 0.10356053709983826, "rewards/accuracies": 0.953125, "rewards/chosen": 55.92294692993164, "rewards/margins": 46.573699951171875, "rewards/rejected": 9.340141296386719, "step": 4415 }, { "epoch": 2.2857142857142856, "grad_norm": 0.9831761717796326, "learning_rate": 1.4715336598309938e-06, "loss": 0.07956774532794952, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.21250915527344, "rewards/margins": 50.75341796875, "rewards/rejected": 10.457855224609375, "step": 4416 }, { "epoch": 2.286231884057971, "grad_norm": 0.6365954279899597, "learning_rate": 1.4695129181967566e-06, "loss": 0.04008343070745468, "rewards/accuracies": 0.984375, "rewards/chosen": 55.2347412109375, "rewards/margins": 44.7310791015625, "rewards/rejected": 10.490974426269531, "step": 4417 }, { "epoch": 2.2867494824016563, "grad_norm": 3.4971814155578613, "learning_rate": 1.4674933259380474e-06, "loss": 0.15125465393066406, "rewards/accuracies": 0.9375, "rewards/chosen": 58.35643005371094, "rewards/margins": 49.719390869140625, "rewards/rejected": 8.629463195800781, "step": 4418 }, { "epoch": 2.2872670807453415, "grad_norm": 0.9585162997245789, "learning_rate": 1.4654748837123594e-06, "loss": 0.03532334044575691, "rewards/accuracies": 0.984375, "rewards/chosen": 56.27821350097656, "rewards/margins": 47.946136474609375, "rewards/rejected": 8.32339096069336, "step": 4419 }, { "epoch": 2.287784679089027, "grad_norm": 1.259376883506775, "learning_rate": 1.4634575921768113e-06, "loss": 0.09979024529457092, "rewards/accuracies": 0.96875, "rewards/chosen": 45.17508316040039, "rewards/margins": 37.358428955078125, "rewards/rejected": 7.813464164733887, "step": 4420 }, { "epoch": 2.288302277432712, "grad_norm": 0.9323895573616028, "learning_rate": 1.4614414519881465e-06, "loss": 0.08548811078071594, "rewards/accuracies": 0.96875, "rewards/chosen": 49.83281326293945, "rewards/margins": 43.098876953125, "rewards/rejected": 6.732043266296387, "step": 4421 }, { "epoch": 2.2888198757763973, "grad_norm": 0.7732895016670227, "learning_rate": 1.4594264638027345e-06, "loss": 0.05462203174829483, "rewards/accuracies": 0.984375, "rewards/chosen": 62.6346435546875, "rewards/margins": 49.992034912109375, "rewards/rejected": 12.639050483703613, "step": 4422 }, { "epoch": 2.289337474120083, "grad_norm": 1.1972264051437378, "learning_rate": 1.4574126282765682e-06, "loss": 0.06982315331697464, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.06700134277344, "rewards/margins": 53.00738525390625, "rewards/rejected": 11.067281723022461, "step": 4423 }, { "epoch": 2.289855072463768, "grad_norm": 0.6932305097579956, "learning_rate": 1.4553999460652674e-06, "loss": 0.06181042268872261, "rewards/accuracies": 0.96875, "rewards/chosen": 51.01054382324219, "rewards/margins": 42.21697998046875, "rewards/rejected": 8.80556869506836, "step": 4424 }, { "epoch": 2.2903726708074537, "grad_norm": 2.0329136848449707, "learning_rate": 1.4533884178240743e-06, "loss": 0.1386221945285797, "rewards/accuracies": 0.9296875, "rewards/chosen": 51.99578857421875, "rewards/margins": 43.633575439453125, "rewards/rejected": 8.351005554199219, "step": 4425 }, { "epoch": 2.290890269151139, "grad_norm": 0.6487421989440918, "learning_rate": 1.4513780442078574e-06, "loss": 0.0549626462161541, "rewards/accuracies": 0.96875, "rewards/chosen": 58.3428955078125, "rewards/margins": 47.53973388671875, "rewards/rejected": 10.802837371826172, "step": 4426 }, { "epoch": 2.291407867494824, "grad_norm": 1.0334330797195435, "learning_rate": 1.4493688258711075e-06, "loss": 0.09255349636077881, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.15742111206055, "rewards/margins": 36.94415283203125, "rewards/rejected": 9.211009979248047, "step": 4427 }, { "epoch": 2.291925465838509, "grad_norm": 0.818243682384491, "learning_rate": 1.4473607634679404e-06, "loss": 0.07137773931026459, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.142669677734375, "rewards/margins": 42.40557861328125, "rewards/rejected": 8.733211517333984, "step": 4428 }, { "epoch": 2.2924430641821947, "grad_norm": 0.9765617251396179, "learning_rate": 1.445353857652096e-06, "loss": 0.05264697223901749, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.40957260131836, "rewards/margins": 47.66331481933594, "rewards/rejected": 8.747701644897461, "step": 4429 }, { "epoch": 2.29296066252588, "grad_norm": 0.9055247902870178, "learning_rate": 1.4433481090769374e-06, "loss": 0.08734151721000671, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.48046875, "rewards/margins": 47.81941223144531, "rewards/rejected": 10.656553268432617, "step": 4430 }, { "epoch": 2.2934782608695654, "grad_norm": 1.5228177309036255, "learning_rate": 1.441343518395446e-06, "loss": 0.08535635471343994, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.14508056640625, "rewards/margins": 44.531097412109375, "rewards/rejected": 9.606658935546875, "step": 4431 }, { "epoch": 2.2939958592132506, "grad_norm": 1.018082857131958, "learning_rate": 1.4393400862602363e-06, "loss": 0.0906146839261055, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.855613708496094, "rewards/margins": 44.81849670410156, "rewards/rejected": 7.043117523193359, "step": 4432 }, { "epoch": 2.2945134575569357, "grad_norm": 2.415544271469116, "learning_rate": 1.4373378133235393e-06, "loss": 0.14679867029190063, "rewards/accuracies": 0.9375, "rewards/chosen": 51.433998107910156, "rewards/margins": 44.24339294433594, "rewards/rejected": 7.1812896728515625, "step": 4433 }, { "epoch": 2.2950310559006213, "grad_norm": 0.5826810002326965, "learning_rate": 1.4353367002372071e-06, "loss": 0.04506656154990196, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.20500183105469, "rewards/margins": 49.597930908203125, "rewards/rejected": 8.611980438232422, "step": 4434 }, { "epoch": 2.2955486542443064, "grad_norm": 2.0251362323760986, "learning_rate": 1.433336747652715e-06, "loss": 0.10592229664325714, "rewards/accuracies": 0.953125, "rewards/chosen": 56.832950592041016, "rewards/margins": 45.22235107421875, "rewards/rejected": 11.616108894348145, "step": 4435 }, { "epoch": 2.2960662525879916, "grad_norm": 2.8155763149261475, "learning_rate": 1.4313379562211681e-06, "loss": 0.10736637562513351, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.49822998046875, "rewards/margins": 45.32814025878906, "rewards/rejected": 8.184416770935059, "step": 4436 }, { "epoch": 2.296583850931677, "grad_norm": 2.775224208831787, "learning_rate": 1.429340326593282e-06, "loss": 0.058242492377758026, "rewards/accuracies": 0.984375, "rewards/chosen": 52.64842987060547, "rewards/margins": 43.61175537109375, "rewards/rejected": 9.030952453613281, "step": 4437 }, { "epoch": 2.2971014492753623, "grad_norm": 0.8933678269386292, "learning_rate": 1.4273438594194012e-06, "loss": 0.09298180043697357, "rewards/accuracies": 0.953125, "rewards/chosen": 54.89881134033203, "rewards/margins": 47.77397155761719, "rewards/rejected": 7.121128082275391, "step": 4438 }, { "epoch": 2.2976190476190474, "grad_norm": 0.6337122321128845, "learning_rate": 1.4253485553494883e-06, "loss": 0.0673384889960289, "rewards/accuracies": 0.96875, "rewards/chosen": 57.0648193359375, "rewards/margins": 47.0667724609375, "rewards/rejected": 9.992774963378906, "step": 4439 }, { "epoch": 2.298136645962733, "grad_norm": 1.066136360168457, "learning_rate": 1.4233544150331342e-06, "loss": 0.08745145797729492, "rewards/accuracies": 0.953125, "rewards/chosen": 59.722503662109375, "rewards/margins": 48.331878662109375, "rewards/rejected": 11.391265869140625, "step": 4440 }, { "epoch": 2.298654244306418, "grad_norm": 1.435314655303955, "learning_rate": 1.4213614391195397e-06, "loss": 0.06221554055809975, "rewards/accuracies": 0.953125, "rewards/chosen": 54.96031188964844, "rewards/margins": 46.540374755859375, "rewards/rejected": 8.417774200439453, "step": 4441 }, { "epoch": 2.2991718426501033, "grad_norm": 1.1975210905075073, "learning_rate": 1.4193696282575347e-06, "loss": 0.09669747948646545, "rewards/accuracies": 0.953125, "rewards/chosen": 53.256412506103516, "rewards/margins": 44.671783447265625, "rewards/rejected": 8.588550567626953, "step": 4442 }, { "epoch": 2.299689440993789, "grad_norm": 4.444005012512207, "learning_rate": 1.4173789830955687e-06, "loss": 0.18121039867401123, "rewards/accuracies": 0.9140625, "rewards/chosen": 49.78171920776367, "rewards/margins": 41.738616943359375, "rewards/rejected": 8.041038513183594, "step": 4443 }, { "epoch": 2.300207039337474, "grad_norm": 0.7962467074394226, "learning_rate": 1.4153895042817084e-06, "loss": 0.08547751605510712, "rewards/accuracies": 0.953125, "rewards/chosen": 53.722625732421875, "rewards/margins": 43.719818115234375, "rewards/rejected": 9.99929428100586, "step": 4444 }, { "epoch": 2.300724637681159, "grad_norm": 0.973629891872406, "learning_rate": 1.4134011924636448e-06, "loss": 0.07215366512537003, "rewards/accuracies": 0.96875, "rewards/chosen": 57.43115997314453, "rewards/margins": 47.31927490234375, "rewards/rejected": 10.110679626464844, "step": 4445 }, { "epoch": 2.301242236024845, "grad_norm": 1.1155420541763306, "learning_rate": 1.4114140482886867e-06, "loss": 0.08080118894577026, "rewards/accuracies": 0.96875, "rewards/chosen": 57.399574279785156, "rewards/margins": 47.901214599609375, "rewards/rejected": 9.492111206054688, "step": 4446 }, { "epoch": 2.30175983436853, "grad_norm": 2.0976269245147705, "learning_rate": 1.4094280724037628e-06, "loss": 0.14846986532211304, "rewards/accuracies": 0.9140625, "rewards/chosen": 59.749019622802734, "rewards/margins": 48.51124572753906, "rewards/rejected": 11.258035659790039, "step": 4447 }, { "epoch": 2.3022774327122155, "grad_norm": 0.6495358347892761, "learning_rate": 1.407443265455422e-06, "loss": 0.05302734300494194, "rewards/accuracies": 0.984375, "rewards/chosen": 55.51130676269531, "rewards/margins": 47.1480712890625, "rewards/rejected": 8.367958068847656, "step": 4448 }, { "epoch": 2.3027950310559007, "grad_norm": 0.9037384390830994, "learning_rate": 1.4054596280898325e-06, "loss": 0.07755404710769653, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.338077545166016, "rewards/margins": 45.018402099609375, "rewards/rejected": 9.330951690673828, "step": 4449 }, { "epoch": 2.303312629399586, "grad_norm": 4.183730602264404, "learning_rate": 1.403477160952782e-06, "loss": 0.10772241652011871, "rewards/accuracies": 0.953125, "rewards/chosen": 58.80108642578125, "rewards/margins": 45.392791748046875, "rewards/rejected": 13.409280776977539, "step": 4450 }, { "epoch": 2.3038302277432714, "grad_norm": 0.7386602163314819, "learning_rate": 1.4014958646896765e-06, "loss": 0.07816220819950104, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.033203125, "rewards/margins": 50.820037841796875, "rewards/rejected": 10.2298583984375, "step": 4451 }, { "epoch": 2.3043478260869565, "grad_norm": 0.5539577603340149, "learning_rate": 1.3995157399455422e-06, "loss": 0.043006472289562225, "rewards/accuracies": 0.984375, "rewards/chosen": 56.4406852722168, "rewards/margins": 47.27337646484375, "rewards/rejected": 9.188714981079102, "step": 4452 }, { "epoch": 2.3048654244306417, "grad_norm": 0.6179702281951904, "learning_rate": 1.397536787365022e-06, "loss": 0.05134482681751251, "rewards/accuracies": 0.96875, "rewards/chosen": 58.05345153808594, "rewards/margins": 49.05322265625, "rewards/rejected": 8.99839973449707, "step": 4453 }, { "epoch": 2.3053830227743273, "grad_norm": 0.8980206847190857, "learning_rate": 1.3955590075923785e-06, "loss": 0.06275879591703415, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.43537139892578, "rewards/margins": 46.57086181640625, "rewards/rejected": 10.85460090637207, "step": 4454 }, { "epoch": 2.3059006211180124, "grad_norm": 2.3483757972717285, "learning_rate": 1.3935824012714928e-06, "loss": 0.14286115765571594, "rewards/accuracies": 0.9296875, "rewards/chosen": 50.758113861083984, "rewards/margins": 42.90605926513672, "rewards/rejected": 7.851345539093018, "step": 4455 }, { "epoch": 2.3064182194616976, "grad_norm": 5.6466755867004395, "learning_rate": 1.391606969045864e-06, "loss": 0.05499562993645668, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.040557861328125, "rewards/margins": 45.24871826171875, "rewards/rejected": 9.80718994140625, "step": 4456 }, { "epoch": 2.306935817805383, "grad_norm": 0.40164241194725037, "learning_rate": 1.389632711558604e-06, "loss": 0.035759761929512024, "rewards/accuracies": 0.984375, "rewards/chosen": 61.21455001831055, "rewards/margins": 48.3131103515625, "rewards/rejected": 12.907730102539062, "step": 4457 }, { "epoch": 2.3074534161490683, "grad_norm": 1.2769078016281128, "learning_rate": 1.3876596294524508e-06, "loss": 0.12018805742263794, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.464599609375, "rewards/margins": 41.70892333984375, "rewards/rejected": 9.766559600830078, "step": 4458 }, { "epoch": 2.3079710144927534, "grad_norm": 0.873339831829071, "learning_rate": 1.3856877233697563e-06, "loss": 0.0836603119969368, "rewards/accuracies": 0.96875, "rewards/chosen": 56.67035675048828, "rewards/margins": 47.0677490234375, "rewards/rejected": 9.595184326171875, "step": 4459 }, { "epoch": 2.308488612836439, "grad_norm": 1.9946388006210327, "learning_rate": 1.3837169939524853e-06, "loss": 0.08792421221733093, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.5280647277832, "rewards/margins": 51.464141845703125, "rewards/rejected": 12.06270694732666, "step": 4460 }, { "epoch": 2.309006211180124, "grad_norm": 0.6323714852333069, "learning_rate": 1.3817474418422227e-06, "loss": 0.06559984385967255, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.95914077758789, "rewards/margins": 41.351806640625, "rewards/rejected": 11.599586486816406, "step": 4461 }, { "epoch": 2.3095238095238093, "grad_norm": 0.932737410068512, "learning_rate": 1.3797790676801743e-06, "loss": 0.06580043584108353, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.12238311767578, "rewards/margins": 46.79595947265625, "rewards/rejected": 11.323065757751465, "step": 4462 }, { "epoch": 2.310041407867495, "grad_norm": 1.5270915031433105, "learning_rate": 1.3778118721071575e-06, "loss": 0.09843219816684723, "rewards/accuracies": 0.96875, "rewards/chosen": 56.44181823730469, "rewards/margins": 47.33282470703125, "rewards/rejected": 9.0985107421875, "step": 4463 }, { "epoch": 2.31055900621118, "grad_norm": 0.9724458456039429, "learning_rate": 1.3758458557636045e-06, "loss": 0.08809046447277069, "rewards/accuracies": 0.953125, "rewards/chosen": 51.45549774169922, "rewards/margins": 42.987152099609375, "rewards/rejected": 8.464309692382812, "step": 4464 }, { "epoch": 2.3110766045548656, "grad_norm": 2.8135170936584473, "learning_rate": 1.3738810192895658e-06, "loss": 0.10824629664421082, "rewards/accuracies": 0.96875, "rewards/chosen": 60.08758544921875, "rewards/margins": 46.2952880859375, "rewards/rejected": 13.786598205566406, "step": 4465 }, { "epoch": 2.3115942028985508, "grad_norm": 1.0954184532165527, "learning_rate": 1.3719173633247123e-06, "loss": 0.07237337529659271, "rewards/accuracies": 0.953125, "rewards/chosen": 46.99925231933594, "rewards/margins": 37.165496826171875, "rewards/rejected": 9.819686889648438, "step": 4466 }, { "epoch": 2.312111801242236, "grad_norm": 0.8573497533798218, "learning_rate": 1.369954888508323e-06, "loss": 0.05960606038570404, "rewards/accuracies": 0.96875, "rewards/chosen": 52.34576416015625, "rewards/margins": 41.91558837890625, "rewards/rejected": 10.432918548583984, "step": 4467 }, { "epoch": 2.3126293995859215, "grad_norm": 1.5100303888320923, "learning_rate": 1.3679935954792966e-06, "loss": 0.08145216107368469, "rewards/accuracies": 0.953125, "rewards/chosen": 59.61405944824219, "rewards/margins": 46.6260986328125, "rewards/rejected": 12.98611068725586, "step": 4468 }, { "epoch": 2.3131469979296067, "grad_norm": 1.0766875743865967, "learning_rate": 1.366033484876146e-06, "loss": 0.06678023934364319, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.919158935546875, "rewards/margins": 46.38044738769531, "rewards/rejected": 12.537487030029297, "step": 4469 }, { "epoch": 2.313664596273292, "grad_norm": 0.8134006261825562, "learning_rate": 1.3640745573369996e-06, "loss": 0.03563786298036575, "rewards/accuracies": 0.9921875, "rewards/chosen": 56.57868957519531, "rewards/margins": 44.531829833984375, "rewards/rejected": 12.068412780761719, "step": 4470 }, { "epoch": 2.3141821946169774, "grad_norm": 3.4185173511505127, "learning_rate": 1.3621168134995999e-06, "loss": 0.05149605870246887, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.701873779296875, "rewards/margins": 48.15655517578125, "rewards/rejected": 13.547470092773438, "step": 4471 }, { "epoch": 2.3146997929606625, "grad_norm": 1.0145559310913086, "learning_rate": 1.3601602540013048e-06, "loss": 0.10059108585119247, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.01190185546875, "rewards/margins": 48.70111083984375, "rewards/rejected": 10.306900024414062, "step": 4472 }, { "epoch": 2.3152173913043477, "grad_norm": 0.793164849281311, "learning_rate": 1.358204879479087e-06, "loss": 0.059246305376291275, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.455448150634766, "rewards/margins": 46.3492431640625, "rewards/rejected": 12.092178344726562, "step": 4473 }, { "epoch": 2.3157349896480333, "grad_norm": 0.824448823928833, "learning_rate": 1.3562506905695317e-06, "loss": 0.059740640223026276, "rewards/accuracies": 0.96875, "rewards/chosen": 58.085655212402344, "rewards/margins": 46.18798828125, "rewards/rejected": 11.904312133789062, "step": 4474 }, { "epoch": 2.3162525879917184, "grad_norm": 2.7893059253692627, "learning_rate": 1.35429768790884e-06, "loss": 0.13622549176216125, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.52882385253906, "rewards/margins": 48.523895263671875, "rewards/rejected": 10.008329391479492, "step": 4475 }, { "epoch": 2.3167701863354035, "grad_norm": 3.8008310794830322, "learning_rate": 1.352345872132826e-06, "loss": 0.10873851180076599, "rewards/accuracies": 0.953125, "rewards/chosen": 53.7750244140625, "rewards/margins": 41.517578125, "rewards/rejected": 12.250039100646973, "step": 4476 }, { "epoch": 2.317287784679089, "grad_norm": 0.8563255071640015, "learning_rate": 1.3503952438769163e-06, "loss": 0.06796276569366455, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.553157806396484, "rewards/margins": 44.340545654296875, "rewards/rejected": 15.223301887512207, "step": 4477 }, { "epoch": 2.3178053830227743, "grad_norm": 1.5793606042861938, "learning_rate": 1.3484458037761533e-06, "loss": 0.08674174547195435, "rewards/accuracies": 0.953125, "rewards/chosen": 60.3702392578125, "rewards/margins": 48.67816162109375, "rewards/rejected": 11.697101593017578, "step": 4478 }, { "epoch": 2.3183229813664594, "grad_norm": 1.7519280910491943, "learning_rate": 1.3464975524651907e-06, "loss": 0.047242626547813416, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.33261489868164, "rewards/margins": 49.473785400390625, "rewards/rejected": 10.867401123046875, "step": 4479 }, { "epoch": 2.318840579710145, "grad_norm": 0.7935702800750732, "learning_rate": 1.3445504905782963e-06, "loss": 0.07108159363269806, "rewards/accuracies": 0.953125, "rewards/chosen": 54.1611328125, "rewards/margins": 42.359375, "rewards/rejected": 11.793628692626953, "step": 4480 }, { "epoch": 2.31935817805383, "grad_norm": 2.722649097442627, "learning_rate": 1.3426046187493492e-06, "loss": 0.22233474254608154, "rewards/accuracies": 0.8984375, "rewards/chosen": 56.727516174316406, "rewards/margins": 45.08097839355469, "rewards/rejected": 11.644721984863281, "step": 4481 }, { "epoch": 2.3198757763975157, "grad_norm": 0.7917501926422119, "learning_rate": 1.340659937611844e-06, "loss": 0.07648734748363495, "rewards/accuracies": 0.953125, "rewards/chosen": 53.92498779296875, "rewards/margins": 45.1337890625, "rewards/rejected": 8.771438598632812, "step": 4482 }, { "epoch": 2.320393374741201, "grad_norm": 2.2151875495910645, "learning_rate": 1.3387164477988813e-06, "loss": 0.09118393063545227, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.878875732421875, "rewards/margins": 43.67781066894531, "rewards/rejected": 13.19930648803711, "step": 4483 }, { "epoch": 2.320910973084886, "grad_norm": 1.1305947303771973, "learning_rate": 1.336774149943183e-06, "loss": 0.13998007774353027, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.491607666015625, "rewards/margins": 38.976932525634766, "rewards/rejected": 9.509321212768555, "step": 4484 }, { "epoch": 2.3214285714285716, "grad_norm": 0.6743568778038025, "learning_rate": 1.3348330446770757e-06, "loss": 0.04951747506856918, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.5152587890625, "rewards/margins": 45.379150390625, "rewards/rejected": 10.144607543945312, "step": 4485 }, { "epoch": 2.3219461697722568, "grad_norm": 1.0700974464416504, "learning_rate": 1.3328931326325022e-06, "loss": 0.07311241328716278, "rewards/accuracies": 0.96875, "rewards/chosen": 52.46842956542969, "rewards/margins": 42.16545104980469, "rewards/rejected": 10.301445007324219, "step": 4486 }, { "epoch": 2.322463768115942, "grad_norm": 0.7449687719345093, "learning_rate": 1.3309544144410107e-06, "loss": 0.049576081335544586, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.43309020996094, "rewards/margins": 45.31489562988281, "rewards/rejected": 14.116440773010254, "step": 4487 }, { "epoch": 2.3229813664596275, "grad_norm": 2.665402412414551, "learning_rate": 1.3290168907337692e-06, "loss": 0.05332653224468231, "rewards/accuracies": 0.984375, "rewards/chosen": 63.03395080566406, "rewards/margins": 48.7528076171875, "rewards/rejected": 14.298538208007812, "step": 4488 }, { "epoch": 2.3234989648033126, "grad_norm": 0.8705942630767822, "learning_rate": 1.3270805621415523e-06, "loss": 0.061161018908023834, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.30955123901367, "rewards/margins": 48.69720458984375, "rewards/rejected": 11.604118347167969, "step": 4489 }, { "epoch": 2.324016563146998, "grad_norm": 1.1664364337921143, "learning_rate": 1.325145429294743e-06, "loss": 0.08017555624246597, "rewards/accuracies": 0.96875, "rewards/chosen": 58.232269287109375, "rewards/margins": 44.19610595703125, "rewards/rejected": 14.034652709960938, "step": 4490 }, { "epoch": 2.3245341614906834, "grad_norm": 1.2116565704345703, "learning_rate": 1.3232114928233375e-06, "loss": 0.1185014545917511, "rewards/accuracies": 0.9375, "rewards/chosen": 60.05413818359375, "rewards/margins": 45.902008056640625, "rewards/rejected": 14.157058715820312, "step": 4491 }, { "epoch": 2.3250517598343685, "grad_norm": 0.91109299659729, "learning_rate": 1.3212787533569472e-06, "loss": 0.0872449278831482, "rewards/accuracies": 0.953125, "rewards/chosen": 63.76703643798828, "rewards/margins": 48.83734130859375, "rewards/rejected": 14.92791748046875, "step": 4492 }, { "epoch": 2.3255693581780537, "grad_norm": 1.9850672483444214, "learning_rate": 1.3193472115247853e-06, "loss": 0.08460339903831482, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.17851257324219, "rewards/margins": 45.32275390625, "rewards/rejected": 12.866754531860352, "step": 4493 }, { "epoch": 2.3260869565217392, "grad_norm": 0.7292595505714417, "learning_rate": 1.3174168679556803e-06, "loss": 0.0742000937461853, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.481903076171875, "rewards/margins": 40.73944091796875, "rewards/rejected": 13.746469497680664, "step": 4494 }, { "epoch": 2.3266045548654244, "grad_norm": 3.090219736099243, "learning_rate": 1.3154877232780699e-06, "loss": 0.10273085534572601, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.406089782714844, "rewards/margins": 44.08312225341797, "rewards/rejected": 11.326580047607422, "step": 4495 }, { "epoch": 2.3271221532091095, "grad_norm": 0.7761503458023071, "learning_rate": 1.3135597781200004e-06, "loss": 0.08125899732112885, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.39872741699219, "rewards/margins": 43.1387939453125, "rewards/rejected": 11.26270866394043, "step": 4496 }, { "epoch": 2.327639751552795, "grad_norm": 3.008115291595459, "learning_rate": 1.3116330331091287e-06, "loss": 0.10471577942371368, "rewards/accuracies": 0.953125, "rewards/chosen": 56.669044494628906, "rewards/margins": 45.1962890625, "rewards/rejected": 11.468795776367188, "step": 4497 }, { "epoch": 2.3281573498964803, "grad_norm": 1.0247597694396973, "learning_rate": 1.3097074888727207e-06, "loss": 0.06311335414648056, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.10337829589844, "rewards/margins": 44.65087890625, "rewards/rejected": 16.451255798339844, "step": 4498 }, { "epoch": 2.328674948240166, "grad_norm": 1.0588921308517456, "learning_rate": 1.3077831460376505e-06, "loss": 0.050665490329265594, "rewards/accuracies": 0.984375, "rewards/chosen": 54.30940246582031, "rewards/margins": 40.817779541015625, "rewards/rejected": 13.486238479614258, "step": 4499 }, { "epoch": 2.329192546583851, "grad_norm": 0.8308804035186768, "learning_rate": 1.3058600052304026e-06, "loss": 0.09060147404670715, "rewards/accuracies": 0.9375, "rewards/chosen": 56.04277420043945, "rewards/margins": 42.60302734375, "rewards/rejected": 13.4324951171875, "step": 4500 }, { "epoch": 2.329710144927536, "grad_norm": 0.5369257926940918, "learning_rate": 1.303938067077069e-06, "loss": 0.052652254700660706, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.30353546142578, "rewards/margins": 43.46376037597656, "rewards/rejected": 12.841033935546875, "step": 4501 }, { "epoch": 2.3302277432712213, "grad_norm": 0.8451611995697021, "learning_rate": 1.3020173322033508e-06, "loss": 0.09455438703298569, "rewards/accuracies": 0.953125, "rewards/chosen": 56.67520523071289, "rewards/margins": 43.496185302734375, "rewards/rejected": 13.17095947265625, "step": 4502 }, { "epoch": 2.330745341614907, "grad_norm": 0.8916352987289429, "learning_rate": 1.3000978012345561e-06, "loss": 0.057022228837013245, "rewards/accuracies": 0.9609375, "rewards/chosen": 65.53813171386719, "rewards/margins": 48.7371826171875, "rewards/rejected": 16.796335220336914, "step": 4503 }, { "epoch": 2.331262939958592, "grad_norm": 1.7034698724746704, "learning_rate": 1.2981794747956034e-06, "loss": 0.1113872230052948, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.57203674316406, "rewards/margins": 42.725372314453125, "rewards/rejected": 9.85061264038086, "step": 4504 }, { "epoch": 2.3317805383022776, "grad_norm": 1.2436792850494385, "learning_rate": 1.296262353511018e-06, "loss": 0.07838724553585052, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.32981872558594, "rewards/margins": 47.38008117675781, "rewards/rejected": 13.95699691772461, "step": 4505 }, { "epoch": 2.3322981366459627, "grad_norm": 0.7887042760848999, "learning_rate": 1.2943464380049292e-06, "loss": 0.05023587495088577, "rewards/accuracies": 0.984375, "rewards/chosen": 68.07550811767578, "rewards/margins": 50.952484130859375, "rewards/rejected": 17.116565704345703, "step": 4506 }, { "epoch": 2.332815734989648, "grad_norm": 0.8528623580932617, "learning_rate": 1.2924317289010806e-06, "loss": 0.07123811542987823, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.369102478027344, "rewards/margins": 44.592811584472656, "rewards/rejected": 13.759376525878906, "step": 4507 }, { "epoch": 2.3333333333333335, "grad_norm": 1.0849398374557495, "learning_rate": 1.2905182268228188e-06, "loss": 0.0899675264954567, "rewards/accuracies": 0.96875, "rewards/chosen": 45.02879333496094, "rewards/margins": 33.891204833984375, "rewards/rejected": 11.139911651611328, "step": 4508 }, { "epoch": 2.3338509316770186, "grad_norm": 0.7739119529724121, "learning_rate": 1.2886059323930978e-06, "loss": 0.030761191621422768, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.261741638183594, "rewards/margins": 49.39644241333008, "rewards/rejected": 12.85847282409668, "step": 4509 }, { "epoch": 2.3343685300207038, "grad_norm": 9.731147766113281, "learning_rate": 1.2866948462344787e-06, "loss": 0.12641727924346924, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.95745849609375, "rewards/margins": 44.104095458984375, "rewards/rejected": 13.854011535644531, "step": 4510 }, { "epoch": 2.3348861283643894, "grad_norm": 0.5891897082328796, "learning_rate": 1.28478496896913e-06, "loss": 0.06352916359901428, "rewards/accuracies": 0.96875, "rewards/chosen": 62.182403564453125, "rewards/margins": 47.946746826171875, "rewards/rejected": 14.234414100646973, "step": 4511 }, { "epoch": 2.3354037267080745, "grad_norm": 3.1684534549713135, "learning_rate": 1.2828763012188267e-06, "loss": 0.09673532098531723, "rewards/accuracies": 0.96875, "rewards/chosen": 60.41631317138672, "rewards/margins": 48.231475830078125, "rewards/rejected": 12.17047119140625, "step": 4512 }, { "epoch": 2.3359213250517596, "grad_norm": 0.8670594692230225, "learning_rate": 1.2809688436049455e-06, "loss": 0.02699417434632778, "rewards/accuracies": 0.9921875, "rewards/chosen": 56.65453338623047, "rewards/margins": 42.80278015136719, "rewards/rejected": 13.847387313842773, "step": 4513 }, { "epoch": 2.3364389233954452, "grad_norm": 0.5270143747329712, "learning_rate": 1.279062596748477e-06, "loss": 0.046069689095020294, "rewards/accuracies": 0.9765625, "rewards/chosen": 65.32471466064453, "rewards/margins": 47.54461669921875, "rewards/rejected": 17.784875869750977, "step": 4514 }, { "epoch": 2.3369565217391304, "grad_norm": 0.8455201983451843, "learning_rate": 1.2771575612700143e-06, "loss": 0.0697002112865448, "rewards/accuracies": 0.953125, "rewards/chosen": 58.72758483886719, "rewards/margins": 43.331451416015625, "rewards/rejected": 15.378631591796875, "step": 4515 }, { "epoch": 2.337474120082816, "grad_norm": 2.3255887031555176, "learning_rate": 1.2752537377897522e-06, "loss": 0.10430276393890381, "rewards/accuracies": 0.953125, "rewards/chosen": 54.92399597167969, "rewards/margins": 43.22943115234375, "rewards/rejected": 11.693572998046875, "step": 4516 }, { "epoch": 2.337991718426501, "grad_norm": 0.8447642922401428, "learning_rate": 1.2733511269274934e-06, "loss": 0.05173516273498535, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.03102111816406, "rewards/margins": 43.12107849121094, "rewards/rejected": 12.905025482177734, "step": 4517 }, { "epoch": 2.3385093167701863, "grad_norm": 1.2108650207519531, "learning_rate": 1.2714497293026517e-06, "loss": 0.10134106129407883, "rewards/accuracies": 0.9375, "rewards/chosen": 57.540428161621094, "rewards/margins": 45.83819580078125, "rewards/rejected": 11.702033996582031, "step": 4518 }, { "epoch": 2.3390269151138714, "grad_norm": 0.7847692370414734, "learning_rate": 1.2695495455342365e-06, "loss": 0.08994948118925095, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.00922393798828, "rewards/margins": 46.32087707519531, "rewards/rejected": 14.693153381347656, "step": 4519 }, { "epoch": 2.339544513457557, "grad_norm": 0.7479998469352722, "learning_rate": 1.2676505762408676e-06, "loss": 0.08120384812355042, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.760162353515625, "rewards/margins": 46.5977783203125, "rewards/rejected": 10.16397476196289, "step": 4520 }, { "epoch": 2.340062111801242, "grad_norm": 2.8901422023773193, "learning_rate": 1.2657528220407666e-06, "loss": 0.14691239595413208, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.34579086303711, "rewards/margins": 44.9288330078125, "rewards/rejected": 13.42059326171875, "step": 4521 }, { "epoch": 2.3405797101449277, "grad_norm": 3.7144150733947754, "learning_rate": 1.263856283551766e-06, "loss": 0.1334177404642105, "rewards/accuracies": 0.953125, "rewards/chosen": 57.3851318359375, "rewards/margins": 44.673126220703125, "rewards/rejected": 12.70475959777832, "step": 4522 }, { "epoch": 2.341097308488613, "grad_norm": 0.8958806395530701, "learning_rate": 1.261960961391292e-06, "loss": 0.0728466734290123, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.01030731201172, "rewards/margins": 44.0078125, "rewards/rejected": 13.007274627685547, "step": 4523 }, { "epoch": 2.341614906832298, "grad_norm": 0.6612764596939087, "learning_rate": 1.2600668561763828e-06, "loss": 0.043824128806591034, "rewards/accuracies": 0.984375, "rewards/chosen": 55.29527282714844, "rewards/margins": 44.27130126953125, "rewards/rejected": 11.018339157104492, "step": 4524 }, { "epoch": 2.3421325051759836, "grad_norm": 1.1658426523208618, "learning_rate": 1.2581739685236776e-06, "loss": 0.0747307538986206, "rewards/accuracies": 0.96875, "rewards/chosen": 55.77922058105469, "rewards/margins": 45.18549346923828, "rewards/rejected": 10.579715728759766, "step": 4525 }, { "epoch": 2.3426501035196687, "grad_norm": 2.5503413677215576, "learning_rate": 1.2562822990494206e-06, "loss": 0.058103904128074646, "rewards/accuracies": 0.96875, "rewards/chosen": 56.048728942871094, "rewards/margins": 45.35546875, "rewards/rejected": 10.688156127929688, "step": 4526 }, { "epoch": 2.343167701863354, "grad_norm": 0.8591353297233582, "learning_rate": 1.2543918483694568e-06, "loss": 0.060181356966495514, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.82372283935547, "rewards/margins": 47.789459228515625, "rewards/rejected": 12.026424407958984, "step": 4527 }, { "epoch": 2.3436853002070395, "grad_norm": 1.2132594585418701, "learning_rate": 1.252502617099237e-06, "loss": 0.0764990895986557, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.24591827392578, "rewards/margins": 46.5064697265625, "rewards/rejected": 11.76214599609375, "step": 4528 }, { "epoch": 2.3442028985507246, "grad_norm": 1.1682208776474, "learning_rate": 1.2506146058538144e-06, "loss": 0.06487104296684265, "rewards/accuracies": 0.984375, "rewards/chosen": 54.99339294433594, "rewards/margins": 43.48602294921875, "rewards/rejected": 11.51266098022461, "step": 4529 }, { "epoch": 2.3447204968944098, "grad_norm": 0.6867892146110535, "learning_rate": 1.2487278152478444e-06, "loss": 0.043714478611946106, "rewards/accuracies": 0.96875, "rewards/chosen": 59.24911117553711, "rewards/margins": 45.3106689453125, "rewards/rejected": 13.936187744140625, "step": 4530 }, { "epoch": 2.3452380952380953, "grad_norm": 6.898741245269775, "learning_rate": 1.2468422458955848e-06, "loss": 0.09728607535362244, "rewards/accuracies": 0.953125, "rewards/chosen": 61.34814453125, "rewards/margins": 48.524749755859375, "rewards/rejected": 12.822315216064453, "step": 4531 }, { "epoch": 2.3457556935817805, "grad_norm": 0.8523309230804443, "learning_rate": 1.2449578984108968e-06, "loss": 0.04282711073756218, "rewards/accuracies": 0.984375, "rewards/chosen": 56.906944274902344, "rewards/margins": 46.230560302734375, "rewards/rejected": 10.676528930664062, "step": 4532 }, { "epoch": 2.346273291925466, "grad_norm": 0.8478978872299194, "learning_rate": 1.243074773407243e-06, "loss": 0.07614485174417496, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.72865676879883, "rewards/margins": 46.50269317626953, "rewards/rejected": 11.211334228515625, "step": 4533 }, { "epoch": 2.346790890269151, "grad_norm": 0.8880041837692261, "learning_rate": 1.2411928714976883e-06, "loss": 0.059642039239406586, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.00887680053711, "rewards/margins": 47.09596252441406, "rewards/rejected": 9.909156799316406, "step": 4534 }, { "epoch": 2.3473084886128364, "grad_norm": 0.887844979763031, "learning_rate": 1.2393121932949009e-06, "loss": 0.05984148755669594, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.86628723144531, "rewards/margins": 49.21273422241211, "rewards/rejected": 9.664740562438965, "step": 4535 }, { "epoch": 2.3478260869565215, "grad_norm": 2.8554935455322266, "learning_rate": 1.2374327394111456e-06, "loss": 0.08114004135131836, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.470977783203125, "rewards/margins": 45.09124755859375, "rewards/rejected": 12.365447998046875, "step": 4536 }, { "epoch": 2.348343685300207, "grad_norm": 1.1835546493530273, "learning_rate": 1.2355545104582955e-06, "loss": 0.07134207338094711, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.78913497924805, "rewards/margins": 47.84339904785156, "rewards/rejected": 8.955375671386719, "step": 4537 }, { "epoch": 2.3488612836438922, "grad_norm": 3.3680806159973145, "learning_rate": 1.2336775070478218e-06, "loss": 0.0935327559709549, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.177215576171875, "rewards/margins": 50.8663330078125, "rewards/rejected": 10.317192077636719, "step": 4538 }, { "epoch": 2.349378881987578, "grad_norm": 1.053259253501892, "learning_rate": 1.231801729790793e-06, "loss": 0.1027718037366867, "rewards/accuracies": 0.953125, "rewards/chosen": 53.63151931762695, "rewards/margins": 42.424835205078125, "rewards/rejected": 11.20768928527832, "step": 4539 }, { "epoch": 2.349896480331263, "grad_norm": 1.702999472618103, "learning_rate": 1.229927179297885e-06, "loss": 0.06804877519607544, "rewards/accuracies": 0.984375, "rewards/chosen": 62.50096130371094, "rewards/margins": 49.70849609375, "rewards/rejected": 12.794113159179688, "step": 4540 }, { "epoch": 2.350414078674948, "grad_norm": 1.229566216468811, "learning_rate": 1.228053856179372e-06, "loss": 0.024584786966443062, "rewards/accuracies": 1.0, "rewards/chosen": 63.57304000854492, "rewards/margins": 52.24822998046875, "rewards/rejected": 11.310745239257812, "step": 4541 }, { "epoch": 2.3509316770186337, "grad_norm": 0.5914803147315979, "learning_rate": 1.2261817610451242e-06, "loss": 0.028646662831306458, "rewards/accuracies": 0.9921875, "rewards/chosen": 53.01239013671875, "rewards/margins": 44.181243896484375, "rewards/rejected": 8.847652435302734, "step": 4542 }, { "epoch": 2.351449275362319, "grad_norm": 1.5433639287948608, "learning_rate": 1.2243108945046167e-06, "loss": 0.07339740544557571, "rewards/accuracies": 0.953125, "rewards/chosen": 55.63269805908203, "rewards/margins": 44.30072021484375, "rewards/rejected": 11.332927703857422, "step": 4543 }, { "epoch": 2.351966873706004, "grad_norm": 1.1959121227264404, "learning_rate": 1.2224412571669265e-06, "loss": 0.09536854922771454, "rewards/accuracies": 0.9375, "rewards/chosen": 61.774505615234375, "rewards/margins": 48.89385986328125, "rewards/rejected": 12.880462646484375, "step": 4544 }, { "epoch": 2.3524844720496896, "grad_norm": 1.3117247819900513, "learning_rate": 1.2205728496407265e-06, "loss": 0.08813425898551941, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.992271423339844, "rewards/margins": 40.83130645751953, "rewards/rejected": 10.150293350219727, "step": 4545 }, { "epoch": 2.3530020703933747, "grad_norm": 1.8152952194213867, "learning_rate": 1.2187056725342877e-06, "loss": 0.11404597759246826, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.16282653808594, "rewards/margins": 42.57489013671875, "rewards/rejected": 7.581438064575195, "step": 4546 }, { "epoch": 2.35351966873706, "grad_norm": 1.17727530002594, "learning_rate": 1.2168397264554831e-06, "loss": 0.10726624727249146, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.10694885253906, "rewards/margins": 41.901458740234375, "rewards/rejected": 11.198756217956543, "step": 4547 }, { "epoch": 2.3540372670807455, "grad_norm": 1.3865406513214111, "learning_rate": 1.2149750120117899e-06, "loss": 0.091751828789711, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.09907531738281, "rewards/margins": 41.738800048828125, "rewards/rejected": 7.368141174316406, "step": 4548 }, { "epoch": 2.3545548654244306, "grad_norm": 1.4761852025985718, "learning_rate": 1.2131115298102741e-06, "loss": 0.10555779933929443, "rewards/accuracies": 0.953125, "rewards/chosen": 62.37936019897461, "rewards/margins": 50.616363525390625, "rewards/rejected": 11.744152069091797, "step": 4549 }, { "epoch": 2.355072463768116, "grad_norm": 0.7782989144325256, "learning_rate": 1.211249280457608e-06, "loss": 0.0732499361038208, "rewards/accuracies": 0.953125, "rewards/chosen": 56.65382385253906, "rewards/margins": 47.861846923828125, "rewards/rejected": 8.796615600585938, "step": 4550 }, { "epoch": 2.3555900621118013, "grad_norm": 0.7712252736091614, "learning_rate": 1.2093882645600597e-06, "loss": 0.05991790443658829, "rewards/accuracies": 0.96875, "rewards/chosen": 48.76560974121094, "rewards/margins": 41.28240966796875, "rewards/rejected": 7.490474224090576, "step": 4551 }, { "epoch": 2.3561076604554865, "grad_norm": 1.458922028541565, "learning_rate": 1.2075284827234969e-06, "loss": 0.11366960406303406, "rewards/accuracies": 0.953125, "rewards/chosen": 45.16038131713867, "rewards/margins": 36.69866943359375, "rewards/rejected": 8.461990356445312, "step": 4552 }, { "epoch": 2.3566252587991716, "grad_norm": 3.0902915000915527, "learning_rate": 1.205669935553384e-06, "loss": 0.09492938965559006, "rewards/accuracies": 0.984375, "rewards/chosen": 50.77992248535156, "rewards/margins": 43.454498291015625, "rewards/rejected": 7.333665370941162, "step": 4553 }, { "epoch": 2.357142857142857, "grad_norm": 1.4668477773666382, "learning_rate": 1.2038126236547854e-06, "loss": 0.08944359421730042, "rewards/accuracies": 0.96875, "rewards/chosen": 59.85954284667969, "rewards/margins": 48.849609375, "rewards/rejected": 10.995321273803711, "step": 4554 }, { "epoch": 2.3576604554865424, "grad_norm": 2.17141056060791, "learning_rate": 1.2019565476323619e-06, "loss": 0.06825114786624908, "rewards/accuracies": 0.96875, "rewards/chosen": 50.95755386352539, "rewards/margins": 42.25193786621094, "rewards/rejected": 8.711311340332031, "step": 4555 }, { "epoch": 2.358178053830228, "grad_norm": 0.6202481985092163, "learning_rate": 1.2001017080903727e-06, "loss": 0.03830474987626076, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.784149169921875, "rewards/margins": 47.492919921875, "rewards/rejected": 10.281415939331055, "step": 4556 }, { "epoch": 2.358695652173913, "grad_norm": 0.6717191338539124, "learning_rate": 1.1982481056326738e-06, "loss": 0.05134889483451843, "rewards/accuracies": 0.96875, "rewards/chosen": 59.167449951171875, "rewards/margins": 48.26104736328125, "rewards/rejected": 10.900567054748535, "step": 4557 }, { "epoch": 2.3592132505175982, "grad_norm": 1.0586930513381958, "learning_rate": 1.1963957408627197e-06, "loss": 0.07460129261016846, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.9053955078125, "rewards/margins": 43.598419189453125, "rewards/rejected": 9.302505493164062, "step": 4558 }, { "epoch": 2.359730848861284, "grad_norm": 1.1488677263259888, "learning_rate": 1.1945446143835598e-06, "loss": 0.07027656584978104, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.980369567871094, "rewards/margins": 39.19940185546875, "rewards/rejected": 7.7856292724609375, "step": 4559 }, { "epoch": 2.360248447204969, "grad_norm": 1.9136136770248413, "learning_rate": 1.1926947267978434e-06, "loss": 0.13967803120613098, "rewards/accuracies": 0.9296875, "rewards/chosen": 47.62934875488281, "rewards/margins": 38.897186279296875, "rewards/rejected": 8.7374267578125, "step": 4560 }, { "epoch": 2.360766045548654, "grad_norm": 1.2288540601730347, "learning_rate": 1.1908460787078141e-06, "loss": 0.08961230516433716, "rewards/accuracies": 0.9375, "rewards/chosen": 54.4405632019043, "rewards/margins": 47.18914794921875, "rewards/rejected": 7.2484283447265625, "step": 4561 }, { "epoch": 2.3612836438923397, "grad_norm": 1.7581251859664917, "learning_rate": 1.18899867071531e-06, "loss": 0.11061900109052658, "rewards/accuracies": 0.9375, "rewards/chosen": 47.623046875, "rewards/margins": 39.892181396484375, "rewards/rejected": 7.726379871368408, "step": 4562 }, { "epoch": 2.361801242236025, "grad_norm": 1.3509539365768433, "learning_rate": 1.1871525034217718e-06, "loss": 0.07167726755142212, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.30311584472656, "rewards/margins": 45.62074279785156, "rewards/rejected": 9.684776306152344, "step": 4563 }, { "epoch": 2.36231884057971, "grad_norm": 1.316733956336975, "learning_rate": 1.1853075774282325e-06, "loss": 0.07947896420955658, "rewards/accuracies": 0.96875, "rewards/chosen": 53.69775390625, "rewards/margins": 45.29443359375, "rewards/rejected": 8.414484024047852, "step": 4564 }, { "epoch": 2.3628364389233956, "grad_norm": 0.6798623204231262, "learning_rate": 1.1834638933353183e-06, "loss": 0.056435681879520416, "rewards/accuracies": 0.96875, "rewards/chosen": 50.90447998046875, "rewards/margins": 42.22998046875, "rewards/rejected": 8.673171997070312, "step": 4565 }, { "epoch": 2.3633540372670807, "grad_norm": 1.0408717393875122, "learning_rate": 1.1816214517432534e-06, "loss": 0.06739804148674011, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.87803649902344, "rewards/margins": 43.50421142578125, "rewards/rejected": 8.376136779785156, "step": 4566 }, { "epoch": 2.363871635610766, "grad_norm": 0.9259964227676392, "learning_rate": 1.1797802532518616e-06, "loss": 0.06762052327394485, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.41039276123047, "rewards/margins": 42.155372619628906, "rewards/rejected": 11.237401962280273, "step": 4567 }, { "epoch": 2.3643892339544514, "grad_norm": 0.8403137922286987, "learning_rate": 1.1779402984605581e-06, "loss": 0.07883191853761673, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.5009765625, "rewards/margins": 45.080322265625, "rewards/rejected": 9.422111511230469, "step": 4568 }, { "epoch": 2.3649068322981366, "grad_norm": 0.8397412896156311, "learning_rate": 1.1761015879683486e-06, "loss": 0.08822938799858093, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.188018798828125, "rewards/margins": 45.548004150390625, "rewards/rejected": 9.653364181518555, "step": 4569 }, { "epoch": 2.3654244306418217, "grad_norm": 0.9478506445884705, "learning_rate": 1.1742641223738437e-06, "loss": 0.05672925338149071, "rewards/accuracies": 0.96875, "rewards/chosen": 56.64524841308594, "rewards/margins": 48.1754150390625, "rewards/rejected": 8.458370208740234, "step": 4570 }, { "epoch": 2.3659420289855073, "grad_norm": 1.0833566188812256, "learning_rate": 1.1724279022752427e-06, "loss": 0.08342158794403076, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.897857666015625, "rewards/margins": 44.17268371582031, "rewards/rejected": 8.726852416992188, "step": 4571 }, { "epoch": 2.3664596273291925, "grad_norm": 1.1835134029388428, "learning_rate": 1.1705929282703382e-06, "loss": 0.09797471761703491, "rewards/accuracies": 0.953125, "rewards/chosen": 48.44605255126953, "rewards/margins": 40.906219482421875, "rewards/rejected": 7.533233642578125, "step": 4572 }, { "epoch": 2.366977225672878, "grad_norm": 0.6845276951789856, "learning_rate": 1.168759200956519e-06, "loss": 0.06710510700941086, "rewards/accuracies": 0.9921875, "rewards/chosen": 63.86112976074219, "rewards/margins": 51.20458984375, "rewards/rejected": 12.65628433227539, "step": 4573 }, { "epoch": 2.367494824016563, "grad_norm": 1.0575770139694214, "learning_rate": 1.1669267209307723e-06, "loss": 0.07041453570127487, "rewards/accuracies": 0.96875, "rewards/chosen": 55.69032287597656, "rewards/margins": 45.89878845214844, "rewards/rejected": 9.807260513305664, "step": 4574 }, { "epoch": 2.3680124223602483, "grad_norm": 1.4471657276153564, "learning_rate": 1.1650954887896708e-06, "loss": 0.13464823365211487, "rewards/accuracies": 0.9296875, "rewards/chosen": 57.252777099609375, "rewards/margins": 48.26751708984375, "rewards/rejected": 8.993465423583984, "step": 4575 }, { "epoch": 2.368530020703934, "grad_norm": 0.5721271634101868, "learning_rate": 1.1632655051293873e-06, "loss": 0.056443240493535995, "rewards/accuracies": 0.96875, "rewards/chosen": 57.056785583496094, "rewards/margins": 47.098976135253906, "rewards/rejected": 9.95701789855957, "step": 4576 }, { "epoch": 2.369047619047619, "grad_norm": 0.7321661114692688, "learning_rate": 1.1614367705456847e-06, "loss": 0.07003571093082428, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.91656494140625, "rewards/margins": 46.46075439453125, "rewards/rejected": 10.469589233398438, "step": 4577 }, { "epoch": 2.369565217391304, "grad_norm": 0.9092800617218018, "learning_rate": 1.1596092856339241e-06, "loss": 0.051766544580459595, "rewards/accuracies": 0.9921875, "rewards/chosen": 54.00018310546875, "rewards/margins": 46.81175231933594, "rewards/rejected": 7.173126220703125, "step": 4578 }, { "epoch": 2.37008281573499, "grad_norm": 0.48888903856277466, "learning_rate": 1.1577830509890531e-06, "loss": 0.05432668328285217, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.11896514892578, "rewards/margins": 43.77032470703125, "rewards/rejected": 8.349679946899414, "step": 4579 }, { "epoch": 2.370600414078675, "grad_norm": 0.8309314250946045, "learning_rate": 1.1559580672056169e-06, "loss": 0.05262041091918945, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.554718017578125, "rewards/margins": 45.04972839355469, "rewards/rejected": 10.492546081542969, "step": 4580 }, { "epoch": 2.37111801242236, "grad_norm": 0.879176676273346, "learning_rate": 1.1541343348777518e-06, "loss": 0.07957610487937927, "rewards/accuracies": 0.953125, "rewards/chosen": 54.9442138671875, "rewards/margins": 48.569183349609375, "rewards/rejected": 6.388326644897461, "step": 4581 }, { "epoch": 2.3716356107660457, "grad_norm": 0.7216448187828064, "learning_rate": 1.1523118545991868e-06, "loss": 0.05649229884147644, "rewards/accuracies": 0.96875, "rewards/chosen": 56.58709716796875, "rewards/margins": 50.05680847167969, "rewards/rejected": 6.511993408203125, "step": 4582 }, { "epoch": 2.372153209109731, "grad_norm": 1.278395652770996, "learning_rate": 1.1504906269632443e-06, "loss": 0.06109864264726639, "rewards/accuracies": 0.9921875, "rewards/chosen": 51.43541717529297, "rewards/margins": 42.428466796875, "rewards/rejected": 9.002479553222656, "step": 4583 }, { "epoch": 2.372670807453416, "grad_norm": 1.0246080160140991, "learning_rate": 1.1486706525628373e-06, "loss": 0.09137831628322601, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.926307678222656, "rewards/margins": 38.8941650390625, "rewards/rejected": 10.029411315917969, "step": 4584 }, { "epoch": 2.3731884057971016, "grad_norm": 1.3212465047836304, "learning_rate": 1.1468519319904714e-06, "loss": 0.06644114851951599, "rewards/accuracies": 0.96875, "rewards/chosen": 54.19235610961914, "rewards/margins": 45.02510070800781, "rewards/rejected": 9.177849769592285, "step": 4585 }, { "epoch": 2.3737060041407867, "grad_norm": 0.8437327742576599, "learning_rate": 1.145034465838245e-06, "loss": 0.07529528439044952, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.49057388305664, "rewards/margins": 44.049102783203125, "rewards/rejected": 7.434438705444336, "step": 4586 }, { "epoch": 2.374223602484472, "grad_norm": 1.0238025188446045, "learning_rate": 1.1432182546978487e-06, "loss": 0.0901075005531311, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.775489807128906, "rewards/margins": 44.074462890625, "rewards/rejected": 7.701385498046875, "step": 4587 }, { "epoch": 2.3747412008281574, "grad_norm": 1.506510615348816, "learning_rate": 1.1414032991605583e-06, "loss": 0.06142304837703705, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.95167541503906, "rewards/margins": 40.9708251953125, "rewards/rejected": 8.984935760498047, "step": 4588 }, { "epoch": 2.3752587991718426, "grad_norm": 0.7955148220062256, "learning_rate": 1.1395895998172508e-06, "loss": 0.08497226238250732, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.42585754394531, "rewards/margins": 49.62144470214844, "rewards/rejected": 12.820663452148438, "step": 4589 }, { "epoch": 2.375776397515528, "grad_norm": 0.7936195135116577, "learning_rate": 1.1377771572583863e-06, "loss": 0.0766797661781311, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.379737854003906, "rewards/margins": 48.96282958984375, "rewards/rejected": 8.416847229003906, "step": 4590 }, { "epoch": 2.3762939958592133, "grad_norm": 0.6540537476539612, "learning_rate": 1.1359659720740212e-06, "loss": 0.04613690823316574, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.171104431152344, "rewards/margins": 48.891387939453125, "rewards/rejected": 9.284431457519531, "step": 4591 }, { "epoch": 2.3768115942028984, "grad_norm": 1.1373356580734253, "learning_rate": 1.1341560448537947e-06, "loss": 0.11083215475082397, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.78288650512695, "rewards/margins": 44.415069580078125, "rewards/rejected": 10.375089645385742, "step": 4592 }, { "epoch": 2.377329192546584, "grad_norm": 1.4484179019927979, "learning_rate": 1.1323473761869468e-06, "loss": 0.11002390831708908, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.68089294433594, "rewards/margins": 46.73993682861328, "rewards/rejected": 10.943288803100586, "step": 4593 }, { "epoch": 2.377846790890269, "grad_norm": 0.953274130821228, "learning_rate": 1.1305399666623024e-06, "loss": 0.08860397338867188, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.473731994628906, "rewards/margins": 42.453887939453125, "rewards/rejected": 9.02573013305664, "step": 4594 }, { "epoch": 2.3783643892339543, "grad_norm": 0.773971438407898, "learning_rate": 1.1287338168682733e-06, "loss": 0.05884106457233429, "rewards/accuracies": 0.9921875, "rewards/chosen": 55.54998779296875, "rewards/margins": 45.1820068359375, "rewards/rejected": 10.367182731628418, "step": 4595 }, { "epoch": 2.37888198757764, "grad_norm": 1.1618155241012573, "learning_rate": 1.1269289273928653e-06, "loss": 0.1250700205564499, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.68617248535156, "rewards/margins": 44.840476989746094, "rewards/rejected": 10.847183227539062, "step": 4596 }, { "epoch": 2.379399585921325, "grad_norm": 1.3480663299560547, "learning_rate": 1.1251252988236772e-06, "loss": 0.08970099687576294, "rewards/accuracies": 0.953125, "rewards/chosen": 53.29576873779297, "rewards/margins": 43.3536376953125, "rewards/rejected": 9.944816589355469, "step": 4597 }, { "epoch": 2.37991718426501, "grad_norm": 0.9938952326774597, "learning_rate": 1.1233229317478883e-06, "loss": 0.0919371247291565, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.84455490112305, "rewards/margins": 48.339447021484375, "rewards/rejected": 10.508481979370117, "step": 4598 }, { "epoch": 2.380434782608696, "grad_norm": 1.0040526390075684, "learning_rate": 1.1215218267522736e-06, "loss": 0.10257769376039505, "rewards/accuracies": 0.9375, "rewards/chosen": 51.62129211425781, "rewards/margins": 43.26275634765625, "rewards/rejected": 8.374191284179688, "step": 4599 }, { "epoch": 2.380952380952381, "grad_norm": 2.4916958808898926, "learning_rate": 1.1197219844231988e-06, "loss": 0.10797086358070374, "rewards/accuracies": 0.96875, "rewards/chosen": 54.433448791503906, "rewards/margins": 44.45341491699219, "rewards/rejected": 9.98160171508789, "step": 4600 }, { "epoch": 2.381469979296066, "grad_norm": 1.3239376544952393, "learning_rate": 1.117923405346612e-06, "loss": 0.08507940918207169, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.1646728515625, "rewards/margins": 50.04888916015625, "rewards/rejected": 9.121837615966797, "step": 4601 }, { "epoch": 2.3819875776397517, "grad_norm": 0.945631206035614, "learning_rate": 1.1161260901080551e-06, "loss": 0.08878874033689499, "rewards/accuracies": 0.953125, "rewards/chosen": 61.40271759033203, "rewards/margins": 50.05905532836914, "rewards/rejected": 11.346529960632324, "step": 4602 }, { "epoch": 2.382505175983437, "grad_norm": 1.0208377838134766, "learning_rate": 1.1143300392926548e-06, "loss": 0.0856986790895462, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.70115280151367, "rewards/margins": 41.46563720703125, "rewards/rejected": 12.227828979492188, "step": 4603 }, { "epoch": 2.383022774327122, "grad_norm": 0.5501110553741455, "learning_rate": 1.1125352534851335e-06, "loss": 0.032831672579050064, "rewards/accuracies": 0.9921875, "rewards/chosen": 60.04302978515625, "rewards/margins": 49.86566162109375, "rewards/rejected": 10.185791969299316, "step": 4604 }, { "epoch": 2.3835403726708075, "grad_norm": 0.445671409368515, "learning_rate": 1.1107417332697924e-06, "loss": 0.03817971795797348, "rewards/accuracies": 0.984375, "rewards/chosen": 56.877349853515625, "rewards/margins": 46.285491943359375, "rewards/rejected": 10.586151123046875, "step": 4605 }, { "epoch": 2.3840579710144927, "grad_norm": 3.43941330909729, "learning_rate": 1.108949479230526e-06, "loss": 0.06010287255048752, "rewards/accuracies": 0.96875, "rewards/chosen": 61.60087585449219, "rewards/margins": 48.98423767089844, "rewards/rejected": 12.613128662109375, "step": 4606 }, { "epoch": 2.3845755693581783, "grad_norm": 0.7079676389694214, "learning_rate": 1.1071584919508155e-06, "loss": 0.06053198501467705, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.77044677734375, "rewards/margins": 47.49858093261719, "rewards/rejected": 10.264083862304688, "step": 4607 }, { "epoch": 2.3850931677018634, "grad_norm": 0.6356905698776245, "learning_rate": 1.1053687720137296e-06, "loss": 0.06201564893126488, "rewards/accuracies": 0.96875, "rewards/chosen": 55.21543502807617, "rewards/margins": 43.32044982910156, "rewards/rejected": 11.89188003540039, "step": 4608 }, { "epoch": 2.3856107660455486, "grad_norm": 0.9638032913208008, "learning_rate": 1.1035803200019245e-06, "loss": 0.07920065522193909, "rewards/accuracies": 0.96875, "rewards/chosen": 56.2579345703125, "rewards/margins": 45.96202087402344, "rewards/rejected": 10.295732498168945, "step": 4609 }, { "epoch": 2.386128364389234, "grad_norm": 0.8811402320861816, "learning_rate": 1.1017931364976442e-06, "loss": 0.07361945509910583, "rewards/accuracies": 0.96875, "rewards/chosen": 51.47504425048828, "rewards/margins": 40.63232421875, "rewards/rejected": 10.842900276184082, "step": 4610 }, { "epoch": 2.3866459627329193, "grad_norm": 1.1012446880340576, "learning_rate": 1.1000072220827191e-06, "loss": 0.07456931471824646, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.30186462402344, "rewards/margins": 50.631317138671875, "rewards/rejected": 11.682441711425781, "step": 4611 }, { "epoch": 2.3871635610766044, "grad_norm": 0.4532589316368103, "learning_rate": 1.0982225773385662e-06, "loss": 0.039273228496313095, "rewards/accuracies": 0.984375, "rewards/chosen": 55.2112922668457, "rewards/margins": 45.841156005859375, "rewards/rejected": 9.362678527832031, "step": 4612 }, { "epoch": 2.38768115942029, "grad_norm": 3.9448862075805664, "learning_rate": 1.0964392028461896e-06, "loss": 0.10683134198188782, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.548431396484375, "rewards/margins": 50.10975646972656, "rewards/rejected": 11.434009552001953, "step": 4613 }, { "epoch": 2.388198757763975, "grad_norm": 2.9022231101989746, "learning_rate": 1.0946570991861793e-06, "loss": 0.14130347967147827, "rewards/accuracies": 0.921875, "rewards/chosen": 54.61968994140625, "rewards/margins": 44.31791687011719, "rewards/rejected": 10.321392059326172, "step": 4614 }, { "epoch": 2.3887163561076603, "grad_norm": 1.1758683919906616, "learning_rate": 1.0928762669387127e-06, "loss": 0.07871891558170319, "rewards/accuracies": 0.96875, "rewards/chosen": 61.36667251586914, "rewards/margins": 49.027374267578125, "rewards/rejected": 12.331222534179688, "step": 4615 }, { "epoch": 2.389233954451346, "grad_norm": 0.7509832382202148, "learning_rate": 1.0910967066835521e-06, "loss": 0.07962207496166229, "rewards/accuracies": 0.96875, "rewards/chosen": 59.99580001831055, "rewards/margins": 47.50074768066406, "rewards/rejected": 12.500980377197266, "step": 4616 }, { "epoch": 2.389751552795031, "grad_norm": 1.0527751445770264, "learning_rate": 1.0893184190000477e-06, "loss": 0.07579074800014496, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.835357666015625, "rewards/margins": 46.3599853515625, "rewards/rejected": 8.47198486328125, "step": 4617 }, { "epoch": 2.390269151138716, "grad_norm": 0.8223356008529663, "learning_rate": 1.08754140446713e-06, "loss": 0.05072220042347908, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.971893310546875, "rewards/margins": 42.1839599609375, "rewards/rejected": 9.796493530273438, "step": 4618 }, { "epoch": 2.3907867494824018, "grad_norm": 2.666809558868408, "learning_rate": 1.0857656636633228e-06, "loss": 0.13914507627487183, "rewards/accuracies": 0.9375, "rewards/chosen": 52.07969665527344, "rewards/margins": 43.840423583984375, "rewards/rejected": 8.218250274658203, "step": 4619 }, { "epoch": 2.391304347826087, "grad_norm": 1.0486398935317993, "learning_rate": 1.083991197166731e-06, "loss": 0.09543848037719727, "rewards/accuracies": 0.9375, "rewards/chosen": 61.38594055175781, "rewards/margins": 50.585693359375, "rewards/rejected": 10.818500518798828, "step": 4620 }, { "epoch": 2.391821946169772, "grad_norm": 0.8835338950157166, "learning_rate": 1.0822180055550429e-06, "loss": 0.09849594533443451, "rewards/accuracies": 0.953125, "rewards/chosen": 51.938873291015625, "rewards/margins": 44.1820068359375, "rewards/rejected": 7.735014915466309, "step": 4621 }, { "epoch": 2.3923395445134576, "grad_norm": 4.436303615570068, "learning_rate": 1.080446089405533e-06, "loss": 0.1029931828379631, "rewards/accuracies": 0.9375, "rewards/chosen": 57.370262145996094, "rewards/margins": 48.43794250488281, "rewards/rejected": 8.927635192871094, "step": 4622 }, { "epoch": 2.392857142857143, "grad_norm": 1.0530339479446411, "learning_rate": 1.0786754492950664e-06, "loss": 0.08555644750595093, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.59101867675781, "rewards/margins": 46.8397216796875, "rewards/rejected": 11.74264144897461, "step": 4623 }, { "epoch": 2.3933747412008284, "grad_norm": 1.5497052669525146, "learning_rate": 1.076906085800083e-06, "loss": 0.06751684844493866, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.27008819580078, "rewards/margins": 48.18218994140625, "rewards/rejected": 13.081438064575195, "step": 4624 }, { "epoch": 2.3938923395445135, "grad_norm": 0.9003270268440247, "learning_rate": 1.0751379994966133e-06, "loss": 0.06201644986867905, "rewards/accuracies": 0.96875, "rewards/chosen": 57.096824645996094, "rewards/margins": 47.050140380859375, "rewards/rejected": 10.045551300048828, "step": 4625 }, { "epoch": 2.3944099378881987, "grad_norm": 1.1323918104171753, "learning_rate": 1.073371190960269e-06, "loss": 0.11032971739768982, "rewards/accuracies": 0.953125, "rewards/chosen": 51.96314239501953, "rewards/margins": 45.07305908203125, "rewards/rejected": 6.885021209716797, "step": 4626 }, { "epoch": 2.394927536231884, "grad_norm": 5.022462844848633, "learning_rate": 1.0716056607662523e-06, "loss": 0.0847848430275917, "rewards/accuracies": 0.96875, "rewards/chosen": 55.55775451660156, "rewards/margins": 46.68328857421875, "rewards/rejected": 8.866476058959961, "step": 4627 }, { "epoch": 2.3954451345755694, "grad_norm": 2.011134147644043, "learning_rate": 1.0698414094893389e-06, "loss": 0.06331399828195572, "rewards/accuracies": 0.96875, "rewards/chosen": 62.356468200683594, "rewards/margins": 52.772247314453125, "rewards/rejected": 9.61075210571289, "step": 4628 }, { "epoch": 2.3959627329192545, "grad_norm": 1.0982130765914917, "learning_rate": 1.068078437703895e-06, "loss": 0.09933438897132874, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.61186218261719, "rewards/margins": 48.556640625, "rewards/rejected": 10.051254272460938, "step": 4629 }, { "epoch": 2.39648033126294, "grad_norm": 1.611600399017334, "learning_rate": 1.066316745983872e-06, "loss": 0.11497318744659424, "rewards/accuracies": 0.953125, "rewards/chosen": 55.80817413330078, "rewards/margins": 46.415771484375, "rewards/rejected": 9.404380798339844, "step": 4630 }, { "epoch": 2.3969979296066253, "grad_norm": 0.8976805210113525, "learning_rate": 1.064556334902797e-06, "loss": 0.06445470452308655, "rewards/accuracies": 0.96875, "rewards/chosen": 58.525123596191406, "rewards/margins": 48.962554931640625, "rewards/rejected": 9.558639526367188, "step": 4631 }, { "epoch": 2.3975155279503104, "grad_norm": 0.8288531303405762, "learning_rate": 1.0627972050337864e-06, "loss": 0.08798527717590332, "rewards/accuracies": 0.953125, "rewards/chosen": 51.22069549560547, "rewards/margins": 43.7117919921875, "rewards/rejected": 7.510154724121094, "step": 4632 }, { "epoch": 2.398033126293996, "grad_norm": 0.8718438744544983, "learning_rate": 1.0610393569495376e-06, "loss": 0.0739375501871109, "rewards/accuracies": 0.96875, "rewards/chosen": 61.2907600402832, "rewards/margins": 51.095458984375, "rewards/rejected": 10.188970565795898, "step": 4633 }, { "epoch": 2.398550724637681, "grad_norm": 0.7408367991447449, "learning_rate": 1.0592827912223302e-06, "loss": 0.06187424808740616, "rewards/accuracies": 0.984375, "rewards/chosen": 55.38624572753906, "rewards/margins": 46.390869140625, "rewards/rejected": 8.991065979003906, "step": 4634 }, { "epoch": 2.3990683229813663, "grad_norm": 0.9005976915359497, "learning_rate": 1.0575275084240277e-06, "loss": 0.081217460334301, "rewards/accuracies": 0.96875, "rewards/chosen": 58.90242004394531, "rewards/margins": 47.02252197265625, "rewards/rejected": 11.873359680175781, "step": 4635 }, { "epoch": 2.399585921325052, "grad_norm": 1.6841444969177246, "learning_rate": 1.055773509126074e-06, "loss": 0.1394575983285904, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.11021423339844, "rewards/margins": 46.94822692871094, "rewards/rejected": 9.16135311126709, "step": 4636 }, { "epoch": 2.400103519668737, "grad_norm": 1.328519582748413, "learning_rate": 1.0540207938994972e-06, "loss": 0.116644486784935, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.630123138427734, "rewards/margins": 43.83259582519531, "rewards/rejected": 8.816741943359375, "step": 4637 }, { "epoch": 2.400621118012422, "grad_norm": 1.6069285869598389, "learning_rate": 1.052269363314906e-06, "loss": 0.0783056691288948, "rewards/accuracies": 0.96875, "rewards/chosen": 54.44367218017578, "rewards/margins": 45.304473876953125, "rewards/rejected": 9.131423950195312, "step": 4638 }, { "epoch": 2.4011387163561078, "grad_norm": 0.838875949382782, "learning_rate": 1.0505192179424917e-06, "loss": 0.07106243073940277, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.55479049682617, "rewards/margins": 42.99285888671875, "rewards/rejected": 6.559455871582031, "step": 4639 }, { "epoch": 2.401656314699793, "grad_norm": 1.0928484201431274, "learning_rate": 1.048770358352026e-06, "loss": 0.10505199432373047, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.00901794433594, "rewards/margins": 43.1048583984375, "rewards/rejected": 8.912940979003906, "step": 4640 }, { "epoch": 2.4021739130434785, "grad_norm": 1.1813701391220093, "learning_rate": 1.0470227851128628e-06, "loss": 0.07659220695495605, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.52042770385742, "rewards/margins": 44.44415283203125, "rewards/rejected": 10.073867797851562, "step": 4641 }, { "epoch": 2.4026915113871636, "grad_norm": 3.502920627593994, "learning_rate": 1.0452764987939374e-06, "loss": 0.10090900957584381, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.70111846923828, "rewards/margins": 47.658775329589844, "rewards/rejected": 10.03380012512207, "step": 4642 }, { "epoch": 2.403209109730849, "grad_norm": 1.3547520637512207, "learning_rate": 1.0435314999637675e-06, "loss": 0.10063493251800537, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.102806091308594, "rewards/margins": 44.66764831542969, "rewards/rejected": 8.42947769165039, "step": 4643 }, { "epoch": 2.403726708074534, "grad_norm": 1.2418044805526733, "learning_rate": 1.0417877891904454e-06, "loss": 0.12403637915849686, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.56964111328125, "rewards/margins": 45.297119140625, "rewards/rejected": 9.267718315124512, "step": 4644 }, { "epoch": 2.4042443064182195, "grad_norm": 0.7591869235038757, "learning_rate": 1.0400453670416538e-06, "loss": 0.05693869665265083, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.722015380859375, "rewards/margins": 44.37542724609375, "rewards/rejected": 10.345792770385742, "step": 4645 }, { "epoch": 2.4047619047619047, "grad_norm": 0.9012926816940308, "learning_rate": 1.03830423408465e-06, "loss": 0.07540207356214523, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.42760467529297, "rewards/margins": 45.247406005859375, "rewards/rejected": 9.182392120361328, "step": 4646 }, { "epoch": 2.4052795031055902, "grad_norm": 1.241449236869812, "learning_rate": 1.0365643908862694e-06, "loss": 0.12170793861150742, "rewards/accuracies": 0.953125, "rewards/chosen": 55.5029296875, "rewards/margins": 44.0943603515625, "rewards/rejected": 11.394241333007812, "step": 4647 }, { "epoch": 2.4057971014492754, "grad_norm": 1.7543737888336182, "learning_rate": 1.0348258380129312e-06, "loss": 0.08037833869457245, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.835479736328125, "rewards/margins": 48.616943359375, "rewards/rejected": 11.198970794677734, "step": 4648 }, { "epoch": 2.4063146997929605, "grad_norm": 3.592541217803955, "learning_rate": 1.0330885760306363e-06, "loss": 0.1104036197066307, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.90430450439453, "rewards/margins": 40.85575866699219, "rewards/rejected": 11.048233032226562, "step": 4649 }, { "epoch": 2.406832298136646, "grad_norm": 0.7829515337944031, "learning_rate": 1.0313526055049628e-06, "loss": 0.09163327515125275, "rewards/accuracies": 0.96875, "rewards/chosen": 57.56880187988281, "rewards/margins": 47.690399169921875, "rewards/rejected": 9.87276840209961, "step": 4650 }, { "epoch": 2.4073498964803313, "grad_norm": 0.9178419709205627, "learning_rate": 1.0296179270010653e-06, "loss": 0.07937034964561462, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.007484436035156, "rewards/margins": 51.027130126953125, "rewards/rejected": 10.988861083984375, "step": 4651 }, { "epoch": 2.4078674948240164, "grad_norm": 0.6023674607276917, "learning_rate": 1.0278845410836819e-06, "loss": 0.0672081708908081, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.77928161621094, "rewards/margins": 47.5750732421875, "rewards/rejected": 8.192604064941406, "step": 4652 }, { "epoch": 2.408385093167702, "grad_norm": 0.6750377416610718, "learning_rate": 1.026152448317132e-06, "loss": 0.0691356286406517, "rewards/accuracies": 0.953125, "rewards/chosen": 55.18768310546875, "rewards/margins": 46.04425048828125, "rewards/rejected": 9.142105102539062, "step": 4653 }, { "epoch": 2.408902691511387, "grad_norm": 0.864344596862793, "learning_rate": 1.0244216492653076e-06, "loss": 0.08370943367481232, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.7220344543457, "rewards/margins": 45.940277099609375, "rewards/rejected": 6.779582977294922, "step": 4654 }, { "epoch": 2.4094202898550723, "grad_norm": 2.4237887859344482, "learning_rate": 1.0226921444916838e-06, "loss": 0.07067298889160156, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.05294418334961, "rewards/margins": 48.03831481933594, "rewards/rejected": 9.028247833251953, "step": 4655 }, { "epoch": 2.409937888198758, "grad_norm": 1.4496831893920898, "learning_rate": 1.020963934559313e-06, "loss": 0.06600743532180786, "rewards/accuracies": 0.96875, "rewards/chosen": 58.74208450317383, "rewards/margins": 47.55859375, "rewards/rejected": 11.188980102539062, "step": 4656 }, { "epoch": 2.410455486542443, "grad_norm": 0.9550479650497437, "learning_rate": 1.019237020030827e-06, "loss": 0.09634643793106079, "rewards/accuracies": 0.9296875, "rewards/chosen": 60.24885177612305, "rewards/margins": 51.67041778564453, "rewards/rejected": 8.593873023986816, "step": 4657 }, { "epoch": 2.4109730848861286, "grad_norm": 1.1435168981552124, "learning_rate": 1.0175114014684346e-06, "loss": 0.07266109436750412, "rewards/accuracies": 0.96875, "rewards/chosen": 56.36327362060547, "rewards/margins": 47.13311767578125, "rewards/rejected": 9.210540771484375, "step": 4658 }, { "epoch": 2.4114906832298137, "grad_norm": 1.0545730590820312, "learning_rate": 1.015787079433923e-06, "loss": 0.09332162886857986, "rewards/accuracies": 0.953125, "rewards/chosen": 51.11708068847656, "rewards/margins": 42.99029541015625, "rewards/rejected": 8.109909057617188, "step": 4659 }, { "epoch": 2.412008281573499, "grad_norm": 0.5851364135742188, "learning_rate": 1.014064054488661e-06, "loss": 0.03481408953666687, "rewards/accuracies": 0.984375, "rewards/chosen": 52.08763122558594, "rewards/margins": 43.74530029296875, "rewards/rejected": 8.359369277954102, "step": 4660 }, { "epoch": 2.412525879917184, "grad_norm": 1.1179333925247192, "learning_rate": 1.0123423271935878e-06, "loss": 0.15087167918682098, "rewards/accuracies": 0.9296875, "rewards/chosen": 51.31138610839844, "rewards/margins": 41.855064392089844, "rewards/rejected": 9.469993591308594, "step": 4661 }, { "epoch": 2.4130434782608696, "grad_norm": 0.6677932739257812, "learning_rate": 1.0106218981092253e-06, "loss": 0.07070105522871017, "rewards/accuracies": 0.96875, "rewards/chosen": 60.787132263183594, "rewards/margins": 50.52825927734375, "rewards/rejected": 10.25787353515625, "step": 4662 }, { "epoch": 2.4135610766045548, "grad_norm": 0.85899817943573, "learning_rate": 1.008902767795672e-06, "loss": 0.08498934656381607, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.46977615356445, "rewards/margins": 42.348785400390625, "rewards/rejected": 8.121440887451172, "step": 4663 }, { "epoch": 2.4140786749482404, "grad_norm": 0.6133079528808594, "learning_rate": 1.0071849368126034e-06, "loss": 0.06738919764757156, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.70130920410156, "rewards/margins": 46.2093505859375, "rewards/rejected": 7.489032745361328, "step": 4664 }, { "epoch": 2.4145962732919255, "grad_norm": 0.7674607634544373, "learning_rate": 1.005468405719271e-06, "loss": 0.09021700918674469, "rewards/accuracies": 0.953125, "rewards/chosen": 55.51292419433594, "rewards/margins": 47.39154052734375, "rewards/rejected": 8.109500885009766, "step": 4665 }, { "epoch": 2.4151138716356106, "grad_norm": 0.6927124261856079, "learning_rate": 1.0037531750745045e-06, "loss": 0.07051768153905869, "rewards/accuracies": 0.96875, "rewards/chosen": 58.89830780029297, "rewards/margins": 48.675048828125, "rewards/rejected": 10.241901397705078, "step": 4666 }, { "epoch": 2.4156314699792962, "grad_norm": 0.7115035057067871, "learning_rate": 1.0020392454367095e-06, "loss": 0.050685495138168335, "rewards/accuracies": 0.984375, "rewards/chosen": 54.30133056640625, "rewards/margins": 45.90653991699219, "rewards/rejected": 8.392328262329102, "step": 4667 }, { "epoch": 2.4161490683229814, "grad_norm": 1.4080853462219238, "learning_rate": 1.000326617363868e-06, "loss": 0.08325876295566559, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.82158279418945, "rewards/margins": 46.958740234375, "rewards/rejected": 8.8758544921875, "step": 4668 }, { "epoch": 2.4166666666666665, "grad_norm": 0.8849051594734192, "learning_rate": 9.9861529141354e-07, "loss": 0.060306861996650696, "rewards/accuracies": 0.96875, "rewards/chosen": 63.468971252441406, "rewards/margins": 49.8497314453125, "rewards/rejected": 13.63817024230957, "step": 4669 }, { "epoch": 2.417184265010352, "grad_norm": 0.714616060256958, "learning_rate": 9.969052681428555e-07, "loss": 0.07313266396522522, "rewards/accuracies": 0.96875, "rewards/chosen": 54.81827163696289, "rewards/margins": 46.640045166015625, "rewards/rejected": 8.16988754272461, "step": 4670 }, { "epoch": 2.4177018633540373, "grad_norm": 0.8102695345878601, "learning_rate": 9.95196548108529e-07, "loss": 0.08311363309621811, "rewards/accuracies": 0.96875, "rewards/chosen": 55.79273223876953, "rewards/margins": 46.57563781738281, "rewards/rejected": 9.224827766418457, "step": 4671 }, { "epoch": 2.4182194616977224, "grad_norm": 0.8629825711250305, "learning_rate": 9.93489131866846e-07, "loss": 0.08062005043029785, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.53318786621094, "rewards/margins": 41.139862060546875, "rewards/rejected": 10.4051513671875, "step": 4672 }, { "epoch": 2.418737060041408, "grad_norm": 0.7629693746566772, "learning_rate": 9.917830199736683e-07, "loss": 0.06901778280735016, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.215816497802734, "rewards/margins": 49.7869873046875, "rewards/rejected": 10.441360473632812, "step": 4673 }, { "epoch": 2.419254658385093, "grad_norm": 0.7732743620872498, "learning_rate": 9.900782129844306e-07, "loss": 0.09141850471496582, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.91640090942383, "rewards/margins": 44.89031982421875, "rewards/rejected": 11.02337646484375, "step": 4674 }, { "epoch": 2.4197722567287787, "grad_norm": 1.04457688331604, "learning_rate": 9.883747114541475e-07, "loss": 0.10646569728851318, "rewards/accuracies": 0.9375, "rewards/chosen": 53.52702713012695, "rewards/margins": 45.552337646484375, "rewards/rejected": 8.00103759765625, "step": 4675 }, { "epoch": 2.420289855072464, "grad_norm": 0.6037644743919373, "learning_rate": 9.866725159374069e-07, "loss": 0.07217638194561005, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.630645751953125, "rewards/margins": 47.545501708984375, "rewards/rejected": 12.09276008605957, "step": 4676 }, { "epoch": 2.420807453416149, "grad_norm": 1.4775279760360718, "learning_rate": 9.849716269883685e-07, "loss": 0.1557914763689041, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.1226806640625, "rewards/margins": 46.15135192871094, "rewards/rejected": 8.973907470703125, "step": 4677 }, { "epoch": 2.421325051759834, "grad_norm": 1.1839430332183838, "learning_rate": 9.83272045160768e-07, "loss": 0.06690937280654907, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.598876953125, "rewards/margins": 44.80810546875, "rewards/rejected": 6.803180694580078, "step": 4678 }, { "epoch": 2.4218426501035197, "grad_norm": 0.7971454858779907, "learning_rate": 9.815737710079215e-07, "loss": 0.07605531066656113, "rewards/accuracies": 0.953125, "rewards/chosen": 59.01527404785156, "rewards/margins": 47.882080078125, "rewards/rejected": 11.139127731323242, "step": 4679 }, { "epoch": 2.422360248447205, "grad_norm": 0.6398442387580872, "learning_rate": 9.798768050827102e-07, "loss": 0.07336249947547913, "rewards/accuracies": 0.96875, "rewards/chosen": 62.35832214355469, "rewards/margins": 50.060760498046875, "rewards/rejected": 12.309331893920898, "step": 4680 }, { "epoch": 2.4228778467908905, "grad_norm": 1.1182599067687988, "learning_rate": 9.78181147937594e-07, "loss": 0.05335092544555664, "rewards/accuracies": 0.984375, "rewards/chosen": 54.65101623535156, "rewards/margins": 46.90362548828125, "rewards/rejected": 7.734066963195801, "step": 4681 }, { "epoch": 2.4233954451345756, "grad_norm": 1.2719601392745972, "learning_rate": 9.764868001246065e-07, "loss": 0.1443164348602295, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.037986755371094, "rewards/margins": 43.65436553955078, "rewards/rejected": 10.373040199279785, "step": 4682 }, { "epoch": 2.4239130434782608, "grad_norm": 0.8838273882865906, "learning_rate": 9.747937621953573e-07, "loss": 0.0858844518661499, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.624755859375, "rewards/margins": 50.136199951171875, "rewards/rejected": 11.478775024414062, "step": 4683 }, { "epoch": 2.4244306418219463, "grad_norm": 0.6994782090187073, "learning_rate": 9.731020347010244e-07, "loss": 0.060446929186582565, "rewards/accuracies": 0.9921875, "rewards/chosen": 53.456722259521484, "rewards/margins": 42.53816223144531, "rewards/rejected": 10.913217544555664, "step": 4684 }, { "epoch": 2.4249482401656315, "grad_norm": 2.4732301235198975, "learning_rate": 9.714116181923628e-07, "loss": 0.11284792423248291, "rewards/accuracies": 0.953125, "rewards/chosen": 50.98636245727539, "rewards/margins": 39.283599853515625, "rewards/rejected": 11.695451736450195, "step": 4685 }, { "epoch": 2.4254658385093166, "grad_norm": 2.688750743865967, "learning_rate": 9.697225132196992e-07, "loss": 0.11523619294166565, "rewards/accuracies": 0.953125, "rewards/chosen": 52.740501403808594, "rewards/margins": 41.81427001953125, "rewards/rejected": 10.929191589355469, "step": 4686 }, { "epoch": 2.425983436853002, "grad_norm": 1.0024020671844482, "learning_rate": 9.680347203329344e-07, "loss": 0.10652457922697067, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.051658630371094, "rewards/margins": 41.461578369140625, "rewards/rejected": 10.612171173095703, "step": 4687 }, { "epoch": 2.4265010351966874, "grad_norm": 1.0927207469940186, "learning_rate": 9.663482400815416e-07, "loss": 0.11437804996967316, "rewards/accuracies": 0.953125, "rewards/chosen": 57.789146423339844, "rewards/margins": 46.225372314453125, "rewards/rejected": 11.549755096435547, "step": 4688 }, { "epoch": 2.4270186335403725, "grad_norm": 0.889470636844635, "learning_rate": 9.646630730145667e-07, "loss": 0.09083452820777893, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.21199417114258, "rewards/margins": 48.809600830078125, "rewards/rejected": 9.406736373901367, "step": 4689 }, { "epoch": 2.427536231884058, "grad_norm": 1.5252225399017334, "learning_rate": 9.629792196806276e-07, "loss": 0.13419803977012634, "rewards/accuracies": 0.9375, "rewards/chosen": 57.27731704711914, "rewards/margins": 46.35145568847656, "rewards/rejected": 10.919509887695312, "step": 4690 }, { "epoch": 2.4280538302277432, "grad_norm": 1.039932370185852, "learning_rate": 9.61296680627915e-07, "loss": 0.118186816573143, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.56951141357422, "rewards/margins": 47.09778594970703, "rewards/rejected": 8.477493286132812, "step": 4691 }, { "epoch": 2.4285714285714284, "grad_norm": 5.7768778800964355, "learning_rate": 9.59615456404192e-07, "loss": 0.07934325933456421, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.11585998535156, "rewards/margins": 47.605438232421875, "rewards/rejected": 9.51870346069336, "step": 4692 }, { "epoch": 2.429089026915114, "grad_norm": 0.7687538862228394, "learning_rate": 9.579355475567931e-07, "loss": 0.05251730978488922, "rewards/accuracies": 0.984375, "rewards/chosen": 55.986358642578125, "rewards/margins": 44.787750244140625, "rewards/rejected": 11.193119049072266, "step": 4693 }, { "epoch": 2.429606625258799, "grad_norm": 0.727446973323822, "learning_rate": 9.562569546326245e-07, "loss": 0.05543644726276398, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.0324821472168, "rewards/margins": 47.747833251953125, "rewards/rejected": 11.286372184753418, "step": 4694 }, { "epoch": 2.4301242236024843, "grad_norm": 0.7972875237464905, "learning_rate": 9.545796781781642e-07, "loss": 0.07625430822372437, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.23375701904297, "rewards/margins": 41.045745849609375, "rewards/rejected": 10.197101593017578, "step": 4695 }, { "epoch": 2.43064182194617, "grad_norm": 0.6555739045143127, "learning_rate": 9.529037187394618e-07, "loss": 0.05652473121881485, "rewards/accuracies": 0.984375, "rewards/chosen": 56.15962600708008, "rewards/margins": 45.48890686035156, "rewards/rejected": 10.657339096069336, "step": 4696 }, { "epoch": 2.431159420289855, "grad_norm": 0.8972982168197632, "learning_rate": 9.51229076862138e-07, "loss": 0.09440150856971741, "rewards/accuracies": 0.953125, "rewards/chosen": 58.05748748779297, "rewards/margins": 47.914398193359375, "rewards/rejected": 10.133953094482422, "step": 4697 }, { "epoch": 2.4316770186335406, "grad_norm": 0.9258033037185669, "learning_rate": 9.495557530913846e-07, "loss": 0.07949721068143845, "rewards/accuracies": 0.953125, "rewards/chosen": 57.66534423828125, "rewards/margins": 46.95367431640625, "rewards/rejected": 10.711669921875, "step": 4698 }, { "epoch": 2.4321946169772257, "grad_norm": 0.8053295016288757, "learning_rate": 9.478837479719655e-07, "loss": 0.08816367387771606, "rewards/accuracies": 0.953125, "rewards/chosen": 50.10193634033203, "rewards/margins": 41.97438049316406, "rewards/rejected": 8.128681182861328, "step": 4699 }, { "epoch": 2.432712215320911, "grad_norm": 4.6835198402404785, "learning_rate": 9.462130620482101e-07, "loss": 0.13851270079612732, "rewards/accuracies": 0.921875, "rewards/chosen": 55.431243896484375, "rewards/margins": 46.256591796875, "rewards/rejected": 9.173294067382812, "step": 4700 }, { "epoch": 2.4332298136645965, "grad_norm": 4.747382164001465, "learning_rate": 9.445436958640258e-07, "loss": 0.19453009963035583, "rewards/accuracies": 0.953125, "rewards/chosen": 52.76692581176758, "rewards/margins": 42.763153076171875, "rewards/rejected": 10.007148742675781, "step": 4701 }, { "epoch": 2.4337474120082816, "grad_norm": 3.1816399097442627, "learning_rate": 9.428756499628878e-07, "loss": 0.14074744284152985, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.75819396972656, "rewards/margins": 45.22505187988281, "rewards/rejected": 9.529590606689453, "step": 4702 }, { "epoch": 2.4342650103519667, "grad_norm": 1.7055877447128296, "learning_rate": 9.412089248878364e-07, "loss": 0.1029832512140274, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.95361328125, "rewards/margins": 43.500640869140625, "rewards/rejected": 9.446496963500977, "step": 4703 }, { "epoch": 2.4347826086956523, "grad_norm": 1.5999466180801392, "learning_rate": 9.395435211814874e-07, "loss": 0.08813221752643585, "rewards/accuracies": 0.9375, "rewards/chosen": 59.39842224121094, "rewards/margins": 47.60760498046875, "rewards/rejected": 11.793267250061035, "step": 4704 }, { "epoch": 2.4353002070393375, "grad_norm": 0.9246299862861633, "learning_rate": 9.37879439386028e-07, "loss": 0.08834712207317352, "rewards/accuracies": 0.953125, "rewards/chosen": 47.45044708251953, "rewards/margins": 39.1903076171875, "rewards/rejected": 8.258554458618164, "step": 4705 }, { "epoch": 2.4358178053830226, "grad_norm": 0.6769206523895264, "learning_rate": 9.362166800432082e-07, "loss": 0.05787315592169762, "rewards/accuracies": 0.96875, "rewards/chosen": 54.191802978515625, "rewards/margins": 43.33124542236328, "rewards/rejected": 10.865165710449219, "step": 4706 }, { "epoch": 2.436335403726708, "grad_norm": 1.3914693593978882, "learning_rate": 9.345552436943528e-07, "loss": 0.14962971210479736, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.04264831542969, "rewards/margins": 44.741973876953125, "rewards/rejected": 11.303359985351562, "step": 4707 }, { "epoch": 2.4368530020703933, "grad_norm": 1.2681893110275269, "learning_rate": 9.328951308803535e-07, "loss": 0.09603364765644073, "rewards/accuracies": 0.953125, "rewards/chosen": 49.15089416503906, "rewards/margins": 40.62843322753906, "rewards/rejected": 8.527709007263184, "step": 4708 }, { "epoch": 2.4373706004140785, "grad_norm": 1.083653211593628, "learning_rate": 9.312363421416748e-07, "loss": 0.07653342932462692, "rewards/accuracies": 0.9609375, "rewards/chosen": 65.87812042236328, "rewards/margins": 52.78666687011719, "rewards/rejected": 13.078033447265625, "step": 4709 }, { "epoch": 2.437888198757764, "grad_norm": 1.5565190315246582, "learning_rate": 9.295788780183446e-07, "loss": 0.10586608201265335, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.46950149536133, "rewards/margins": 45.507659912109375, "rewards/rejected": 11.956482887268066, "step": 4710 }, { "epoch": 2.4384057971014492, "grad_norm": 0.9405705332756042, "learning_rate": 9.279227390499629e-07, "loss": 0.07304039597511292, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.16581726074219, "rewards/margins": 52.35356140136719, "rewards/rejected": 11.8067626953125, "step": 4711 }, { "epoch": 2.4389233954451344, "grad_norm": 1.06062650680542, "learning_rate": 9.262679257756979e-07, "loss": 0.11131566017866135, "rewards/accuracies": 0.9296875, "rewards/chosen": 47.85802459716797, "rewards/margins": 38.59515380859375, "rewards/rejected": 9.261397361755371, "step": 4712 }, { "epoch": 2.43944099378882, "grad_norm": 0.6100167036056519, "learning_rate": 9.24614438734286e-07, "loss": 0.047061312943696976, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.811012268066406, "rewards/margins": 47.8602294921875, "rewards/rejected": 11.966739654541016, "step": 4713 }, { "epoch": 2.439958592132505, "grad_norm": 0.8153408169746399, "learning_rate": 9.229622784640313e-07, "loss": 0.062071219086647034, "rewards/accuracies": 0.96875, "rewards/chosen": 58.94895935058594, "rewards/margins": 50.601226806640625, "rewards/rejected": 8.341228485107422, "step": 4714 }, { "epoch": 2.4404761904761907, "grad_norm": 2.0259411334991455, "learning_rate": 9.213114455028077e-07, "loss": 0.09725849330425262, "rewards/accuracies": 0.953125, "rewards/chosen": 50.67549514770508, "rewards/margins": 43.73881530761719, "rewards/rejected": 6.923920631408691, "step": 4715 }, { "epoch": 2.440993788819876, "grad_norm": 1.2350865602493286, "learning_rate": 9.196619403880547e-07, "loss": 0.10821732878684998, "rewards/accuracies": 0.9375, "rewards/chosen": 46.30790710449219, "rewards/margins": 40.272735595703125, "rewards/rejected": 6.0244140625, "step": 4716 }, { "epoch": 2.441511387163561, "grad_norm": 1.1420906782150269, "learning_rate": 9.18013763656781e-07, "loss": 0.09437288343906403, "rewards/accuracies": 0.953125, "rewards/chosen": 46.28295135498047, "rewards/margins": 39.204315185546875, "rewards/rejected": 7.077688217163086, "step": 4717 }, { "epoch": 2.4420289855072466, "grad_norm": 4.442474365234375, "learning_rate": 9.163669158455624e-07, "loss": 0.19392040371894836, "rewards/accuracies": 0.9375, "rewards/chosen": 54.501251220703125, "rewards/margins": 47.239013671875, "rewards/rejected": 7.258513450622559, "step": 4718 }, { "epoch": 2.4425465838509317, "grad_norm": 0.8923038840293884, "learning_rate": 9.147213974905422e-07, "loss": 0.10772933065891266, "rewards/accuracies": 0.921875, "rewards/chosen": 55.995338439941406, "rewards/margins": 44.4091796875, "rewards/rejected": 11.59282398223877, "step": 4719 }, { "epoch": 2.443064182194617, "grad_norm": 1.4355683326721191, "learning_rate": 9.130772091274304e-07, "loss": 0.09806060791015625, "rewards/accuracies": 0.953125, "rewards/chosen": 62.11427688598633, "rewards/margins": 49.927734375, "rewards/rejected": 12.169326782226562, "step": 4720 }, { "epoch": 2.4435817805383024, "grad_norm": 2.725414991378784, "learning_rate": 9.114343512915047e-07, "loss": 0.11900099366903305, "rewards/accuracies": 0.9296875, "rewards/chosen": 48.72287368774414, "rewards/margins": 41.30848693847656, "rewards/rejected": 7.42537784576416, "step": 4721 }, { "epoch": 2.4440993788819876, "grad_norm": 1.3404953479766846, "learning_rate": 9.097928245176091e-07, "loss": 0.07716703414916992, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.024932861328125, "rewards/margins": 47.46653747558594, "rewards/rejected": 10.56357192993164, "step": 4722 }, { "epoch": 2.4446169772256727, "grad_norm": 1.326964259147644, "learning_rate": 9.081526293401544e-07, "loss": 0.0756552591919899, "rewards/accuracies": 0.96875, "rewards/chosen": 51.48316192626953, "rewards/margins": 44.39898681640625, "rewards/rejected": 7.079524993896484, "step": 4723 }, { "epoch": 2.4451345755693583, "grad_norm": 0.6977131962776184, "learning_rate": 9.065137662931184e-07, "loss": 0.09063709527254105, "rewards/accuracies": 0.953125, "rewards/chosen": 55.08354187011719, "rewards/margins": 46.25360107421875, "rewards/rejected": 8.833202362060547, "step": 4724 }, { "epoch": 2.4456521739130435, "grad_norm": 0.9600372314453125, "learning_rate": 9.048762359100455e-07, "loss": 0.05965173989534378, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.80516052246094, "rewards/margins": 51.7042236328125, "rewards/rejected": 10.0968017578125, "step": 4725 }, { "epoch": 2.4461697722567286, "grad_norm": 0.579418420791626, "learning_rate": 9.032400387240426e-07, "loss": 0.03828292712569237, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.126976013183594, "rewards/margins": 45.842987060546875, "rewards/rejected": 7.273948669433594, "step": 4726 }, { "epoch": 2.446687370600414, "grad_norm": 0.4837283194065094, "learning_rate": 9.016051752677879e-07, "loss": 0.022030550986528397, "rewards/accuracies": 0.9921875, "rewards/chosen": 60.61662292480469, "rewards/margins": 50.63555908203125, "rewards/rejected": 9.985062599182129, "step": 4727 }, { "epoch": 2.4472049689440993, "grad_norm": 1.1976981163024902, "learning_rate": 8.999716460735247e-07, "loss": 0.09340913593769073, "rewards/accuracies": 0.953125, "rewards/chosen": 49.21943664550781, "rewards/margins": 41.30992126464844, "rewards/rejected": 7.900533676147461, "step": 4728 }, { "epoch": 2.4477225672877845, "grad_norm": 0.4887515902519226, "learning_rate": 8.983394516730565e-07, "loss": 0.0518636591732502, "rewards/accuracies": 0.984375, "rewards/chosen": 54.95438003540039, "rewards/margins": 47.153350830078125, "rewards/rejected": 7.800266265869141, "step": 4729 }, { "epoch": 2.44824016563147, "grad_norm": 1.0927742719650269, "learning_rate": 8.967085925977559e-07, "loss": 0.0898623913526535, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.96354675292969, "rewards/margins": 48.891845703125, "rewards/rejected": 10.056411743164062, "step": 4730 }, { "epoch": 2.448757763975155, "grad_norm": 1.2153288125991821, "learning_rate": 8.950790693785638e-07, "loss": 0.10281888395547867, "rewards/accuracies": 0.9375, "rewards/chosen": 49.79286575317383, "rewards/margins": 42.6519775390625, "rewards/rejected": 7.123054504394531, "step": 4731 }, { "epoch": 2.449275362318841, "grad_norm": 0.603722333908081, "learning_rate": 8.934508825459832e-07, "loss": 0.05151330679655075, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.949676513671875, "rewards/margins": 51.670806884765625, "rewards/rejected": 9.2806396484375, "step": 4732 }, { "epoch": 2.449792960662526, "grad_norm": 1.1868171691894531, "learning_rate": 8.918240326300798e-07, "loss": 0.10452108830213547, "rewards/accuracies": 0.9375, "rewards/chosen": 56.26375961303711, "rewards/margins": 46.391937255859375, "rewards/rejected": 9.875885009765625, "step": 4733 }, { "epoch": 2.450310559006211, "grad_norm": 3.072319269180298, "learning_rate": 8.901985201604857e-07, "loss": 0.10687697678804398, "rewards/accuracies": 0.953125, "rewards/chosen": 54.66313552856445, "rewards/margins": 45.2459716796875, "rewards/rejected": 9.401668548583984, "step": 4734 }, { "epoch": 2.4508281573498967, "grad_norm": 1.1796718835830688, "learning_rate": 8.88574345666402e-07, "loss": 0.09974902123212814, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.98237609863281, "rewards/margins": 45.6839599609375, "rewards/rejected": 6.300060272216797, "step": 4735 }, { "epoch": 2.451345755693582, "grad_norm": 0.7725454568862915, "learning_rate": 8.869515096765874e-07, "loss": 0.03869981691241264, "rewards/accuracies": 0.984375, "rewards/chosen": 64.34468078613281, "rewards/margins": 52.491424560546875, "rewards/rejected": 11.834081649780273, "step": 4736 }, { "epoch": 2.451863354037267, "grad_norm": 0.981706440448761, "learning_rate": 8.853300127193682e-07, "loss": 0.10746761411428452, "rewards/accuracies": 0.9375, "rewards/chosen": 49.83512878417969, "rewards/margins": 43.2923583984375, "rewards/rejected": 6.520641326904297, "step": 4737 }, { "epoch": 2.4523809523809526, "grad_norm": 0.9001868963241577, "learning_rate": 8.837098553226353e-07, "loss": 0.11251229047775269, "rewards/accuracies": 0.9375, "rewards/chosen": 54.94624328613281, "rewards/margins": 44.842926025390625, "rewards/rejected": 10.1072998046875, "step": 4738 }, { "epoch": 2.4528985507246377, "grad_norm": 0.8013373613357544, "learning_rate": 8.82091038013842e-07, "loss": 0.078158438205719, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.32758331298828, "rewards/margins": 47.299346923828125, "rewards/rejected": 6.018154144287109, "step": 4739 }, { "epoch": 2.453416149068323, "grad_norm": 1.2196435928344727, "learning_rate": 8.804735613200061e-07, "loss": 0.09316015988588333, "rewards/accuracies": 0.9375, "rewards/chosen": 51.67580795288086, "rewards/margins": 44.41542053222656, "rewards/rejected": 7.249423980712891, "step": 4740 }, { "epoch": 2.4539337474120084, "grad_norm": 0.4132746160030365, "learning_rate": 8.78857425767709e-07, "loss": 0.028923742473125458, "rewards/accuracies": 0.984375, "rewards/chosen": 54.531593322753906, "rewards/margins": 44.7818603515625, "rewards/rejected": 9.733369827270508, "step": 4741 }, { "epoch": 2.4544513457556936, "grad_norm": 2.0378806591033936, "learning_rate": 8.772426318830951e-07, "loss": 0.09207359701395035, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.12678527832031, "rewards/margins": 45.74140930175781, "rewards/rejected": 8.3826265335083, "step": 4742 }, { "epoch": 2.4549689440993787, "grad_norm": 3.8624863624572754, "learning_rate": 8.75629180191872e-07, "loss": 0.14456789195537567, "rewards/accuracies": 0.953125, "rewards/chosen": 55.61695098876953, "rewards/margins": 47.76300048828125, "rewards/rejected": 7.856771469116211, "step": 4743 }, { "epoch": 2.4554865424430643, "grad_norm": 0.8419275283813477, "learning_rate": 8.740170712193108e-07, "loss": 0.08359795808792114, "rewards/accuracies": 0.953125, "rewards/chosen": 52.90846633911133, "rewards/margins": 44.85624694824219, "rewards/rejected": 8.056665420532227, "step": 4744 }, { "epoch": 2.4560041407867494, "grad_norm": 3.6661319732666016, "learning_rate": 8.724063054902454e-07, "loss": 0.08187120407819748, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.044456481933594, "rewards/margins": 48.11663818359375, "rewards/rejected": 8.925250053405762, "step": 4745 }, { "epoch": 2.4565217391304346, "grad_norm": 0.7138970494270325, "learning_rate": 8.707968835290714e-07, "loss": 0.0520658940076828, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.308902740478516, "rewards/margins": 42.844573974609375, "rewards/rejected": 7.454891204833984, "step": 4746 }, { "epoch": 2.45703933747412, "grad_norm": 0.7600672245025635, "learning_rate": 8.691888058597492e-07, "loss": 0.05583702400326729, "rewards/accuracies": 0.96875, "rewards/chosen": 46.920223236083984, "rewards/margins": 40.32061767578125, "rewards/rejected": 6.604930877685547, "step": 4747 }, { "epoch": 2.4575569358178053, "grad_norm": 0.7714698910713196, "learning_rate": 8.675820730057982e-07, "loss": 0.06387404352426529, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.465049743652344, "rewards/margins": 40.43937683105469, "rewards/rejected": 9.002796173095703, "step": 4748 }, { "epoch": 2.458074534161491, "grad_norm": 1.2139763832092285, "learning_rate": 8.659766854903034e-07, "loss": 0.0741482526063919, "rewards/accuracies": 0.953125, "rewards/chosen": 49.48432159423828, "rewards/margins": 41.42547607421875, "rewards/rejected": 8.059486389160156, "step": 4749 }, { "epoch": 2.458592132505176, "grad_norm": 0.7394658327102661, "learning_rate": 8.643726438359096e-07, "loss": 0.04917062819004059, "rewards/accuracies": 0.984375, "rewards/chosen": 58.40020751953125, "rewards/margins": 47.41448974609375, "rewards/rejected": 10.993358612060547, "step": 4750 }, { "epoch": 2.459109730848861, "grad_norm": 1.1539124250411987, "learning_rate": 8.627699485648256e-07, "loss": 0.09437482804059982, "rewards/accuracies": 0.953125, "rewards/chosen": 52.088531494140625, "rewards/margins": 45.10565185546875, "rewards/rejected": 6.982306003570557, "step": 4751 }, { "epoch": 2.4596273291925463, "grad_norm": 1.1437630653381348, "learning_rate": 8.611686001988162e-07, "loss": 0.09444679319858551, "rewards/accuracies": 0.953125, "rewards/chosen": 57.283363342285156, "rewards/margins": 48.4649658203125, "rewards/rejected": 8.823675155639648, "step": 4752 }, { "epoch": 2.460144927536232, "grad_norm": 1.873537302017212, "learning_rate": 8.595685992592162e-07, "loss": 0.14066071808338165, "rewards/accuracies": 0.9296875, "rewards/chosen": 50.15460205078125, "rewards/margins": 42.548736572265625, "rewards/rejected": 7.609672546386719, "step": 4753 }, { "epoch": 2.460662525879917, "grad_norm": 0.8290650248527527, "learning_rate": 8.579699462669161e-07, "loss": 0.06058725714683533, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.23064422607422, "rewards/margins": 45.150238037109375, "rewards/rejected": 11.07939624786377, "step": 4754 }, { "epoch": 2.4611801242236027, "grad_norm": 1.1795539855957031, "learning_rate": 8.563726417423696e-07, "loss": 0.08263014256954193, "rewards/accuracies": 0.96875, "rewards/chosen": 52.44911193847656, "rewards/margins": 44.567710876464844, "rewards/rejected": 7.897002220153809, "step": 4755 }, { "epoch": 2.461697722567288, "grad_norm": 0.8208153247833252, "learning_rate": 8.547766862055879e-07, "loss": 0.08104510605335236, "rewards/accuracies": 0.9609375, "rewards/chosen": 48.517242431640625, "rewards/margins": 42.129852294921875, "rewards/rejected": 6.379396438598633, "step": 4756 }, { "epoch": 2.462215320910973, "grad_norm": 2.12219500541687, "learning_rate": 8.531820801761487e-07, "loss": 0.1350598931312561, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.20909881591797, "rewards/margins": 43.40068817138672, "rewards/rejected": 6.822757720947266, "step": 4757 }, { "epoch": 2.4627329192546585, "grad_norm": 0.828620195388794, "learning_rate": 8.515888241731884e-07, "loss": 0.07572365552186966, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.71620178222656, "rewards/margins": 45.10978698730469, "rewards/rejected": 7.608772277832031, "step": 4758 }, { "epoch": 2.4632505175983437, "grad_norm": 0.8079144954681396, "learning_rate": 8.499969187153994e-07, "loss": 0.0790708065032959, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.756629943847656, "rewards/margins": 48.86058044433594, "rewards/rejected": 8.899909019470215, "step": 4759 }, { "epoch": 2.463768115942029, "grad_norm": 0.9338194131851196, "learning_rate": 8.484063643210388e-07, "loss": 0.13635823130607605, "rewards/accuracies": 0.921875, "rewards/chosen": 51.42641067504883, "rewards/margins": 44.83747863769531, "rewards/rejected": 6.591943740844727, "step": 4760 }, { "epoch": 2.4642857142857144, "grad_norm": 2.1545791625976562, "learning_rate": 8.468171615079268e-07, "loss": 0.09424090385437012, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.87590026855469, "rewards/margins": 45.10984802246094, "rewards/rejected": 9.7669677734375, "step": 4761 }, { "epoch": 2.4648033126293996, "grad_norm": 0.8295500874519348, "learning_rate": 8.452293107934357e-07, "loss": 0.06709500402212143, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.720848083496094, "rewards/margins": 42.15283203125, "rewards/rejected": 8.570402145385742, "step": 4762 }, { "epoch": 2.4653209109730847, "grad_norm": 0.8179653882980347, "learning_rate": 8.436428126945029e-07, "loss": 0.05460045859217644, "rewards/accuracies": 0.96875, "rewards/chosen": 52.87986755371094, "rewards/margins": 42.959136962890625, "rewards/rejected": 9.930931091308594, "step": 4763 }, { "epoch": 2.4658385093167703, "grad_norm": 0.6140071153640747, "learning_rate": 8.420576677276238e-07, "loss": 0.06063336133956909, "rewards/accuracies": 0.9609375, "rewards/chosen": 44.687644958496094, "rewards/margins": 39.760955810546875, "rewards/rejected": 4.938998222351074, "step": 4764 }, { "epoch": 2.4663561076604554, "grad_norm": 0.8673835396766663, "learning_rate": 8.404738764088566e-07, "loss": 0.08342564105987549, "rewards/accuracies": 0.96875, "rewards/chosen": 52.577064514160156, "rewards/margins": 44.42054748535156, "rewards/rejected": 8.155583381652832, "step": 4765 }, { "epoch": 2.466873706004141, "grad_norm": 1.646803617477417, "learning_rate": 8.388914392538128e-07, "loss": 0.10005359351634979, "rewards/accuracies": 0.953125, "rewards/chosen": 48.15754318237305, "rewards/margins": 42.43450927734375, "rewards/rejected": 5.7199530601501465, "step": 4766 }, { "epoch": 2.467391304347826, "grad_norm": 0.8292688131332397, "learning_rate": 8.373103567776669e-07, "loss": 0.07249408215284348, "rewards/accuracies": 0.9765625, "rewards/chosen": 49.32301330566406, "rewards/margins": 41.16143798828125, "rewards/rejected": 8.1617431640625, "step": 4767 }, { "epoch": 2.4679089026915113, "grad_norm": 0.983527660369873, "learning_rate": 8.357306294951528e-07, "loss": 0.09783218801021576, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.387420654296875, "rewards/margins": 44.667633056640625, "rewards/rejected": 7.735157012939453, "step": 4768 }, { "epoch": 2.4684265010351965, "grad_norm": 1.1774282455444336, "learning_rate": 8.341522579205614e-07, "loss": 0.10881368815898895, "rewards/accuracies": 0.953125, "rewards/chosen": 49.308677673339844, "rewards/margins": 41.7454833984375, "rewards/rejected": 7.579776763916016, "step": 4769 }, { "epoch": 2.468944099378882, "grad_norm": 0.8965374231338501, "learning_rate": 8.325752425677424e-07, "loss": 0.06961976736783981, "rewards/accuracies": 0.96875, "rewards/chosen": 55.021263122558594, "rewards/margins": 46.812286376953125, "rewards/rejected": 8.217754364013672, "step": 4770 }, { "epoch": 2.469461697722567, "grad_norm": 1.1796594858169556, "learning_rate": 8.309995839501056e-07, "loss": 0.10430794954299927, "rewards/accuracies": 0.953125, "rewards/chosen": 50.15446472167969, "rewards/margins": 41.9176025390625, "rewards/rejected": 8.238548278808594, "step": 4771 }, { "epoch": 2.4699792960662528, "grad_norm": 0.9779054522514343, "learning_rate": 8.294252825806177e-07, "loss": 0.06999780237674713, "rewards/accuracies": 0.96875, "rewards/chosen": 47.859230041503906, "rewards/margins": 42.794342041015625, "rewards/rejected": 5.077627182006836, "step": 4772 }, { "epoch": 2.470496894409938, "grad_norm": 1.3781827688217163, "learning_rate": 8.27852338971803e-07, "loss": 0.07974765449762344, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.051788330078125, "rewards/margins": 43.65525817871094, "rewards/rejected": 5.396625518798828, "step": 4773 }, { "epoch": 2.471014492753623, "grad_norm": 0.9326684474945068, "learning_rate": 8.262807536357459e-07, "loss": 0.10285217314958572, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.420997619628906, "rewards/margins": 41.854949951171875, "rewards/rejected": 7.567089080810547, "step": 4774 }, { "epoch": 2.4715320910973086, "grad_norm": 1.1006765365600586, "learning_rate": 8.247105270840866e-07, "loss": 0.06571683287620544, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.23743438720703, "rewards/margins": 46.06464385986328, "rewards/rejected": 11.187101364135742, "step": 4775 }, { "epoch": 2.472049689440994, "grad_norm": 3.4584238529205322, "learning_rate": 8.23141659828024e-07, "loss": 0.07191570103168488, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.2674674987793, "rewards/margins": 46.95030212402344, "rewards/rejected": 8.323561668395996, "step": 4776 }, { "epoch": 2.472567287784679, "grad_norm": 0.6008076071739197, "learning_rate": 8.215741523783139e-07, "loss": 0.07432038336992264, "rewards/accuracies": 0.96875, "rewards/chosen": 57.07842254638672, "rewards/margins": 47.6549072265625, "rewards/rejected": 9.43719482421875, "step": 4777 }, { "epoch": 2.4730848861283645, "grad_norm": 0.5985174775123596, "learning_rate": 8.200080052452697e-07, "loss": 0.047470785677433014, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.746788024902344, "rewards/margins": 47.65582275390625, "rewards/rejected": 8.09356689453125, "step": 4778 }, { "epoch": 2.4736024844720497, "grad_norm": 1.8909136056900024, "learning_rate": 8.18443218938762e-07, "loss": 0.0749843418598175, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.382354736328125, "rewards/margins": 47.47027587890625, "rewards/rejected": 8.904729843139648, "step": 4779 }, { "epoch": 2.474120082815735, "grad_norm": 1.1535615921020508, "learning_rate": 8.168797939682183e-07, "loss": 0.0695507675409317, "rewards/accuracies": 0.984375, "rewards/chosen": 47.66521453857422, "rewards/margins": 41.681549072265625, "rewards/rejected": 5.975414276123047, "step": 4780 }, { "epoch": 2.4746376811594204, "grad_norm": 0.8940821290016174, "learning_rate": 8.153177308426241e-07, "loss": 0.08140698820352554, "rewards/accuracies": 0.96875, "rewards/chosen": 51.91654968261719, "rewards/margins": 44.941162109375, "rewards/rejected": 6.969544410705566, "step": 4781 }, { "epoch": 2.4751552795031055, "grad_norm": 1.5051062107086182, "learning_rate": 8.13757030070516e-07, "loss": 0.10384203493595123, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.49629211425781, "rewards/margins": 40.584564208984375, "rewards/rejected": 6.915910720825195, "step": 4782 }, { "epoch": 2.475672877846791, "grad_norm": 0.7343592047691345, "learning_rate": 8.121976921599961e-07, "loss": 0.08488404005765915, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.35542678833008, "rewards/margins": 44.652587890625, "rewards/rejected": 8.675777435302734, "step": 4783 }, { "epoch": 2.4761904761904763, "grad_norm": 2.0901267528533936, "learning_rate": 8.106397176187175e-07, "loss": 0.12593725323677063, "rewards/accuracies": 0.953125, "rewards/chosen": 53.849884033203125, "rewards/margins": 45.29180908203125, "rewards/rejected": 8.538795471191406, "step": 4784 }, { "epoch": 2.4767080745341614, "grad_norm": 0.660973072052002, "learning_rate": 8.090831069538879e-07, "loss": 0.07624030113220215, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.283172607421875, "rewards/margins": 42.67169189453125, "rewards/rejected": 8.624988555908203, "step": 4785 }, { "epoch": 2.4772256728778466, "grad_norm": 1.2050063610076904, "learning_rate": 8.075278606722725e-07, "loss": 0.12131454050540924, "rewards/accuracies": 0.9375, "rewards/chosen": 55.41899108886719, "rewards/margins": 45.13348388671875, "rewards/rejected": 10.282825469970703, "step": 4786 }, { "epoch": 2.477743271221532, "grad_norm": 0.6991284489631653, "learning_rate": 8.059739792801963e-07, "loss": 0.05440818518400192, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.993370056152344, "rewards/margins": 43.203033447265625, "rewards/rejected": 7.800561904907227, "step": 4787 }, { "epoch": 2.4782608695652173, "grad_norm": 1.0179929733276367, "learning_rate": 8.044214632835362e-07, "loss": 0.11131532490253448, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.556419372558594, "rewards/margins": 46.59515380859375, "rewards/rejected": 8.963258743286133, "step": 4788 }, { "epoch": 2.478778467908903, "grad_norm": 0.7058560848236084, "learning_rate": 8.028703131877219e-07, "loss": 0.07084519416093826, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.256832122802734, "rewards/margins": 45.00471496582031, "rewards/rejected": 7.254512786865234, "step": 4789 }, { "epoch": 2.479296066252588, "grad_norm": 0.6572297811508179, "learning_rate": 8.013205294977422e-07, "loss": 0.08729421347379684, "rewards/accuracies": 0.953125, "rewards/chosen": 54.50932312011719, "rewards/margins": 46.391265869140625, "rewards/rejected": 8.115194320678711, "step": 4790 }, { "epoch": 2.479813664596273, "grad_norm": 0.8432139158248901, "learning_rate": 7.997721127181441e-07, "loss": 0.09500090777873993, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.49022674560547, "rewards/margins": 40.473846435546875, "rewards/rejected": 7.008583068847656, "step": 4791 }, { "epoch": 2.4803312629399588, "grad_norm": 1.0641101598739624, "learning_rate": 7.98225063353022e-07, "loss": 0.0753326267004013, "rewards/accuracies": 0.953125, "rewards/chosen": 58.29681396484375, "rewards/margins": 47.8912353515625, "rewards/rejected": 10.427248001098633, "step": 4792 }, { "epoch": 2.480848861283644, "grad_norm": 1.2637617588043213, "learning_rate": 7.966793819060304e-07, "loss": 0.05948171392083168, "rewards/accuracies": 0.96875, "rewards/chosen": 55.28788757324219, "rewards/margins": 47.53883361816406, "rewards/rejected": 7.768291473388672, "step": 4793 }, { "epoch": 2.481366459627329, "grad_norm": 1.327031135559082, "learning_rate": 7.95135068880376e-07, "loss": 0.09093406051397324, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.90791702270508, "rewards/margins": 42.74702835083008, "rewards/rejected": 10.177606582641602, "step": 4794 }, { "epoch": 2.4818840579710146, "grad_norm": 1.3490368127822876, "learning_rate": 7.935921247788226e-07, "loss": 0.09700280427932739, "rewards/accuracies": 0.96875, "rewards/chosen": 53.474571228027344, "rewards/margins": 44.534210205078125, "rewards/rejected": 8.938676834106445, "step": 4795 }, { "epoch": 2.4824016563147, "grad_norm": 0.5955081582069397, "learning_rate": 7.920505501036857e-07, "loss": 0.044623054563999176, "rewards/accuracies": 0.984375, "rewards/chosen": 55.404571533203125, "rewards/margins": 47.008026123046875, "rewards/rejected": 8.388452529907227, "step": 4796 }, { "epoch": 2.482919254658385, "grad_norm": 1.3226999044418335, "learning_rate": 7.905103453568364e-07, "loss": 0.10123036801815033, "rewards/accuracies": 0.953125, "rewards/chosen": 50.48179626464844, "rewards/margins": 43.147705078125, "rewards/rejected": 7.350661277770996, "step": 4797 }, { "epoch": 2.4834368530020705, "grad_norm": 0.736100435256958, "learning_rate": 7.889715110396995e-07, "loss": 0.09563884139060974, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.03995895385742, "rewards/margins": 44.898162841796875, "rewards/rejected": 10.132492065429688, "step": 4798 }, { "epoch": 2.4839544513457557, "grad_norm": 0.8874202966690063, "learning_rate": 7.874340476532532e-07, "loss": 0.11421366035938263, "rewards/accuracies": 0.9296875, "rewards/chosen": 46.856571197509766, "rewards/margins": 38.48577880859375, "rewards/rejected": 8.373823165893555, "step": 4799 }, { "epoch": 2.4844720496894412, "grad_norm": 0.924691915512085, "learning_rate": 7.858979556980306e-07, "loss": 0.1262207329273224, "rewards/accuracies": 0.953125, "rewards/chosen": 51.02967071533203, "rewards/margins": 41.11334228515625, "rewards/rejected": 9.923975944519043, "step": 4800 }, { "epoch": 2.4849896480331264, "grad_norm": 0.871709406375885, "learning_rate": 7.843632356741171e-07, "loss": 0.06378525495529175, "rewards/accuracies": 0.96875, "rewards/chosen": 60.66555404663086, "rewards/margins": 49.76611328125, "rewards/rejected": 10.894149780273438, "step": 4801 }, { "epoch": 2.4855072463768115, "grad_norm": 0.7320593595504761, "learning_rate": 7.828298880811513e-07, "loss": 0.08593486994504929, "rewards/accuracies": 0.96875, "rewards/chosen": 52.162872314453125, "rewards/margins": 43.719268798828125, "rewards/rejected": 8.441781997680664, "step": 4802 }, { "epoch": 2.4860248447204967, "grad_norm": 0.6683508157730103, "learning_rate": 7.812979134183263e-07, "loss": 0.06573523581027985, "rewards/accuracies": 0.96875, "rewards/chosen": 57.484169006347656, "rewards/margins": 47.28345489501953, "rewards/rejected": 10.20317554473877, "step": 4803 }, { "epoch": 2.4865424430641823, "grad_norm": 0.9128392338752747, "learning_rate": 7.797673121843874e-07, "loss": 0.08933135867118835, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.62641143798828, "rewards/margins": 44.726959228515625, "rewards/rejected": 9.889738082885742, "step": 4804 }, { "epoch": 2.4870600414078674, "grad_norm": 1.993431568145752, "learning_rate": 7.782380848776328e-07, "loss": 0.07722637057304382, "rewards/accuracies": 0.96875, "rewards/chosen": 52.54641342163086, "rewards/margins": 44.730865478515625, "rewards/rejected": 7.814275741577148, "step": 4805 }, { "epoch": 2.487577639751553, "grad_norm": 1.0062739849090576, "learning_rate": 7.767102319959136e-07, "loss": 0.08551563322544098, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.23677062988281, "rewards/margins": 49.41734313964844, "rewards/rejected": 9.805719375610352, "step": 4806 }, { "epoch": 2.488095238095238, "grad_norm": 0.8185334205627441, "learning_rate": 7.751837540366341e-07, "loss": 0.08741578459739685, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.24525451660156, "rewards/margins": 42.473419189453125, "rewards/rejected": 8.766485214233398, "step": 4807 }, { "epoch": 2.4886128364389233, "grad_norm": 0.968538761138916, "learning_rate": 7.736586514967464e-07, "loss": 0.0866064801812172, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.96440124511719, "rewards/margins": 42.324432373046875, "rewards/rejected": 8.646827697753906, "step": 4808 }, { "epoch": 2.489130434782609, "grad_norm": 0.7311749458312988, "learning_rate": 7.721349248727633e-07, "loss": 0.08393418788909912, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.910003662109375, "rewards/margins": 46.76287841796875, "rewards/rejected": 9.149810791015625, "step": 4809 }, { "epoch": 2.489648033126294, "grad_norm": 0.7862288951873779, "learning_rate": 7.706125746607445e-07, "loss": 0.08338772505521774, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.27946853637695, "rewards/margins": 44.94378662109375, "rewards/rejected": 8.331818580627441, "step": 4810 }, { "epoch": 2.490165631469979, "grad_norm": 0.667305052280426, "learning_rate": 7.690916013562993e-07, "loss": 0.05240616574883461, "rewards/accuracies": 0.984375, "rewards/chosen": 51.69718551635742, "rewards/margins": 44.045654296875, "rewards/rejected": 7.6600341796875, "step": 4811 }, { "epoch": 2.4906832298136647, "grad_norm": 0.7936612367630005, "learning_rate": 7.675720054545915e-07, "loss": 0.09088961780071259, "rewards/accuracies": 0.9375, "rewards/chosen": 50.90653610229492, "rewards/margins": 42.99671936035156, "rewards/rejected": 7.911532878875732, "step": 4812 }, { "epoch": 2.49120082815735, "grad_norm": 1.2229984998703003, "learning_rate": 7.660537874503398e-07, "loss": 0.11435199528932571, "rewards/accuracies": 0.9375, "rewards/chosen": 54.41145324707031, "rewards/margins": 45.671417236328125, "rewards/rejected": 8.742755889892578, "step": 4813 }, { "epoch": 2.491718426501035, "grad_norm": 1.100987434387207, "learning_rate": 7.645369478378101e-07, "loss": 0.0881553590297699, "rewards/accuracies": 0.953125, "rewards/chosen": 49.65680694580078, "rewards/margins": 40.98985290527344, "rewards/rejected": 8.670578002929688, "step": 4814 }, { "epoch": 2.4922360248447206, "grad_norm": 0.8993959426879883, "learning_rate": 7.630214871108182e-07, "loss": 0.07409980148077011, "rewards/accuracies": 0.96875, "rewards/chosen": 50.779541015625, "rewards/margins": 43.73088073730469, "rewards/rejected": 7.033817291259766, "step": 4815 }, { "epoch": 2.4927536231884058, "grad_norm": 0.9548197984695435, "learning_rate": 7.615074057627342e-07, "loss": 0.09763941168785095, "rewards/accuracies": 0.953125, "rewards/chosen": 54.69990539550781, "rewards/margins": 44.68646240234375, "rewards/rejected": 10.014999389648438, "step": 4816 }, { "epoch": 2.493271221532091, "grad_norm": 1.154621958732605, "learning_rate": 7.599947042864803e-07, "loss": 0.08371317386627197, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.139801025390625, "rewards/margins": 45.876487731933594, "rewards/rejected": 9.247058868408203, "step": 4817 }, { "epoch": 2.4937888198757765, "grad_norm": 0.48748013377189636, "learning_rate": 7.584833831745248e-07, "loss": 0.03151380270719528, "rewards/accuracies": 1.0, "rewards/chosen": 50.17927551269531, "rewards/margins": 41.23822021484375, "rewards/rejected": 8.939798355102539, "step": 4818 }, { "epoch": 2.4943064182194616, "grad_norm": 0.39129626750946045, "learning_rate": 7.569734429188902e-07, "loss": 0.04561984911561012, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.61157989501953, "rewards/margins": 41.70002746582031, "rewards/rejected": 8.898322105407715, "step": 4819 }, { "epoch": 2.494824016563147, "grad_norm": 1.0218405723571777, "learning_rate": 7.554648840111483e-07, "loss": 0.1191808432340622, "rewards/accuracies": 0.9296875, "rewards/chosen": 47.646278381347656, "rewards/margins": 38.433502197265625, "rewards/rejected": 9.213327407836914, "step": 4820 }, { "epoch": 2.4953416149068324, "grad_norm": 0.5895443558692932, "learning_rate": 7.539577069424215e-07, "loss": 0.06478384137153625, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.27943420410156, "rewards/margins": 47.189727783203125, "rewards/rejected": 8.091753005981445, "step": 4821 }, { "epoch": 2.4958592132505175, "grad_norm": 1.3344205617904663, "learning_rate": 7.524519122033819e-07, "loss": 0.10474241524934769, "rewards/accuracies": 0.953125, "rewards/chosen": 52.965572357177734, "rewards/margins": 43.474609375, "rewards/rejected": 9.497138977050781, "step": 4822 }, { "epoch": 2.496376811594203, "grad_norm": 0.8905105590820312, "learning_rate": 7.50947500284252e-07, "loss": 0.07445694506168365, "rewards/accuracies": 0.96875, "rewards/chosen": 53.032012939453125, "rewards/margins": 43.258056640625, "rewards/rejected": 9.788342475891113, "step": 4823 }, { "epoch": 2.4968944099378882, "grad_norm": 0.693666934967041, "learning_rate": 7.49444471674804e-07, "loss": 0.039569348096847534, "rewards/accuracies": 1.0, "rewards/chosen": 52.03564453125, "rewards/margins": 43.08062744140625, "rewards/rejected": 8.959497451782227, "step": 4824 }, { "epoch": 2.4974120082815734, "grad_norm": 1.0508772134780884, "learning_rate": 7.479428268643596e-07, "loss": 0.07189850509166718, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.748809814453125, "rewards/margins": 41.778228759765625, "rewards/rejected": 8.98442268371582, "step": 4825 }, { "epoch": 2.497929606625259, "grad_norm": 0.8885859251022339, "learning_rate": 7.464425663417907e-07, "loss": 0.09767121076583862, "rewards/accuracies": 0.9375, "rewards/chosen": 54.427734375, "rewards/margins": 42.2305908203125, "rewards/rejected": 12.196075439453125, "step": 4826 }, { "epoch": 2.498447204968944, "grad_norm": 0.9717844128608704, "learning_rate": 7.449436905955165e-07, "loss": 0.083661749958992, "rewards/accuracies": 0.96875, "rewards/chosen": 51.33160400390625, "rewards/margins": 42.259613037109375, "rewards/rejected": 9.074996948242188, "step": 4827 }, { "epoch": 2.4989648033126293, "grad_norm": 1.6720247268676758, "learning_rate": 7.434462001135084e-07, "loss": 0.12768104672431946, "rewards/accuracies": 0.921875, "rewards/chosen": 50.70365905761719, "rewards/margins": 41.62471008300781, "rewards/rejected": 9.073297500610352, "step": 4828 }, { "epoch": 2.499482401656315, "grad_norm": 1.5174061059951782, "learning_rate": 7.419500953832836e-07, "loss": 0.11079344153404236, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.573516845703125, "rewards/margins": 43.46245574951172, "rewards/rejected": 10.114845275878906, "step": 4829 }, { "epoch": 2.5, "grad_norm": 1.581363320350647, "learning_rate": 7.404553768919126e-07, "loss": 0.09012498706579208, "rewards/accuracies": 0.953125, "rewards/chosen": 55.97247314453125, "rewards/margins": 46.003387451171875, "rewards/rejected": 9.979751586914062, "step": 4830 }, { "epoch": 2.500517598343685, "grad_norm": 1.1265435218811035, "learning_rate": 7.389620451260077e-07, "loss": 0.12150794267654419, "rewards/accuracies": 0.9140625, "rewards/chosen": 51.633056640625, "rewards/margins": 42.32032775878906, "rewards/rejected": 9.289528846740723, "step": 4831 }, { "epoch": 2.5010351966873707, "grad_norm": 1.3722479343414307, "learning_rate": 7.374701005717366e-07, "loss": 0.10197576880455017, "rewards/accuracies": 0.953125, "rewards/chosen": 57.23780059814453, "rewards/margins": 46.78254699707031, "rewards/rejected": 10.474517822265625, "step": 4832 }, { "epoch": 2.501552795031056, "grad_norm": 0.6997268795967102, "learning_rate": 7.359795437148132e-07, "loss": 0.06991112232208252, "rewards/accuracies": 0.96875, "rewards/chosen": 52.661399841308594, "rewards/margins": 39.995574951171875, "rewards/rejected": 12.684001922607422, "step": 4833 }, { "epoch": 2.5020703933747415, "grad_norm": 0.7395303845405579, "learning_rate": 7.344903750404958e-07, "loss": 0.06233572959899902, "rewards/accuracies": 0.96875, "rewards/chosen": 50.753543853759766, "rewards/margins": 40.5933723449707, "rewards/rejected": 10.17540168762207, "step": 4834 }, { "epoch": 2.5025879917184266, "grad_norm": 2.3551108837127686, "learning_rate": 7.330025950335972e-07, "loss": 0.08056125044822693, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.16810989379883, "rewards/margins": 42.23968505859375, "rewards/rejected": 9.920921325683594, "step": 4835 }, { "epoch": 2.5031055900621118, "grad_norm": 0.7348962426185608, "learning_rate": 7.31516204178474e-07, "loss": 0.048076167702674866, "rewards/accuracies": 0.984375, "rewards/chosen": 59.49214172363281, "rewards/margins": 47.01739501953125, "rewards/rejected": 12.476776123046875, "step": 4836 }, { "epoch": 2.503623188405797, "grad_norm": 0.5985212326049805, "learning_rate": 7.300312029590318e-07, "loss": 0.04867241531610489, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.442874908447266, "rewards/margins": 47.151153564453125, "rewards/rejected": 12.28363037109375, "step": 4837 }, { "epoch": 2.5041407867494825, "grad_norm": 0.6662135124206543, "learning_rate": 7.285475918587209e-07, "loss": 0.0666661411523819, "rewards/accuracies": 0.96875, "rewards/chosen": 50.80156707763672, "rewards/margins": 40.74137878417969, "rewards/rejected": 10.058501243591309, "step": 4838 }, { "epoch": 2.5046583850931676, "grad_norm": 1.693476915359497, "learning_rate": 7.270653713605447e-07, "loss": 0.1191650778055191, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.79475402832031, "rewards/margins": 42.675567626953125, "rewards/rejected": 13.115028381347656, "step": 4839 }, { "epoch": 2.505175983436853, "grad_norm": 1.3895634412765503, "learning_rate": 7.255845419470503e-07, "loss": 0.11820413172245026, "rewards/accuracies": 0.921875, "rewards/chosen": 50.54698181152344, "rewards/margins": 39.553741455078125, "rewards/rejected": 10.994438171386719, "step": 4840 }, { "epoch": 2.5056935817805384, "grad_norm": 0.5553634762763977, "learning_rate": 7.241051041003311e-07, "loss": 0.045053984969854355, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.23162078857422, "rewards/margins": 45.51066589355469, "rewards/rejected": 11.71811294555664, "step": 4841 }, { "epoch": 2.5062111801242235, "grad_norm": 1.5130295753479004, "learning_rate": 7.226270583020284e-07, "loss": 0.10935163497924805, "rewards/accuracies": 0.953125, "rewards/chosen": 54.007080078125, "rewards/margins": 45.44239807128906, "rewards/rejected": 8.570173263549805, "step": 4842 }, { "epoch": 2.5067287784679086, "grad_norm": 1.360253930091858, "learning_rate": 7.211504050333335e-07, "loss": 0.13263177871704102, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.96115493774414, "rewards/margins": 46.26171112060547, "rewards/rejected": 12.705968856811523, "step": 4843 }, { "epoch": 2.5072463768115942, "grad_norm": 0.4941796064376831, "learning_rate": 7.196751447749789e-07, "loss": 0.03522463142871857, "rewards/accuracies": 0.984375, "rewards/chosen": 56.33960723876953, "rewards/margins": 45.56561279296875, "rewards/rejected": 10.763052940368652, "step": 4844 }, { "epoch": 2.5077639751552794, "grad_norm": 1.056455373764038, "learning_rate": 7.182012780072467e-07, "loss": 0.07495296001434326, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.67845916748047, "rewards/margins": 44.697601318359375, "rewards/rejected": 7.9737701416015625, "step": 4845 }, { "epoch": 2.508281573498965, "grad_norm": 0.8058255314826965, "learning_rate": 7.167288052099642e-07, "loss": 0.07425746321678162, "rewards/accuracies": 0.96875, "rewards/chosen": 59.060394287109375, "rewards/margins": 47.4969482421875, "rewards/rejected": 11.572608947753906, "step": 4846 }, { "epoch": 2.50879917184265, "grad_norm": 1.9447007179260254, "learning_rate": 7.152577268625083e-07, "loss": 0.0942976176738739, "rewards/accuracies": 0.9375, "rewards/chosen": 53.03712463378906, "rewards/margins": 42.02125549316406, "rewards/rejected": 11.00448226928711, "step": 4847 }, { "epoch": 2.5093167701863353, "grad_norm": 0.8126870393753052, "learning_rate": 7.137880434437966e-07, "loss": 0.07156100869178772, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.34796142578125, "rewards/margins": 41.83148193359375, "rewards/rejected": 8.531585693359375, "step": 4848 }, { "epoch": 2.509834368530021, "grad_norm": 0.7783623337745667, "learning_rate": 7.123197554322958e-07, "loss": 0.06254314631223679, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.7091064453125, "rewards/margins": 48.1644287109375, "rewards/rejected": 10.532554626464844, "step": 4849 }, { "epoch": 2.510351966873706, "grad_norm": 1.9888007640838623, "learning_rate": 7.10852863306018e-07, "loss": 0.17420753836631775, "rewards/accuracies": 0.921875, "rewards/chosen": 49.136112213134766, "rewards/margins": 40.555084228515625, "rewards/rejected": 8.585845947265625, "step": 4850 }, { "epoch": 2.5108695652173916, "grad_norm": 1.122443437576294, "learning_rate": 7.0938736754252e-07, "loss": 0.0740770474076271, "rewards/accuracies": 0.96875, "rewards/chosen": 52.25032043457031, "rewards/margins": 43.29908752441406, "rewards/rejected": 8.95697021484375, "step": 4851 }, { "epoch": 2.5113871635610767, "grad_norm": 0.8517184853553772, "learning_rate": 7.079232686189047e-07, "loss": 0.06589759886264801, "rewards/accuracies": 0.96875, "rewards/chosen": 50.53593444824219, "rewards/margins": 42.4765625, "rewards/rejected": 8.05206298828125, "step": 4852 }, { "epoch": 2.511904761904762, "grad_norm": 1.9328668117523193, "learning_rate": 7.064605670118202e-07, "loss": 0.10839488357305527, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.64231872558594, "rewards/margins": 42.9254150390625, "rewards/rejected": 10.727002143859863, "step": 4853 }, { "epoch": 2.512422360248447, "grad_norm": 1.3679530620574951, "learning_rate": 7.049992631974595e-07, "loss": 0.04833611845970154, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.59980392456055, "rewards/margins": 47.63458251953125, "rewards/rejected": 9.952102661132812, "step": 4854 }, { "epoch": 2.5129399585921326, "grad_norm": 0.956250786781311, "learning_rate": 7.035393576515604e-07, "loss": 0.08521176874637604, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.884239196777344, "rewards/margins": 42.49365234375, "rewards/rejected": 11.396729469299316, "step": 4855 }, { "epoch": 2.5134575569358177, "grad_norm": 0.7323763966560364, "learning_rate": 7.02080850849407e-07, "loss": 0.072889044880867, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.5992317199707, "rewards/margins": 45.238677978515625, "rewards/rejected": 11.367298126220703, "step": 4856 }, { "epoch": 2.5139751552795033, "grad_norm": 1.1716184616088867, "learning_rate": 7.006237432658236e-07, "loss": 0.11609770357608795, "rewards/accuracies": 0.9375, "rewards/chosen": 54.54486846923828, "rewards/margins": 44.47137451171875, "rewards/rejected": 10.071556091308594, "step": 4857 }, { "epoch": 2.5144927536231885, "grad_norm": 0.6959104537963867, "learning_rate": 6.991680353751846e-07, "loss": 0.05086394399404526, "rewards/accuracies": 0.984375, "rewards/chosen": 57.84950256347656, "rewards/margins": 42.55517578125, "rewards/rejected": 15.29928970336914, "step": 4858 }, { "epoch": 2.5150103519668736, "grad_norm": 0.7686744332313538, "learning_rate": 6.977137276514062e-07, "loss": 0.06303077936172485, "rewards/accuracies": 0.96875, "rewards/chosen": 54.905242919921875, "rewards/margins": 44.32257080078125, "rewards/rejected": 10.595983505249023, "step": 4859 }, { "epoch": 2.5155279503105588, "grad_norm": 1.2120742797851562, "learning_rate": 6.962608205679489e-07, "loss": 0.09565898776054382, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.45928955078125, "rewards/margins": 46.580841064453125, "rewards/rejected": 13.875045776367188, "step": 4860 }, { "epoch": 2.5160455486542443, "grad_norm": 1.1274769306182861, "learning_rate": 6.948093145978147e-07, "loss": 0.09480607509613037, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.84413146972656, "rewards/margins": 48.06866455078125, "rewards/rejected": 11.766767501831055, "step": 4861 }, { "epoch": 2.5165631469979295, "grad_norm": 2.0461812019348145, "learning_rate": 6.933592102135539e-07, "loss": 0.059981390833854675, "rewards/accuracies": 0.984375, "rewards/chosen": 68.238037109375, "rewards/margins": 52.28277587890625, "rewards/rejected": 15.967670440673828, "step": 4862 }, { "epoch": 2.517080745341615, "grad_norm": 0.7167750000953674, "learning_rate": 6.919105078872596e-07, "loss": 0.05216805264353752, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.926719665527344, "rewards/margins": 48.70948791503906, "rewards/rejected": 12.208106994628906, "step": 4863 }, { "epoch": 2.5175983436853, "grad_norm": 0.79700767993927, "learning_rate": 6.904632080905638e-07, "loss": 0.055031873285770416, "rewards/accuracies": 0.984375, "rewards/chosen": 54.61503219604492, "rewards/margins": 45.748138427734375, "rewards/rejected": 8.86459732055664, "step": 4864 }, { "epoch": 2.5181159420289854, "grad_norm": 0.6849491596221924, "learning_rate": 6.89017311294648e-07, "loss": 0.04970511049032211, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.03300476074219, "rewards/margins": 46.8399658203125, "rewards/rejected": 13.193707466125488, "step": 4865 }, { "epoch": 2.518633540372671, "grad_norm": 8.088427543640137, "learning_rate": 6.875728179702346e-07, "loss": 0.11401157081127167, "rewards/accuracies": 0.953125, "rewards/chosen": 52.428070068359375, "rewards/margins": 41.1878662109375, "rewards/rejected": 11.24466323852539, "step": 4866 }, { "epoch": 2.519151138716356, "grad_norm": 0.9128353595733643, "learning_rate": 6.861297285875873e-07, "loss": 0.10110640525817871, "rewards/accuracies": 0.9296875, "rewards/chosen": 62.541996002197266, "rewards/margins": 48.40911102294922, "rewards/rejected": 14.131415367126465, "step": 4867 }, { "epoch": 2.5196687370600412, "grad_norm": 1.2762579917907715, "learning_rate": 6.846880436165132e-07, "loss": 0.10840511322021484, "rewards/accuracies": 0.953125, "rewards/chosen": 51.980979919433594, "rewards/margins": 40.57977294921875, "rewards/rejected": 11.404632568359375, "step": 4868 }, { "epoch": 2.520186335403727, "grad_norm": 0.6557672619819641, "learning_rate": 6.832477635263662e-07, "loss": 0.05919565632939339, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.88957977294922, "rewards/margins": 40.26408386230469, "rewards/rejected": 11.626689910888672, "step": 4869 }, { "epoch": 2.520703933747412, "grad_norm": 0.4955047369003296, "learning_rate": 6.818088887860397e-07, "loss": 0.03051505982875824, "rewards/accuracies": 0.9921875, "rewards/chosen": 63.34967041015625, "rewards/margins": 50.350250244140625, "rewards/rejected": 12.999309539794922, "step": 4870 }, { "epoch": 2.521221532091097, "grad_norm": 1.58162260055542, "learning_rate": 6.803714198639677e-07, "loss": 0.10443609952926636, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.15251159667969, "rewards/margins": 45.34254455566406, "rewards/rejected": 10.820416450500488, "step": 4871 }, { "epoch": 2.5217391304347827, "grad_norm": 1.0456550121307373, "learning_rate": 6.789353572281282e-07, "loss": 0.06536607444286346, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.851192474365234, "rewards/margins": 47.83740234375, "rewards/rejected": 12.02261734008789, "step": 4872 }, { "epoch": 2.522256728778468, "grad_norm": 1.0727895498275757, "learning_rate": 6.775007013460449e-07, "loss": 0.09189511835575104, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.73103332519531, "rewards/margins": 45.84027099609375, "rewards/rejected": 10.896003723144531, "step": 4873 }, { "epoch": 2.5227743271221534, "grad_norm": 0.9900140762329102, "learning_rate": 6.760674526847777e-07, "loss": 0.09856632351875305, "rewards/accuracies": 0.953125, "rewards/chosen": 60.405250549316406, "rewards/margins": 49.529449462890625, "rewards/rejected": 10.874181747436523, "step": 4874 }, { "epoch": 2.5232919254658386, "grad_norm": 0.4375120997428894, "learning_rate": 6.746356117109315e-07, "loss": 0.030909564346075058, "rewards/accuracies": 0.984375, "rewards/chosen": 52.26704406738281, "rewards/margins": 41.784942626953125, "rewards/rejected": 10.480636596679688, "step": 4875 }, { "epoch": 2.5238095238095237, "grad_norm": 1.0440727472305298, "learning_rate": 6.732051788906529e-07, "loss": 0.09870167076587677, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.9172477722168, "rewards/margins": 47.12152099609375, "rewards/rejected": 10.790634155273438, "step": 4876 }, { "epoch": 2.524327122153209, "grad_norm": 1.2218875885009766, "learning_rate": 6.717761546896295e-07, "loss": 0.06412489712238312, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.44392395019531, "rewards/margins": 45.351043701171875, "rewards/rejected": 11.091089248657227, "step": 4877 }, { "epoch": 2.5248447204968945, "grad_norm": 0.8854168057441711, "learning_rate": 6.703485395730908e-07, "loss": 0.06142760068178177, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.0595703125, "rewards/margins": 45.83984375, "rewards/rejected": 11.227195739746094, "step": 4878 }, { "epoch": 2.5253623188405796, "grad_norm": 1.6694185733795166, "learning_rate": 6.689223340058065e-07, "loss": 0.11480562388896942, "rewards/accuracies": 0.953125, "rewards/chosen": 58.792640686035156, "rewards/margins": 46.559783935546875, "rewards/rejected": 12.24709701538086, "step": 4879 }, { "epoch": 2.525879917184265, "grad_norm": 2.5425870418548584, "learning_rate": 6.674975384520882e-07, "loss": 0.10754317790269852, "rewards/accuracies": 0.96875, "rewards/chosen": 58.35601806640625, "rewards/margins": 47.205810546875, "rewards/rejected": 11.156506538391113, "step": 4880 }, { "epoch": 2.5263975155279503, "grad_norm": 8.5125150680542, "learning_rate": 6.660741533757891e-07, "loss": 0.08285723626613617, "rewards/accuracies": 0.984375, "rewards/chosen": 54.40583801269531, "rewards/margins": 44.182373046875, "rewards/rejected": 10.216861724853516, "step": 4881 }, { "epoch": 2.5269151138716355, "grad_norm": 1.2690579891204834, "learning_rate": 6.646521792403016e-07, "loss": 0.12680913507938385, "rewards/accuracies": 0.953125, "rewards/chosen": 55.18729782104492, "rewards/margins": 43.2978515625, "rewards/rejected": 11.88715934753418, "step": 4882 }, { "epoch": 2.527432712215321, "grad_norm": 0.5438895225524902, "learning_rate": 6.6323161650856e-07, "loss": 0.04183728247880936, "rewards/accuracies": 0.984375, "rewards/chosen": 59.840118408203125, "rewards/margins": 48.24212646484375, "rewards/rejected": 11.606624603271484, "step": 4883 }, { "epoch": 2.527950310559006, "grad_norm": 1.0158989429473877, "learning_rate": 6.618124656430386e-07, "loss": 0.09815576672554016, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.71144104003906, "rewards/margins": 46.023956298828125, "rewards/rejected": 10.69192886352539, "step": 4884 }, { "epoch": 2.5284679089026914, "grad_norm": 0.9010587334632874, "learning_rate": 6.603947271057526e-07, "loss": 0.0652051568031311, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.77018737792969, "rewards/margins": 46.522979736328125, "rewards/rejected": 10.245246887207031, "step": 4885 }, { "epoch": 2.528985507246377, "grad_norm": 4.105987548828125, "learning_rate": 6.589784013582578e-07, "loss": 0.11732052266597748, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.04396057128906, "rewards/margins": 43.804656982421875, "rewards/rejected": 10.231254577636719, "step": 4886 }, { "epoch": 2.529503105590062, "grad_norm": 1.2553428411483765, "learning_rate": 6.575634888616461e-07, "loss": 0.0699048712849617, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.17604064941406, "rewards/margins": 45.26318359375, "rewards/rejected": 11.919605255126953, "step": 4887 }, { "epoch": 2.5300207039337472, "grad_norm": 0.9120792150497437, "learning_rate": 6.561499900765556e-07, "loss": 0.07754813879728317, "rewards/accuracies": 0.953125, "rewards/chosen": 57.22023010253906, "rewards/margins": 42.449249267578125, "rewards/rejected": 14.76236343383789, "step": 4888 }, { "epoch": 2.530538302277433, "grad_norm": 2.2326183319091797, "learning_rate": 6.547379054631608e-07, "loss": 0.12110692262649536, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.45579528808594, "rewards/margins": 45.083892822265625, "rewards/rejected": 12.372234344482422, "step": 4889 }, { "epoch": 2.531055900621118, "grad_norm": 1.6139944791793823, "learning_rate": 6.533272354811748e-07, "loss": 0.1201934665441513, "rewards/accuracies": 0.9375, "rewards/chosen": 54.8225212097168, "rewards/margins": 42.84297180175781, "rewards/rejected": 11.989697456359863, "step": 4890 }, { "epoch": 2.5315734989648035, "grad_norm": 1.2266724109649658, "learning_rate": 6.519179805898501e-07, "loss": 0.08942659944295883, "rewards/accuracies": 0.953125, "rewards/chosen": 56.873985290527344, "rewards/margins": 44.64642333984375, "rewards/rejected": 12.218887329101562, "step": 4891 }, { "epoch": 2.5320910973084887, "grad_norm": 0.6753802299499512, "learning_rate": 6.505101412479831e-07, "loss": 0.04457978531718254, "rewards/accuracies": 0.984375, "rewards/chosen": 56.697288513183594, "rewards/margins": 44.21014404296875, "rewards/rejected": 12.478919982910156, "step": 4892 }, { "epoch": 2.532608695652174, "grad_norm": 4.478215217590332, "learning_rate": 6.491037179139048e-07, "loss": 0.055491961538791656, "rewards/accuracies": 0.96875, "rewards/chosen": 53.09571838378906, "rewards/margins": 42.1136474609375, "rewards/rejected": 10.975017547607422, "step": 4893 }, { "epoch": 2.533126293995859, "grad_norm": 1.0394731760025024, "learning_rate": 6.476987110454847e-07, "loss": 0.0844026505947113, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.47742462158203, "rewards/margins": 42.750877380371094, "rewards/rejected": 9.718403816223145, "step": 4894 }, { "epoch": 2.5336438923395446, "grad_norm": 1.1344166994094849, "learning_rate": 6.46295121100135e-07, "loss": 0.112566277384758, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.07926940917969, "rewards/margins": 44.84538269042969, "rewards/rejected": 9.234874725341797, "step": 4895 }, { "epoch": 2.5341614906832297, "grad_norm": 1.4511953592300415, "learning_rate": 6.448929485348048e-07, "loss": 0.08114677667617798, "rewards/accuracies": 0.96875, "rewards/chosen": 55.67018508911133, "rewards/margins": 43.9871826171875, "rewards/rejected": 11.676193237304688, "step": 4896 }, { "epoch": 2.5346790890269153, "grad_norm": 0.6228660345077515, "learning_rate": 6.434921938059796e-07, "loss": 0.04273171350359917, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.3878059387207, "rewards/margins": 49.22727966308594, "rewards/rejected": 11.156482696533203, "step": 4897 }, { "epoch": 2.5351966873706004, "grad_norm": 1.175340175628662, "learning_rate": 6.42092857369685e-07, "loss": 0.1400514543056488, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.00664520263672, "rewards/margins": 42.99725341796875, "rewards/rejected": 11.005727767944336, "step": 4898 }, { "epoch": 2.5357142857142856, "grad_norm": 2.5143070220947266, "learning_rate": 6.406949396814887e-07, "loss": 0.1385289430618286, "rewards/accuracies": 0.921875, "rewards/chosen": 53.44049072265625, "rewards/margins": 44.26526641845703, "rewards/rejected": 9.189009666442871, "step": 4899 }, { "epoch": 2.536231884057971, "grad_norm": 1.2921267747879028, "learning_rate": 6.392984411964892e-07, "loss": 0.044555969536304474, "rewards/accuracies": 0.984375, "rewards/chosen": 58.83531188964844, "rewards/margins": 47.131103515625, "rewards/rejected": 11.714353561401367, "step": 4900 }, { "epoch": 2.5367494824016563, "grad_norm": 2.772484302520752, "learning_rate": 6.379033623693276e-07, "loss": 0.12144555151462555, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.292633056640625, "rewards/margins": 46.5430908203125, "rewards/rejected": 10.7465181350708, "step": 4901 }, { "epoch": 2.5372670807453415, "grad_norm": 1.6943161487579346, "learning_rate": 6.365097036541823e-07, "loss": 0.1320418417453766, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.80180740356445, "rewards/margins": 42.84967041015625, "rewards/rejected": 10.936415672302246, "step": 4902 }, { "epoch": 2.537784679089027, "grad_norm": 0.9281001091003418, "learning_rate": 6.351174655047687e-07, "loss": 0.11132968962192535, "rewards/accuracies": 0.9296875, "rewards/chosen": 61.272300720214844, "rewards/margins": 51.74818420410156, "rewards/rejected": 9.543083190917969, "step": 4903 }, { "epoch": 2.538302277432712, "grad_norm": 1.4618539810180664, "learning_rate": 6.337266483743398e-07, "loss": 0.12509390711784363, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.35285186767578, "rewards/margins": 46.193603515625, "rewards/rejected": 12.170219421386719, "step": 4904 }, { "epoch": 2.5388198757763973, "grad_norm": 18.723464965820312, "learning_rate": 6.323372527156868e-07, "loss": 0.08269921690225601, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.457305908203125, "rewards/margins": 46.624237060546875, "rewards/rejected": 11.827465057373047, "step": 4905 }, { "epoch": 2.539337474120083, "grad_norm": 0.8593428730964661, "learning_rate": 6.309492789811372e-07, "loss": 0.09373865276575089, "rewards/accuracies": 0.953125, "rewards/chosen": 52.27833938598633, "rewards/margins": 43.02696228027344, "rewards/rejected": 9.249994277954102, "step": 4906 }, { "epoch": 2.539855072463768, "grad_norm": 1.2931876182556152, "learning_rate": 6.295627276225552e-07, "loss": 0.06845806539058685, "rewards/accuracies": 0.96875, "rewards/chosen": 53.50582504272461, "rewards/margins": 42.0037841796875, "rewards/rejected": 11.503700256347656, "step": 4907 }, { "epoch": 2.5403726708074537, "grad_norm": 0.8845772743225098, "learning_rate": 6.281775990913435e-07, "loss": 0.10749079287052155, "rewards/accuracies": 0.9375, "rewards/chosen": 57.67461395263672, "rewards/margins": 43.51708984375, "rewards/rejected": 14.150935173034668, "step": 4908 }, { "epoch": 2.540890269151139, "grad_norm": 0.7808945178985596, "learning_rate": 6.2679389383844e-07, "loss": 0.06966975331306458, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.956451416015625, "rewards/margins": 41.967620849609375, "rewards/rejected": 10.974651336669922, "step": 4909 }, { "epoch": 2.541407867494824, "grad_norm": 0.9102622866630554, "learning_rate": 6.2541161231432e-07, "loss": 0.08432643115520477, "rewards/accuracies": 0.96875, "rewards/chosen": 58.541290283203125, "rewards/margins": 47.455078125, "rewards/rejected": 11.085461616516113, "step": 4910 }, { "epoch": 2.541925465838509, "grad_norm": 0.7892379760742188, "learning_rate": 6.240307549689961e-07, "loss": 0.04692453518509865, "rewards/accuracies": 0.9921875, "rewards/chosen": 56.273651123046875, "rewards/margins": 45.35211181640625, "rewards/rejected": 10.93355941772461, "step": 4911 }, { "epoch": 2.5424430641821947, "grad_norm": 0.6489058136940002, "learning_rate": 6.226513222520165e-07, "loss": 0.0594194270670414, "rewards/accuracies": 0.96875, "rewards/chosen": 61.64961242675781, "rewards/margins": 48.18878173828125, "rewards/rejected": 13.454058647155762, "step": 4912 }, { "epoch": 2.54296066252588, "grad_norm": 0.8334410786628723, "learning_rate": 6.212733146124622e-07, "loss": 0.06192094832658768, "rewards/accuracies": 0.984375, "rewards/chosen": 49.88866424560547, "rewards/margins": 41.58612060546875, "rewards/rejected": 8.297595977783203, "step": 4913 }, { "epoch": 2.5434782608695654, "grad_norm": 2.673248767852783, "learning_rate": 6.198967324989575e-07, "loss": 0.08931013196706772, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.28615951538086, "rewards/margins": 44.71470642089844, "rewards/rejected": 10.583251953125, "step": 4914 }, { "epoch": 2.5439958592132506, "grad_norm": 1.061981439590454, "learning_rate": 6.18521576359658e-07, "loss": 0.07122756540775299, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.59246826171875, "rewards/margins": 46.317413330078125, "rewards/rejected": 8.289634704589844, "step": 4915 }, { "epoch": 2.5445134575569357, "grad_norm": 0.5355165600776672, "learning_rate": 6.171478466422536e-07, "loss": 0.03919902816414833, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.228065490722656, "rewards/margins": 44.045257568359375, "rewards/rejected": 13.173980712890625, "step": 4916 }, { "epoch": 2.545031055900621, "grad_norm": 0.6101858615875244, "learning_rate": 6.157755437939722e-07, "loss": 0.04831118881702423, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.369667053222656, "rewards/margins": 49.21337890625, "rewards/rejected": 10.1492919921875, "step": 4917 }, { "epoch": 2.5455486542443064, "grad_norm": 1.1918357610702515, "learning_rate": 6.14404668261579e-07, "loss": 0.12244884669780731, "rewards/accuracies": 0.9296875, "rewards/chosen": 60.34822082519531, "rewards/margins": 46.360626220703125, "rewards/rejected": 13.989501953125, "step": 4918 }, { "epoch": 2.5460662525879916, "grad_norm": 0.971567690372467, "learning_rate": 6.13035220491372e-07, "loss": 0.08517113327980042, "rewards/accuracies": 0.96875, "rewards/chosen": 57.50607681274414, "rewards/margins": 45.802032470703125, "rewards/rejected": 11.694347381591797, "step": 4919 }, { "epoch": 2.546583850931677, "grad_norm": 0.832205593585968, "learning_rate": 6.116672009291835e-07, "loss": 0.08353439718484879, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.898738861083984, "rewards/margins": 41.34858703613281, "rewards/rejected": 12.55630111694336, "step": 4920 }, { "epoch": 2.5471014492753623, "grad_norm": 2.2138712406158447, "learning_rate": 6.103006100203812e-07, "loss": 0.1368836760520935, "rewards/accuracies": 0.90625, "rewards/chosen": 53.776824951171875, "rewards/margins": 41.8900146484375, "rewards/rejected": 11.880378723144531, "step": 4921 }, { "epoch": 2.5476190476190474, "grad_norm": 3.6598470211029053, "learning_rate": 6.089354482098725e-07, "loss": 0.14901573956012726, "rewards/accuracies": 0.9375, "rewards/chosen": 51.98028564453125, "rewards/margins": 40.82832336425781, "rewards/rejected": 11.138534545898438, "step": 4922 }, { "epoch": 2.548136645962733, "grad_norm": 2.6483616828918457, "learning_rate": 6.075717159420919e-07, "loss": 0.08164394646883011, "rewards/accuracies": 0.96875, "rewards/chosen": 59.25325012207031, "rewards/margins": 47.49444580078125, "rewards/rejected": 11.764083862304688, "step": 4923 }, { "epoch": 2.548654244306418, "grad_norm": 0.6893897652626038, "learning_rate": 6.062094136610125e-07, "loss": 0.05358898639678955, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.834266662597656, "rewards/margins": 42.726165771484375, "rewards/rejected": 13.12131118774414, "step": 4924 }, { "epoch": 2.5491718426501038, "grad_norm": 0.4767284393310547, "learning_rate": 6.048485418101452e-07, "loss": 0.04820441082119942, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.97776412963867, "rewards/margins": 47.4852294921875, "rewards/rejected": 11.491459846496582, "step": 4925 }, { "epoch": 2.549689440993789, "grad_norm": 0.6919426918029785, "learning_rate": 6.034891008325283e-07, "loss": 0.08092017471790314, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.7064094543457, "rewards/margins": 41.3992919921875, "rewards/rejected": 11.29888916015625, "step": 4926 }, { "epoch": 2.550207039337474, "grad_norm": 1.082344651222229, "learning_rate": 6.021310911707379e-07, "loss": 0.11942248046398163, "rewards/accuracies": 0.921875, "rewards/chosen": 55.88059997558594, "rewards/margins": 44.844635009765625, "rewards/rejected": 11.026159286499023, "step": 4927 }, { "epoch": 2.550724637681159, "grad_norm": 1.0308037996292114, "learning_rate": 6.007745132668841e-07, "loss": 0.09360374510288239, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.94841384887695, "rewards/margins": 41.081390380859375, "rewards/rejected": 8.861579895019531, "step": 4928 }, { "epoch": 2.551242236024845, "grad_norm": 0.8985560536384583, "learning_rate": 5.994193675626125e-07, "loss": 0.06402616947889328, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.59144592285156, "rewards/margins": 48.639007568359375, "rewards/rejected": 11.9559326171875, "step": 4929 }, { "epoch": 2.55175983436853, "grad_norm": 0.867128312587738, "learning_rate": 5.980656544990987e-07, "loss": 0.06388253718614578, "rewards/accuracies": 0.984375, "rewards/chosen": 53.28199005126953, "rewards/margins": 42.71270751953125, "rewards/rejected": 10.554214477539062, "step": 4930 }, { "epoch": 2.5522774327122155, "grad_norm": 0.7426834106445312, "learning_rate": 5.967133745170533e-07, "loss": 0.0634755939245224, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.934261322021484, "rewards/margins": 44.448211669921875, "rewards/rejected": 10.474044799804688, "step": 4931 }, { "epoch": 2.5527950310559007, "grad_norm": 1.0900532007217407, "learning_rate": 5.953625280567222e-07, "loss": 0.0796586275100708, "rewards/accuracies": 0.953125, "rewards/chosen": 53.60728454589844, "rewards/margins": 42.325469970703125, "rewards/rejected": 11.281993865966797, "step": 4932 }, { "epoch": 2.553312629399586, "grad_norm": 0.534216582775116, "learning_rate": 5.940131155578821e-07, "loss": 0.05185554921627045, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.86387634277344, "rewards/margins": 46.78424072265625, "rewards/rejected": 11.0728759765625, "step": 4933 }, { "epoch": 2.553830227743271, "grad_norm": 2.8512003421783447, "learning_rate": 5.926651374598447e-07, "loss": 0.12385480105876923, "rewards/accuracies": 0.921875, "rewards/chosen": 49.50201416015625, "rewards/margins": 41.589752197265625, "rewards/rejected": 7.912710189819336, "step": 4934 }, { "epoch": 2.5543478260869565, "grad_norm": 0.9574106335639954, "learning_rate": 5.913185942014537e-07, "loss": 0.0727967917919159, "rewards/accuracies": 0.96875, "rewards/chosen": 57.257076263427734, "rewards/margins": 44.59747314453125, "rewards/rejected": 12.660663604736328, "step": 4935 }, { "epoch": 2.5548654244306417, "grad_norm": 1.0226503610610962, "learning_rate": 5.899734862210865e-07, "loss": 0.09524855762720108, "rewards/accuracies": 0.953125, "rewards/chosen": 51.69091796875, "rewards/margins": 41.30596923828125, "rewards/rejected": 10.380504608154297, "step": 4936 }, { "epoch": 2.5553830227743273, "grad_norm": 0.686775267124176, "learning_rate": 5.886298139566515e-07, "loss": 0.05737032741308212, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.744136810302734, "rewards/margins": 46.7017822265625, "rewards/rejected": 11.050575256347656, "step": 4937 }, { "epoch": 2.5559006211180124, "grad_norm": 1.0081700086593628, "learning_rate": 5.872875778455933e-07, "loss": 0.04848434031009674, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.7911376953125, "rewards/margins": 46.434112548828125, "rewards/rejected": 11.361867904663086, "step": 4938 }, { "epoch": 2.5564182194616976, "grad_norm": 1.146132230758667, "learning_rate": 5.859467783248824e-07, "loss": 0.09161689877510071, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.588191986083984, "rewards/margins": 43.469268798828125, "rewards/rejected": 13.109474182128906, "step": 4939 }, { "epoch": 2.556935817805383, "grad_norm": 1.1408230066299438, "learning_rate": 5.846074158310294e-07, "loss": 0.12292198836803436, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.10118103027344, "rewards/margins": 40.27537536621094, "rewards/rejected": 8.839553833007812, "step": 4940 }, { "epoch": 2.5574534161490683, "grad_norm": 0.961097776889801, "learning_rate": 5.832694908000724e-07, "loss": 0.10025715827941895, "rewards/accuracies": 0.9375, "rewards/chosen": 53.43393325805664, "rewards/margins": 44.707000732421875, "rewards/rejected": 8.728858947753906, "step": 4941 }, { "epoch": 2.557971014492754, "grad_norm": 0.6720505356788635, "learning_rate": 5.819330036675829e-07, "loss": 0.07813140004873276, "rewards/accuracies": 0.9609375, "rewards/chosen": 64.10791015625, "rewards/margins": 50.9896240234375, "rewards/rejected": 13.103964805603027, "step": 4942 }, { "epoch": 2.558488612836439, "grad_norm": 1.203575611114502, "learning_rate": 5.805979548686619e-07, "loss": 0.12905411422252655, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.010581970214844, "rewards/margins": 44.434226989746094, "rewards/rejected": 9.577367782592773, "step": 4943 }, { "epoch": 2.559006211180124, "grad_norm": 0.48236238956451416, "learning_rate": 5.792643448379464e-07, "loss": 0.03465265780687332, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.34111785888672, "rewards/margins": 49.33854675292969, "rewards/rejected": 11.004226684570312, "step": 4944 }, { "epoch": 2.5595238095238093, "grad_norm": 1.1999013423919678, "learning_rate": 5.779321740096034e-07, "loss": 0.09537278115749359, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.355194091796875, "rewards/margins": 43.303863525390625, "rewards/rejected": 12.052597045898438, "step": 4945 }, { "epoch": 2.560041407867495, "grad_norm": 2.551800489425659, "learning_rate": 5.766014428173278e-07, "loss": 0.08825306594371796, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.725196838378906, "rewards/margins": 42.346099853515625, "rewards/rejected": 13.370433807373047, "step": 4946 }, { "epoch": 2.56055900621118, "grad_norm": 2.9195804595947266, "learning_rate": 5.752721516943493e-07, "loss": 0.1027032658457756, "rewards/accuracies": 0.96875, "rewards/chosen": 56.32086944580078, "rewards/margins": 45.340057373046875, "rewards/rejected": 10.986997604370117, "step": 4947 }, { "epoch": 2.5610766045548656, "grad_norm": 0.9995790123939514, "learning_rate": 5.739443010734319e-07, "loss": 0.08118926733732224, "rewards/accuracies": 0.96875, "rewards/chosen": 58.351593017578125, "rewards/margins": 46.25799560546875, "rewards/rejected": 12.07991886138916, "step": 4948 }, { "epoch": 2.5615942028985508, "grad_norm": 1.1325982809066772, "learning_rate": 5.726178913868624e-07, "loss": 0.0709867775440216, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.5931396484375, "rewards/margins": 44.947021484375, "rewards/rejected": 8.630098342895508, "step": 4949 }, { "epoch": 2.562111801242236, "grad_norm": 0.8732879161834717, "learning_rate": 5.712929230664654e-07, "loss": 0.09077207744121552, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.42562484741211, "rewards/margins": 47.95660400390625, "rewards/rejected": 11.457874298095703, "step": 4950 }, { "epoch": 2.562629399585921, "grad_norm": 1.182949423789978, "learning_rate": 5.699693965435926e-07, "loss": 0.0831129401922226, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.69855499267578, "rewards/margins": 48.01319885253906, "rewards/rejected": 9.675453186035156, "step": 4951 }, { "epoch": 2.5631469979296067, "grad_norm": 1.002474069595337, "learning_rate": 5.686473122491298e-07, "loss": 0.06775012612342834, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.78925323486328, "rewards/margins": 48.335540771484375, "rewards/rejected": 13.4566650390625, "step": 4952 }, { "epoch": 2.563664596273292, "grad_norm": 1.0160784721374512, "learning_rate": 5.673266706134889e-07, "loss": 0.05080723389983177, "rewards/accuracies": 0.9765625, "rewards/chosen": 69.86424255371094, "rewards/margins": 55.152069091796875, "rewards/rejected": 14.706096649169922, "step": 4953 }, { "epoch": 2.5641821946169774, "grad_norm": 1.0623383522033691, "learning_rate": 5.660074720666142e-07, "loss": 0.10262469947338104, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.71751403808594, "rewards/margins": 44.59461975097656, "rewards/rejected": 12.122199058532715, "step": 4954 }, { "epoch": 2.5646997929606625, "grad_norm": 1.4825676679611206, "learning_rate": 5.646897170379828e-07, "loss": 0.09588897228240967, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.160362243652344, "rewards/margins": 45.98261260986328, "rewards/rejected": 11.167320251464844, "step": 4955 }, { "epoch": 2.5652173913043477, "grad_norm": 1.0454025268554688, "learning_rate": 5.63373405956597e-07, "loss": 0.08601070940494537, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.79621505737305, "rewards/margins": 44.07304382324219, "rewards/rejected": 13.73385238647461, "step": 4956 }, { "epoch": 2.5657349896480333, "grad_norm": 1.565261721611023, "learning_rate": 5.62058539250992e-07, "loss": 0.05244629830121994, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.993324279785156, "rewards/margins": 46.3328857421875, "rewards/rejected": 14.663497924804688, "step": 4957 }, { "epoch": 2.5662525879917184, "grad_norm": 0.7064666748046875, "learning_rate": 5.607451173492323e-07, "loss": 0.03930775448679924, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.526798248291016, "rewards/margins": 46.193267822265625, "rewards/rejected": 11.343050003051758, "step": 4958 }, { "epoch": 2.566770186335404, "grad_norm": 0.8099057674407959, "learning_rate": 5.594331406789117e-07, "loss": 0.09239552915096283, "rewards/accuracies": 0.9375, "rewards/chosen": 54.4113883972168, "rewards/margins": 43.7589111328125, "rewards/rejected": 10.639595031738281, "step": 4959 }, { "epoch": 2.567287784679089, "grad_norm": 0.5345761179924011, "learning_rate": 5.581226096671538e-07, "loss": 0.026978716254234314, "rewards/accuracies": 1.0, "rewards/chosen": 64.66925811767578, "rewards/margins": 49.634033203125, "rewards/rejected": 15.039169311523438, "step": 4960 }, { "epoch": 2.5678053830227743, "grad_norm": 1.4886605739593506, "learning_rate": 5.568135247406109e-07, "loss": 0.10898838937282562, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.654998779296875, "rewards/margins": 51.49163818359375, "rewards/rejected": 12.170585632324219, "step": 4961 }, { "epoch": 2.5683229813664594, "grad_norm": 1.7160367965698242, "learning_rate": 5.555058863254658e-07, "loss": 0.08017054200172424, "rewards/accuracies": 0.96875, "rewards/chosen": 52.71687698364258, "rewards/margins": 42.282745361328125, "rewards/rejected": 10.435690879821777, "step": 4962 }, { "epoch": 2.568840579710145, "grad_norm": 0.46533650159835815, "learning_rate": 5.541996948474299e-07, "loss": 0.046556148678064346, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.71907043457031, "rewards/margins": 47.135009765625, "rewards/rejected": 11.585224151611328, "step": 4963 }, { "epoch": 2.56935817805383, "grad_norm": 1.102099895477295, "learning_rate": 5.528949507317422e-07, "loss": 0.048762354999780655, "rewards/accuracies": 0.984375, "rewards/chosen": 58.50445556640625, "rewards/margins": 46.60113525390625, "rewards/rejected": 11.904376983642578, "step": 4964 }, { "epoch": 2.5698757763975157, "grad_norm": 1.314516305923462, "learning_rate": 5.515916544031718e-07, "loss": 0.11715738475322723, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.57996368408203, "rewards/margins": 41.46528625488281, "rewards/rejected": 13.105278015136719, "step": 4965 }, { "epoch": 2.570393374741201, "grad_norm": 1.4182716608047485, "learning_rate": 5.502898062860168e-07, "loss": 0.06569251418113708, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.23353576660156, "rewards/margins": 51.4083251953125, "rewards/rejected": 9.842695236206055, "step": 4966 }, { "epoch": 2.570910973084886, "grad_norm": 0.9807575941085815, "learning_rate": 5.489894068041029e-07, "loss": 0.08886861801147461, "rewards/accuracies": 0.9375, "rewards/chosen": 55.94744873046875, "rewards/margins": 44.862152099609375, "rewards/rejected": 11.098838806152344, "step": 4967 }, { "epoch": 2.571428571428571, "grad_norm": 0.7904672026634216, "learning_rate": 5.476904563807855e-07, "loss": 0.07522623240947723, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.26771545410156, "rewards/margins": 51.1370849609375, "rewards/rejected": 12.118419647216797, "step": 4968 }, { "epoch": 2.5719461697722568, "grad_norm": 0.6021580100059509, "learning_rate": 5.463929554389442e-07, "loss": 0.0469939261674881, "rewards/accuracies": 0.984375, "rewards/chosen": 53.27235412597656, "rewards/margins": 44.044281005859375, "rewards/rejected": 9.241634368896484, "step": 4969 }, { "epoch": 2.572463768115942, "grad_norm": 1.1354684829711914, "learning_rate": 5.45096904400993e-07, "loss": 0.07602964341640472, "rewards/accuracies": 0.953125, "rewards/chosen": 57.29350280761719, "rewards/margins": 45.90692138671875, "rewards/rejected": 11.389398574829102, "step": 4970 }, { "epoch": 2.5729813664596275, "grad_norm": 1.0139235258102417, "learning_rate": 5.438023036888707e-07, "loss": 0.10528945177793503, "rewards/accuracies": 0.9453125, "rewards/chosen": 67.21735382080078, "rewards/margins": 48.90220642089844, "rewards/rejected": 18.315441131591797, "step": 4971 }, { "epoch": 2.5734989648033126, "grad_norm": 1.636758804321289, "learning_rate": 5.425091537240413e-07, "loss": 0.10998211055994034, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.352752685546875, "rewards/margins": 44.618896484375, "rewards/rejected": 9.735809326171875, "step": 4972 }, { "epoch": 2.574016563146998, "grad_norm": 1.2943308353424072, "learning_rate": 5.412174549275001e-07, "loss": 0.12076395750045776, "rewards/accuracies": 0.953125, "rewards/chosen": 52.778480529785156, "rewards/margins": 40.56550598144531, "rewards/rejected": 12.219833374023438, "step": 4973 }, { "epoch": 2.5745341614906834, "grad_norm": 1.3823773860931396, "learning_rate": 5.399272077197698e-07, "loss": 0.05050387233495712, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.874122619628906, "rewards/margins": 48.975738525390625, "rewards/rejected": 11.913363456726074, "step": 4974 }, { "epoch": 2.5750517598343685, "grad_norm": 1.4228887557983398, "learning_rate": 5.38638412520901e-07, "loss": 0.07564230263233185, "rewards/accuracies": 0.953125, "rewards/chosen": 54.58174133300781, "rewards/margins": 44.777252197265625, "rewards/rejected": 9.814712524414062, "step": 4975 }, { "epoch": 2.575569358178054, "grad_norm": 0.7610902190208435, "learning_rate": 5.373510697504669e-07, "loss": 0.06666821241378784, "rewards/accuracies": 0.984375, "rewards/chosen": 54.85166931152344, "rewards/margins": 43.4595947265625, "rewards/rejected": 11.377218246459961, "step": 4976 }, { "epoch": 2.5760869565217392, "grad_norm": 1.1799161434173584, "learning_rate": 5.360651798275723e-07, "loss": 0.09005233645439148, "rewards/accuracies": 0.96875, "rewards/chosen": 47.44734191894531, "rewards/margins": 39.447967529296875, "rewards/rejected": 7.991889953613281, "step": 4977 }, { "epoch": 2.5766045548654244, "grad_norm": 0.7517237067222595, "learning_rate": 5.347807431708501e-07, "loss": 0.086453378200531, "rewards/accuracies": 0.953125, "rewards/chosen": 60.571311950683594, "rewards/margins": 47.824249267578125, "rewards/rejected": 12.750404357910156, "step": 4978 }, { "epoch": 2.5771221532091095, "grad_norm": 0.8016909956932068, "learning_rate": 5.334977601984559e-07, "loss": 0.08656664192676544, "rewards/accuracies": 0.953125, "rewards/chosen": 50.023380279541016, "rewards/margins": 42.74372863769531, "rewards/rejected": 7.279962539672852, "step": 4979 }, { "epoch": 2.577639751552795, "grad_norm": 0.6336395740509033, "learning_rate": 5.322162313280737e-07, "loss": 0.04814363270998001, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.60747146606445, "rewards/margins": 46.90509033203125, "rewards/rejected": 12.687541961669922, "step": 4980 }, { "epoch": 2.5781573498964803, "grad_norm": 1.7443020343780518, "learning_rate": 5.309361569769162e-07, "loss": 0.08876056969165802, "rewards/accuracies": 0.953125, "rewards/chosen": 58.09711837768555, "rewards/margins": 46.77674865722656, "rewards/rejected": 11.327423095703125, "step": 4981 }, { "epoch": 2.578674948240166, "grad_norm": 1.101197600364685, "learning_rate": 5.296575375617191e-07, "loss": 0.09244608879089355, "rewards/accuracies": 0.9375, "rewards/chosen": 55.540809631347656, "rewards/margins": 45.891876220703125, "rewards/rejected": 9.651050567626953, "step": 4982 }, { "epoch": 2.579192546583851, "grad_norm": 0.8464391231536865, "learning_rate": 5.283803734987475e-07, "loss": 0.04134385287761688, "rewards/accuracies": 0.9921875, "rewards/chosen": 54.138160705566406, "rewards/margins": 43.828033447265625, "rewards/rejected": 10.31320571899414, "step": 4983 }, { "epoch": 2.579710144927536, "grad_norm": 1.1798917055130005, "learning_rate": 5.271046652037903e-07, "loss": 0.1112283319234848, "rewards/accuracies": 0.9375, "rewards/chosen": 53.608734130859375, "rewards/margins": 41.9052734375, "rewards/rejected": 11.69295883178711, "step": 4984 }, { "epoch": 2.5802277432712213, "grad_norm": 0.6575095653533936, "learning_rate": 5.258304130921649e-07, "loss": 0.05517948791384697, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.596641540527344, "rewards/margins": 50.64787292480469, "rewards/rejected": 12.958610534667969, "step": 4985 }, { "epoch": 2.580745341614907, "grad_norm": 0.7739659547805786, "learning_rate": 5.245576175787115e-07, "loss": 0.06374193727970123, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.24510192871094, "rewards/margins": 40.87751770019531, "rewards/rejected": 11.35991096496582, "step": 4986 }, { "epoch": 2.581262939958592, "grad_norm": 0.9865186810493469, "learning_rate": 5.232862790777993e-07, "loss": 0.08925262093544006, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.141204833984375, "rewards/margins": 45.3099365234375, "rewards/rejected": 13.836151123046875, "step": 4987 }, { "epoch": 2.5817805383022776, "grad_norm": 0.8352081179618835, "learning_rate": 5.22016398003321e-07, "loss": 0.08366283774375916, "rewards/accuracies": 0.953125, "rewards/chosen": 57.24910354614258, "rewards/margins": 45.97023010253906, "rewards/rejected": 11.268821716308594, "step": 4988 }, { "epoch": 2.5822981366459627, "grad_norm": 0.9203696250915527, "learning_rate": 5.207479747686961e-07, "loss": 0.05443427711725235, "rewards/accuracies": 0.984375, "rewards/chosen": 52.669952392578125, "rewards/margins": 41.39747619628906, "rewards/rejected": 11.254322052001953, "step": 4989 }, { "epoch": 2.582815734989648, "grad_norm": 0.8138914108276367, "learning_rate": 5.194810097868685e-07, "loss": 0.07093654572963715, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.359283447265625, "rewards/margins": 44.14335632324219, "rewards/rejected": 12.214800834655762, "step": 4990 }, { "epoch": 2.5833333333333335, "grad_norm": 0.6844247579574585, "learning_rate": 5.182155034703074e-07, "loss": 0.07136645913124084, "rewards/accuracies": 0.96875, "rewards/chosen": 56.8017578125, "rewards/margins": 44.2135009765625, "rewards/rejected": 12.560554504394531, "step": 4991 }, { "epoch": 2.5838509316770186, "grad_norm": 0.902229905128479, "learning_rate": 5.169514562310085e-07, "loss": 0.0742850974202156, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.72465515136719, "rewards/margins": 40.697418212890625, "rewards/rejected": 12.02175521850586, "step": 4992 }, { "epoch": 2.5843685300207038, "grad_norm": 0.7361067533493042, "learning_rate": 5.156888684804906e-07, "loss": 0.05686764419078827, "rewards/accuracies": 0.96875, "rewards/chosen": 60.345027923583984, "rewards/margins": 46.660614013671875, "rewards/rejected": 13.701690673828125, "step": 4993 }, { "epoch": 2.5848861283643894, "grad_norm": 3.643256902694702, "learning_rate": 5.144277406297998e-07, "loss": 0.19330035150051117, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.2253532409668, "rewards/margins": 44.873470306396484, "rewards/rejected": 13.358333587646484, "step": 4994 }, { "epoch": 2.5854037267080745, "grad_norm": 1.0319607257843018, "learning_rate": 5.131680730895022e-07, "loss": 0.08482281863689423, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.17729187011719, "rewards/margins": 49.4388427734375, "rewards/rejected": 10.73590087890625, "step": 4995 }, { "epoch": 2.5859213250517596, "grad_norm": 1.0371240377426147, "learning_rate": 5.119098662696948e-07, "loss": 0.053623300045728683, "rewards/accuracies": 0.96875, "rewards/chosen": 62.75755310058594, "rewards/margins": 49.771484375, "rewards/rejected": 12.970146179199219, "step": 4996 }, { "epoch": 2.5864389233954452, "grad_norm": 0.5882858633995056, "learning_rate": 5.106531205799942e-07, "loss": 0.027825187891721725, "rewards/accuracies": 0.9921875, "rewards/chosen": 66.42753601074219, "rewards/margins": 52.109344482421875, "rewards/rejected": 14.32812213897705, "step": 4997 }, { "epoch": 2.5869565217391304, "grad_norm": 1.153668999671936, "learning_rate": 5.093978364295448e-07, "loss": 0.07814083248376846, "rewards/accuracies": 0.96875, "rewards/chosen": 57.549949645996094, "rewards/margins": 43.727294921875, "rewards/rejected": 13.817764282226562, "step": 4998 }, { "epoch": 2.587474120082816, "grad_norm": 0.8998855948448181, "learning_rate": 5.081440142270105e-07, "loss": 0.0705178752541542, "rewards/accuracies": 0.96875, "rewards/chosen": 58.81529235839844, "rewards/margins": 45.08758544921875, "rewards/rejected": 13.746261596679688, "step": 4999 }, { "epoch": 2.587991718426501, "grad_norm": 0.7631833553314209, "learning_rate": 5.068916543805846e-07, "loss": 0.0716121569275856, "rewards/accuracies": 0.96875, "rewards/chosen": 52.76913833618164, "rewards/margins": 42.477691650390625, "rewards/rejected": 10.30517578125, "step": 5000 }, { "epoch": 2.5885093167701863, "grad_norm": 0.758350670337677, "learning_rate": 5.056407572979821e-07, "loss": 0.05732829123735428, "rewards/accuracies": 0.96875, "rewards/chosen": 58.091392517089844, "rewards/margins": 44.339599609375, "rewards/rejected": 13.748165130615234, "step": 5001 }, { "epoch": 2.5890269151138714, "grad_norm": 0.758274495601654, "learning_rate": 5.043913233864395e-07, "loss": 0.07170213758945465, "rewards/accuracies": 0.96875, "rewards/chosen": 55.05878448486328, "rewards/margins": 42.3035888671875, "rewards/rejected": 12.748380661010742, "step": 5002 }, { "epoch": 2.589544513457557, "grad_norm": 0.6035734415054321, "learning_rate": 5.031433530527191e-07, "loss": 0.03523276001214981, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.32855224609375, "rewards/margins": 46.810455322265625, "rewards/rejected": 10.510313034057617, "step": 5003 }, { "epoch": 2.590062111801242, "grad_norm": 1.0074235200881958, "learning_rate": 5.018968467031094e-07, "loss": 0.09016510844230652, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.45331573486328, "rewards/margins": 43.288970947265625, "rewards/rejected": 13.171211242675781, "step": 5004 }, { "epoch": 2.5905797101449277, "grad_norm": 1.1672817468643188, "learning_rate": 5.006518047434172e-07, "loss": 0.14570046961307526, "rewards/accuracies": 0.9296875, "rewards/chosen": 51.34235382080078, "rewards/margins": 39.48136901855469, "rewards/rejected": 11.857364654541016, "step": 5005 }, { "epoch": 2.591097308488613, "grad_norm": 2.46893310546875, "learning_rate": 4.994082275789752e-07, "loss": 0.1544719636440277, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.880409240722656, "rewards/margins": 38.95257568359375, "rewards/rejected": 10.93380355834961, "step": 5006 }, { "epoch": 2.591614906832298, "grad_norm": 1.195367455482483, "learning_rate": 4.981661156146395e-07, "loss": 0.08916227519512177, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.962432861328125, "rewards/margins": 43.37358856201172, "rewards/rejected": 10.58756160736084, "step": 5007 }, { "epoch": 2.5921325051759836, "grad_norm": 1.305509090423584, "learning_rate": 4.969254692547875e-07, "loss": 0.1042918860912323, "rewards/accuracies": 0.953125, "rewards/chosen": 52.852813720703125, "rewards/margins": 40.66888427734375, "rewards/rejected": 12.183685302734375, "step": 5008 }, { "epoch": 2.5926501035196687, "grad_norm": 0.7880954742431641, "learning_rate": 4.956862889033221e-07, "loss": 0.09998923540115356, "rewards/accuracies": 0.953125, "rewards/chosen": 57.269927978515625, "rewards/margins": 43.67083740234375, "rewards/rejected": 13.58978271484375, "step": 5009 }, { "epoch": 2.593167701863354, "grad_norm": 2.279071807861328, "learning_rate": 4.944485749636663e-07, "loss": 0.13433657586574554, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.9144401550293, "rewards/margins": 43.99137878417969, "rewards/rejected": 12.935234069824219, "step": 5010 }, { "epoch": 2.5936853002070395, "grad_norm": 0.897278368473053, "learning_rate": 4.932123278387674e-07, "loss": 0.09500773996114731, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.38688659667969, "rewards/margins": 48.54124450683594, "rewards/rejected": 11.848194122314453, "step": 5011 }, { "epoch": 2.5942028985507246, "grad_norm": 1.1323188543319702, "learning_rate": 4.91977547931094e-07, "loss": 0.12502029538154602, "rewards/accuracies": 0.9296875, "rewards/chosen": 59.044189453125, "rewards/margins": 46.00526428222656, "rewards/rejected": 13.024883270263672, "step": 5012 }, { "epoch": 2.5947204968944098, "grad_norm": 0.9127316474914551, "learning_rate": 4.907442356426378e-07, "loss": 0.09277301281690598, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.814064025878906, "rewards/margins": 44.3037109375, "rewards/rejected": 13.508251190185547, "step": 5013 }, { "epoch": 2.5952380952380953, "grad_norm": 1.4384560585021973, "learning_rate": 4.895123913749128e-07, "loss": 0.06421499699354172, "rewards/accuracies": 0.96875, "rewards/chosen": 57.988670349121094, "rewards/margins": 46.330413818359375, "rewards/rejected": 11.663718223571777, "step": 5014 }, { "epoch": 2.5957556935817805, "grad_norm": 0.6829777956008911, "learning_rate": 4.882820155289547e-07, "loss": 0.046326830983161926, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.33441162109375, "rewards/margins": 45.45257568359375, "rewards/rejected": 13.884830474853516, "step": 5015 }, { "epoch": 2.596273291925466, "grad_norm": 0.9848557710647583, "learning_rate": 4.870531085053209e-07, "loss": 0.05437438189983368, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.97998046875, "rewards/margins": 47.848602294921875, "rewards/rejected": 14.123611450195312, "step": 5016 }, { "epoch": 2.596790890269151, "grad_norm": 0.38342612981796265, "learning_rate": 4.858256707040915e-07, "loss": 0.04023933410644531, "rewards/accuracies": 0.984375, "rewards/chosen": 58.98331832885742, "rewards/margins": 45.50750732421875, "rewards/rejected": 13.48895263671875, "step": 5017 }, { "epoch": 2.5973084886128364, "grad_norm": 4.004873752593994, "learning_rate": 4.845997025248672e-07, "loss": 0.16091109812259674, "rewards/accuracies": 0.9296875, "rewards/chosen": 50.88542175292969, "rewards/margins": 40.064605712890625, "rewards/rejected": 10.816984176635742, "step": 5018 }, { "epoch": 2.5978260869565215, "grad_norm": 0.8463537693023682, "learning_rate": 4.833752043667705e-07, "loss": 0.07419109344482422, "rewards/accuracies": 0.953125, "rewards/chosen": 51.34944152832031, "rewards/margins": 39.569244384765625, "rewards/rejected": 11.780948638916016, "step": 5019 }, { "epoch": 2.598343685300207, "grad_norm": 0.8362244963645935, "learning_rate": 4.821521766284476e-07, "loss": 0.06200858950614929, "rewards/accuracies": 0.96875, "rewards/chosen": 63.8868408203125, "rewards/margins": 48.3853759765625, "rewards/rejected": 15.513755798339844, "step": 5020 }, { "epoch": 2.5988612836438922, "grad_norm": 1.1502304077148438, "learning_rate": 4.809306197080604e-07, "loss": 0.10559907555580139, "rewards/accuracies": 0.9296875, "rewards/chosen": 59.85126495361328, "rewards/margins": 48.42540740966797, "rewards/rejected": 11.429649353027344, "step": 5021 }, { "epoch": 2.599378881987578, "grad_norm": 1.3093217611312866, "learning_rate": 4.797105340032987e-07, "loss": 0.14264793694019318, "rewards/accuracies": 0.921875, "rewards/chosen": 52.16741943359375, "rewards/margins": 40.19662094116211, "rewards/rejected": 11.974116325378418, "step": 5022 }, { "epoch": 2.599896480331263, "grad_norm": 1.1929209232330322, "learning_rate": 4.784919199113697e-07, "loss": 0.09269683808088303, "rewards/accuracies": 0.953125, "rewards/chosen": 60.60685729980469, "rewards/margins": 50.3519287109375, "rewards/rejected": 10.26028823852539, "step": 5023 }, { "epoch": 2.600414078674948, "grad_norm": 0.7314833402633667, "learning_rate": 4.772747778290016e-07, "loss": 0.05314985290169716, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.79460144042969, "rewards/margins": 44.814422607421875, "rewards/rejected": 11.966535568237305, "step": 5024 }, { "epoch": 2.6009316770186337, "grad_norm": 0.7181944847106934, "learning_rate": 4.760591081524418e-07, "loss": 0.08284012973308563, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.310325622558594, "rewards/margins": 41.457611083984375, "rewards/rejected": 11.860160827636719, "step": 5025 }, { "epoch": 2.601449275362319, "grad_norm": 3.767289161682129, "learning_rate": 4.7484491127746333e-07, "loss": 0.0771343857049942, "rewards/accuracies": 0.96875, "rewards/chosen": 53.178306579589844, "rewards/margins": 42.88262939453125, "rewards/rejected": 10.306468963623047, "step": 5026 }, { "epoch": 2.601966873706004, "grad_norm": 2.56207275390625, "learning_rate": 4.736321875993566e-07, "loss": 0.10052601248025894, "rewards/accuracies": 0.96875, "rewards/chosen": 56.327239990234375, "rewards/margins": 42.998687744140625, "rewards/rejected": 13.320846557617188, "step": 5027 }, { "epoch": 2.6024844720496896, "grad_norm": 0.9564517140388489, "learning_rate": 4.7242093751293017e-07, "loss": 0.08933853358030319, "rewards/accuracies": 0.953125, "rewards/chosen": 58.883155822753906, "rewards/margins": 47.40403747558594, "rewards/rejected": 11.469823837280273, "step": 5028 }, { "epoch": 2.6030020703933747, "grad_norm": 1.398929238319397, "learning_rate": 4.7121116141251645e-07, "loss": 0.11253108829259872, "rewards/accuracies": 0.9453125, "rewards/chosen": 45.73785400390625, "rewards/margins": 37.765777587890625, "rewards/rejected": 7.9677581787109375, "step": 5029 }, { "epoch": 2.60351966873706, "grad_norm": 0.7476698160171509, "learning_rate": 4.700028596919681e-07, "loss": 0.05322171747684479, "rewards/accuracies": 0.9765625, "rewards/chosen": 65.75625610351562, "rewards/margins": 49.81280517578125, "rewards/rejected": 15.939056396484375, "step": 5030 }, { "epoch": 2.6040372670807455, "grad_norm": 0.9641266465187073, "learning_rate": 4.687960327446545e-07, "loss": 0.06077008694410324, "rewards/accuracies": 0.9921875, "rewards/chosen": 60.59083938598633, "rewards/margins": 47.0455322265625, "rewards/rejected": 13.536617279052734, "step": 5031 }, { "epoch": 2.6045548654244306, "grad_norm": 1.0869742631912231, "learning_rate": 4.6759068096346784e-07, "loss": 0.10030755400657654, "rewards/accuracies": 0.921875, "rewards/chosen": 58.68495178222656, "rewards/margins": 47.364227294921875, "rewards/rejected": 11.321083068847656, "step": 5032 }, { "epoch": 2.605072463768116, "grad_norm": 0.6527396440505981, "learning_rate": 4.6638680474081765e-07, "loss": 0.05374728888273239, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.3592414855957, "rewards/margins": 47.998260498046875, "rewards/rejected": 11.356895446777344, "step": 5033 }, { "epoch": 2.6055900621118013, "grad_norm": 1.5039540529251099, "learning_rate": 4.651844044686377e-07, "loss": 0.05803299322724342, "rewards/accuracies": 0.953125, "rewards/chosen": 57.14059066772461, "rewards/margins": 44.925697326660156, "rewards/rejected": 12.221393585205078, "step": 5034 }, { "epoch": 2.6061076604554865, "grad_norm": 0.564873456954956, "learning_rate": 4.639834805383753e-07, "loss": 0.06993907690048218, "rewards/accuracies": 0.953125, "rewards/chosen": 59.2203483581543, "rewards/margins": 48.5145263671875, "rewards/rejected": 10.691688537597656, "step": 5035 }, { "epoch": 2.6066252587991716, "grad_norm": 0.9089991450309753, "learning_rate": 4.627840333410005e-07, "loss": 0.0708000659942627, "rewards/accuracies": 0.96875, "rewards/chosen": 56.964576721191406, "rewards/margins": 46.670623779296875, "rewards/rejected": 10.295658111572266, "step": 5036 }, { "epoch": 2.607142857142857, "grad_norm": 0.6391699910163879, "learning_rate": 4.6158606326700215e-07, "loss": 0.05675838887691498, "rewards/accuracies": 0.96875, "rewards/chosen": 54.6231689453125, "rewards/margins": 42.897918701171875, "rewards/rejected": 11.724006652832031, "step": 5037 }, { "epoch": 2.6076604554865424, "grad_norm": 0.7851248979568481, "learning_rate": 4.6038957070638825e-07, "loss": 0.05828263238072395, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.89762878417969, "rewards/margins": 48.79646301269531, "rewards/rejected": 13.095771789550781, "step": 5038 }, { "epoch": 2.608178053830228, "grad_norm": 1.4635027647018433, "learning_rate": 4.591945560486855e-07, "loss": 0.17783893644809723, "rewards/accuracies": 0.8984375, "rewards/chosen": 55.97203063964844, "rewards/margins": 44.54638671875, "rewards/rejected": 11.409235954284668, "step": 5039 }, { "epoch": 2.608695652173913, "grad_norm": 2.0215904712677, "learning_rate": 4.580010196829393e-07, "loss": 0.12650816142559052, "rewards/accuracies": 0.953125, "rewards/chosen": 56.277008056640625, "rewards/margins": 42.82942199707031, "rewards/rejected": 13.44635009765625, "step": 5040 }, { "epoch": 2.6092132505175982, "grad_norm": 1.145330548286438, "learning_rate": 4.568089619977145e-07, "loss": 0.08065003156661987, "rewards/accuracies": 0.96875, "rewards/chosen": 58.353965759277344, "rewards/margins": 46.745849609375, "rewards/rejected": 11.604361534118652, "step": 5041 }, { "epoch": 2.6097308488612834, "grad_norm": 0.6081127524375916, "learning_rate": 4.556183833810934e-07, "loss": 0.05485457926988602, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.204124450683594, "rewards/margins": 45.768890380859375, "rewards/rejected": 9.423895835876465, "step": 5042 }, { "epoch": 2.610248447204969, "grad_norm": 0.6251550912857056, "learning_rate": 4.544292842206782e-07, "loss": 0.05703146755695343, "rewards/accuracies": 0.984375, "rewards/chosen": 56.42176055908203, "rewards/margins": 46.11859130859375, "rewards/rejected": 10.304960250854492, "step": 5043 }, { "epoch": 2.610766045548654, "grad_norm": 0.6076303124427795, "learning_rate": 4.532416649035881e-07, "loss": 0.07248060405254364, "rewards/accuracies": 0.96875, "rewards/chosen": 57.55718231201172, "rewards/margins": 47.52227783203125, "rewards/rejected": 10.02652359008789, "step": 5044 }, { "epoch": 2.6112836438923397, "grad_norm": 0.7368720769882202, "learning_rate": 4.5205552581646105e-07, "loss": 0.05585554987192154, "rewards/accuracies": 0.96875, "rewards/chosen": 58.34768295288086, "rewards/margins": 47.511566162109375, "rewards/rejected": 10.83140754699707, "step": 5045 }, { "epoch": 2.611801242236025, "grad_norm": 0.8468522429466248, "learning_rate": 4.508708673454537e-07, "loss": 0.08234960585832596, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.377376556396484, "rewards/margins": 46.5150146484375, "rewards/rejected": 11.862386703491211, "step": 5046 }, { "epoch": 2.61231884057971, "grad_norm": 0.6395817399024963, "learning_rate": 4.496876898762403e-07, "loss": 0.04158324748277664, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.803401947021484, "rewards/margins": 47.72265625, "rewards/rejected": 12.075157165527344, "step": 5047 }, { "epoch": 2.6128364389233956, "grad_norm": 1.7677335739135742, "learning_rate": 4.485059937940123e-07, "loss": 0.07920104265213013, "rewards/accuracies": 0.953125, "rewards/chosen": 56.202945709228516, "rewards/margins": 44.58929443359375, "rewards/rejected": 11.60305404663086, "step": 5048 }, { "epoch": 2.6133540372670807, "grad_norm": 1.1745001077651978, "learning_rate": 4.47325779483479e-07, "loss": 0.11317838728427887, "rewards/accuracies": 0.9375, "rewards/chosen": 51.493751525878906, "rewards/margins": 41.277740478515625, "rewards/rejected": 10.218345642089844, "step": 5049 }, { "epoch": 2.6138716356107663, "grad_norm": 0.7574639320373535, "learning_rate": 4.461470473288698e-07, "loss": 0.07697762548923492, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.817752838134766, "rewards/margins": 46.300899505615234, "rewards/rejected": 13.508164405822754, "step": 5050 }, { "epoch": 2.6143892339544514, "grad_norm": 4.234658241271973, "learning_rate": 4.449697977139256e-07, "loss": 0.14408206939697266, "rewards/accuracies": 0.953125, "rewards/chosen": 57.80823516845703, "rewards/margins": 46.79132080078125, "rewards/rejected": 11.02985954284668, "step": 5051 }, { "epoch": 2.6149068322981366, "grad_norm": 1.7206813097000122, "learning_rate": 4.437940310219119e-07, "loss": 0.09153957664966583, "rewards/accuracies": 0.953125, "rewards/chosen": 58.57499694824219, "rewards/margins": 44.61834716796875, "rewards/rejected": 13.945327758789062, "step": 5052 }, { "epoch": 2.6154244306418217, "grad_norm": 2.4264724254608154, "learning_rate": 4.4261974763560755e-07, "loss": 0.054437845945358276, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.497291564941406, "rewards/margins": 48.114166259765625, "rewards/rejected": 11.380695343017578, "step": 5053 }, { "epoch": 2.6159420289855073, "grad_norm": 0.8135291934013367, "learning_rate": 4.4144694793730793e-07, "loss": 0.05324612185359001, "rewards/accuracies": 0.96875, "rewards/chosen": 59.3216552734375, "rewards/margins": 46.953857421875, "rewards/rejected": 12.367133140563965, "step": 5054 }, { "epoch": 2.6164596273291925, "grad_norm": 0.559275209903717, "learning_rate": 4.402756323088253e-07, "loss": 0.06062756106257439, "rewards/accuracies": 0.96875, "rewards/chosen": 51.02460479736328, "rewards/margins": 41.815399169921875, "rewards/rejected": 9.201922416687012, "step": 5055 }, { "epoch": 2.616977225672878, "grad_norm": 1.040551781654358, "learning_rate": 4.3910580113149306e-07, "loss": 0.12470031529664993, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.57306671142578, "rewards/margins": 40.31158447265625, "rewards/rejected": 14.254404067993164, "step": 5056 }, { "epoch": 2.617494824016563, "grad_norm": 1.0886589288711548, "learning_rate": 4.37937454786157e-07, "loss": 0.07554232329130173, "rewards/accuracies": 0.96875, "rewards/chosen": 59.35809326171875, "rewards/margins": 46.861053466796875, "rewards/rejected": 12.497390747070312, "step": 5057 }, { "epoch": 2.6180124223602483, "grad_norm": 1.4343643188476562, "learning_rate": 4.367705936531791e-07, "loss": 0.0897083729505539, "rewards/accuracies": 0.953125, "rewards/chosen": 58.743568420410156, "rewards/margins": 45.33685302734375, "rewards/rejected": 13.417045593261719, "step": 5058 }, { "epoch": 2.6185300207039335, "grad_norm": 1.7665436267852783, "learning_rate": 4.356052181124404e-07, "loss": 0.09794402122497559, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.30052185058594, "rewards/margins": 46.694427490234375, "rewards/rejected": 12.612777709960938, "step": 5059 }, { "epoch": 2.619047619047619, "grad_norm": 0.677952229976654, "learning_rate": 4.344413285433385e-07, "loss": 0.050426021218299866, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.04821014404297, "rewards/margins": 49.010162353515625, "rewards/rejected": 13.030120849609375, "step": 5060 }, { "epoch": 2.619565217391304, "grad_norm": 0.8839334845542908, "learning_rate": 4.332789253247849e-07, "loss": 0.06601111590862274, "rewards/accuracies": 0.96875, "rewards/chosen": 54.58403778076172, "rewards/margins": 44.145904541015625, "rewards/rejected": 10.447578430175781, "step": 5061 }, { "epoch": 2.62008281573499, "grad_norm": 0.5945374965667725, "learning_rate": 4.3211800883520785e-07, "loss": 0.03334621340036392, "rewards/accuracies": 1.0, "rewards/chosen": 57.22467041015625, "rewards/margins": 45.01094055175781, "rewards/rejected": 12.210350036621094, "step": 5062 }, { "epoch": 2.620600414078675, "grad_norm": 1.292335867881775, "learning_rate": 4.3095857945255284e-07, "loss": 0.0870290994644165, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.5066032409668, "rewards/margins": 46.94248962402344, "rewards/rejected": 11.581883430480957, "step": 5063 }, { "epoch": 2.62111801242236, "grad_norm": 1.9898532629013062, "learning_rate": 4.298006375542807e-07, "loss": 0.13162997364997864, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.858642578125, "rewards/margins": 42.02351379394531, "rewards/rejected": 9.829687118530273, "step": 5064 }, { "epoch": 2.6216356107660457, "grad_norm": 0.7652690410614014, "learning_rate": 4.2864418351736713e-07, "loss": 0.052850764244794846, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.044708251953125, "rewards/margins": 45.619903564453125, "rewards/rejected": 12.445690155029297, "step": 5065 }, { "epoch": 2.622153209109731, "grad_norm": 1.313266396522522, "learning_rate": 4.274892177183043e-07, "loss": 0.054088301956653595, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.6676025390625, "rewards/margins": 45.32237243652344, "rewards/rejected": 11.339385986328125, "step": 5066 }, { "epoch": 2.6226708074534164, "grad_norm": 1.1326948404312134, "learning_rate": 4.2633574053309977e-07, "loss": 0.08523936569690704, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.010581970214844, "rewards/margins": 46.957183837890625, "rewards/rejected": 12.064422607421875, "step": 5067 }, { "epoch": 2.6231884057971016, "grad_norm": 1.0228952169418335, "learning_rate": 4.251837523372759e-07, "loss": 0.0996396392583847, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.019065856933594, "rewards/margins": 44.725341796875, "rewards/rejected": 10.292896270751953, "step": 5068 }, { "epoch": 2.6237060041407867, "grad_norm": 0.9191361665725708, "learning_rate": 4.2403325350587165e-07, "loss": 0.09921293705701828, "rewards/accuracies": 0.953125, "rewards/chosen": 54.32468795776367, "rewards/margins": 43.3768310546875, "rewards/rejected": 10.948379516601562, "step": 5069 }, { "epoch": 2.624223602484472, "grad_norm": 0.8295985460281372, "learning_rate": 4.2288424441343956e-07, "loss": 0.06176462396979332, "rewards/accuracies": 0.96875, "rewards/chosen": 54.50094985961914, "rewards/margins": 43.4927978515625, "rewards/rejected": 11.015840530395508, "step": 5070 }, { "epoch": 2.6247412008281574, "grad_norm": 0.6035894751548767, "learning_rate": 4.217367254340482e-07, "loss": 0.04639401286840439, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.2076530456543, "rewards/margins": 45.96540832519531, "rewards/rejected": 11.244338989257812, "step": 5071 }, { "epoch": 2.6252587991718426, "grad_norm": 0.8464725017547607, "learning_rate": 4.205906969412804e-07, "loss": 0.06197489798069, "rewards/accuracies": 0.96875, "rewards/chosen": 52.71117401123047, "rewards/margins": 38.973236083984375, "rewards/rejected": 13.744285583496094, "step": 5072 }, { "epoch": 2.625776397515528, "grad_norm": 0.8507160544395447, "learning_rate": 4.1944615930823386e-07, "loss": 0.056183431297540665, "rewards/accuracies": 0.984375, "rewards/chosen": 62.07080078125, "rewards/margins": 48.474334716796875, "rewards/rejected": 13.597076416015625, "step": 5073 }, { "epoch": 2.6262939958592133, "grad_norm": 1.0154221057891846, "learning_rate": 4.18303112907521e-07, "loss": 0.06174187362194061, "rewards/accuracies": 0.96875, "rewards/chosen": 58.61149597167969, "rewards/margins": 45.07611083984375, "rewards/rejected": 13.53424072265625, "step": 5074 }, { "epoch": 2.6268115942028984, "grad_norm": 2.4519786834716797, "learning_rate": 4.1716155811126926e-07, "loss": 0.0869993269443512, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.47234344482422, "rewards/margins": 41.70262145996094, "rewards/rejected": 7.767665863037109, "step": 5075 }, { "epoch": 2.6273291925465836, "grad_norm": 1.679054856300354, "learning_rate": 4.160214952911207e-07, "loss": 0.08377696573734283, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.10948944091797, "rewards/margins": 44.80450439453125, "rewards/rejected": 13.295669555664062, "step": 5076 }, { "epoch": 2.627846790890269, "grad_norm": 1.1073508262634277, "learning_rate": 4.1488292481822847e-07, "loss": 0.11706819385290146, "rewards/accuracies": 0.9375, "rewards/chosen": 53.0172119140625, "rewards/margins": 42.334747314453125, "rewards/rejected": 10.68626594543457, "step": 5077 }, { "epoch": 2.6283643892339543, "grad_norm": 0.7850766777992249, "learning_rate": 4.137458470632649e-07, "loss": 0.05238581448793411, "rewards/accuracies": 0.984375, "rewards/chosen": 54.231529235839844, "rewards/margins": 43.3265380859375, "rewards/rejected": 10.895984649658203, "step": 5078 }, { "epoch": 2.62888198757764, "grad_norm": 0.8789684176445007, "learning_rate": 4.126102623964129e-07, "loss": 0.06564938277006149, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.11625671386719, "rewards/margins": 44.5382080078125, "rewards/rejected": 14.591194152832031, "step": 5079 }, { "epoch": 2.629399585921325, "grad_norm": 1.217604637145996, "learning_rate": 4.1147617118737106e-07, "loss": 0.11225616931915283, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.959999084472656, "rewards/margins": 41.89299011230469, "rewards/rejected": 13.062934875488281, "step": 5080 }, { "epoch": 2.62991718426501, "grad_norm": 0.5919108986854553, "learning_rate": 4.103435738053491e-07, "loss": 0.05214743688702583, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.65931701660156, "rewards/margins": 47.305938720703125, "rewards/rejected": 14.355491638183594, "step": 5081 }, { "epoch": 2.630434782608696, "grad_norm": 0.9501708149909973, "learning_rate": 4.092124706190737e-07, "loss": 0.08788340538740158, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.51576232910156, "rewards/margins": 42.31959533691406, "rewards/rejected": 9.197166442871094, "step": 5082 }, { "epoch": 2.630952380952381, "grad_norm": 1.0248911380767822, "learning_rate": 4.080828619967842e-07, "loss": 0.09229019284248352, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.684906005859375, "rewards/margins": 47.8114013671875, "rewards/rejected": 11.8807373046875, "step": 5083 }, { "epoch": 2.6314699792960665, "grad_norm": 0.7017520666122437, "learning_rate": 4.0695474830623136e-07, "loss": 0.06509305536746979, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.83355712890625, "rewards/margins": 46.48895263671875, "rewards/rejected": 12.335201263427734, "step": 5084 }, { "epoch": 2.6319875776397517, "grad_norm": 0.447802871465683, "learning_rate": 4.0582812991468046e-07, "loss": 0.03385935723781586, "rewards/accuracies": 0.9921875, "rewards/chosen": 61.211181640625, "rewards/margins": 47.6339111328125, "rewards/rejected": 13.583946228027344, "step": 5085 }, { "epoch": 2.632505175983437, "grad_norm": 1.3048690557479858, "learning_rate": 4.047030071889124e-07, "loss": 0.08441475033760071, "rewards/accuracies": 0.953125, "rewards/chosen": 55.61351013183594, "rewards/margins": 44.79510498046875, "rewards/rejected": 10.820335388183594, "step": 5086 }, { "epoch": 2.633022774327122, "grad_norm": 1.556250810623169, "learning_rate": 4.035793804952176e-07, "loss": 0.06323839724063873, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.66676330566406, "rewards/margins": 42.33343505859375, "rewards/rejected": 10.330955505371094, "step": 5087 }, { "epoch": 2.6335403726708075, "grad_norm": 1.5893709659576416, "learning_rate": 4.024572501994006e-07, "loss": 0.11054524034261703, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.70545196533203, "rewards/margins": 38.98687744140625, "rewards/rejected": 10.725334167480469, "step": 5088 }, { "epoch": 2.6340579710144927, "grad_norm": 0.6476696133613586, "learning_rate": 4.013366166667793e-07, "loss": 0.051553450524806976, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.63594055175781, "rewards/margins": 43.7340087890625, "rewards/rejected": 12.891677856445312, "step": 5089 }, { "epoch": 2.6345755693581783, "grad_norm": 3.7762153148651123, "learning_rate": 4.002174802621844e-07, "loss": 0.09884199500083923, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.890411376953125, "rewards/margins": 44.96826171875, "rewards/rejected": 9.922271728515625, "step": 5090 }, { "epoch": 2.6350931677018634, "grad_norm": 2.0912303924560547, "learning_rate": 3.9909984134995804e-07, "loss": 0.10304845869541168, "rewards/accuracies": 0.953125, "rewards/chosen": 55.306941986083984, "rewards/margins": 41.629547119140625, "rewards/rejected": 13.669998168945312, "step": 5091 }, { "epoch": 2.6356107660455486, "grad_norm": 3.3304879665374756, "learning_rate": 3.9798370029395685e-07, "loss": 0.1079312413930893, "rewards/accuracies": 0.9140625, "rewards/chosen": 54.54835510253906, "rewards/margins": 41.32035827636719, "rewards/rejected": 13.245880126953125, "step": 5092 }, { "epoch": 2.6361283643892337, "grad_norm": 1.4257361888885498, "learning_rate": 3.9686905745754735e-07, "loss": 0.0835171490907669, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.347991943359375, "rewards/margins": 42.6314697265625, "rewards/rejected": 13.712242126464844, "step": 5093 }, { "epoch": 2.6366459627329193, "grad_norm": 1.579357385635376, "learning_rate": 3.957559132036098e-07, "loss": 0.1041441410779953, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.675018310546875, "rewards/margins": 41.1114501953125, "rewards/rejected": 11.553239822387695, "step": 5094 }, { "epoch": 2.6371635610766044, "grad_norm": 0.867940366268158, "learning_rate": 3.9464426789453727e-07, "loss": 0.05771039426326752, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.846641540527344, "rewards/margins": 43.17634582519531, "rewards/rejected": 13.686912536621094, "step": 5095 }, { "epoch": 2.63768115942029, "grad_norm": 1.14199960231781, "learning_rate": 3.935341218922328e-07, "loss": 0.08448251336812973, "rewards/accuracies": 0.96875, "rewards/chosen": 49.49105453491211, "rewards/margins": 39.06475830078125, "rewards/rejected": 10.428028106689453, "step": 5096 }, { "epoch": 2.638198757763975, "grad_norm": 2.044372797012329, "learning_rate": 3.9242547555811305e-07, "loss": 0.09545061737298965, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.21855163574219, "rewards/margins": 45.54840087890625, "rewards/rejected": 14.673904418945312, "step": 5097 }, { "epoch": 2.6387163561076603, "grad_norm": 1.1842621564865112, "learning_rate": 3.913183292531053e-07, "loss": 0.058728478848934174, "rewards/accuracies": 0.96875, "rewards/chosen": 55.82497024536133, "rewards/margins": 43.88447570800781, "rewards/rejected": 11.954933166503906, "step": 5098 }, { "epoch": 2.639233954451346, "grad_norm": 1.1383631229400635, "learning_rate": 3.902126833376507e-07, "loss": 0.1088590994477272, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.63985061645508, "rewards/margins": 38.733123779296875, "rewards/rejected": 10.909599304199219, "step": 5099 }, { "epoch": 2.639751552795031, "grad_norm": 1.300886631011963, "learning_rate": 3.891085381716969e-07, "loss": 0.13143467903137207, "rewards/accuracies": 0.9375, "rewards/chosen": 53.47992706298828, "rewards/margins": 39.20954132080078, "rewards/rejected": 14.271415710449219, "step": 5100 }, { "epoch": 2.6402691511387166, "grad_norm": 1.6534639596939087, "learning_rate": 3.880058941147097e-07, "loss": 0.1543736755847931, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.70249938964844, "rewards/margins": 41.21173095703125, "rewards/rejected": 8.477887153625488, "step": 5101 }, { "epoch": 2.6407867494824018, "grad_norm": 0.9848043918609619, "learning_rate": 3.8690475152566074e-07, "loss": 0.049756307154893875, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.180747985839844, "rewards/margins": 42.918701171875, "rewards/rejected": 11.262872695922852, "step": 5102 }, { "epoch": 2.641304347826087, "grad_norm": 0.8642789125442505, "learning_rate": 3.8580511076303617e-07, "loss": 0.06338950991630554, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.27033233642578, "rewards/margins": 44.41937255859375, "rewards/rejected": 9.840167999267578, "step": 5103 }, { "epoch": 2.641821946169772, "grad_norm": 0.8280728459358215, "learning_rate": 3.847069721848312e-07, "loss": 0.07795310765504837, "rewards/accuracies": 0.96875, "rewards/chosen": 61.16271209716797, "rewards/margins": 47.15727233886719, "rewards/rejected": 13.999992370605469, "step": 5104 }, { "epoch": 2.6423395445134576, "grad_norm": 5.5141825675964355, "learning_rate": 3.8361033614855325e-07, "loss": 0.19496703147888184, "rewards/accuracies": 0.953125, "rewards/chosen": 48.721221923828125, "rewards/margins": 38.76332092285156, "rewards/rejected": 9.949934959411621, "step": 5105 }, { "epoch": 2.642857142857143, "grad_norm": 0.7243459820747375, "learning_rate": 3.8251520301122057e-07, "loss": 0.051118411123752594, "rewards/accuracies": 0.96875, "rewards/chosen": 51.71966552734375, "rewards/margins": 41.343505859375, "rewards/rejected": 10.376714706420898, "step": 5106 }, { "epoch": 2.6433747412008284, "grad_norm": 2.405285596847534, "learning_rate": 3.814215731293591e-07, "loss": 0.10523413866758347, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.681278228759766, "rewards/margins": 43.64337158203125, "rewards/rejected": 17.038236618041992, "step": 5107 }, { "epoch": 2.6438923395445135, "grad_norm": 1.0497353076934814, "learning_rate": 3.8032944685901075e-07, "loss": 0.09983378648757935, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.18110656738281, "rewards/margins": 43.27685546875, "rewards/rejected": 11.894769668579102, "step": 5108 }, { "epoch": 2.6444099378881987, "grad_norm": 1.3705451488494873, "learning_rate": 3.792388245557249e-07, "loss": 0.10366317629814148, "rewards/accuracies": 0.9453125, "rewards/chosen": 50.9576416015625, "rewards/margins": 39.85406494140625, "rewards/rejected": 11.100456237792969, "step": 5109 }, { "epoch": 2.644927536231884, "grad_norm": 1.312861442565918, "learning_rate": 3.7814970657455984e-07, "loss": 0.08313632011413574, "rewards/accuracies": 0.953125, "rewards/chosen": 58.79286575317383, "rewards/margins": 46.016693115234375, "rewards/rejected": 12.784746170043945, "step": 5110 }, { "epoch": 2.6454451345755694, "grad_norm": 0.8064090013504028, "learning_rate": 3.770620932700858e-07, "loss": 0.05655479058623314, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.10247802734375, "rewards/margins": 46.51025390625, "rewards/rejected": 11.59156608581543, "step": 5111 }, { "epoch": 2.6459627329192545, "grad_norm": 0.6776736378669739, "learning_rate": 3.7597598499638566e-07, "loss": 0.057265929877758026, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.70572280883789, "rewards/margins": 48.57786560058594, "rewards/rejected": 12.138740539550781, "step": 5112 }, { "epoch": 2.64648033126294, "grad_norm": 0.8378955125808716, "learning_rate": 3.748913821070477e-07, "loss": 0.10465344786643982, "rewards/accuracies": 0.953125, "rewards/chosen": 60.43124771118164, "rewards/margins": 46.24237060546875, "rewards/rejected": 14.204132080078125, "step": 5113 }, { "epoch": 2.6469979296066253, "grad_norm": 0.7238861918449402, "learning_rate": 3.7380828495517217e-07, "loss": 0.0426381379365921, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.93267822265625, "rewards/margins": 46.1171875, "rewards/rejected": 15.810585975646973, "step": 5114 }, { "epoch": 2.6475155279503104, "grad_norm": 0.840170681476593, "learning_rate": 3.7272669389336935e-07, "loss": 0.07541026175022125, "rewards/accuracies": 0.953125, "rewards/chosen": 61.472190856933594, "rewards/margins": 48.283355712890625, "rewards/rejected": 13.18569564819336, "step": 5115 }, { "epoch": 2.648033126293996, "grad_norm": 0.9476635456085205, "learning_rate": 3.7164660927376026e-07, "loss": 0.08261176943778992, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.31988525390625, "rewards/margins": 45.17779541015625, "rewards/rejected": 14.129569053649902, "step": 5116 }, { "epoch": 2.648550724637681, "grad_norm": 1.3746726512908936, "learning_rate": 3.7056803144797314e-07, "loss": 0.06589915603399277, "rewards/accuracies": 0.96875, "rewards/chosen": 56.478519439697266, "rewards/margins": 44.700340270996094, "rewards/rejected": 11.794858932495117, "step": 5117 }, { "epoch": 2.6490683229813663, "grad_norm": 4.911869525909424, "learning_rate": 3.694909607671465e-07, "loss": 0.12740109860897064, "rewards/accuracies": 0.9453125, "rewards/chosen": 62.77540588378906, "rewards/margins": 48.68310546875, "rewards/rejected": 14.095893859863281, "step": 5118 }, { "epoch": 2.649585921325052, "grad_norm": 1.063522458076477, "learning_rate": 3.684153975819288e-07, "loss": 0.1138121709227562, "rewards/accuracies": 0.953125, "rewards/chosen": 54.04329299926758, "rewards/margins": 43.7845458984375, "rewards/rejected": 10.257226943969727, "step": 5119 }, { "epoch": 2.650103519668737, "grad_norm": 1.833825945854187, "learning_rate": 3.6734134224247764e-07, "loss": 0.10338285565376282, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.19103240966797, "rewards/margins": 46.32447814941406, "rewards/rejected": 12.860740661621094, "step": 5120 }, { "epoch": 2.650621118012422, "grad_norm": 2.0542891025543213, "learning_rate": 3.6626879509845946e-07, "loss": 0.07776367664337158, "rewards/accuracies": 0.96875, "rewards/chosen": 63.046417236328125, "rewards/margins": 50.44610595703125, "rewards/rejected": 12.600630760192871, "step": 5121 }, { "epoch": 2.6511387163561078, "grad_norm": 0.9028999209403992, "learning_rate": 3.6519775649904945e-07, "loss": 0.07551752030849457, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.409210205078125, "rewards/margins": 45.591217041015625, "rewards/rejected": 11.815032958984375, "step": 5122 }, { "epoch": 2.651656314699793, "grad_norm": 1.041637659072876, "learning_rate": 3.6412822679293194e-07, "loss": 0.08508151024580002, "rewards/accuracies": 0.96875, "rewards/chosen": 60.46377944946289, "rewards/margins": 47.139923095703125, "rewards/rejected": 13.32470703125, "step": 5123 }, { "epoch": 2.6521739130434785, "grad_norm": 1.6246417760849, "learning_rate": 3.6306020632830063e-07, "loss": 0.06190915405750275, "rewards/accuracies": 0.96875, "rewards/chosen": 60.42061233520508, "rewards/margins": 49.1875, "rewards/rejected": 11.232540130615234, "step": 5124 }, { "epoch": 2.6526915113871636, "grad_norm": 0.9446544051170349, "learning_rate": 3.619936954528569e-07, "loss": 0.07939625531435013, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.052337646484375, "rewards/margins": 45.707244873046875, "rewards/rejected": 14.342391967773438, "step": 5125 }, { "epoch": 2.653209109730849, "grad_norm": 1.1465095281600952, "learning_rate": 3.609286945138102e-07, "loss": 0.10302545130252838, "rewards/accuracies": 0.9453125, "rewards/chosen": 62.039207458496094, "rewards/margins": 45.505523681640625, "rewards/rejected": 16.53137969970703, "step": 5126 }, { "epoch": 2.653726708074534, "grad_norm": 0.720576822757721, "learning_rate": 3.598652038578804e-07, "loss": 0.07220492511987686, "rewards/accuracies": 0.953125, "rewards/chosen": 60.32750701904297, "rewards/margins": 45.91284942626953, "rewards/rejected": 14.409950256347656, "step": 5127 }, { "epoch": 2.6542443064182195, "grad_norm": 1.7153791189193726, "learning_rate": 3.5880322383129386e-07, "loss": 0.08572158217430115, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.436859130859375, "rewards/margins": 45.912384033203125, "rewards/rejected": 13.524295806884766, "step": 5128 }, { "epoch": 2.6547619047619047, "grad_norm": 0.831736147403717, "learning_rate": 3.577427547797874e-07, "loss": 0.08017068356275558, "rewards/accuracies": 0.953125, "rewards/chosen": 55.90528869628906, "rewards/margins": 45.36163330078125, "rewards/rejected": 10.53775405883789, "step": 5129 }, { "epoch": 2.6552795031055902, "grad_norm": 1.2054373025894165, "learning_rate": 3.566837970486015e-07, "loss": 0.06986024230718613, "rewards/accuracies": 0.984375, "rewards/chosen": 65.33500671386719, "rewards/margins": 49.6673583984375, "rewards/rejected": 15.659797668457031, "step": 5130 }, { "epoch": 2.6557971014492754, "grad_norm": 1.462032675743103, "learning_rate": 3.556263509824898e-07, "loss": 0.08733373135328293, "rewards/accuracies": 0.96875, "rewards/chosen": 53.36151123046875, "rewards/margins": 41.188316345214844, "rewards/rejected": 12.177249908447266, "step": 5131 }, { "epoch": 2.6563146997929605, "grad_norm": 0.736940324306488, "learning_rate": 3.5457041692571194e-07, "loss": 0.06046467646956444, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.11492156982422, "rewards/margins": 46.603515625, "rewards/rejected": 8.494952201843262, "step": 5132 }, { "epoch": 2.656832298136646, "grad_norm": 0.9237480163574219, "learning_rate": 3.5351599522203183e-07, "loss": 0.06696085631847382, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.88251495361328, "rewards/margins": 47.980010986328125, "rewards/rejected": 9.905441284179688, "step": 5133 }, { "epoch": 2.6573498964803313, "grad_norm": 0.5375964045524597, "learning_rate": 3.5246308621472713e-07, "loss": 0.028238719329237938, "rewards/accuracies": 1.0, "rewards/chosen": 58.31483459472656, "rewards/margins": 45.7652587890625, "rewards/rejected": 12.54864501953125, "step": 5134 }, { "epoch": 2.6578674948240164, "grad_norm": 1.5361018180847168, "learning_rate": 3.5141169024657916e-07, "loss": 0.04064064100384712, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.73994445800781, "rewards/margins": 45.0684814453125, "rewards/rejected": 11.665985107421875, "step": 5135 }, { "epoch": 2.658385093167702, "grad_norm": 2.4373373985290527, "learning_rate": 3.5036180765987637e-07, "loss": 0.0901540070772171, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.79150390625, "rewards/margins": 52.005218505859375, "rewards/rejected": 10.777876853942871, "step": 5136 }, { "epoch": 2.658902691511387, "grad_norm": 0.7181593179702759, "learning_rate": 3.4931343879641587e-07, "loss": 0.07669852674007416, "rewards/accuracies": 0.96875, "rewards/chosen": 51.94359588623047, "rewards/margins": 42.61798095703125, "rewards/rejected": 9.317222595214844, "step": 5137 }, { "epoch": 2.6594202898550723, "grad_norm": 1.0162609815597534, "learning_rate": 3.4826658399750245e-07, "loss": 0.07426271587610245, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.885005950927734, "rewards/margins": 48.568572998046875, "rewards/rejected": 14.307369232177734, "step": 5138 }, { "epoch": 2.659937888198758, "grad_norm": 4.738377571105957, "learning_rate": 3.472212436039485e-07, "loss": 0.12059660255908966, "rewards/accuracies": 0.984375, "rewards/chosen": 57.57722854614258, "rewards/margins": 47.38615417480469, "rewards/rejected": 10.206287384033203, "step": 5139 }, { "epoch": 2.660455486542443, "grad_norm": 0.7986171841621399, "learning_rate": 3.4617741795606953e-07, "loss": 0.05524950101971626, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.40965270996094, "rewards/margins": 45.93096923828125, "rewards/rejected": 11.478740692138672, "step": 5140 }, { "epoch": 2.6609730848861286, "grad_norm": 2.3476688861846924, "learning_rate": 3.451351073936904e-07, "loss": 0.09886839240789413, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.61053466796875, "rewards/margins": 44.6947021484375, "rewards/rejected": 12.906135559082031, "step": 5141 }, { "epoch": 2.6614906832298137, "grad_norm": 1.4613285064697266, "learning_rate": 3.4409431225614576e-07, "loss": 0.09847155213356018, "rewards/accuracies": 0.9375, "rewards/chosen": 54.50192642211914, "rewards/margins": 44.1114501953125, "rewards/rejected": 10.406293869018555, "step": 5142 }, { "epoch": 2.662008281573499, "grad_norm": 1.3778984546661377, "learning_rate": 3.4305503288227125e-07, "loss": 0.10353846848011017, "rewards/accuracies": 0.9375, "rewards/chosen": 56.90944290161133, "rewards/margins": 45.1197509765625, "rewards/rejected": 11.775373458862305, "step": 5143 }, { "epoch": 2.662525879917184, "grad_norm": 0.7160128355026245, "learning_rate": 3.4201726961041295e-07, "loss": 0.060145895928144455, "rewards/accuracies": 0.96875, "rewards/chosen": 59.939205169677734, "rewards/margins": 48.238555908203125, "rewards/rejected": 11.703418731689453, "step": 5144 }, { "epoch": 2.6630434782608696, "grad_norm": 1.1878036260604858, "learning_rate": 3.4098102277842214e-07, "loss": 0.08361425995826721, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.47708511352539, "rewards/margins": 46.54522705078125, "rewards/rejected": 12.934219360351562, "step": 5145 }, { "epoch": 2.6635610766045548, "grad_norm": 2.4058167934417725, "learning_rate": 3.3994629272365574e-07, "loss": 0.08961717784404755, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.57872009277344, "rewards/margins": 48.31330871582031, "rewards/rejected": 13.255329132080078, "step": 5146 }, { "epoch": 2.6640786749482404, "grad_norm": 1.172528862953186, "learning_rate": 3.3891307978297874e-07, "loss": 0.11688876152038574, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.08592224121094, "rewards/margins": 45.10003662109375, "rewards/rejected": 11.987274169921875, "step": 5147 }, { "epoch": 2.6645962732919255, "grad_norm": 0.7636017799377441, "learning_rate": 3.3788138429275973e-07, "loss": 0.05820570886135101, "rewards/accuracies": 0.984375, "rewards/chosen": 57.6015625, "rewards/margins": 42.797943115234375, "rewards/rejected": 14.799842834472656, "step": 5148 }, { "epoch": 2.6651138716356106, "grad_norm": 1.370764136314392, "learning_rate": 3.368512065888757e-07, "loss": 0.10746467113494873, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.88961410522461, "rewards/margins": 45.08317565917969, "rewards/rejected": 8.804435729980469, "step": 5149 }, { "epoch": 2.6656314699792962, "grad_norm": 0.8931684494018555, "learning_rate": 3.3582254700670837e-07, "loss": 0.06782174855470657, "rewards/accuracies": 0.96875, "rewards/chosen": 52.864166259765625, "rewards/margins": 43.216339111328125, "rewards/rejected": 9.649903297424316, "step": 5150 }, { "epoch": 2.6661490683229814, "grad_norm": 2.0571370124816895, "learning_rate": 3.3479540588114425e-07, "loss": 0.13927096128463745, "rewards/accuracies": 0.9453125, "rewards/chosen": 52.77760696411133, "rewards/margins": 43.5631103515625, "rewards/rejected": 9.211189270019531, "step": 5151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.9250035881996155, "learning_rate": 3.337697835465775e-07, "loss": 0.036816246807575226, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.485939025878906, "rewards/margins": 48.50982666015625, "rewards/rejected": 10.975303649902344, "step": 5152 }, { "epoch": 2.667184265010352, "grad_norm": 2.037992238998413, "learning_rate": 3.327456803369067e-07, "loss": 0.04240208491683006, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.003387451171875, "rewards/margins": 45.525970458984375, "rewards/rejected": 11.466594696044922, "step": 5153 }, { "epoch": 2.6677018633540373, "grad_norm": 1.5558995008468628, "learning_rate": 3.3172309658553616e-07, "loss": 0.07818258553743362, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.768096923828125, "rewards/margins": 41.055572509765625, "rewards/rejected": 11.712522506713867, "step": 5154 }, { "epoch": 2.6682194616977224, "grad_norm": 0.6901648640632629, "learning_rate": 3.307020326253757e-07, "loss": 0.06971048563718796, "rewards/accuracies": 0.96875, "rewards/chosen": 63.207115173339844, "rewards/margins": 47.23321533203125, "rewards/rejected": 15.969802856445312, "step": 5155 }, { "epoch": 2.668737060041408, "grad_norm": 1.0578187704086304, "learning_rate": 3.296824887888378e-07, "loss": 0.10422290861606598, "rewards/accuracies": 0.953125, "rewards/chosen": 57.2713623046875, "rewards/margins": 44.324493408203125, "rewards/rejected": 12.956645965576172, "step": 5156 }, { "epoch": 2.669254658385093, "grad_norm": 1.0387662649154663, "learning_rate": 3.2866446540784523e-07, "loss": 0.07034320384263992, "rewards/accuracies": 0.9609375, "rewards/chosen": 64.01164245605469, "rewards/margins": 50.54864501953125, "rewards/rejected": 13.464553833007812, "step": 5157 }, { "epoch": 2.6697722567287787, "grad_norm": 0.875667929649353, "learning_rate": 3.276479628138224e-07, "loss": 0.07612034678459167, "rewards/accuracies": 0.953125, "rewards/chosen": 50.648406982421875, "rewards/margins": 42.01979064941406, "rewards/rejected": 8.635337829589844, "step": 5158 }, { "epoch": 2.670289855072464, "grad_norm": 0.763346791267395, "learning_rate": 3.266329813376973e-07, "loss": 0.07594209164381027, "rewards/accuracies": 0.96875, "rewards/chosen": 49.847076416015625, "rewards/margins": 40.655364990234375, "rewards/rejected": 9.191587448120117, "step": 5159 }, { "epoch": 2.670807453416149, "grad_norm": 1.1853615045547485, "learning_rate": 3.256195213099045e-07, "loss": 0.07350745797157288, "rewards/accuracies": 0.96875, "rewards/chosen": 55.86076736450195, "rewards/margins": 46.53155517578125, "rewards/rejected": 9.323532104492188, "step": 5160 }, { "epoch": 2.671325051759834, "grad_norm": 0.9294557571411133, "learning_rate": 3.246075830603851e-07, "loss": 0.09542076289653778, "rewards/accuracies": 0.9375, "rewards/chosen": 56.900943756103516, "rewards/margins": 47.50743103027344, "rewards/rejected": 9.389686584472656, "step": 5161 }, { "epoch": 2.6718426501035197, "grad_norm": 1.9303995370864868, "learning_rate": 3.2359716691858336e-07, "loss": 0.13503094017505646, "rewards/accuracies": 0.9453125, "rewards/chosen": 49.114479064941406, "rewards/margins": 40.68988037109375, "rewards/rejected": 8.428010940551758, "step": 5162 }, { "epoch": 2.672360248447205, "grad_norm": 1.7365236282348633, "learning_rate": 3.225882732134439e-07, "loss": 0.10818828642368317, "rewards/accuracies": 0.9375, "rewards/chosen": 52.6632080078125, "rewards/margins": 41.90211486816406, "rewards/rejected": 10.761751174926758, "step": 5163 }, { "epoch": 2.6728778467908905, "grad_norm": 1.012662410736084, "learning_rate": 3.2158090227342286e-07, "loss": 0.07969266176223755, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.97638702392578, "rewards/margins": 47.962127685546875, "rewards/rejected": 13.011970520019531, "step": 5164 }, { "epoch": 2.6733954451345756, "grad_norm": 0.9736490249633789, "learning_rate": 3.205750544264763e-07, "loss": 0.08480864018201828, "rewards/accuracies": 0.953125, "rewards/chosen": 58.38715744018555, "rewards/margins": 47.68597412109375, "rewards/rejected": 10.698062896728516, "step": 5165 }, { "epoch": 2.6739130434782608, "grad_norm": 1.2010139226913452, "learning_rate": 3.195707300000644e-07, "loss": 0.12378397583961487, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.329132080078125, "rewards/margins": 44.67552185058594, "rewards/rejected": 8.65371322631836, "step": 5166 }, { "epoch": 2.674430641821946, "grad_norm": 0.7792221307754517, "learning_rate": 3.185679293211513e-07, "loss": 0.0520610474050045, "rewards/accuracies": 0.96875, "rewards/chosen": 57.48027038574219, "rewards/margins": 45.6494140625, "rewards/rejected": 11.83462905883789, "step": 5167 }, { "epoch": 2.6749482401656315, "grad_norm": 1.161212682723999, "learning_rate": 3.175666527162097e-07, "loss": 0.09344534575939178, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.385536193847656, "rewards/margins": 43.80369567871094, "rewards/rejected": 13.578502655029297, "step": 5168 }, { "epoch": 2.6754658385093166, "grad_norm": 1.1896114349365234, "learning_rate": 3.1656690051120875e-07, "loss": 0.09874242544174194, "rewards/accuracies": 0.953125, "rewards/chosen": 59.56045150756836, "rewards/margins": 47.2611083984375, "rewards/rejected": 12.302215576171875, "step": 5169 }, { "epoch": 2.675983436853002, "grad_norm": 0.8327924013137817, "learning_rate": 3.1556867303162595e-07, "loss": 0.07393941283226013, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.75716018676758, "rewards/margins": 43.84506607055664, "rewards/rejected": 10.898445129394531, "step": 5170 }, { "epoch": 2.6765010351966874, "grad_norm": 2.2357418537139893, "learning_rate": 3.145719706024425e-07, "loss": 0.14646440744400024, "rewards/accuracies": 0.921875, "rewards/chosen": 56.383888244628906, "rewards/margins": 45.759429931640625, "rewards/rejected": 10.636089324951172, "step": 5171 }, { "epoch": 2.6770186335403725, "grad_norm": 0.9733155965805054, "learning_rate": 3.1357679354814095e-07, "loss": 0.1108141615986824, "rewards/accuracies": 0.921875, "rewards/chosen": 58.08266830444336, "rewards/margins": 47.133453369140625, "rewards/rejected": 10.944023132324219, "step": 5172 }, { "epoch": 2.677536231884058, "grad_norm": 0.917341411113739, "learning_rate": 3.125831421927089e-07, "loss": 0.04858962818980217, "rewards/accuracies": 0.984375, "rewards/chosen": 60.47627258300781, "rewards/margins": 44.81260681152344, "rewards/rejected": 15.670166015625, "step": 5173 }, { "epoch": 2.6780538302277432, "grad_norm": 0.9119963645935059, "learning_rate": 3.1159101685963646e-07, "loss": 0.07913216948509216, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.446285247802734, "rewards/margins": 48.67649841308594, "rewards/rejected": 10.770712852478027, "step": 5174 }, { "epoch": 2.678571428571429, "grad_norm": 0.9426063895225525, "learning_rate": 3.106004178719174e-07, "loss": 0.0706339180469513, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.82284927368164, "rewards/margins": 42.538177490234375, "rewards/rejected": 10.28046989440918, "step": 5175 }, { "epoch": 2.679089026915114, "grad_norm": 1.0169345140457153, "learning_rate": 3.096113455520483e-07, "loss": 0.11558696627616882, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.25440979003906, "rewards/margins": 43.272796630859375, "rewards/rejected": 9.980268478393555, "step": 5176 }, { "epoch": 2.679606625258799, "grad_norm": 1.0451525449752808, "learning_rate": 3.0862380022202876e-07, "loss": 0.09640014171600342, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.20820617675781, "rewards/margins": 40.78974914550781, "rewards/rejected": 9.417388916015625, "step": 5177 }, { "epoch": 2.6801242236024843, "grad_norm": 0.9798035621643066, "learning_rate": 3.0763778220336104e-07, "loss": 0.07428738474845886, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.69648742675781, "rewards/margins": 46.78318786621094, "rewards/rejected": 11.924135208129883, "step": 5178 }, { "epoch": 2.68064182194617, "grad_norm": 0.5192062854766846, "learning_rate": 3.066532918170512e-07, "loss": 0.04402102902531624, "rewards/accuracies": 0.984375, "rewards/chosen": 53.01695251464844, "rewards/margins": 42.82244873046875, "rewards/rejected": 10.19970703125, "step": 5179 }, { "epoch": 2.681159420289855, "grad_norm": 2.298030138015747, "learning_rate": 3.0567032938360607e-07, "loss": 0.08877632021903992, "rewards/accuracies": 0.96875, "rewards/chosen": 60.35075378417969, "rewards/margins": 48.529296875, "rewards/rejected": 11.811203002929688, "step": 5180 }, { "epoch": 2.6816770186335406, "grad_norm": 2.0388238430023193, "learning_rate": 3.0468889522303756e-07, "loss": 0.1254091113805771, "rewards/accuracies": 0.921875, "rewards/chosen": 49.153953552246094, "rewards/margins": 39.70008087158203, "rewards/rejected": 9.44765567779541, "step": 5181 }, { "epoch": 2.6821946169772257, "grad_norm": 1.3547296524047852, "learning_rate": 3.0370898965485673e-07, "loss": 0.07578850537538528, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.475555419921875, "rewards/margins": 45.811004638671875, "rewards/rejected": 13.677391052246094, "step": 5182 }, { "epoch": 2.682712215320911, "grad_norm": 0.6330766081809998, "learning_rate": 3.027306129980806e-07, "loss": 0.06329646706581116, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.93585968017578, "rewards/margins": 41.36871337890625, "rewards/rejected": 10.556655883789062, "step": 5183 }, { "epoch": 2.683229813664596, "grad_norm": 1.9311444759368896, "learning_rate": 3.017537655712255e-07, "loss": 0.14496874809265137, "rewards/accuracies": 0.9296875, "rewards/chosen": 57.94829559326172, "rewards/margins": 43.59327697753906, "rewards/rejected": 14.337308883666992, "step": 5184 }, { "epoch": 2.6837474120082816, "grad_norm": 0.8188586831092834, "learning_rate": 3.007784476923131e-07, "loss": 0.05004357546567917, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.74185943603516, "rewards/margins": 51.57804870605469, "rewards/rejected": 13.169309616088867, "step": 5185 }, { "epoch": 2.6842650103519667, "grad_norm": 1.0073843002319336, "learning_rate": 2.998046596788623e-07, "loss": 0.0605010986328125, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.324134826660156, "rewards/margins": 42.19892883300781, "rewards/rejected": 12.128217697143555, "step": 5186 }, { "epoch": 2.6847826086956523, "grad_norm": 1.1963176727294922, "learning_rate": 2.988324018478994e-07, "loss": 0.10848911106586456, "rewards/accuracies": 0.9375, "rewards/chosen": 60.59588623046875, "rewards/margins": 47.485626220703125, "rewards/rejected": 13.112018585205078, "step": 5187 }, { "epoch": 2.6853002070393375, "grad_norm": 3.2630581855773926, "learning_rate": 2.9786167451594903e-07, "loss": 0.1508832424879074, "rewards/accuracies": 0.953125, "rewards/chosen": 52.71519470214844, "rewards/margins": 40.26024627685547, "rewards/rejected": 12.45412826538086, "step": 5188 }, { "epoch": 2.6858178053830226, "grad_norm": 0.5508143901824951, "learning_rate": 2.9689247799903787e-07, "loss": 0.04475097358226776, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.34651184082031, "rewards/margins": 47.326873779296875, "rewards/rejected": 13.012336730957031, "step": 5189 }, { "epoch": 2.686335403726708, "grad_norm": 0.7089236378669739, "learning_rate": 2.9592481261269515e-07, "loss": 0.05949368700385094, "rewards/accuracies": 0.96875, "rewards/chosen": 57.848358154296875, "rewards/margins": 48.26068115234375, "rewards/rejected": 9.576305389404297, "step": 5190 }, { "epoch": 2.6868530020703933, "grad_norm": 0.8809359669685364, "learning_rate": 2.9495867867195215e-07, "loss": 0.07047824561595917, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.91997528076172, "rewards/margins": 45.07298278808594, "rewards/rejected": 11.850347518920898, "step": 5191 }, { "epoch": 2.687370600414079, "grad_norm": 0.8880408406257629, "learning_rate": 2.939940764913396e-07, "loss": 0.083713099360466, "rewards/accuracies": 0.953125, "rewards/chosen": 56.11912536621094, "rewards/margins": 43.90673828125, "rewards/rejected": 12.198104858398438, "step": 5192 }, { "epoch": 2.687888198757764, "grad_norm": 0.9312866926193237, "learning_rate": 2.930310063848907e-07, "loss": 0.06282192468643188, "rewards/accuracies": 0.96875, "rewards/chosen": 61.78986358642578, "rewards/margins": 48.218170166015625, "rewards/rejected": 13.56515884399414, "step": 5193 }, { "epoch": 2.6884057971014492, "grad_norm": 1.6315875053405762, "learning_rate": 2.920694686661413e-07, "loss": 0.0991157740354538, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.148040771484375, "rewards/margins": 45.91630554199219, "rewards/rejected": 11.23678970336914, "step": 5194 }, { "epoch": 2.6889233954451344, "grad_norm": 0.6406186819076538, "learning_rate": 2.9110946364812545e-07, "loss": 0.05397311598062515, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.384117126464844, "rewards/margins": 46.3790283203125, "rewards/rejected": 10.997268676757812, "step": 5195 }, { "epoch": 2.68944099378882, "grad_norm": 0.823214054107666, "learning_rate": 2.901509916433803e-07, "loss": 0.06900084763765335, "rewards/accuracies": 0.96875, "rewards/chosen": 57.96519088745117, "rewards/margins": 44.96892547607422, "rewards/rejected": 12.998292922973633, "step": 5196 }, { "epoch": 2.689958592132505, "grad_norm": 1.174643874168396, "learning_rate": 2.8919405296394254e-07, "loss": 0.0579877533018589, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.95257568359375, "rewards/margins": 44.37054443359375, "rewards/rejected": 12.582134246826172, "step": 5197 }, { "epoch": 2.6904761904761907, "grad_norm": 0.5114105343818665, "learning_rate": 2.882386479213528e-07, "loss": 0.028387203812599182, "rewards/accuracies": 0.984375, "rewards/chosen": 65.21278381347656, "rewards/margins": 52.02992248535156, "rewards/rejected": 13.187126159667969, "step": 5198 }, { "epoch": 2.690993788819876, "grad_norm": 0.7260948419570923, "learning_rate": 2.8728477682664733e-07, "loss": 0.08154575526714325, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.902061462402344, "rewards/margins": 44.8367919921875, "rewards/rejected": 9.054302215576172, "step": 5199 }, { "epoch": 2.691511387163561, "grad_norm": 1.3108347654342651, "learning_rate": 2.863324399903672e-07, "loss": 0.07861943542957306, "rewards/accuracies": 0.953125, "rewards/chosen": 58.78234100341797, "rewards/margins": 46.79978942871094, "rewards/rejected": 11.994049072265625, "step": 5200 }, { "epoch": 2.692028985507246, "grad_norm": 0.7274074554443359, "learning_rate": 2.853816377225521e-07, "loss": 0.07830719649791718, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.1086311340332, "rewards/margins": 45.05780029296875, "rewards/rejected": 11.048224449157715, "step": 5201 }, { "epoch": 2.6925465838509317, "grad_norm": 0.79109787940979, "learning_rate": 2.8443237033274285e-07, "loss": 0.0811198502779007, "rewards/accuracies": 0.953125, "rewards/chosen": 55.06903839111328, "rewards/margins": 44.40896224975586, "rewards/rejected": 10.670530319213867, "step": 5202 }, { "epoch": 2.693064182194617, "grad_norm": 1.3632622957229614, "learning_rate": 2.8348463812997994e-07, "loss": 0.08149641007184982, "rewards/accuracies": 0.953125, "rewards/chosen": 51.3875732421875, "rewards/margins": 40.7467041015625, "rewards/rejected": 10.65408706665039, "step": 5203 }, { "epoch": 2.6935817805383024, "grad_norm": 0.923154354095459, "learning_rate": 2.825384414228044e-07, "loss": 0.05717463791370392, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.25654602050781, "rewards/margins": 48.315643310546875, "rewards/rejected": 12.9417724609375, "step": 5204 }, { "epoch": 2.6940993788819876, "grad_norm": 1.3653994798660278, "learning_rate": 2.815937805192576e-07, "loss": 0.12001386284828186, "rewards/accuracies": 0.9375, "rewards/chosen": 60.671878814697266, "rewards/margins": 47.87675476074219, "rewards/rejected": 12.789642333984375, "step": 5205 }, { "epoch": 2.6946169772256727, "grad_norm": 2.881699800491333, "learning_rate": 2.8065065572688123e-07, "loss": 0.08876985311508179, "rewards/accuracies": 0.9375, "rewards/chosen": 62.8031005859375, "rewards/margins": 48.766693115234375, "rewards/rejected": 14.050132751464844, "step": 5206 }, { "epoch": 2.6951345755693583, "grad_norm": 1.3679423332214355, "learning_rate": 2.797090673527153e-07, "loss": 0.11471078544855118, "rewards/accuracies": 0.953125, "rewards/chosen": 54.24305725097656, "rewards/margins": 43.16764831542969, "rewards/rejected": 11.080646514892578, "step": 5207 }, { "epoch": 2.6956521739130435, "grad_norm": 2.0783169269561768, "learning_rate": 2.787690157033013e-07, "loss": 0.09148922562599182, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.64265441894531, "rewards/margins": 43.90936279296875, "rewards/rejected": 10.722127914428711, "step": 5208 }, { "epoch": 2.696169772256729, "grad_norm": 2.7825422286987305, "learning_rate": 2.778305010846799e-07, "loss": 0.12137159705162048, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.28109359741211, "rewards/margins": 46.76373291015625, "rewards/rejected": 12.516889572143555, "step": 5209 }, { "epoch": 2.696687370600414, "grad_norm": 0.6984475255012512, "learning_rate": 2.768935238023912e-07, "loss": 0.05628412216901779, "rewards/accuracies": 0.96875, "rewards/chosen": 60.10533142089844, "rewards/margins": 49.295989990234375, "rewards/rejected": 10.811003684997559, "step": 5210 }, { "epoch": 2.6972049689440993, "grad_norm": 0.5871882438659668, "learning_rate": 2.7595808416147505e-07, "loss": 0.05618567764759064, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.197998046875, "rewards/margins": 43.77568054199219, "rewards/rejected": 10.41840934753418, "step": 5211 }, { "epoch": 2.6977225672877845, "grad_norm": 1.0795432329177856, "learning_rate": 2.750241824664696e-07, "loss": 0.08945191651582718, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.77768325805664, "rewards/margins": 49.85137939453125, "rewards/rejected": 10.935409545898438, "step": 5212 }, { "epoch": 2.69824016563147, "grad_norm": 0.769625723361969, "learning_rate": 2.7409181902141423e-07, "loss": 0.06848733872175217, "rewards/accuracies": 0.96875, "rewards/chosen": 54.808082580566406, "rewards/margins": 41.50434875488281, "rewards/rejected": 13.287162780761719, "step": 5213 }, { "epoch": 2.698757763975155, "grad_norm": 0.7470608949661255, "learning_rate": 2.7316099412984745e-07, "loss": 0.04896971583366394, "rewards/accuracies": 0.9921875, "rewards/chosen": 58.61358642578125, "rewards/margins": 46.975189208984375, "rewards/rejected": 11.635677337646484, "step": 5214 }, { "epoch": 2.699275362318841, "grad_norm": 1.769640564918518, "learning_rate": 2.722317080948045e-07, "loss": 0.09447836875915527, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.02587890625, "rewards/margins": 43.469512939453125, "rewards/rejected": 13.560998916625977, "step": 5215 }, { "epoch": 2.699792960662526, "grad_norm": 2.140472412109375, "learning_rate": 2.713039612188206e-07, "loss": 0.12042568624019623, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.92207336425781, "rewards/margins": 47.86236572265625, "rewards/rejected": 11.059405326843262, "step": 5216 }, { "epoch": 2.700310559006211, "grad_norm": 0.9320109486579895, "learning_rate": 2.703777538039332e-07, "loss": 0.05627450346946716, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.53749084472656, "rewards/margins": 48.066741943359375, "rewards/rejected": 11.478042602539062, "step": 5217 }, { "epoch": 2.7008281573498962, "grad_norm": 0.553166389465332, "learning_rate": 2.694530861516731e-07, "loss": 0.044989004731178284, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.74177169799805, "rewards/margins": 45.2879638671875, "rewards/rejected": 13.441062927246094, "step": 5218 }, { "epoch": 2.701345755693582, "grad_norm": 0.9376187920570374, "learning_rate": 2.685299585630735e-07, "loss": 0.08038134127855301, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.82525634765625, "rewards/margins": 45.331024169921875, "rewards/rejected": 11.496414184570312, "step": 5219 }, { "epoch": 2.701863354037267, "grad_norm": 0.9111939072608948, "learning_rate": 2.676083713386646e-07, "loss": 0.1123572289943695, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.35807800292969, "rewards/margins": 45.802154541015625, "rewards/rejected": 9.543724060058594, "step": 5220 }, { "epoch": 2.7023809523809526, "grad_norm": 0.7396010160446167, "learning_rate": 2.666883247784774e-07, "loss": 0.03720853105187416, "rewards/accuracies": 0.9921875, "rewards/chosen": 61.484375, "rewards/margins": 48.906982421875, "rewards/rejected": 12.57166862487793, "step": 5221 }, { "epoch": 2.7028985507246377, "grad_norm": 1.3234467506408691, "learning_rate": 2.6576981918203845e-07, "loss": 0.09519119560718536, "rewards/accuracies": 0.9765625, "rewards/chosen": 66.56344604492188, "rewards/margins": 53.375, "rewards/rejected": 13.203462600708008, "step": 5222 }, { "epoch": 2.703416149068323, "grad_norm": 1.039239764213562, "learning_rate": 2.6485285484837366e-07, "loss": 0.10060364753007889, "rewards/accuracies": 0.953125, "rewards/chosen": 58.91273498535156, "rewards/margins": 48.397857666015625, "rewards/rejected": 10.525928497314453, "step": 5223 }, { "epoch": 2.7039337474120084, "grad_norm": 1.8727946281433105, "learning_rate": 2.639374320760085e-07, "loss": 0.08459307253360748, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.71891784667969, "rewards/margins": 48.053802490234375, "rewards/rejected": 11.675338745117188, "step": 5224 }, { "epoch": 2.7044513457556936, "grad_norm": 2.700763702392578, "learning_rate": 2.6302355116296474e-07, "loss": 0.13520288467407227, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.10877990722656, "rewards/margins": 43.27105712890625, "rewards/rejected": 11.846031188964844, "step": 5225 }, { "epoch": 2.704968944099379, "grad_norm": 0.7763716578483582, "learning_rate": 2.6211121240676265e-07, "loss": 0.08193641155958176, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.70334243774414, "rewards/margins": 40.10084533691406, "rewards/rejected": 8.597237586975098, "step": 5226 }, { "epoch": 2.7054865424430643, "grad_norm": 0.4952094554901123, "learning_rate": 2.612004161044213e-07, "loss": 0.05213805288076401, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.83967590332031, "rewards/margins": 47.213165283203125, "rewards/rejected": 11.637221336364746, "step": 5227 }, { "epoch": 2.7060041407867494, "grad_norm": 0.9812266230583191, "learning_rate": 2.602911625524562e-07, "loss": 0.06367149204015732, "rewards/accuracies": 0.96875, "rewards/chosen": 58.039180755615234, "rewards/margins": 46.287750244140625, "rewards/rejected": 11.76458740234375, "step": 5228 }, { "epoch": 2.7065217391304346, "grad_norm": 0.7649813890457153, "learning_rate": 2.5938345204688186e-07, "loss": 0.06946907192468643, "rewards/accuracies": 0.953125, "rewards/chosen": 56.320518493652344, "rewards/margins": 44.23876953125, "rewards/rejected": 12.088401794433594, "step": 5229 }, { "epoch": 2.70703933747412, "grad_norm": 1.6376138925552368, "learning_rate": 2.5847728488321064e-07, "loss": 0.0707489550113678, "rewards/accuracies": 0.953125, "rewards/chosen": 57.047760009765625, "rewards/margins": 48.0322265625, "rewards/rejected": 9.014358520507812, "step": 5230 }, { "epoch": 2.7075569358178053, "grad_norm": 0.5874127149581909, "learning_rate": 2.5757266135645054e-07, "loss": 0.06653663516044617, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.60057830810547, "rewards/margins": 49.78132629394531, "rewards/rejected": 12.811996459960938, "step": 5231 }, { "epoch": 2.708074534161491, "grad_norm": 2.567556381225586, "learning_rate": 2.5666958176110923e-07, "loss": 0.08230160921812057, "rewards/accuracies": 0.96875, "rewards/chosen": 60.00271224975586, "rewards/margins": 50.663848876953125, "rewards/rejected": 9.346989631652832, "step": 5232 }, { "epoch": 2.708592132505176, "grad_norm": 1.7352724075317383, "learning_rate": 2.5576804639119044e-07, "loss": 0.10038460791110992, "rewards/accuracies": 0.9375, "rewards/chosen": 61.88637924194336, "rewards/margins": 51.57293701171875, "rewards/rejected": 10.308372497558594, "step": 5233 }, { "epoch": 2.709109730848861, "grad_norm": 1.9170867204666138, "learning_rate": 2.5486805554019557e-07, "loss": 0.10414840281009674, "rewards/accuracies": 0.96875, "rewards/chosen": 64.66415405273438, "rewards/margins": 51.25531005859375, "rewards/rejected": 13.409643173217773, "step": 5234 }, { "epoch": 2.7096273291925463, "grad_norm": 1.1422553062438965, "learning_rate": 2.539696095011229e-07, "loss": 0.08783901482820511, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.15785217285156, "rewards/margins": 47.095916748046875, "rewards/rejected": 12.055946350097656, "step": 5235 }, { "epoch": 2.710144927536232, "grad_norm": 0.7304866909980774, "learning_rate": 2.530727085664686e-07, "loss": 0.08448350429534912, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.196922302246094, "rewards/margins": 42.044281005859375, "rewards/rejected": 10.16231918334961, "step": 5236 }, { "epoch": 2.710662525879917, "grad_norm": 0.8382190465927124, "learning_rate": 2.521773530282251e-07, "loss": 0.09205866605043411, "rewards/accuracies": 0.953125, "rewards/chosen": 55.8450927734375, "rewards/margins": 44.069549560546875, "rewards/rejected": 11.773513793945312, "step": 5237 }, { "epoch": 2.7111801242236027, "grad_norm": 0.9702104330062866, "learning_rate": 2.512835431778804e-07, "loss": 0.10963387042284012, "rewards/accuracies": 0.953125, "rewards/chosen": 59.5733528137207, "rewards/margins": 48.20262145996094, "rewards/rejected": 11.371986389160156, "step": 5238 }, { "epoch": 2.711697722567288, "grad_norm": 1.583133339881897, "learning_rate": 2.5039127930642226e-07, "loss": 0.06536810845136642, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.394508361816406, "rewards/margins": 47.81690979003906, "rewards/rejected": 12.57241439819336, "step": 5239 }, { "epoch": 2.712215320910973, "grad_norm": 0.6821041703224182, "learning_rate": 2.4950056170433445e-07, "loss": 0.05721653252840042, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.18604278564453, "rewards/margins": 49.557830810546875, "rewards/rejected": 11.614898681640625, "step": 5240 }, { "epoch": 2.7127329192546585, "grad_norm": 1.4172917604446411, "learning_rate": 2.4861139066159447e-07, "loss": 0.0947536751627922, "rewards/accuracies": 0.9453125, "rewards/chosen": 62.32991027832031, "rewards/margins": 48.044525146484375, "rewards/rejected": 14.299556732177734, "step": 5241 }, { "epoch": 2.7132505175983437, "grad_norm": 1.8545745611190796, "learning_rate": 2.477237664676785e-07, "loss": 0.08623914420604706, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.34613037109375, "rewards/margins": 44.05937957763672, "rewards/rejected": 13.288259506225586, "step": 5242 }, { "epoch": 2.713768115942029, "grad_norm": 0.8674634099006653, "learning_rate": 2.468376894115604e-07, "loss": 0.07256539165973663, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.777381896972656, "rewards/margins": 48.406707763671875, "rewards/rejected": 12.368717193603516, "step": 5243 }, { "epoch": 2.7142857142857144, "grad_norm": 0.7234349846839905, "learning_rate": 2.459531597817094e-07, "loss": 0.08156855404376984, "rewards/accuracies": 0.9375, "rewards/chosen": 60.255897521972656, "rewards/margins": 50.47447204589844, "rewards/rejected": 9.78270149230957, "step": 5244 }, { "epoch": 2.7148033126293996, "grad_norm": 6.475001335144043, "learning_rate": 2.45070177866088e-07, "loss": 0.13018284738063812, "rewards/accuracies": 0.953125, "rewards/chosen": 58.672630310058594, "rewards/margins": 45.94000244140625, "rewards/rejected": 12.744483947753906, "step": 5245 }, { "epoch": 2.7153209109730847, "grad_norm": 1.7174291610717773, "learning_rate": 2.4418874395215843e-07, "loss": 0.1002514660358429, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.10289764404297, "rewards/margins": 37.7027587890625, "rewards/rejected": 8.413251876831055, "step": 5246 }, { "epoch": 2.7158385093167703, "grad_norm": 1.7236393690109253, "learning_rate": 2.4330885832687945e-07, "loss": 0.15094462037086487, "rewards/accuracies": 0.90625, "rewards/chosen": 49.38134765625, "rewards/margins": 40.65692138671875, "rewards/rejected": 8.721415519714355, "step": 5247 }, { "epoch": 2.7163561076604554, "grad_norm": 0.9189288020133972, "learning_rate": 2.4243052127670255e-07, "loss": 0.09569036215543747, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.38128662109375, "rewards/margins": 48.3958740234375, "rewards/rejected": 8.982833862304688, "step": 5248 }, { "epoch": 2.716873706004141, "grad_norm": 0.9714841246604919, "learning_rate": 2.4155373308757667e-07, "loss": 0.11742806434631348, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.15913772583008, "rewards/margins": 41.0072021484375, "rewards/rejected": 8.158525466918945, "step": 5249 }, { "epoch": 2.717391304347826, "grad_norm": 1.916445016860962, "learning_rate": 2.4067849404494637e-07, "loss": 0.08830112963914871, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.70996856689453, "rewards/margins": 47.475433349609375, "rewards/rejected": 11.234346389770508, "step": 5250 }, { "epoch": 2.7179089026915113, "grad_norm": 0.4588073194026947, "learning_rate": 2.398048044337531e-07, "loss": 0.039486903697252274, "rewards/accuracies": 0.984375, "rewards/chosen": 64.90699005126953, "rewards/margins": 52.099853515625, "rewards/rejected": 12.801895141601562, "step": 5251 }, { "epoch": 2.7184265010351965, "grad_norm": 1.1486417055130005, "learning_rate": 2.389326645384321e-07, "loss": 0.10167407989501953, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.464805603027344, "rewards/margins": 44.565521240234375, "rewards/rejected": 12.896982192993164, "step": 5252 }, { "epoch": 2.718944099378882, "grad_norm": 0.8731952905654907, "learning_rate": 2.380620746429152e-07, "loss": 0.09746675938367844, "rewards/accuracies": 0.953125, "rewards/chosen": 53.65426254272461, "rewards/margins": 42.10172653198242, "rewards/rejected": 11.55460262298584, "step": 5253 }, { "epoch": 2.719461697722567, "grad_norm": 0.829803466796875, "learning_rate": 2.3719303503062841e-07, "loss": 0.05406736582517624, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.22096252441406, "rewards/margins": 49.62408447265625, "rewards/rejected": 11.623199462890625, "step": 5254 }, { "epoch": 2.7199792960662528, "grad_norm": 0.5365901589393616, "learning_rate": 2.3632554598449487e-07, "loss": 0.04655035212635994, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.11908721923828, "rewards/margins": 46.759521484375, "rewards/rejected": 12.359031677246094, "step": 5255 }, { "epoch": 2.720496894409938, "grad_norm": 0.9804007411003113, "learning_rate": 2.3545960778693146e-07, "loss": 0.1138630285859108, "rewards/accuracies": 0.9375, "rewards/chosen": 51.889076232910156, "rewards/margins": 41.888275146484375, "rewards/rejected": 10.008018493652344, "step": 5256 }, { "epoch": 2.721014492753623, "grad_norm": 0.8603116273880005, "learning_rate": 2.3459522071985042e-07, "loss": 0.10675971955060959, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.842124938964844, "rewards/margins": 42.869384765625, "rewards/rejected": 10.965126037597656, "step": 5257 }, { "epoch": 2.7215320910973086, "grad_norm": 0.9654470682144165, "learning_rate": 2.337323850646589e-07, "loss": 0.0758982002735138, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.02803039550781, "rewards/margins": 48.19560241699219, "rewards/rejected": 12.828594207763672, "step": 5258 }, { "epoch": 2.722049689440994, "grad_norm": 0.9654826521873474, "learning_rate": 2.3287110110225998e-07, "loss": 0.07657994329929352, "rewards/accuracies": 0.953125, "rewards/chosen": 58.552371978759766, "rewards/margins": 47.448699951171875, "rewards/rejected": 11.09521484375, "step": 5259 }, { "epoch": 2.722567287784679, "grad_norm": 0.6907655000686646, "learning_rate": 2.3201136911305099e-07, "loss": 0.09245941042900085, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.87710952758789, "rewards/margins": 45.23272705078125, "rewards/rejected": 11.650161743164062, "step": 5260 }, { "epoch": 2.7230848861283645, "grad_norm": 0.6138346195220947, "learning_rate": 2.311531893769231e-07, "loss": 0.0571373775601387, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.21588897705078, "rewards/margins": 44.51429748535156, "rewards/rejected": 8.704374313354492, "step": 5261 }, { "epoch": 2.7236024844720497, "grad_norm": 1.195725679397583, "learning_rate": 2.3029656217326336e-07, "loss": 0.09032408893108368, "rewards/accuracies": 0.953125, "rewards/chosen": 59.644927978515625, "rewards/margins": 46.53900146484375, "rewards/rejected": 13.095001220703125, "step": 5262 }, { "epoch": 2.724120082815735, "grad_norm": 1.117255687713623, "learning_rate": 2.2944148778095365e-07, "loss": 0.07895854115486145, "rewards/accuracies": 0.953125, "rewards/chosen": 63.49969482421875, "rewards/margins": 49.959259033203125, "rewards/rejected": 13.52313232421875, "step": 5263 }, { "epoch": 2.7246376811594204, "grad_norm": 1.4227739572525024, "learning_rate": 2.285879664783669e-07, "loss": 0.0917096883058548, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.992820739746094, "rewards/margins": 47.692535400390625, "rewards/rejected": 10.305984497070312, "step": 5264 }, { "epoch": 2.7251552795031055, "grad_norm": 0.9336985349655151, "learning_rate": 2.2773599854337636e-07, "loss": 0.06768440455198288, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.55590057373047, "rewards/margins": 46.291717529296875, "rewards/rejected": 8.277936935424805, "step": 5265 }, { "epoch": 2.725672877846791, "grad_norm": 1.061200737953186, "learning_rate": 2.2688558425334463e-07, "loss": 0.06437767297029495, "rewards/accuracies": 0.953125, "rewards/chosen": 59.493858337402344, "rewards/margins": 48.9454345703125, "rewards/rejected": 10.543540954589844, "step": 5266 }, { "epoch": 2.7261904761904763, "grad_norm": 0.7612829208374023, "learning_rate": 2.2603672388513133e-07, "loss": 0.06583113223314285, "rewards/accuracies": 0.9609375, "rewards/chosen": 67.84172058105469, "rewards/margins": 52.5167236328125, "rewards/rejected": 15.322296142578125, "step": 5267 }, { "epoch": 2.7267080745341614, "grad_norm": 0.5616950392723083, "learning_rate": 2.2518941771508707e-07, "loss": 0.06161854416131973, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.68281173706055, "rewards/margins": 43.319488525390625, "rewards/rejected": 11.376686096191406, "step": 5268 }, { "epoch": 2.7272256728778466, "grad_norm": 1.9806045293807983, "learning_rate": 2.2434366601906067e-07, "loss": 0.07802536338567734, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.1483154296875, "rewards/margins": 46.75093078613281, "rewards/rejected": 11.395814895629883, "step": 5269 }, { "epoch": 2.727743271221532, "grad_norm": 0.8494965434074402, "learning_rate": 2.234994690723924e-07, "loss": 0.07195961475372314, "rewards/accuracies": 0.96875, "rewards/chosen": 61.33528137207031, "rewards/margins": 51.300689697265625, "rewards/rejected": 10.037581443786621, "step": 5270 }, { "epoch": 2.7282608695652173, "grad_norm": 0.6950985193252563, "learning_rate": 2.226568271499152e-07, "loss": 0.07189913839101791, "rewards/accuracies": 0.953125, "rewards/chosen": 58.600303649902344, "rewards/margins": 47.80238342285156, "rewards/rejected": 10.800468444824219, "step": 5271 }, { "epoch": 2.728778467908903, "grad_norm": 1.3210557699203491, "learning_rate": 2.2181574052595745e-07, "loss": 0.1232534795999527, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.25080108642578, "rewards/margins": 47.969390869140625, "rewards/rejected": 12.268157958984375, "step": 5272 }, { "epoch": 2.729296066252588, "grad_norm": 0.9437233209609985, "learning_rate": 2.2097620947434284e-07, "loss": 0.09131583571434021, "rewards/accuracies": 0.953125, "rewards/chosen": 52.98579406738281, "rewards/margins": 41.313507080078125, "rewards/rejected": 11.681812286376953, "step": 5273 }, { "epoch": 2.729813664596273, "grad_norm": 0.5435563325881958, "learning_rate": 2.2013823426838444e-07, "loss": 0.07338553667068481, "rewards/accuracies": 0.953125, "rewards/chosen": 55.913169860839844, "rewards/margins": 45.342926025390625, "rewards/rejected": 10.561233520507812, "step": 5274 }, { "epoch": 2.7303312629399588, "grad_norm": 0.8984293341636658, "learning_rate": 2.193018151808929e-07, "loss": 0.056813597679138184, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.961265563964844, "rewards/margins": 48.09614562988281, "rewards/rejected": 12.863628387451172, "step": 5275 }, { "epoch": 2.730848861283644, "grad_norm": 2.24630069732666, "learning_rate": 2.1846695248416928e-07, "loss": 0.11489853262901306, "rewards/accuracies": 0.953125, "rewards/chosen": 58.79440689086914, "rewards/margins": 46.4276123046875, "rewards/rejected": 12.37709903717041, "step": 5276 }, { "epoch": 2.731366459627329, "grad_norm": 1.0695040225982666, "learning_rate": 2.1763364645001006e-07, "loss": 0.1179574653506279, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.818416595458984, "rewards/margins": 43.228065490722656, "rewards/rejected": 10.58852767944336, "step": 5277 }, { "epoch": 2.7318840579710146, "grad_norm": 0.562839150428772, "learning_rate": 2.1680189734970325e-07, "loss": 0.05149379372596741, "rewards/accuracies": 0.96875, "rewards/chosen": 57.85247802734375, "rewards/margins": 48.673492431640625, "rewards/rejected": 9.180636405944824, "step": 5278 }, { "epoch": 2.7324016563147, "grad_norm": 0.6428520083427429, "learning_rate": 2.1597170545403056e-07, "loss": 0.06198421120643616, "rewards/accuracies": 0.96875, "rewards/chosen": 55.52095031738281, "rewards/margins": 43.1851806640625, "rewards/rejected": 12.331626892089844, "step": 5279 }, { "epoch": 2.732919254658385, "grad_norm": 1.4509707689285278, "learning_rate": 2.1514307103326915e-07, "loss": 0.13730382919311523, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.060340881347656, "rewards/margins": 41.395538330078125, "rewards/rejected": 9.6592435836792, "step": 5280 }, { "epoch": 2.7334368530020705, "grad_norm": 0.48805856704711914, "learning_rate": 2.1431599435718432e-07, "loss": 0.04840751737356186, "rewards/accuracies": 0.984375, "rewards/chosen": 52.473793029785156, "rewards/margins": 44.32405090332031, "rewards/rejected": 8.144225120544434, "step": 5281 }, { "epoch": 2.7339544513457557, "grad_norm": 0.5501322746276855, "learning_rate": 2.1349047569503788e-07, "loss": 0.039144717156887054, "rewards/accuracies": 1.0, "rewards/chosen": 57.189048767089844, "rewards/margins": 46.37420654296875, "rewards/rejected": 10.802619934082031, "step": 5282 }, { "epoch": 2.7344720496894412, "grad_norm": 0.5643348693847656, "learning_rate": 2.1266651531558324e-07, "loss": 0.042112041264772415, "rewards/accuracies": 0.984375, "rewards/chosen": 62.155181884765625, "rewards/margins": 48.293182373046875, "rewards/rejected": 13.869426727294922, "step": 5283 }, { "epoch": 2.7349896480331264, "grad_norm": 1.1907579898834229, "learning_rate": 2.1184411348706635e-07, "loss": 0.11714427173137665, "rewards/accuracies": 0.9375, "rewards/chosen": 56.31541442871094, "rewards/margins": 45.13958740234375, "rewards/rejected": 11.183549880981445, "step": 5284 }, { "epoch": 2.7355072463768115, "grad_norm": 1.1260579824447632, "learning_rate": 2.1102327047722694e-07, "loss": 0.08126362413167953, "rewards/accuracies": 0.953125, "rewards/chosen": 53.82195281982422, "rewards/margins": 42.09458923339844, "rewards/rejected": 11.7384033203125, "step": 5285 }, { "epoch": 2.7360248447204967, "grad_norm": 0.5710425972938538, "learning_rate": 2.1020398655329566e-07, "loss": 0.0637049451470375, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.16547775268555, "rewards/margins": 47.02430725097656, "rewards/rejected": 14.13787841796875, "step": 5286 }, { "epoch": 2.7365424430641823, "grad_norm": 0.7666301131248474, "learning_rate": 2.0938626198199697e-07, "loss": 0.05578029155731201, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.09039306640625, "rewards/margins": 47.62841796875, "rewards/rejected": 11.464252471923828, "step": 5287 }, { "epoch": 2.7370600414078674, "grad_norm": 0.9488485455513, "learning_rate": 2.0857009702954623e-07, "loss": 0.08500228822231293, "rewards/accuracies": 0.953125, "rewards/chosen": 59.085819244384766, "rewards/margins": 44.371490478515625, "rewards/rejected": 14.72812271118164, "step": 5288 }, { "epoch": 2.737577639751553, "grad_norm": 0.8222790956497192, "learning_rate": 2.0775549196165202e-07, "loss": 0.07789392024278641, "rewards/accuracies": 0.96875, "rewards/chosen": 52.034515380859375, "rewards/margins": 40.312957763671875, "rewards/rejected": 11.720213890075684, "step": 5289 }, { "epoch": 2.738095238095238, "grad_norm": 0.674412727355957, "learning_rate": 2.0694244704351495e-07, "loss": 0.05597453936934471, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.936920166015625, "rewards/margins": 43.07049560546875, "rewards/rejected": 9.8795166015625, "step": 5290 }, { "epoch": 2.7386128364389233, "grad_norm": 0.5889081358909607, "learning_rate": 2.0613096253982778e-07, "loss": 0.059052519500255585, "rewards/accuracies": 0.984375, "rewards/chosen": 57.5401611328125, "rewards/margins": 44.70062255859375, "rewards/rejected": 12.840240478515625, "step": 5291 }, { "epoch": 2.7391304347826084, "grad_norm": 1.2758108377456665, "learning_rate": 2.0532103871477526e-07, "loss": 0.11278438568115234, "rewards/accuracies": 0.953125, "rewards/chosen": 46.5500602722168, "rewards/margins": 38.00193786621094, "rewards/rejected": 8.548139572143555, "step": 5292 }, { "epoch": 2.739648033126294, "grad_norm": 0.6432249546051025, "learning_rate": 2.0451267583203426e-07, "loss": 0.0595477893948555, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.30443572998047, "rewards/margins": 46.4327392578125, "rewards/rejected": 9.880621910095215, "step": 5293 }, { "epoch": 2.740165631469979, "grad_norm": 0.9987066388130188, "learning_rate": 2.037058741547715e-07, "loss": 0.11764995753765106, "rewards/accuracies": 0.953125, "rewards/chosen": 48.99940490722656, "rewards/margins": 37.43373107910156, "rewards/rejected": 11.561445236206055, "step": 5294 }, { "epoch": 2.7406832298136647, "grad_norm": 0.9790231585502625, "learning_rate": 2.029006339456485e-07, "loss": 0.09693794697523117, "rewards/accuracies": 0.953125, "rewards/chosen": 57.82126235961914, "rewards/margins": 43.92753601074219, "rewards/rejected": 13.88926887512207, "step": 5295 }, { "epoch": 2.74120082815735, "grad_norm": 1.0627821683883667, "learning_rate": 2.0209695546681728e-07, "loss": 0.07073570042848587, "rewards/accuracies": 0.953125, "rewards/chosen": 57.27287292480469, "rewards/margins": 46.42559814453125, "rewards/rejected": 10.855880737304688, "step": 5296 }, { "epoch": 2.741718426501035, "grad_norm": 1.4262456893920898, "learning_rate": 2.0129483897992019e-07, "loss": 0.08354168385267258, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.95927429199219, "rewards/margins": 45.0810546875, "rewards/rejected": 8.883659362792969, "step": 5297 }, { "epoch": 2.7422360248447206, "grad_norm": 0.8547229766845703, "learning_rate": 2.0049428474609224e-07, "loss": 0.09336468577384949, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.877838134765625, "rewards/margins": 42.6004638671875, "rewards/rejected": 10.279165267944336, "step": 5298 }, { "epoch": 2.7427536231884058, "grad_norm": 4.271890640258789, "learning_rate": 1.9969529302596047e-07, "loss": 0.07666543126106262, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.301513671875, "rewards/margins": 46.647491455078125, "rewards/rejected": 10.645633697509766, "step": 5299 }, { "epoch": 2.7432712215320914, "grad_norm": 0.9154717922210693, "learning_rate": 1.9889786407964184e-07, "loss": 0.06996451318264008, "rewards/accuracies": 0.96875, "rewards/chosen": 54.343589782714844, "rewards/margins": 42.721885681152344, "rewards/rejected": 11.623966217041016, "step": 5300 }, { "epoch": 2.7437888198757765, "grad_norm": 5.069762706756592, "learning_rate": 1.9810199816674423e-07, "loss": 0.11845283210277557, "rewards/accuracies": 0.9375, "rewards/chosen": 67.87559509277344, "rewards/margins": 50.5643310546875, "rewards/rejected": 17.31085968017578, "step": 5301 }, { "epoch": 2.7443064182194616, "grad_norm": 0.7765649557113647, "learning_rate": 1.973076955463682e-07, "loss": 0.03235338255763054, "rewards/accuracies": 0.984375, "rewards/chosen": 63.230472564697266, "rewards/margins": 50.6507568359375, "rewards/rejected": 12.56806468963623, "step": 5302 }, { "epoch": 2.744824016563147, "grad_norm": 0.9408098459243774, "learning_rate": 1.9651495647710627e-07, "loss": 0.0687948688864708, "rewards/accuracies": 0.96875, "rewards/chosen": 64.28509521484375, "rewards/margins": 51.20855712890625, "rewards/rejected": 13.076515197753906, "step": 5303 }, { "epoch": 2.7453416149068324, "grad_norm": 0.8335196375846863, "learning_rate": 1.9572378121703762e-07, "loss": 0.051797054708004, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.076416015625, "rewards/margins": 51.108642578125, "rewards/rejected": 12.971473693847656, "step": 5304 }, { "epoch": 2.7458592132505175, "grad_norm": 0.7798559069633484, "learning_rate": 1.9493417002373672e-07, "loss": 0.08707325905561447, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.66289520263672, "rewards/margins": 44.57115173339844, "rewards/rejected": 11.089656829833984, "step": 5305 }, { "epoch": 2.746376811594203, "grad_norm": 0.5225992202758789, "learning_rate": 1.9414612315426685e-07, "loss": 0.042860500514507294, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.01094436645508, "rewards/margins": 45.40521240234375, "rewards/rejected": 12.60158920288086, "step": 5306 }, { "epoch": 2.7468944099378882, "grad_norm": 1.0249546766281128, "learning_rate": 1.9335964086518277e-07, "loss": 0.08216843008995056, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.43780517578125, "rewards/margins": 48.68290710449219, "rewards/rejected": 12.771484375, "step": 5307 }, { "epoch": 2.7474120082815734, "grad_norm": 0.9901177883148193, "learning_rate": 1.925747234125286e-07, "loss": 0.10632745921611786, "rewards/accuracies": 0.9296875, "rewards/chosen": 52.57264709472656, "rewards/margins": 41.573638916015625, "rewards/rejected": 10.999103546142578, "step": 5308 }, { "epoch": 2.7479296066252585, "grad_norm": 0.7119259238243103, "learning_rate": 1.9179137105183987e-07, "loss": 0.08236502856016159, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.46323013305664, "rewards/margins": 44.859375, "rewards/rejected": 12.60302734375, "step": 5309 }, { "epoch": 2.748447204968944, "grad_norm": 0.7551580667495728, "learning_rate": 1.9100958403814373e-07, "loss": 0.061550311744213104, "rewards/accuracies": 0.96875, "rewards/chosen": 62.762428283691406, "rewards/margins": 48.5321044921875, "rewards/rejected": 14.232097625732422, "step": 5310 }, { "epoch": 2.7489648033126293, "grad_norm": 6.729944229125977, "learning_rate": 1.902293626259555e-07, "loss": 0.07685867697000504, "rewards/accuracies": 0.96875, "rewards/chosen": 60.148563385009766, "rewards/margins": 46.29652404785156, "rewards/rejected": 13.851554870605469, "step": 5311 }, { "epoch": 2.749482401656315, "grad_norm": 0.5364425182342529, "learning_rate": 1.8945070706928247e-07, "loss": 0.057252779603004456, "rewards/accuracies": 0.984375, "rewards/chosen": 61.176963806152344, "rewards/margins": 48.9737548828125, "rewards/rejected": 12.204771041870117, "step": 5312 }, { "epoch": 2.75, "grad_norm": 0.921596348285675, "learning_rate": 1.8867361762162083e-07, "loss": 0.06352414935827255, "rewards/accuracies": 0.96875, "rewards/chosen": 60.256927490234375, "rewards/margins": 48.39415740966797, "rewards/rejected": 11.88058853149414, "step": 5313 }, { "epoch": 2.750517598343685, "grad_norm": 3.8865268230438232, "learning_rate": 1.8789809453595875e-07, "loss": 0.17856739461421967, "rewards/accuracies": 0.953125, "rewards/chosen": 53.66969680786133, "rewards/margins": 42.18389892578125, "rewards/rejected": 11.499337196350098, "step": 5314 }, { "epoch": 2.7510351966873707, "grad_norm": 1.1846705675125122, "learning_rate": 1.8712413806477258e-07, "loss": 0.11096504330635071, "rewards/accuracies": 0.953125, "rewards/chosen": 55.01065444946289, "rewards/margins": 42.849822998046875, "rewards/rejected": 12.146739959716797, "step": 5315 }, { "epoch": 2.751552795031056, "grad_norm": 0.6802650094032288, "learning_rate": 1.8635174846002967e-07, "loss": 0.07019301503896713, "rewards/accuracies": 0.953125, "rewards/chosen": 60.935943603515625, "rewards/margins": 47.07640838623047, "rewards/rejected": 13.850435256958008, "step": 5316 }, { "epoch": 2.7520703933747415, "grad_norm": 1.1360373497009277, "learning_rate": 1.8558092597318666e-07, "loss": 0.0715695321559906, "rewards/accuracies": 0.96875, "rewards/chosen": 58.47266387939453, "rewards/margins": 47.2789306640625, "rewards/rejected": 11.203605651855469, "step": 5317 }, { "epoch": 2.7525879917184266, "grad_norm": 0.9328603744506836, "learning_rate": 1.8481167085519114e-07, "loss": 0.0821056216955185, "rewards/accuracies": 0.96875, "rewards/chosen": 58.826171875, "rewards/margins": 45.6639404296875, "rewards/rejected": 13.170326232910156, "step": 5318 }, { "epoch": 2.7531055900621118, "grad_norm": 1.1220182180404663, "learning_rate": 1.8404398335648e-07, "loss": 0.03874818980693817, "rewards/accuracies": 0.9921875, "rewards/chosen": 61.95069122314453, "rewards/margins": 50.51702880859375, "rewards/rejected": 11.436683654785156, "step": 5319 }, { "epoch": 2.753623188405797, "grad_norm": 0.851988673210144, "learning_rate": 1.8327786372697666e-07, "loss": 0.05067009478807449, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.0927848815918, "rewards/margins": 46.1942138671875, "rewards/rejected": 13.880508422851562, "step": 5320 }, { "epoch": 2.7541407867494825, "grad_norm": 0.6546249985694885, "learning_rate": 1.8251331221610048e-07, "loss": 0.0606992170214653, "rewards/accuracies": 0.96875, "rewards/chosen": 65.07963562011719, "rewards/margins": 50.8045654296875, "rewards/rejected": 14.286651611328125, "step": 5321 }, { "epoch": 2.7546583850931676, "grad_norm": 0.9550510048866272, "learning_rate": 1.8175032907275513e-07, "loss": 0.06627652794122696, "rewards/accuracies": 0.96875, "rewards/chosen": 59.02107238769531, "rewards/margins": 44.114288330078125, "rewards/rejected": 14.917810440063477, "step": 5322 }, { "epoch": 2.755175983436853, "grad_norm": 0.5933629870414734, "learning_rate": 1.8098891454533473e-07, "loss": 0.05753070116043091, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.307647705078125, "rewards/margins": 48.017913818359375, "rewards/rejected": 10.296478271484375, "step": 5323 }, { "epoch": 2.7556935817805384, "grad_norm": 0.6835242509841919, "learning_rate": 1.802290688817232e-07, "loss": 0.03360356390476227, "rewards/accuracies": 0.9921875, "rewards/chosen": 63.4952392578125, "rewards/margins": 49.82403564453125, "rewards/rejected": 13.68641471862793, "step": 5324 }, { "epoch": 2.7562111801242235, "grad_norm": 0.8309862017631531, "learning_rate": 1.7947079232929488e-07, "loss": 0.06826764345169067, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.17876434326172, "rewards/margins": 43.038726806640625, "rewards/rejected": 12.146905899047852, "step": 5325 }, { "epoch": 2.7567287784679086, "grad_norm": 1.8664934635162354, "learning_rate": 1.7871408513491285e-07, "loss": 0.12349015474319458, "rewards/accuracies": 0.9375, "rewards/chosen": 55.99346923828125, "rewards/margins": 45.183868408203125, "rewards/rejected": 10.821747779846191, "step": 5326 }, { "epoch": 2.7572463768115942, "grad_norm": 1.2326420545578003, "learning_rate": 1.7795894754492616e-07, "loss": 0.09233686327934265, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.806846618652344, "rewards/margins": 43.917724609375, "rewards/rejected": 10.878440856933594, "step": 5327 }, { "epoch": 2.7577639751552794, "grad_norm": 1.3610271215438843, "learning_rate": 1.772053798051765e-07, "loss": 0.11344226449728012, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.87812805175781, "rewards/margins": 42.80541229248047, "rewards/rejected": 13.074541091918945, "step": 5328 }, { "epoch": 2.758281573498965, "grad_norm": 1.0220789909362793, "learning_rate": 1.764533821609943e-07, "loss": 0.08327552676200867, "rewards/accuracies": 0.953125, "rewards/chosen": 58.53606033325195, "rewards/margins": 45.45123291015625, "rewards/rejected": 13.100603103637695, "step": 5329 }, { "epoch": 2.75879917184265, "grad_norm": 0.9010889530181885, "learning_rate": 1.7570295485719646e-07, "loss": 0.051059022545814514, "rewards/accuracies": 0.984375, "rewards/chosen": 55.79216766357422, "rewards/margins": 41.775238037109375, "rewards/rejected": 14.00725269317627, "step": 5330 }, { "epoch": 2.7593167701863353, "grad_norm": 1.206247091293335, "learning_rate": 1.7495409813808983e-07, "loss": 0.07506069540977478, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.518035888671875, "rewards/margins": 46.615814208984375, "rewards/rejected": 9.90186595916748, "step": 5331 }, { "epoch": 2.759834368530021, "grad_norm": 3.557158946990967, "learning_rate": 1.7420681224747105e-07, "loss": 0.14068293571472168, "rewards/accuracies": 0.9375, "rewards/chosen": 59.63700485229492, "rewards/margins": 46.300537109375, "rewards/rejected": 13.321023941040039, "step": 5332 }, { "epoch": 2.760351966873706, "grad_norm": 2.766902208328247, "learning_rate": 1.7346109742862438e-07, "loss": 0.07369647920131683, "rewards/accuracies": 0.984375, "rewards/chosen": 54.415218353271484, "rewards/margins": 46.026611328125, "rewards/rejected": 8.379345893859863, "step": 5333 }, { "epoch": 2.7608695652173916, "grad_norm": 1.1177160739898682, "learning_rate": 1.7271695392432175e-07, "loss": 0.12393840402364731, "rewards/accuracies": 0.921875, "rewards/chosen": 52.75995635986328, "rewards/margins": 42.22282409667969, "rewards/rejected": 10.540868759155273, "step": 5334 }, { "epoch": 2.7613871635610767, "grad_norm": 0.866166353225708, "learning_rate": 1.7197438197682548e-07, "loss": 0.04835180565714836, "rewards/accuracies": 0.984375, "rewards/chosen": 58.92204666137695, "rewards/margins": 48.800537109375, "rewards/rejected": 10.127025604248047, "step": 5335 }, { "epoch": 2.761904761904762, "grad_norm": 0.9160217642784119, "learning_rate": 1.7123338182788496e-07, "loss": 0.04731176793575287, "rewards/accuracies": 0.96875, "rewards/chosen": 62.987754821777344, "rewards/margins": 47.77013397216797, "rewards/rejected": 15.205699920654297, "step": 5336 }, { "epoch": 2.762422360248447, "grad_norm": 1.0177109241485596, "learning_rate": 1.7049395371873721e-07, "loss": 0.07281360030174255, "rewards/accuracies": 0.96875, "rewards/chosen": 50.86793518066406, "rewards/margins": 40.403076171875, "rewards/rejected": 10.477630615234375, "step": 5337 }, { "epoch": 2.7629399585921326, "grad_norm": 1.0197275876998901, "learning_rate": 1.6975609789010972e-07, "loss": 0.10079733282327652, "rewards/accuracies": 0.9375, "rewards/chosen": 56.35548400878906, "rewards/margins": 44.874114990234375, "rewards/rejected": 11.50052261352539, "step": 5338 }, { "epoch": 2.7634575569358177, "grad_norm": 1.383914589881897, "learning_rate": 1.6901981458221638e-07, "loss": 0.07430434972047806, "rewards/accuracies": 0.96875, "rewards/chosen": 58.29800796508789, "rewards/margins": 43.590118408203125, "rewards/rejected": 14.7314453125, "step": 5339 }, { "epoch": 2.7639751552795033, "grad_norm": 0.8844115138053894, "learning_rate": 1.6828510403475885e-07, "loss": 0.07701824605464935, "rewards/accuracies": 0.953125, "rewards/chosen": 61.102867126464844, "rewards/margins": 47.64459228515625, "rewards/rejected": 13.456649780273438, "step": 5340 }, { "epoch": 2.7644927536231885, "grad_norm": 3.8258109092712402, "learning_rate": 1.6755196648692795e-07, "loss": 0.13293422758579254, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.01533508300781, "rewards/margins": 46.5574951171875, "rewards/rejected": 12.456016540527344, "step": 5341 }, { "epoch": 2.7650103519668736, "grad_norm": 1.6456338167190552, "learning_rate": 1.6682040217740225e-07, "loss": 0.12584947049617767, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.503021240234375, "rewards/margins": 42.783538818359375, "rewards/rejected": 12.704319953918457, "step": 5342 }, { "epoch": 2.7655279503105588, "grad_norm": 2.0707080364227295, "learning_rate": 1.6609041134434677e-07, "loss": 0.08395694196224213, "rewards/accuracies": 0.96875, "rewards/chosen": 63.395530700683594, "rewards/margins": 48.644989013671875, "rewards/rejected": 14.74134635925293, "step": 5343 }, { "epoch": 2.7660455486542443, "grad_norm": 0.6783579587936401, "learning_rate": 1.6536199422541643e-07, "loss": 0.07504742592573166, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.68980407714844, "rewards/margins": 50.81195068359375, "rewards/rejected": 10.87426471710205, "step": 5344 }, { "epoch": 2.7665631469979295, "grad_norm": 0.4127480089664459, "learning_rate": 1.6463515105775262e-07, "loss": 0.03601044788956642, "rewards/accuracies": 0.984375, "rewards/chosen": 64.75604248046875, "rewards/margins": 48.614501953125, "rewards/rejected": 16.138286590576172, "step": 5345 }, { "epoch": 2.767080745341615, "grad_norm": 5.946441650390625, "learning_rate": 1.6390988207798276e-07, "loss": 0.12379325181245804, "rewards/accuracies": 0.984375, "rewards/chosen": 51.89122772216797, "rewards/margins": 39.6351318359375, "rewards/rejected": 12.264091491699219, "step": 5346 }, { "epoch": 2.7675983436853, "grad_norm": 2.8210370540618896, "learning_rate": 1.6318618752222515e-07, "loss": 0.1165623664855957, "rewards/accuracies": 0.953125, "rewards/chosen": 58.302947998046875, "rewards/margins": 45.171539306640625, "rewards/rejected": 13.122795104980469, "step": 5347 }, { "epoch": 2.7681159420289854, "grad_norm": 0.7882256507873535, "learning_rate": 1.6246406762608358e-07, "loss": 0.07857036590576172, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.390079498291016, "rewards/margins": 46.294158935546875, "rewards/rejected": 10.087249755859375, "step": 5348 }, { "epoch": 2.768633540372671, "grad_norm": 3.91690731048584, "learning_rate": 1.6174352262464888e-07, "loss": 0.08601924777030945, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.04603576660156, "rewards/margins": 42.752105712890625, "rewards/rejected": 13.298561096191406, "step": 5349 }, { "epoch": 2.769151138716356, "grad_norm": 0.6808715462684631, "learning_rate": 1.6102455275249895e-07, "loss": 0.051004812121391296, "rewards/accuracies": 0.984375, "rewards/chosen": 63.03556823730469, "rewards/margins": 46.8201904296875, "rewards/rejected": 16.2055606842041, "step": 5350 }, { "epoch": 2.7696687370600412, "grad_norm": 0.9743353128433228, "learning_rate": 1.60307158243701e-07, "loss": 0.08747420459985733, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.701290130615234, "rewards/margins": 44.493133544921875, "rewards/rejected": 14.224552154541016, "step": 5351 }, { "epoch": 2.770186335403727, "grad_norm": 0.8101205229759216, "learning_rate": 1.5959133933180827e-07, "loss": 0.08687105774879456, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.28684997558594, "rewards/margins": 44.55487060546875, "rewards/rejected": 11.726638793945312, "step": 5352 }, { "epoch": 2.770703933747412, "grad_norm": 0.8569443225860596, "learning_rate": 1.5887709624985927e-07, "loss": 0.0723365843296051, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.146751403808594, "rewards/margins": 40.20489501953125, "rewards/rejected": 12.942092895507812, "step": 5353 }, { "epoch": 2.771221532091097, "grad_norm": 0.6362425684928894, "learning_rate": 1.5816442923038077e-07, "loss": 0.06564582884311676, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.9090576171875, "rewards/margins": 47.678070068359375, "rewards/rejected": 13.224771499633789, "step": 5354 }, { "epoch": 2.7717391304347827, "grad_norm": 1.6663142442703247, "learning_rate": 1.5745333850538891e-07, "loss": 0.10073265433311462, "rewards/accuracies": 0.9375, "rewards/chosen": 58.2158317565918, "rewards/margins": 43.67938232421875, "rewards/rejected": 14.529678344726562, "step": 5355 }, { "epoch": 2.772256728778468, "grad_norm": 0.9413396120071411, "learning_rate": 1.5674382430638237e-07, "loss": 0.09226510673761368, "rewards/accuracies": 0.953125, "rewards/chosen": 59.24540710449219, "rewards/margins": 46.029266357421875, "rewards/rejected": 13.208719253540039, "step": 5356 }, { "epoch": 2.7727743271221534, "grad_norm": 0.6902580857276917, "learning_rate": 1.5603588686434968e-07, "loss": 0.05870860069990158, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.92074966430664, "rewards/margins": 49.1138916015625, "rewards/rejected": 13.809463500976562, "step": 5357 }, { "epoch": 2.7732919254658386, "grad_norm": 1.3819350004196167, "learning_rate": 1.553295264097643e-07, "loss": 0.0635903924703598, "rewards/accuracies": 0.9765625, "rewards/chosen": 65.64743041992188, "rewards/margins": 50.19305419921875, "rewards/rejected": 15.430519104003906, "step": 5358 }, { "epoch": 2.7738095238095237, "grad_norm": 0.5616193413734436, "learning_rate": 1.5462474317258724e-07, "loss": 0.058537475764751434, "rewards/accuracies": 0.984375, "rewards/chosen": 58.17567443847656, "rewards/margins": 46.145751953125, "rewards/rejected": 12.036361694335938, "step": 5359 }, { "epoch": 2.774327122153209, "grad_norm": 1.2274130582809448, "learning_rate": 1.5392153738226555e-07, "loss": 0.12086272984743118, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.92851257324219, "rewards/margins": 42.351318359375, "rewards/rejected": 11.58544921875, "step": 5360 }, { "epoch": 2.7748447204968945, "grad_norm": 2.268543004989624, "learning_rate": 1.5321990926773333e-07, "loss": 0.0839824303984642, "rewards/accuracies": 0.9609375, "rewards/chosen": 52.52318572998047, "rewards/margins": 42.410675048828125, "rewards/rejected": 10.108100891113281, "step": 5361 }, { "epoch": 2.7753623188405796, "grad_norm": 0.8920280933380127, "learning_rate": 1.5251985905741063e-07, "loss": 0.09529884159564972, "rewards/accuracies": 0.96875, "rewards/chosen": 60.70765686035156, "rewards/margins": 49.518218994140625, "rewards/rejected": 11.20579719543457, "step": 5362 }, { "epoch": 2.775879917184265, "grad_norm": 4.005437850952148, "learning_rate": 1.5182138697920346e-07, "loss": 0.08155538886785507, "rewards/accuracies": 0.984375, "rewards/chosen": 49.66465759277344, "rewards/margins": 39.66883850097656, "rewards/rejected": 9.998420715332031, "step": 5363 }, { "epoch": 2.7763975155279503, "grad_norm": 0.6248762607574463, "learning_rate": 1.5112449326050494e-07, "loss": 0.061199598014354706, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.8183479309082, "rewards/margins": 44.68125915527344, "rewards/rejected": 11.1212158203125, "step": 5364 }, { "epoch": 2.7769151138716355, "grad_norm": 0.9549705386161804, "learning_rate": 1.5042917812819358e-07, "loss": 0.10130365192890167, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.81243133544922, "rewards/margins": 43.190582275390625, "rewards/rejected": 8.610214233398438, "step": 5365 }, { "epoch": 2.777432712215321, "grad_norm": 1.6159579753875732, "learning_rate": 1.4973544180863442e-07, "loss": 0.13295239210128784, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.29652404785156, "rewards/margins": 38.76702880859375, "rewards/rejected": 11.530548095703125, "step": 5366 }, { "epoch": 2.777950310559006, "grad_norm": 1.0816468000411987, "learning_rate": 1.490432845276779e-07, "loss": 0.11789175868034363, "rewards/accuracies": 0.9140625, "rewards/chosen": 56.724815368652344, "rewards/margins": 44.883758544921875, "rewards/rejected": 11.842788696289062, "step": 5367 }, { "epoch": 2.7784679089026914, "grad_norm": 0.6716031432151794, "learning_rate": 1.4835270651066102e-07, "loss": 0.07005485892295837, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.62654113769531, "rewards/margins": 43.53080749511719, "rewards/rejected": 10.089813232421875, "step": 5368 }, { "epoch": 2.778985507246377, "grad_norm": 0.7589211463928223, "learning_rate": 1.476637079824067e-07, "loss": 0.05958513915538788, "rewards/accuracies": 0.96875, "rewards/chosen": 58.82904052734375, "rewards/margins": 44.87530517578125, "rewards/rejected": 13.969642639160156, "step": 5369 }, { "epoch": 2.779503105590062, "grad_norm": 1.2262680530548096, "learning_rate": 1.4697628916722328e-07, "loss": 0.08621932566165924, "rewards/accuracies": 0.953125, "rewards/chosen": 53.29937744140625, "rewards/margins": 40.03392028808594, "rewards/rejected": 13.271011352539062, "step": 5370 }, { "epoch": 2.7800207039337472, "grad_norm": 1.3646756410598755, "learning_rate": 1.4629045028890508e-07, "loss": 0.13365264236927032, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.686126708984375, "rewards/margins": 41.54954528808594, "rewards/rejected": 13.133411407470703, "step": 5371 }, { "epoch": 2.780538302277433, "grad_norm": 1.4646281003952026, "learning_rate": 1.4560619157073074e-07, "loss": 0.07294704020023346, "rewards/accuracies": 0.96875, "rewards/chosen": 63.192649841308594, "rewards/margins": 48.02838134765625, "rewards/rejected": 15.176155090332031, "step": 5372 }, { "epoch": 2.781055900621118, "grad_norm": 0.7112342715263367, "learning_rate": 1.4492351323546705e-07, "loss": 0.06368830800056458, "rewards/accuracies": 0.96875, "rewards/chosen": 54.63604736328125, "rewards/margins": 43.44488525390625, "rewards/rejected": 11.18691635131836, "step": 5373 }, { "epoch": 2.7815734989648035, "grad_norm": 1.0544296503067017, "learning_rate": 1.4424241550536455e-07, "loss": 0.12727582454681396, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.69226837158203, "rewards/margins": 39.652313232421875, "rewards/rejected": 11.038154602050781, "step": 5374 }, { "epoch": 2.7820910973084887, "grad_norm": 1.1692745685577393, "learning_rate": 1.4356289860215922e-07, "loss": 0.09628985822200775, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.67839050292969, "rewards/margins": 48.89967346191406, "rewards/rejected": 11.790157318115234, "step": 5375 }, { "epoch": 2.782608695652174, "grad_norm": 1.946258306503296, "learning_rate": 1.428849627470724e-07, "loss": 0.11353321373462677, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.195533752441406, "rewards/margins": 45.46697998046875, "rewards/rejected": 14.736370086669922, "step": 5376 }, { "epoch": 2.783126293995859, "grad_norm": 1.1725702285766602, "learning_rate": 1.422086081608115e-07, "loss": 0.09115168452262878, "rewards/accuracies": 0.953125, "rewards/chosen": 57.03954315185547, "rewards/margins": 43.44996643066406, "rewards/rejected": 13.570844650268555, "step": 5377 }, { "epoch": 2.7836438923395446, "grad_norm": 7.756277084350586, "learning_rate": 1.4153383506356867e-07, "loss": 0.1659708470106125, "rewards/accuracies": 0.96875, "rewards/chosen": 59.40272521972656, "rewards/margins": 46.98846435546875, "rewards/rejected": 12.421524047851562, "step": 5378 }, { "epoch": 2.7841614906832297, "grad_norm": 0.809257984161377, "learning_rate": 1.4086064367502095e-07, "loss": 0.09297524392604828, "rewards/accuracies": 0.9375, "rewards/chosen": 60.043033599853516, "rewards/margins": 45.45513916015625, "rewards/rejected": 14.591102600097656, "step": 5379 }, { "epoch": 2.7846790890269153, "grad_norm": 0.6687860488891602, "learning_rate": 1.4018903421433028e-07, "loss": 0.07140876352787018, "rewards/accuracies": 0.96875, "rewards/chosen": 60.338905334472656, "rewards/margins": 47.26688003540039, "rewards/rejected": 13.07845687866211, "step": 5380 }, { "epoch": 2.7851966873706004, "grad_norm": 0.7570059895515442, "learning_rate": 1.3951900690014508e-07, "loss": 0.07585903257131577, "rewards/accuracies": 0.96875, "rewards/chosen": 54.03776931762695, "rewards/margins": 42.60002136230469, "rewards/rejected": 11.430412292480469, "step": 5381 }, { "epoch": 2.7857142857142856, "grad_norm": 0.5729893445968628, "learning_rate": 1.3885056195059588e-07, "loss": 0.036869846284389496, "rewards/accuracies": 0.9921875, "rewards/chosen": 56.904510498046875, "rewards/margins": 44.64447021484375, "rewards/rejected": 12.24697494506836, "step": 5382 }, { "epoch": 2.786231884057971, "grad_norm": 2.424210548400879, "learning_rate": 1.3818369958330136e-07, "loss": 0.13791564106941223, "rewards/accuracies": 0.921875, "rewards/chosen": 55.45405578613281, "rewards/margins": 43.53662109375, "rewards/rejected": 11.923259735107422, "step": 5383 }, { "epoch": 2.7867494824016563, "grad_norm": 0.7092230916023254, "learning_rate": 1.375184200153623e-07, "loss": 0.06830210983753204, "rewards/accuracies": 0.953125, "rewards/chosen": 62.68219757080078, "rewards/margins": 48.7611083984375, "rewards/rejected": 13.908832550048828, "step": 5384 }, { "epoch": 2.7872670807453415, "grad_norm": 1.0357264280319214, "learning_rate": 1.3685472346336603e-07, "loss": 0.09963709115982056, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.506107330322266, "rewards/margins": 44.382476806640625, "rewards/rejected": 14.130800247192383, "step": 5385 }, { "epoch": 2.787784679089027, "grad_norm": 1.1312615871429443, "learning_rate": 1.361926101433836e-07, "loss": 0.09477434307336807, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.50676345825195, "rewards/margins": 47.675933837890625, "rewards/rejected": 10.830375671386719, "step": 5386 }, { "epoch": 2.788302277432712, "grad_norm": 2.0376946926116943, "learning_rate": 1.355320802709703e-07, "loss": 0.07052643597126007, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.48365783691406, "rewards/margins": 44.57421875, "rewards/rejected": 11.903583526611328, "step": 5387 }, { "epoch": 2.7888198757763973, "grad_norm": 0.7256602644920349, "learning_rate": 1.3487313406116698e-07, "loss": 0.07634610682725906, "rewards/accuracies": 0.953125, "rewards/chosen": 58.330955505371094, "rewards/margins": 47.667022705078125, "rewards/rejected": 10.668621063232422, "step": 5388 }, { "epoch": 2.789337474120083, "grad_norm": 1.4044173955917358, "learning_rate": 1.3421577172849755e-07, "loss": 0.14021845161914825, "rewards/accuracies": 0.9375, "rewards/chosen": 53.87973403930664, "rewards/margins": 42.2298583984375, "rewards/rejected": 11.6385498046875, "step": 5389 }, { "epoch": 2.789855072463768, "grad_norm": 1.2419198751449585, "learning_rate": 1.335599934869719e-07, "loss": 0.10039566457271576, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.425079345703125, "rewards/margins": 46.037017822265625, "rewards/rejected": 11.392814636230469, "step": 5390 }, { "epoch": 2.7903726708074537, "grad_norm": 0.8362202048301697, "learning_rate": 1.3290579955008265e-07, "loss": 0.052926912903785706, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.860137939453125, "rewards/margins": 45.339820861816406, "rewards/rejected": 9.508548736572266, "step": 5391 }, { "epoch": 2.790890269151139, "grad_norm": 2.356492757797241, "learning_rate": 1.3225319013080774e-07, "loss": 0.14538413286209106, "rewards/accuracies": 0.921875, "rewards/chosen": 56.95396041870117, "rewards/margins": 43.03016662597656, "rewards/rejected": 13.928314208984375, "step": 5392 }, { "epoch": 2.791407867494824, "grad_norm": 0.7334793210029602, "learning_rate": 1.3160216544160942e-07, "loss": 0.06447404623031616, "rewards/accuracies": 0.9921875, "rewards/chosen": 57.64154052734375, "rewards/margins": 45.3524169921875, "rewards/rejected": 12.27805233001709, "step": 5393 }, { "epoch": 2.791925465838509, "grad_norm": 0.5056332349777222, "learning_rate": 1.3095272569443208e-07, "loss": 0.041052013635635376, "rewards/accuracies": 0.984375, "rewards/chosen": 57.5479850769043, "rewards/margins": 46.432029724121094, "rewards/rejected": 11.139490127563477, "step": 5394 }, { "epoch": 2.7924430641821947, "grad_norm": 0.8188046216964722, "learning_rate": 1.3030487110070712e-07, "loss": 0.08659875392913818, "rewards/accuracies": 0.953125, "rewards/chosen": 52.60679626464844, "rewards/margins": 39.135772705078125, "rewards/rejected": 13.463203430175781, "step": 5395 }, { "epoch": 2.79296066252588, "grad_norm": 1.2911089658737183, "learning_rate": 1.2965860187134694e-07, "loss": 0.07879963517189026, "rewards/accuracies": 0.96875, "rewards/chosen": 63.21245574951172, "rewards/margins": 50.347991943359375, "rewards/rejected": 12.867382049560547, "step": 5396 }, { "epoch": 2.7934782608695654, "grad_norm": 0.5525856018066406, "learning_rate": 1.2901391821674992e-07, "loss": 0.04790719598531723, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.91594696044922, "rewards/margins": 45.687408447265625, "rewards/rejected": 11.227155685424805, "step": 5397 }, { "epoch": 2.7939958592132506, "grad_norm": 2.650487184524536, "learning_rate": 1.2837082034679703e-07, "loss": 0.08847378194332123, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.807464599609375, "rewards/margins": 43.519744873046875, "rewards/rejected": 10.301517486572266, "step": 5398 }, { "epoch": 2.7945134575569357, "grad_norm": 0.3540559709072113, "learning_rate": 1.2772930847085418e-07, "loss": 0.032528460025787354, "rewards/accuracies": 0.984375, "rewards/chosen": 64.78797149658203, "rewards/margins": 50.190185546875, "rewards/rejected": 14.583969116210938, "step": 5399 }, { "epoch": 2.795031055900621, "grad_norm": 1.2346166372299194, "learning_rate": 1.2708938279776873e-07, "loss": 0.07188517600297928, "rewards/accuracies": 0.96875, "rewards/chosen": 57.32017517089844, "rewards/margins": 45.75189208984375, "rewards/rejected": 11.583005905151367, "step": 5400 }, { "epoch": 2.7955486542443064, "grad_norm": 0.6541485786437988, "learning_rate": 1.2645104353587513e-07, "loss": 0.05214937403798103, "rewards/accuracies": 0.96875, "rewards/chosen": 59.238182067871094, "rewards/margins": 47.28741455078125, "rewards/rejected": 11.95334243774414, "step": 5401 }, { "epoch": 2.7960662525879916, "grad_norm": 0.8790348768234253, "learning_rate": 1.2581429089298714e-07, "loss": 0.08912873268127441, "rewards/accuracies": 0.953125, "rewards/chosen": 53.89381790161133, "rewards/margins": 41.579803466796875, "rewards/rejected": 12.304498672485352, "step": 5402 }, { "epoch": 2.796583850931677, "grad_norm": 0.9688371419906616, "learning_rate": 1.2517912507640562e-07, "loss": 0.08301695436239243, "rewards/accuracies": 0.96875, "rewards/chosen": 53.18437957763672, "rewards/margins": 39.96527099609375, "rewards/rejected": 13.223993301391602, "step": 5403 }, { "epoch": 2.7971014492753623, "grad_norm": 0.7514908313751221, "learning_rate": 1.2454554629291405e-07, "loss": 0.04875156655907631, "rewards/accuracies": 0.9765625, "rewards/chosen": 66.2558822631836, "rewards/margins": 52.3662109375, "rewards/rejected": 13.884002685546875, "step": 5404 }, { "epoch": 2.7976190476190474, "grad_norm": 3.238844633102417, "learning_rate": 1.239135547487763e-07, "loss": 0.11381727457046509, "rewards/accuracies": 0.96875, "rewards/chosen": 54.26104736328125, "rewards/margins": 42.04339599609375, "rewards/rejected": 12.22149658203125, "step": 5405 }, { "epoch": 2.798136645962733, "grad_norm": 0.4408109486103058, "learning_rate": 1.2328315064974282e-07, "loss": 0.0382605604827404, "rewards/accuracies": 0.984375, "rewards/chosen": 50.46979522705078, "rewards/margins": 39.237823486328125, "rewards/rejected": 11.241964340209961, "step": 5406 }, { "epoch": 2.798654244306418, "grad_norm": 1.0620759725570679, "learning_rate": 1.2265433420104666e-07, "loss": 0.10490163415670395, "rewards/accuracies": 0.9296875, "rewards/chosen": 49.575225830078125, "rewards/margins": 39.787261962890625, "rewards/rejected": 9.790061950683594, "step": 5407 }, { "epoch": 2.7991718426501038, "grad_norm": 0.4929157495498657, "learning_rate": 1.2202710560740404e-07, "loss": 0.0535333976149559, "rewards/accuracies": 0.96875, "rewards/chosen": 59.13058853149414, "rewards/margins": 45.44827651977539, "rewards/rejected": 13.678232192993164, "step": 5408 }, { "epoch": 2.799689440993789, "grad_norm": 0.5106837749481201, "learning_rate": 1.214014650730122e-07, "loss": 0.056595657020807266, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.41657257080078, "rewards/margins": 46.142333984375, "rewards/rejected": 11.280701637268066, "step": 5409 }, { "epoch": 2.800207039337474, "grad_norm": 0.7664307951927185, "learning_rate": 1.2077741280155375e-07, "loss": 0.07488226890563965, "rewards/accuracies": 0.9453125, "rewards/chosen": 63.7105712890625, "rewards/margins": 49.290618896484375, "rewards/rejected": 14.436136245727539, "step": 5410 }, { "epoch": 2.800724637681159, "grad_norm": 0.6255778074264526, "learning_rate": 1.2015494899619394e-07, "loss": 0.06972984969615936, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.85261535644531, "rewards/margins": 44.71466064453125, "rewards/rejected": 11.128536224365234, "step": 5411 }, { "epoch": 2.801242236024845, "grad_norm": 2.7932651042938232, "learning_rate": 1.195340738595796e-07, "loss": 0.10465740412473679, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.23046875, "rewards/margins": 45.52360534667969, "rewards/rejected": 12.710933685302734, "step": 5412 }, { "epoch": 2.80175983436853, "grad_norm": 0.9833581447601318, "learning_rate": 1.1891478759384068e-07, "loss": 0.07748126983642578, "rewards/accuracies": 0.96875, "rewards/chosen": 56.165130615234375, "rewards/margins": 43.5283203125, "rewards/rejected": 12.64892578125, "step": 5413 }, { "epoch": 2.8022774327122155, "grad_norm": 0.4518468976020813, "learning_rate": 1.1829709040059145e-07, "loss": 0.04404392093420029, "rewards/accuracies": 0.984375, "rewards/chosen": 56.529788970947266, "rewards/margins": 43.7042236328125, "rewards/rejected": 12.827590942382812, "step": 5414 }, { "epoch": 2.8027950310559007, "grad_norm": 0.9896360635757446, "learning_rate": 1.1768098248092718e-07, "loss": 0.12385275959968567, "rewards/accuracies": 0.921875, "rewards/chosen": 62.820281982421875, "rewards/margins": 49.44677734375, "rewards/rejected": 13.380134582519531, "step": 5415 }, { "epoch": 2.803312629399586, "grad_norm": 3.082277536392212, "learning_rate": 1.1706646403542576e-07, "loss": 0.13272108137607574, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.80792999267578, "rewards/margins": 42.2171630859375, "rewards/rejected": 10.589502334594727, "step": 5416 }, { "epoch": 2.803830227743271, "grad_norm": 1.381960391998291, "learning_rate": 1.1645353526414883e-07, "loss": 0.14125630259513855, "rewards/accuracies": 0.9140625, "rewards/chosen": 51.45101547241211, "rewards/margins": 37.548858642578125, "rewards/rejected": 13.913415908813477, "step": 5417 }, { "epoch": 2.8043478260869565, "grad_norm": 1.370091199874878, "learning_rate": 1.1584219636663952e-07, "loss": 0.09632013738155365, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.864227294921875, "rewards/margins": 48.77618408203125, "rewards/rejected": 10.090927124023438, "step": 5418 }, { "epoch": 2.8048654244306417, "grad_norm": 0.6903144717216492, "learning_rate": 1.1523244754192365e-07, "loss": 0.08648607134819031, "rewards/accuracies": 0.9609375, "rewards/chosen": 46.920166015625, "rewards/margins": 37.51202392578125, "rewards/rejected": 9.391983032226562, "step": 5419 }, { "epoch": 2.8053830227743273, "grad_norm": 0.5697219371795654, "learning_rate": 1.1462428898850908e-07, "loss": 0.05034183710813522, "rewards/accuracies": 0.96875, "rewards/chosen": 63.70806884765625, "rewards/margins": 51.462860107421875, "rewards/rejected": 12.251943588256836, "step": 5420 }, { "epoch": 2.8059006211180124, "grad_norm": 0.6329275369644165, "learning_rate": 1.1401772090438634e-07, "loss": 0.040259949862957, "rewards/accuracies": 0.9765625, "rewards/chosen": 65.62602233886719, "rewards/margins": 50.570831298828125, "rewards/rejected": 15.053462982177734, "step": 5421 }, { "epoch": 2.8064182194616976, "grad_norm": 0.547961950302124, "learning_rate": 1.1341274348702857e-07, "loss": 0.04916121065616608, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.925811767578125, "rewards/margins": 43.08610534667969, "rewards/rejected": 11.845803260803223, "step": 5422 }, { "epoch": 2.806935817805383, "grad_norm": 1.7406150102615356, "learning_rate": 1.128093569333899e-07, "loss": 0.08349595963954926, "rewards/accuracies": 0.96875, "rewards/chosen": 60.705841064453125, "rewards/margins": 47.465484619140625, "rewards/rejected": 13.259033203125, "step": 5423 }, { "epoch": 2.8074534161490683, "grad_norm": 1.0782328844070435, "learning_rate": 1.1220756143990764e-07, "loss": 0.1053628921508789, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.85690689086914, "rewards/margins": 44.63092041015625, "rewards/rejected": 11.23604965209961, "step": 5424 }, { "epoch": 2.807971014492754, "grad_norm": 0.5621936917304993, "learning_rate": 1.1160735720249949e-07, "loss": 0.05919535830616951, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.406402587890625, "rewards/margins": 46.492462158203125, "rewards/rejected": 10.911962509155273, "step": 5425 }, { "epoch": 2.808488612836439, "grad_norm": 0.7766591906547546, "learning_rate": 1.110087444165675e-07, "loss": 0.06941717863082886, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.71269607543945, "rewards/margins": 46.234405517578125, "rewards/rejected": 13.481178283691406, "step": 5426 }, { "epoch": 2.809006211180124, "grad_norm": 0.5642432570457458, "learning_rate": 1.1041172327699467e-07, "loss": 0.049580641090869904, "rewards/accuracies": 0.984375, "rewards/chosen": 59.33819580078125, "rewards/margins": 45.681365966796875, "rewards/rejected": 13.659961700439453, "step": 5427 }, { "epoch": 2.8095238095238093, "grad_norm": 1.0633046627044678, "learning_rate": 1.0981629397814387e-07, "loss": 0.07780110836029053, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.548614501953125, "rewards/margins": 46.138458251953125, "rewards/rejected": 13.419313430786133, "step": 5428 }, { "epoch": 2.810041407867495, "grad_norm": 1.1115195751190186, "learning_rate": 1.0922245671386278e-07, "loss": 0.1182168573141098, "rewards/accuracies": 0.953125, "rewards/chosen": 56.26320266723633, "rewards/margins": 42.793853759765625, "rewards/rejected": 13.47398567199707, "step": 5429 }, { "epoch": 2.81055900621118, "grad_norm": 0.38743874430656433, "learning_rate": 1.0863021167747955e-07, "loss": 0.023417379707098007, "rewards/accuracies": 0.984375, "rewards/chosen": 69.60556030273438, "rewards/margins": 54.246795654296875, "rewards/rejected": 15.355255126953125, "step": 5430 }, { "epoch": 2.8110766045548656, "grad_norm": 1.0706512928009033, "learning_rate": 1.080395590618033e-07, "loss": 0.07475356757640839, "rewards/accuracies": 0.9765625, "rewards/chosen": 51.198402404785156, "rewards/margins": 42.4293212890625, "rewards/rejected": 8.776748657226562, "step": 5431 }, { "epoch": 2.8115942028985508, "grad_norm": 0.7807766199111938, "learning_rate": 1.0745049905912463e-07, "loss": 0.07247057557106018, "rewards/accuracies": 0.953125, "rewards/chosen": 52.340065002441406, "rewards/margins": 40.87640380859375, "rewards/rejected": 11.455501556396484, "step": 5432 }, { "epoch": 2.812111801242236, "grad_norm": 1.0405429601669312, "learning_rate": 1.0686303186121682e-07, "loss": 0.07688173651695251, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.84032440185547, "rewards/margins": 42.70472717285156, "rewards/rejected": 13.120468139648438, "step": 5433 }, { "epoch": 2.812629399585921, "grad_norm": 0.865312933921814, "learning_rate": 1.0627715765933522e-07, "loss": 0.09293156117200851, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.52836608886719, "rewards/margins": 43.947418212890625, "rewards/rejected": 9.576007843017578, "step": 5434 }, { "epoch": 2.8131469979296067, "grad_norm": 1.5768369436264038, "learning_rate": 1.0569287664421335e-07, "loss": 0.09816333651542664, "rewards/accuracies": 0.9375, "rewards/chosen": 59.57395935058594, "rewards/margins": 49.594329833984375, "rewards/rejected": 9.971870422363281, "step": 5435 }, { "epoch": 2.813664596273292, "grad_norm": 1.1771535873413086, "learning_rate": 1.0511018900606906e-07, "loss": 0.10119631886482239, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.24705505371094, "rewards/margins": 40.243896484375, "rewards/rejected": 10.994890213012695, "step": 5436 }, { "epoch": 2.8141821946169774, "grad_norm": 1.1249722242355347, "learning_rate": 1.0452909493460118e-07, "loss": 0.1158919483423233, "rewards/accuracies": 0.9453125, "rewards/chosen": 65.3051986694336, "rewards/margins": 50.469390869140625, "rewards/rejected": 14.82379150390625, "step": 5437 }, { "epoch": 2.8146997929606625, "grad_norm": 0.8034656047821045, "learning_rate": 1.0394959461898779e-07, "loss": 0.09041374921798706, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.328529357910156, "rewards/margins": 41.96710205078125, "rewards/rejected": 12.373603820800781, "step": 5438 }, { "epoch": 2.8152173913043477, "grad_norm": 0.7528856992721558, "learning_rate": 1.0337168824789078e-07, "loss": 0.030068837106227875, "rewards/accuracies": 0.9921875, "rewards/chosen": 58.073577880859375, "rewards/margins": 45.757598876953125, "rewards/rejected": 12.3221435546875, "step": 5439 }, { "epoch": 2.8157349896480333, "grad_norm": 3.3576266765594482, "learning_rate": 1.0279537600945022e-07, "loss": 0.11064360290765762, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.79219055175781, "rewards/margins": 44.36529541015625, "rewards/rejected": 11.417699813842773, "step": 5440 }, { "epoch": 2.8162525879917184, "grad_norm": 3.498534917831421, "learning_rate": 1.0222065809128989e-07, "loss": 0.15347890555858612, "rewards/accuracies": 0.921875, "rewards/chosen": 56.30706787109375, "rewards/margins": 44.73591613769531, "rewards/rejected": 11.565340042114258, "step": 5441 }, { "epoch": 2.816770186335404, "grad_norm": 1.2975389957427979, "learning_rate": 1.0164753468051292e-07, "loss": 0.0972820296883583, "rewards/accuracies": 0.953125, "rewards/chosen": 58.5301628112793, "rewards/margins": 44.56282043457031, "rewards/rejected": 13.976715087890625, "step": 5442 }, { "epoch": 2.817287784679089, "grad_norm": 7.121316432952881, "learning_rate": 1.0107600596370337e-07, "loss": 0.13371717929840088, "rewards/accuracies": 0.9296875, "rewards/chosen": 59.41090393066406, "rewards/margins": 46.35113525390625, "rewards/rejected": 13.071678161621094, "step": 5443 }, { "epoch": 2.8178053830227743, "grad_norm": 1.1152594089508057, "learning_rate": 1.0050607212692742e-07, "loss": 0.10306888818740845, "rewards/accuracies": 0.9375, "rewards/chosen": 51.73633575439453, "rewards/margins": 40.59906005859375, "rewards/rejected": 11.144577026367188, "step": 5444 }, { "epoch": 2.8183229813664594, "grad_norm": 0.7291335463523865, "learning_rate": 9.993773335573053e-08, "loss": 0.0871623158454895, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.16992950439453, "rewards/margins": 44.19671630859375, "rewards/rejected": 10.973762512207031, "step": 5445 }, { "epoch": 2.818840579710145, "grad_norm": 1.0404157638549805, "learning_rate": 9.937098983513971e-08, "loss": 0.06196695566177368, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.03364562988281, "rewards/margins": 51.25004577636719, "rewards/rejected": 12.786636352539062, "step": 5446 }, { "epoch": 2.81935817805383, "grad_norm": 0.7100982069969177, "learning_rate": 9.880584174966235e-08, "loss": 0.05130840837955475, "rewards/accuracies": 0.96875, "rewards/chosen": 61.5462532043457, "rewards/margins": 46.96856689453125, "rewards/rejected": 14.564355850219727, "step": 5447 }, { "epoch": 2.8198757763975157, "grad_norm": 0.7372838258743286, "learning_rate": 9.824228928328627e-08, "loss": 0.10365429520606995, "rewards/accuracies": 0.953125, "rewards/chosen": 61.356117248535156, "rewards/margins": 47.1243896484375, "rewards/rejected": 14.228322982788086, "step": 5448 }, { "epoch": 2.820393374741201, "grad_norm": 0.8811707496643066, "learning_rate": 9.768033261948028e-08, "loss": 0.06390716135501862, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.03361511230469, "rewards/margins": 46.97712707519531, "rewards/rejected": 11.049348831176758, "step": 5449 }, { "epoch": 2.820910973084886, "grad_norm": 1.4944889545440674, "learning_rate": 9.711997194119416e-08, "loss": 0.04557190462946892, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.81028366088867, "rewards/margins": 42.0517578125, "rewards/rejected": 11.747055053710938, "step": 5450 }, { "epoch": 2.821428571428571, "grad_norm": 0.7461838126182556, "learning_rate": 9.656120743085528e-08, "loss": 0.07274563610553741, "rewards/accuracies": 0.953125, "rewards/chosen": 62.02735900878906, "rewards/margins": 49.698577880859375, "rewards/rejected": 12.318191528320312, "step": 5451 }, { "epoch": 2.8219461697722568, "grad_norm": 11.518465995788574, "learning_rate": 9.600403927037538e-08, "loss": 0.2514958083629608, "rewards/accuracies": 0.96875, "rewards/chosen": 59.773719787597656, "rewards/margins": 47.140228271484375, "rewards/rejected": 12.639701843261719, "step": 5452 }, { "epoch": 2.822463768115942, "grad_norm": 1.386596918106079, "learning_rate": 9.54484676411438e-08, "loss": 0.0715368241071701, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.916900634765625, "rewards/margins": 45.367645263671875, "rewards/rejected": 14.560262680053711, "step": 5453 }, { "epoch": 2.8229813664596275, "grad_norm": 0.7513468861579895, "learning_rate": 9.489449272403195e-08, "loss": 0.07924477756023407, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.216827392578125, "rewards/margins": 45.691619873046875, "rewards/rejected": 10.536540985107422, "step": 5454 }, { "epoch": 2.8234989648033126, "grad_norm": 0.5219197869300842, "learning_rate": 9.434211469938837e-08, "loss": 0.03368992358446121, "rewards/accuracies": 0.984375, "rewards/chosen": 59.85242462158203, "rewards/margins": 48.71054458618164, "rewards/rejected": 11.142584800720215, "step": 5455 }, { "epoch": 2.824016563146998, "grad_norm": 0.9303234219551086, "learning_rate": 9.379133374704474e-08, "loss": 0.10938812047243118, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.816932678222656, "rewards/margins": 45.847869873046875, "rewards/rejected": 13.96735954284668, "step": 5456 }, { "epoch": 2.8245341614906834, "grad_norm": 1.6102088689804077, "learning_rate": 9.324215004631265e-08, "loss": 0.11021542549133301, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.06732177734375, "rewards/margins": 41.20519256591797, "rewards/rejected": 12.851974487304688, "step": 5457 }, { "epoch": 2.8250517598343685, "grad_norm": 4.930757522583008, "learning_rate": 9.26945637759813e-08, "loss": 0.08667643368244171, "rewards/accuracies": 0.96875, "rewards/chosen": 58.11505126953125, "rewards/margins": 45.866607666015625, "rewards/rejected": 12.244422912597656, "step": 5458 }, { "epoch": 2.825569358178054, "grad_norm": 0.7659887075424194, "learning_rate": 9.214857511432195e-08, "loss": 0.06990115344524384, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.809139251708984, "rewards/margins": 42.819091796875, "rewards/rejected": 8.976058959960938, "step": 5459 }, { "epoch": 2.8260869565217392, "grad_norm": 0.5981343388557434, "learning_rate": 9.160418423908635e-08, "loss": 0.0613168329000473, "rewards/accuracies": 0.96875, "rewards/chosen": 64.08843994140625, "rewards/margins": 49.904510498046875, "rewards/rejected": 14.173105239868164, "step": 5460 }, { "epoch": 2.8266045548654244, "grad_norm": 2.11637020111084, "learning_rate": 9.10613913275027e-08, "loss": 0.12752731144428253, "rewards/accuracies": 0.9375, "rewards/chosen": 56.77666473388672, "rewards/margins": 41.558135986328125, "rewards/rejected": 15.230615615844727, "step": 5461 }, { "epoch": 2.8271221532091095, "grad_norm": 0.5610605478286743, "learning_rate": 9.052019655628242e-08, "loss": 0.056758515536785126, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.81127166748047, "rewards/margins": 44.693023681640625, "rewards/rejected": 12.109432220458984, "step": 5462 }, { "epoch": 2.827639751552795, "grad_norm": 0.7823977470397949, "learning_rate": 8.998060010161514e-08, "loss": 0.07195718586444855, "rewards/accuracies": 0.96875, "rewards/chosen": 55.303627014160156, "rewards/margins": 45.07489013671875, "rewards/rejected": 10.239660263061523, "step": 5463 }, { "epoch": 2.8281573498964803, "grad_norm": 3.781268835067749, "learning_rate": 8.944260213917089e-08, "loss": 0.0958189070224762, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.11122512817383, "rewards/margins": 43.10498046875, "rewards/rejected": 10.009965896606445, "step": 5464 }, { "epoch": 2.828674948240166, "grad_norm": 0.4984093904495239, "learning_rate": 8.890620284409735e-08, "loss": 0.046363361179828644, "rewards/accuracies": 0.984375, "rewards/chosen": 60.90129089355469, "rewards/margins": 47.17939758300781, "rewards/rejected": 13.710320472717285, "step": 5465 }, { "epoch": 2.829192546583851, "grad_norm": 2.495039939880371, "learning_rate": 8.837140239102427e-08, "loss": 0.11245910823345184, "rewards/accuracies": 0.953125, "rewards/chosen": 55.86804962158203, "rewards/margins": 44.407806396484375, "rewards/rejected": 11.458992004394531, "step": 5466 }, { "epoch": 2.829710144927536, "grad_norm": 0.8139573931694031, "learning_rate": 8.78382009540607e-08, "loss": 0.05803950875997543, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.169822692871094, "rewards/margins": 42.97686767578125, "rewards/rejected": 11.195728302001953, "step": 5467 }, { "epoch": 2.8302277432712213, "grad_norm": 0.30103105306625366, "learning_rate": 8.730659870679226e-08, "loss": 0.026936210691928864, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.99620819091797, "rewards/margins": 47.185462951660156, "rewards/rejected": 12.822410583496094, "step": 5468 }, { "epoch": 2.830745341614907, "grad_norm": 0.6056694388389587, "learning_rate": 8.677659582228714e-08, "loss": 0.06666918843984604, "rewards/accuracies": 0.96875, "rewards/chosen": 56.84674072265625, "rewards/margins": 45.500946044921875, "rewards/rejected": 11.347503662109375, "step": 5469 }, { "epoch": 2.831262939958592, "grad_norm": 0.7191177010536194, "learning_rate": 8.624819247309069e-08, "loss": 0.0660523772239685, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.511199951171875, "rewards/margins": 45.51470947265625, "rewards/rejected": 13.00229549407959, "step": 5470 }, { "epoch": 2.8317805383022776, "grad_norm": 1.1288352012634277, "learning_rate": 8.572138883122972e-08, "loss": 0.14655131101608276, "rewards/accuracies": 0.90625, "rewards/chosen": 53.04481887817383, "rewards/margins": 43.22406005859375, "rewards/rejected": 9.804630279541016, "step": 5471 }, { "epoch": 2.8322981366459627, "grad_norm": 3.692869186401367, "learning_rate": 8.519618506820759e-08, "loss": 0.09768655896186829, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.854705810546875, "rewards/margins": 42.11979675292969, "rewards/rejected": 9.743339538574219, "step": 5472 }, { "epoch": 2.832815734989648, "grad_norm": 0.717300534248352, "learning_rate": 8.467258135500923e-08, "loss": 0.08253965526819229, "rewards/accuracies": 0.9375, "rewards/chosen": 64.94329833984375, "rewards/margins": 50.851966857910156, "rewards/rejected": 14.089030265808105, "step": 5473 }, { "epoch": 2.8333333333333335, "grad_norm": 1.2585870027542114, "learning_rate": 8.415057786209712e-08, "loss": 0.08259972929954529, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.75382995605469, "rewards/margins": 48.50103759765625, "rewards/rejected": 11.261165618896484, "step": 5474 }, { "epoch": 2.8338509316770186, "grad_norm": 0.735537052154541, "learning_rate": 8.363017475941316e-08, "loss": 0.08096164464950562, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.01648712158203, "rewards/margins": 47.507110595703125, "rewards/rejected": 15.514497756958008, "step": 5475 }, { "epoch": 2.8343685300207038, "grad_norm": 0.8473864793777466, "learning_rate": 8.311137221637844e-08, "loss": 0.07740554958581924, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.725372314453125, "rewards/margins": 44.998291015625, "rewards/rejected": 9.743886947631836, "step": 5476 }, { "epoch": 2.8348861283643894, "grad_norm": 5.008921146392822, "learning_rate": 8.259417040189289e-08, "loss": 0.1319165974855423, "rewards/accuracies": 0.96875, "rewards/chosen": 59.54607391357422, "rewards/margins": 46.64691162109375, "rewards/rejected": 12.890606880187988, "step": 5477 }, { "epoch": 2.8354037267080745, "grad_norm": 1.0037519931793213, "learning_rate": 8.207856948433568e-08, "loss": 0.1070900559425354, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.108585357666016, "rewards/margins": 43.93719482421875, "rewards/rejected": 11.174539566040039, "step": 5478 }, { "epoch": 2.8359213250517596, "grad_norm": 0.8258587121963501, "learning_rate": 8.156456963156367e-08, "loss": 0.05668747052550316, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.19731140136719, "rewards/margins": 46.81224060058594, "rewards/rejected": 10.373689651489258, "step": 5479 }, { "epoch": 2.8364389233954452, "grad_norm": 0.6345207095146179, "learning_rate": 8.105217101091411e-08, "loss": 0.059872813522815704, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.29472351074219, "rewards/margins": 50.462921142578125, "rewards/rejected": 12.838251113891602, "step": 5480 }, { "epoch": 2.8369565217391304, "grad_norm": 1.9341177940368652, "learning_rate": 8.054137378920135e-08, "loss": 0.06058581545948982, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.63489532470703, "rewards/margins": 45.9635009765625, "rewards/rejected": 11.675399780273438, "step": 5481 }, { "epoch": 2.837474120082816, "grad_norm": 7.880166053771973, "learning_rate": 8.003217813271902e-08, "loss": 0.08908411860466003, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.68330383300781, "rewards/margins": 47.2564697265625, "rewards/rejected": 11.422372817993164, "step": 5482 }, { "epoch": 2.837991718426501, "grad_norm": 1.1041573286056519, "learning_rate": 7.952458420724008e-08, "loss": 0.08174823969602585, "rewards/accuracies": 0.9765625, "rewards/chosen": 64.35899353027344, "rewards/margins": 48.42997741699219, "rewards/rejected": 15.930416107177734, "step": 5483 }, { "epoch": 2.8385093167701863, "grad_norm": 0.7805666327476501, "learning_rate": 7.901859217801511e-08, "loss": 0.056017789989709854, "rewards/accuracies": 0.96875, "rewards/chosen": 61.859710693359375, "rewards/margins": 47.982444763183594, "rewards/rejected": 13.869331359863281, "step": 5484 }, { "epoch": 2.8390269151138714, "grad_norm": 0.5396369099617004, "learning_rate": 7.85142022097729e-08, "loss": 0.05092329531908035, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.173789978027344, "rewards/margins": 43.240135192871094, "rewards/rejected": 8.939626693725586, "step": 5485 }, { "epoch": 2.839544513457557, "grad_norm": 0.8190539479255676, "learning_rate": 7.801141446672212e-08, "loss": 0.05533613637089729, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.989776611328125, "rewards/margins": 44.85076904296875, "rewards/rejected": 14.136093139648438, "step": 5486 }, { "epoch": 2.840062111801242, "grad_norm": 3.8527801036834717, "learning_rate": 7.751022911254846e-08, "loss": 0.0921713262796402, "rewards/accuracies": 0.984375, "rewards/chosen": 60.98412322998047, "rewards/margins": 48.26025390625, "rewards/rejected": 12.742053031921387, "step": 5487 }, { "epoch": 2.8405797101449277, "grad_norm": 2.1640567779541016, "learning_rate": 7.701064631041644e-08, "loss": 0.12396328151226044, "rewards/accuracies": 0.9375, "rewards/chosen": 57.69304656982422, "rewards/margins": 46.49566650390625, "rewards/rejected": 11.208274841308594, "step": 5488 }, { "epoch": 2.841097308488613, "grad_norm": 1.1982715129852295, "learning_rate": 7.651266622296872e-08, "loss": 0.09321916848421097, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.67616271972656, "rewards/margins": 43.1029052734375, "rewards/rejected": 13.594412803649902, "step": 5489 }, { "epoch": 2.841614906832298, "grad_norm": 1.0834704637527466, "learning_rate": 7.601628901232727e-08, "loss": 0.08429548889398575, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.823509216308594, "rewards/margins": 42.46287536621094, "rewards/rejected": 11.340397834777832, "step": 5490 }, { "epoch": 2.8421325051759836, "grad_norm": 0.6126192212104797, "learning_rate": 7.552151484009007e-08, "loss": 0.08371768891811371, "rewards/accuracies": 0.96875, "rewards/chosen": 54.408348083496094, "rewards/margins": 42.263427734375, "rewards/rejected": 12.153026580810547, "step": 5491 }, { "epoch": 2.8426501035196687, "grad_norm": 1.066220998764038, "learning_rate": 7.502834386733437e-08, "loss": 0.08664622902870178, "rewards/accuracies": 0.96875, "rewards/chosen": 54.74455261230469, "rewards/margins": 42.56976318359375, "rewards/rejected": 12.176107406616211, "step": 5492 }, { "epoch": 2.843167701863354, "grad_norm": 0.7505836486816406, "learning_rate": 7.45367762546173e-08, "loss": 0.07984401285648346, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.48746871948242, "rewards/margins": 46.464332580566406, "rewards/rejected": 12.04110050201416, "step": 5493 }, { "epoch": 2.8436853002070395, "grad_norm": 0.9817882180213928, "learning_rate": 7.404681216196974e-08, "loss": 0.06348172575235367, "rewards/accuracies": 0.984375, "rewards/chosen": 55.80207443237305, "rewards/margins": 45.01617431640625, "rewards/rejected": 10.777359008789062, "step": 5494 }, { "epoch": 2.8442028985507246, "grad_norm": 0.7253486514091492, "learning_rate": 7.35584517489052e-08, "loss": 0.06728329509496689, "rewards/accuracies": 0.96875, "rewards/chosen": 51.589805603027344, "rewards/margins": 42.864715576171875, "rewards/rejected": 8.72237777709961, "step": 5495 }, { "epoch": 2.8447204968944098, "grad_norm": 0.5787814855575562, "learning_rate": 7.307169517441148e-08, "loss": 0.048001084476709366, "rewards/accuracies": 0.9921875, "rewards/chosen": 55.270416259765625, "rewards/margins": 43.778167724609375, "rewards/rejected": 11.495603561401367, "step": 5496 }, { "epoch": 2.8452380952380953, "grad_norm": 0.7059357166290283, "learning_rate": 7.25865425969563e-08, "loss": 0.0842973068356514, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.851226806640625, "rewards/margins": 45.159423828125, "rewards/rejected": 11.687061309814453, "step": 5497 }, { "epoch": 2.8457556935817805, "grad_norm": 1.7979897260665894, "learning_rate": 7.21029941744844e-08, "loss": 0.058071888983249664, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.080474853515625, "rewards/margins": 49.065704345703125, "rewards/rejected": 12.015411376953125, "step": 5498 }, { "epoch": 2.846273291925466, "grad_norm": 0.7186582684516907, "learning_rate": 7.162105006441877e-08, "loss": 0.10229238867759705, "rewards/accuracies": 0.9453125, "rewards/chosen": 57.4642333984375, "rewards/margins": 41.642845153808594, "rewards/rejected": 15.803128242492676, "step": 5499 }, { "epoch": 2.846790890269151, "grad_norm": 0.49994221329689026, "learning_rate": 7.114071042365945e-08, "loss": 0.03765242546796799, "rewards/accuracies": 0.984375, "rewards/chosen": 59.48380661010742, "rewards/margins": 46.31095886230469, "rewards/rejected": 13.178699493408203, "step": 5500 }, { "epoch": 2.8473084886128364, "grad_norm": 1.220441460609436, "learning_rate": 7.066197540858466e-08, "loss": 0.133417010307312, "rewards/accuracies": 0.9375, "rewards/chosen": 52.62465286254883, "rewards/margins": 41.801177978515625, "rewards/rejected": 10.812854766845703, "step": 5501 }, { "epoch": 2.8478260869565215, "grad_norm": 0.49099794030189514, "learning_rate": 7.018484517504976e-08, "loss": 0.052630193531513214, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.92988586425781, "rewards/margins": 49.338706970214844, "rewards/rejected": 13.590568542480469, "step": 5502 }, { "epoch": 2.848343685300207, "grad_norm": 0.8114919066429138, "learning_rate": 6.970931987838881e-08, "loss": 0.07249472290277481, "rewards/accuracies": 0.96875, "rewards/chosen": 56.21763610839844, "rewards/margins": 44.87884521484375, "rewards/rejected": 11.33980941772461, "step": 5503 }, { "epoch": 2.8488612836438922, "grad_norm": 0.9365025758743286, "learning_rate": 6.923539967341131e-08, "loss": 0.09520018100738525, "rewards/accuracies": 0.9375, "rewards/chosen": 57.302947998046875, "rewards/margins": 46.33843994140625, "rewards/rejected": 10.973896026611328, "step": 5504 }, { "epoch": 2.849378881987578, "grad_norm": 0.7289517521858215, "learning_rate": 6.876308471440607e-08, "loss": 0.04770239442586899, "rewards/accuracies": 0.96875, "rewards/chosen": 62.87370681762695, "rewards/margins": 49.5238037109375, "rewards/rejected": 13.349273681640625, "step": 5505 }, { "epoch": 2.849896480331263, "grad_norm": 0.8531321883201599, "learning_rate": 6.829237515513842e-08, "loss": 0.06597137451171875, "rewards/accuracies": 0.96875, "rewards/chosen": 61.93641662597656, "rewards/margins": 47.2139892578125, "rewards/rejected": 14.730758666992188, "step": 5506 }, { "epoch": 2.850414078674948, "grad_norm": 4.573177814483643, "learning_rate": 6.782327114885134e-08, "loss": 0.07952234148979187, "rewards/accuracies": 0.9453125, "rewards/chosen": 62.729827880859375, "rewards/margins": 46.99395751953125, "rewards/rejected": 15.734434127807617, "step": 5507 }, { "epoch": 2.8509316770186337, "grad_norm": 2.5638606548309326, "learning_rate": 6.73557728482649e-08, "loss": 0.12463868409395218, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.115264892578125, "rewards/margins": 47.04766845703125, "rewards/rejected": 12.074603080749512, "step": 5508 }, { "epoch": 2.851449275362319, "grad_norm": 0.8035415410995483, "learning_rate": 6.688988040557731e-08, "loss": 0.08348843455314636, "rewards/accuracies": 0.953125, "rewards/chosen": 60.37408447265625, "rewards/margins": 46.996673583984375, "rewards/rejected": 13.376953125, "step": 5509 }, { "epoch": 2.851966873706004, "grad_norm": 0.5867958068847656, "learning_rate": 6.642559397246173e-08, "loss": 0.06416790932416916, "rewards/accuracies": 0.96875, "rewards/chosen": 55.552978515625, "rewards/margins": 45.559783935546875, "rewards/rejected": 9.987210273742676, "step": 5510 }, { "epoch": 2.8524844720496896, "grad_norm": 0.5804279446601868, "learning_rate": 6.596291370007057e-08, "loss": 0.05439313128590584, "rewards/accuracies": 0.96875, "rewards/chosen": 54.995059967041016, "rewards/margins": 44.0198974609375, "rewards/rejected": 10.990346908569336, "step": 5511 }, { "epoch": 2.8530020703933747, "grad_norm": 0.8589859008789062, "learning_rate": 6.55018397390339e-08, "loss": 0.08930876106023788, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.51849365234375, "rewards/margins": 46.34454345703125, "rewards/rejected": 10.194351196289062, "step": 5512 }, { "epoch": 2.85351966873706, "grad_norm": 2.0178825855255127, "learning_rate": 6.504237223945664e-08, "loss": 0.04981285333633423, "rewards/accuracies": 0.96875, "rewards/chosen": 64.55070495605469, "rewards/margins": 49.65460205078125, "rewards/rejected": 14.90377426147461, "step": 5513 }, { "epoch": 2.8540372670807455, "grad_norm": 1.1911009550094604, "learning_rate": 6.458451135092136e-08, "loss": 0.11707445234060287, "rewards/accuracies": 0.953125, "rewards/chosen": 55.67033386230469, "rewards/margins": 42.78858947753906, "rewards/rejected": 12.882935523986816, "step": 5514 }, { "epoch": 2.8545548654244306, "grad_norm": 1.211623191833496, "learning_rate": 6.412825722248827e-08, "loss": 0.15274843573570251, "rewards/accuracies": 0.9375, "rewards/chosen": 56.841514587402344, "rewards/margins": 44.30877685546875, "rewards/rejected": 12.53908920288086, "step": 5515 }, { "epoch": 2.855072463768116, "grad_norm": 0.9642210006713867, "learning_rate": 6.367361000269467e-08, "loss": 0.08488389849662781, "rewards/accuracies": 0.953125, "rewards/chosen": 57.18940353393555, "rewards/margins": 46.82209777832031, "rewards/rejected": 10.356086730957031, "step": 5516 }, { "epoch": 2.8555900621118013, "grad_norm": 0.804478108882904, "learning_rate": 6.322056983955382e-08, "loss": 0.0615682452917099, "rewards/accuracies": 0.953125, "rewards/chosen": 54.358787536621094, "rewards/margins": 43.05989074707031, "rewards/rejected": 11.29455280303955, "step": 5517 }, { "epoch": 2.8561076604554865, "grad_norm": 0.6290794014930725, "learning_rate": 6.276913688055608e-08, "loss": 0.05024593323469162, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.43064498901367, "rewards/margins": 48.557952880859375, "rewards/rejected": 9.86783504486084, "step": 5518 }, { "epoch": 2.8566252587991716, "grad_norm": 1.0160197019577026, "learning_rate": 6.231931127267e-08, "loss": 0.06903356313705444, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.741249084472656, "rewards/margins": 45.801971435546875, "rewards/rejected": 12.93337631225586, "step": 5519 }, { "epoch": 2.857142857142857, "grad_norm": 0.9324340224266052, "learning_rate": 6.187109316233785e-08, "loss": 0.07415354251861572, "rewards/accuracies": 0.96875, "rewards/chosen": 58.56219482421875, "rewards/margins": 47.277740478515625, "rewards/rejected": 11.281440734863281, "step": 5520 }, { "epoch": 2.8576604554865424, "grad_norm": 1.1888850927352905, "learning_rate": 6.142448269548074e-08, "loss": 0.11849146336317062, "rewards/accuracies": 0.9375, "rewards/chosen": 55.95514678955078, "rewards/margins": 40.63014221191406, "rewards/rejected": 15.315134048461914, "step": 5521 }, { "epoch": 2.858178053830228, "grad_norm": 0.7362613677978516, "learning_rate": 6.097948001749565e-08, "loss": 0.03806958347558975, "rewards/accuracies": 0.9921875, "rewards/chosen": 54.93962097167969, "rewards/margins": 44.494140625, "rewards/rejected": 10.440677642822266, "step": 5522 }, { "epoch": 2.858695652173913, "grad_norm": 1.4525381326675415, "learning_rate": 6.053608527325783e-08, "loss": 0.12504389882087708, "rewards/accuracies": 0.9375, "rewards/chosen": 54.138004302978516, "rewards/margins": 42.80865478515625, "rewards/rejected": 11.321932792663574, "step": 5523 }, { "epoch": 2.8592132505175982, "grad_norm": 1.0114390850067139, "learning_rate": 6.009429860711624e-08, "loss": 0.08544772863388062, "rewards/accuracies": 0.953125, "rewards/chosen": 56.15391540527344, "rewards/margins": 43.45538330078125, "rewards/rejected": 12.6990966796875, "step": 5524 }, { "epoch": 2.8597308488612834, "grad_norm": 0.7305655479431152, "learning_rate": 5.965412016289807e-08, "loss": 0.07397454977035522, "rewards/accuracies": 0.9765625, "rewards/chosen": 54.28089904785156, "rewards/margins": 43.290740966796875, "rewards/rejected": 11.005971908569336, "step": 5525 }, { "epoch": 2.860248447204969, "grad_norm": 0.5942755937576294, "learning_rate": 5.9215550083907005e-08, "loss": 0.07774150371551514, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.8639030456543, "rewards/margins": 40.171630859375, "rewards/rejected": 11.685243606567383, "step": 5526 }, { "epoch": 2.860766045548654, "grad_norm": 1.8909832239151, "learning_rate": 5.877858851292217e-08, "loss": 0.1569560319185257, "rewards/accuracies": 0.921875, "rewards/chosen": 53.79413604736328, "rewards/margins": 43.51506042480469, "rewards/rejected": 10.27056884765625, "step": 5527 }, { "epoch": 2.8612836438923397, "grad_norm": 3.6366446018218994, "learning_rate": 5.8343235592199234e-08, "loss": 0.12111738324165344, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.69017028808594, "rewards/margins": 42.73663330078125, "rewards/rejected": 10.9564208984375, "step": 5528 }, { "epoch": 2.861801242236025, "grad_norm": 0.6914181709289551, "learning_rate": 5.790949146347147e-08, "loss": 0.06732162833213806, "rewards/accuracies": 0.953125, "rewards/chosen": 60.08051681518555, "rewards/margins": 47.91558074951172, "rewards/rejected": 12.16843032836914, "step": 5529 }, { "epoch": 2.86231884057971, "grad_norm": 1.9267221689224243, "learning_rate": 5.7477356267947034e-08, "loss": 0.11960946768522263, "rewards/accuracies": 0.953125, "rewards/chosen": 60.470184326171875, "rewards/margins": 49.655029296875, "rewards/rejected": 10.8221435546875, "step": 5530 }, { "epoch": 2.8628364389233956, "grad_norm": 0.7604178190231323, "learning_rate": 5.704683014631063e-08, "loss": 0.064830482006073, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.60131072998047, "rewards/margins": 43.084197998046875, "rewards/rejected": 13.518584251403809, "step": 5531 }, { "epoch": 2.8633540372670807, "grad_norm": 1.1903530359268188, "learning_rate": 5.66179132387229e-08, "loss": 0.10511906445026398, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.373619079589844, "rewards/margins": 44.75383758544922, "rewards/rejected": 11.624340057373047, "step": 5532 }, { "epoch": 2.8638716356107663, "grad_norm": 1.0433661937713623, "learning_rate": 5.6190605684820486e-08, "loss": 0.0839720070362091, "rewards/accuracies": 0.9765625, "rewards/chosen": 65.40652465820312, "rewards/margins": 51.03436279296875, "rewards/rejected": 14.370536804199219, "step": 5533 }, { "epoch": 2.8643892339544514, "grad_norm": 1.6753343343734741, "learning_rate": 5.576490762371767e-08, "loss": 0.12529514729976654, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.10011291503906, "rewards/margins": 43.514251708984375, "rewards/rejected": 14.58510971069336, "step": 5534 }, { "epoch": 2.8649068322981366, "grad_norm": 0.7271332740783691, "learning_rate": 5.534081919400247e-08, "loss": 0.0629718154668808, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.1856689453125, "rewards/margins": 43.523162841796875, "rewards/rejected": 15.665504455566406, "step": 5535 }, { "epoch": 2.8654244306418217, "grad_norm": 0.6170640587806702, "learning_rate": 5.4918340533740565e-08, "loss": 0.04719331115484238, "rewards/accuracies": 0.96875, "rewards/chosen": 58.51802062988281, "rewards/margins": 47.0369873046875, "rewards/rejected": 11.488807678222656, "step": 5536 }, { "epoch": 2.8659420289855073, "grad_norm": 0.7884495258331299, "learning_rate": 5.4497471780471936e-08, "loss": 0.053048066794872284, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.25255584716797, "rewards/margins": 50.602325439453125, "rewards/rejected": 12.650970458984375, "step": 5537 }, { "epoch": 2.8664596273291925, "grad_norm": 0.5943190455436707, "learning_rate": 5.4078213071214746e-08, "loss": 0.049894653260707855, "rewards/accuracies": 0.96875, "rewards/chosen": 59.14319610595703, "rewards/margins": 47.30279541015625, "rewards/rejected": 11.843910217285156, "step": 5538 }, { "epoch": 2.866977225672878, "grad_norm": 0.7700983881950378, "learning_rate": 5.3660564542460936e-08, "loss": 0.08104471117258072, "rewards/accuracies": 0.96875, "rewards/chosen": 52.76190948486328, "rewards/margins": 43.729705810546875, "rewards/rejected": 9.03369140625, "step": 5539 }, { "epoch": 2.867494824016563, "grad_norm": 1.9814481735229492, "learning_rate": 5.324452633017951e-08, "loss": 0.07817298918962479, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.675140380859375, "rewards/margins": 49.97314453125, "rewards/rejected": 11.708147048950195, "step": 5540 }, { "epoch": 2.8680124223602483, "grad_norm": 0.5709184408187866, "learning_rate": 5.283009856981325e-08, "loss": 0.05569399893283844, "rewards/accuracies": 0.953125, "rewards/chosen": 62.51386260986328, "rewards/margins": 51.94474792480469, "rewards/rejected": 10.566062927246094, "step": 5541 }, { "epoch": 2.8685300207039335, "grad_norm": 0.9241300225257874, "learning_rate": 5.241728139628421e-08, "loss": 0.07085350155830383, "rewards/accuracies": 0.96875, "rewards/chosen": 56.645904541015625, "rewards/margins": 44.161376953125, "rewards/rejected": 12.482336044311523, "step": 5542 }, { "epoch": 2.869047619047619, "grad_norm": 1.2113808393478394, "learning_rate": 5.200607494398713e-08, "loss": 0.07387940585613251, "rewards/accuracies": 0.96875, "rewards/chosen": 56.673377990722656, "rewards/margins": 46.03338623046875, "rewards/rejected": 10.631736755371094, "step": 5543 }, { "epoch": 2.869565217391304, "grad_norm": 0.8217117786407471, "learning_rate": 5.159647934679213e-08, "loss": 0.05738675594329834, "rewards/accuracies": 0.984375, "rewards/chosen": 55.41938781738281, "rewards/margins": 44.85870361328125, "rewards/rejected": 10.563844680786133, "step": 5544 }, { "epoch": 2.87008281573499, "grad_norm": 1.3209073543548584, "learning_rate": 5.118849473804755e-08, "loss": 0.1601383090019226, "rewards/accuracies": 0.9296875, "rewards/chosen": 54.79494857788086, "rewards/margins": 43.441162109375, "rewards/rejected": 11.352096557617188, "step": 5545 }, { "epoch": 2.870600414078675, "grad_norm": 0.8730995655059814, "learning_rate": 5.078212125057547e-08, "loss": 0.05086999386548996, "rewards/accuracies": 0.984375, "rewards/chosen": 59.19847106933594, "rewards/margins": 47.121124267578125, "rewards/rejected": 12.079439163208008, "step": 5546 }, { "epoch": 2.87111801242236, "grad_norm": 1.1824818849563599, "learning_rate": 5.0377359016672845e-08, "loss": 0.11657922714948654, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.960968017578125, "rewards/margins": 41.76239013671875, "rewards/rejected": 15.210807800292969, "step": 5547 }, { "epoch": 2.8716356107660457, "grad_norm": 2.2303965091705322, "learning_rate": 4.997420816811316e-08, "loss": 0.12137292325496674, "rewards/accuracies": 0.9375, "rewards/chosen": 56.501670837402344, "rewards/margins": 43.878021240234375, "rewards/rejected": 12.628746032714844, "step": 5548 }, { "epoch": 2.872153209109731, "grad_norm": 0.9602394700050354, "learning_rate": 4.9572668836145864e-08, "loss": 0.08510597050189972, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.48683166503906, "rewards/margins": 43.3089599609375, "rewards/rejected": 13.179305076599121, "step": 5549 }, { "epoch": 2.8726708074534164, "grad_norm": 0.7745203971862793, "learning_rate": 4.917274115149473e-08, "loss": 0.07597199082374573, "rewards/accuracies": 0.953125, "rewards/chosen": 52.524932861328125, "rewards/margins": 42.427024841308594, "rewards/rejected": 10.097221374511719, "step": 5550 }, { "epoch": 2.8731884057971016, "grad_norm": 3.1285743713378906, "learning_rate": 4.877442524435838e-08, "loss": 0.12063013017177582, "rewards/accuracies": 0.953125, "rewards/chosen": 52.4088134765625, "rewards/margins": 42.110260009765625, "rewards/rejected": 10.302177429199219, "step": 5551 }, { "epoch": 2.8737060041407867, "grad_norm": 1.9359220266342163, "learning_rate": 4.8377721244412533e-08, "loss": 0.1141681969165802, "rewards/accuracies": 0.9375, "rewards/chosen": 51.15155029296875, "rewards/margins": 40.10302734375, "rewards/rejected": 11.05716323852539, "step": 5552 }, { "epoch": 2.874223602484472, "grad_norm": 1.212721586227417, "learning_rate": 4.7982629280806104e-08, "loss": 0.11834598332643509, "rewards/accuracies": 0.9453125, "rewards/chosen": 51.4844970703125, "rewards/margins": 40.905181884765625, "rewards/rejected": 10.585660934448242, "step": 5553 }, { "epoch": 2.8747412008281574, "grad_norm": 0.8271511793136597, "learning_rate": 4.758914948216398e-08, "loss": 0.08404756337404251, "rewards/accuracies": 0.96875, "rewards/chosen": 54.082489013671875, "rewards/margins": 43.129425048828125, "rewards/rejected": 10.949893951416016, "step": 5554 }, { "epoch": 2.8752587991718426, "grad_norm": 0.43779706954956055, "learning_rate": 4.719728197658757e-08, "loss": 0.03332732617855072, "rewards/accuracies": 0.984375, "rewards/chosen": 67.4833984375, "rewards/margins": 52.770782470703125, "rewards/rejected": 14.705636978149414, "step": 5555 }, { "epoch": 2.875776397515528, "grad_norm": 0.9414516687393188, "learning_rate": 4.6807026891650956e-08, "loss": 0.062385350465774536, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.55029296875, "rewards/margins": 46.726531982421875, "rewards/rejected": 12.833002090454102, "step": 5556 }, { "epoch": 2.8762939958592133, "grad_norm": 2.446051597595215, "learning_rate": 4.6418384354405265e-08, "loss": 0.10795772820711136, "rewards/accuracies": 0.953125, "rewards/chosen": 59.33285903930664, "rewards/margins": 47.290740966796875, "rewards/rejected": 12.036930084228516, "step": 5557 }, { "epoch": 2.8768115942028984, "grad_norm": 0.44224750995635986, "learning_rate": 4.603135449137541e-08, "loss": 0.045481763780117035, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.726959228515625, "rewards/margins": 43.7476806640625, "rewards/rejected": 12.97214126586914, "step": 5558 }, { "epoch": 2.8773291925465836, "grad_norm": 0.5609001517295837, "learning_rate": 4.564593742856171e-08, "loss": 0.04616684094071388, "rewards/accuracies": 0.96875, "rewards/chosen": 57.023643493652344, "rewards/margins": 45.408355712890625, "rewards/rejected": 11.597564697265625, "step": 5559 }, { "epoch": 2.877846790890269, "grad_norm": 2.6342763900756836, "learning_rate": 4.5262133291439915e-08, "loss": 0.07593351602554321, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.87432098388672, "rewards/margins": 45.9825439453125, "rewards/rejected": 16.900602340698242, "step": 5560 }, { "epoch": 2.8783643892339543, "grad_norm": 1.3319741487503052, "learning_rate": 4.487994220496006e-08, "loss": 0.062153320759534836, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.25798034667969, "rewards/margins": 46.8262939453125, "rewards/rejected": 13.429359436035156, "step": 5561 }, { "epoch": 2.87888198757764, "grad_norm": 0.780192494392395, "learning_rate": 4.449936429354762e-08, "loss": 0.06615151464939117, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.497955322265625, "rewards/margins": 46.996482849121094, "rewards/rejected": 11.503849029541016, "step": 5562 }, { "epoch": 2.879399585921325, "grad_norm": 5.553764820098877, "learning_rate": 4.4120399681100714e-08, "loss": 0.07007269561290741, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.747833251953125, "rewards/margins": 43.55189514160156, "rewards/rejected": 13.20112419128418, "step": 5563 }, { "epoch": 2.87991718426501, "grad_norm": 1.1660746335983276, "learning_rate": 4.37430484909962e-08, "loss": 0.10405108332633972, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.67259979248047, "rewards/margins": 48.83064270019531, "rewards/rejected": 12.827293395996094, "step": 5564 }, { "epoch": 2.880434782608696, "grad_norm": 2.695077657699585, "learning_rate": 4.336731084608303e-08, "loss": 0.08892036974430084, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.91899871826172, "rewards/margins": 47.81683349609375, "rewards/rejected": 11.111080169677734, "step": 5565 }, { "epoch": 2.880952380952381, "grad_norm": 0.9763801693916321, "learning_rate": 4.299318686868392e-08, "loss": 0.11742883920669556, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.553829193115234, "rewards/margins": 41.80862808227539, "rewards/rejected": 11.744190216064453, "step": 5566 }, { "epoch": 2.8814699792960665, "grad_norm": 2.2857213020324707, "learning_rate": 4.262067668059866e-08, "loss": 0.07064294815063477, "rewards/accuracies": 0.96875, "rewards/chosen": 58.15764617919922, "rewards/margins": 44.951080322265625, "rewards/rejected": 13.204605102539062, "step": 5567 }, { "epoch": 2.8819875776397517, "grad_norm": 1.0161819458007812, "learning_rate": 4.224978040310079e-08, "loss": 0.09479566663503647, "rewards/accuracies": 0.953125, "rewards/chosen": 59.20235061645508, "rewards/margins": 46.465335845947266, "rewards/rejected": 12.720718383789062, "step": 5568 }, { "epoch": 2.882505175983437, "grad_norm": 0.6404061913490295, "learning_rate": 4.1880498156938174e-08, "loss": 0.05972655862569809, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.43107604980469, "rewards/margins": 48.78778076171875, "rewards/rejected": 14.641365051269531, "step": 5569 }, { "epoch": 2.883022774327122, "grad_norm": 1.2023752927780151, "learning_rate": 4.151283006233298e-08, "loss": 0.14150625467300415, "rewards/accuracies": 0.921875, "rewards/chosen": 55.073116302490234, "rewards/margins": 41.948081970214844, "rewards/rejected": 13.135245323181152, "step": 5570 }, { "epoch": 2.8835403726708075, "grad_norm": 1.1219550371170044, "learning_rate": 4.114677623898167e-08, "loss": 0.08924175798892975, "rewards/accuracies": 0.953125, "rewards/chosen": 56.274871826171875, "rewards/margins": 43.728759765625, "rewards/rejected": 12.547943115234375, "step": 5571 }, { "epoch": 2.8840579710144927, "grad_norm": 0.9118720889091492, "learning_rate": 4.0782336806057254e-08, "loss": 0.1152610033750534, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.93329620361328, "rewards/margins": 42.0382080078125, "rewards/rejected": 11.887161254882812, "step": 5572 }, { "epoch": 2.8845755693581783, "grad_norm": 1.7953765392303467, "learning_rate": 4.041951188220428e-08, "loss": 0.0882066935300827, "rewards/accuracies": 0.96875, "rewards/chosen": 52.592872619628906, "rewards/margins": 41.42524719238281, "rewards/rejected": 11.173760414123535, "step": 5573 }, { "epoch": 2.8850931677018634, "grad_norm": 0.9881958961486816, "learning_rate": 4.00583015855438e-08, "loss": 0.07708686590194702, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.68851852416992, "rewards/margins": 48.193206787109375, "rewards/rejected": 11.483102798461914, "step": 5574 }, { "epoch": 2.8856107660455486, "grad_norm": 0.8416796326637268, "learning_rate": 3.96987060336701e-08, "loss": 0.079718679189682, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.45258331298828, "rewards/margins": 39.974327087402344, "rewards/rejected": 12.471179008483887, "step": 5575 }, { "epoch": 2.8861283643892337, "grad_norm": 0.8451128602027893, "learning_rate": 3.9340725343652294e-08, "loss": 0.1049090027809143, "rewards/accuracies": 0.953125, "rewards/chosen": 52.81242370605469, "rewards/margins": 41.334197998046875, "rewards/rejected": 11.483190536499023, "step": 5576 }, { "epoch": 2.8866459627329193, "grad_norm": 0.9783743023872375, "learning_rate": 3.8984359632033286e-08, "loss": 0.08414441347122192, "rewards/accuracies": 0.96875, "rewards/chosen": 58.94310760498047, "rewards/margins": 46.178375244140625, "rewards/rejected": 12.769601821899414, "step": 5577 }, { "epoch": 2.8871635610766044, "grad_norm": 0.7659689784049988, "learning_rate": 3.8629609014830263e-08, "loss": 0.08549283444881439, "rewards/accuracies": 0.953125, "rewards/chosen": 56.077247619628906, "rewards/margins": 45.48770523071289, "rewards/rejected": 10.58782958984375, "step": 5578 }, { "epoch": 2.88768115942029, "grad_norm": 4.3638224601745605, "learning_rate": 3.827647360753528e-08, "loss": 0.1092241108417511, "rewards/accuracies": 0.96875, "rewards/chosen": 59.058258056640625, "rewards/margins": 46.631805419921875, "rewards/rejected": 12.418180465698242, "step": 5579 }, { "epoch": 2.888198757763975, "grad_norm": 1.487532138824463, "learning_rate": 3.792495352511472e-08, "loss": 0.0742110088467598, "rewards/accuracies": 0.953125, "rewards/chosen": 61.915252685546875, "rewards/margins": 49.95416259765625, "rewards/rejected": 11.96174430847168, "step": 5580 }, { "epoch": 2.8887163561076603, "grad_norm": 0.6757612824440002, "learning_rate": 3.757504888200702e-08, "loss": 0.060586199164390564, "rewards/accuracies": 0.96875, "rewards/chosen": 62.12091827392578, "rewards/margins": 47.8887939453125, "rewards/rejected": 14.228317260742188, "step": 5581 }, { "epoch": 2.889233954451346, "grad_norm": 1.185666799545288, "learning_rate": 3.72267597921272e-08, "loss": 0.061402373015880585, "rewards/accuracies": 0.96875, "rewards/chosen": 65.13929748535156, "rewards/margins": 50.401611328125, "rewards/rejected": 14.73980712890625, "step": 5582 }, { "epoch": 2.889751552795031, "grad_norm": 0.9035637974739075, "learning_rate": 3.6880086368863424e-08, "loss": 0.08105297386646271, "rewards/accuracies": 0.96875, "rewards/chosen": 56.275146484375, "rewards/margins": 46.59215545654297, "rewards/rejected": 9.682610511779785, "step": 5583 }, { "epoch": 2.8902691511387166, "grad_norm": 0.85116046667099, "learning_rate": 3.653502872507708e-08, "loss": 0.0728478729724884, "rewards/accuracies": 0.96875, "rewards/chosen": 57.53851318359375, "rewards/margins": 43.03388977050781, "rewards/rejected": 14.503746032714844, "step": 5584 }, { "epoch": 2.8907867494824018, "grad_norm": 0.5672867298126221, "learning_rate": 3.619158697310443e-08, "loss": 0.04691148176789284, "rewards/accuracies": 0.96875, "rewards/chosen": 60.242889404296875, "rewards/margins": 47.188751220703125, "rewards/rejected": 13.047687530517578, "step": 5585 }, { "epoch": 2.891304347826087, "grad_norm": 1.586334228515625, "learning_rate": 3.5849761224755474e-08, "loss": 0.08191590011119843, "rewards/accuracies": 0.96875, "rewards/chosen": 60.369728088378906, "rewards/margins": 46.294036865234375, "rewards/rejected": 14.06805419921875, "step": 5586 }, { "epoch": 2.891821946169772, "grad_norm": 0.9363517761230469, "learning_rate": 3.5509551591313976e-08, "loss": 0.07637365162372589, "rewards/accuracies": 0.96875, "rewards/chosen": 55.83121871948242, "rewards/margins": 43.40801239013672, "rewards/rejected": 12.421707153320312, "step": 5587 }, { "epoch": 2.8923395445134576, "grad_norm": 1.0694258213043213, "learning_rate": 3.517095818353744e-08, "loss": 0.09153028577566147, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.718605041503906, "rewards/margins": 42.846954345703125, "rewards/rejected": 10.856498718261719, "step": 5588 }, { "epoch": 2.892857142857143, "grad_norm": 0.9735698103904724, "learning_rate": 3.48339811116577e-08, "loss": 0.07655767351388931, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.05607986450195, "rewards/margins": 45.896392822265625, "rewards/rejected": 11.155685424804688, "step": 5589 }, { "epoch": 2.8933747412008284, "grad_norm": 1.1164981126785278, "learning_rate": 3.449862048537978e-08, "loss": 0.1123322993516922, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.026268005371094, "rewards/margins": 48.11480712890625, "rewards/rejected": 10.905326843261719, "step": 5590 }, { "epoch": 2.8938923395445135, "grad_norm": 1.5424315929412842, "learning_rate": 3.4164876413883554e-08, "loss": 0.12541651725769043, "rewards/accuracies": 0.921875, "rewards/chosen": 56.62041473388672, "rewards/margins": 43.34611129760742, "rewards/rejected": 13.267536163330078, "step": 5591 }, { "epoch": 2.8944099378881987, "grad_norm": 1.9002379179000854, "learning_rate": 3.383274900582101e-08, "loss": 0.10248536616563797, "rewards/accuracies": 0.953125, "rewards/chosen": 55.21430206298828, "rewards/margins": 42.79547119140625, "rewards/rejected": 12.427738189697266, "step": 5592 }, { "epoch": 2.894927536231884, "grad_norm": 1.2015635967254639, "learning_rate": 3.350223836931843e-08, "loss": 0.1104002594947815, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.56194305419922, "rewards/margins": 42.67460632324219, "rewards/rejected": 12.893811225891113, "step": 5593 }, { "epoch": 2.8954451345755694, "grad_norm": 0.8132580518722534, "learning_rate": 3.317334461197641e-08, "loss": 0.08048715442419052, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.849090576171875, "rewards/margins": 43.30174255371094, "rewards/rejected": 10.535340309143066, "step": 5594 }, { "epoch": 2.8959627329192545, "grad_norm": 0.7972667813301086, "learning_rate": 3.2846067840868744e-08, "loss": 0.07282394915819168, "rewards/accuracies": 0.96875, "rewards/chosen": 56.37298583984375, "rewards/margins": 44.10992431640625, "rewards/rejected": 12.267780303955078, "step": 5595 }, { "epoch": 2.89648033126294, "grad_norm": 0.8298760652542114, "learning_rate": 3.252040816254298e-08, "loss": 0.08563174307346344, "rewards/accuracies": 0.953125, "rewards/chosen": 55.636566162109375, "rewards/margins": 42.58464813232422, "rewards/rejected": 13.051883697509766, "step": 5596 }, { "epoch": 2.8969979296066253, "grad_norm": 1.5290112495422363, "learning_rate": 3.219636568301876e-08, "loss": 0.12043602764606476, "rewards/accuracies": 0.953125, "rewards/chosen": 51.146095275878906, "rewards/margins": 39.9617919921875, "rewards/rejected": 11.171491622924805, "step": 5597 }, { "epoch": 2.8975155279503104, "grad_norm": 0.7265766263008118, "learning_rate": 3.187394050779169e-08, "loss": 0.07183484733104706, "rewards/accuracies": 0.96875, "rewards/chosen": 58.367950439453125, "rewards/margins": 44.39042663574219, "rewards/rejected": 13.97157096862793, "step": 5598 }, { "epoch": 2.898033126293996, "grad_norm": 1.7122281789779663, "learning_rate": 3.155313274182947e-08, "loss": 0.13336020708084106, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.861366271972656, "rewards/margins": 44.93243408203125, "rewards/rejected": 11.916698455810547, "step": 5599 }, { "epoch": 2.898550724637681, "grad_norm": 3.2526915073394775, "learning_rate": 3.123394248957245e-08, "loss": 0.06907518208026886, "rewards/accuracies": 0.96875, "rewards/chosen": 62.5104866027832, "rewards/margins": 50.345489501953125, "rewards/rejected": 12.154016494750977, "step": 5600 }, { "epoch": 2.8990683229813663, "grad_norm": 1.3048490285873413, "learning_rate": 3.09163698549364e-08, "loss": 0.1184011921286583, "rewards/accuracies": 0.953125, "rewards/chosen": 56.31900405883789, "rewards/margins": 44.84004211425781, "rewards/rejected": 11.481147766113281, "step": 5601 }, { "epoch": 2.899585921325052, "grad_norm": 0.8944865465164185, "learning_rate": 3.0600414941309166e-08, "loss": 0.0932832658290863, "rewards/accuracies": 0.9609375, "rewards/chosen": 49.00701904296875, "rewards/margins": 40.294891357421875, "rewards/rejected": 8.716850280761719, "step": 5602 }, { "epoch": 2.900103519668737, "grad_norm": 0.7649922370910645, "learning_rate": 3.02860778515518e-08, "loss": 0.07419103384017944, "rewards/accuracies": 0.953125, "rewards/chosen": 64.97056579589844, "rewards/margins": 48.42303466796875, "rewards/rejected": 16.56031036376953, "step": 5603 }, { "epoch": 2.900621118012422, "grad_norm": 0.6877447366714478, "learning_rate": 2.9973358687998556e-08, "loss": 0.08089730143547058, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.69960021972656, "rewards/margins": 43.3135986328125, "rewards/rejected": 14.38608169555664, "step": 5604 }, { "epoch": 2.9011387163561078, "grad_norm": 1.3932111263275146, "learning_rate": 2.9662257552458553e-08, "loss": 0.08405230939388275, "rewards/accuracies": 0.953125, "rewards/chosen": 58.51908874511719, "rewards/margins": 47.11370849609375, "rewards/rejected": 11.421915054321289, "step": 5605 }, { "epoch": 2.901656314699793, "grad_norm": 2.9599034786224365, "learning_rate": 2.9352774546212438e-08, "loss": 0.11410864442586899, "rewards/accuracies": 0.9296875, "rewards/chosen": 58.88807678222656, "rewards/margins": 45.604461669921875, "rewards/rejected": 13.281410217285156, "step": 5606 }, { "epoch": 2.9021739130434785, "grad_norm": 1.0961703062057495, "learning_rate": 2.9044909770014617e-08, "loss": 0.07837417721748352, "rewards/accuracies": 0.9765625, "rewards/chosen": 59.245140075683594, "rewards/margins": 47.2730712890625, "rewards/rejected": 11.962669372558594, "step": 5607 }, { "epoch": 2.9026915113871636, "grad_norm": 0.5875126123428345, "learning_rate": 2.8738663324092698e-08, "loss": 0.05323733389377594, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.44395446777344, "rewards/margins": 48.35015869140625, "rewards/rejected": 9.114681243896484, "step": 5608 }, { "epoch": 2.903209109730849, "grad_norm": 0.8019550442695618, "learning_rate": 2.843403530814748e-08, "loss": 0.09172870218753815, "rewards/accuracies": 0.953125, "rewards/chosen": 51.342384338378906, "rewards/margins": 41.69451904296875, "rewards/rejected": 9.65799331665039, "step": 5609 }, { "epoch": 2.903726708074534, "grad_norm": 0.8607149720191956, "learning_rate": 2.8131025821352963e-08, "loss": 0.0742231011390686, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.6005744934082, "rewards/margins": 48.643035888671875, "rewards/rejected": 14.954734802246094, "step": 5610 }, { "epoch": 2.9042443064182195, "grad_norm": 2.5992822647094727, "learning_rate": 2.78296349623558e-08, "loss": 0.08730781078338623, "rewards/accuracies": 0.9609375, "rewards/chosen": 50.47222137451172, "rewards/margins": 38.574310302734375, "rewards/rejected": 11.89703369140625, "step": 5611 }, { "epoch": 2.9047619047619047, "grad_norm": 1.2536925077438354, "learning_rate": 2.7529862829275833e-08, "loss": 0.09615087509155273, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.453025817871094, "rewards/margins": 44.490234375, "rewards/rejected": 10.973228454589844, "step": 5612 }, { "epoch": 2.9052795031055902, "grad_norm": 1.3208082914352417, "learning_rate": 2.7231709519706662e-08, "loss": 0.09586392343044281, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.35264587402344, "rewards/margins": 41.747314453125, "rewards/rejected": 12.60980224609375, "step": 5613 }, { "epoch": 2.9057971014492754, "grad_norm": 3.726120710372925, "learning_rate": 2.693517513071342e-08, "loss": 0.10869362950325012, "rewards/accuracies": 0.953125, "rewards/chosen": 54.864524841308594, "rewards/margins": 42.032989501953125, "rewards/rejected": 12.843246459960938, "step": 5614 }, { "epoch": 2.9063146997929605, "grad_norm": 0.7108193039894104, "learning_rate": 2.664025975883555e-08, "loss": 0.06473270058631897, "rewards/accuracies": 0.96875, "rewards/chosen": 57.45410919189453, "rewards/margins": 44.211669921875, "rewards/rejected": 13.258657455444336, "step": 5615 }, { "epoch": 2.906832298136646, "grad_norm": 3.729132652282715, "learning_rate": 2.634696350008459e-08, "loss": 0.08142191171646118, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.080841064453125, "rewards/margins": 48.53033447265625, "rewards/rejected": 13.543620109558105, "step": 5616 }, { "epoch": 2.9073498964803313, "grad_norm": 1.0627654790878296, "learning_rate": 2.6055286449945817e-08, "loss": 0.12479203194379807, "rewards/accuracies": 0.90625, "rewards/chosen": 58.59233093261719, "rewards/margins": 44.429779052734375, "rewards/rejected": 14.15730094909668, "step": 5617 }, { "epoch": 2.9078674948240164, "grad_norm": 1.560468077659607, "learning_rate": 2.576522870337661e-08, "loss": 0.10178747773170471, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.37620544433594, "rewards/margins": 44.419273376464844, "rewards/rejected": 10.95977783203125, "step": 5618 }, { "epoch": 2.908385093167702, "grad_norm": 1.5734846591949463, "learning_rate": 2.5476790354806434e-08, "loss": 0.05645617097616196, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.45362854003906, "rewards/margins": 43.90312194824219, "rewards/rejected": 10.54928207397461, "step": 5619 }, { "epoch": 2.908902691511387, "grad_norm": 0.5993935465812683, "learning_rate": 2.518997149813962e-08, "loss": 0.03876642882823944, "rewards/accuracies": 1.0, "rewards/chosen": 61.80276107788086, "rewards/margins": 48.1082763671875, "rewards/rejected": 13.682205200195312, "step": 5620 }, { "epoch": 2.9094202898550723, "grad_norm": 2.276676654815674, "learning_rate": 2.490477222675147e-08, "loss": 0.10077846795320511, "rewards/accuracies": 0.9375, "rewards/chosen": 55.0585823059082, "rewards/margins": 41.018402099609375, "rewards/rejected": 14.025985717773438, "step": 5621 }, { "epoch": 2.909937888198758, "grad_norm": 1.524064064025879, "learning_rate": 2.4621192633491052e-08, "loss": 0.14412075281143188, "rewards/accuracies": 0.9296875, "rewards/chosen": 55.107460021972656, "rewards/margins": 43.31256103515625, "rewards/rejected": 11.789018630981445, "step": 5622 }, { "epoch": 2.910455486542443, "grad_norm": 0.6595304608345032, "learning_rate": 2.4339232810679512e-08, "loss": 0.04075494408607483, "rewards/accuracies": 0.9921875, "rewards/chosen": 59.61547088623047, "rewards/margins": 48.243896484375, "rewards/rejected": 11.361597061157227, "step": 5623 }, { "epoch": 2.9109730848861286, "grad_norm": 0.5566222071647644, "learning_rate": 2.4058892850111203e-08, "loss": 0.06518220901489258, "rewards/accuracies": 0.9609375, "rewards/chosen": 68.0516357421875, "rewards/margins": 53.4593505859375, "rewards/rejected": 14.593582153320312, "step": 5624 }, { "epoch": 2.9114906832298137, "grad_norm": 0.6216893196105957, "learning_rate": 2.378017284305256e-08, "loss": 0.049957025796175, "rewards/accuracies": 0.984375, "rewards/chosen": 57.418914794921875, "rewards/margins": 46.215118408203125, "rewards/rejected": 11.201641082763672, "step": 5625 }, { "epoch": 2.912008281573499, "grad_norm": 1.1534433364868164, "learning_rate": 2.3503072880242672e-08, "loss": 0.053915441036224365, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.4073486328125, "rewards/margins": 48.697296142578125, "rewards/rejected": 14.719341278076172, "step": 5626 }, { "epoch": 2.912525879917184, "grad_norm": 0.9051361680030823, "learning_rate": 2.3227593051893815e-08, "loss": 0.04716076701879501, "rewards/accuracies": 0.984375, "rewards/chosen": 59.14179992675781, "rewards/margins": 47.55291748046875, "rewards/rejected": 11.598358154296875, "step": 5627 }, { "epoch": 2.9130434782608696, "grad_norm": 1.63132905960083, "learning_rate": 2.2953733447690363e-08, "loss": 0.17605651915073395, "rewards/accuracies": 0.8828125, "rewards/chosen": 54.110382080078125, "rewards/margins": 42.093597412109375, "rewards/rejected": 12.01947021484375, "step": 5628 }, { "epoch": 2.9135610766045548, "grad_norm": 1.026187777519226, "learning_rate": 2.268149415678933e-08, "loss": 0.09113948792219162, "rewards/accuracies": 0.953125, "rewards/chosen": 54.12541198730469, "rewards/margins": 44.17559814453125, "rewards/rejected": 9.94356918334961, "step": 5629 }, { "epoch": 2.9140786749482404, "grad_norm": 2.136094331741333, "learning_rate": 2.241087526781982e-08, "loss": 0.0876106545329094, "rewards/accuracies": 0.984375, "rewards/chosen": 57.917877197265625, "rewards/margins": 44.21722412109375, "rewards/rejected": 13.682769775390625, "step": 5630 }, { "epoch": 2.9145962732919255, "grad_norm": 0.6055499911308289, "learning_rate": 2.2141876868884138e-08, "loss": 0.0472574457526207, "rewards/accuracies": 0.984375, "rewards/chosen": 53.53612518310547, "rewards/margins": 42.686859130859375, "rewards/rejected": 10.836481094360352, "step": 5631 }, { "epoch": 2.9151138716356106, "grad_norm": 1.1685606241226196, "learning_rate": 2.187449904755612e-08, "loss": 0.1210370808839798, "rewards/accuracies": 0.953125, "rewards/chosen": 54.807838439941406, "rewards/margins": 42.00390625, "rewards/rejected": 12.80099105834961, "step": 5632 }, { "epoch": 2.9156314699792962, "grad_norm": 0.7894313335418701, "learning_rate": 2.1608741890883355e-08, "loss": 0.05167636275291443, "rewards/accuracies": 0.96875, "rewards/chosen": 60.71021270751953, "rewards/margins": 45.9581298828125, "rewards/rejected": 14.749580383300781, "step": 5633 }, { "epoch": 2.9161490683229814, "grad_norm": 0.7673553228378296, "learning_rate": 2.134460548538442e-08, "loss": 0.08414120227098465, "rewards/accuracies": 0.96875, "rewards/chosen": 58.27588653564453, "rewards/margins": 46.0546875, "rewards/rejected": 12.21269416809082, "step": 5634 }, { "epoch": 2.9166666666666665, "grad_norm": 1.7103242874145508, "learning_rate": 2.1082089917051075e-08, "loss": 0.11659396439790726, "rewards/accuracies": 0.9453125, "rewards/chosen": 46.62184524536133, "rewards/margins": 37.854766845703125, "rewards/rejected": 8.756479263305664, "step": 5635 }, { "epoch": 2.917184265010352, "grad_norm": 1.606942892074585, "learning_rate": 2.0821195271347184e-08, "loss": 0.10944537073373795, "rewards/accuracies": 0.953125, "rewards/chosen": 64.6335678100586, "rewards/margins": 51.18104553222656, "rewards/rejected": 13.454643249511719, "step": 5636 }, { "epoch": 2.9177018633540373, "grad_norm": 0.4746229946613312, "learning_rate": 2.0561921633209247e-08, "loss": 0.04754596948623657, "rewards/accuracies": 0.984375, "rewards/chosen": 61.150691986083984, "rewards/margins": 45.72412109375, "rewards/rejected": 15.426570892333984, "step": 5637 }, { "epoch": 2.9182194616977224, "grad_norm": 0.7511588335037231, "learning_rate": 2.0304269087044193e-08, "loss": 0.07021354138851166, "rewards/accuracies": 0.9609375, "rewards/chosen": 65.47213745117188, "rewards/margins": 51.085693359375, "rewards/rejected": 14.380960464477539, "step": 5638 }, { "epoch": 2.918737060041408, "grad_norm": 0.5560271143913269, "learning_rate": 2.0048237716734363e-08, "loss": 0.05702635645866394, "rewards/accuracies": 0.984375, "rewards/chosen": 61.5987548828125, "rewards/margins": 47.3404541015625, "rewards/rejected": 14.24365234375, "step": 5639 }, { "epoch": 2.919254658385093, "grad_norm": 0.7539120316505432, "learning_rate": 1.9793827605631978e-08, "loss": 0.0901733785867691, "rewards/accuracies": 0.9453125, "rewards/chosen": 53.4271240234375, "rewards/margins": 42.552642822265625, "rewards/rejected": 10.878767013549805, "step": 5640 }, { "epoch": 2.9197722567287787, "grad_norm": 0.9470324516296387, "learning_rate": 1.9541038836561888e-08, "loss": 0.07918094098567963, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.02275085449219, "rewards/margins": 44.78619384765625, "rewards/rejected": 11.246391296386719, "step": 5641 }, { "epoch": 2.920289855072464, "grad_norm": 0.7914834022521973, "learning_rate": 1.9289871491821043e-08, "loss": 0.08936323970556259, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.85806655883789, "rewards/margins": 46.9144287109375, "rewards/rejected": 11.947490692138672, "step": 5642 }, { "epoch": 2.920807453416149, "grad_norm": 0.6545733213424683, "learning_rate": 1.9040325653179592e-08, "loss": 0.06344549357891083, "rewards/accuracies": 0.953125, "rewards/chosen": 58.05015563964844, "rewards/margins": 45.308624267578125, "rewards/rejected": 12.739093780517578, "step": 5643 }, { "epoch": 2.921325051759834, "grad_norm": 0.9248182773590088, "learning_rate": 1.8792401401878656e-08, "loss": 0.08424465358257294, "rewards/accuracies": 0.96875, "rewards/chosen": 54.488922119140625, "rewards/margins": 41.947784423828125, "rewards/rejected": 12.548748016357422, "step": 5644 }, { "epoch": 2.9218426501035197, "grad_norm": 1.6099132299423218, "learning_rate": 1.8546098818631454e-08, "loss": 0.05881500244140625, "rewards/accuracies": 0.984375, "rewards/chosen": 56.72186279296875, "rewards/margins": 44.15873718261719, "rewards/rejected": 12.545677185058594, "step": 5645 }, { "epoch": 2.922360248447205, "grad_norm": 0.9582598209381104, "learning_rate": 1.8301417983623836e-08, "loss": 0.07248017191886902, "rewards/accuracies": 0.96875, "rewards/chosen": 60.920440673828125, "rewards/margins": 47.79669189453125, "rewards/rejected": 13.118209838867188, "step": 5646 }, { "epoch": 2.9228778467908905, "grad_norm": 0.6662882566452026, "learning_rate": 1.8058358976513756e-08, "loss": 0.07665311545133591, "rewards/accuracies": 0.953125, "rewards/chosen": 55.20301818847656, "rewards/margins": 41.970306396484375, "rewards/rejected": 13.230812072753906, "step": 5647 }, { "epoch": 2.9233954451345756, "grad_norm": 1.3458082675933838, "learning_rate": 1.7816921876430137e-08, "loss": 0.11008716374635696, "rewards/accuracies": 0.921875, "rewards/chosen": 53.015899658203125, "rewards/margins": 42.30650329589844, "rewards/rejected": 10.714561462402344, "step": 5648 }, { "epoch": 2.9239130434782608, "grad_norm": 3.1864397525787354, "learning_rate": 1.757710676197455e-08, "loss": 0.08693075180053711, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.31499481201172, "rewards/margins": 45.00958251953125, "rewards/rejected": 13.303291320800781, "step": 5649 }, { "epoch": 2.924430641821946, "grad_norm": 1.7432924509048462, "learning_rate": 1.7338913711221206e-08, "loss": 0.09519554674625397, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.18201446533203, "rewards/margins": 44.17051696777344, "rewards/rejected": 12.015899658203125, "step": 5650 }, { "epoch": 2.9249482401656315, "grad_norm": 0.8954786658287048, "learning_rate": 1.7102342801715298e-08, "loss": 0.07315725088119507, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.897830963134766, "rewards/margins": 46.148895263671875, "rewards/rejected": 14.723512649536133, "step": 5651 }, { "epoch": 2.9254658385093166, "grad_norm": 1.3898396492004395, "learning_rate": 1.686739411047411e-08, "loss": 0.11942867934703827, "rewards/accuracies": 0.921875, "rewards/chosen": 54.27549743652344, "rewards/margins": 42.43589782714844, "rewards/rejected": 11.828174591064453, "step": 5652 }, { "epoch": 2.925983436853002, "grad_norm": 0.7433719635009766, "learning_rate": 1.6634067713986458e-08, "loss": 0.0774025022983551, "rewards/accuracies": 0.96875, "rewards/chosen": 61.909637451171875, "rewards/margins": 48.8111572265625, "rewards/rejected": 13.09518051147461, "step": 5653 }, { "epoch": 2.9265010351966874, "grad_norm": 0.9457144141197205, "learning_rate": 1.6402363688214352e-08, "loss": 0.06784668564796448, "rewards/accuracies": 0.96875, "rewards/chosen": 64.81230163574219, "rewards/margins": 50.97235107421875, "rewards/rejected": 13.841773986816406, "step": 5654 }, { "epoch": 2.9270186335403725, "grad_norm": 1.5814106464385986, "learning_rate": 1.617228210859023e-08, "loss": 0.10215280950069427, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.964019775390625, "rewards/margins": 45.860382080078125, "rewards/rejected": 11.102025985717773, "step": 5655 }, { "epoch": 2.927536231884058, "grad_norm": 0.786054790019989, "learning_rate": 1.5943823050018626e-08, "loss": 0.06715309619903564, "rewards/accuracies": 0.9765625, "rewards/chosen": 58.98072052001953, "rewards/margins": 42.847808837890625, "rewards/rejected": 16.129894256591797, "step": 5656 }, { "epoch": 2.9280538302277432, "grad_norm": 1.484321117401123, "learning_rate": 1.5716986586876148e-08, "loss": 0.08504379540681839, "rewards/accuracies": 0.96875, "rewards/chosen": 56.87493896484375, "rewards/margins": 43.81683349609375, "rewards/rejected": 13.04660415649414, "step": 5657 }, { "epoch": 2.928571428571429, "grad_norm": 1.0099273920059204, "learning_rate": 1.5491772793011506e-08, "loss": 0.08614270389080048, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.80451965332031, "rewards/margins": 49.621116638183594, "rewards/rejected": 12.187188148498535, "step": 5658 }, { "epoch": 2.929089026915114, "grad_norm": 0.9233226180076599, "learning_rate": 1.5268181741743827e-08, "loss": 0.057442475110292435, "rewards/accuracies": 0.96875, "rewards/chosen": 64.87468719482422, "rewards/margins": 51.98712158203125, "rewards/rejected": 12.882623672485352, "step": 5659 }, { "epoch": 2.929606625258799, "grad_norm": 0.6071536540985107, "learning_rate": 1.5046213505865437e-08, "loss": 0.0491383895277977, "rewards/accuracies": 0.96875, "rewards/chosen": 56.59318542480469, "rewards/margins": 46.684417724609375, "rewards/rejected": 9.909431457519531, "step": 5660 }, { "epoch": 2.9301242236024843, "grad_norm": 0.5536086559295654, "learning_rate": 1.4825868157639645e-08, "loss": 0.04694947600364685, "rewards/accuracies": 0.9765625, "rewards/chosen": 63.67189025878906, "rewards/margins": 50.57281494140625, "rewards/rejected": 13.087890625, "step": 5661 }, { "epoch": 2.93064182194617, "grad_norm": 0.9258759617805481, "learning_rate": 1.4607145768800734e-08, "loss": 0.07954436540603638, "rewards/accuracies": 0.9453125, "rewards/chosen": 63.93049621582031, "rewards/margins": 49.96028137207031, "rewards/rejected": 13.958877563476562, "step": 5662 }, { "epoch": 2.931159420289855, "grad_norm": 0.7969869375228882, "learning_rate": 1.4390046410556745e-08, "loss": 0.1069803461432457, "rewards/accuracies": 0.9296875, "rewards/chosen": 62.12041473388672, "rewards/margins": 49.92474365234375, "rewards/rejected": 12.190868377685547, "step": 5663 }, { "epoch": 2.9316770186335406, "grad_norm": 0.8881180286407471, "learning_rate": 1.4174570153584477e-08, "loss": 0.10782287269830704, "rewards/accuracies": 0.953125, "rewards/chosen": 54.069114685058594, "rewards/margins": 40.1854248046875, "rewards/rejected": 13.875693321228027, "step": 5664 }, { "epoch": 2.9321946169772257, "grad_norm": 0.7158804535865784, "learning_rate": 1.3960717068035034e-08, "loss": 0.08977653831243515, "rewards/accuracies": 0.953125, "rewards/chosen": 54.67146301269531, "rewards/margins": 43.6212158203125, "rewards/rejected": 11.060415267944336, "step": 5665 }, { "epoch": 2.932712215320911, "grad_norm": 1.0898292064666748, "learning_rate": 1.374848722352884e-08, "loss": 0.11734512448310852, "rewards/accuracies": 0.9375, "rewards/chosen": 55.7969970703125, "rewards/margins": 45.22900390625, "rewards/rejected": 10.566570281982422, "step": 5666 }, { "epoch": 2.933229813664596, "grad_norm": 0.8913239240646362, "learning_rate": 1.3537880689158955e-08, "loss": 0.07488246262073517, "rewards/accuracies": 0.96875, "rewards/chosen": 57.04703140258789, "rewards/margins": 46.057411193847656, "rewards/rejected": 10.992445945739746, "step": 5667 }, { "epoch": 2.9337474120082816, "grad_norm": 0.9101501703262329, "learning_rate": 1.3328897533490536e-08, "loss": 0.08614401519298553, "rewards/accuracies": 0.96875, "rewards/chosen": 56.590030670166016, "rewards/margins": 46.16644287109375, "rewards/rejected": 10.400772094726562, "step": 5668 }, { "epoch": 2.9342650103519667, "grad_norm": 0.44296181201934814, "learning_rate": 1.3121537824559161e-08, "loss": 0.03465249761939049, "rewards/accuracies": 0.9921875, "rewards/chosen": 63.265953063964844, "rewards/margins": 50.33686065673828, "rewards/rejected": 12.952840805053711, "step": 5669 }, { "epoch": 2.9347826086956523, "grad_norm": 2.2255940437316895, "learning_rate": 1.2915801629872494e-08, "loss": 0.09007435292005539, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.95878601074219, "rewards/margins": 41.593505859375, "rewards/rejected": 10.366401672363281, "step": 5670 }, { "epoch": 2.9353002070393375, "grad_norm": 1.429087519645691, "learning_rate": 1.2711689016408623e-08, "loss": 0.07670027762651443, "rewards/accuracies": 0.953125, "rewards/chosen": 60.56776809692383, "rewards/margins": 47.22520446777344, "rewards/rejected": 13.323783874511719, "step": 5671 }, { "epoch": 2.9358178053830226, "grad_norm": 1.117038369178772, "learning_rate": 1.2509200050618842e-08, "loss": 0.0649324581027031, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.808319091796875, "rewards/margins": 47.34185791015625, "rewards/rejected": 15.476909637451172, "step": 5672 }, { "epoch": 2.936335403726708, "grad_norm": 1.1406691074371338, "learning_rate": 1.2308334798424859e-08, "loss": 0.11305303871631622, "rewards/accuracies": 0.953125, "rewards/chosen": 53.96946716308594, "rewards/margins": 42.732208251953125, "rewards/rejected": 11.242389678955078, "step": 5673 }, { "epoch": 2.9368530020703933, "grad_norm": 1.4370006322860718, "learning_rate": 1.2109093325218812e-08, "loss": 0.1403411626815796, "rewards/accuracies": 0.9375, "rewards/chosen": 54.10494613647461, "rewards/margins": 42.40898132324219, "rewards/rejected": 11.69515609741211, "step": 5674 }, { "epoch": 2.937370600414079, "grad_norm": 0.8206685185432434, "learning_rate": 1.1911475695866037e-08, "loss": 0.08010869473218918, "rewards/accuracies": 0.984375, "rewards/chosen": 64.40243530273438, "rewards/margins": 51.062255859375, "rewards/rejected": 13.339241027832031, "step": 5675 }, { "epoch": 2.937888198757764, "grad_norm": 1.0624569654464722, "learning_rate": 1.1715481974702847e-08, "loss": 0.07419701665639877, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.65398406982422, "rewards/margins": 45.1866455078125, "rewards/rejected": 9.484743118286133, "step": 5676 }, { "epoch": 2.9384057971014492, "grad_norm": 0.6975368857383728, "learning_rate": 1.1521112225535425e-08, "loss": 0.07806611061096191, "rewards/accuracies": 0.953125, "rewards/chosen": 55.13139724731445, "rewards/margins": 45.312408447265625, "rewards/rejected": 9.818305969238281, "step": 5677 }, { "epoch": 2.9389233954451344, "grad_norm": 2.1513383388519287, "learning_rate": 1.1328366511642041e-08, "loss": 0.11422888934612274, "rewards/accuracies": 0.9453125, "rewards/chosen": 56.76634979248047, "rewards/margins": 44.73841857910156, "rewards/rejected": 12.040184020996094, "step": 5678 }, { "epoch": 2.93944099378882, "grad_norm": 1.121942400932312, "learning_rate": 1.1137244895773613e-08, "loss": 0.09970741719007492, "rewards/accuracies": 0.9453125, "rewards/chosen": 55.147308349609375, "rewards/margins": 43.64617919921875, "rewards/rejected": 11.49421501159668, "step": 5679 }, { "epoch": 2.939958592132505, "grad_norm": 1.0148506164550781, "learning_rate": 1.094774744015037e-08, "loss": 0.08607462048530579, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.393157958984375, "rewards/margins": 41.160552978515625, "rewards/rejected": 9.236335754394531, "step": 5680 }, { "epoch": 2.9404761904761907, "grad_norm": 2.4822750091552734, "learning_rate": 1.0759874206464626e-08, "loss": 0.11283230781555176, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.55230712890625, "rewards/margins": 44.137115478515625, "rewards/rejected": 12.402900695800781, "step": 5681 }, { "epoch": 2.940993788819876, "grad_norm": 1.288482904434204, "learning_rate": 1.0573625255880238e-08, "loss": 0.1008797138929367, "rewards/accuracies": 0.953125, "rewards/chosen": 56.11484146118164, "rewards/margins": 42.400848388671875, "rewards/rejected": 13.700639724731445, "step": 5682 }, { "epoch": 2.941511387163561, "grad_norm": 1.143914818763733, "learning_rate": 1.0389000649031478e-08, "loss": 0.1254817396402359, "rewards/accuracies": 0.9453125, "rewards/chosen": 48.77617263793945, "rewards/margins": 38.90245056152344, "rewards/rejected": 9.876688003540039, "step": 5683 }, { "epoch": 2.942028985507246, "grad_norm": 0.9410479664802551, "learning_rate": 1.020600044602471e-08, "loss": 0.07849752902984619, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.483070373535156, "rewards/margins": 43.77008056640625, "rewards/rejected": 10.71367073059082, "step": 5684 }, { "epoch": 2.9425465838509317, "grad_norm": 0.8124933242797852, "learning_rate": 1.0024624706436726e-08, "loss": 0.10762141644954681, "rewards/accuracies": 0.9296875, "rewards/chosen": 51.02600860595703, "rewards/margins": 40.86925506591797, "rewards/rejected": 10.167243957519531, "step": 5685 }, { "epoch": 2.943064182194617, "grad_norm": 3.3564493656158447, "learning_rate": 9.844873489315287e-09, "loss": 0.11627166718244553, "rewards/accuracies": 0.9375, "rewards/chosen": 52.60633850097656, "rewards/margins": 40.936309814453125, "rewards/rejected": 11.671348571777344, "step": 5686 }, { "epoch": 2.9435817805383024, "grad_norm": 0.8135483860969543, "learning_rate": 9.666746853179698e-09, "loss": 0.10640697181224823, "rewards/accuracies": 0.953125, "rewards/chosen": 57.97676086425781, "rewards/margins": 45.0240592956543, "rewards/rejected": 12.940011024475098, "step": 5687 }, { "epoch": 2.9440993788819876, "grad_norm": 0.6079220771789551, "learning_rate": 9.490244856021348e-09, "loss": 0.0515822172164917, "rewards/accuracies": 0.984375, "rewards/chosen": 53.53294372558594, "rewards/margins": 43.9051513671875, "rewards/rejected": 9.623176574707031, "step": 5688 }, { "epoch": 2.9446169772256727, "grad_norm": 0.8485791683197021, "learning_rate": 9.315367555300936e-09, "loss": 0.07037629187107086, "rewards/accuracies": 0.96875, "rewards/chosen": 48.5487060546875, "rewards/margins": 37.103302001953125, "rewards/rejected": 11.449050903320312, "step": 5689 }, { "epoch": 2.9451345755693583, "grad_norm": 1.5543582439422607, "learning_rate": 9.142115007951257e-09, "loss": 0.09467124938964844, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.67499923706055, "rewards/margins": 44.985748291015625, "rewards/rejected": 9.6947021484375, "step": 5690 }, { "epoch": 2.9456521739130435, "grad_norm": 1.8294638395309448, "learning_rate": 8.970487270375527e-09, "loss": 0.058359295129776, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.97865295410156, "rewards/margins": 44.52880859375, "rewards/rejected": 13.453056335449219, "step": 5691 }, { "epoch": 2.946169772256729, "grad_norm": 1.2355955839157104, "learning_rate": 8.80048439844905e-09, "loss": 0.05014823004603386, "rewards/accuracies": 0.984375, "rewards/chosen": 62.49501037597656, "rewards/margins": 48.683082580566406, "rewards/rejected": 13.797630310058594, "step": 5692 }, { "epoch": 2.946687370600414, "grad_norm": 0.8167494535446167, "learning_rate": 8.632106447516997e-09, "loss": 0.1135902851819992, "rewards/accuracies": 0.9375, "rewards/chosen": 50.67762756347656, "rewards/margins": 42.938873291015625, "rewards/rejected": 7.7347869873046875, "step": 5693 }, { "epoch": 2.9472049689440993, "grad_norm": 1.2426804304122925, "learning_rate": 8.46535347239663e-09, "loss": 0.08726485073566437, "rewards/accuracies": 0.953125, "rewards/chosen": 59.47160720825195, "rewards/margins": 46.37724685668945, "rewards/rejected": 13.091117858886719, "step": 5694 }, { "epoch": 2.9477225672877845, "grad_norm": 2.86916184425354, "learning_rate": 8.300225527374527e-09, "loss": 0.09786427766084671, "rewards/accuracies": 0.9609375, "rewards/chosen": 56.85264587402344, "rewards/margins": 45.735687255859375, "rewards/rejected": 11.134483337402344, "step": 5695 }, { "epoch": 2.94824016563147, "grad_norm": 1.3749850988388062, "learning_rate": 8.13672266621046e-09, "loss": 0.07960692048072815, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.956336975097656, "rewards/margins": 43.273040771484375, "rewards/rejected": 12.676918029785156, "step": 5696 }, { "epoch": 2.948757763975155, "grad_norm": 0.8996254205703735, "learning_rate": 7.974844942132964e-09, "loss": 0.0492311455309391, "rewards/accuracies": 0.9765625, "rewards/chosen": 60.026424407958984, "rewards/margins": 46.3782958984375, "rewards/rejected": 13.652812957763672, "step": 5697 }, { "epoch": 2.949275362318841, "grad_norm": 0.9427022933959961, "learning_rate": 7.814592407843214e-09, "loss": 0.09948144853115082, "rewards/accuracies": 0.9609375, "rewards/chosen": 63.734439849853516, "rewards/margins": 47.56375503540039, "rewards/rejected": 16.168495178222656, "step": 5698 }, { "epoch": 2.949792960662526, "grad_norm": 0.6145135760307312, "learning_rate": 7.655965115512253e-09, "loss": 0.04380860924720764, "rewards/accuracies": 0.9921875, "rewards/chosen": 58.328887939453125, "rewards/margins": 45.44073486328125, "rewards/rejected": 12.87516975402832, "step": 5699 }, { "epoch": 2.950310559006211, "grad_norm": 0.8328843116760254, "learning_rate": 7.498963116782665e-09, "loss": 0.06721863895654678, "rewards/accuracies": 0.96875, "rewards/chosen": 56.71790313720703, "rewards/margins": 45.88731384277344, "rewards/rejected": 10.826729774475098, "step": 5700 }, { "epoch": 2.9508281573498962, "grad_norm": 3.1153032779693604, "learning_rate": 7.343586462766894e-09, "loss": 0.089413583278656, "rewards/accuracies": 0.9375, "rewards/chosen": 57.287025451660156, "rewards/margins": 42.942413330078125, "rewards/rejected": 14.338401794433594, "step": 5701 }, { "epoch": 2.951345755693582, "grad_norm": 0.7972334027290344, "learning_rate": 7.189835204049478e-09, "loss": 0.08245943486690521, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.73332977294922, "rewards/margins": 44.831031799316406, "rewards/rejected": 13.903429985046387, "step": 5702 }, { "epoch": 2.951863354037267, "grad_norm": 0.8827716112136841, "learning_rate": 7.037709390684821e-09, "loss": 0.0954705998301506, "rewards/accuracies": 0.953125, "rewards/chosen": 58.113426208496094, "rewards/margins": 44.55059814453125, "rewards/rejected": 13.565431594848633, "step": 5703 }, { "epoch": 2.9523809523809526, "grad_norm": 0.5427876710891724, "learning_rate": 6.887209072199419e-09, "loss": 0.032485514879226685, "rewards/accuracies": 1.0, "rewards/chosen": 53.72187805175781, "rewards/margins": 45.57636642456055, "rewards/rejected": 8.142797470092773, "step": 5704 }, { "epoch": 2.9528985507246377, "grad_norm": 0.9463692307472229, "learning_rate": 6.7383342975890774e-09, "loss": 0.10872013121843338, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.46964645385742, "rewards/margins": 47.384521484375, "rewards/rejected": 14.092641830444336, "step": 5705 }, { "epoch": 2.953416149068323, "grad_norm": 0.9397927522659302, "learning_rate": 6.5910851153211385e-09, "loss": 0.08753876388072968, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.96900177001953, "rewards/margins": 46.55170440673828, "rewards/rejected": 12.409769058227539, "step": 5706 }, { "epoch": 2.9539337474120084, "grad_norm": 0.8451319932937622, "learning_rate": 6.445461573333367e-09, "loss": 0.09332780539989471, "rewards/accuracies": 0.9375, "rewards/chosen": 53.68627166748047, "rewards/margins": 44.4066162109375, "rewards/rejected": 9.282402038574219, "step": 5707 }, { "epoch": 2.9544513457556936, "grad_norm": 4.286989212036133, "learning_rate": 6.3014637190350615e-09, "loss": 0.07403987646102905, "rewards/accuracies": 0.96875, "rewards/chosen": 58.50518035888672, "rewards/margins": 46.0333251953125, "rewards/rejected": 12.465705871582031, "step": 5708 }, { "epoch": 2.954968944099379, "grad_norm": 1.0133436918258667, "learning_rate": 6.159091599305389e-09, "loss": 0.051228731870651245, "rewards/accuracies": 0.984375, "rewards/chosen": 60.258689880371094, "rewards/margins": 47.47630310058594, "rewards/rejected": 12.781743049621582, "step": 5709 }, { "epoch": 2.9554865424430643, "grad_norm": 2.2221150398254395, "learning_rate": 6.018345260495051e-09, "loss": 0.14917230606079102, "rewards/accuracies": 0.921875, "rewards/chosen": 58.18992233276367, "rewards/margins": 45.42530822753906, "rewards/rejected": 12.774889945983887, "step": 5710 }, { "epoch": 2.9560041407867494, "grad_norm": 1.324465274810791, "learning_rate": 5.879224748425172e-09, "loss": 0.06544728577136993, "rewards/accuracies": 0.984375, "rewards/chosen": 55.03353500366211, "rewards/margins": 43.01080322265625, "rewards/rejected": 12.016983032226562, "step": 5711 }, { "epoch": 2.9565217391304346, "grad_norm": 0.8249471187591553, "learning_rate": 5.74173010838619e-09, "loss": 0.07325688749551773, "rewards/accuracies": 0.953125, "rewards/chosen": 57.7818603515625, "rewards/margins": 48.540679931640625, "rewards/rejected": 9.240324020385742, "step": 5712 }, { "epoch": 2.95703933747412, "grad_norm": 1.1546549797058105, "learning_rate": 5.605861385141742e-09, "loss": 0.07741276919841766, "rewards/accuracies": 0.953125, "rewards/chosen": 57.265594482421875, "rewards/margins": 44.32623291015625, "rewards/rejected": 12.927444458007812, "step": 5713 }, { "epoch": 2.9575569358178053, "grad_norm": 2.1452627182006836, "learning_rate": 5.471618622924779e-09, "loss": 0.09005239605903625, "rewards/accuracies": 0.96875, "rewards/chosen": 52.81243896484375, "rewards/margins": 42.14886474609375, "rewards/rejected": 10.650833129882812, "step": 5714 }, { "epoch": 2.958074534161491, "grad_norm": 2.6084680557250977, "learning_rate": 5.3390018654381206e-09, "loss": 0.05957332253456116, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.90685272216797, "rewards/margins": 44.18963623046875, "rewards/rejected": 9.7218656539917, "step": 5715 }, { "epoch": 2.958592132505176, "grad_norm": 0.6271867752075195, "learning_rate": 5.208011155856674e-09, "loss": 0.04813750088214874, "rewards/accuracies": 0.96875, "rewards/chosen": 60.46983337402344, "rewards/margins": 46.386932373046875, "rewards/rejected": 14.084800720214844, "step": 5716 }, { "epoch": 2.959109730848861, "grad_norm": 1.0088257789611816, "learning_rate": 5.078646536825771e-09, "loss": 0.07642095535993576, "rewards/accuracies": 0.953125, "rewards/chosen": 58.60260009765625, "rewards/margins": 45.36089324951172, "rewards/rejected": 13.235029220581055, "step": 5717 }, { "epoch": 2.9596273291925463, "grad_norm": 1.649980902671814, "learning_rate": 4.950908050460612e-09, "loss": 0.06955288350582123, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.91949462890625, "rewards/margins": 46.491851806640625, "rewards/rejected": 15.442588806152344, "step": 5718 }, { "epoch": 2.960144927536232, "grad_norm": 0.928658127784729, "learning_rate": 4.824795738346821e-09, "loss": 0.07005282491445541, "rewards/accuracies": 0.96875, "rewards/chosen": 64.43124389648438, "rewards/margins": 48.171356201171875, "rewards/rejected": 16.261262893676758, "step": 5719 }, { "epoch": 2.960662525879917, "grad_norm": 0.7367502450942993, "learning_rate": 4.7003096415421115e-09, "loss": 0.04069124162197113, "rewards/accuracies": 0.984375, "rewards/chosen": 58.4495849609375, "rewards/margins": 44.733795166015625, "rewards/rejected": 13.709169387817383, "step": 5720 }, { "epoch": 2.9611801242236027, "grad_norm": 0.45735877752304077, "learning_rate": 4.577449800573508e-09, "loss": 0.04067133739590645, "rewards/accuracies": 0.984375, "rewards/chosen": 50.41204833984375, "rewards/margins": 39.31230163574219, "rewards/rejected": 11.09592056274414, "step": 5721 }, { "epoch": 2.961697722567288, "grad_norm": 0.9198353886604309, "learning_rate": 4.456216255439017e-09, "loss": 0.0581706203520298, "rewards/accuracies": 0.984375, "rewards/chosen": 55.84971237182617, "rewards/margins": 44.50628662109375, "rewards/rejected": 11.348594665527344, "step": 5722 }, { "epoch": 2.962215320910973, "grad_norm": 1.2165756225585938, "learning_rate": 4.33660904560651e-09, "loss": 0.07739710807800293, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.902587890625, "rewards/margins": 45.1728515625, "rewards/rejected": 12.735107421875, "step": 5723 }, { "epoch": 2.9627329192546585, "grad_norm": 1.895865559577942, "learning_rate": 4.218628210015951e-09, "loss": 0.07874178886413574, "rewards/accuracies": 0.96875, "rewards/chosen": 60.42565155029297, "rewards/margins": 47.742706298828125, "rewards/rejected": 12.674660682678223, "step": 5724 }, { "epoch": 2.9632505175983437, "grad_norm": 1.1181386709213257, "learning_rate": 4.102273787076061e-09, "loss": 0.1290549486875534, "rewards/accuracies": 0.9296875, "rewards/chosen": 53.03410339355469, "rewards/margins": 40.86073303222656, "rewards/rejected": 12.166608810424805, "step": 5725 }, { "epoch": 2.963768115942029, "grad_norm": 1.1327584981918335, "learning_rate": 3.987545814667093e-09, "loss": 0.08945373445749283, "rewards/accuracies": 0.953125, "rewards/chosen": 52.065284729003906, "rewards/margins": 42.142791748046875, "rewards/rejected": 9.924367904663086, "step": 5726 }, { "epoch": 2.9642857142857144, "grad_norm": 0.9054176807403564, "learning_rate": 3.874444330139726e-09, "loss": 0.06751102954149246, "rewards/accuracies": 0.96875, "rewards/chosen": 57.539955139160156, "rewards/margins": 43.833404541015625, "rewards/rejected": 13.715188980102539, "step": 5727 }, { "epoch": 2.9648033126293996, "grad_norm": 0.9292994737625122, "learning_rate": 3.762969370314507e-09, "loss": 0.06967063248157501, "rewards/accuracies": 0.96875, "rewards/chosen": 57.92274475097656, "rewards/margins": 47.81689453125, "rewards/rejected": 10.108154296875, "step": 5728 }, { "epoch": 2.9653209109730847, "grad_norm": 1.3390313386917114, "learning_rate": 3.653120971483515e-09, "loss": 0.14222212135791779, "rewards/accuracies": 0.9296875, "rewards/chosen": 51.744564056396484, "rewards/margins": 40.565032958984375, "rewards/rejected": 11.185348510742188, "step": 5729 }, { "epoch": 2.9658385093167703, "grad_norm": 2.2793612480163574, "learning_rate": 3.5448991694081447e-09, "loss": 0.052837520837783813, "rewards/accuracies": 0.984375, "rewards/chosen": 60.854286193847656, "rewards/margins": 47.621185302734375, "rewards/rejected": 13.243873596191406, "step": 5730 }, { "epoch": 2.9663561076604554, "grad_norm": 1.1613035202026367, "learning_rate": 3.438303999321324e-09, "loss": 0.11869645863771439, "rewards/accuracies": 0.9609375, "rewards/chosen": 47.591468811035156, "rewards/margins": 39.038909912109375, "rewards/rejected": 8.553321838378906, "step": 5731 }, { "epoch": 2.966873706004141, "grad_norm": 1.7066890001296997, "learning_rate": 3.333335495925294e-09, "loss": 0.07012727111577988, "rewards/accuracies": 0.9609375, "rewards/chosen": 54.34175109863281, "rewards/margins": 42.70341491699219, "rewards/rejected": 11.655406951904297, "step": 5732 }, { "epoch": 2.967391304347826, "grad_norm": 0.8120712637901306, "learning_rate": 3.229993693393829e-09, "loss": 0.08301910758018494, "rewards/accuracies": 0.9609375, "rewards/chosen": 53.9172477722168, "rewards/margins": 43.763458251953125, "rewards/rejected": 10.167461395263672, "step": 5733 }, { "epoch": 2.9679089026915113, "grad_norm": 1.8520427942276, "learning_rate": 3.1282786253705735e-09, "loss": 0.10309918224811554, "rewards/accuracies": 0.953125, "rewards/chosen": 57.232421875, "rewards/margins": 46.256011962890625, "rewards/rejected": 10.970748901367188, "step": 5734 }, { "epoch": 2.9684265010351965, "grad_norm": 1.0383822917938232, "learning_rate": 3.028190324969038e-09, "loss": 0.14239448308944702, "rewards/accuracies": 0.921875, "rewards/chosen": 54.30852508544922, "rewards/margins": 41.75642395019531, "rewards/rejected": 12.54806137084961, "step": 5735 }, { "epoch": 2.968944099378882, "grad_norm": 1.0248221158981323, "learning_rate": 2.9297288247742695e-09, "loss": 0.06768553704023361, "rewards/accuracies": 0.9765625, "rewards/chosen": 52.312767028808594, "rewards/margins": 42.309417724609375, "rewards/rejected": 9.993812561035156, "step": 5736 }, { "epoch": 2.969461697722567, "grad_norm": 1.3774932622909546, "learning_rate": 2.8328941568411818e-09, "loss": 0.08520884066820145, "rewards/accuracies": 0.9765625, "rewards/chosen": 62.164581298828125, "rewards/margins": 50.66558837890625, "rewards/rejected": 11.502321243286133, "step": 5737 }, { "epoch": 2.9699792960662528, "grad_norm": 2.279815912246704, "learning_rate": 2.7376863526940023e-09, "loss": 0.14794887602329254, "rewards/accuracies": 0.9140625, "rewards/chosen": 52.24751281738281, "rewards/margins": 40.893798828125, "rewards/rejected": 11.348163604736328, "step": 5738 }, { "epoch": 2.970496894409938, "grad_norm": 0.7095838189125061, "learning_rate": 2.6441054433296032e-09, "loss": 0.06475330889225006, "rewards/accuracies": 0.9453125, "rewards/chosen": 61.415313720703125, "rewards/margins": 47.686004638671875, "rewards/rejected": 13.737434387207031, "step": 5739 }, { "epoch": 2.971014492753623, "grad_norm": 1.2769978046417236, "learning_rate": 2.5521514592130593e-09, "loss": 0.07481789588928223, "rewards/accuracies": 0.96875, "rewards/chosen": 65.43675231933594, "rewards/margins": 51.6605224609375, "rewards/rejected": 13.766326904296875, "step": 5740 }, { "epoch": 2.9715320910973086, "grad_norm": 1.0862407684326172, "learning_rate": 2.461824430281534e-09, "loss": 0.07443061470985413, "rewards/accuracies": 0.9609375, "rewards/chosen": 64.44441986083984, "rewards/margins": 52.37452697753906, "rewards/rejected": 12.080387115478516, "step": 5741 }, { "epoch": 2.972049689440994, "grad_norm": 4.1332244873046875, "learning_rate": 2.3731243859403953e-09, "loss": 0.10716928541660309, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.43437194824219, "rewards/margins": 46.500213623046875, "rewards/rejected": 13.937629699707031, "step": 5742 }, { "epoch": 2.972567287784679, "grad_norm": 0.7388261556625366, "learning_rate": 2.2860513550676534e-09, "loss": 0.07439735531806946, "rewards/accuracies": 0.96875, "rewards/chosen": 63.93765640258789, "rewards/margins": 49.464935302734375, "rewards/rejected": 14.47421932220459, "step": 5743 }, { "epoch": 2.9730848861283645, "grad_norm": 1.7982350587844849, "learning_rate": 2.200605366010078e-09, "loss": 0.06309740990400314, "rewards/accuracies": 0.984375, "rewards/chosen": 52.26169204711914, "rewards/margins": 42.57875061035156, "rewards/rejected": 9.688949584960938, "step": 5744 }, { "epoch": 2.9736024844720497, "grad_norm": 1.1706976890563965, "learning_rate": 2.1167864465854172e-09, "loss": 0.06497777998447418, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.409912109375, "rewards/margins": 43.73907470703125, "rewards/rejected": 7.663873672485352, "step": 5745 }, { "epoch": 2.974120082815735, "grad_norm": 1.8876581192016602, "learning_rate": 2.0345946240812876e-09, "loss": 0.10969206690788269, "rewards/accuracies": 0.9453125, "rewards/chosen": 58.44206237792969, "rewards/margins": 47.285423278808594, "rewards/rejected": 11.156925201416016, "step": 5746 }, { "epoch": 2.9746376811594204, "grad_norm": 0.9258109927177429, "learning_rate": 1.954029925255729e-09, "loss": 0.08048203587532043, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.77906036376953, "rewards/margins": 44.40472412109375, "rewards/rejected": 11.375381469726562, "step": 5747 }, { "epoch": 2.9751552795031055, "grad_norm": 1.7004777193069458, "learning_rate": 1.8750923763377615e-09, "loss": 0.03288259357213974, "rewards/accuracies": 0.9765625, "rewards/chosen": 61.50328063964844, "rewards/margins": 49.71820068359375, "rewards/rejected": 11.764625549316406, "step": 5748 }, { "epoch": 2.975672877846791, "grad_norm": 1.0173310041427612, "learning_rate": 1.7977820030251614e-09, "loss": 0.12005764245986938, "rewards/accuracies": 0.9453125, "rewards/chosen": 59.10810089111328, "rewards/margins": 47.169334411621094, "rewards/rejected": 11.929768562316895, "step": 5749 }, { "epoch": 2.9761904761904763, "grad_norm": 0.6140886545181274, "learning_rate": 1.722098830487795e-09, "loss": 0.060194481164216995, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.67654037475586, "rewards/margins": 42.8427734375, "rewards/rejected": 10.816253662109375, "step": 5750 }, { "epoch": 2.9767080745341614, "grad_norm": 0.6794828772544861, "learning_rate": 1.6480428833648421e-09, "loss": 0.07150257378816605, "rewards/accuracies": 0.9609375, "rewards/chosen": 57.41864013671875, "rewards/margins": 44.77276611328125, "rewards/rejected": 12.655380249023438, "step": 5751 }, { "epoch": 2.9772256728778466, "grad_norm": 0.8479861617088318, "learning_rate": 1.5756141857647955e-09, "loss": 0.08401250839233398, "rewards/accuracies": 0.9375, "rewards/chosen": 53.61103057861328, "rewards/margins": 42.78995132446289, "rewards/rejected": 10.821870803833008, "step": 5752 }, { "epoch": 2.977743271221532, "grad_norm": 0.8729199767112732, "learning_rate": 1.5048127612682372e-09, "loss": 0.10697543621063232, "rewards/accuracies": 0.9453125, "rewards/chosen": 60.54771041870117, "rewards/margins": 46.818206787109375, "rewards/rejected": 13.733243942260742, "step": 5753 }, { "epoch": 2.9782608695652173, "grad_norm": 0.9970995187759399, "learning_rate": 1.4356386329250628e-09, "loss": 0.05450465902686119, "rewards/accuracies": 0.96875, "rewards/chosen": 56.072723388671875, "rewards/margins": 43.5443115234375, "rewards/rejected": 12.529396057128906, "step": 5754 }, { "epoch": 2.978778467908903, "grad_norm": 0.6567723751068115, "learning_rate": 1.3680918232550356e-09, "loss": 0.07126379013061523, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.273468017578125, "rewards/margins": 44.30220031738281, "rewards/rejected": 12.961795806884766, "step": 5755 }, { "epoch": 2.979296066252588, "grad_norm": 0.7631425857543945, "learning_rate": 1.3021723542483434e-09, "loss": 0.06331482529640198, "rewards/accuracies": 0.9765625, "rewards/chosen": 55.00098419189453, "rewards/margins": 42.472900390625, "rewards/rejected": 12.53371810913086, "step": 5756 }, { "epoch": 2.979813664596273, "grad_norm": 0.5068621635437012, "learning_rate": 1.237880247365597e-09, "loss": 0.06454569101333618, "rewards/accuracies": 0.96875, "rewards/chosen": 63.87431335449219, "rewards/margins": 48.747161865234375, "rewards/rejected": 15.13121223449707, "step": 5757 }, { "epoch": 2.9803312629399588, "grad_norm": 0.7368533611297607, "learning_rate": 1.1752155235383866e-09, "loss": 0.07807761430740356, "rewards/accuracies": 0.953125, "rewards/chosen": 53.881202697753906, "rewards/margins": 42.57177734375, "rewards/rejected": 11.301315307617188, "step": 5758 }, { "epoch": 2.980848861283644, "grad_norm": 0.9710492491722107, "learning_rate": 1.1141782031665049e-09, "loss": 0.0938410758972168, "rewards/accuracies": 0.953125, "rewards/chosen": 59.631736755371094, "rewards/margins": 46.861724853515625, "rewards/rejected": 12.762336730957031, "step": 5759 }, { "epoch": 2.981366459627329, "grad_norm": 1.4923540353775024, "learning_rate": 1.054768306121834e-09, "loss": 0.09974762052297592, "rewards/accuracies": 0.96875, "rewards/chosen": 50.1947021484375, "rewards/margins": 38.48614501953125, "rewards/rejected": 11.70225715637207, "step": 5760 }, { "epoch": 2.9818840579710146, "grad_norm": 0.9997195601463318, "learning_rate": 9.969858517455689e-10, "loss": 0.08588073402643204, "rewards/accuracies": 0.953125, "rewards/chosen": 56.1817626953125, "rewards/margins": 44.6026611328125, "rewards/rejected": 11.576602935791016, "step": 5761 }, { "epoch": 2.9824016563147, "grad_norm": 0.7450986504554749, "learning_rate": 9.408308588493287e-10, "loss": 0.08806858956813812, "rewards/accuracies": 0.953125, "rewards/chosen": 54.14707946777344, "rewards/margins": 43.73091125488281, "rewards/rejected": 10.412904739379883, "step": 5762 }, { "epoch": 2.982919254658385, "grad_norm": 0.7725557088851929, "learning_rate": 8.863033457140457e-10, "loss": 0.07472201436758041, "rewards/accuracies": 0.9609375, "rewards/chosen": 51.642242431640625, "rewards/margins": 42.95506286621094, "rewards/rejected": 8.683082580566406, "step": 5763 }, { "epoch": 2.9834368530020705, "grad_norm": 0.7080724835395813, "learning_rate": 8.334033300921863e-10, "loss": 0.08205524832010269, "rewards/accuracies": 0.96875, "rewards/chosen": 59.255035400390625, "rewards/margins": 47.56890869140625, "rewards/rejected": 11.67563247680664, "step": 5764 }, { "epoch": 2.9839544513457557, "grad_norm": 0.43375205993652344, "learning_rate": 7.821308292060847e-10, "loss": 0.040319956839084625, "rewards/accuracies": 0.984375, "rewards/chosen": 62.60789489746094, "rewards/margins": 48.185089111328125, "rewards/rejected": 14.421896934509277, "step": 5765 }, { "epoch": 2.9844720496894412, "grad_norm": 0.9650047421455383, "learning_rate": 7.324858597468343e-10, "loss": 0.0890481248497963, "rewards/accuracies": 0.9375, "rewards/chosen": 56.403316497802734, "rewards/margins": 45.05207061767578, "rewards/rejected": 11.336971282958984, "step": 5766 }, { "epoch": 2.9849896480331264, "grad_norm": 4.154196739196777, "learning_rate": 6.844684378776168e-10, "loss": 0.05479637533426285, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.794769287109375, "rewards/margins": 44.56317138671875, "rewards/rejected": 12.235042572021484, "step": 5767 }, { "epoch": 2.9855072463768115, "grad_norm": 1.6964317560195923, "learning_rate": 6.380785792309274e-10, "loss": 0.055751994252204895, "rewards/accuracies": 0.9765625, "rewards/chosen": 57.39537811279297, "rewards/margins": 45.47235107421875, "rewards/rejected": 11.917313575744629, "step": 5768 }, { "epoch": 2.9860248447204967, "grad_norm": 0.8436822295188904, "learning_rate": 5.933162989080199e-10, "loss": 0.057004742324352264, "rewards/accuracies": 0.96875, "rewards/chosen": 57.10633850097656, "rewards/margins": 45.8427734375, "rewards/rejected": 11.258441925048828, "step": 5769 }, { "epoch": 2.9865424430641823, "grad_norm": 1.4741382598876953, "learning_rate": 5.501816114827919e-10, "loss": 0.11803142726421356, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.63377380371094, "rewards/margins": 45.6594123840332, "rewards/rejected": 10.968521118164062, "step": 5770 }, { "epoch": 2.9870600414078674, "grad_norm": 0.5828990340232849, "learning_rate": 5.086745309978991e-10, "loss": 0.060221992433071136, "rewards/accuracies": 0.9609375, "rewards/chosen": 65.49745178222656, "rewards/margins": 50.772430419921875, "rewards/rejected": 14.721301078796387, "step": 5771 }, { "epoch": 2.987577639751553, "grad_norm": 0.8712539672851562, "learning_rate": 4.687950709664213e-10, "loss": 0.03448542580008507, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.9837646484375, "rewards/margins": 44.5086669921875, "rewards/rejected": 12.469860076904297, "step": 5772 }, { "epoch": 2.988095238095238, "grad_norm": 3.017606735229492, "learning_rate": 4.305432443707513e-10, "loss": 0.15440337359905243, "rewards/accuracies": 0.9140625, "rewards/chosen": 55.638309478759766, "rewards/margins": 45.3865966796875, "rewards/rejected": 10.25738525390625, "step": 5773 }, { "epoch": 2.9886128364389233, "grad_norm": 0.6059364080429077, "learning_rate": 3.9391906366426093e-10, "loss": 0.055808037519454956, "rewards/accuracies": 0.984375, "rewards/chosen": 55.27220153808594, "rewards/margins": 44.94921875, "rewards/rejected": 10.316681861877441, "step": 5774 }, { "epoch": 2.9891304347826084, "grad_norm": 0.7566372752189636, "learning_rate": 3.589225407701902e-10, "loss": 0.07890400290489197, "rewards/accuracies": 0.9609375, "rewards/chosen": 55.723941802978516, "rewards/margins": 45.406532287597656, "rewards/rejected": 10.313315391540527, "step": 5775 }, { "epoch": 2.989648033126294, "grad_norm": 2.4592952728271484, "learning_rate": 3.2555368708220293e-10, "loss": 0.07696583867073059, "rewards/accuracies": 0.9765625, "rewards/chosen": 67.45523834228516, "rewards/margins": 53.8001708984375, "rewards/rejected": 13.652595520019531, "step": 5776 }, { "epoch": 2.990165631469979, "grad_norm": 1.8129360675811768, "learning_rate": 2.9381251346383144e-10, "loss": 0.0956563651561737, "rewards/accuracies": 0.9609375, "rewards/chosen": 59.96240234375, "rewards/margins": 47.060394287109375, "rewards/rejected": 12.913450241088867, "step": 5777 }, { "epoch": 2.9906832298136647, "grad_norm": 3.7386746406555176, "learning_rate": 2.6369903024847653e-10, "loss": 0.08771032840013504, "rewards/accuracies": 0.9765625, "rewards/chosen": 53.79392623901367, "rewards/margins": 43.298126220703125, "rewards/rejected": 10.501802444458008, "step": 5778 }, { "epoch": 2.99120082815735, "grad_norm": 0.9429721236228943, "learning_rate": 2.3521324723940755e-10, "loss": 0.1077938973903656, "rewards/accuracies": 0.96875, "rewards/chosen": 54.452491760253906, "rewards/margins": 44.16389465332031, "rewards/rejected": 10.27352523803711, "step": 5779 }, { "epoch": 2.991718426501035, "grad_norm": 0.766879141330719, "learning_rate": 2.0835517371087244e-10, "loss": 0.06705445051193237, "rewards/accuracies": 0.96875, "rewards/chosen": 58.82774353027344, "rewards/margins": 45.715179443359375, "rewards/rejected": 13.11572265625, "step": 5780 }, { "epoch": 2.9922360248447206, "grad_norm": 0.7159618139266968, "learning_rate": 1.831248184064327e-10, "loss": 0.07500050961971283, "rewards/accuracies": 0.9609375, "rewards/chosen": 58.08400344848633, "rewards/margins": 44.87445831298828, "rewards/rejected": 13.221206665039062, "step": 5781 }, { "epoch": 2.9927536231884058, "grad_norm": 0.959559977054596, "learning_rate": 1.5952218954062847e-10, "loss": 0.07253363728523254, "rewards/accuracies": 0.9609375, "rewards/chosen": 62.426300048828125, "rewards/margins": 50.841339111328125, "rewards/rejected": 11.559998512268066, "step": 5782 }, { "epoch": 2.9932712215320914, "grad_norm": 0.8953917026519775, "learning_rate": 1.375472947967582e-10, "loss": 0.06166912242770195, "rewards/accuracies": 0.96875, "rewards/chosen": 59.563758850097656, "rewards/margins": 46.609710693359375, "rewards/rejected": 12.958454132080078, "step": 5783 }, { "epoch": 2.9937888198757765, "grad_norm": 0.6404027342796326, "learning_rate": 1.1720014132909907e-10, "loss": 0.06683538854122162, "rewards/accuracies": 0.9609375, "rewards/chosen": 61.83854675292969, "rewards/margins": 45.39673614501953, "rewards/rejected": 16.45838165283203, "step": 5784 }, { "epoch": 2.9943064182194616, "grad_norm": 0.9326527118682861, "learning_rate": 9.848073576179673e-11, "loss": 0.061819855123758316, "rewards/accuracies": 0.9609375, "rewards/chosen": 60.126556396484375, "rewards/margins": 45.94842529296875, "rewards/rejected": 14.175725936889648, "step": 5785 }, { "epoch": 2.994824016563147, "grad_norm": 4.5598015785217285, "learning_rate": 8.138908418886537e-11, "loss": 0.1471552550792694, "rewards/accuracies": 0.9296875, "rewards/chosen": 56.433128356933594, "rewards/margins": 43.03163146972656, "rewards/rejected": 13.407646179199219, "step": 5786 }, { "epoch": 2.9953416149068324, "grad_norm": 1.1500695943832397, "learning_rate": 6.592519217529791e-11, "loss": 0.13216224312782288, "rewards/accuracies": 0.921875, "rewards/chosen": 50.82499694824219, "rewards/margins": 40.3631591796875, "rewards/rejected": 10.458772659301758, "step": 5787 }, { "epoch": 2.9958592132505175, "grad_norm": 0.6619333624839783, "learning_rate": 5.208906475484554e-11, "loss": 0.07449309527873993, "rewards/accuracies": 0.9375, "rewards/chosen": 54.482078552246094, "rewards/margins": 41.464111328125, "rewards/rejected": 13.017051696777344, "step": 5788 }, { "epoch": 2.996376811594203, "grad_norm": 0.9487649202346802, "learning_rate": 3.988070643223818e-11, "loss": 0.06491738557815552, "rewards/accuracies": 0.96875, "rewards/chosen": 60.0855712890625, "rewards/margins": 46.869659423828125, "rewards/rejected": 13.224567413330078, "step": 5789 }, { "epoch": 2.9968944099378882, "grad_norm": 1.5040717124938965, "learning_rate": 2.9300121182074257e-11, "loss": 0.0738685205578804, "rewards/accuracies": 0.9765625, "rewards/chosen": 56.38860321044922, "rewards/margins": 42.432769775390625, "rewards/rejected": 13.95904541015625, "step": 5790 }, { "epoch": 2.9974120082815734, "grad_norm": 1.245525598526001, "learning_rate": 2.03473124482656e-11, "loss": 0.05211298167705536, "rewards/accuracies": 0.984375, "rewards/chosen": 63.56919860839844, "rewards/margins": 49.9033203125, "rewards/rejected": 13.661209106445312, "step": 5791 }, { "epoch": 2.9979296066252585, "grad_norm": 0.75453120470047, "learning_rate": 1.3022283146812975e-11, "loss": 0.05461286008358002, "rewards/accuracies": 0.96875, "rewards/chosen": 60.43748474121094, "rewards/margins": 48.175567626953125, "rewards/rejected": 12.265645980834961, "step": 5792 }, { "epoch": 2.998447204968944, "grad_norm": 1.1033945083618164, "learning_rate": 7.32503566136522e-12, "loss": 0.10972613096237183, "rewards/accuracies": 0.9453125, "rewards/chosen": 54.75185775756836, "rewards/margins": 41.857757568359375, "rewards/rejected": 12.902229309082031, "step": 5793 }, { "epoch": 2.9989648033126293, "grad_norm": 0.5797685384750366, "learning_rate": 3.2555718465498986e-12, "loss": 0.043394915759563446, "rewards/accuracies": 0.9765625, "rewards/chosen": 50.732086181640625, "rewards/margins": 40.23884582519531, "rewards/rejected": 10.514737129211426, "step": 5794 }, { "epoch": 2.999482401656315, "grad_norm": 1.7798655033111572, "learning_rate": 8.138930279733004e-13, "loss": 0.11944903433322906, "rewards/accuracies": 0.953125, "rewards/chosen": 55.15019607543945, "rewards/margins": 42.74067687988281, "rewards/rejected": 12.402458190917969, "step": 5795 }, { "epoch": 3.0, "grad_norm": 0.6759138703346252, "learning_rate": 0.0, "loss": 0.06530967354774475, "rewards/accuracies": 0.96875, "rewards/chosen": 56.91914367675781, "rewards/margins": 44.37115478515625, "rewards/rejected": 12.53957748413086, "step": 5796 } ], "logging_steps": 1, "max_steps": 5796, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1996076774379435e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }