{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007407407407407407, "grad_norm": 3.357926845550537, "learning_rate": 9.992592592592592e-07, "logits/chosen": 1.5703125, "logits/rejected": 1.9208984375, "logps/chosen": -20.9375, "logps/rejected": -68.3125, "loss": 0.9678, "rewards/accuracies": 0.25, "rewards/chosen": -0.0406494140625, "rewards/margins": -0.436767578125, "rewards/rejected": 0.39599609375, "step": 1 }, { "epoch": 0.0014814814814814814, "grad_norm": 3.7804203033447266, "learning_rate": 9.985185185185185e-07, "logits/chosen": 1.611328125, "logits/rejected": 1.896484375, "logps/chosen": -50.8125, "logps/rejected": -36.78125, "loss": 1.0645, "rewards/accuracies": 0.25, "rewards/chosen": -0.6416015625, "rewards/margins": -0.44580078125, "rewards/rejected": -0.1956787109375, "step": 2 }, { "epoch": 0.0022222222222222222, "grad_norm": 1.415795087814331, "learning_rate": 9.977777777777778e-07, "logits/chosen": 1.681640625, "logits/rejected": 2.005859375, "logps/chosen": -30.25, "logps/rejected": -41.65625, "loss": 0.6504, "rewards/accuracies": 0.75, "rewards/chosen": -0.00115966796875, "rewards/margins": 0.08984375, "rewards/rejected": -0.09100341796875, "step": 3 }, { "epoch": 0.002962962962962963, "grad_norm": 2.029367446899414, "learning_rate": 9.97037037037037e-07, "logits/chosen": 2.03515625, "logits/rejected": 2.025390625, "logps/chosen": -39.40625, "logps/rejected": -41.96875, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": -0.099853515625, "rewards/margins": 0.050567626953125, "rewards/rejected": -0.150390625, "step": 4 }, { "epoch": 0.003703703703703704, "grad_norm": 4.088499546051025, "learning_rate": 9.962962962962964e-07, "logits/chosen": 1.7265625, "logits/rejected": 1.3720703125, "logps/chosen": -23.09375, "logps/rejected": -123.75, "loss": 0.7568, "rewards/accuracies": 0.5, "rewards/chosen": -0.022857666015625, "rewards/margins": -0.0963134765625, "rewards/rejected": 0.07342529296875, "step": 5 }, { "epoch": 0.0044444444444444444, "grad_norm": 1.6602762937545776, "learning_rate": 9.955555555555554e-07, "logits/chosen": 0.9501953125, "logits/rejected": 0.71240234375, "logps/chosen": -24.78125, "logps/rejected": -36.84375, "loss": 0.6187, "rewards/accuracies": 0.75, "rewards/chosen": 0.042572021484375, "rewards/margins": 0.174560546875, "rewards/rejected": -0.132080078125, "step": 6 }, { "epoch": 0.005185185185185185, "grad_norm": 2.667506694793701, "learning_rate": 9.948148148148147e-07, "logits/chosen": 1.884765625, "logits/rejected": 1.8330078125, "logps/chosen": -29.453125, "logps/rejected": -65.0, "loss": 0.8447, "rewards/accuracies": 0.25, "rewards/chosen": -0.108642578125, "rewards/margins": -0.25146484375, "rewards/rejected": 0.14306640625, "step": 7 }, { "epoch": 0.005925925925925926, "grad_norm": 1.3275270462036133, "learning_rate": 9.94074074074074e-07, "logits/chosen": 1.32421875, "logits/rejected": 1.224609375, "logps/chosen": -26.703125, "logps/rejected": -33.09375, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": 0.04571533203125, "rewards/margins": 0.09375, "rewards/rejected": -0.04803466796875, "step": 8 }, { "epoch": 0.006666666666666667, "grad_norm": 1.4520375728607178, "learning_rate": 9.933333333333333e-07, "logits/chosen": 1.2353515625, "logits/rejected": 1.9404296875, "logps/chosen": -28.21875, "logps/rejected": -30.25, "loss": 0.6411, "rewards/accuracies": 0.5, "rewards/chosen": 0.1468505859375, "rewards/margins": 0.1292724609375, "rewards/rejected": 0.0176239013671875, "step": 9 }, { "epoch": 0.007407407407407408, "grad_norm": 2.565462112426758, "learning_rate": 9.925925925925926e-07, "logits/chosen": 1.2158203125, "logits/rejected": 1.48828125, "logps/chosen": -39.15625, "logps/rejected": -32.875, "loss": 0.7871, "rewards/accuracies": 0.5, "rewards/chosen": -0.1326904296875, "rewards/margins": -0.1229248046875, "rewards/rejected": -0.0097808837890625, "step": 10 }, { "epoch": 0.008148148148148147, "grad_norm": 1.7205417156219482, "learning_rate": 9.918518518518518e-07, "logits/chosen": 2.3515625, "logits/rejected": 1.6474609375, "logps/chosen": -28.078125, "logps/rejected": -31.34375, "loss": 0.9072, "rewards/accuracies": 0.0, "rewards/chosen": -0.05059814453125, "rewards/margins": -0.376708984375, "rewards/rejected": 0.326171875, "step": 11 }, { "epoch": 0.008888888888888889, "grad_norm": 1.4385411739349365, "learning_rate": 9.911111111111111e-07, "logits/chosen": 1.9833984375, "logits/rejected": 1.638671875, "logps/chosen": -20.5, "logps/rejected": -33.3125, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": 0.03631591796875, "rewards/margins": 0.1104736328125, "rewards/rejected": -0.07421875, "step": 12 }, { "epoch": 0.00962962962962963, "grad_norm": 1.869625210762024, "learning_rate": 9.903703703703702e-07, "logits/chosen": 2.71484375, "logits/rejected": 1.66015625, "logps/chosen": -30.390625, "logps/rejected": -35.8125, "loss": 0.792, "rewards/accuracies": 0.25, "rewards/chosen": -0.048858642578125, "rewards/margins": -0.171142578125, "rewards/rejected": 0.122314453125, "step": 13 }, { "epoch": 0.01037037037037037, "grad_norm": 1.9671366214752197, "learning_rate": 9.896296296296297e-07, "logits/chosen": 1.55078125, "logits/rejected": 1.5205078125, "logps/chosen": -27.5, "logps/rejected": -41.71875, "loss": 0.8462, "rewards/accuracies": 0.25, "rewards/chosen": -0.1529541015625, "rewards/margins": -0.27197265625, "rewards/rejected": 0.119140625, "step": 14 }, { "epoch": 0.011111111111111112, "grad_norm": 48.449520111083984, "learning_rate": 9.88888888888889e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.62109375, "logps/chosen": -51.4375, "logps/rejected": -29.71875, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": -0.026947021484375, "rewards/margins": 0.0253753662109375, "rewards/rejected": -0.052337646484375, "step": 15 }, { "epoch": 0.011851851851851851, "grad_norm": 2.1922855377197266, "learning_rate": 9.88148148148148e-07, "logits/chosen": 1.4521484375, "logits/rejected": 1.6298828125, "logps/chosen": -33.875, "logps/rejected": -39.4375, "loss": 0.7671, "rewards/accuracies": 0.0, "rewards/chosen": -0.044891357421875, "rewards/margins": -0.1409912109375, "rewards/rejected": 0.0960693359375, "step": 16 }, { "epoch": 0.012592592592592593, "grad_norm": 1.6107385158538818, "learning_rate": 9.874074074074073e-07, "logits/chosen": 1.3935546875, "logits/rejected": 1.2333984375, "logps/chosen": -22.96875, "logps/rejected": -56.75, "loss": 0.6431, "rewards/accuracies": 0.5, "rewards/chosen": 0.03125, "rewards/margins": 0.1097412109375, "rewards/rejected": -0.0784912109375, "step": 17 }, { "epoch": 0.013333333333333334, "grad_norm": 2.0069165229797363, "learning_rate": 9.866666666666666e-07, "logits/chosen": 1.9541015625, "logits/rejected": 1.5869140625, "logps/chosen": -39.875, "logps/rejected": -33.9375, "loss": 0.6318, "rewards/accuracies": 1.0, "rewards/chosen": -0.068359375, "rewards/margins": 0.126953125, "rewards/rejected": -0.1953125, "step": 18 }, { "epoch": 0.014074074074074074, "grad_norm": 1.6153367757797241, "learning_rate": 9.859259259259259e-07, "logits/chosen": 1.875, "logits/rejected": 1.611328125, "logps/chosen": -34.65625, "logps/rejected": -39.84375, "loss": 0.7314, "rewards/accuracies": 0.25, "rewards/chosen": 0.157470703125, "rewards/margins": 0.06573486328125, "rewards/rejected": 0.091796875, "step": 19 }, { "epoch": 0.014814814814814815, "grad_norm": 2.2569918632507324, "learning_rate": 9.851851851851852e-07, "logits/chosen": 0.8740234375, "logits/rejected": 1.60546875, "logps/chosen": -32.53125, "logps/rejected": -36.03125, "loss": 0.7681, "rewards/accuracies": 0.25, "rewards/chosen": -0.07928466796875, "rewards/margins": -0.137451171875, "rewards/rejected": 0.0582275390625, "step": 20 }, { "epoch": 0.015555555555555555, "grad_norm": 4.415426254272461, "learning_rate": 9.844444444444445e-07, "logits/chosen": 1.3388671875, "logits/rejected": 1.5, "logps/chosen": -62.375, "logps/rejected": -55.78125, "loss": 2.584, "rewards/accuracies": 0.5, "rewards/chosen": -0.09649658203125, "rewards/margins": -2.07421875, "rewards/rejected": 1.978515625, "step": 21 }, { "epoch": 0.016296296296296295, "grad_norm": 1.5060640573501587, "learning_rate": 9.837037037037037e-07, "logits/chosen": 1.7119140625, "logits/rejected": 1.2705078125, "logps/chosen": -38.375, "logps/rejected": -35.40625, "loss": 0.6299, "rewards/accuracies": 0.75, "rewards/chosen": 0.098876953125, "rewards/margins": 0.13916015625, "rewards/rejected": -0.04022216796875, "step": 22 }, { "epoch": 0.017037037037037038, "grad_norm": 2.0784590244293213, "learning_rate": 9.829629629629628e-07, "logits/chosen": 0.68505859375, "logits/rejected": 1.5546875, "logps/chosen": -33.8125, "logps/rejected": -46.25, "loss": 0.8804, "rewards/accuracies": 0.5, "rewards/chosen": -0.142333984375, "rewards/margins": -0.294677734375, "rewards/rejected": 0.15234375, "step": 23 }, { "epoch": 0.017777777777777778, "grad_norm": 1.24666166305542, "learning_rate": 9.82222222222222e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.9384765625, "logps/chosen": -25.484375, "logps/rejected": -43.9375, "loss": 0.6104, "rewards/accuracies": 1.0, "rewards/chosen": 0.06329345703125, "rewards/margins": 0.1773681640625, "rewards/rejected": -0.11407470703125, "step": 24 }, { "epoch": 0.018518518518518517, "grad_norm": 2.9330646991729736, "learning_rate": 9.814814814814814e-07, "logits/chosen": 1.3017578125, "logits/rejected": 2.240234375, "logps/chosen": -39.59375, "logps/rejected": -56.625, "loss": 0.7769, "rewards/accuracies": 0.5, "rewards/chosen": 0.06561279296875, "rewards/margins": -0.10107421875, "rewards/rejected": 0.166748046875, "step": 25 }, { "epoch": 0.01925925925925926, "grad_norm": 1.3839192390441895, "learning_rate": 9.807407407407407e-07, "logits/chosen": 1.884765625, "logits/rejected": 1.61328125, "logps/chosen": -24.265625, "logps/rejected": -35.5, "loss": 0.7134, "rewards/accuracies": 0.25, "rewards/chosen": -0.00079345703125, "rewards/margins": -0.03399658203125, "rewards/rejected": 0.033203125, "step": 26 }, { "epoch": 0.02, "grad_norm": 1.8206408023834229, "learning_rate": 9.8e-07, "logits/chosen": 1.59765625, "logits/rejected": 2.3125, "logps/chosen": -38.0, "logps/rejected": -29.21875, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": 0.017181396484375, "rewards/margins": 0.14453125, "rewards/rejected": -0.1273193359375, "step": 27 }, { "epoch": 0.02074074074074074, "grad_norm": 5.286537170410156, "learning_rate": 9.792592592592592e-07, "logits/chosen": 1.4599609375, "logits/rejected": 1.115234375, "logps/chosen": -27.375, "logps/rejected": -39.78125, "loss": 0.666, "rewards/accuracies": 0.5, "rewards/chosen": 0.0013580322265625, "rewards/margins": 0.0657958984375, "rewards/rejected": -0.064453125, "step": 28 }, { "epoch": 0.02148148148148148, "grad_norm": 2.2526278495788574, "learning_rate": 9.785185185185185e-07, "logits/chosen": 0.84716796875, "logits/rejected": 1.9697265625, "logps/chosen": -49.84375, "logps/rejected": -32.59375, "loss": 0.6772, "rewards/accuracies": 0.75, "rewards/chosen": 0.00743865966796875, "rewards/margins": 0.0660400390625, "rewards/rejected": -0.05859375, "step": 29 }, { "epoch": 0.022222222222222223, "grad_norm": 4.4223456382751465, "learning_rate": 9.777777777777778e-07, "logits/chosen": 1.921875, "logits/rejected": 1.578125, "logps/chosen": -22.0625, "logps/rejected": -34.8125, "loss": 0.7847, "rewards/accuracies": 0.25, "rewards/chosen": -0.10919189453125, "rewards/margins": -0.1693115234375, "rewards/rejected": 0.060150146484375, "step": 30 }, { "epoch": 0.022962962962962963, "grad_norm": 1.407668948173523, "learning_rate": 9.77037037037037e-07, "logits/chosen": 1.2685546875, "logits/rejected": 1.255859375, "logps/chosen": -17.640625, "logps/rejected": -51.90625, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": 0.0146636962890625, "rewards/margins": 0.04046630859375, "rewards/rejected": -0.025787353515625, "step": 31 }, { "epoch": 0.023703703703703703, "grad_norm": 2.488368034362793, "learning_rate": 9.762962962962963e-07, "logits/chosen": 2.423828125, "logits/rejected": 1.84375, "logps/chosen": -55.53125, "logps/rejected": -38.90625, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.2083740234375, "rewards/margins": 0.04437255859375, "rewards/rejected": 0.1640625, "step": 32 }, { "epoch": 0.024444444444444446, "grad_norm": 3.9211957454681396, "learning_rate": 9.755555555555554e-07, "logits/chosen": 1.6103515625, "logits/rejected": 1.798828125, "logps/chosen": -57.34375, "logps/rejected": -22.53125, "loss": 0.7012, "rewards/accuracies": 0.75, "rewards/chosen": 0.02734375, "rewards/margins": 0.0004119873046875, "rewards/rejected": 0.0269775390625, "step": 33 }, { "epoch": 0.025185185185185185, "grad_norm": 2.1408495903015137, "learning_rate": 9.748148148148147e-07, "logits/chosen": 2.046875, "logits/rejected": 1.873046875, "logps/chosen": -27.625, "logps/rejected": -50.125, "loss": 0.7368, "rewards/accuracies": 0.5, "rewards/chosen": -0.085205078125, "rewards/margins": -0.067626953125, "rewards/rejected": -0.01751708984375, "step": 34 }, { "epoch": 0.025925925925925925, "grad_norm": 1.5011073350906372, "learning_rate": 9.74074074074074e-07, "logits/chosen": 1.5966796875, "logits/rejected": 1.9658203125, "logps/chosen": -19.796875, "logps/rejected": -24.3125, "loss": 0.9316, "rewards/accuracies": 0.0, "rewards/chosen": -0.06915283203125, "rewards/margins": -0.423095703125, "rewards/rejected": 0.35400390625, "step": 35 }, { "epoch": 0.02666666666666667, "grad_norm": 1.4139801263809204, "learning_rate": 9.733333333333333e-07, "logits/chosen": 0.814453125, "logits/rejected": 1.560546875, "logps/chosen": -26.671875, "logps/rejected": -32.21875, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": 0.0933837890625, "rewards/margins": 0.13134765625, "rewards/rejected": -0.03790283203125, "step": 36 }, { "epoch": 0.027407407407407408, "grad_norm": 2.7646968364715576, "learning_rate": 9.725925925925925e-07, "logits/chosen": 0.9833984375, "logits/rejected": 1.333984375, "logps/chosen": -25.125, "logps/rejected": -36.8125, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": -0.0234375, "rewards/margins": 0.10858154296875, "rewards/rejected": -0.132080078125, "step": 37 }, { "epoch": 0.028148148148148148, "grad_norm": 1.6990362405776978, "learning_rate": 9.718518518518518e-07, "logits/chosen": 1.0361328125, "logits/rejected": 1.4599609375, "logps/chosen": -23.265625, "logps/rejected": -32.625, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.039276123046875, "rewards/margins": 0.038818359375, "rewards/rejected": 0.000408172607421875, "step": 38 }, { "epoch": 0.028888888888888888, "grad_norm": 4.393378734588623, "learning_rate": 9.711111111111111e-07, "logits/chosen": 1.8916015625, "logits/rejected": 1.4599609375, "logps/chosen": -30.875, "logps/rejected": -35.46875, "loss": 0.8184, "rewards/accuracies": 0.25, "rewards/chosen": -0.205078125, "rewards/margins": -0.2210693359375, "rewards/rejected": 0.016021728515625, "step": 39 }, { "epoch": 0.02962962962962963, "grad_norm": 2.762968063354492, "learning_rate": 9.703703703703704e-07, "logits/chosen": 1.5703125, "logits/rejected": 2.17578125, "logps/chosen": -30.3125, "logps/rejected": -50.53125, "loss": 0.8804, "rewards/accuracies": 0.0, "rewards/chosen": -0.138427734375, "rewards/margins": -0.339111328125, "rewards/rejected": 0.2005615234375, "step": 40 }, { "epoch": 0.03037037037037037, "grad_norm": 1.9449876546859741, "learning_rate": 9.696296296296297e-07, "logits/chosen": 0.96337890625, "logits/rejected": 1.873046875, "logps/chosen": -27.65625, "logps/rejected": -46.78125, "loss": 0.7422, "rewards/accuracies": 0.25, "rewards/chosen": -0.018341064453125, "rewards/margins": -0.088623046875, "rewards/rejected": 0.0703125, "step": 41 }, { "epoch": 0.03111111111111111, "grad_norm": 1.2967299222946167, "learning_rate": 9.68888888888889e-07, "logits/chosen": 1.93359375, "logits/rejected": 2.033203125, "logps/chosen": -33.25, "logps/rejected": -26.328125, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 0.11248779296875, "rewards/margins": 0.365234375, "rewards/rejected": -0.252685546875, "step": 42 }, { "epoch": 0.03185185185185185, "grad_norm": 2.122598886489868, "learning_rate": 9.681481481481482e-07, "logits/chosen": 1.32421875, "logits/rejected": 1.53125, "logps/chosen": -39.625, "logps/rejected": -46.5, "loss": 0.7461, "rewards/accuracies": 0.25, "rewards/chosen": -0.150390625, "rewards/margins": -0.06085205078125, "rewards/rejected": -0.0894775390625, "step": 43 }, { "epoch": 0.03259259259259259, "grad_norm": 2.163186550140381, "learning_rate": 9.674074074074073e-07, "logits/chosen": 0.99169921875, "logits/rejected": 1.1904296875, "logps/chosen": -34.6875, "logps/rejected": -32.5, "loss": 0.7422, "rewards/accuracies": 0.25, "rewards/chosen": 0.052337646484375, "rewards/margins": -0.08905029296875, "rewards/rejected": 0.141357421875, "step": 44 }, { "epoch": 0.03333333333333333, "grad_norm": 1.4395321607589722, "learning_rate": 9.666666666666666e-07, "logits/chosen": 1.7197265625, "logits/rejected": 1.77734375, "logps/chosen": -25.78125, "logps/rejected": -38.625, "loss": 0.7134, "rewards/accuracies": 0.5, "rewards/chosen": -0.024993896484375, "rewards/margins": -0.03863525390625, "rewards/rejected": 0.01364898681640625, "step": 45 }, { "epoch": 0.034074074074074076, "grad_norm": 2.809899091720581, "learning_rate": 9.659259259259259e-07, "logits/chosen": 2.046875, "logits/rejected": 2.01953125, "logps/chosen": -55.5, "logps/rejected": -45.0625, "loss": 1.3184, "rewards/accuracies": 0.25, "rewards/chosen": -0.93603515625, "rewards/margins": -0.87451171875, "rewards/rejected": -0.06170654296875, "step": 46 }, { "epoch": 0.03481481481481481, "grad_norm": 2.4809296131134033, "learning_rate": 9.651851851851852e-07, "logits/chosen": 1.5068359375, "logits/rejected": 1.9580078125, "logps/chosen": -26.578125, "logps/rejected": -46.75, "loss": 0.7812, "rewards/accuracies": 0.0, "rewards/chosen": -0.004680633544921875, "rewards/margins": -0.166748046875, "rewards/rejected": 0.162109375, "step": 47 }, { "epoch": 0.035555555555555556, "grad_norm": 2.9411468505859375, "learning_rate": 9.644444444444444e-07, "logits/chosen": 2.1484375, "logits/rejected": 1.9638671875, "logps/chosen": -39.46875, "logps/rejected": -69.3125, "loss": 0.7905, "rewards/accuracies": 0.25, "rewards/chosen": -0.10546875, "rewards/margins": -0.1741943359375, "rewards/rejected": 0.0687255859375, "step": 48 }, { "epoch": 0.0362962962962963, "grad_norm": 1.494801640510559, "learning_rate": 9.637037037037037e-07, "logits/chosen": 1.2529296875, "logits/rejected": 2.01953125, "logps/chosen": -20.046875, "logps/rejected": -53.65625, "loss": 0.7607, "rewards/accuracies": 0.25, "rewards/chosen": -0.0361328125, "rewards/margins": -0.1209716796875, "rewards/rejected": 0.08477783203125, "step": 49 }, { "epoch": 0.037037037037037035, "grad_norm": 2.3947806358337402, "learning_rate": 9.629629629629628e-07, "logits/chosen": 1.6875, "logits/rejected": 0.8359375, "logps/chosen": -47.75, "logps/rejected": -74.5, "loss": 0.855, "rewards/accuracies": 0.0, "rewards/chosen": 0.0113372802734375, "rewards/margins": -0.298095703125, "rewards/rejected": 0.309326171875, "step": 50 }, { "epoch": 0.03777777777777778, "grad_norm": 1.7600998878479004, "learning_rate": 9.62222222222222e-07, "logits/chosen": 1.37109375, "logits/rejected": 1.55859375, "logps/chosen": -25.671875, "logps/rejected": -26.75, "loss": 0.7358, "rewards/accuracies": 0.25, "rewards/chosen": -0.1396484375, "rewards/margins": -0.0771484375, "rewards/rejected": -0.0625, "step": 51 }, { "epoch": 0.03851851851851852, "grad_norm": 3.9829797744750977, "learning_rate": 9.614814814814814e-07, "logits/chosen": 1.0478515625, "logits/rejected": 1.326171875, "logps/chosen": -30.375, "logps/rejected": -65.1875, "loss": 1.0469, "rewards/accuracies": 0.25, "rewards/chosen": -0.251220703125, "rewards/margins": -0.54638671875, "rewards/rejected": 0.295166015625, "step": 52 }, { "epoch": 0.03925925925925926, "grad_norm": 1.6792786121368408, "learning_rate": 9.607407407407408e-07, "logits/chosen": 1.7880859375, "logits/rejected": 1.646484375, "logps/chosen": -27.671875, "logps/rejected": -45.40625, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 0.037109375, "rewards/margins": 0.138671875, "rewards/rejected": -0.1015625, "step": 53 }, { "epoch": 0.04, "grad_norm": 1.5386943817138672, "learning_rate": 9.6e-07, "logits/chosen": 1.5849609375, "logits/rejected": 1.3037109375, "logps/chosen": -31.484375, "logps/rejected": -29.28125, "loss": 0.6572, "rewards/accuracies": 0.75, "rewards/chosen": -0.0452880859375, "rewards/margins": 0.0758056640625, "rewards/rejected": -0.12109375, "step": 54 }, { "epoch": 0.040740740740740744, "grad_norm": 1.6544708013534546, "learning_rate": 9.592592592592592e-07, "logits/chosen": 1.5380859375, "logits/rejected": 2.025390625, "logps/chosen": -28.328125, "logps/rejected": -35.1875, "loss": 0.8608, "rewards/accuracies": 0.25, "rewards/chosen": -0.010345458984375, "rewards/margins": -0.225830078125, "rewards/rejected": 0.215576171875, "step": 55 }, { "epoch": 0.04148148148148148, "grad_norm": 1.7164744138717651, "learning_rate": 9.585185185185185e-07, "logits/chosen": 1.4638671875, "logits/rejected": 1.0400390625, "logps/chosen": -24.90625, "logps/rejected": -17.453125, "loss": 0.7349, "rewards/accuracies": 0.25, "rewards/chosen": -0.0816650390625, "rewards/margins": -0.0743408203125, "rewards/rejected": -0.00727081298828125, "step": 56 }, { "epoch": 0.042222222222222223, "grad_norm": 1.4448028802871704, "learning_rate": 9.577777777777778e-07, "logits/chosen": 1.986328125, "logits/rejected": 1.3505859375, "logps/chosen": -30.1875, "logps/rejected": -30.15625, "loss": 0.7334, "rewards/accuracies": 0.25, "rewards/chosen": 0.0113372802734375, "rewards/margins": -0.0758056640625, "rewards/rejected": 0.087158203125, "step": 57 }, { "epoch": 0.04296296296296296, "grad_norm": 2.4448392391204834, "learning_rate": 9.57037037037037e-07, "logits/chosen": 1.318359375, "logits/rejected": 1.271484375, "logps/chosen": -24.046875, "logps/rejected": -35.59375, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": 0.05230712890625, "rewards/margins": -0.09381103515625, "rewards/rejected": 0.1461181640625, "step": 58 }, { "epoch": 0.0437037037037037, "grad_norm": 3.23909068107605, "learning_rate": 9.562962962962963e-07, "logits/chosen": 2.08984375, "logits/rejected": 2.154296875, "logps/chosen": -52.5625, "logps/rejected": -24.734375, "loss": 0.6377, "rewards/accuracies": 0.75, "rewards/chosen": 0.06951904296875, "rewards/margins": 0.139892578125, "rewards/rejected": -0.0703125, "step": 59 }, { "epoch": 0.044444444444444446, "grad_norm": 4.1933979988098145, "learning_rate": 9.555555555555556e-07, "logits/chosen": 1.4462890625, "logits/rejected": 2.244140625, "logps/chosen": -52.4375, "logps/rejected": -30.46875, "loss": 0.8047, "rewards/accuracies": 0.25, "rewards/chosen": -0.05889892578125, "rewards/margins": -0.14208984375, "rewards/rejected": 0.083251953125, "step": 60 }, { "epoch": 0.04518518518518518, "grad_norm": 1.3689892292022705, "learning_rate": 9.548148148148147e-07, "logits/chosen": 1.3291015625, "logits/rejected": 1.544921875, "logps/chosen": -29.734375, "logps/rejected": -33.71875, "loss": 0.6582, "rewards/accuracies": 0.75, "rewards/chosen": 0.048431396484375, "rewards/margins": 0.078125, "rewards/rejected": -0.029693603515625, "step": 61 }, { "epoch": 0.045925925925925926, "grad_norm": 3.5360302925109863, "learning_rate": 9.54074074074074e-07, "logits/chosen": 1.404296875, "logits/rejected": 1.40625, "logps/chosen": -58.6875, "logps/rejected": -29.84375, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": -0.1038818359375, "rewards/margins": 0.0250244140625, "rewards/rejected": -0.12890625, "step": 62 }, { "epoch": 0.04666666666666667, "grad_norm": 2.857361078262329, "learning_rate": 9.533333333333333e-07, "logits/chosen": 1.572265625, "logits/rejected": 1.83203125, "logps/chosen": -83.375, "logps/rejected": -52.625, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 0.19140625, "rewards/margins": 0.048828125, "rewards/rejected": 0.142578125, "step": 63 }, { "epoch": 0.047407407407407405, "grad_norm": 2.7168045043945312, "learning_rate": 9.525925925925925e-07, "logits/chosen": 1.84375, "logits/rejected": 1.984375, "logps/chosen": -41.8125, "logps/rejected": -34.40625, "loss": 0.7539, "rewards/accuracies": 0.5, "rewards/chosen": -0.150390625, "rewards/margins": -0.077392578125, "rewards/rejected": -0.072998046875, "step": 64 }, { "epoch": 0.04814814814814815, "grad_norm": 1.7161030769348145, "learning_rate": 9.518518518518518e-07, "logits/chosen": 1.517578125, "logits/rejected": 1.5087890625, "logps/chosen": -39.59375, "logps/rejected": -52.1875, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": 0.6591796875, "rewards/margins": 0.8291015625, "rewards/rejected": -0.1702880859375, "step": 65 }, { "epoch": 0.04888888888888889, "grad_norm": 2.778795003890991, "learning_rate": 9.511111111111111e-07, "logits/chosen": 1.7421875, "logits/rejected": 2.2578125, "logps/chosen": -37.78125, "logps/rejected": -73.375, "loss": 0.7417, "rewards/accuracies": 0.25, "rewards/chosen": 0.07891845703125, "rewards/margins": -0.09100341796875, "rewards/rejected": 0.169921875, "step": 66 }, { "epoch": 0.04962962962962963, "grad_norm": 1.6289427280426025, "learning_rate": 9.503703703703704e-07, "logits/chosen": 1.642578125, "logits/rejected": 1.599609375, "logps/chosen": -24.109375, "logps/rejected": -32.25, "loss": 0.5137, "rewards/accuracies": 1.0, "rewards/chosen": 0.0972900390625, "rewards/margins": 0.420654296875, "rewards/rejected": -0.323486328125, "step": 67 }, { "epoch": 0.05037037037037037, "grad_norm": 1.9179104566574097, "learning_rate": 9.496296296296295e-07, "logits/chosen": 1.7392578125, "logits/rejected": 1.46875, "logps/chosen": -37.0, "logps/rejected": -45.8125, "loss": 0.7808, "rewards/accuracies": 0.25, "rewards/chosen": 0.08514404296875, "rewards/margins": -0.155517578125, "rewards/rejected": 0.2406005859375, "step": 68 }, { "epoch": 0.051111111111111114, "grad_norm": 1.8439216613769531, "learning_rate": 9.488888888888888e-07, "logits/chosen": 1.3671875, "logits/rejected": 1.7548828125, "logps/chosen": -36.5625, "logps/rejected": -33.09375, "loss": 0.5762, "rewards/accuracies": 0.5, "rewards/chosen": -0.081298828125, "rewards/margins": 0.368408203125, "rewards/rejected": -0.44970703125, "step": 69 }, { "epoch": 0.05185185185185185, "grad_norm": 1.7316319942474365, "learning_rate": 9.481481481481481e-07, "logits/chosen": 1.361328125, "logits/rejected": 1.791015625, "logps/chosen": -30.046875, "logps/rejected": -41.84375, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": 0.103515625, "rewards/margins": 0.1500244140625, "rewards/rejected": -0.0465087890625, "step": 70 }, { "epoch": 0.052592592592592594, "grad_norm": 1.7266093492507935, "learning_rate": 9.474074074074073e-07, "logits/chosen": 1.7021484375, "logits/rejected": 1.779296875, "logps/chosen": -36.84375, "logps/rejected": -65.0, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 0.141357421875, "rewards/margins": 0.2939453125, "rewards/rejected": -0.1527099609375, "step": 71 }, { "epoch": 0.05333333333333334, "grad_norm": 1.1331974267959595, "learning_rate": 9.466666666666666e-07, "logits/chosen": 0.978515625, "logits/rejected": 1.259765625, "logps/chosen": -31.78125, "logps/rejected": -37.625, "loss": 0.4963, "rewards/accuracies": 1.0, "rewards/chosen": 0.2254638671875, "rewards/margins": 0.44677734375, "rewards/rejected": -0.22119140625, "step": 72 }, { "epoch": 0.05407407407407407, "grad_norm": 2.172950029373169, "learning_rate": 9.45925925925926e-07, "logits/chosen": 1.1943359375, "logits/rejected": 1.732421875, "logps/chosen": -18.53125, "logps/rejected": -35.25, "loss": 0.7139, "rewards/accuracies": 0.0, "rewards/chosen": -0.0865478515625, "rewards/margins": -0.040802001953125, "rewards/rejected": -0.04571533203125, "step": 73 }, { "epoch": 0.054814814814814816, "grad_norm": 1.4130868911743164, "learning_rate": 9.451851851851852e-07, "logits/chosen": 1.2705078125, "logits/rejected": 1.994140625, "logps/chosen": -22.546875, "logps/rejected": -25.234375, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": 0.0703125, "rewards/margins": -0.019927978515625, "rewards/rejected": 0.0902099609375, "step": 74 }, { "epoch": 0.05555555555555555, "grad_norm": 1.7967153787612915, "learning_rate": 9.444444444444444e-07, "logits/chosen": 1.5634765625, "logits/rejected": 1.1845703125, "logps/chosen": -37.4375, "logps/rejected": -39.125, "loss": 0.645, "rewards/accuracies": 0.75, "rewards/chosen": 0.173095703125, "rewards/margins": 0.1229248046875, "rewards/rejected": 0.050048828125, "step": 75 }, { "epoch": 0.056296296296296296, "grad_norm": 1.9167500734329224, "learning_rate": 9.437037037037037e-07, "logits/chosen": 1.2607421875, "logits/rejected": 1.8671875, "logps/chosen": -32.09375, "logps/rejected": -42.21875, "loss": 0.5342, "rewards/accuracies": 0.5, "rewards/chosen": 0.775390625, "rewards/margins": 0.79443359375, "rewards/rejected": -0.0195465087890625, "step": 76 }, { "epoch": 0.05703703703703704, "grad_norm": 5.336838722229004, "learning_rate": 9.42962962962963e-07, "logits/chosen": 1.6142578125, "logits/rejected": 2.095703125, "logps/chosen": -32.6875, "logps/rejected": -76.0, "loss": 1.0049, "rewards/accuracies": 0.25, "rewards/chosen": 0.01444244384765625, "rewards/margins": -0.452392578125, "rewards/rejected": 0.466796875, "step": 77 }, { "epoch": 0.057777777777777775, "grad_norm": 2.286437749862671, "learning_rate": 9.422222222222222e-07, "logits/chosen": 1.5224609375, "logits/rejected": 1.90625, "logps/chosen": -37.59375, "logps/rejected": -39.0, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": 0.001953125, "rewards/margins": -0.01055908203125, "rewards/rejected": 0.0124969482421875, "step": 78 }, { "epoch": 0.05851851851851852, "grad_norm": 2.4176154136657715, "learning_rate": 9.414814814814814e-07, "logits/chosen": 1.509765625, "logits/rejected": 1.548828125, "logps/chosen": -35.90625, "logps/rejected": -38.96875, "loss": 1.0078, "rewards/accuracies": 0.25, "rewards/chosen": -0.384765625, "rewards/margins": -0.447265625, "rewards/rejected": 0.0625, "step": 79 }, { "epoch": 0.05925925925925926, "grad_norm": 3.1612393856048584, "learning_rate": 9.407407407407407e-07, "logits/chosen": 2.3515625, "logits/rejected": 1.6943359375, "logps/chosen": -28.890625, "logps/rejected": -74.0, "loss": 0.8984, "rewards/accuracies": 0.25, "rewards/chosen": -0.05194091796875, "rewards/margins": -0.334716796875, "rewards/rejected": 0.282958984375, "step": 80 }, { "epoch": 0.06, "grad_norm": 1.7341508865356445, "learning_rate": 9.399999999999999e-07, "logits/chosen": 1.8115234375, "logits/rejected": 2.12109375, "logps/chosen": -23.953125, "logps/rejected": -44.0625, "loss": 0.7607, "rewards/accuracies": 0.25, "rewards/chosen": -0.13671875, "rewards/margins": -0.12310791015625, "rewards/rejected": -0.01367950439453125, "step": 81 }, { "epoch": 0.06074074074074074, "grad_norm": 1.1564910411834717, "learning_rate": 9.392592592592592e-07, "logits/chosen": 1.5087890625, "logits/rejected": 1.05078125, "logps/chosen": -23.703125, "logps/rejected": -31.375, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.2081298828125, "rewards/margins": 0.262939453125, "rewards/rejected": -0.0546875, "step": 82 }, { "epoch": 0.061481481481481484, "grad_norm": 4.601666450500488, "learning_rate": 9.385185185185185e-07, "logits/chosen": 1.4638671875, "logits/rejected": 1.56640625, "logps/chosen": -46.0, "logps/rejected": -99.125, "loss": 0.813, "rewards/accuracies": 0.75, "rewards/chosen": 0.134521484375, "rewards/margins": -0.11376953125, "rewards/rejected": 0.2484130859375, "step": 83 }, { "epoch": 0.06222222222222222, "grad_norm": 1.8301589488983154, "learning_rate": 9.377777777777777e-07, "logits/chosen": 0.95068359375, "logits/rejected": 1.142578125, "logps/chosen": -30.96875, "logps/rejected": -47.4375, "loss": 0.7422, "rewards/accuracies": 0.5, "rewards/chosen": -0.01015472412109375, "rewards/margins": -0.0859375, "rewards/rejected": 0.0758056640625, "step": 84 }, { "epoch": 0.06296296296296296, "grad_norm": 1.4311474561691284, "learning_rate": 9.370370370370369e-07, "logits/chosen": 1.2001953125, "logits/rejected": 1.59375, "logps/chosen": -29.75, "logps/rejected": -36.5, "loss": 0.6196, "rewards/accuracies": 1.0, "rewards/chosen": 0.076171875, "rewards/margins": 0.1539306640625, "rewards/rejected": -0.0777587890625, "step": 85 }, { "epoch": 0.0637037037037037, "grad_norm": 2.13525128364563, "learning_rate": 9.362962962962962e-07, "logits/chosen": 2.06640625, "logits/rejected": 1.912109375, "logps/chosen": -29.5625, "logps/rejected": -33.40625, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.08905029296875, "rewards/margins": -0.0086212158203125, "rewards/rejected": 0.09765625, "step": 86 }, { "epoch": 0.06444444444444444, "grad_norm": 3.512944459915161, "learning_rate": 9.355555555555556e-07, "logits/chosen": 1.4404296875, "logits/rejected": 1.6435546875, "logps/chosen": -24.21875, "logps/rejected": -28.578125, "loss": 0.9419, "rewards/accuracies": 0.5, "rewards/chosen": -0.06719970703125, "rewards/margins": -0.293212890625, "rewards/rejected": 0.225830078125, "step": 87 }, { "epoch": 0.06518518518518518, "grad_norm": 1.5494365692138672, "learning_rate": 9.348148148148148e-07, "logits/chosen": 1.8623046875, "logits/rejected": 1.4443359375, "logps/chosen": -23.84375, "logps/rejected": -39.21875, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": 0.046875, "rewards/margins": 0.046875, "rewards/rejected": 1.1444091796875e-05, "step": 88 }, { "epoch": 0.06592592592592593, "grad_norm": 2.470848321914673, "learning_rate": 9.34074074074074e-07, "logits/chosen": 1.623046875, "logits/rejected": 1.8974609375, "logps/chosen": -58.96875, "logps/rejected": -44.75, "loss": 0.6108, "rewards/accuracies": 0.75, "rewards/chosen": 0.1507568359375, "rewards/margins": 0.2235107421875, "rewards/rejected": -0.0726318359375, "step": 89 }, { "epoch": 0.06666666666666667, "grad_norm": 1.8561320304870605, "learning_rate": 9.333333333333333e-07, "logits/chosen": 1.486328125, "logits/rejected": 1.8076171875, "logps/chosen": -39.25, "logps/rejected": -66.75, "loss": 0.519, "rewards/accuracies": 1.0, "rewards/chosen": 0.344482421875, "rewards/margins": 0.41748046875, "rewards/rejected": -0.072998046875, "step": 90 }, { "epoch": 0.0674074074074074, "grad_norm": 1.8979724645614624, "learning_rate": 9.325925925925926e-07, "logits/chosen": 2.0625, "logits/rejected": 1.8125, "logps/chosen": -28.453125, "logps/rejected": -41.75, "loss": 0.8457, "rewards/accuracies": 0.25, "rewards/chosen": -0.0439453125, "rewards/margins": -0.2646484375, "rewards/rejected": 0.220703125, "step": 91 }, { "epoch": 0.06814814814814815, "grad_norm": 1.9745796918869019, "learning_rate": 9.318518518518518e-07, "logits/chosen": 1.923828125, "logits/rejected": 1.416015625, "logps/chosen": -45.1875, "logps/rejected": -51.78125, "loss": 0.624, "rewards/accuracies": 0.75, "rewards/chosen": 0.1171875, "rewards/margins": 0.169189453125, "rewards/rejected": -0.051971435546875, "step": 92 }, { "epoch": 0.06888888888888889, "grad_norm": 1.8896679878234863, "learning_rate": 9.311111111111111e-07, "logits/chosen": 1.482421875, "logits/rejected": 1.626953125, "logps/chosen": -28.453125, "logps/rejected": -60.03125, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": -0.0222625732421875, "rewards/margins": 0.0234222412109375, "rewards/rejected": -0.045684814453125, "step": 93 }, { "epoch": 0.06962962962962962, "grad_norm": 2.1872029304504395, "learning_rate": 9.303703703703703e-07, "logits/chosen": 1.4306640625, "logits/rejected": 1.8828125, "logps/chosen": -26.90625, "logps/rejected": -34.0625, "loss": 0.6196, "rewards/accuracies": 0.5, "rewards/chosen": 0.003322601318359375, "rewards/margins": 0.316162109375, "rewards/rejected": -0.312744140625, "step": 94 }, { "epoch": 0.07037037037037037, "grad_norm": 1.7848429679870605, "learning_rate": 9.296296296296295e-07, "logits/chosen": 1.4775390625, "logits/rejected": 1.4423828125, "logps/chosen": -35.0, "logps/rejected": -52.75, "loss": 0.6626, "rewards/accuracies": 0.5, "rewards/chosen": 0.138916015625, "rewards/margins": 0.077880859375, "rewards/rejected": 0.06097412109375, "step": 95 }, { "epoch": 0.07111111111111111, "grad_norm": 1.578696608543396, "learning_rate": 9.288888888888888e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.681640625, "logps/chosen": -30.75, "logps/rejected": -36.46875, "loss": 0.5991, "rewards/accuracies": 0.75, "rewards/chosen": -0.1285400390625, "rewards/margins": 0.209716796875, "rewards/rejected": -0.338134765625, "step": 96 }, { "epoch": 0.07185185185185185, "grad_norm": 2.6949567794799805, "learning_rate": 9.281481481481481e-07, "logits/chosen": 2.12109375, "logits/rejected": 1.6640625, "logps/chosen": -32.15625, "logps/rejected": -62.96875, "loss": 2.2188, "rewards/accuracies": 0.5, "rewards/chosen": 0.059783935546875, "rewards/margins": -1.66796875, "rewards/rejected": 1.7275390625, "step": 97 }, { "epoch": 0.0725925925925926, "grad_norm": 3.0538835525512695, "learning_rate": 9.274074074074074e-07, "logits/chosen": 1.3671875, "logits/rejected": 1.62109375, "logps/chosen": -41.1875, "logps/rejected": -49.71875, "loss": 0.5088, "rewards/accuracies": 0.75, "rewards/chosen": 0.6259765625, "rewards/margins": 0.5146484375, "rewards/rejected": 0.1109619140625, "step": 98 }, { "epoch": 0.07333333333333333, "grad_norm": 1.9351308345794678, "learning_rate": 9.266666666666665e-07, "logits/chosen": 1.19140625, "logits/rejected": 1.693359375, "logps/chosen": -22.5625, "logps/rejected": -33.0, "loss": 0.876, "rewards/accuracies": 0.25, "rewards/chosen": -0.21826171875, "rewards/margins": -0.31298828125, "rewards/rejected": 0.0946044921875, "step": 99 }, { "epoch": 0.07407407407407407, "grad_norm": 2.076357126235962, "learning_rate": 9.259259259259259e-07, "logits/chosen": 1.4912109375, "logits/rejected": 1.8935546875, "logps/chosen": -33.34375, "logps/rejected": -41.125, "loss": 0.7471, "rewards/accuracies": 0.25, "rewards/chosen": 0.059356689453125, "rewards/margins": -0.0777587890625, "rewards/rejected": 0.1370849609375, "step": 100 }, { "epoch": 0.07481481481481482, "grad_norm": 1.7061930894851685, "learning_rate": 9.251851851851852e-07, "logits/chosen": 1.796875, "logits/rejected": 0.97900390625, "logps/chosen": -34.5, "logps/rejected": -35.34375, "loss": 0.5552, "rewards/accuracies": 0.75, "rewards/chosen": 0.11700439453125, "rewards/margins": 0.33447265625, "rewards/rejected": -0.217529296875, "step": 101 }, { "epoch": 0.07555555555555556, "grad_norm": 1.2975479364395142, "learning_rate": 9.244444444444444e-07, "logits/chosen": 1.337890625, "logits/rejected": 0.409423828125, "logps/chosen": -31.890625, "logps/rejected": -43.03125, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": 0.26220703125, "rewards/margins": 0.263671875, "rewards/rejected": -0.0015869140625, "step": 102 }, { "epoch": 0.07629629629629629, "grad_norm": 2.0951430797576904, "learning_rate": 9.237037037037037e-07, "logits/chosen": 1.29296875, "logits/rejected": 0.8916015625, "logps/chosen": -27.796875, "logps/rejected": -61.84375, "loss": 0.7388, "rewards/accuracies": 0.25, "rewards/chosen": 0.041412353515625, "rewards/margins": -0.08477783203125, "rewards/rejected": 0.1260986328125, "step": 103 }, { "epoch": 0.07703703703703704, "grad_norm": 1.942586064338684, "learning_rate": 9.22962962962963e-07, "logits/chosen": 1.7861328125, "logits/rejected": 2.27734375, "logps/chosen": -36.34375, "logps/rejected": -45.09375, "loss": 0.8633, "rewards/accuracies": 0.25, "rewards/chosen": -0.0953369140625, "rewards/margins": -0.300048828125, "rewards/rejected": 0.2047119140625, "step": 104 }, { "epoch": 0.07777777777777778, "grad_norm": 1.7574442625045776, "learning_rate": 9.222222222222222e-07, "logits/chosen": 1.732421875, "logits/rejected": 1.9423828125, "logps/chosen": -25.65625, "logps/rejected": -66.9375, "loss": 0.7246, "rewards/accuracies": 0.75, "rewards/chosen": 0.06488037109375, "rewards/margins": -0.030517578125, "rewards/rejected": 0.0953369140625, "step": 105 }, { "epoch": 0.07851851851851852, "grad_norm": 1.6302132606506348, "learning_rate": 9.214814814814814e-07, "logits/chosen": 1.7958984375, "logits/rejected": 2.45703125, "logps/chosen": -18.609375, "logps/rejected": -41.90625, "loss": 0.522, "rewards/accuracies": 0.75, "rewards/chosen": 0.04766845703125, "rewards/margins": 0.4580078125, "rewards/rejected": -0.41015625, "step": 106 }, { "epoch": 0.07925925925925927, "grad_norm": 1.6446911096572876, "learning_rate": 9.207407407407407e-07, "logits/chosen": 1.8349609375, "logits/rejected": 1.814453125, "logps/chosen": -30.5, "logps/rejected": -45.8125, "loss": 0.7256, "rewards/accuracies": 0.75, "rewards/chosen": -0.0625, "rewards/margins": -0.049224853515625, "rewards/rejected": -0.013275146484375, "step": 107 }, { "epoch": 0.08, "grad_norm": 1.7142443656921387, "learning_rate": 9.2e-07, "logits/chosen": 1.294921875, "logits/rejected": 1.6875, "logps/chosen": -23.109375, "logps/rejected": -41.78125, "loss": 0.7515, "rewards/accuracies": 0.5, "rewards/chosen": -0.09490966796875, "rewards/margins": -0.0968017578125, "rewards/rejected": 0.001949310302734375, "step": 108 }, { "epoch": 0.08074074074074074, "grad_norm": 1.392536997795105, "learning_rate": 9.192592592592592e-07, "logits/chosen": 1.5048828125, "logits/rejected": 1.4248046875, "logps/chosen": -24.5, "logps/rejected": -33.34375, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": 0.0670166015625, "rewards/margins": 0.03851318359375, "rewards/rejected": 0.02850341796875, "step": 109 }, { "epoch": 0.08148148148148149, "grad_norm": 2.0637660026550293, "learning_rate": 9.185185185185184e-07, "logits/chosen": 1.3876953125, "logits/rejected": 2.404296875, "logps/chosen": -31.96875, "logps/rejected": -54.09375, "loss": 0.5449, "rewards/accuracies": 0.75, "rewards/chosen": 0.0804443359375, "rewards/margins": 0.483642578125, "rewards/rejected": -0.4033203125, "step": 110 }, { "epoch": 0.08222222222222222, "grad_norm": 2.1035854816436768, "learning_rate": 9.177777777777777e-07, "logits/chosen": 1.693359375, "logits/rejected": 1.09765625, "logps/chosen": -32.59375, "logps/rejected": -62.375, "loss": 0.6167, "rewards/accuracies": 0.75, "rewards/chosen": 0.042572021484375, "rewards/margins": 0.2113037109375, "rewards/rejected": -0.168701171875, "step": 111 }, { "epoch": 0.08296296296296296, "grad_norm": 6.183417797088623, "learning_rate": 9.170370370370369e-07, "logits/chosen": 1.6279296875, "logits/rejected": 1.7626953125, "logps/chosen": -41.28125, "logps/rejected": -82.25, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": -0.0953369140625, "rewards/margins": -0.0625, "rewards/rejected": -0.0328369140625, "step": 112 }, { "epoch": 0.0837037037037037, "grad_norm": 1.8894670009613037, "learning_rate": 9.162962962962963e-07, "logits/chosen": 1.4833984375, "logits/rejected": 1.6357421875, "logps/chosen": -27.078125, "logps/rejected": -46.9375, "loss": 0.7588, "rewards/accuracies": 0.5, "rewards/chosen": -0.0091552734375, "rewards/margins": -0.11383056640625, "rewards/rejected": 0.10467529296875, "step": 113 }, { "epoch": 0.08444444444444445, "grad_norm": 1.5271714925765991, "learning_rate": 9.155555555555556e-07, "logits/chosen": 1.1279296875, "logits/rejected": 1.025390625, "logps/chosen": -29.09375, "logps/rejected": -42.125, "loss": 0.667, "rewards/accuracies": 0.75, "rewards/chosen": 0.04803466796875, "rewards/margins": 0.05621337890625, "rewards/rejected": -0.008209228515625, "step": 114 }, { "epoch": 0.08518518518518518, "grad_norm": 1.7770020961761475, "learning_rate": 9.148148148148148e-07, "logits/chosen": 1.068359375, "logits/rejected": 1.208984375, "logps/chosen": -33.71875, "logps/rejected": -60.09375, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.1480712890625, "rewards/margins": -0.0152130126953125, "rewards/rejected": 0.163330078125, "step": 115 }, { "epoch": 0.08592592592592592, "grad_norm": 10.570402145385742, "learning_rate": 9.14074074074074e-07, "logits/chosen": 1.455078125, "logits/rejected": 1.4873046875, "logps/chosen": -38.25, "logps/rejected": -41.4375, "loss": 0.7842, "rewards/accuracies": 0.75, "rewards/chosen": 0.0777587890625, "rewards/margins": -0.054290771484375, "rewards/rejected": 0.132080078125, "step": 116 }, { "epoch": 0.08666666666666667, "grad_norm": 1.9264127016067505, "learning_rate": 9.133333333333333e-07, "logits/chosen": 1.9677734375, "logits/rejected": 1.16796875, "logps/chosen": -35.21875, "logps/rejected": -41.875, "loss": 0.5913, "rewards/accuracies": 1.0, "rewards/chosen": 0.181884765625, "rewards/margins": 0.2252197265625, "rewards/rejected": -0.043365478515625, "step": 117 }, { "epoch": 0.0874074074074074, "grad_norm": 1.9232323169708252, "learning_rate": 9.125925925925926e-07, "logits/chosen": 1.462890625, "logits/rejected": 1.392578125, "logps/chosen": -31.78125, "logps/rejected": -28.671875, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": 0.0107574462890625, "rewards/margins": 0.1405029296875, "rewards/rejected": -0.1297607421875, "step": 118 }, { "epoch": 0.08814814814814814, "grad_norm": 1.3664600849151611, "learning_rate": 9.118518518518518e-07, "logits/chosen": 1.3857421875, "logits/rejected": 1.3974609375, "logps/chosen": -24.40625, "logps/rejected": -30.734375, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": 0.01172637939453125, "rewards/margins": -0.0241851806640625, "rewards/rejected": 0.035919189453125, "step": 119 }, { "epoch": 0.08888888888888889, "grad_norm": 1.5009490251541138, "learning_rate": 9.11111111111111e-07, "logits/chosen": 1.2158203125, "logits/rejected": 1.4716796875, "logps/chosen": -36.1875, "logps/rejected": -50.1875, "loss": 0.6587, "rewards/accuracies": 0.75, "rewards/chosen": -0.0328369140625, "rewards/margins": 0.0726318359375, "rewards/rejected": -0.10546875, "step": 120 }, { "epoch": 0.08962962962962963, "grad_norm": 8.541365623474121, "learning_rate": 9.103703703703703e-07, "logits/chosen": 1.728515625, "logits/rejected": 2.3515625, "logps/chosen": -25.515625, "logps/rejected": -51.625, "loss": 0.7764, "rewards/accuracies": 0.5, "rewards/chosen": -0.18701171875, "rewards/margins": 0.1060791015625, "rewards/rejected": -0.29296875, "step": 121 }, { "epoch": 0.09037037037037036, "grad_norm": 1.5448198318481445, "learning_rate": 9.096296296296296e-07, "logits/chosen": 1.474609375, "logits/rejected": 1.69921875, "logps/chosen": -51.75, "logps/rejected": -43.96875, "loss": 0.4602, "rewards/accuracies": 0.75, "rewards/chosen": 0.623046875, "rewards/margins": 0.7177734375, "rewards/rejected": -0.0948486328125, "step": 122 }, { "epoch": 0.09111111111111111, "grad_norm": 3.565706491470337, "learning_rate": 9.088888888888888e-07, "logits/chosen": 1.935546875, "logits/rejected": 1.8154296875, "logps/chosen": -26.5, "logps/rejected": -28.8125, "loss": 0.7925, "rewards/accuracies": 0.0, "rewards/chosen": -0.1429443359375, "rewards/margins": -0.18408203125, "rewards/rejected": 0.041046142578125, "step": 123 }, { "epoch": 0.09185185185185185, "grad_norm": 2.8574256896972656, "learning_rate": 9.081481481481481e-07, "logits/chosen": 1.5361328125, "logits/rejected": 0.98388671875, "logps/chosen": -40.75, "logps/rejected": -42.25, "loss": 0.8662, "rewards/accuracies": 0.25, "rewards/chosen": -0.14599609375, "rewards/margins": -0.282958984375, "rewards/rejected": 0.136962890625, "step": 124 }, { "epoch": 0.09259259259259259, "grad_norm": 2.5346219539642334, "learning_rate": 9.074074074074074e-07, "logits/chosen": 1.58984375, "logits/rejected": 2.375, "logps/chosen": -37.78125, "logps/rejected": -32.03125, "loss": 0.9326, "rewards/accuracies": 0.0, "rewards/chosen": -0.316650390625, "rewards/margins": -0.42333984375, "rewards/rejected": 0.106689453125, "step": 125 }, { "epoch": 0.09333333333333334, "grad_norm": 1.7480666637420654, "learning_rate": 9.066666666666665e-07, "logits/chosen": 1.1708984375, "logits/rejected": 1.5283203125, "logps/chosen": -51.3125, "logps/rejected": -28.953125, "loss": 0.6577, "rewards/accuracies": 1.0, "rewards/chosen": -0.015625, "rewards/margins": 0.07342529296875, "rewards/rejected": -0.08905029296875, "step": 126 }, { "epoch": 0.09407407407407407, "grad_norm": 1.8191701173782349, "learning_rate": 9.059259259259259e-07, "logits/chosen": 1.28515625, "logits/rejected": 1.5390625, "logps/chosen": -30.484375, "logps/rejected": -79.5625, "loss": 0.5962, "rewards/accuracies": 0.75, "rewards/chosen": 0.0902099609375, "rewards/margins": 0.2109375, "rewards/rejected": -0.1207275390625, "step": 127 }, { "epoch": 0.09481481481481481, "grad_norm": 1.8807669878005981, "learning_rate": 9.051851851851852e-07, "logits/chosen": 1.5185546875, "logits/rejected": 2.21875, "logps/chosen": -24.859375, "logps/rejected": -40.46875, "loss": 0.8057, "rewards/accuracies": 0.5, "rewards/chosen": -0.117919921875, "rewards/margins": -0.1983642578125, "rewards/rejected": 0.08050537109375, "step": 128 }, { "epoch": 0.09555555555555556, "grad_norm": 4.491909027099609, "learning_rate": 9.044444444444445e-07, "logits/chosen": 0.8740234375, "logits/rejected": 1.3388671875, "logps/chosen": -24.125, "logps/rejected": -31.3125, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": -0.0621337890625, "rewards/margins": -0.00274658203125, "rewards/rejected": -0.05938720703125, "step": 129 }, { "epoch": 0.0962962962962963, "grad_norm": 1.5080593824386597, "learning_rate": 9.037037037037037e-07, "logits/chosen": 1.6689453125, "logits/rejected": 2.40625, "logps/chosen": -29.921875, "logps/rejected": -22.71875, "loss": 0.8057, "rewards/accuracies": 0.25, "rewards/chosen": -0.052734375, "rewards/margins": -0.200439453125, "rewards/rejected": 0.147705078125, "step": 130 }, { "epoch": 0.09703703703703703, "grad_norm": 2.041835069656372, "learning_rate": 9.029629629629629e-07, "logits/chosen": 1.4423828125, "logits/rejected": 1.5732421875, "logps/chosen": -42.5, "logps/rejected": -48.5625, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": 0.0171966552734375, "rewards/margins": -0.10821533203125, "rewards/rejected": 0.1253662109375, "step": 131 }, { "epoch": 0.09777777777777778, "grad_norm": 1.6740189790725708, "learning_rate": 9.022222222222222e-07, "logits/chosen": 0.99853515625, "logits/rejected": 1.5205078125, "logps/chosen": -45.03125, "logps/rejected": -42.0, "loss": 0.5176, "rewards/accuracies": 0.5, "rewards/chosen": 0.202392578125, "rewards/margins": 0.62890625, "rewards/rejected": -0.426513671875, "step": 132 }, { "epoch": 0.09851851851851852, "grad_norm": 1.8791605234146118, "learning_rate": 9.014814814814814e-07, "logits/chosen": 1.8408203125, "logits/rejected": 2.0390625, "logps/chosen": -33.8125, "logps/rejected": -61.9375, "loss": 0.5977, "rewards/accuracies": 0.25, "rewards/chosen": -0.089111328125, "rewards/margins": 0.47265625, "rewards/rejected": -0.5615234375, "step": 133 }, { "epoch": 0.09925925925925926, "grad_norm": 2.5299437046051025, "learning_rate": 9.007407407407407e-07, "logits/chosen": 1.6044921875, "logits/rejected": 2.197265625, "logps/chosen": -61.65625, "logps/rejected": -60.6875, "loss": 0.7554, "rewards/accuracies": 0.5, "rewards/chosen": -0.1668701171875, "rewards/margins": -0.1011962890625, "rewards/rejected": -0.065673828125, "step": 134 }, { "epoch": 0.1, "grad_norm": 3.061716079711914, "learning_rate": 9e-07, "logits/chosen": 1.9697265625, "logits/rejected": 0.92724609375, "logps/chosen": -49.75, "logps/rejected": -53.71875, "loss": 0.665, "rewards/accuracies": 0.5, "rewards/chosen": 0.458740234375, "rewards/margins": 0.342529296875, "rewards/rejected": 0.11639404296875, "step": 135 }, { "epoch": 0.10074074074074074, "grad_norm": 1.7938332557678223, "learning_rate": 8.992592592592592e-07, "logits/chosen": 1.4296875, "logits/rejected": 1.7841796875, "logps/chosen": -28.34375, "logps/rejected": -51.6875, "loss": 0.7021, "rewards/accuracies": 0.5, "rewards/chosen": 0.056243896484375, "rewards/margins": -0.0148468017578125, "rewards/rejected": 0.07110595703125, "step": 136 }, { "epoch": 0.10148148148148148, "grad_norm": 1.6767981052398682, "learning_rate": 8.985185185185184e-07, "logits/chosen": 1.0791015625, "logits/rejected": 0.8896484375, "logps/chosen": -24.0, "logps/rejected": -35.8125, "loss": 0.8623, "rewards/accuracies": 0.25, "rewards/chosen": 0.00839996337890625, "rewards/margins": -0.29638671875, "rewards/rejected": 0.3046875, "step": 137 }, { "epoch": 0.10222222222222223, "grad_norm": 6.019056797027588, "learning_rate": 8.977777777777777e-07, "logits/chosen": 1.552734375, "logits/rejected": 1.427734375, "logps/chosen": -26.125, "logps/rejected": -39.9375, "loss": 0.8564, "rewards/accuracies": 0.25, "rewards/chosen": -0.00079345703125, "rewards/margins": -0.26171875, "rewards/rejected": 0.260986328125, "step": 138 }, { "epoch": 0.10296296296296296, "grad_norm": 1.7942883968353271, "learning_rate": 8.970370370370371e-07, "logits/chosen": 1.685546875, "logits/rejected": 1.66015625, "logps/chosen": -33.96875, "logps/rejected": -44.21875, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": -0.033966064453125, "rewards/margins": 0.1663818359375, "rewards/rejected": -0.2003173828125, "step": 139 }, { "epoch": 0.1037037037037037, "grad_norm": 2.0401008129119873, "learning_rate": 8.962962962962963e-07, "logits/chosen": 1.119140625, "logits/rejected": 1.671875, "logps/chosen": -28.125, "logps/rejected": -55.09375, "loss": 0.7637, "rewards/accuracies": 0.0, "rewards/chosen": -0.060150146484375, "rewards/margins": -0.1343994140625, "rewards/rejected": 0.07421875, "step": 140 }, { "epoch": 0.10444444444444445, "grad_norm": 1.2732681035995483, "learning_rate": 8.955555555555555e-07, "logits/chosen": 1.509765625, "logits/rejected": 1.4814453125, "logps/chosen": -28.953125, "logps/rejected": -31.53125, "loss": 0.5933, "rewards/accuracies": 1.0, "rewards/chosen": 0.2340087890625, "rewards/margins": 0.220458984375, "rewards/rejected": 0.01364898681640625, "step": 141 }, { "epoch": 0.10518518518518519, "grad_norm": 2.8495545387268066, "learning_rate": 8.948148148148148e-07, "logits/chosen": 2.17578125, "logits/rejected": 1.451171875, "logps/chosen": -34.90625, "logps/rejected": -39.125, "loss": 0.7339, "rewards/accuracies": 0.5, "rewards/chosen": -0.03436279296875, "rewards/margins": -0.0714111328125, "rewards/rejected": 0.037109375, "step": 142 }, { "epoch": 0.10592592592592592, "grad_norm": 1.601641297340393, "learning_rate": 8.94074074074074e-07, "logits/chosen": 0.99072265625, "logits/rejected": 1.7138671875, "logps/chosen": -26.9375, "logps/rejected": -27.34375, "loss": 0.7949, "rewards/accuracies": 0.0, "rewards/chosen": 0.045318603515625, "rewards/margins": -0.1929931640625, "rewards/rejected": 0.23828125, "step": 143 }, { "epoch": 0.10666666666666667, "grad_norm": 1.6668552160263062, "learning_rate": 8.933333333333333e-07, "logits/chosen": 1.42578125, "logits/rejected": 1.619140625, "logps/chosen": -28.359375, "logps/rejected": -46.65625, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": -0.082763671875, "rewards/margins": 0.042999267578125, "rewards/rejected": -0.1258544921875, "step": 144 }, { "epoch": 0.10740740740740741, "grad_norm": 1.5135114192962646, "learning_rate": 8.925925925925926e-07, "logits/chosen": 1.623046875, "logits/rejected": 1.1337890625, "logps/chosen": -27.4375, "logps/rejected": -25.671875, "loss": 0.7651, "rewards/accuracies": 0.5, "rewards/chosen": -0.06561279296875, "rewards/margins": -0.122314453125, "rewards/rejected": 0.056640625, "step": 145 }, { "epoch": 0.10814814814814815, "grad_norm": 1.6372497081756592, "learning_rate": 8.918518518518518e-07, "logits/chosen": 1.6962890625, "logits/rejected": 1.603515625, "logps/chosen": -24.703125, "logps/rejected": -44.28125, "loss": 0.6309, "rewards/accuracies": 0.5, "rewards/chosen": 0.0924072265625, "rewards/margins": 0.1396484375, "rewards/rejected": -0.047271728515625, "step": 146 }, { "epoch": 0.10888888888888888, "grad_norm": 4.837470054626465, "learning_rate": 8.91111111111111e-07, "logits/chosen": 2.630859375, "logits/rejected": 1.8408203125, "logps/chosen": -39.9375, "logps/rejected": -36.5, "loss": 1.1836, "rewards/accuracies": 0.0, "rewards/chosen": -0.60888671875, "rewards/margins": -0.7109375, "rewards/rejected": 0.1019287109375, "step": 147 }, { "epoch": 0.10962962962962963, "grad_norm": 1.9674649238586426, "learning_rate": 8.903703703703703e-07, "logits/chosen": 1.642578125, "logits/rejected": 2.119140625, "logps/chosen": -32.5, "logps/rejected": -52.9375, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": -0.044525146484375, "rewards/margins": 0.03515625, "rewards/rejected": -0.0797119140625, "step": 148 }, { "epoch": 0.11037037037037037, "grad_norm": 1.4145994186401367, "learning_rate": 8.896296296296296e-07, "logits/chosen": 1.890625, "logits/rejected": 1.62890625, "logps/chosen": -34.1875, "logps/rejected": -42.34375, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.02496337890625, "rewards/margins": 0.07891845703125, "rewards/rejected": -0.053924560546875, "step": 149 }, { "epoch": 0.1111111111111111, "grad_norm": 2.9220926761627197, "learning_rate": 8.888888888888888e-07, "logits/chosen": 1.3154296875, "logits/rejected": 1.8681640625, "logps/chosen": -52.875, "logps/rejected": -65.5, "loss": 1.0332, "rewards/accuracies": 0.75, "rewards/chosen": 0.288330078125, "rewards/margins": -0.38623046875, "rewards/rejected": 0.67431640625, "step": 150 }, { "epoch": 0.11185185185185186, "grad_norm": 5.850963592529297, "learning_rate": 8.88148148148148e-07, "logits/chosen": 1.4208984375, "logits/rejected": 1.87109375, "logps/chosen": -54.46875, "logps/rejected": -54.84375, "loss": 0.6538, "rewards/accuracies": 0.75, "rewards/chosen": 0.05859375, "rewards/margins": 0.08526611328125, "rewards/rejected": -0.0267333984375, "step": 151 }, { "epoch": 0.11259259259259259, "grad_norm": 2.3008079528808594, "learning_rate": 8.874074074074073e-07, "logits/chosen": 1.068359375, "logits/rejected": 2.0546875, "logps/chosen": -28.25, "logps/rejected": -62.8125, "loss": 0.5752, "rewards/accuracies": 0.75, "rewards/chosen": 0.1409912109375, "rewards/margins": 0.264404296875, "rewards/rejected": -0.12347412109375, "step": 152 }, { "epoch": 0.11333333333333333, "grad_norm": 1.887286901473999, "learning_rate": 8.866666666666667e-07, "logits/chosen": 2.34765625, "logits/rejected": 1.3662109375, "logps/chosen": -34.75, "logps/rejected": -39.375, "loss": 0.7021, "rewards/accuracies": 0.5, "rewards/chosen": -0.065673828125, "rewards/margins": 0.004974365234375, "rewards/rejected": -0.0706787109375, "step": 153 }, { "epoch": 0.11407407407407408, "grad_norm": 1.7913914918899536, "learning_rate": 8.859259259259259e-07, "logits/chosen": 1.544921875, "logits/rejected": 1.380859375, "logps/chosen": -32.4375, "logps/rejected": -43.53125, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.012115478515625, "rewards/margins": 0.021820068359375, "rewards/rejected": -0.033966064453125, "step": 154 }, { "epoch": 0.11481481481481481, "grad_norm": 3.6313586235046387, "learning_rate": 8.851851851851852e-07, "logits/chosen": 1.69921875, "logits/rejected": 1.5869140625, "logps/chosen": -48.125, "logps/rejected": -79.0625, "loss": 1.207, "rewards/accuracies": 0.25, "rewards/chosen": -0.293701171875, "rewards/margins": -0.70849609375, "rewards/rejected": 0.414794921875, "step": 155 }, { "epoch": 0.11555555555555555, "grad_norm": 8.826051712036133, "learning_rate": 8.844444444444445e-07, "logits/chosen": 2.486328125, "logits/rejected": 2.09375, "logps/chosen": -65.4375, "logps/rejected": -49.46875, "loss": 1.7881, "rewards/accuracies": 0.5, "rewards/chosen": -1.15234375, "rewards/margins": -1.1748046875, "rewards/rejected": 0.02227783203125, "step": 156 }, { "epoch": 0.1162962962962963, "grad_norm": 3.470027446746826, "learning_rate": 8.837037037037036e-07, "logits/chosen": 2.30859375, "logits/rejected": 1.4970703125, "logps/chosen": -53.0, "logps/rejected": -69.9375, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": 0.0211029052734375, "rewards/margins": 0.09759521484375, "rewards/rejected": -0.0765380859375, "step": 157 }, { "epoch": 0.11703703703703704, "grad_norm": 2.938032627105713, "learning_rate": 8.829629629629629e-07, "logits/chosen": 2.009765625, "logits/rejected": 2.40625, "logps/chosen": -28.109375, "logps/rejected": -53.8125, "loss": 0.6543, "rewards/accuracies": 0.5, "rewards/chosen": -0.07110595703125, "rewards/margins": 0.094482421875, "rewards/rejected": -0.1656494140625, "step": 158 }, { "epoch": 0.11777777777777777, "grad_norm": 1.900913953781128, "learning_rate": 8.822222222222222e-07, "logits/chosen": 0.92431640625, "logits/rejected": 1.1318359375, "logps/chosen": -20.84375, "logps/rejected": -60.0, "loss": 0.5967, "rewards/accuracies": 0.75, "rewards/chosen": 0.1259765625, "rewards/margins": 0.210693359375, "rewards/rejected": -0.084716796875, "step": 159 }, { "epoch": 0.11851851851851852, "grad_norm": 16.73974609375, "learning_rate": 8.814814814814815e-07, "logits/chosen": 1.859375, "logits/rejected": 1.9189453125, "logps/chosen": -36.28125, "logps/rejected": -45.40625, "loss": 0.5513, "rewards/accuracies": 0.75, "rewards/chosen": -0.0933837890625, "rewards/margins": 0.493896484375, "rewards/rejected": -0.58740234375, "step": 160 }, { "epoch": 0.11925925925925926, "grad_norm": 3.3840134143829346, "learning_rate": 8.807407407407407e-07, "logits/chosen": 0.9482421875, "logits/rejected": 1.8173828125, "logps/chosen": -42.65625, "logps/rejected": -63.1875, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": 0.007415771484375, "rewards/margins": 0.14599609375, "rewards/rejected": -0.138671875, "step": 161 }, { "epoch": 0.12, "grad_norm": 2.5771584510803223, "learning_rate": 8.799999999999999e-07, "logits/chosen": 0.78125, "logits/rejected": 1.3076171875, "logps/chosen": -18.953125, "logps/rejected": -53.53125, "loss": 0.6611, "rewards/accuracies": 0.5, "rewards/chosen": -0.020721435546875, "rewards/margins": 0.078857421875, "rewards/rejected": -0.099609375, "step": 162 }, { "epoch": 0.12074074074074075, "grad_norm": 1.8053687810897827, "learning_rate": 8.792592592592592e-07, "logits/chosen": 1.099609375, "logits/rejected": 1.6572265625, "logps/chosen": -29.71875, "logps/rejected": -49.5, "loss": 0.7393, "rewards/accuracies": 0.25, "rewards/chosen": 0.005889892578125, "rewards/margins": -0.06982421875, "rewards/rejected": 0.0758056640625, "step": 163 }, { "epoch": 0.12148148148148148, "grad_norm": 1.6655983924865723, "learning_rate": 8.785185185185184e-07, "logits/chosen": 1.705078125, "logits/rejected": 1.572265625, "logps/chosen": -40.59375, "logps/rejected": -25.125, "loss": 0.793, "rewards/accuracies": 0.25, "rewards/chosen": 0.046295166015625, "rewards/margins": -0.1763916015625, "rewards/rejected": 0.22265625, "step": 164 }, { "epoch": 0.12222222222222222, "grad_norm": 1.9076720476150513, "learning_rate": 8.777777777777777e-07, "logits/chosen": 1.1142578125, "logits/rejected": 2.130859375, "logps/chosen": -36.21875, "logps/rejected": -39.8125, "loss": 0.6636, "rewards/accuracies": 0.5, "rewards/chosen": 0.05194091796875, "rewards/margins": 0.0960693359375, "rewards/rejected": -0.044189453125, "step": 165 }, { "epoch": 0.12296296296296297, "grad_norm": 1.5032323598861694, "learning_rate": 8.770370370370371e-07, "logits/chosen": 1.505859375, "logits/rejected": 2.201171875, "logps/chosen": -24.84375, "logps/rejected": -35.46875, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": 0.0128936767578125, "rewards/margins": -0.012298583984375, "rewards/rejected": 0.0251922607421875, "step": 166 }, { "epoch": 0.1237037037037037, "grad_norm": 1.5208780765533447, "learning_rate": 8.762962962962963e-07, "logits/chosen": 1.46875, "logits/rejected": 0.9228515625, "logps/chosen": -23.03125, "logps/rejected": -42.0625, "loss": 0.6743, "rewards/accuracies": 0.75, "rewards/chosen": -0.036346435546875, "rewards/margins": 0.03985595703125, "rewards/rejected": -0.076171875, "step": 167 }, { "epoch": 0.12444444444444444, "grad_norm": 1.4704526662826538, "learning_rate": 8.755555555555555e-07, "logits/chosen": 1.4130859375, "logits/rejected": 1.58203125, "logps/chosen": -20.40625, "logps/rejected": -39.28125, "loss": 0.7646, "rewards/accuracies": 0.25, "rewards/chosen": -0.006072998046875, "rewards/margins": -0.1240234375, "rewards/rejected": 0.117919921875, "step": 168 }, { "epoch": 0.12518518518518518, "grad_norm": 1.829183578491211, "learning_rate": 8.748148148148148e-07, "logits/chosen": 1.349609375, "logits/rejected": 1.55859375, "logps/chosen": -38.0, "logps/rejected": -44.375, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": 0.19580078125, "rewards/margins": 0.29736328125, "rewards/rejected": -0.1015625, "step": 169 }, { "epoch": 0.1259259259259259, "grad_norm": 2.849519968032837, "learning_rate": 8.740740740740741e-07, "logits/chosen": 2.01171875, "logits/rejected": 2.138671875, "logps/chosen": -22.71875, "logps/rejected": -67.0, "loss": 0.6665, "rewards/accuracies": 0.5, "rewards/chosen": -0.052734375, "rewards/margins": 0.3193359375, "rewards/rejected": -0.3720703125, "step": 170 }, { "epoch": 0.12666666666666668, "grad_norm": 1.4956328868865967, "learning_rate": 8.733333333333333e-07, "logits/chosen": 1.43359375, "logits/rejected": 1.138671875, "logps/chosen": -25.015625, "logps/rejected": -39.375, "loss": 0.7417, "rewards/accuracies": 0.5, "rewards/chosen": -0.04180908203125, "rewards/margins": -0.0804443359375, "rewards/rejected": 0.03863525390625, "step": 171 }, { "epoch": 0.1274074074074074, "grad_norm": 2.4786899089813232, "learning_rate": 8.725925925925925e-07, "logits/chosen": 1.2880859375, "logits/rejected": 1.4091796875, "logps/chosen": -29.09375, "logps/rejected": -30.40625, "loss": 0.8037, "rewards/accuracies": 0.5, "rewards/chosen": -0.07171630859375, "rewards/margins": -0.181396484375, "rewards/rejected": 0.1097412109375, "step": 172 }, { "epoch": 0.12814814814814815, "grad_norm": 1.7629395723342896, "learning_rate": 8.718518518518518e-07, "logits/chosen": 1.1005859375, "logits/rejected": 1.0078125, "logps/chosen": -27.65625, "logps/rejected": -35.34375, "loss": 0.7104, "rewards/accuracies": 0.5, "rewards/chosen": -0.015228271484375, "rewards/margins": -0.01751708984375, "rewards/rejected": 0.0023193359375, "step": 173 }, { "epoch": 0.1288888888888889, "grad_norm": 3.987826108932495, "learning_rate": 8.71111111111111e-07, "logits/chosen": 1.4287109375, "logits/rejected": 1.5927734375, "logps/chosen": -33.09375, "logps/rejected": -80.1875, "loss": 0.7021, "rewards/accuracies": 0.5, "rewards/chosen": 0.08245849609375, "rewards/margins": -0.001953125, "rewards/rejected": 0.0843505859375, "step": 174 }, { "epoch": 0.12962962962962962, "grad_norm": 1.7692269086837769, "learning_rate": 8.703703703703703e-07, "logits/chosen": 1.197265625, "logits/rejected": 1.5458984375, "logps/chosen": -39.65625, "logps/rejected": -36.28125, "loss": 0.7256, "rewards/accuracies": 0.5, "rewards/chosen": 0.0238037109375, "rewards/margins": -0.0203857421875, "rewards/rejected": 0.04412841796875, "step": 175 }, { "epoch": 0.13037037037037036, "grad_norm": 1.7780685424804688, "learning_rate": 8.696296296296296e-07, "logits/chosen": 1.740234375, "logits/rejected": 2.177734375, "logps/chosen": -40.8125, "logps/rejected": -42.375, "loss": 0.749, "rewards/accuracies": 0.0, "rewards/chosen": -0.0849609375, "rewards/margins": -0.1083984375, "rewards/rejected": 0.0234375, "step": 176 }, { "epoch": 0.13111111111111112, "grad_norm": 2.6569297313690186, "learning_rate": 8.688888888888889e-07, "logits/chosen": 2.13671875, "logits/rejected": 1.4794921875, "logps/chosen": -27.625, "logps/rejected": -33.75, "loss": 0.75, "rewards/accuracies": 0.25, "rewards/chosen": -0.053924560546875, "rewards/margins": -0.0989990234375, "rewards/rejected": 0.04510498046875, "step": 177 }, { "epoch": 0.13185185185185186, "grad_norm": 3.820129871368408, "learning_rate": 8.68148148148148e-07, "logits/chosen": 1.3525390625, "logits/rejected": 1.9990234375, "logps/chosen": -51.5, "logps/rejected": -44.8125, "loss": 0.7871, "rewards/accuracies": 0.75, "rewards/chosen": -0.2366943359375, "rewards/margins": -0.139892578125, "rewards/rejected": -0.09686279296875, "step": 178 }, { "epoch": 0.1325925925925926, "grad_norm": 2.6426408290863037, "learning_rate": 8.674074074074074e-07, "logits/chosen": 1.18359375, "logits/rejected": 2.171875, "logps/chosen": -21.578125, "logps/rejected": -79.5, "loss": 0.5273, "rewards/accuracies": 0.5, "rewards/chosen": -0.01132965087890625, "rewards/margins": 0.6865234375, "rewards/rejected": -0.69775390625, "step": 179 }, { "epoch": 0.13333333333333333, "grad_norm": 1.7909836769104004, "learning_rate": 8.666666666666667e-07, "logits/chosen": 1.21875, "logits/rejected": 0.9462890625, "logps/chosen": -50.0625, "logps/rejected": -38.1875, "loss": 0.7061, "rewards/accuracies": 0.5, "rewards/chosen": -0.060546875, "rewards/margins": -0.0195159912109375, "rewards/rejected": -0.041015625, "step": 180 }, { "epoch": 0.13407407407407407, "grad_norm": 1.6749005317687988, "learning_rate": 8.659259259259259e-07, "logits/chosen": 1.31640625, "logits/rejected": 1.7353515625, "logps/chosen": -18.296875, "logps/rejected": -34.78125, "loss": 0.6133, "rewards/accuracies": 0.75, "rewards/chosen": 0.10699462890625, "rewards/margins": 0.207763671875, "rewards/rejected": -0.10076904296875, "step": 181 }, { "epoch": 0.1348148148148148, "grad_norm": 2.109138250350952, "learning_rate": 8.651851851851852e-07, "logits/chosen": 1.3447265625, "logits/rejected": 1.8232421875, "logps/chosen": -47.96875, "logps/rejected": -82.25, "loss": 0.4932, "rewards/accuracies": 1.0, "rewards/chosen": 0.6240234375, "rewards/margins": 0.66357421875, "rewards/rejected": -0.039459228515625, "step": 182 }, { "epoch": 0.13555555555555557, "grad_norm": 2.3300368785858154, "learning_rate": 8.644444444444444e-07, "logits/chosen": 1.4736328125, "logits/rejected": 1.7216796875, "logps/chosen": -31.203125, "logps/rejected": -45.90625, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": 0.69775390625, "rewards/margins": 0.78173828125, "rewards/rejected": -0.0836181640625, "step": 183 }, { "epoch": 0.1362962962962963, "grad_norm": 1.4489330053329468, "learning_rate": 8.637037037037037e-07, "logits/chosen": 1.732421875, "logits/rejected": 1.9560546875, "logps/chosen": -24.953125, "logps/rejected": -31.359375, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": -0.048431396484375, "rewards/margins": 0.035919189453125, "rewards/rejected": -0.08441162109375, "step": 184 }, { "epoch": 0.13703703703703704, "grad_norm": 1.376103401184082, "learning_rate": 8.629629629629629e-07, "logits/chosen": 1.955078125, "logits/rejected": 1.724609375, "logps/chosen": -28.046875, "logps/rejected": -36.90625, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005950927734375, "rewards/margins": -0.026336669921875, "rewards/rejected": 0.026947021484375, "step": 185 }, { "epoch": 0.13777777777777778, "grad_norm": 1.5130923986434937, "learning_rate": 8.622222222222222e-07, "logits/chosen": 0.5986328125, "logits/rejected": 1.6337890625, "logps/chosen": -27.4375, "logps/rejected": -42.96875, "loss": 0.7266, "rewards/accuracies": 0.25, "rewards/chosen": -0.057403564453125, "rewards/margins": -0.0628662109375, "rewards/rejected": 0.005462646484375, "step": 186 }, { "epoch": 0.1385185185185185, "grad_norm": 2.954091787338257, "learning_rate": 8.614814814814815e-07, "logits/chosen": 1.5185546875, "logits/rejected": 1.650390625, "logps/chosen": -32.21875, "logps/rejected": -49.0, "loss": 0.5859, "rewards/accuracies": 0.75, "rewards/chosen": 0.436767578125, "rewards/margins": 0.370361328125, "rewards/rejected": 0.06640625, "step": 187 }, { "epoch": 0.13925925925925925, "grad_norm": 2.933481216430664, "learning_rate": 8.607407407407406e-07, "logits/chosen": 1.3857421875, "logits/rejected": 1.2470703125, "logps/chosen": -39.5, "logps/rejected": -91.125, "loss": 1.0205, "rewards/accuracies": 0.75, "rewards/chosen": 0.11016845703125, "rewards/margins": -0.353759765625, "rewards/rejected": 0.4638671875, "step": 188 }, { "epoch": 0.14, "grad_norm": 5.6009931564331055, "learning_rate": 8.599999999999999e-07, "logits/chosen": 2.322265625, "logits/rejected": 1.345703125, "logps/chosen": -35.125, "logps/rejected": -26.59375, "loss": 1.1113, "rewards/accuracies": 0.0, "rewards/chosen": -0.50927734375, "rewards/margins": -0.666015625, "rewards/rejected": 0.1566162109375, "step": 189 }, { "epoch": 0.14074074074074075, "grad_norm": 1.7580374479293823, "learning_rate": 8.592592592592592e-07, "logits/chosen": 1.3271484375, "logits/rejected": 1.1181640625, "logps/chosen": -29.5625, "logps/rejected": -22.78125, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": -0.09295654296875, "rewards/margins": -0.04412841796875, "rewards/rejected": -0.048828125, "step": 190 }, { "epoch": 0.14148148148148149, "grad_norm": 2.5580179691314697, "learning_rate": 8.585185185185185e-07, "logits/chosen": 0.736328125, "logits/rejected": 1.5263671875, "logps/chosen": -24.109375, "logps/rejected": -79.25, "loss": 0.668, "rewards/accuracies": 0.5, "rewards/chosen": 0.24755859375, "rewards/margins": 0.094482421875, "rewards/rejected": 0.153076171875, "step": 191 }, { "epoch": 0.14222222222222222, "grad_norm": 3.4474384784698486, "learning_rate": 8.577777777777777e-07, "logits/chosen": 1.37109375, "logits/rejected": 1.4384765625, "logps/chosen": -36.40625, "logps/rejected": -58.21875, "loss": 0.7358, "rewards/accuracies": 0.5, "rewards/chosen": 0.029693603515625, "rewards/margins": -0.078125, "rewards/rejected": 0.1077880859375, "step": 192 }, { "epoch": 0.14296296296296296, "grad_norm": 2.372187852859497, "learning_rate": 8.57037037037037e-07, "logits/chosen": 1.1455078125, "logits/rejected": 1.3564453125, "logps/chosen": -30.765625, "logps/rejected": -25.15625, "loss": 0.6978, "rewards/accuracies": 0.25, "rewards/chosen": 0.015625, "rewards/margins": 0.001190185546875, "rewards/rejected": 0.0144195556640625, "step": 193 }, { "epoch": 0.1437037037037037, "grad_norm": 1.4769777059555054, "learning_rate": 8.562962962962963e-07, "logits/chosen": 0.72900390625, "logits/rejected": 1.572265625, "logps/chosen": -27.296875, "logps/rejected": -32.59375, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": -0.051971435546875, "rewards/margins": 0.09381103515625, "rewards/rejected": -0.145751953125, "step": 194 }, { "epoch": 0.14444444444444443, "grad_norm": 1.9953533411026, "learning_rate": 8.555555555555555e-07, "logits/chosen": 1.435546875, "logits/rejected": 1.5263671875, "logps/chosen": -30.875, "logps/rejected": -45.09375, "loss": 0.8721, "rewards/accuracies": 0.0, "rewards/chosen": -0.0816650390625, "rewards/margins": -0.322265625, "rewards/rejected": 0.2406005859375, "step": 195 }, { "epoch": 0.1451851851851852, "grad_norm": 2.243703603744507, "learning_rate": 8.548148148148148e-07, "logits/chosen": 1.47265625, "logits/rejected": 2.279296875, "logps/chosen": -39.96875, "logps/rejected": -66.125, "loss": 1.0205, "rewards/accuracies": 0.5, "rewards/chosen": 0.1429443359375, "rewards/margins": -0.208984375, "rewards/rejected": 0.351806640625, "step": 196 }, { "epoch": 0.14592592592592593, "grad_norm": 2.080677032470703, "learning_rate": 8.540740740740741e-07, "logits/chosen": 1.42578125, "logits/rejected": 1.240234375, "logps/chosen": -27.28125, "logps/rejected": -43.21875, "loss": 0.8115, "rewards/accuracies": 0.25, "rewards/chosen": 0.12890625, "rewards/margins": -0.19677734375, "rewards/rejected": 0.32568359375, "step": 197 }, { "epoch": 0.14666666666666667, "grad_norm": 2.304171562194824, "learning_rate": 8.533333333333334e-07, "logits/chosen": 1.1171875, "logits/rejected": 2.16015625, "logps/chosen": -34.84375, "logps/rejected": -77.25, "loss": 0.6372, "rewards/accuracies": 0.75, "rewards/chosen": 0.091796875, "rewards/margins": 0.11834716796875, "rewards/rejected": -0.02655029296875, "step": 198 }, { "epoch": 0.1474074074074074, "grad_norm": 2.0973360538482666, "learning_rate": 8.525925925925925e-07, "logits/chosen": 1.37109375, "logits/rejected": 1.7919921875, "logps/chosen": -40.78125, "logps/rejected": -32.0625, "loss": 0.7461, "rewards/accuracies": 0.5, "rewards/chosen": -0.033203125, "rewards/margins": -0.0867919921875, "rewards/rejected": 0.05352783203125, "step": 199 }, { "epoch": 0.14814814814814814, "grad_norm": 2.0427348613739014, "learning_rate": 8.518518518518518e-07, "logits/chosen": 1.072265625, "logits/rejected": 1.044921875, "logps/chosen": -30.28125, "logps/rejected": -45.5625, "loss": 0.8203, "rewards/accuracies": 0.25, "rewards/chosen": -0.07928466796875, "rewards/margins": -0.208984375, "rewards/rejected": 0.129638671875, "step": 200 }, { "epoch": 0.14888888888888888, "grad_norm": 3.3482773303985596, "learning_rate": 8.511111111111111e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.146484375, "logps/chosen": -37.625, "logps/rejected": -69.0, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": -0.08184814453125, "rewards/margins": 0.12322998046875, "rewards/rejected": -0.205078125, "step": 201 }, { "epoch": 0.14962962962962964, "grad_norm": 1.6270242929458618, "learning_rate": 8.503703703703703e-07, "logits/chosen": 1.78515625, "logits/rejected": 2.103515625, "logps/chosen": -53.90625, "logps/rejected": -48.40625, "loss": 0.584, "rewards/accuracies": 1.0, "rewards/chosen": 0.0589599609375, "rewards/margins": 0.233154296875, "rewards/rejected": -0.1741943359375, "step": 202 }, { "epoch": 0.15037037037037038, "grad_norm": 1.6718250513076782, "learning_rate": 8.496296296296295e-07, "logits/chosen": 1.400390625, "logits/rejected": 1.4140625, "logps/chosen": -33.25, "logps/rejected": -42.4375, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": -0.078125, "rewards/margins": -0.0062408447265625, "rewards/rejected": -0.0718994140625, "step": 203 }, { "epoch": 0.1511111111111111, "grad_norm": 1.545513391494751, "learning_rate": 8.488888888888888e-07, "logits/chosen": 1.43359375, "logits/rejected": 1.32421875, "logps/chosen": -38.1875, "logps/rejected": -47.46875, "loss": 0.564, "rewards/accuracies": 0.75, "rewards/chosen": 0.2265625, "rewards/margins": 0.309326171875, "rewards/rejected": -0.082763671875, "step": 204 }, { "epoch": 0.15185185185185185, "grad_norm": 2.0691592693328857, "learning_rate": 8.48148148148148e-07, "logits/chosen": 1.43359375, "logits/rejected": 1.4560546875, "logps/chosen": -45.75, "logps/rejected": -31.53125, "loss": 0.6465, "rewards/accuracies": 0.75, "rewards/chosen": -0.01212310791015625, "rewards/margins": 0.100341796875, "rewards/rejected": -0.112548828125, "step": 205 }, { "epoch": 0.15259259259259259, "grad_norm": 1.3163093328475952, "learning_rate": 8.474074074074074e-07, "logits/chosen": 1.4990234375, "logits/rejected": 1.3984375, "logps/chosen": -21.578125, "logps/rejected": -32.5625, "loss": 0.7056, "rewards/accuracies": 0.25, "rewards/chosen": 0.0625, "rewards/margins": 0.0058135986328125, "rewards/rejected": 0.056671142578125, "step": 206 }, { "epoch": 0.15333333333333332, "grad_norm": 2.261596441268921, "learning_rate": 8.466666666666667e-07, "logits/chosen": 1.3388671875, "logits/rejected": 1.2255859375, "logps/chosen": -29.703125, "logps/rejected": -36.90625, "loss": 0.6294, "rewards/accuracies": 0.75, "rewards/chosen": 0.125732421875, "rewards/margins": 0.1495361328125, "rewards/rejected": -0.023834228515625, "step": 207 }, { "epoch": 0.15407407407407409, "grad_norm": 1.335241675376892, "learning_rate": 8.45925925925926e-07, "logits/chosen": 1.013671875, "logits/rejected": 1.650390625, "logps/chosen": -27.9375, "logps/rejected": -31.40625, "loss": 0.6494, "rewards/accuracies": 0.5, "rewards/chosen": 0.035919189453125, "rewards/margins": 0.09686279296875, "rewards/rejected": -0.060943603515625, "step": 208 }, { "epoch": 0.15481481481481482, "grad_norm": 2.880917549133301, "learning_rate": 8.451851851851851e-07, "logits/chosen": 1.3505859375, "logits/rejected": 1.5419921875, "logps/chosen": -41.875, "logps/rejected": -56.375, "loss": 0.6382, "rewards/accuracies": 0.5, "rewards/chosen": -0.1397705078125, "rewards/margins": 0.1422119140625, "rewards/rejected": -0.281982421875, "step": 209 }, { "epoch": 0.15555555555555556, "grad_norm": 1.6469677686691284, "learning_rate": 8.444444444444444e-07, "logits/chosen": 1.84375, "logits/rejected": 1.6943359375, "logps/chosen": -33.8125, "logps/rejected": -41.625, "loss": 0.7524, "rewards/accuracies": 0.5, "rewards/chosen": 0.04571533203125, "rewards/margins": -0.091796875, "rewards/rejected": 0.137451171875, "step": 210 }, { "epoch": 0.1562962962962963, "grad_norm": 3.3703622817993164, "learning_rate": 8.437037037037037e-07, "logits/chosen": 2.9375, "logits/rejected": 1.0927734375, "logps/chosen": -24.5, "logps/rejected": -61.03125, "loss": 0.6255, "rewards/accuracies": 1.0, "rewards/chosen": 0.09649658203125, "rewards/margins": 0.141845703125, "rewards/rejected": -0.045318603515625, "step": 211 }, { "epoch": 0.15703703703703703, "grad_norm": 3.072322368621826, "learning_rate": 8.429629629629629e-07, "logits/chosen": 1.8251953125, "logits/rejected": 1.76953125, "logps/chosen": -30.90625, "logps/rejected": -50.46875, "loss": 0.709, "rewards/accuracies": 0.5, "rewards/chosen": 0.0031280517578125, "rewards/margins": -0.00445556640625, "rewards/rejected": 0.00762939453125, "step": 212 }, { "epoch": 0.15777777777777777, "grad_norm": 1.9014493227005005, "learning_rate": 8.422222222222222e-07, "logits/chosen": 1.3095703125, "logits/rejected": 1.2060546875, "logps/chosen": -27.609375, "logps/rejected": -53.46875, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": 0.090576171875, "rewards/margins": 0.08514404296875, "rewards/rejected": 0.0054779052734375, "step": 213 }, { "epoch": 0.15851851851851853, "grad_norm": 4.430942535400391, "learning_rate": 8.414814814814814e-07, "logits/chosen": 1.12109375, "logits/rejected": 2.060546875, "logps/chosen": -35.375, "logps/rejected": -87.375, "loss": 0.916, "rewards/accuracies": 0.0, "rewards/chosen": 0.0859375, "rewards/margins": -0.366455078125, "rewards/rejected": 0.452392578125, "step": 214 }, { "epoch": 0.15925925925925927, "grad_norm": 1.8050898313522339, "learning_rate": 8.407407407407407e-07, "logits/chosen": 0.958984375, "logits/rejected": 0.94921875, "logps/chosen": -25.03125, "logps/rejected": -31.8125, "loss": 0.6831, "rewards/accuracies": 0.5, "rewards/chosen": 0.017181396484375, "rewards/margins": 0.027740478515625, "rewards/rejected": -0.0105743408203125, "step": 215 }, { "epoch": 0.16, "grad_norm": 1.8514763116836548, "learning_rate": 8.399999999999999e-07, "logits/chosen": 1.3603515625, "logits/rejected": 1.251953125, "logps/chosen": -23.5, "logps/rejected": -49.875, "loss": 0.8125, "rewards/accuracies": 0.5, "rewards/chosen": -0.08441162109375, "rewards/margins": -0.203857421875, "rewards/rejected": 0.1195068359375, "step": 216 }, { "epoch": 0.16074074074074074, "grad_norm": 1.8794881105422974, "learning_rate": 8.392592592592592e-07, "logits/chosen": 0.8310546875, "logits/rejected": 0.82470703125, "logps/chosen": -44.375, "logps/rejected": -36.0, "loss": 0.8008, "rewards/accuracies": 0.75, "rewards/chosen": 0.111328125, "rewards/margins": -0.126953125, "rewards/rejected": 0.23828125, "step": 217 }, { "epoch": 0.16148148148148148, "grad_norm": 1.8308616876602173, "learning_rate": 8.385185185185185e-07, "logits/chosen": 1.685546875, "logits/rejected": 1.787109375, "logps/chosen": -25.171875, "logps/rejected": -49.90625, "loss": 0.6621, "rewards/accuracies": 0.75, "rewards/chosen": -0.056243896484375, "rewards/margins": 0.07464599609375, "rewards/rejected": -0.130859375, "step": 218 }, { "epoch": 0.1622222222222222, "grad_norm": 2.492219924926758, "learning_rate": 8.377777777777777e-07, "logits/chosen": 1.837890625, "logits/rejected": 0.5654296875, "logps/chosen": -58.6875, "logps/rejected": -45.3125, "loss": 0.8267, "rewards/accuracies": 0.25, "rewards/chosen": -0.22509765625, "rewards/margins": -0.2359619140625, "rewards/rejected": 0.01094818115234375, "step": 219 }, { "epoch": 0.16296296296296298, "grad_norm": 1.651410460472107, "learning_rate": 8.37037037037037e-07, "logits/chosen": 1.5166015625, "logits/rejected": 1.5888671875, "logps/chosen": -42.78125, "logps/rejected": -48.3125, "loss": 0.6143, "rewards/accuracies": 0.75, "rewards/chosen": -0.0078125, "rewards/margins": 0.185546875, "rewards/rejected": -0.193359375, "step": 220 }, { "epoch": 0.1637037037037037, "grad_norm": 3.8369882106781006, "learning_rate": 8.362962962962963e-07, "logits/chosen": 1.62109375, "logits/rejected": 1.1201171875, "logps/chosen": -52.25, "logps/rejected": -48.0, "loss": 1.3066, "rewards/accuracies": 0.25, "rewards/chosen": -0.6240234375, "rewards/margins": -0.7490234375, "rewards/rejected": 0.124755859375, "step": 221 }, { "epoch": 0.16444444444444445, "grad_norm": 2.426375150680542, "learning_rate": 8.355555555555556e-07, "logits/chosen": 1.443359375, "logits/rejected": 1.6435546875, "logps/chosen": -26.59375, "logps/rejected": -65.75, "loss": 0.7539, "rewards/accuracies": 0.25, "rewards/chosen": 0.037078857421875, "rewards/margins": -0.1102294921875, "rewards/rejected": 0.147216796875, "step": 222 }, { "epoch": 0.16518518518518518, "grad_norm": 1.9819586277008057, "learning_rate": 8.348148148148148e-07, "logits/chosen": 1.4619140625, "logits/rejected": 1.609375, "logps/chosen": -28.59375, "logps/rejected": -26.40625, "loss": 0.7344, "rewards/accuracies": 0.5, "rewards/chosen": -0.08673095703125, "rewards/margins": -0.060546875, "rewards/rejected": -0.02618408203125, "step": 223 }, { "epoch": 0.16592592592592592, "grad_norm": 2.263676643371582, "learning_rate": 8.34074074074074e-07, "logits/chosen": 1.2353515625, "logits/rejected": 1.458984375, "logps/chosen": -29.234375, "logps/rejected": -59.9375, "loss": 0.8311, "rewards/accuracies": 0.0, "rewards/chosen": -0.014862060546875, "rewards/margins": -0.257080078125, "rewards/rejected": 0.2421875, "step": 224 }, { "epoch": 0.16666666666666666, "grad_norm": 2.1558451652526855, "learning_rate": 8.333333333333333e-07, "logits/chosen": 1.66796875, "logits/rejected": 1.5498046875, "logps/chosen": -31.46875, "logps/rejected": -36.28125, "loss": 0.7661, "rewards/accuracies": 0.25, "rewards/chosen": -0.0242156982421875, "rewards/margins": -0.1246337890625, "rewards/rejected": 0.100341796875, "step": 225 }, { "epoch": 0.1674074074074074, "grad_norm": 1.5760055780410767, "learning_rate": 8.325925925925925e-07, "logits/chosen": 1.6611328125, "logits/rejected": 1.0263671875, "logps/chosen": -27.03125, "logps/rejected": -52.125, "loss": 0.7275, "rewards/accuracies": 0.25, "rewards/chosen": -0.0328369140625, "rewards/margins": -0.0660400390625, "rewards/rejected": 0.033203125, "step": 226 }, { "epoch": 0.16814814814814816, "grad_norm": 3.681295394897461, "learning_rate": 8.318518518518518e-07, "logits/chosen": 1.017578125, "logits/rejected": 1.7578125, "logps/chosen": -24.609375, "logps/rejected": -47.90625, "loss": 0.748, "rewards/accuracies": 0.25, "rewards/chosen": 0.01873779296875, "rewards/margins": -0.102294921875, "rewards/rejected": 0.12103271484375, "step": 227 }, { "epoch": 0.1688888888888889, "grad_norm": 2.2785322666168213, "learning_rate": 8.311111111111111e-07, "logits/chosen": 1.5625, "logits/rejected": 1.4599609375, "logps/chosen": -26.140625, "logps/rejected": -72.875, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": -0.00914764404296875, "rewards/margins": 0.1044921875, "rewards/rejected": -0.1136474609375, "step": 228 }, { "epoch": 0.16962962962962963, "grad_norm": 2.9708521366119385, "learning_rate": 8.303703703703704e-07, "logits/chosen": 1.3798828125, "logits/rejected": 2.19921875, "logps/chosen": -34.59375, "logps/rejected": -39.3125, "loss": 0.6582, "rewards/accuracies": 0.5, "rewards/chosen": 0.1363525390625, "rewards/margins": 0.09375, "rewards/rejected": 0.0426025390625, "step": 229 }, { "epoch": 0.17037037037037037, "grad_norm": 3.1945033073425293, "learning_rate": 8.296296296296295e-07, "logits/chosen": 2.3671875, "logits/rejected": 1.236328125, "logps/chosen": -44.46875, "logps/rejected": -39.75, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.040191650390625, "rewards/margins": 0.07275390625, "rewards/rejected": -0.032470703125, "step": 230 }, { "epoch": 0.1711111111111111, "grad_norm": 2.0270450115203857, "learning_rate": 8.288888888888888e-07, "logits/chosen": 1.658203125, "logits/rejected": 1.318359375, "logps/chosen": -39.75, "logps/rejected": -38.15625, "loss": 0.7686, "rewards/accuracies": 0.25, "rewards/chosen": 0.046478271484375, "rewards/margins": -0.1129150390625, "rewards/rejected": 0.1593017578125, "step": 231 }, { "epoch": 0.17185185185185184, "grad_norm": 2.6325366497039795, "learning_rate": 8.281481481481482e-07, "logits/chosen": 1.8603515625, "logits/rejected": 1.455078125, "logps/chosen": -41.5, "logps/rejected": -44.25, "loss": 0.8862, "rewards/accuracies": 0.25, "rewards/chosen": -0.3125, "rewards/margins": -0.3359375, "rewards/rejected": 0.023406982421875, "step": 232 }, { "epoch": 0.1725925925925926, "grad_norm": 1.9028801918029785, "learning_rate": 8.274074074074074e-07, "logits/chosen": 1.060546875, "logits/rejected": 1.2236328125, "logps/chosen": -34.09375, "logps/rejected": -25.140625, "loss": 0.6953, "rewards/accuracies": 0.75, "rewards/chosen": -0.03985595703125, "rewards/margins": -0.003749847412109375, "rewards/rejected": -0.036102294921875, "step": 233 }, { "epoch": 0.17333333333333334, "grad_norm": 6.684110164642334, "learning_rate": 8.266666666666667e-07, "logits/chosen": 2.419921875, "logits/rejected": 2.04296875, "logps/chosen": -34.34375, "logps/rejected": -34.375, "loss": 1.2715, "rewards/accuracies": 0.0, "rewards/chosen": -0.27001953125, "rewards/margins": -0.89453125, "rewards/rejected": 0.62451171875, "step": 234 }, { "epoch": 0.17407407407407408, "grad_norm": 2.0475170612335205, "learning_rate": 8.259259259259259e-07, "logits/chosen": 1.220703125, "logits/rejected": 1.779296875, "logps/chosen": -22.28125, "logps/rejected": -59.125, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": 0.072021484375, "rewards/margins": 0.126708984375, "rewards/rejected": -0.054656982421875, "step": 235 }, { "epoch": 0.1748148148148148, "grad_norm": 1.8349069356918335, "learning_rate": 8.251851851851851e-07, "logits/chosen": 1.2939453125, "logits/rejected": 1.373046875, "logps/chosen": -40.21875, "logps/rejected": -36.125, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": -0.014862060546875, "rewards/margins": 0.05743408203125, "rewards/rejected": -0.072265625, "step": 236 }, { "epoch": 0.17555555555555555, "grad_norm": 1.7639189958572388, "learning_rate": 8.244444444444444e-07, "logits/chosen": 1.9892578125, "logits/rejected": 1.7265625, "logps/chosen": -32.34375, "logps/rejected": -40.3125, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": 0.14794921875, "rewards/margins": 0.338623046875, "rewards/rejected": -0.1905517578125, "step": 237 }, { "epoch": 0.17629629629629628, "grad_norm": 1.6870185136795044, "learning_rate": 8.237037037037037e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.8837890625, "logps/chosen": -37.75, "logps/rejected": -37.03125, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": -0.09295654296875, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.08984375, "step": 238 }, { "epoch": 0.17703703703703705, "grad_norm": 1.9206172227859497, "learning_rate": 8.22962962962963e-07, "logits/chosen": 1.1455078125, "logits/rejected": 1.7451171875, "logps/chosen": -34.25, "logps/rejected": -57.625, "loss": 0.5742, "rewards/accuracies": 0.75, "rewards/chosen": 0.0703125, "rewards/margins": 0.26806640625, "rewards/rejected": -0.1976318359375, "step": 239 }, { "epoch": 0.17777777777777778, "grad_norm": 2.3741023540496826, "learning_rate": 8.222222222222221e-07, "logits/chosen": 0.9658203125, "logits/rejected": 1.2802734375, "logps/chosen": -40.40625, "logps/rejected": -39.8125, "loss": 0.9209, "rewards/accuracies": 0.25, "rewards/chosen": -0.05859375, "rewards/margins": -0.357666015625, "rewards/rejected": 0.299072265625, "step": 240 }, { "epoch": 0.17851851851851852, "grad_norm": 1.3776217699050903, "learning_rate": 8.214814814814814e-07, "logits/chosen": 1.1162109375, "logits/rejected": 1.263671875, "logps/chosen": -26.25, "logps/rejected": -31.5, "loss": 0.728, "rewards/accuracies": 0.0, "rewards/chosen": 0.048431396484375, "rewards/margins": -0.06793212890625, "rewards/rejected": 0.11639404296875, "step": 241 }, { "epoch": 0.17925925925925926, "grad_norm": 2.2373597621917725, "learning_rate": 8.207407407407407e-07, "logits/chosen": 1.3515625, "logits/rejected": 1.9033203125, "logps/chosen": -24.421875, "logps/rejected": -63.65625, "loss": 0.6968, "rewards/accuracies": 0.5, "rewards/chosen": -0.0426025390625, "rewards/margins": 0.0008087158203125, "rewards/rejected": -0.04339599609375, "step": 242 }, { "epoch": 0.18, "grad_norm": 3.9554989337921143, "learning_rate": 8.199999999999999e-07, "logits/chosen": 1.6611328125, "logits/rejected": 1.88671875, "logps/chosen": -55.03125, "logps/rejected": -47.78125, "loss": 0.8662, "rewards/accuracies": 0.5, "rewards/chosen": -0.1597900390625, "rewards/margins": -0.283935546875, "rewards/rejected": 0.124267578125, "step": 243 }, { "epoch": 0.18074074074074073, "grad_norm": 3.215411424636841, "learning_rate": 8.192592592592592e-07, "logits/chosen": 1.5625, "logits/rejected": 1.2353515625, "logps/chosen": -35.1875, "logps/rejected": -46.4375, "loss": 0.8003, "rewards/accuracies": 0.75, "rewards/chosen": -0.207763671875, "rewards/margins": -0.1280517578125, "rewards/rejected": -0.0797119140625, "step": 244 }, { "epoch": 0.1814814814814815, "grad_norm": 2.1751644611358643, "learning_rate": 8.185185185185185e-07, "logits/chosen": 1.474609375, "logits/rejected": 1.5302734375, "logps/chosen": -19.84375, "logps/rejected": -33.1875, "loss": 0.8013, "rewards/accuracies": 0.5, "rewards/chosen": 0.0462646484375, "rewards/margins": -0.1927490234375, "rewards/rejected": 0.239013671875, "step": 245 }, { "epoch": 0.18222222222222223, "grad_norm": 3.1076557636260986, "learning_rate": 8.177777777777778e-07, "logits/chosen": 1.916015625, "logits/rejected": 2.0234375, "logps/chosen": -25.125, "logps/rejected": -27.484375, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": 0.1480712890625, "rewards/margins": 0.33642578125, "rewards/rejected": -0.1883544921875, "step": 246 }, { "epoch": 0.18296296296296297, "grad_norm": 1.428389072418213, "learning_rate": 8.17037037037037e-07, "logits/chosen": 1.8212890625, "logits/rejected": 1.3291015625, "logps/chosen": -56.75, "logps/rejected": -35.0625, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.07891845703125, "rewards/margins": 0.131591796875, "rewards/rejected": -0.052734375, "step": 247 }, { "epoch": 0.1837037037037037, "grad_norm": 2.2816996574401855, "learning_rate": 8.162962962962963e-07, "logits/chosen": 1.798828125, "logits/rejected": 1.900390625, "logps/chosen": -34.0, "logps/rejected": -35.15625, "loss": 0.6152, "rewards/accuracies": 0.5, "rewards/chosen": -0.0136871337890625, "rewards/margins": 0.38427734375, "rewards/rejected": -0.39794921875, "step": 248 }, { "epoch": 0.18444444444444444, "grad_norm": 2.0598337650299072, "learning_rate": 8.155555555555556e-07, "logits/chosen": 1.302734375, "logits/rejected": 1.2529296875, "logps/chosen": -42.78125, "logps/rejected": -44.9375, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": 0.10345458984375, "rewards/margins": 0.126953125, "rewards/rejected": -0.0234375, "step": 249 }, { "epoch": 0.18518518518518517, "grad_norm": 2.000070333480835, "learning_rate": 8.148148148148147e-07, "logits/chosen": 1.3671875, "logits/rejected": 2.09375, "logps/chosen": -35.59375, "logps/rejected": -65.125, "loss": 0.752, "rewards/accuracies": 0.5, "rewards/chosen": 0.0164031982421875, "rewards/margins": -0.09765625, "rewards/rejected": 0.11407470703125, "step": 250 }, { "epoch": 0.18592592592592594, "grad_norm": 1.8396055698394775, "learning_rate": 8.14074074074074e-07, "logits/chosen": 1.0673828125, "logits/rejected": 1.123046875, "logps/chosen": -30.96875, "logps/rejected": -33.78125, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": -0.04998779296875, "rewards/margins": -0.0171966552734375, "rewards/rejected": -0.032806396484375, "step": 251 }, { "epoch": 0.18666666666666668, "grad_norm": 1.786758303642273, "learning_rate": 8.133333333333333e-07, "logits/chosen": 1.4462890625, "logits/rejected": 1.3505859375, "logps/chosen": -28.15625, "logps/rejected": -38.0, "loss": 0.813, "rewards/accuracies": 0.75, "rewards/chosen": -0.1656494140625, "rewards/margins": -0.1981201171875, "rewards/rejected": 0.032470703125, "step": 252 }, { "epoch": 0.1874074074074074, "grad_norm": 2.159968852996826, "learning_rate": 8.125925925925926e-07, "logits/chosen": 1.4150390625, "logits/rejected": 2.279296875, "logps/chosen": -27.734375, "logps/rejected": -60.90625, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": -0.09295654296875, "rewards/margins": 0.043731689453125, "rewards/rejected": -0.13671875, "step": 253 }, { "epoch": 0.18814814814814815, "grad_norm": 7.048370361328125, "learning_rate": 8.118518518518518e-07, "logits/chosen": 1.541015625, "logits/rejected": 2.33203125, "logps/chosen": -28.96875, "logps/rejected": -30.53125, "loss": 0.7891, "rewards/accuracies": 0.25, "rewards/chosen": -0.050384521484375, "rewards/margins": -0.1728515625, "rewards/rejected": 0.1224365234375, "step": 254 }, { "epoch": 0.18888888888888888, "grad_norm": 2.19209623336792, "learning_rate": 8.11111111111111e-07, "logits/chosen": 1.990234375, "logits/rejected": 1.90234375, "logps/chosen": -28.53125, "logps/rejected": -89.875, "loss": 0.6182, "rewards/accuracies": 0.5, "rewards/chosen": 0.339111328125, "rewards/margins": 0.2421875, "rewards/rejected": 0.09686279296875, "step": 255 }, { "epoch": 0.18962962962962962, "grad_norm": 1.648237943649292, "learning_rate": 8.103703703703703e-07, "logits/chosen": 1.2509765625, "logits/rejected": 1.5517578125, "logps/chosen": -33.125, "logps/rejected": -68.8125, "loss": 0.6973, "rewards/accuracies": 0.25, "rewards/chosen": 0.02850341796875, "rewards/margins": -0.004302978515625, "rewards/rejected": 0.032806396484375, "step": 256 }, { "epoch": 0.19037037037037038, "grad_norm": 1.9257570505142212, "learning_rate": 8.096296296296295e-07, "logits/chosen": 1.576171875, "logits/rejected": 1.9111328125, "logps/chosen": -33.25, "logps/rejected": -44.21875, "loss": 0.6646, "rewards/accuracies": 0.75, "rewards/chosen": 0.06719970703125, "rewards/margins": 0.07696533203125, "rewards/rejected": -0.00975799560546875, "step": 257 }, { "epoch": 0.19111111111111112, "grad_norm": 2.5043020248413086, "learning_rate": 8.088888888888888e-07, "logits/chosen": 0.859375, "logits/rejected": 1.869140625, "logps/chosen": -35.65625, "logps/rejected": -71.5625, "loss": 0.6274, "rewards/accuracies": 0.75, "rewards/chosen": -0.001155853271484375, "rewards/margins": 0.153564453125, "rewards/rejected": -0.1546630859375, "step": 258 }, { "epoch": 0.19185185185185186, "grad_norm": 2.163364887237549, "learning_rate": 8.081481481481482e-07, "logits/chosen": 1.8095703125, "logits/rejected": 1.603515625, "logps/chosen": -42.375, "logps/rejected": -42.75, "loss": 0.8691, "rewards/accuracies": 0.5, "rewards/chosen": -0.31201171875, "rewards/margins": -0.28125, "rewards/rejected": -0.0308837890625, "step": 259 }, { "epoch": 0.1925925925925926, "grad_norm": 2.4575347900390625, "learning_rate": 8.074074074074075e-07, "logits/chosen": 1.6142578125, "logits/rejected": 1.6962890625, "logps/chosen": -26.765625, "logps/rejected": -54.03125, "loss": 0.6416, "rewards/accuracies": 0.75, "rewards/chosen": -0.024200439453125, "rewards/margins": 0.1136474609375, "rewards/rejected": -0.137939453125, "step": 260 }, { "epoch": 0.19333333333333333, "grad_norm": 1.4644170999526978, "learning_rate": 8.066666666666666e-07, "logits/chosen": 1.46484375, "logits/rejected": 1.3828125, "logps/chosen": -39.40625, "logps/rejected": -40.5625, "loss": 0.5405, "rewards/accuracies": 0.75, "rewards/chosen": 0.46923828125, "rewards/margins": 0.4951171875, "rewards/rejected": -0.0257568359375, "step": 261 }, { "epoch": 0.19407407407407407, "grad_norm": 1.4037277698516846, "learning_rate": 8.059259259259259e-07, "logits/chosen": 1.8408203125, "logits/rejected": 1.5625, "logps/chosen": -26.375, "logps/rejected": -38.3125, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.0911865234375, "rewards/margins": 0.11151123046875, "rewards/rejected": -0.02032470703125, "step": 262 }, { "epoch": 0.1948148148148148, "grad_norm": 1.5431324243545532, "learning_rate": 8.051851851851852e-07, "logits/chosen": 1.3642578125, "logits/rejected": 1.8046875, "logps/chosen": -33.46875, "logps/rejected": -61.625, "loss": 0.6245, "rewards/accuracies": 0.75, "rewards/chosen": 0.1441650390625, "rewards/margins": 0.1722412109375, "rewards/rejected": -0.02813720703125, "step": 263 }, { "epoch": 0.19555555555555557, "grad_norm": 2.3464651107788086, "learning_rate": 8.044444444444444e-07, "logits/chosen": 2.3671875, "logits/rejected": 1.7001953125, "logps/chosen": -46.5, "logps/rejected": -23.078125, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": -0.164794921875, "rewards/margins": -0.1156005859375, "rewards/rejected": -0.049224853515625, "step": 264 }, { "epoch": 0.1962962962962963, "grad_norm": 1.5068137645721436, "learning_rate": 8.037037037037037e-07, "logits/chosen": 1.4130859375, "logits/rejected": 1.2890625, "logps/chosen": -34.53125, "logps/rejected": -30.96875, "loss": 0.5376, "rewards/accuracies": 0.75, "rewards/chosen": 0.52685546875, "rewards/margins": 0.501953125, "rewards/rejected": 0.0250091552734375, "step": 265 }, { "epoch": 0.19703703703703704, "grad_norm": 1.9280704259872437, "learning_rate": 8.029629629629629e-07, "logits/chosen": 2.2265625, "logits/rejected": 2.01171875, "logps/chosen": -25.5625, "logps/rejected": -20.40625, "loss": 0.728, "rewards/accuracies": 0.5, "rewards/chosen": -0.05230712890625, "rewards/margins": -0.05859375, "rewards/rejected": 0.006244659423828125, "step": 266 }, { "epoch": 0.19777777777777777, "grad_norm": 1.9236844778060913, "learning_rate": 8.022222222222221e-07, "logits/chosen": 1.734375, "logits/rejected": 1.583984375, "logps/chosen": -37.0, "logps/rejected": -56.875, "loss": 0.769, "rewards/accuracies": 0.0, "rewards/chosen": -0.039093017578125, "rewards/margins": -0.1414794921875, "rewards/rejected": 0.102294921875, "step": 267 }, { "epoch": 0.1985185185185185, "grad_norm": 2.050992965698242, "learning_rate": 8.014814814814814e-07, "logits/chosen": 1.619140625, "logits/rejected": 2.240234375, "logps/chosen": -30.875, "logps/rejected": -83.5, "loss": 0.8203, "rewards/accuracies": 0.25, "rewards/chosen": -0.0625, "rewards/margins": -0.19921875, "rewards/rejected": 0.13671875, "step": 268 }, { "epoch": 0.19925925925925925, "grad_norm": 2.5876834392547607, "learning_rate": 8.007407407407407e-07, "logits/chosen": 2.162109375, "logits/rejected": 1.501953125, "logps/chosen": -50.6875, "logps/rejected": -37.6875, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": -0.10150146484375, "rewards/margins": 0.03759765625, "rewards/rejected": -0.1390380859375, "step": 269 }, { "epoch": 0.2, "grad_norm": 2.069194793701172, "learning_rate": 8e-07, "logits/chosen": 1.369140625, "logits/rejected": 2.09765625, "logps/chosen": -27.234375, "logps/rejected": -53.6875, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": 0.00152587890625, "rewards/margins": 0.143310546875, "rewards/rejected": -0.1417236328125, "step": 270 }, { "epoch": 0.20074074074074075, "grad_norm": 1.4234380722045898, "learning_rate": 7.992592592592591e-07, "logits/chosen": 0.7978515625, "logits/rejected": 1.376953125, "logps/chosen": -25.671875, "logps/rejected": -24.828125, "loss": 0.6221, "rewards/accuracies": 0.75, "rewards/chosen": 0.031646728515625, "rewards/margins": 0.15234375, "rewards/rejected": -0.1207275390625, "step": 271 }, { "epoch": 0.20148148148148148, "grad_norm": 2.90312123298645, "learning_rate": 7.985185185185185e-07, "logits/chosen": 1.9892578125, "logits/rejected": 2.03515625, "logps/chosen": -27.875, "logps/rejected": -49.46875, "loss": 0.6089, "rewards/accuracies": 0.5, "rewards/chosen": -0.0117340087890625, "rewards/margins": 0.1932373046875, "rewards/rejected": -0.2049560546875, "step": 272 }, { "epoch": 0.20222222222222222, "grad_norm": 2.4243385791778564, "learning_rate": 7.977777777777778e-07, "logits/chosen": 1.4794921875, "logits/rejected": 2.041015625, "logps/chosen": -30.5625, "logps/rejected": -42.1875, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": -0.0863037109375, "rewards/margins": -0.04571533203125, "rewards/rejected": -0.040618896484375, "step": 273 }, { "epoch": 0.20296296296296296, "grad_norm": 2.1663036346435547, "learning_rate": 7.97037037037037e-07, "logits/chosen": 1.2353515625, "logits/rejected": 1.328125, "logps/chosen": -29.5625, "logps/rejected": -28.875, "loss": 0.7793, "rewards/accuracies": 0.5, "rewards/chosen": -0.01287841796875, "rewards/margins": -0.1497802734375, "rewards/rejected": 0.1368408203125, "step": 274 }, { "epoch": 0.2037037037037037, "grad_norm": 3.3968710899353027, "learning_rate": 7.962962962962963e-07, "logits/chosen": 1.142578125, "logits/rejected": 1.5205078125, "logps/chosen": -33.375, "logps/rejected": -54.375, "loss": 1.0498, "rewards/accuracies": 0.0, "rewards/chosen": -0.115234375, "rewards/margins": -0.6171875, "rewards/rejected": 0.50244140625, "step": 275 }, { "epoch": 0.20444444444444446, "grad_norm": 2.314389228820801, "learning_rate": 7.955555555555556e-07, "logits/chosen": 1.3212890625, "logits/rejected": 1.130859375, "logps/chosen": -37.4375, "logps/rejected": -29.625, "loss": 0.8994, "rewards/accuracies": 0.25, "rewards/chosen": -0.21044921875, "rewards/margins": -0.33935546875, "rewards/rejected": 0.12890625, "step": 276 }, { "epoch": 0.2051851851851852, "grad_norm": 1.7456897497177124, "learning_rate": 7.948148148148148e-07, "logits/chosen": 2.3359375, "logits/rejected": 1.27734375, "logps/chosen": -27.859375, "logps/rejected": -40.0, "loss": 0.6851, "rewards/accuracies": 0.25, "rewards/chosen": 0.041778564453125, "rewards/margins": 0.044830322265625, "rewards/rejected": -0.00312042236328125, "step": 277 }, { "epoch": 0.20592592592592593, "grad_norm": 10.714460372924805, "learning_rate": 7.94074074074074e-07, "logits/chosen": 1.6728515625, "logits/rejected": 1.703125, "logps/chosen": -25.625, "logps/rejected": -29.15625, "loss": 0.7842, "rewards/accuracies": 0.5, "rewards/chosen": -0.2313232421875, "rewards/margins": -0.10406494140625, "rewards/rejected": -0.127197265625, "step": 278 }, { "epoch": 0.20666666666666667, "grad_norm": 2.0191898345947266, "learning_rate": 7.933333333333333e-07, "logits/chosen": 2.025390625, "logits/rejected": 1.7998046875, "logps/chosen": -42.8125, "logps/rejected": -45.59375, "loss": 0.7715, "rewards/accuracies": 0.25, "rewards/chosen": 0.049224853515625, "rewards/margins": -0.1365966796875, "rewards/rejected": 0.1859130859375, "step": 279 }, { "epoch": 0.2074074074074074, "grad_norm": 1.944420576095581, "learning_rate": 7.925925925925926e-07, "logits/chosen": 1.765625, "logits/rejected": 2.392578125, "logps/chosen": -47.875, "logps/rejected": -36.5625, "loss": 0.6831, "rewards/accuracies": 0.25, "rewards/chosen": 0.017181396484375, "rewards/margins": 0.04644775390625, "rewards/rejected": -0.0292510986328125, "step": 280 }, { "epoch": 0.20814814814814814, "grad_norm": 1.3491781949996948, "learning_rate": 7.918518518518517e-07, "logits/chosen": 1.7255859375, "logits/rejected": 1.4765625, "logps/chosen": -32.75, "logps/rejected": -37.3125, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.14453125, "rewards/margins": 0.1182861328125, "rewards/rejected": 0.026153564453125, "step": 281 }, { "epoch": 0.2088888888888889, "grad_norm": 2.299572706222534, "learning_rate": 7.91111111111111e-07, "logits/chosen": 1.9970703125, "logits/rejected": 1.5556640625, "logps/chosen": -37.6875, "logps/rejected": -60.65625, "loss": 0.7358, "rewards/accuracies": 0.75, "rewards/chosen": -0.175537109375, "rewards/margins": -0.04107666015625, "rewards/rejected": -0.1343994140625, "step": 282 }, { "epoch": 0.20962962962962964, "grad_norm": 2.6585025787353516, "learning_rate": 7.903703703703703e-07, "logits/chosen": 2.478515625, "logits/rejected": 2.224609375, "logps/chosen": -43.125, "logps/rejected": -42.40625, "loss": 0.8662, "rewards/accuracies": 0.25, "rewards/chosen": -0.2008056640625, "rewards/margins": -0.29345703125, "rewards/rejected": 0.09259033203125, "step": 283 }, { "epoch": 0.21037037037037037, "grad_norm": 2.6715924739837646, "learning_rate": 7.896296296296296e-07, "logits/chosen": 1.771484375, "logits/rejected": 2.419921875, "logps/chosen": -34.46875, "logps/rejected": -31.125, "loss": 0.96, "rewards/accuracies": 0.0, "rewards/chosen": -0.2366943359375, "rewards/margins": -0.446044921875, "rewards/rejected": 0.2093505859375, "step": 284 }, { "epoch": 0.2111111111111111, "grad_norm": 1.825315237045288, "learning_rate": 7.888888888888889e-07, "logits/chosen": 1.6591796875, "logits/rejected": 1.6005859375, "logps/chosen": -27.25, "logps/rejected": -51.46875, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 0.045318603515625, "rewards/margins": 0.046875, "rewards/rejected": -0.001575469970703125, "step": 285 }, { "epoch": 0.21185185185185185, "grad_norm": 1.970694661140442, "learning_rate": 7.881481481481482e-07, "logits/chosen": 1.1455078125, "logits/rejected": 2.341796875, "logps/chosen": -39.84375, "logps/rejected": -42.59375, "loss": 0.8184, "rewards/accuracies": 0.0, "rewards/chosen": -0.0960693359375, "rewards/margins": -0.2340087890625, "rewards/rejected": 0.137939453125, "step": 286 }, { "epoch": 0.21259259259259258, "grad_norm": 1.2306185960769653, "learning_rate": 7.874074074074074e-07, "logits/chosen": 1.404296875, "logits/rejected": 1.80859375, "logps/chosen": -22.46875, "logps/rejected": -20.71875, "loss": 0.5962, "rewards/accuracies": 0.5, "rewards/chosen": 0.1956787109375, "rewards/margins": 0.2587890625, "rewards/rejected": -0.06304931640625, "step": 287 }, { "epoch": 0.21333333333333335, "grad_norm": 2.246931314468384, "learning_rate": 7.866666666666666e-07, "logits/chosen": 1.7392578125, "logits/rejected": 1.71875, "logps/chosen": -37.625, "logps/rejected": -55.28125, "loss": 0.7358, "rewards/accuracies": 0.25, "rewards/chosen": -0.037506103515625, "rewards/margins": -0.06793212890625, "rewards/rejected": 0.0304718017578125, "step": 288 }, { "epoch": 0.21407407407407408, "grad_norm": 4.514585018157959, "learning_rate": 7.859259259259259e-07, "logits/chosen": 2.283203125, "logits/rejected": 1.9189453125, "logps/chosen": -34.78125, "logps/rejected": -33.96875, "loss": 0.915, "rewards/accuracies": 0.25, "rewards/chosen": -0.32421875, "rewards/margins": -0.3876953125, "rewards/rejected": 0.06329345703125, "step": 289 }, { "epoch": 0.21481481481481482, "grad_norm": 2.1825931072235107, "learning_rate": 7.851851851851852e-07, "logits/chosen": 1.4814453125, "logits/rejected": 1.6455078125, "logps/chosen": -28.140625, "logps/rejected": -52.9375, "loss": 0.5894, "rewards/accuracies": 0.75, "rewards/chosen": 0.00821685791015625, "rewards/margins": 0.2315673828125, "rewards/rejected": -0.223388671875, "step": 290 }, { "epoch": 0.21555555555555556, "grad_norm": 2.2956888675689697, "learning_rate": 7.844444444444445e-07, "logits/chosen": 1.591796875, "logits/rejected": 2.302734375, "logps/chosen": -47.6875, "logps/rejected": -50.03125, "loss": 0.7559, "rewards/accuracies": 0.75, "rewards/chosen": 0.067626953125, "rewards/margins": -0.09381103515625, "rewards/rejected": 0.161376953125, "step": 291 }, { "epoch": 0.2162962962962963, "grad_norm": 2.4352071285247803, "learning_rate": 7.837037037037036e-07, "logits/chosen": 1.583984375, "logits/rejected": 1.9501953125, "logps/chosen": -30.5625, "logps/rejected": -53.09375, "loss": 0.6167, "rewards/accuracies": 0.5, "rewards/chosen": 0.0667724609375, "rewards/margins": 0.2109375, "rewards/rejected": -0.14404296875, "step": 292 }, { "epoch": 0.21703703703703703, "grad_norm": 2.3552515506744385, "learning_rate": 7.829629629629629e-07, "logits/chosen": 1.9326171875, "logits/rejected": 2.279296875, "logps/chosen": -27.265625, "logps/rejected": -79.25, "loss": 0.9272, "rewards/accuracies": 0.25, "rewards/chosen": 0.0203094482421875, "rewards/margins": -0.40234375, "rewards/rejected": 0.422607421875, "step": 293 }, { "epoch": 0.21777777777777776, "grad_norm": 2.36565899848938, "learning_rate": 7.822222222222222e-07, "logits/chosen": 1.19921875, "logits/rejected": 1.4033203125, "logps/chosen": -36.84375, "logps/rejected": -46.875, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": -0.0273590087890625, "rewards/margins": 0.09765625, "rewards/rejected": -0.125, "step": 294 }, { "epoch": 0.21851851851851853, "grad_norm": 2.8133413791656494, "learning_rate": 7.814814814814814e-07, "logits/chosen": 1.3974609375, "logits/rejected": 1.5927734375, "logps/chosen": -45.28125, "logps/rejected": -34.40625, "loss": 0.7441, "rewards/accuracies": 0.5, "rewards/chosen": -0.0117034912109375, "rewards/margins": -0.0882568359375, "rewards/rejected": 0.0765380859375, "step": 295 }, { "epoch": 0.21925925925925926, "grad_norm": 2.6976068019866943, "learning_rate": 7.807407407407407e-07, "logits/chosen": 2.13671875, "logits/rejected": 1.96484375, "logps/chosen": -25.234375, "logps/rejected": -46.1875, "loss": 0.5718, "rewards/accuracies": 1.0, "rewards/chosen": 0.061309814453125, "rewards/margins": 0.272216796875, "rewards/rejected": -0.2109375, "step": 296 }, { "epoch": 0.22, "grad_norm": 2.658271551132202, "learning_rate": 7.799999999999999e-07, "logits/chosen": 1.19140625, "logits/rejected": 1.1611328125, "logps/chosen": -35.625, "logps/rejected": -71.875, "loss": 0.8086, "rewards/accuracies": 0.0, "rewards/chosen": 0.021453857421875, "rewards/margins": -0.216796875, "rewards/rejected": 0.23828125, "step": 297 }, { "epoch": 0.22074074074074074, "grad_norm": 1.1470898389816284, "learning_rate": 7.792592592592591e-07, "logits/chosen": 1.4013671875, "logits/rejected": 2.466796875, "logps/chosen": -26.234375, "logps/rejected": -58.65625, "loss": 0.4648, "rewards/accuracies": 0.75, "rewards/chosen": 0.133544921875, "rewards/margins": 1.0498046875, "rewards/rejected": -0.916015625, "step": 298 }, { "epoch": 0.22148148148148147, "grad_norm": 2.117278575897217, "learning_rate": 7.785185185185185e-07, "logits/chosen": 1.0302734375, "logits/rejected": 0.783203125, "logps/chosen": -27.421875, "logps/rejected": -39.34375, "loss": 0.7842, "rewards/accuracies": 0.25, "rewards/chosen": -0.03240966796875, "rewards/margins": -0.1663818359375, "rewards/rejected": 0.134033203125, "step": 299 }, { "epoch": 0.2222222222222222, "grad_norm": 1.76519775390625, "learning_rate": 7.777777777777778e-07, "logits/chosen": 1.255859375, "logits/rejected": 1.5859375, "logps/chosen": -28.546875, "logps/rejected": -51.5625, "loss": 0.7764, "rewards/accuracies": 0.25, "rewards/chosen": -0.1431884765625, "rewards/margins": -0.144287109375, "rewards/rejected": 0.001190185546875, "step": 300 }, { "epoch": 0.22296296296296297, "grad_norm": 2.267862319946289, "learning_rate": 7.770370370370371e-07, "logits/chosen": 1.630859375, "logits/rejected": 1.849609375, "logps/chosen": -26.625, "logps/rejected": -76.625, "loss": 0.5762, "rewards/accuracies": 0.75, "rewards/chosen": -0.0374755859375, "rewards/margins": 0.2685546875, "rewards/rejected": -0.30615234375, "step": 301 }, { "epoch": 0.2237037037037037, "grad_norm": 2.5962467193603516, "learning_rate": 7.762962962962962e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.3916015625, "logps/chosen": -27.46875, "logps/rejected": -64.3125, "loss": 0.6074, "rewards/accuracies": 1.0, "rewards/chosen": 0.138671875, "rewards/margins": 0.1875, "rewards/rejected": -0.048828125, "step": 302 }, { "epoch": 0.22444444444444445, "grad_norm": 1.83242928981781, "learning_rate": 7.755555555555555e-07, "logits/chosen": 1.181640625, "logits/rejected": 1.283203125, "logps/chosen": -34.375, "logps/rejected": -41.375, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": -0.0285186767578125, "rewards/margins": -0.046539306640625, "rewards/rejected": 0.01800537109375, "step": 303 }, { "epoch": 0.22518518518518518, "grad_norm": 2.1306967735290527, "learning_rate": 7.748148148148148e-07, "logits/chosen": 0.69189453125, "logits/rejected": 1.6025390625, "logps/chosen": -30.203125, "logps/rejected": -35.59375, "loss": 0.6802, "rewards/accuracies": 0.5, "rewards/chosen": 0.01446533203125, "rewards/margins": 0.039459228515625, "rewards/rejected": -0.024993896484375, "step": 304 }, { "epoch": 0.22592592592592592, "grad_norm": 1.8109172582626343, "learning_rate": 7.74074074074074e-07, "logits/chosen": 1.7265625, "logits/rejected": 1.873046875, "logps/chosen": -48.75, "logps/rejected": -64.0, "loss": 0.5068, "rewards/accuracies": 0.75, "rewards/chosen": 0.27734375, "rewards/margins": 0.49072265625, "rewards/rejected": -0.21337890625, "step": 305 }, { "epoch": 0.22666666666666666, "grad_norm": 1.9110348224639893, "learning_rate": 7.733333333333333e-07, "logits/chosen": 1.1748046875, "logits/rejected": 1.1708984375, "logps/chosen": -25.328125, "logps/rejected": -35.125, "loss": 0.6313, "rewards/accuracies": 0.75, "rewards/chosen": 0.055084228515625, "rewards/margins": 0.133544921875, "rewards/rejected": -0.0784912109375, "step": 306 }, { "epoch": 0.22740740740740742, "grad_norm": 1.959365725517273, "learning_rate": 7.725925925925926e-07, "logits/chosen": 1.7587890625, "logits/rejected": 1.869140625, "logps/chosen": -20.515625, "logps/rejected": -47.84375, "loss": 0.9414, "rewards/accuracies": 0.5, "rewards/chosen": -0.0599365234375, "rewards/margins": -0.404296875, "rewards/rejected": 0.344482421875, "step": 307 }, { "epoch": 0.22814814814814816, "grad_norm": 2.176011800765991, "learning_rate": 7.718518518518518e-07, "logits/chosen": 1.037109375, "logits/rejected": 1.6513671875, "logps/chosen": -36.46875, "logps/rejected": -29.0625, "loss": 0.7866, "rewards/accuracies": 0.25, "rewards/chosen": -0.135986328125, "rewards/margins": -0.16357421875, "rewards/rejected": 0.0277252197265625, "step": 308 }, { "epoch": 0.2288888888888889, "grad_norm": 2.4642012119293213, "learning_rate": 7.71111111111111e-07, "logits/chosen": 0.9482421875, "logits/rejected": 1.1533203125, "logps/chosen": -32.75, "logps/rejected": -40.53125, "loss": 0.769, "rewards/accuracies": 0.5, "rewards/chosen": -0.07733154296875, "rewards/margins": -0.114013671875, "rewards/rejected": 0.036712646484375, "step": 309 }, { "epoch": 0.22962962962962963, "grad_norm": 1.6358293294906616, "learning_rate": 7.703703703703703e-07, "logits/chosen": 0.9765625, "logits/rejected": 1.5947265625, "logps/chosen": -44.125, "logps/rejected": -48.40625, "loss": 0.5806, "rewards/accuracies": 0.75, "rewards/chosen": 0.218017578125, "rewards/margins": 0.28466796875, "rewards/rejected": -0.06658935546875, "step": 310 }, { "epoch": 0.23037037037037036, "grad_norm": 1.681739330291748, "learning_rate": 7.696296296296297e-07, "logits/chosen": 1.126953125, "logits/rejected": 1.986328125, "logps/chosen": -32.3125, "logps/rejected": -52.25, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": -0.056640625, "rewards/margins": 0.130126953125, "rewards/rejected": -0.186767578125, "step": 311 }, { "epoch": 0.2311111111111111, "grad_norm": 1.60736882686615, "learning_rate": 7.688888888888889e-07, "logits/chosen": 1.603515625, "logits/rejected": 1.75, "logps/chosen": -17.28125, "logps/rejected": -36.0, "loss": 0.6724, "rewards/accuracies": 0.75, "rewards/chosen": -0.0185546875, "rewards/margins": 0.052947998046875, "rewards/rejected": -0.071533203125, "step": 312 }, { "epoch": 0.23185185185185186, "grad_norm": 1.8204543590545654, "learning_rate": 7.681481481481481e-07, "logits/chosen": 2.216796875, "logits/rejected": 1.51953125, "logps/chosen": -33.0, "logps/rejected": -25.375, "loss": 0.5488, "rewards/accuracies": 0.75, "rewards/chosen": 0.474609375, "rewards/margins": 0.47119140625, "rewards/rejected": 0.003528594970703125, "step": 313 }, { "epoch": 0.2325925925925926, "grad_norm": 1.6979783773422241, "learning_rate": 7.674074074074074e-07, "logits/chosen": 1.2880859375, "logits/rejected": 2.091796875, "logps/chosen": -24.21875, "logps/rejected": -57.21875, "loss": 0.6348, "rewards/accuracies": 0.75, "rewards/chosen": -0.02813720703125, "rewards/margins": 0.12841796875, "rewards/rejected": -0.1566162109375, "step": 314 }, { "epoch": 0.23333333333333334, "grad_norm": 1.7239829301834106, "learning_rate": 7.666666666666667e-07, "logits/chosen": 1.6044921875, "logits/rejected": 1.0556640625, "logps/chosen": -24.21875, "logps/rejected": -42.21875, "loss": 0.8496, "rewards/accuracies": 0.0, "rewards/chosen": -0.1263427734375, "rewards/margins": -0.283935546875, "rewards/rejected": 0.157470703125, "step": 315 }, { "epoch": 0.23407407407407407, "grad_norm": 1.610681414604187, "learning_rate": 7.659259259259259e-07, "logits/chosen": 1.5341796875, "logits/rejected": 1.1611328125, "logps/chosen": -23.484375, "logps/rejected": -36.875, "loss": 0.6299, "rewards/accuracies": 0.75, "rewards/chosen": 0.08648681640625, "rewards/margins": 0.144287109375, "rewards/rejected": -0.05780029296875, "step": 316 }, { "epoch": 0.2348148148148148, "grad_norm": 1.7667667865753174, "learning_rate": 7.651851851851852e-07, "logits/chosen": 1.4345703125, "logits/rejected": 1.029296875, "logps/chosen": -34.5625, "logps/rejected": -20.359375, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": 0.035919189453125, "rewards/margins": 0.13818359375, "rewards/rejected": -0.10235595703125, "step": 317 }, { "epoch": 0.23555555555555555, "grad_norm": 1.7427012920379639, "learning_rate": 7.644444444444444e-07, "logits/chosen": 1.6943359375, "logits/rejected": 1.9619140625, "logps/chosen": -35.34375, "logps/rejected": -32.46875, "loss": 0.833, "rewards/accuracies": 0.0, "rewards/chosen": -0.162109375, "rewards/margins": -0.259033203125, "rewards/rejected": 0.09686279296875, "step": 318 }, { "epoch": 0.2362962962962963, "grad_norm": 2.22676420211792, "learning_rate": 7.637037037037036e-07, "logits/chosen": 1.10546875, "logits/rejected": 1.494140625, "logps/chosen": -51.3125, "logps/rejected": -57.5, "loss": 0.5352, "rewards/accuracies": 1.0, "rewards/chosen": 0.0745849609375, "rewards/margins": 0.360107421875, "rewards/rejected": -0.285400390625, "step": 319 }, { "epoch": 0.23703703703703705, "grad_norm": 1.9817768335342407, "learning_rate": 7.629629629629629e-07, "logits/chosen": 1.6552734375, "logits/rejected": 1.234375, "logps/chosen": -47.6875, "logps/rejected": -45.5625, "loss": 0.7656, "rewards/accuracies": 0.25, "rewards/chosen": -0.1148681640625, "rewards/margins": -0.1109619140625, "rewards/rejected": -0.003887176513671875, "step": 320 }, { "epoch": 0.23777777777777778, "grad_norm": 3.668964147567749, "learning_rate": 7.622222222222222e-07, "logits/chosen": 0.77587890625, "logits/rejected": 1.345703125, "logps/chosen": -30.390625, "logps/rejected": -35.09375, "loss": 0.6484, "rewards/accuracies": 0.5, "rewards/chosen": -0.04766845703125, "rewards/margins": 0.12451171875, "rewards/rejected": -0.1722412109375, "step": 321 }, { "epoch": 0.23851851851851852, "grad_norm": 2.002743721008301, "learning_rate": 7.614814814814815e-07, "logits/chosen": 1.4677734375, "logits/rejected": 1.7177734375, "logps/chosen": -54.0, "logps/rejected": -33.28125, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.138671875, "rewards/margins": -0.0269775390625, "rewards/rejected": 0.1656494140625, "step": 322 }, { "epoch": 0.23925925925925925, "grad_norm": 2.490262985229492, "learning_rate": 7.607407407407406e-07, "logits/chosen": 1.8203125, "logits/rejected": 1.322265625, "logps/chosen": -25.375, "logps/rejected": -79.9375, "loss": 0.6289, "rewards/accuracies": 0.5, "rewards/chosen": 0.028045654296875, "rewards/margins": 0.170166015625, "rewards/rejected": -0.1422119140625, "step": 323 }, { "epoch": 0.24, "grad_norm": 2.152221918106079, "learning_rate": 7.599999999999999e-07, "logits/chosen": 1.875, "logits/rejected": 1.578125, "logps/chosen": -22.921875, "logps/rejected": -67.375, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": 0.16796875, "rewards/margins": 0.084716796875, "rewards/rejected": 0.08319091796875, "step": 324 }, { "epoch": 0.24074074074074073, "grad_norm": 10.045042037963867, "learning_rate": 7.592592592592593e-07, "logits/chosen": 1.7587890625, "logits/rejected": 1.6767578125, "logps/chosen": -39.0, "logps/rejected": -48.21875, "loss": 1.4307, "rewards/accuracies": 0.5, "rewards/chosen": -0.447998046875, "rewards/margins": -0.87548828125, "rewards/rejected": 0.427734375, "step": 325 }, { "epoch": 0.2414814814814815, "grad_norm": 2.4386157989501953, "learning_rate": 7.585185185185185e-07, "logits/chosen": 2.046875, "logits/rejected": 2.3046875, "logps/chosen": -36.21875, "logps/rejected": -38.4375, "loss": 0.7422, "rewards/accuracies": 0.5, "rewards/chosen": -0.07501220703125, "rewards/margins": -0.0816650390625, "rewards/rejected": 0.00661468505859375, "step": 326 }, { "epoch": 0.24222222222222223, "grad_norm": 1.8238967657089233, "learning_rate": 7.577777777777778e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.791015625, "logps/chosen": -24.71875, "logps/rejected": -23.34375, "loss": 0.7471, "rewards/accuracies": 0.25, "rewards/chosen": -0.05584716796875, "rewards/margins": -0.09588623046875, "rewards/rejected": 0.0400390625, "step": 327 }, { "epoch": 0.24296296296296296, "grad_norm": 2.4680442810058594, "learning_rate": 7.57037037037037e-07, "logits/chosen": 1.3486328125, "logits/rejected": 1.986328125, "logps/chosen": -30.21875, "logps/rejected": -38.5, "loss": 0.7539, "rewards/accuracies": 0.25, "rewards/chosen": -0.04534912109375, "rewards/margins": -0.1109619140625, "rewards/rejected": 0.065673828125, "step": 328 }, { "epoch": 0.2437037037037037, "grad_norm": 1.6430476903915405, "learning_rate": 7.562962962962962e-07, "logits/chosen": 1.5703125, "logits/rejected": 1.8466796875, "logps/chosen": -53.375, "logps/rejected": -40.4375, "loss": 0.5781, "rewards/accuracies": 1.0, "rewards/chosen": 0.1781005859375, "rewards/margins": 0.25390625, "rewards/rejected": -0.0758056640625, "step": 329 }, { "epoch": 0.24444444444444444, "grad_norm": 2.3547496795654297, "learning_rate": 7.555555555555555e-07, "logits/chosen": 1.4033203125, "logits/rejected": 1.4892578125, "logps/chosen": -23.015625, "logps/rejected": -50.9375, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": -0.08221435546875, "rewards/margins": 0.066650390625, "rewards/rejected": -0.1488037109375, "step": 330 }, { "epoch": 0.24518518518518517, "grad_norm": 3.5084545612335205, "learning_rate": 7.548148148148148e-07, "logits/chosen": 1.3037109375, "logits/rejected": 1.736328125, "logps/chosen": -35.3125, "logps/rejected": -46.84375, "loss": 0.5439, "rewards/accuracies": 0.75, "rewards/chosen": 0.214111328125, "rewards/margins": 0.357177734375, "rewards/rejected": -0.1431884765625, "step": 331 }, { "epoch": 0.24592592592592594, "grad_norm": 2.6150286197662354, "learning_rate": 7.540740740740741e-07, "logits/chosen": 1.55859375, "logits/rejected": 1.2119140625, "logps/chosen": -35.46875, "logps/rejected": -66.25, "loss": 0.9209, "rewards/accuracies": 0.25, "rewards/chosen": -0.049652099609375, "rewards/margins": -0.3759765625, "rewards/rejected": 0.326416015625, "step": 332 }, { "epoch": 0.24666666666666667, "grad_norm": 1.3137154579162598, "learning_rate": 7.533333333333332e-07, "logits/chosen": 1.3369140625, "logits/rejected": 2.083984375, "logps/chosen": -57.90625, "logps/rejected": -62.75, "loss": 0.5381, "rewards/accuracies": 0.75, "rewards/chosen": 0.32666015625, "rewards/margins": 0.405517578125, "rewards/rejected": -0.07891845703125, "step": 333 }, { "epoch": 0.2474074074074074, "grad_norm": 2.195786714553833, "learning_rate": 7.525925925925925e-07, "logits/chosen": 1.5400390625, "logits/rejected": 1.8974609375, "logps/chosen": -29.203125, "logps/rejected": -64.375, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": 0.048828125, "rewards/margins": -0.041778564453125, "rewards/rejected": 0.090576171875, "step": 334 }, { "epoch": 0.24814814814814815, "grad_norm": 2.977818012237549, "learning_rate": 7.518518518518518e-07, "logits/chosen": 1.9794921875, "logits/rejected": 1.890625, "logps/chosen": -30.453125, "logps/rejected": -60.75, "loss": 0.793, "rewards/accuracies": 0.5, "rewards/chosen": 0.078125, "rewards/margins": -0.1781005859375, "rewards/rejected": 0.25634765625, "step": 335 }, { "epoch": 0.24888888888888888, "grad_norm": 2.284646511077881, "learning_rate": 7.51111111111111e-07, "logits/chosen": 0.87939453125, "logits/rejected": 1.484375, "logps/chosen": -31.890625, "logps/rejected": -34.59375, "loss": 0.8359, "rewards/accuracies": 0.5, "rewards/chosen": -0.1917724609375, "rewards/margins": -0.242919921875, "rewards/rejected": 0.0511474609375, "step": 336 }, { "epoch": 0.24962962962962962, "grad_norm": 1.6125632524490356, "learning_rate": 7.503703703703703e-07, "logits/chosen": 1.9794921875, "logits/rejected": 1.5, "logps/chosen": -23.921875, "logps/rejected": -39.8125, "loss": 0.6836, "rewards/accuracies": 0.5, "rewards/chosen": 0.04412841796875, "rewards/margins": 0.025726318359375, "rewards/rejected": 0.018402099609375, "step": 337 }, { "epoch": 0.25037037037037035, "grad_norm": 2.1803219318389893, "learning_rate": 7.496296296296297e-07, "logits/chosen": 1.5234375, "logits/rejected": 1.916015625, "logps/chosen": -26.359375, "logps/rejected": -32.34375, "loss": 0.6982, "rewards/accuracies": 0.75, "rewards/chosen": 0.171142578125, "rewards/margins": 0.0034637451171875, "rewards/rejected": 0.1676025390625, "step": 338 }, { "epoch": 0.2511111111111111, "grad_norm": 1.9830107688903809, "learning_rate": 7.488888888888889e-07, "logits/chosen": 1.775390625, "logits/rejected": 1.939453125, "logps/chosen": -23.640625, "logps/rejected": -34.78125, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.020111083984375, "rewards/margins": 0.0079803466796875, "rewards/rejected": -0.028106689453125, "step": 339 }, { "epoch": 0.2518518518518518, "grad_norm": 1.3675764799118042, "learning_rate": 7.481481481481481e-07, "logits/chosen": 1.5283203125, "logits/rejected": 1.3564453125, "logps/chosen": -20.984375, "logps/rejected": -34.90625, "loss": 0.752, "rewards/accuracies": 0.25, "rewards/chosen": -0.1107177734375, "rewards/margins": -0.0970458984375, "rewards/rejected": -0.0136566162109375, "step": 340 }, { "epoch": 0.2525925925925926, "grad_norm": 2.139458656311035, "learning_rate": 7.474074074074074e-07, "logits/chosen": 1.4833984375, "logits/rejected": 0.8642578125, "logps/chosen": -25.984375, "logps/rejected": -29.8125, "loss": 0.6621, "rewards/accuracies": 0.5, "rewards/chosen": 0.0777587890625, "rewards/margins": 0.0909423828125, "rewards/rejected": -0.01324462890625, "step": 341 }, { "epoch": 0.25333333333333335, "grad_norm": 2.312653064727783, "learning_rate": 7.466666666666667e-07, "logits/chosen": 1.474609375, "logits/rejected": 1.328125, "logps/chosen": -29.671875, "logps/rejected": -36.125, "loss": 0.9683, "rewards/accuracies": 0.0, "rewards/chosen": -0.07147216796875, "rewards/margins": -0.445556640625, "rewards/rejected": 0.374267578125, "step": 342 }, { "epoch": 0.25407407407407406, "grad_norm": 1.6363229751586914, "learning_rate": 7.459259259259259e-07, "logits/chosen": 1.4091796875, "logits/rejected": 1.2236328125, "logps/chosen": -24.765625, "logps/rejected": -33.65625, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": 0.1610107421875, "rewards/margins": -0.033966064453125, "rewards/rejected": 0.1949462890625, "step": 343 }, { "epoch": 0.2548148148148148, "grad_norm": 1.2757682800292969, "learning_rate": 7.451851851851851e-07, "logits/chosen": 1.669921875, "logits/rejected": 1.255859375, "logps/chosen": -40.90625, "logps/rejected": -26.53125, "loss": 0.5996, "rewards/accuracies": 1.0, "rewards/chosen": 0.13330078125, "rewards/margins": 0.1981201171875, "rewards/rejected": -0.0648193359375, "step": 344 }, { "epoch": 0.25555555555555554, "grad_norm": 1.6158013343811035, "learning_rate": 7.444444444444444e-07, "logits/chosen": 1.16015625, "logits/rejected": 1.2958984375, "logps/chosen": -42.46875, "logps/rejected": -32.4375, "loss": 0.5747, "rewards/accuracies": 0.75, "rewards/chosen": 0.373779296875, "rewards/margins": 0.302734375, "rewards/rejected": 0.0706787109375, "step": 345 }, { "epoch": 0.2562962962962963, "grad_norm": 1.8329062461853027, "learning_rate": 7.437037037037037e-07, "logits/chosen": 1.2587890625, "logits/rejected": 1.681640625, "logps/chosen": -35.8125, "logps/rejected": -62.4375, "loss": 0.6318, "rewards/accuracies": 0.5, "rewards/chosen": 0.04901123046875, "rewards/margins": 0.193603515625, "rewards/rejected": -0.14453125, "step": 346 }, { "epoch": 0.25703703703703706, "grad_norm": 1.9032002687454224, "learning_rate": 7.429629629629629e-07, "logits/chosen": 1.94921875, "logits/rejected": 1.4970703125, "logps/chosen": -30.21875, "logps/rejected": -55.25, "loss": 0.6494, "rewards/accuracies": 0.75, "rewards/chosen": 0.03558349609375, "rewards/margins": 0.09808349609375, "rewards/rejected": -0.0625, "step": 347 }, { "epoch": 0.2577777777777778, "grad_norm": 2.1436784267425537, "learning_rate": 7.422222222222222e-07, "logits/chosen": 1.609375, "logits/rejected": 2.21484375, "logps/chosen": -32.34375, "logps/rejected": -54.90625, "loss": 0.8281, "rewards/accuracies": 0.25, "rewards/chosen": -0.1441650390625, "rewards/margins": -0.246826171875, "rewards/rejected": 0.1026611328125, "step": 348 }, { "epoch": 0.25851851851851854, "grad_norm": 2.258258819580078, "learning_rate": 7.414814814814814e-07, "logits/chosen": 1.4296875, "logits/rejected": 1.8525390625, "logps/chosen": -33.0625, "logps/rejected": -24.84375, "loss": 0.6509, "rewards/accuracies": 0.75, "rewards/chosen": 0.1285400390625, "rewards/margins": 0.0902099609375, "rewards/rejected": 0.038299560546875, "step": 349 }, { "epoch": 0.25925925925925924, "grad_norm": 2.8432679176330566, "learning_rate": 7.407407407407406e-07, "logits/chosen": 1.3046875, "logits/rejected": 1.4111328125, "logps/chosen": -41.75, "logps/rejected": -82.0, "loss": 0.8145, "rewards/accuracies": 0.0, "rewards/chosen": -0.104248046875, "rewards/margins": -0.229248046875, "rewards/rejected": 0.125, "step": 350 }, { "epoch": 0.26, "grad_norm": 1.4319182634353638, "learning_rate": 7.4e-07, "logits/chosen": 1.314453125, "logits/rejected": 1.4296875, "logps/chosen": -20.328125, "logps/rejected": -34.375, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": -0.034759521484375, "rewards/margins": -0.00732421875, "rewards/rejected": -0.02740478515625, "step": 351 }, { "epoch": 0.2607407407407407, "grad_norm": 2.174873113632202, "learning_rate": 7.392592592592593e-07, "logits/chosen": 1.248046875, "logits/rejected": 0.91015625, "logps/chosen": -29.046875, "logps/rejected": -42.625, "loss": 0.7402, "rewards/accuracies": 0.25, "rewards/chosen": 0.091064453125, "rewards/margins": -0.08056640625, "rewards/rejected": 0.1715087890625, "step": 352 }, { "epoch": 0.2614814814814815, "grad_norm": 1.3425871133804321, "learning_rate": 7.385185185185185e-07, "logits/chosen": 1.513671875, "logits/rejected": 1.2421875, "logps/chosen": -27.8125, "logps/rejected": -27.328125, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": 0.0046844482421875, "rewards/margins": 0.09136962890625, "rewards/rejected": -0.08673095703125, "step": 353 }, { "epoch": 0.26222222222222225, "grad_norm": 2.0060324668884277, "learning_rate": 7.377777777777777e-07, "logits/chosen": 1.626953125, "logits/rejected": 1.990234375, "logps/chosen": -35.0, "logps/rejected": -36.21875, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.0335693359375, "rewards/margins": 0.11248779296875, "rewards/rejected": -0.07891845703125, "step": 354 }, { "epoch": 0.26296296296296295, "grad_norm": 1.5937716960906982, "learning_rate": 7.37037037037037e-07, "logits/chosen": 1.818359375, "logits/rejected": 1.3701171875, "logps/chosen": -38.28125, "logps/rejected": -47.1875, "loss": 0.5938, "rewards/accuracies": 0.5, "rewards/chosen": 0.30078125, "rewards/margins": 0.2430419921875, "rewards/rejected": 0.05780029296875, "step": 355 }, { "epoch": 0.2637037037037037, "grad_norm": 2.16455340385437, "learning_rate": 7.362962962962963e-07, "logits/chosen": 1.51171875, "logits/rejected": 2.28515625, "logps/chosen": -33.5625, "logps/rejected": -70.25, "loss": 0.4456, "rewards/accuracies": 0.75, "rewards/chosen": -0.1109619140625, "rewards/margins": 0.71240234375, "rewards/rejected": -0.82373046875, "step": 356 }, { "epoch": 0.2644444444444444, "grad_norm": 1.2938421964645386, "learning_rate": 7.355555555555555e-07, "logits/chosen": 1.3505859375, "logits/rejected": 1.767578125, "logps/chosen": -26.65625, "logps/rejected": -20.65625, "loss": 0.6318, "rewards/accuracies": 1.0, "rewards/chosen": 0.060150146484375, "rewards/margins": 0.1285400390625, "rewards/rejected": -0.068359375, "step": 357 }, { "epoch": 0.2651851851851852, "grad_norm": 6.120645999908447, "learning_rate": 7.348148148148148e-07, "logits/chosen": 2.078125, "logits/rejected": 1.4501953125, "logps/chosen": -40.03125, "logps/rejected": -90.3125, "loss": 0.8296, "rewards/accuracies": 0.75, "rewards/chosen": -0.382080078125, "rewards/margins": -0.0892333984375, "rewards/rejected": -0.29296875, "step": 358 }, { "epoch": 0.2659259259259259, "grad_norm": 1.856943964958191, "learning_rate": 7.340740740740741e-07, "logits/chosen": 1.4365234375, "logits/rejected": 1.4208984375, "logps/chosen": -23.46875, "logps/rejected": -59.0, "loss": 0.6953, "rewards/accuracies": 0.75, "rewards/chosen": 0.146484375, "rewards/margins": 0.12939453125, "rewards/rejected": 0.0172882080078125, "step": 359 }, { "epoch": 0.26666666666666666, "grad_norm": 2.3367302417755127, "learning_rate": 7.333333333333332e-07, "logits/chosen": 1.59765625, "logits/rejected": 1.6611328125, "logps/chosen": -29.25, "logps/rejected": -48.875, "loss": 0.8525, "rewards/accuracies": 0.25, "rewards/chosen": -0.04217529296875, "rewards/margins": -0.26123046875, "rewards/rejected": 0.2191162109375, "step": 360 }, { "epoch": 0.2674074074074074, "grad_norm": 1.675307035446167, "learning_rate": 7.325925925925925e-07, "logits/chosen": 1.8974609375, "logits/rejected": 1.8759765625, "logps/chosen": -29.734375, "logps/rejected": -38.125, "loss": 0.6152, "rewards/accuracies": 0.75, "rewards/chosen": 0.020355224609375, "rewards/margins": 0.1796875, "rewards/rejected": -0.159423828125, "step": 361 }, { "epoch": 0.26814814814814814, "grad_norm": 1.3968263864517212, "learning_rate": 7.318518518518518e-07, "logits/chosen": 1.7529296875, "logits/rejected": 1.58984375, "logps/chosen": -20.09375, "logps/rejected": -40.0, "loss": 0.6489, "rewards/accuracies": 0.75, "rewards/chosen": 0.115234375, "rewards/margins": 0.112060546875, "rewards/rejected": 0.0031280517578125, "step": 362 }, { "epoch": 0.2688888888888889, "grad_norm": 2.362663745880127, "learning_rate": 7.311111111111111e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.423828125, "logps/chosen": -49.6875, "logps/rejected": -44.78125, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": 0.007415771484375, "rewards/margins": 0.2344970703125, "rewards/rejected": -0.2271728515625, "step": 363 }, { "epoch": 0.2696296296296296, "grad_norm": 5.020481109619141, "learning_rate": 7.303703703703703e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.744140625, "logps/chosen": -18.84375, "logps/rejected": -48.34375, "loss": 0.7998, "rewards/accuracies": 0.25, "rewards/chosen": -0.061309814453125, "rewards/margins": -0.184814453125, "rewards/rejected": 0.1234130859375, "step": 364 }, { "epoch": 0.27037037037037037, "grad_norm": 2.2178895473480225, "learning_rate": 7.296296296296296e-07, "logits/chosen": 0.86572265625, "logits/rejected": 1.53515625, "logps/chosen": -65.75, "logps/rejected": -60.28125, "loss": 0.7817, "rewards/accuracies": 0.5, "rewards/chosen": -0.0051116943359375, "rewards/margins": -0.1448974609375, "rewards/rejected": 0.139892578125, "step": 365 }, { "epoch": 0.27111111111111114, "grad_norm": 1.432281255722046, "learning_rate": 7.288888888888889e-07, "logits/chosen": 1.78515625, "logits/rejected": 2.103515625, "logps/chosen": -21.828125, "logps/rejected": -46.40625, "loss": 0.5952, "rewards/accuracies": 0.75, "rewards/chosen": 0.10272216796875, "rewards/margins": 0.2420654296875, "rewards/rejected": -0.139404296875, "step": 366 }, { "epoch": 0.27185185185185184, "grad_norm": 3.43861985206604, "learning_rate": 7.281481481481481e-07, "logits/chosen": 1.6572265625, "logits/rejected": 2.31640625, "logps/chosen": -44.0, "logps/rejected": -63.9375, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.003917694091796875, "rewards/margins": 0.03131103515625, "rewards/rejected": -0.027374267578125, "step": 367 }, { "epoch": 0.2725925925925926, "grad_norm": 1.7148710489273071, "learning_rate": 7.274074074074074e-07, "logits/chosen": 1.5546875, "logits/rejected": 1.6259765625, "logps/chosen": -37.3125, "logps/rejected": -43.625, "loss": 0.5869, "rewards/accuracies": 1.0, "rewards/chosen": 0.1624755859375, "rewards/margins": 0.22705078125, "rewards/rejected": -0.064453125, "step": 368 }, { "epoch": 0.2733333333333333, "grad_norm": 1.500014066696167, "learning_rate": 7.266666666666667e-07, "logits/chosen": 0.86572265625, "logits/rejected": 1.3828125, "logps/chosen": -30.546875, "logps/rejected": -38.75, "loss": 0.6064, "rewards/accuracies": 0.75, "rewards/chosen": 0.1468505859375, "rewards/margins": 0.20166015625, "rewards/rejected": -0.0546875, "step": 369 }, { "epoch": 0.2740740740740741, "grad_norm": 6.626343727111816, "learning_rate": 7.259259259259259e-07, "logits/chosen": 1.3955078125, "logits/rejected": 1.6796875, "logps/chosen": -42.0, "logps/rejected": -25.625, "loss": 0.9277, "rewards/accuracies": 0.25, "rewards/chosen": -0.432373046875, "rewards/margins": -0.37158203125, "rewards/rejected": -0.060546875, "step": 370 }, { "epoch": 0.2748148148148148, "grad_norm": 3.8077099323272705, "learning_rate": 7.251851851851851e-07, "logits/chosen": 1.3583984375, "logits/rejected": 1.9208984375, "logps/chosen": -34.90625, "logps/rejected": -30.28125, "loss": 0.7041, "rewards/accuracies": 0.25, "rewards/chosen": 0.0660400390625, "rewards/margins": -0.0175628662109375, "rewards/rejected": 0.0836181640625, "step": 371 }, { "epoch": 0.27555555555555555, "grad_norm": 1.5663537979125977, "learning_rate": 7.244444444444444e-07, "logits/chosen": 1.77734375, "logits/rejected": 1.3076171875, "logps/chosen": -23.1875, "logps/rejected": -26.1875, "loss": 0.7261, "rewards/accuracies": 0.5, "rewards/chosen": -0.030853271484375, "rewards/margins": -0.05706787109375, "rewards/rejected": 0.0261993408203125, "step": 372 }, { "epoch": 0.2762962962962963, "grad_norm": 2.7193825244903564, "learning_rate": 7.237037037037037e-07, "logits/chosen": 1.9931640625, "logits/rejected": 1.8720703125, "logps/chosen": -41.3125, "logps/rejected": -55.5625, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": 0.01445770263671875, "rewards/margins": 0.0628662109375, "rewards/rejected": -0.0484619140625, "step": 373 }, { "epoch": 0.277037037037037, "grad_norm": 1.5214704275131226, "learning_rate": 7.229629629629629e-07, "logits/chosen": 1.982421875, "logits/rejected": 1.4794921875, "logps/chosen": -24.765625, "logps/rejected": -50.1875, "loss": 0.5449, "rewards/accuracies": 0.75, "rewards/chosen": 0.68017578125, "rewards/margins": 0.59033203125, "rewards/rejected": 0.08984375, "step": 374 }, { "epoch": 0.2777777777777778, "grad_norm": 2.313647747039795, "learning_rate": 7.222222222222221e-07, "logits/chosen": 1.755859375, "logits/rejected": 1.5546875, "logps/chosen": -50.75, "logps/rejected": -71.0625, "loss": 0.5879, "rewards/accuracies": 1.0, "rewards/chosen": -0.07696533203125, "rewards/margins": 0.246826171875, "rewards/rejected": -0.32373046875, "step": 375 }, { "epoch": 0.2785185185185185, "grad_norm": 1.5400835275650024, "learning_rate": 7.214814814814814e-07, "logits/chosen": 1.3876953125, "logits/rejected": 2.111328125, "logps/chosen": -26.453125, "logps/rejected": -39.34375, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": 0.0222625732421875, "rewards/margins": -0.00975799560546875, "rewards/rejected": 0.032012939453125, "step": 376 }, { "epoch": 0.27925925925925926, "grad_norm": 29.896013259887695, "learning_rate": 7.207407407407408e-07, "logits/chosen": 1.396484375, "logits/rejected": 1.5712890625, "logps/chosen": -47.625, "logps/rejected": -56.375, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": 0.227294921875, "rewards/margins": 0.2027587890625, "rewards/rejected": 0.024627685546875, "step": 377 }, { "epoch": 0.28, "grad_norm": 1.9397859573364258, "learning_rate": 7.2e-07, "logits/chosen": 1.203125, "logits/rejected": 1.447265625, "logps/chosen": -23.421875, "logps/rejected": -35.0625, "loss": 1.2129, "rewards/accuracies": 0.5, "rewards/chosen": 0.12420654296875, "rewards/margins": -0.66650390625, "rewards/rejected": 0.791015625, "step": 378 }, { "epoch": 0.28074074074074074, "grad_norm": 1.7438231706619263, "learning_rate": 7.192592592592593e-07, "logits/chosen": 1.8916015625, "logits/rejected": 1.80078125, "logps/chosen": -28.25, "logps/rejected": -39.625, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": 0.06719970703125, "rewards/margins": 0.09619140625, "rewards/rejected": -0.0289459228515625, "step": 379 }, { "epoch": 0.2814814814814815, "grad_norm": 2.451754331588745, "learning_rate": 7.185185185185186e-07, "logits/chosen": 1.662109375, "logits/rejected": 1.6669921875, "logps/chosen": -32.8125, "logps/rejected": -76.875, "loss": 0.6387, "rewards/accuracies": 1.0, "rewards/chosen": -0.03851318359375, "rewards/margins": 0.11505126953125, "rewards/rejected": -0.153564453125, "step": 380 }, { "epoch": 0.2822222222222222, "grad_norm": 1.8156601190567017, "learning_rate": 7.177777777777777e-07, "logits/chosen": 1.578125, "logits/rejected": 1.392578125, "logps/chosen": -28.140625, "logps/rejected": -58.6875, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": 0.01233673095703125, "rewards/margins": 0.1639404296875, "rewards/rejected": -0.151611328125, "step": 381 }, { "epoch": 0.28296296296296297, "grad_norm": 2.0041327476501465, "learning_rate": 7.17037037037037e-07, "logits/chosen": 1.4853515625, "logits/rejected": 1.4658203125, "logps/chosen": -46.875, "logps/rejected": -25.21875, "loss": 0.7637, "rewards/accuracies": 0.5, "rewards/chosen": -0.11328125, "rewards/margins": -0.12152099609375, "rewards/rejected": 0.0081939697265625, "step": 382 }, { "epoch": 0.2837037037037037, "grad_norm": 1.8957109451293945, "learning_rate": 7.162962962962963e-07, "logits/chosen": 1.6435546875, "logits/rejected": 1.5498046875, "logps/chosen": -33.625, "logps/rejected": -41.09375, "loss": 0.7466, "rewards/accuracies": 0.25, "rewards/chosen": -0.0667724609375, "rewards/margins": -0.087158203125, "rewards/rejected": 0.020355224609375, "step": 383 }, { "epoch": 0.28444444444444444, "grad_norm": 2.5130856037139893, "learning_rate": 7.155555555555555e-07, "logits/chosen": 1.7119140625, "logits/rejected": 1.36328125, "logps/chosen": -32.1875, "logps/rejected": -42.53125, "loss": 0.8481, "rewards/accuracies": 0.0, "rewards/chosen": -0.25390625, "rewards/margins": -0.28466796875, "rewards/rejected": 0.03082275390625, "step": 384 }, { "epoch": 0.2851851851851852, "grad_norm": 2.1090030670166016, "learning_rate": 7.148148148148148e-07, "logits/chosen": 1.6689453125, "logits/rejected": 1.8076171875, "logps/chosen": -36.5, "logps/rejected": -27.890625, "loss": 0.7056, "rewards/accuracies": 0.25, "rewards/chosen": 0.034393310546875, "rewards/margins": -0.010528564453125, "rewards/rejected": 0.044921875, "step": 385 }, { "epoch": 0.2859259259259259, "grad_norm": 1.5634794235229492, "learning_rate": 7.14074074074074e-07, "logits/chosen": 1.6875, "logits/rejected": 1.5927734375, "logps/chosen": -28.421875, "logps/rejected": -41.9375, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": 0.025360107421875, "rewards/margins": 0.09881591796875, "rewards/rejected": -0.07342529296875, "step": 386 }, { "epoch": 0.2866666666666667, "grad_norm": 1.6746305227279663, "learning_rate": 7.133333333333333e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.46484375, "logps/chosen": -38.75, "logps/rejected": -34.28125, "loss": 0.7358, "rewards/accuracies": 0.5, "rewards/chosen": 0.05859375, "rewards/margins": -0.021942138671875, "rewards/rejected": 0.0804443359375, "step": 387 }, { "epoch": 0.2874074074074074, "grad_norm": 2.4502265453338623, "learning_rate": 7.125925925925925e-07, "logits/chosen": 1.939453125, "logits/rejected": 1.99609375, "logps/chosen": -41.71875, "logps/rejected": -68.0, "loss": 0.6606, "rewards/accuracies": 0.5, "rewards/chosen": 0.041015625, "rewards/margins": 0.0797119140625, "rewards/rejected": -0.038665771484375, "step": 388 }, { "epoch": 0.28814814814814815, "grad_norm": 3.571444034576416, "learning_rate": 7.118518518518518e-07, "logits/chosen": 1.798828125, "logits/rejected": 1.8212890625, "logps/chosen": -36.3125, "logps/rejected": -80.1875, "loss": 0.7812, "rewards/accuracies": 0.5, "rewards/chosen": 0.216064453125, "rewards/margins": -0.09130859375, "rewards/rejected": 0.307373046875, "step": 389 }, { "epoch": 0.28888888888888886, "grad_norm": 1.9790172576904297, "learning_rate": 7.111111111111111e-07, "logits/chosen": 2.32421875, "logits/rejected": 1.607421875, "logps/chosen": -43.34375, "logps/rejected": -39.0625, "loss": 0.7119, "rewards/accuracies": 0.5, "rewards/chosen": 0.11834716796875, "rewards/margins": -6.103515625e-05, "rewards/rejected": 0.118408203125, "step": 390 }, { "epoch": 0.2896296296296296, "grad_norm": 62.00128936767578, "learning_rate": 7.103703703703703e-07, "logits/chosen": 1.720703125, "logits/rejected": 1.572265625, "logps/chosen": -40.4375, "logps/rejected": -42.65625, "loss": 0.5293, "rewards/accuracies": 0.75, "rewards/chosen": 0.5419921875, "rewards/margins": 0.6064453125, "rewards/rejected": -0.064453125, "step": 391 }, { "epoch": 0.2903703703703704, "grad_norm": 1.4439634084701538, "learning_rate": 7.096296296296296e-07, "logits/chosen": 1.4404296875, "logits/rejected": 1.423828125, "logps/chosen": -24.3125, "logps/rejected": -34.125, "loss": 0.7939, "rewards/accuracies": 0.0, "rewards/chosen": -0.09765625, "rewards/margins": -0.188720703125, "rewards/rejected": 0.09100341796875, "step": 392 }, { "epoch": 0.2911111111111111, "grad_norm": 8.461791038513184, "learning_rate": 7.088888888888889e-07, "logits/chosen": 1.8408203125, "logits/rejected": 2.2421875, "logps/chosen": -30.671875, "logps/rejected": -56.59375, "loss": 0.875, "rewards/accuracies": 0.0, "rewards/chosen": -0.289306640625, "rewards/margins": -0.328125, "rewards/rejected": 0.038665771484375, "step": 393 }, { "epoch": 0.29185185185185186, "grad_norm": 2.26845645904541, "learning_rate": 7.081481481481482e-07, "logits/chosen": 1.494140625, "logits/rejected": 1.8173828125, "logps/chosen": -30.609375, "logps/rejected": -44.9375, "loss": 0.8174, "rewards/accuracies": 0.0, "rewards/chosen": -0.067138671875, "rewards/margins": -0.2242431640625, "rewards/rejected": 0.156982421875, "step": 394 }, { "epoch": 0.29259259259259257, "grad_norm": 1.7463706731796265, "learning_rate": 7.074074074074074e-07, "logits/chosen": 1.943359375, "logits/rejected": 1.93359375, "logps/chosen": -25.46875, "logps/rejected": -77.6875, "loss": 0.71, "rewards/accuracies": 0.5, "rewards/chosen": 0.024993896484375, "rewards/margins": -0.010589599609375, "rewards/rejected": 0.03558349609375, "step": 395 }, { "epoch": 0.29333333333333333, "grad_norm": 2.654940128326416, "learning_rate": 7.066666666666666e-07, "logits/chosen": 1.7890625, "logits/rejected": 1.462890625, "logps/chosen": -25.734375, "logps/rejected": -28.421875, "loss": 0.7769, "rewards/accuracies": 0.25, "rewards/chosen": -0.107421875, "rewards/margins": -0.15380859375, "rewards/rejected": 0.046478271484375, "step": 396 }, { "epoch": 0.2940740740740741, "grad_norm": 2.9578258991241455, "learning_rate": 7.059259259259259e-07, "logits/chosen": 1.3564453125, "logits/rejected": 1.3310546875, "logps/chosen": -35.3125, "logps/rejected": -67.3125, "loss": 0.7788, "rewards/accuracies": 0.75, "rewards/chosen": 0.08245849609375, "rewards/margins": -0.10736083984375, "rewards/rejected": 0.1898193359375, "step": 397 }, { "epoch": 0.2948148148148148, "grad_norm": 2.042158365249634, "learning_rate": 7.051851851851851e-07, "logits/chosen": 1.91015625, "logits/rejected": 1.9033203125, "logps/chosen": -24.875, "logps/rejected": -42.25, "loss": 0.584, "rewards/accuracies": 0.75, "rewards/chosen": 0.125, "rewards/margins": 0.28857421875, "rewards/rejected": -0.1636962890625, "step": 398 }, { "epoch": 0.29555555555555557, "grad_norm": 5.538578033447266, "learning_rate": 7.044444444444444e-07, "logits/chosen": 1.7197265625, "logits/rejected": 2.193359375, "logps/chosen": -27.359375, "logps/rejected": -80.0625, "loss": 0.6025, "rewards/accuracies": 0.5, "rewards/chosen": 0.01328277587890625, "rewards/margins": 0.224609375, "rewards/rejected": -0.2113037109375, "step": 399 }, { "epoch": 0.2962962962962963, "grad_norm": 1.9173061847686768, "learning_rate": 7.037037037037037e-07, "logits/chosen": 1.556640625, "logits/rejected": 1.9150390625, "logps/chosen": -32.53125, "logps/rejected": -41.4375, "loss": 0.8467, "rewards/accuracies": 0.5, "rewards/chosen": -0.010955810546875, "rewards/margins": -0.26171875, "rewards/rejected": 0.250732421875, "step": 400 }, { "epoch": 0.29703703703703704, "grad_norm": 3.12882924079895, "learning_rate": 7.029629629629629e-07, "logits/chosen": 1.826171875, "logits/rejected": 1.6455078125, "logps/chosen": -48.25, "logps/rejected": -61.84375, "loss": 1.2285, "rewards/accuracies": 0.75, "rewards/chosen": -0.0546875, "rewards/margins": -0.5185546875, "rewards/rejected": 0.46337890625, "step": 401 }, { "epoch": 0.29777777777777775, "grad_norm": 2.2704288959503174, "learning_rate": 7.022222222222221e-07, "logits/chosen": 2.349609375, "logits/rejected": 1.43359375, "logps/chosen": -39.78125, "logps/rejected": -44.71875, "loss": 0.7729, "rewards/accuracies": 0.25, "rewards/chosen": -0.22265625, "rewards/margins": -0.11285400390625, "rewards/rejected": -0.1097412109375, "step": 402 }, { "epoch": 0.2985185185185185, "grad_norm": 2.1775527000427246, "learning_rate": 7.014814814814814e-07, "logits/chosen": 1.4541015625, "logits/rejected": 1.548828125, "logps/chosen": -32.875, "logps/rejected": -63.875, "loss": 0.832, "rewards/accuracies": 0.0, "rewards/chosen": -0.1259765625, "rewards/margins": -0.25244140625, "rewards/rejected": 0.1265869140625, "step": 403 }, { "epoch": 0.2992592592592593, "grad_norm": 6.645136833190918, "learning_rate": 7.007407407407408e-07, "logits/chosen": 1.4248046875, "logits/rejected": 1.845703125, "logps/chosen": -36.25, "logps/rejected": -32.1875, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": -0.0102386474609375, "rewards/margins": -0.01806640625, "rewards/rejected": 0.0078125, "step": 404 }, { "epoch": 0.3, "grad_norm": 1.819262981414795, "learning_rate": 7e-07, "logits/chosen": 2.1328125, "logits/rejected": 2.1875, "logps/chosen": -36.25, "logps/rejected": -59.4375, "loss": 0.6504, "rewards/accuracies": 0.5, "rewards/chosen": 0.00275421142578125, "rewards/margins": 0.09295654296875, "rewards/rejected": -0.0902099609375, "step": 405 }, { "epoch": 0.30074074074074075, "grad_norm": 1.7437958717346191, "learning_rate": 6.992592592592593e-07, "logits/chosen": 1.23046875, "logits/rejected": 1.6240234375, "logps/chosen": -23.0625, "logps/rejected": -53.53125, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.04296875, "rewards/margins": 0.033966064453125, "rewards/rejected": -0.07696533203125, "step": 406 }, { "epoch": 0.30148148148148146, "grad_norm": 1.965521216392517, "learning_rate": 6.985185185185185e-07, "logits/chosen": 1.1767578125, "logits/rejected": 1.5478515625, "logps/chosen": -38.40625, "logps/rejected": -51.4375, "loss": 0.6987, "rewards/accuracies": 0.5, "rewards/chosen": -0.03204345703125, "rewards/margins": -0.00705718994140625, "rewards/rejected": -0.024993896484375, "step": 407 }, { "epoch": 0.3022222222222222, "grad_norm": 2.151381015777588, "learning_rate": 6.977777777777778e-07, "logits/chosen": 1.3369140625, "logits/rejected": 1.4716796875, "logps/chosen": -27.5, "logps/rejected": -42.46875, "loss": 1.0547, "rewards/accuracies": 0.0, "rewards/chosen": -0.2406005859375, "rewards/margins": -0.6103515625, "rewards/rejected": 0.36962890625, "step": 408 }, { "epoch": 0.302962962962963, "grad_norm": 1.923752784729004, "learning_rate": 6.97037037037037e-07, "logits/chosen": 0.9833984375, "logits/rejected": 1.6669921875, "logps/chosen": -46.6875, "logps/rejected": -37.40625, "loss": 0.873, "rewards/accuracies": 0.5, "rewards/chosen": -0.0265655517578125, "rewards/margins": -0.2900390625, "rewards/rejected": 0.263427734375, "step": 409 }, { "epoch": 0.3037037037037037, "grad_norm": 1.8734281063079834, "learning_rate": 6.962962962962963e-07, "logits/chosen": 1.6025390625, "logits/rejected": 1.833984375, "logps/chosen": -25.25, "logps/rejected": -44.71875, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": 0.2125244140625, "rewards/margins": -0.0311279296875, "rewards/rejected": 0.24365234375, "step": 410 }, { "epoch": 0.30444444444444446, "grad_norm": 1.58878755569458, "learning_rate": 6.955555555555556e-07, "logits/chosen": 1.9453125, "logits/rejected": 1.6142578125, "logps/chosen": -28.1875, "logps/rejected": -32.75, "loss": 0.6567, "rewards/accuracies": 0.75, "rewards/chosen": 0.0693359375, "rewards/margins": 0.09429931640625, "rewards/rejected": -0.0250244140625, "step": 411 }, { "epoch": 0.30518518518518517, "grad_norm": 1.669329285621643, "learning_rate": 6.948148148148147e-07, "logits/chosen": 1.6318359375, "logits/rejected": 2.060546875, "logps/chosen": -36.0, "logps/rejected": -51.40625, "loss": 0.6045, "rewards/accuracies": 1.0, "rewards/chosen": 0.0635986328125, "rewards/margins": 0.1988525390625, "rewards/rejected": -0.13525390625, "step": 412 }, { "epoch": 0.30592592592592593, "grad_norm": 1.9134403467178345, "learning_rate": 6.94074074074074e-07, "logits/chosen": 1.361328125, "logits/rejected": 1.7236328125, "logps/chosen": -34.4375, "logps/rejected": -38.0, "loss": 0.6675, "rewards/accuracies": 0.5, "rewards/chosen": 0.0804443359375, "rewards/margins": 0.08282470703125, "rewards/rejected": -0.002288818359375, "step": 413 }, { "epoch": 0.30666666666666664, "grad_norm": 4.587564945220947, "learning_rate": 6.933333333333333e-07, "logits/chosen": 2.2578125, "logits/rejected": 1.796875, "logps/chosen": -48.625, "logps/rejected": -42.53125, "loss": 0.8213, "rewards/accuracies": 0.5, "rewards/chosen": -0.1531982421875, "rewards/margins": -0.212158203125, "rewards/rejected": 0.0589599609375, "step": 414 }, { "epoch": 0.3074074074074074, "grad_norm": 1.564726710319519, "learning_rate": 6.925925925925925e-07, "logits/chosen": 1.00390625, "logits/rejected": 1.2939453125, "logps/chosen": -23.859375, "logps/rejected": -46.59375, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 0.10821533203125, "rewards/margins": 0.36572265625, "rewards/rejected": -0.25732421875, "step": 415 }, { "epoch": 0.30814814814814817, "grad_norm": 3.087045192718506, "learning_rate": 6.918518518518518e-07, "logits/chosen": 2.546875, "logits/rejected": 2.044921875, "logps/chosen": -33.375, "logps/rejected": -36.65625, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": -0.135986328125, "rewards/margins": 0.051513671875, "rewards/rejected": -0.1875, "step": 416 }, { "epoch": 0.3088888888888889, "grad_norm": 1.2077984809875488, "learning_rate": 6.911111111111111e-07, "logits/chosen": 1.5634765625, "logits/rejected": 1.60546875, "logps/chosen": -28.234375, "logps/rejected": -39.90625, "loss": 0.6499, "rewards/accuracies": 0.75, "rewards/chosen": 0.164794921875, "rewards/margins": 0.1260986328125, "rewards/rejected": 0.0386962890625, "step": 417 }, { "epoch": 0.30962962962962964, "grad_norm": 2.2305774688720703, "learning_rate": 6.903703703703704e-07, "logits/chosen": 1.6123046875, "logits/rejected": 1.3388671875, "logps/chosen": -44.1875, "logps/rejected": -29.234375, "loss": 0.6494, "rewards/accuracies": 0.5, "rewards/chosen": 0.147216796875, "rewards/margins": 0.16015625, "rewards/rejected": -0.0128936767578125, "step": 418 }, { "epoch": 0.31037037037037035, "grad_norm": 2.087620496749878, "learning_rate": 6.896296296296296e-07, "logits/chosen": 1.595703125, "logits/rejected": 1.7548828125, "logps/chosen": -32.8125, "logps/rejected": -52.0, "loss": 0.729, "rewards/accuracies": 0.25, "rewards/chosen": -0.0252227783203125, "rewards/margins": 0.138427734375, "rewards/rejected": -0.163818359375, "step": 419 }, { "epoch": 0.3111111111111111, "grad_norm": 1.8313374519348145, "learning_rate": 6.888888888888889e-07, "logits/chosen": 1.720703125, "logits/rejected": 1.5029296875, "logps/chosen": -30.859375, "logps/rejected": -47.21875, "loss": 0.7832, "rewards/accuracies": 0.25, "rewards/chosen": -0.11053466796875, "rewards/margins": -0.166015625, "rewards/rejected": 0.05548095703125, "step": 420 }, { "epoch": 0.3118518518518518, "grad_norm": 1.9491385221481323, "learning_rate": 6.881481481481482e-07, "logits/chosen": 1.9853515625, "logits/rejected": 2.46484375, "logps/chosen": -35.09375, "logps/rejected": -36.0625, "loss": 0.7993, "rewards/accuracies": 0.0, "rewards/chosen": -0.202392578125, "rewards/margins": -0.1988525390625, "rewards/rejected": -0.003505706787109375, "step": 421 }, { "epoch": 0.3125925925925926, "grad_norm": 2.065169334411621, "learning_rate": 6.874074074074073e-07, "logits/chosen": 1.7412109375, "logits/rejected": 1.5576171875, "logps/chosen": -56.34375, "logps/rejected": -35.125, "loss": 0.6357, "rewards/accuracies": 0.5, "rewards/chosen": 0.29345703125, "rewards/margins": 0.232177734375, "rewards/rejected": 0.06134033203125, "step": 422 }, { "epoch": 0.31333333333333335, "grad_norm": 2.21032977104187, "learning_rate": 6.866666666666666e-07, "logits/chosen": 1.1962890625, "logits/rejected": 1.9912109375, "logps/chosen": -39.0625, "logps/rejected": -82.0625, "loss": 0.5732, "rewards/accuracies": 0.75, "rewards/chosen": -0.0595703125, "rewards/margins": 0.41552734375, "rewards/rejected": -0.47509765625, "step": 423 }, { "epoch": 0.31407407407407406, "grad_norm": 1.5167747735977173, "learning_rate": 6.859259259259259e-07, "logits/chosen": 1.3623046875, "logits/rejected": 2.20703125, "logps/chosen": -37.375, "logps/rejected": -52.5, "loss": 0.6514, "rewards/accuracies": 0.75, "rewards/chosen": -0.0250091552734375, "rewards/margins": 0.08673095703125, "rewards/rejected": -0.11175537109375, "step": 424 }, { "epoch": 0.3148148148148148, "grad_norm": 1.7637958526611328, "learning_rate": 6.851851851851852e-07, "logits/chosen": 1.591796875, "logits/rejected": 1.8173828125, "logps/chosen": -31.75, "logps/rejected": -45.8125, "loss": 0.6768, "rewards/accuracies": 0.5, "rewards/chosen": -0.0797119140625, "rewards/margins": 0.050048828125, "rewards/rejected": -0.129638671875, "step": 425 }, { "epoch": 0.31555555555555553, "grad_norm": 1.861825704574585, "learning_rate": 6.844444444444444e-07, "logits/chosen": 0.96533203125, "logits/rejected": 1.076171875, "logps/chosen": -36.8125, "logps/rejected": -36.21875, "loss": 0.7871, "rewards/accuracies": 0.25, "rewards/chosen": -0.142578125, "rewards/margins": -0.1683349609375, "rewards/rejected": 0.025787353515625, "step": 426 }, { "epoch": 0.3162962962962963, "grad_norm": 1.7682939767837524, "learning_rate": 6.837037037037036e-07, "logits/chosen": 1.8955078125, "logits/rejected": 1.9482421875, "logps/chosen": -38.0625, "logps/rejected": -68.875, "loss": 0.6538, "rewards/accuracies": 0.5, "rewards/chosen": -0.07769775390625, "rewards/margins": 0.1082763671875, "rewards/rejected": -0.18603515625, "step": 427 }, { "epoch": 0.31703703703703706, "grad_norm": 2.490684986114502, "learning_rate": 6.829629629629629e-07, "logits/chosen": 1.712890625, "logits/rejected": 1.845703125, "logps/chosen": -26.828125, "logps/rejected": -45.375, "loss": 0.7988, "rewards/accuracies": 0.25, "rewards/chosen": -0.1617431640625, "rewards/margins": -0.176513671875, "rewards/rejected": 0.01483917236328125, "step": 428 }, { "epoch": 0.31777777777777777, "grad_norm": 1.3977816104888916, "learning_rate": 6.822222222222221e-07, "logits/chosen": 1.798828125, "logits/rejected": 1.5986328125, "logps/chosen": -24.34375, "logps/rejected": -27.546875, "loss": 0.6128, "rewards/accuracies": 1.0, "rewards/chosen": 0.0648193359375, "rewards/margins": 0.169921875, "rewards/rejected": -0.1051025390625, "step": 429 }, { "epoch": 0.31851851851851853, "grad_norm": 1.269136667251587, "learning_rate": 6.814814814814814e-07, "logits/chosen": 1.6171875, "logits/rejected": 2.568359375, "logps/chosen": -33.03125, "logps/rejected": -34.0625, "loss": 0.5317, "rewards/accuracies": 0.5, "rewards/chosen": 0.03594970703125, "rewards/margins": 0.66748046875, "rewards/rejected": -0.6318359375, "step": 430 }, { "epoch": 0.31925925925925924, "grad_norm": 1.8698382377624512, "learning_rate": 6.807407407407408e-07, "logits/chosen": 1.3583984375, "logits/rejected": 0.99365234375, "logps/chosen": -21.421875, "logps/rejected": -59.15625, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": 0.13037109375, "rewards/margins": 0.105224609375, "rewards/rejected": 0.0250396728515625, "step": 431 }, { "epoch": 0.32, "grad_norm": 1.8278090953826904, "learning_rate": 6.800000000000001e-07, "logits/chosen": 1.615234375, "logits/rejected": 1.8671875, "logps/chosen": -29.34375, "logps/rejected": -63.84375, "loss": 0.7388, "rewards/accuracies": 0.25, "rewards/chosen": -0.0804443359375, "rewards/margins": -0.0882568359375, "rewards/rejected": 0.00780487060546875, "step": 432 }, { "epoch": 0.3207407407407407, "grad_norm": 1.6990636587142944, "learning_rate": 6.792592592592592e-07, "logits/chosen": 1.78125, "logits/rejected": 2.166015625, "logps/chosen": -32.21875, "logps/rejected": -33.375, "loss": 0.5752, "rewards/accuracies": 0.75, "rewards/chosen": 0.161376953125, "rewards/margins": 0.264404296875, "rewards/rejected": -0.10308837890625, "step": 433 }, { "epoch": 0.3214814814814815, "grad_norm": 1.6766248941421509, "learning_rate": 6.785185185185185e-07, "logits/chosen": 1.271484375, "logits/rejected": 1.2529296875, "logps/chosen": -36.5625, "logps/rejected": -31.015625, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": 0.1917724609375, "rewards/margins": 0.2288818359375, "rewards/rejected": -0.037109375, "step": 434 }, { "epoch": 0.32222222222222224, "grad_norm": 2.505631446838379, "learning_rate": 6.777777777777778e-07, "logits/chosen": 1.798828125, "logits/rejected": 1.3076171875, "logps/chosen": -48.09375, "logps/rejected": -35.6875, "loss": 0.7114, "rewards/accuracies": 0.25, "rewards/chosen": -0.057830810546875, "rewards/margins": -0.0054931640625, "rewards/rejected": -0.052337646484375, "step": 435 }, { "epoch": 0.32296296296296295, "grad_norm": 1.8028271198272705, "learning_rate": 6.77037037037037e-07, "logits/chosen": 1.68359375, "logits/rejected": 1.662109375, "logps/chosen": -31.625, "logps/rejected": -33.1875, "loss": 0.7515, "rewards/accuracies": 0.25, "rewards/chosen": 0.046478271484375, "rewards/margins": -0.108642578125, "rewards/rejected": 0.155029296875, "step": 436 }, { "epoch": 0.3237037037037037, "grad_norm": 2.78373122215271, "learning_rate": 6.762962962962963e-07, "logits/chosen": 1.671875, "logits/rejected": 1.634765625, "logps/chosen": -30.28125, "logps/rejected": -36.71875, "loss": 0.6196, "rewards/accuracies": 0.75, "rewards/chosen": 0.133544921875, "rewards/margins": 0.1558837890625, "rewards/rejected": -0.0222625732421875, "step": 437 }, { "epoch": 0.3244444444444444, "grad_norm": 2.1129331588745117, "learning_rate": 6.755555555555555e-07, "logits/chosen": 1.53125, "logits/rejected": 1.7822265625, "logps/chosen": -30.84375, "logps/rejected": -36.09375, "loss": 0.8389, "rewards/accuracies": 0.0, "rewards/chosen": -0.1715087890625, "rewards/margins": -0.26806640625, "rewards/rejected": 0.09649658203125, "step": 438 }, { "epoch": 0.3251851851851852, "grad_norm": 2.01611328125, "learning_rate": 6.748148148148148e-07, "logits/chosen": 2.0078125, "logits/rejected": 2.3046875, "logps/chosen": -49.75, "logps/rejected": -64.625, "loss": 0.5195, "rewards/accuracies": 0.5, "rewards/chosen": 0.3125, "rewards/margins": 1.1591796875, "rewards/rejected": -0.84619140625, "step": 439 }, { "epoch": 0.32592592592592595, "grad_norm": 2.3418636322021484, "learning_rate": 6.74074074074074e-07, "logits/chosen": 0.8642578125, "logits/rejected": 1.7666015625, "logps/chosen": -42.6875, "logps/rejected": -65.3125, "loss": 0.7402, "rewards/accuracies": 0.25, "rewards/chosen": 0.0211029052734375, "rewards/margins": -0.0621337890625, "rewards/rejected": 0.08319091796875, "step": 440 }, { "epoch": 0.32666666666666666, "grad_norm": 2.7923402786254883, "learning_rate": 6.733333333333333e-07, "logits/chosen": 1.1435546875, "logits/rejected": 1.7333984375, "logps/chosen": -32.875, "logps/rejected": -36.375, "loss": 0.8872, "rewards/accuracies": 0.25, "rewards/chosen": -0.09375, "rewards/margins": -0.27880859375, "rewards/rejected": 0.1851806640625, "step": 441 }, { "epoch": 0.3274074074074074, "grad_norm": 10.839447975158691, "learning_rate": 6.725925925925926e-07, "logits/chosen": 1.5205078125, "logits/rejected": 1.734375, "logps/chosen": -37.78125, "logps/rejected": -50.25, "loss": 0.9561, "rewards/accuracies": 0.75, "rewards/chosen": -0.108642578125, "rewards/margins": -0.28076171875, "rewards/rejected": 0.1717529296875, "step": 442 }, { "epoch": 0.32814814814814813, "grad_norm": 1.753770112991333, "learning_rate": 6.718518518518517e-07, "logits/chosen": 1.8515625, "logits/rejected": 1.435546875, "logps/chosen": -29.578125, "logps/rejected": -36.9375, "loss": 0.749, "rewards/accuracies": 0.25, "rewards/chosen": -0.0992431640625, "rewards/margins": -0.08941650390625, "rewards/rejected": -0.00975799560546875, "step": 443 }, { "epoch": 0.3288888888888889, "grad_norm": 1.955764889717102, "learning_rate": 6.711111111111111e-07, "logits/chosen": 1.498046875, "logits/rejected": 2.076171875, "logps/chosen": -35.09375, "logps/rejected": -53.875, "loss": 0.6113, "rewards/accuracies": 0.5, "rewards/chosen": -0.0625, "rewards/margins": 0.53759765625, "rewards/rejected": -0.60009765625, "step": 444 }, { "epoch": 0.3296296296296296, "grad_norm": 3.317291259765625, "learning_rate": 6.703703703703704e-07, "logits/chosen": 1.7841796875, "logits/rejected": 1.705078125, "logps/chosen": -30.0, "logps/rejected": -49.75, "loss": 0.7476, "rewards/accuracies": 0.25, "rewards/chosen": -0.1383056640625, "rewards/margins": -0.0992431640625, "rewards/rejected": -0.0390625, "step": 445 }, { "epoch": 0.33037037037037037, "grad_norm": 1.8782461881637573, "learning_rate": 6.696296296296296e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.5205078125, "logps/chosen": -59.15625, "logps/rejected": -35.59375, "loss": 0.6631, "rewards/accuracies": 0.75, "rewards/chosen": 0.07574462890625, "rewards/margins": 0.085205078125, "rewards/rejected": -0.0093536376953125, "step": 446 }, { "epoch": 0.33111111111111113, "grad_norm": 1.8609412908554077, "learning_rate": 6.688888888888889e-07, "logits/chosen": 1.490234375, "logits/rejected": 1.138671875, "logps/chosen": -34.5625, "logps/rejected": -36.90625, "loss": 0.7021, "rewards/accuracies": 0.75, "rewards/chosen": 0.01015472412109375, "rewards/margins": 0.00238800048828125, "rewards/rejected": 0.00778961181640625, "step": 447 }, { "epoch": 0.33185185185185184, "grad_norm": 2.145643949508667, "learning_rate": 6.681481481481481e-07, "logits/chosen": 1.41015625, "logits/rejected": 1.7685546875, "logps/chosen": -27.8125, "logps/rejected": -38.46875, "loss": 0.7217, "rewards/accuracies": 0.25, "rewards/chosen": -0.047882080078125, "rewards/margins": -0.042816162109375, "rewards/rejected": -0.00510406494140625, "step": 448 }, { "epoch": 0.3325925925925926, "grad_norm": 4.124001979827881, "learning_rate": 6.674074074074074e-07, "logits/chosen": 1.3740234375, "logits/rejected": 1.8583984375, "logps/chosen": -32.0625, "logps/rejected": -43.75, "loss": 3.1953, "rewards/accuracies": 0.25, "rewards/chosen": -0.1397705078125, "rewards/margins": -2.654296875, "rewards/rejected": 2.515625, "step": 449 }, { "epoch": 0.3333333333333333, "grad_norm": 1.4577915668487549, "learning_rate": 6.666666666666666e-07, "logits/chosen": 1.591796875, "logits/rejected": 1.1318359375, "logps/chosen": -42.4375, "logps/rejected": -36.375, "loss": 0.5464, "rewards/accuracies": 0.5, "rewards/chosen": 0.74853515625, "rewards/margins": 0.87744140625, "rewards/rejected": -0.1285400390625, "step": 450 }, { "epoch": 0.3340740740740741, "grad_norm": 4.16238260269165, "learning_rate": 6.659259259259259e-07, "logits/chosen": 1.2373046875, "logits/rejected": 1.712890625, "logps/chosen": -30.859375, "logps/rejected": -40.125, "loss": 0.7109, "rewards/accuracies": 0.25, "rewards/chosen": 0.166015625, "rewards/margins": -0.016387939453125, "rewards/rejected": 0.182373046875, "step": 451 }, { "epoch": 0.3348148148148148, "grad_norm": 1.655313491821289, "learning_rate": 6.651851851851852e-07, "logits/chosen": 1.7216796875, "logits/rejected": 1.78125, "logps/chosen": -30.671875, "logps/rejected": -38.28125, "loss": 0.7393, "rewards/accuracies": 0.5, "rewards/chosen": -0.06524658203125, "rewards/margins": -0.0640869140625, "rewards/rejected": -0.001171112060546875, "step": 452 }, { "epoch": 0.33555555555555555, "grad_norm": 1.9757471084594727, "learning_rate": 6.644444444444443e-07, "logits/chosen": 2.384765625, "logits/rejected": 2.013671875, "logps/chosen": -54.71875, "logps/rejected": -49.9375, "loss": 0.752, "rewards/accuracies": 0.25, "rewards/chosen": 0.04217529296875, "rewards/margins": -0.08587646484375, "rewards/rejected": 0.1280517578125, "step": 453 }, { "epoch": 0.3362962962962963, "grad_norm": 2.6172914505004883, "learning_rate": 6.637037037037036e-07, "logits/chosen": 1.6708984375, "logits/rejected": 2.2734375, "logps/chosen": -26.953125, "logps/rejected": -46.84375, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": 0.2144775390625, "rewards/margins": 0.1072998046875, "rewards/rejected": 0.107177734375, "step": 454 }, { "epoch": 0.337037037037037, "grad_norm": 1.4523365497589111, "learning_rate": 6.629629629629629e-07, "logits/chosen": 1.076171875, "logits/rejected": 1.2421875, "logps/chosen": -24.59375, "logps/rejected": -32.0625, "loss": 0.6738, "rewards/accuracies": 0.75, "rewards/chosen": 0.05859375, "rewards/margins": 0.04022216796875, "rewards/rejected": 0.01837158203125, "step": 455 }, { "epoch": 0.3377777777777778, "grad_norm": 3.5335328578948975, "learning_rate": 6.622222222222222e-07, "logits/chosen": 1.3583984375, "logits/rejected": 1.2919921875, "logps/chosen": -51.625, "logps/rejected": -43.875, "loss": 0.748, "rewards/accuracies": 0.25, "rewards/chosen": -0.021087646484375, "rewards/margins": -0.10406494140625, "rewards/rejected": 0.0830078125, "step": 456 }, { "epoch": 0.3385185185185185, "grad_norm": 2.1750729084014893, "learning_rate": 6.614814814814815e-07, "logits/chosen": 1.9736328125, "logits/rejected": 1.845703125, "logps/chosen": -37.59375, "logps/rejected": -30.859375, "loss": 0.6748, "rewards/accuracies": 0.25, "rewards/chosen": -0.097900390625, "rewards/margins": 0.06146240234375, "rewards/rejected": -0.1593017578125, "step": 457 }, { "epoch": 0.33925925925925926, "grad_norm": 2.1253581047058105, "learning_rate": 6.607407407407408e-07, "logits/chosen": 2.048828125, "logits/rejected": 1.6064453125, "logps/chosen": -27.625, "logps/rejected": -65.5, "loss": 0.7119, "rewards/accuracies": 0.25, "rewards/chosen": 0.07421875, "rewards/margins": -0.03240966796875, "rewards/rejected": 0.10662841796875, "step": 458 }, { "epoch": 0.34, "grad_norm": 1.9839017391204834, "learning_rate": 6.6e-07, "logits/chosen": 1.125, "logits/rejected": 1.798828125, "logps/chosen": -35.84375, "logps/rejected": -31.34375, "loss": 0.7769, "rewards/accuracies": 0.5, "rewards/chosen": -0.190185546875, "rewards/margins": -0.1507568359375, "rewards/rejected": -0.039459228515625, "step": 459 }, { "epoch": 0.34074074074074073, "grad_norm": 2.517029047012329, "learning_rate": 6.592592592592592e-07, "logits/chosen": 1.8779296875, "logits/rejected": 2.455078125, "logps/chosen": -42.5, "logps/rejected": -74.4375, "loss": 0.8418, "rewards/accuracies": 0.5, "rewards/chosen": 0.131591796875, "rewards/margins": -0.2457275390625, "rewards/rejected": 0.37744140625, "step": 460 }, { "epoch": 0.3414814814814815, "grad_norm": 1.5461701154708862, "learning_rate": 6.585185185185185e-07, "logits/chosen": 1.380859375, "logits/rejected": 1.1611328125, "logps/chosen": -24.109375, "logps/rejected": -42.625, "loss": 0.7119, "rewards/accuracies": 0.5, "rewards/chosen": -0.03204345703125, "rewards/margins": -0.03070068359375, "rewards/rejected": -0.0013427734375, "step": 461 }, { "epoch": 0.3422222222222222, "grad_norm": 1.7017426490783691, "learning_rate": 6.577777777777778e-07, "logits/chosen": 2.662109375, "logits/rejected": 1.94921875, "logps/chosen": -24.375, "logps/rejected": -45.34375, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": -0.087890625, "rewards/margins": -0.019989013671875, "rewards/rejected": -0.06793212890625, "step": 462 }, { "epoch": 0.34296296296296297, "grad_norm": 7.290791988372803, "learning_rate": 6.570370370370371e-07, "logits/chosen": 2.173828125, "logits/rejected": 1.943359375, "logps/chosen": -51.21875, "logps/rejected": -35.6875, "loss": 0.9282, "rewards/accuracies": 0.5, "rewards/chosen": -0.2142333984375, "rewards/margins": -0.1776123046875, "rewards/rejected": -0.03668212890625, "step": 463 }, { "epoch": 0.3437037037037037, "grad_norm": 2.1122193336486816, "learning_rate": 6.562962962962962e-07, "logits/chosen": 2.05078125, "logits/rejected": 2.27734375, "logps/chosen": -37.75, "logps/rejected": -59.65625, "loss": 0.8052, "rewards/accuracies": 0.0, "rewards/chosen": -0.0335693359375, "rewards/margins": -0.21240234375, "rewards/rejected": 0.178955078125, "step": 464 }, { "epoch": 0.34444444444444444, "grad_norm": 1.9867185354232788, "learning_rate": 6.555555555555555e-07, "logits/chosen": 1.8291015625, "logits/rejected": 1.439453125, "logps/chosen": -34.84375, "logps/rejected": -42.75, "loss": 0.7432, "rewards/accuracies": 0.5, "rewards/chosen": -0.0355224609375, "rewards/margins": -0.091796875, "rewards/rejected": 0.0562744140625, "step": 465 }, { "epoch": 0.3451851851851852, "grad_norm": 1.9283727407455444, "learning_rate": 6.548148148148148e-07, "logits/chosen": 1.146484375, "logits/rejected": 1.5380859375, "logps/chosen": -47.0625, "logps/rejected": -51.03125, "loss": 0.6504, "rewards/accuracies": 1.0, "rewards/chosen": 0.1767578125, "rewards/margins": 0.088134765625, "rewards/rejected": 0.088623046875, "step": 466 }, { "epoch": 0.3459259259259259, "grad_norm": 1.8308229446411133, "learning_rate": 6.54074074074074e-07, "logits/chosen": 1.205078125, "logits/rejected": 1.5771484375, "logps/chosen": -24.0625, "logps/rejected": -33.65625, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 0.103759765625, "rewards/margins": 0.248046875, "rewards/rejected": -0.144287109375, "step": 467 }, { "epoch": 0.3466666666666667, "grad_norm": 2.0023815631866455, "learning_rate": 6.533333333333333e-07, "logits/chosen": 1.6962890625, "logits/rejected": 2.025390625, "logps/chosen": -22.828125, "logps/rejected": -40.8125, "loss": 0.6421, "rewards/accuracies": 0.75, "rewards/chosen": 0.0142669677734375, "rewards/margins": 0.1295166015625, "rewards/rejected": -0.115234375, "step": 468 }, { "epoch": 0.3474074074074074, "grad_norm": 1.4594495296478271, "learning_rate": 6.525925925925925e-07, "logits/chosen": 1.7763671875, "logits/rejected": 1.7744140625, "logps/chosen": -30.90625, "logps/rejected": -35.65625, "loss": 0.6802, "rewards/accuracies": 0.5, "rewards/chosen": -0.11993408203125, "rewards/margins": 0.035919189453125, "rewards/rejected": -0.1558837890625, "step": 469 }, { "epoch": 0.34814814814814815, "grad_norm": 2.185635805130005, "learning_rate": 6.518518518518519e-07, "logits/chosen": 1.1552734375, "logits/rejected": 1.3466796875, "logps/chosen": -24.4375, "logps/rejected": -42.0, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.0220947265625, "rewards/margins": 0.0408935546875, "rewards/rejected": -0.018768310546875, "step": 470 }, { "epoch": 0.3488888888888889, "grad_norm": 1.4653098583221436, "learning_rate": 6.511111111111111e-07, "logits/chosen": 1.2587890625, "logits/rejected": 1.60546875, "logps/chosen": -24.734375, "logps/rejected": -40.53125, "loss": 0.6538, "rewards/accuracies": 0.75, "rewards/chosen": 0.205810546875, "rewards/margins": 0.08209228515625, "rewards/rejected": 0.12384033203125, "step": 471 }, { "epoch": 0.3496296296296296, "grad_norm": 3.4303319454193115, "learning_rate": 6.503703703703704e-07, "logits/chosen": 1.005859375, "logits/rejected": 1.5693359375, "logps/chosen": -39.25, "logps/rejected": -69.0625, "loss": 0.8257, "rewards/accuracies": 0.0, "rewards/chosen": -0.1278076171875, "rewards/margins": -0.248046875, "rewards/rejected": 0.12030029296875, "step": 472 }, { "epoch": 0.3503703703703704, "grad_norm": 1.6837379932403564, "learning_rate": 6.496296296296297e-07, "logits/chosen": 1.8046875, "logits/rejected": 1.333984375, "logps/chosen": -24.203125, "logps/rejected": -54.0625, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.07989501953125, "rewards/margins": 0.0290985107421875, "rewards/rejected": 0.05078125, "step": 473 }, { "epoch": 0.3511111111111111, "grad_norm": 1.4754915237426758, "learning_rate": 6.488888888888888e-07, "logits/chosen": 0.81640625, "logits/rejected": 1.814453125, "logps/chosen": -24.53125, "logps/rejected": -22.28125, "loss": 0.6797, "rewards/accuracies": 0.75, "rewards/chosen": 0.06561279296875, "rewards/margins": 0.0400390625, "rewards/rejected": 0.025604248046875, "step": 474 }, { "epoch": 0.35185185185185186, "grad_norm": 2.087291717529297, "learning_rate": 6.481481481481481e-07, "logits/chosen": 1.326171875, "logits/rejected": 1.0830078125, "logps/chosen": -61.5, "logps/rejected": -34.625, "loss": 0.6963, "rewards/accuracies": 0.75, "rewards/chosen": 0.04022216796875, "rewards/margins": 0.003143310546875, "rewards/rejected": 0.037109375, "step": 475 }, { "epoch": 0.35259259259259257, "grad_norm": 2.797666311264038, "learning_rate": 6.474074074074074e-07, "logits/chosen": 2.01171875, "logits/rejected": 2.025390625, "logps/chosen": -35.03125, "logps/rejected": -48.09375, "loss": 0.7407, "rewards/accuracies": 0.5, "rewards/chosen": 0.1875, "rewards/margins": -0.0733642578125, "rewards/rejected": 0.260986328125, "step": 476 }, { "epoch": 0.35333333333333333, "grad_norm": 2.2543108463287354, "learning_rate": 6.466666666666666e-07, "logits/chosen": 2.005859375, "logits/rejected": 1.9853515625, "logps/chosen": -37.96875, "logps/rejected": -59.78125, "loss": 0.9189, "rewards/accuracies": 0.0, "rewards/chosen": -0.30615234375, "rewards/margins": -0.39404296875, "rewards/rejected": 0.087890625, "step": 477 }, { "epoch": 0.3540740740740741, "grad_norm": 1.441838264465332, "learning_rate": 6.459259259259259e-07, "logits/chosen": 1.697265625, "logits/rejected": 1.0576171875, "logps/chosen": -27.34375, "logps/rejected": -34.34375, "loss": 0.7695, "rewards/accuracies": 0.5, "rewards/chosen": 0.06524658203125, "rewards/margins": -0.1136474609375, "rewards/rejected": 0.1788330078125, "step": 478 }, { "epoch": 0.3548148148148148, "grad_norm": 1.4794845581054688, "learning_rate": 6.451851851851851e-07, "logits/chosen": 1.1640625, "logits/rejected": 1.6513671875, "logps/chosen": -30.328125, "logps/rejected": -38.28125, "loss": 0.7715, "rewards/accuracies": 0.25, "rewards/chosen": -0.0347900390625, "rewards/margins": -0.142578125, "rewards/rejected": 0.1077880859375, "step": 479 }, { "epoch": 0.35555555555555557, "grad_norm": 1.8031450510025024, "learning_rate": 6.444444444444444e-07, "logits/chosen": 1.70703125, "logits/rejected": 1.583984375, "logps/chosen": -33.96875, "logps/rejected": -25.9375, "loss": 0.6328, "rewards/accuracies": 1.0, "rewards/chosen": 0.057220458984375, "rewards/margins": 0.1334228515625, "rewards/rejected": -0.076171875, "step": 480 }, { "epoch": 0.3562962962962963, "grad_norm": 2.319058418273926, "learning_rate": 6.437037037037036e-07, "logits/chosen": 1.2490234375, "logits/rejected": 1.9443359375, "logps/chosen": -36.9375, "logps/rejected": -37.9375, "loss": 0.8145, "rewards/accuracies": 0.5, "rewards/chosen": 0.03515625, "rewards/margins": -0.201904296875, "rewards/rejected": 0.237060546875, "step": 481 }, { "epoch": 0.35703703703703704, "grad_norm": 1.7966830730438232, "learning_rate": 6.429629629629629e-07, "logits/chosen": 1.6884765625, "logits/rejected": 1.4658203125, "logps/chosen": -26.5625, "logps/rejected": -38.03125, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": -0.058197021484375, "rewards/margins": 0.00286865234375, "rewards/rejected": -0.06109619140625, "step": 482 }, { "epoch": 0.35777777777777775, "grad_norm": 2.6904256343841553, "learning_rate": 6.422222222222223e-07, "logits/chosen": 1.248046875, "logits/rejected": 1.861328125, "logps/chosen": -43.4375, "logps/rejected": -75.9375, "loss": 0.6211, "rewards/accuracies": 0.75, "rewards/chosen": -0.06170654296875, "rewards/margins": 0.1617431640625, "rewards/rejected": -0.2235107421875, "step": 483 }, { "epoch": 0.3585185185185185, "grad_norm": 2.012667179107666, "learning_rate": 6.414814814814814e-07, "logits/chosen": 1.0517578125, "logits/rejected": 1.513671875, "logps/chosen": -27.21875, "logps/rejected": -39.71875, "loss": 0.7075, "rewards/accuracies": 0.5, "rewards/chosen": -0.07305908203125, "rewards/margins": -0.0261993408203125, "rewards/rejected": -0.046875, "step": 484 }, { "epoch": 0.3592592592592593, "grad_norm": 2.2084836959838867, "learning_rate": 6.407407407407407e-07, "logits/chosen": 1.4951171875, "logits/rejected": 1.5810546875, "logps/chosen": -24.421875, "logps/rejected": -59.03125, "loss": 0.8057, "rewards/accuracies": 0.25, "rewards/chosen": 0.017181396484375, "rewards/margins": -0.1961669921875, "rewards/rejected": 0.21337890625, "step": 485 }, { "epoch": 0.36, "grad_norm": 1.6620522737503052, "learning_rate": 6.4e-07, "logits/chosen": 1.2373046875, "logits/rejected": 1.7294921875, "logps/chosen": -29.53125, "logps/rejected": -32.0, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.09918212890625, "rewards/margins": 0.012939453125, "rewards/rejected": -0.11212158203125, "step": 486 }, { "epoch": 0.36074074074074075, "grad_norm": 1.7713388204574585, "learning_rate": 6.392592592592593e-07, "logits/chosen": 1.69921875, "logits/rejected": 1.564453125, "logps/chosen": -32.46875, "logps/rejected": -35.15625, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.00624847412109375, "rewards/margins": 0.109375, "rewards/rejected": -0.1031494140625, "step": 487 }, { "epoch": 0.36148148148148146, "grad_norm": 1.9898687601089478, "learning_rate": 6.385185185185185e-07, "logits/chosen": 1.60546875, "logits/rejected": 1.634765625, "logps/chosen": -34.5, "logps/rejected": -39.59375, "loss": 0.7397, "rewards/accuracies": 0.25, "rewards/chosen": -0.169921875, "rewards/margins": -0.0635986328125, "rewards/rejected": -0.1063232421875, "step": 488 }, { "epoch": 0.3622222222222222, "grad_norm": 2.6874196529388428, "learning_rate": 6.377777777777778e-07, "logits/chosen": 2.169921875, "logits/rejected": 2.201171875, "logps/chosen": -45.8125, "logps/rejected": -47.1875, "loss": 0.792, "rewards/accuracies": 0.5, "rewards/chosen": -0.155517578125, "rewards/margins": -0.174560546875, "rewards/rejected": 0.0191192626953125, "step": 489 }, { "epoch": 0.362962962962963, "grad_norm": 1.8975374698638916, "learning_rate": 6.37037037037037e-07, "logits/chosen": 1.3115234375, "logits/rejected": 1.435546875, "logps/chosen": -27.515625, "logps/rejected": -39.4375, "loss": 0.7451, "rewards/accuracies": 0.5, "rewards/chosen": -0.03790283203125, "rewards/margins": -0.0804443359375, "rewards/rejected": 0.04254150390625, "step": 490 }, { "epoch": 0.3637037037037037, "grad_norm": 2.0524604320526123, "learning_rate": 6.362962962962962e-07, "logits/chosen": 1.5546875, "logits/rejected": 1.833984375, "logps/chosen": -35.125, "logps/rejected": -51.6875, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.045684814453125, "rewards/margins": 0.06170654296875, "rewards/rejected": -0.016021728515625, "step": 491 }, { "epoch": 0.36444444444444446, "grad_norm": 5.8298139572143555, "learning_rate": 6.355555555555555e-07, "logits/chosen": 1.72265625, "logits/rejected": 2.103515625, "logps/chosen": -33.5, "logps/rejected": -49.5625, "loss": 0.6367, "rewards/accuracies": 0.5, "rewards/chosen": -0.08282470703125, "rewards/margins": 0.160400390625, "rewards/rejected": -0.2432861328125, "step": 492 }, { "epoch": 0.36518518518518517, "grad_norm": 1.744916558265686, "learning_rate": 6.348148148148148e-07, "logits/chosen": 1.390625, "logits/rejected": 1.765625, "logps/chosen": -29.25, "logps/rejected": -41.96875, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": 0.05511474609375, "rewards/margins": 0.023193359375, "rewards/rejected": 0.03204345703125, "step": 493 }, { "epoch": 0.36592592592592593, "grad_norm": 1.8004305362701416, "learning_rate": 6.340740740740741e-07, "logits/chosen": 1.259765625, "logits/rejected": 1.9296875, "logps/chosen": -40.71875, "logps/rejected": -48.0625, "loss": 0.7461, "rewards/accuracies": 0.25, "rewards/chosen": 0.048492431640625, "rewards/margins": -0.0875244140625, "rewards/rejected": 0.135986328125, "step": 494 }, { "epoch": 0.36666666666666664, "grad_norm": 1.7961413860321045, "learning_rate": 6.333333333333332e-07, "logits/chosen": 1.3037109375, "logits/rejected": 1.8583984375, "logps/chosen": -29.359375, "logps/rejected": -30.84375, "loss": 0.9189, "rewards/accuracies": 0.25, "rewards/chosen": -0.04180908203125, "rewards/margins": -0.357421875, "rewards/rejected": 0.315673828125, "step": 495 }, { "epoch": 0.3674074074074074, "grad_norm": 1.6819223165512085, "learning_rate": 6.325925925925925e-07, "logits/chosen": 1.8076171875, "logits/rejected": 1.2978515625, "logps/chosen": -23.828125, "logps/rejected": -49.84375, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 0.06915283203125, "rewards/margins": 0.0660400390625, "rewards/rejected": 0.00311279296875, "step": 496 }, { "epoch": 0.36814814814814817, "grad_norm": 6.208568096160889, "learning_rate": 6.318518518518519e-07, "logits/chosen": 1.3525390625, "logits/rejected": 0.93212890625, "logps/chosen": -37.6875, "logps/rejected": -63.96875, "loss": 0.8369, "rewards/accuracies": 0.25, "rewards/chosen": -0.26513671875, "rewards/margins": -0.2335205078125, "rewards/rejected": -0.0316162109375, "step": 497 }, { "epoch": 0.3688888888888889, "grad_norm": 5.608078479766846, "learning_rate": 6.311111111111111e-07, "logits/chosen": 1.5146484375, "logits/rejected": 1.037109375, "logps/chosen": -27.515625, "logps/rejected": -51.46875, "loss": 0.8691, "rewards/accuracies": 0.25, "rewards/chosen": -0.246826171875, "rewards/margins": -0.277099609375, "rewards/rejected": 0.030029296875, "step": 498 }, { "epoch": 0.36962962962962964, "grad_norm": 1.5323432683944702, "learning_rate": 6.303703703703704e-07, "logits/chosen": 1.953125, "logits/rejected": 1.103515625, "logps/chosen": -33.78125, "logps/rejected": -34.28125, "loss": 0.7212, "rewards/accuracies": 0.25, "rewards/chosen": -0.057830810546875, "rewards/margins": -0.052764892578125, "rewards/rejected": -0.00507354736328125, "step": 499 }, { "epoch": 0.37037037037037035, "grad_norm": 2.833388090133667, "learning_rate": 6.296296296296296e-07, "logits/chosen": 2.203125, "logits/rejected": 2.046875, "logps/chosen": -77.3125, "logps/rejected": -58.4375, "loss": 0.4524, "rewards/accuracies": 0.75, "rewards/chosen": 0.77099609375, "rewards/margins": 0.7412109375, "rewards/rejected": 0.0296630859375, "step": 500 }, { "epoch": 0.3711111111111111, "grad_norm": 1.8182889223098755, "learning_rate": 6.288888888888889e-07, "logits/chosen": 1.625, "logits/rejected": 2.3515625, "logps/chosen": -40.6875, "logps/rejected": -68.9375, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": 0.149169921875, "rewards/margins": 0.165771484375, "rewards/rejected": -0.016448974609375, "step": 501 }, { "epoch": 0.3718518518518519, "grad_norm": 1.5209425687789917, "learning_rate": 6.281481481481481e-07, "logits/chosen": 1.1220703125, "logits/rejected": 1.5048828125, "logps/chosen": -49.125, "logps/rejected": -41.5, "loss": 0.5776, "rewards/accuracies": 0.75, "rewards/chosen": -0.01210784912109375, "rewards/margins": 0.26171875, "rewards/rejected": -0.27392578125, "step": 502 }, { "epoch": 0.3725925925925926, "grad_norm": 12.818269729614258, "learning_rate": 6.274074074074074e-07, "logits/chosen": 1.1845703125, "logits/rejected": 1.1630859375, "logps/chosen": -22.3125, "logps/rejected": -20.25, "loss": 0.6699, "rewards/accuracies": 0.5, "rewards/chosen": -0.0693359375, "rewards/margins": 0.0556640625, "rewards/rejected": -0.125, "step": 503 }, { "epoch": 0.37333333333333335, "grad_norm": 2.5224151611328125, "learning_rate": 6.266666666666667e-07, "logits/chosen": 1.427734375, "logits/rejected": 1.482421875, "logps/chosen": -66.0625, "logps/rejected": -48.8125, "loss": 0.8232, "rewards/accuracies": 0.25, "rewards/chosen": -0.2281494140625, "rewards/margins": -0.190185546875, "rewards/rejected": -0.03790283203125, "step": 504 }, { "epoch": 0.37407407407407406, "grad_norm": 1.2100120782852173, "learning_rate": 6.259259259259258e-07, "logits/chosen": 1.451171875, "logits/rejected": 2.224609375, "logps/chosen": -28.5625, "logps/rejected": -28.0, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.139404296875, "rewards/margins": 0.16064453125, "rewards/rejected": -0.0212860107421875, "step": 505 }, { "epoch": 0.3748148148148148, "grad_norm": 1.8281985521316528, "learning_rate": 6.251851851851851e-07, "logits/chosen": 1.7646484375, "logits/rejected": 1.6689453125, "logps/chosen": -33.15625, "logps/rejected": -26.875, "loss": 0.7065, "rewards/accuracies": 0.5, "rewards/chosen": -0.026947021484375, "rewards/margins": -0.021270751953125, "rewards/rejected": -0.00566864013671875, "step": 506 }, { "epoch": 0.37555555555555553, "grad_norm": 1.7602806091308594, "learning_rate": 6.244444444444444e-07, "logits/chosen": 1.458984375, "logits/rejected": 1.6953125, "logps/chosen": -23.5625, "logps/rejected": -63.4375, "loss": 0.6895, "rewards/accuracies": 0.25, "rewards/chosen": -0.045318603515625, "rewards/margins": 0.060272216796875, "rewards/rejected": -0.10552978515625, "step": 507 }, { "epoch": 0.3762962962962963, "grad_norm": 1.866105556488037, "learning_rate": 6.237037037037036e-07, "logits/chosen": 1.3271484375, "logits/rejected": 1.8154296875, "logps/chosen": -18.34375, "logps/rejected": -59.4375, "loss": 0.6045, "rewards/accuracies": 0.75, "rewards/chosen": -0.035369873046875, "rewards/margins": 0.201416015625, "rewards/rejected": -0.23681640625, "step": 508 }, { "epoch": 0.37703703703703706, "grad_norm": 1.4509495496749878, "learning_rate": 6.229629629629629e-07, "logits/chosen": 1.3916015625, "logits/rejected": 1.1875, "logps/chosen": -27.59375, "logps/rejected": -45.09375, "loss": 0.6699, "rewards/accuracies": 0.75, "rewards/chosen": -0.0301055908203125, "rewards/margins": 0.054656982421875, "rewards/rejected": -0.08477783203125, "step": 509 }, { "epoch": 0.37777777777777777, "grad_norm": 1.4842498302459717, "learning_rate": 6.222222222222223e-07, "logits/chosen": 1.609375, "logits/rejected": 1.427734375, "logps/chosen": -22.703125, "logps/rejected": -29.40625, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": -0.0579833984375, "rewards/margins": 0.04864501953125, "rewards/rejected": -0.10662841796875, "step": 510 }, { "epoch": 0.37851851851851853, "grad_norm": 1.4125391244888306, "learning_rate": 6.214814814814815e-07, "logits/chosen": 1.302734375, "logits/rejected": 1.7431640625, "logps/chosen": -36.4375, "logps/rejected": -35.4375, "loss": 0.5815, "rewards/accuracies": 0.75, "rewards/chosen": 0.0726318359375, "rewards/margins": 0.268310546875, "rewards/rejected": -0.1956787109375, "step": 511 }, { "epoch": 0.37925925925925924, "grad_norm": 1.8960353136062622, "learning_rate": 6.207407407407407e-07, "logits/chosen": 1.560546875, "logits/rejected": 1.3232421875, "logps/chosen": -28.4375, "logps/rejected": -39.78125, "loss": 0.7334, "rewards/accuracies": 0.5, "rewards/chosen": -0.131591796875, "rewards/margins": -0.0648193359375, "rewards/rejected": -0.06683349609375, "step": 512 }, { "epoch": 0.38, "grad_norm": 2.2425520420074463, "learning_rate": 6.2e-07, "logits/chosen": 1.28515625, "logits/rejected": 1.51171875, "logps/chosen": -30.96875, "logps/rejected": -63.9375, "loss": 0.8154, "rewards/accuracies": 0.25, "rewards/chosen": 0.1527099609375, "rewards/margins": -0.1199951171875, "rewards/rejected": 0.272705078125, "step": 513 }, { "epoch": 0.38074074074074077, "grad_norm": 1.8072893619537354, "learning_rate": 6.192592592592593e-07, "logits/chosen": 1.1171875, "logits/rejected": 1.384765625, "logps/chosen": -43.4375, "logps/rejected": -27.984375, "loss": 0.7847, "rewards/accuracies": 0.25, "rewards/chosen": -0.11993408203125, "rewards/margins": -0.146484375, "rewards/rejected": 0.026519775390625, "step": 514 }, { "epoch": 0.3814814814814815, "grad_norm": 2.0978031158447266, "learning_rate": 6.185185185185185e-07, "logits/chosen": 1.5625, "logits/rejected": 1.169921875, "logps/chosen": -29.78125, "logps/rejected": -32.03125, "loss": 0.7139, "rewards/accuracies": 0.75, "rewards/chosen": -0.08868408203125, "rewards/margins": -0.023040771484375, "rewards/rejected": -0.06561279296875, "step": 515 }, { "epoch": 0.38222222222222224, "grad_norm": 1.4509474039077759, "learning_rate": 6.177777777777777e-07, "logits/chosen": 1.197265625, "logits/rejected": 1.5546875, "logps/chosen": -23.890625, "logps/rejected": -35.46875, "loss": 0.7031, "rewards/accuracies": 0.5, "rewards/chosen": 0.08673095703125, "rewards/margins": -0.014068603515625, "rewards/rejected": 0.10076904296875, "step": 516 }, { "epoch": 0.38296296296296295, "grad_norm": 1.9049890041351318, "learning_rate": 6.17037037037037e-07, "logits/chosen": 1.3984375, "logits/rejected": 1.6591796875, "logps/chosen": -45.03125, "logps/rejected": -34.03125, "loss": 0.6304, "rewards/accuracies": 0.75, "rewards/chosen": 0.0254058837890625, "rewards/margins": 0.1644287109375, "rewards/rejected": -0.1390380859375, "step": 517 }, { "epoch": 0.3837037037037037, "grad_norm": 1.6452964544296265, "learning_rate": 6.162962962962963e-07, "logits/chosen": 1.8447265625, "logits/rejected": 1.828125, "logps/chosen": -22.484375, "logps/rejected": -21.453125, "loss": 0.8271, "rewards/accuracies": 0.25, "rewards/chosen": -0.147216796875, "rewards/margins": -0.243896484375, "rewards/rejected": 0.0966796875, "step": 518 }, { "epoch": 0.3844444444444444, "grad_norm": 3.8821284770965576, "learning_rate": 6.155555555555555e-07, "logits/chosen": 2.392578125, "logits/rejected": 1.6943359375, "logps/chosen": -28.859375, "logps/rejected": -52.59375, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": 0.09490966796875, "rewards/margins": 0.223876953125, "rewards/rejected": -0.12890625, "step": 519 }, { "epoch": 0.3851851851851852, "grad_norm": 1.9126580953598022, "learning_rate": 6.148148148148148e-07, "logits/chosen": 1.388671875, "logits/rejected": 1.5595703125, "logps/chosen": -33.6875, "logps/rejected": -28.6875, "loss": 0.8584, "rewards/accuracies": 0.25, "rewards/chosen": -0.30712890625, "rewards/margins": -0.29833984375, "rewards/rejected": -0.0085906982421875, "step": 520 }, { "epoch": 0.38592592592592595, "grad_norm": 2.6713626384735107, "learning_rate": 6.14074074074074e-07, "logits/chosen": 0.97509765625, "logits/rejected": 1.4638671875, "logps/chosen": -61.15625, "logps/rejected": -41.5, "loss": 0.8442, "rewards/accuracies": 0.0, "rewards/chosen": -0.003509521484375, "rewards/margins": -0.27587890625, "rewards/rejected": 0.272216796875, "step": 521 }, { "epoch": 0.38666666666666666, "grad_norm": 1.6378461122512817, "learning_rate": 6.133333333333332e-07, "logits/chosen": 1.583984375, "logits/rejected": 2.330078125, "logps/chosen": -23.09375, "logps/rejected": -44.96875, "loss": 0.8115, "rewards/accuracies": 0.25, "rewards/chosen": -0.0941162109375, "rewards/margins": -0.21484375, "rewards/rejected": 0.12066650390625, "step": 522 }, { "epoch": 0.3874074074074074, "grad_norm": 2.704759359359741, "learning_rate": 6.125925925925926e-07, "logits/chosen": 1.5859375, "logits/rejected": 1.1533203125, "logps/chosen": -26.9375, "logps/rejected": -48.71875, "loss": 0.7607, "rewards/accuracies": 0.5, "rewards/chosen": -0.01290130615234375, "rewards/margins": -0.1136474609375, "rewards/rejected": 0.100830078125, "step": 523 }, { "epoch": 0.38814814814814813, "grad_norm": 3.916377544403076, "learning_rate": 6.118518518518519e-07, "logits/chosen": 1.6416015625, "logits/rejected": 2.015625, "logps/chosen": -41.09375, "logps/rejected": -37.46875, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.210205078125, "rewards/margins": 0.0308837890625, "rewards/rejected": 0.1793212890625, "step": 524 }, { "epoch": 0.3888888888888889, "grad_norm": 2.1990652084350586, "learning_rate": 6.111111111111112e-07, "logits/chosen": 1.703125, "logits/rejected": 1.375, "logps/chosen": -54.21875, "logps/rejected": -48.03125, "loss": 0.7271, "rewards/accuracies": 0.5, "rewards/chosen": -0.11053466796875, "rewards/margins": 0.01611328125, "rewards/rejected": -0.1265869140625, "step": 525 }, { "epoch": 0.3896296296296296, "grad_norm": 4.127274036407471, "learning_rate": 6.103703703703703e-07, "logits/chosen": 2.326171875, "logits/rejected": 1.6826171875, "logps/chosen": -61.8125, "logps/rejected": -37.21875, "loss": 0.6343, "rewards/accuracies": 0.75, "rewards/chosen": 0.0726318359375, "rewards/margins": 0.129150390625, "rewards/rejected": -0.05645751953125, "step": 526 }, { "epoch": 0.39037037037037037, "grad_norm": 3.0810697078704834, "learning_rate": 6.096296296296296e-07, "logits/chosen": 1.8603515625, "logits/rejected": 2.16796875, "logps/chosen": -31.34375, "logps/rejected": -35.125, "loss": 0.7773, "rewards/accuracies": 0.25, "rewards/chosen": -0.031829833984375, "rewards/margins": -0.1368408203125, "rewards/rejected": 0.1051025390625, "step": 527 }, { "epoch": 0.39111111111111113, "grad_norm": 1.8111908435821533, "learning_rate": 6.088888888888889e-07, "logits/chosen": 1.564453125, "logits/rejected": 1.98828125, "logps/chosen": -36.34375, "logps/rejected": -52.875, "loss": 0.7236, "rewards/accuracies": 0.25, "rewards/chosen": -0.12890625, "rewards/margins": -0.055908203125, "rewards/rejected": -0.072998046875, "step": 528 }, { "epoch": 0.39185185185185184, "grad_norm": 2.49320912361145, "learning_rate": 6.081481481481481e-07, "logits/chosen": 1.4140625, "logits/rejected": 1.873046875, "logps/chosen": -36.1875, "logps/rejected": -63.625, "loss": 0.7007, "rewards/accuracies": 0.25, "rewards/chosen": -0.1336669921875, "rewards/margins": 0.03155517578125, "rewards/rejected": -0.165283203125, "step": 529 }, { "epoch": 0.3925925925925926, "grad_norm": 2.145428419113159, "learning_rate": 6.074074074074074e-07, "logits/chosen": 1.6591796875, "logits/rejected": 1.3408203125, "logps/chosen": -22.84375, "logps/rejected": -49.3125, "loss": 0.7837, "rewards/accuracies": 0.25, "rewards/chosen": -0.04046630859375, "rewards/margins": -0.14453125, "rewards/rejected": 0.10406494140625, "step": 530 }, { "epoch": 0.3933333333333333, "grad_norm": 2.4802231788635254, "learning_rate": 6.066666666666666e-07, "logits/chosen": 1.822265625, "logits/rejected": 1.8916015625, "logps/chosen": -27.28125, "logps/rejected": -48.25, "loss": 1.0107, "rewards/accuracies": 0.25, "rewards/chosen": -0.174560546875, "rewards/margins": -0.5107421875, "rewards/rejected": 0.3359375, "step": 531 }, { "epoch": 0.3940740740740741, "grad_norm": 2.1587655544281006, "learning_rate": 6.059259259259259e-07, "logits/chosen": 1.224609375, "logits/rejected": 1.7421875, "logps/chosen": -18.0625, "logps/rejected": -56.34375, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.09002685546875, "rewards/margins": 0.1009521484375, "rewards/rejected": -0.01092529296875, "step": 532 }, { "epoch": 0.39481481481481484, "grad_norm": 1.6091697216033936, "learning_rate": 6.051851851851851e-07, "logits/chosen": 1.6923828125, "logits/rejected": 1.103515625, "logps/chosen": -29.328125, "logps/rejected": -33.0625, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 0.1484375, "rewards/margins": 0.08905029296875, "rewards/rejected": 0.05938720703125, "step": 533 }, { "epoch": 0.39555555555555555, "grad_norm": 3.833308219909668, "learning_rate": 6.044444444444444e-07, "logits/chosen": 1.5927734375, "logits/rejected": 2.001953125, "logps/chosen": -28.3125, "logps/rejected": -86.75, "loss": 0.5747, "rewards/accuracies": 0.75, "rewards/chosen": -0.215576171875, "rewards/margins": 0.32421875, "rewards/rejected": -0.53955078125, "step": 534 }, { "epoch": 0.3962962962962963, "grad_norm": 11.685052871704102, "learning_rate": 6.037037037037037e-07, "logits/chosen": 1.4189453125, "logits/rejected": 2.3359375, "logps/chosen": -23.984375, "logps/rejected": -73.5625, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": -0.09100341796875, "rewards/margins": 0.0386962890625, "rewards/rejected": -0.129638671875, "step": 535 }, { "epoch": 0.397037037037037, "grad_norm": 2.852125406265259, "learning_rate": 6.029629629629628e-07, "logits/chosen": 1.474609375, "logits/rejected": 1.068359375, "logps/chosen": -30.890625, "logps/rejected": -75.9375, "loss": 0.7759, "rewards/accuracies": 0.25, "rewards/chosen": -0.02032470703125, "rewards/margins": -0.14453125, "rewards/rejected": 0.1241455078125, "step": 536 }, { "epoch": 0.3977777777777778, "grad_norm": 1.7067608833312988, "learning_rate": 6.022222222222222e-07, "logits/chosen": 1.837890625, "logits/rejected": 0.96142578125, "logps/chosen": -31.734375, "logps/rejected": -46.875, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": -0.08160400390625, "rewards/margins": -0.0401611328125, "rewards/rejected": -0.041412353515625, "step": 537 }, { "epoch": 0.3985185185185185, "grad_norm": 1.5633240938186646, "learning_rate": 6.014814814814815e-07, "logits/chosen": 1.72265625, "logits/rejected": 2.36328125, "logps/chosen": -32.6875, "logps/rejected": -29.296875, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": 0.045318603515625, "rewards/margins": 0.126220703125, "rewards/rejected": -0.08087158203125, "step": 538 }, { "epoch": 0.39925925925925926, "grad_norm": 1.8415247201919556, "learning_rate": 6.007407407407407e-07, "logits/chosen": 2.32421875, "logits/rejected": 2.220703125, "logps/chosen": -28.890625, "logps/rejected": -29.453125, "loss": 0.7192, "rewards/accuracies": 0.75, "rewards/chosen": -0.009552001953125, "rewards/margins": -0.037322998046875, "rewards/rejected": 0.0277557373046875, "step": 539 }, { "epoch": 0.4, "grad_norm": 1.4531553983688354, "learning_rate": 6e-07, "logits/chosen": 1.525390625, "logits/rejected": 1.8154296875, "logps/chosen": -26.84375, "logps/rejected": -44.09375, "loss": 0.6167, "rewards/accuracies": 0.5, "rewards/chosen": 0.0179595947265625, "rewards/margins": 0.1715087890625, "rewards/rejected": -0.153564453125, "step": 540 }, { "epoch": 0.40074074074074073, "grad_norm": 13.447209358215332, "learning_rate": 5.992592592592593e-07, "logits/chosen": 1.5009765625, "logits/rejected": 1.626953125, "logps/chosen": -36.0, "logps/rejected": -83.5625, "loss": 0.73, "rewards/accuracies": 0.25, "rewards/chosen": -0.143310546875, "rewards/margins": -0.06207275390625, "rewards/rejected": -0.08123779296875, "step": 541 }, { "epoch": 0.4014814814814815, "grad_norm": 1.4425565004348755, "learning_rate": 5.985185185185185e-07, "logits/chosen": 1.45703125, "logits/rejected": 1.0966796875, "logps/chosen": -27.96875, "logps/rejected": -44.25, "loss": 0.6377, "rewards/accuracies": 0.75, "rewards/chosen": -0.02032470703125, "rewards/margins": 0.12030029296875, "rewards/rejected": -0.140625, "step": 542 }, { "epoch": 0.4022222222222222, "grad_norm": 3.2026729583740234, "learning_rate": 5.977777777777777e-07, "logits/chosen": 2.1484375, "logits/rejected": 1.51171875, "logps/chosen": -47.65625, "logps/rejected": -24.421875, "loss": 1.2617, "rewards/accuracies": 0.0, "rewards/chosen": -0.57470703125, "rewards/margins": -0.86767578125, "rewards/rejected": 0.29296875, "step": 543 }, { "epoch": 0.40296296296296297, "grad_norm": 1.969161868095398, "learning_rate": 5.97037037037037e-07, "logits/chosen": 1.8994140625, "logits/rejected": 1.400390625, "logps/chosen": -25.796875, "logps/rejected": -49.25, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.08203125, "rewards/margins": 0.00238037109375, "rewards/rejected": -0.08441162109375, "step": 544 }, { "epoch": 0.40370370370370373, "grad_norm": 1.5762442350387573, "learning_rate": 5.962962962962963e-07, "logits/chosen": 2.337890625, "logits/rejected": 1.70703125, "logps/chosen": -29.703125, "logps/rejected": -39.3125, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": 0.0046844482421875, "rewards/margins": 0.1781005859375, "rewards/rejected": -0.1734619140625, "step": 545 }, { "epoch": 0.40444444444444444, "grad_norm": 2.017941474914551, "learning_rate": 5.955555555555555e-07, "logits/chosen": 1.12890625, "logits/rejected": 1.8095703125, "logps/chosen": -32.53125, "logps/rejected": -56.5625, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 0.053131103515625, "rewards/margins": 0.236328125, "rewards/rejected": -0.1832275390625, "step": 546 }, { "epoch": 0.4051851851851852, "grad_norm": 2.331641674041748, "learning_rate": 5.948148148148147e-07, "logits/chosen": 1.9951171875, "logits/rejected": 1.6181640625, "logps/chosen": -37.46875, "logps/rejected": -32.28125, "loss": 0.7871, "rewards/accuracies": 0.0, "rewards/chosen": -0.11285400390625, "rewards/margins": -0.17529296875, "rewards/rejected": 0.0625, "step": 547 }, { "epoch": 0.4059259259259259, "grad_norm": 2.0243592262268066, "learning_rate": 5.94074074074074e-07, "logits/chosen": 1.2451171875, "logits/rejected": 1.7939453125, "logps/chosen": -36.8125, "logps/rejected": -44.0625, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": -0.0601806640625, "rewards/margins": 0.08978271484375, "rewards/rejected": -0.1500244140625, "step": 548 }, { "epoch": 0.4066666666666667, "grad_norm": 2.303347110748291, "learning_rate": 5.933333333333334e-07, "logits/chosen": 1.4404296875, "logits/rejected": 1.9443359375, "logps/chosen": -32.15625, "logps/rejected": -45.3125, "loss": 0.6572, "rewards/accuracies": 0.5, "rewards/chosen": 0.038665771484375, "rewards/margins": 0.09136962890625, "rewards/rejected": -0.052734375, "step": 549 }, { "epoch": 0.4074074074074074, "grad_norm": 2.047020673751831, "learning_rate": 5.925925925925926e-07, "logits/chosen": 0.8916015625, "logits/rejected": 1.640625, "logps/chosen": -35.03125, "logps/rejected": -63.25, "loss": 0.7266, "rewards/accuracies": 0.25, "rewards/chosen": 0.03985595703125, "rewards/margins": -0.060943603515625, "rewards/rejected": 0.10076904296875, "step": 550 }, { "epoch": 0.40814814814814815, "grad_norm": 1.6266810894012451, "learning_rate": 5.918518518518519e-07, "logits/chosen": 1.6435546875, "logits/rejected": 1.66015625, "logps/chosen": -35.25, "logps/rejected": -33.0, "loss": 0.5308, "rewards/accuracies": 0.75, "rewards/chosen": 0.5859375, "rewards/margins": 0.58447265625, "rewards/rejected": 0.001556396484375, "step": 551 }, { "epoch": 0.4088888888888889, "grad_norm": 2.6806013584136963, "learning_rate": 5.911111111111111e-07, "logits/chosen": 1.8916015625, "logits/rejected": 1.8818359375, "logps/chosen": -26.84375, "logps/rejected": -80.5625, "loss": 0.7095, "rewards/accuracies": 0.75, "rewards/chosen": 0.03125, "rewards/margins": -0.00460052490234375, "rewards/rejected": 0.035919189453125, "step": 552 }, { "epoch": 0.4096296296296296, "grad_norm": 1.7849647998809814, "learning_rate": 5.903703703703703e-07, "logits/chosen": 0.951171875, "logits/rejected": 1.6494140625, "logps/chosen": -24.890625, "logps/rejected": -29.6875, "loss": 0.9731, "rewards/accuracies": 0.0, "rewards/chosen": -0.1138916015625, "rewards/margins": -0.486083984375, "rewards/rejected": 0.372314453125, "step": 553 }, { "epoch": 0.4103703703703704, "grad_norm": 2.1140432357788086, "learning_rate": 5.896296296296296e-07, "logits/chosen": 1.4580078125, "logits/rejected": 2.001953125, "logps/chosen": -33.0625, "logps/rejected": -37.5625, "loss": 0.8154, "rewards/accuracies": 0.25, "rewards/chosen": -0.27099609375, "rewards/margins": -0.2109375, "rewards/rejected": -0.060150146484375, "step": 554 }, { "epoch": 0.4111111111111111, "grad_norm": 1.8596899509429932, "learning_rate": 5.888888888888889e-07, "logits/chosen": 1.234375, "logits/rejected": 1.5751953125, "logps/chosen": -30.875, "logps/rejected": -41.21875, "loss": 0.71, "rewards/accuracies": 0.5, "rewards/chosen": -0.01678466796875, "rewards/margins": -0.031982421875, "rewards/rejected": 0.0152130126953125, "step": 555 }, { "epoch": 0.41185185185185186, "grad_norm": 1.5571502447128296, "learning_rate": 5.881481481481482e-07, "logits/chosen": 1.4912109375, "logits/rejected": 1.41015625, "logps/chosen": -22.03125, "logps/rejected": -34.78125, "loss": 0.7017, "rewards/accuracies": 0.75, "rewards/chosen": 0.057830810546875, "rewards/margins": -0.0083770751953125, "rewards/rejected": 0.06622314453125, "step": 556 }, { "epoch": 0.41259259259259257, "grad_norm": 1.3708677291870117, "learning_rate": 5.874074074074073e-07, "logits/chosen": 1.12890625, "logits/rejected": 1.1982421875, "logps/chosen": -35.3125, "logps/rejected": -26.640625, "loss": 0.7373, "rewards/accuracies": 0.25, "rewards/chosen": -0.0491943359375, "rewards/margins": -0.082763671875, "rewards/rejected": 0.033599853515625, "step": 557 }, { "epoch": 0.41333333333333333, "grad_norm": 1.798439621925354, "learning_rate": 5.866666666666666e-07, "logits/chosen": 1.1162109375, "logits/rejected": 1.771484375, "logps/chosen": -20.9375, "logps/rejected": -42.96875, "loss": 0.75, "rewards/accuracies": 0.5, "rewards/chosen": -0.00858306884765625, "rewards/margins": -0.0916748046875, "rewards/rejected": 0.08319091796875, "step": 558 }, { "epoch": 0.4140740740740741, "grad_norm": 1.4651398658752441, "learning_rate": 5.859259259259259e-07, "logits/chosen": 1.822265625, "logits/rejected": 1.798828125, "logps/chosen": -22.625, "logps/rejected": -61.5625, "loss": 0.5889, "rewards/accuracies": 0.75, "rewards/chosen": 0.0738525390625, "rewards/margins": 0.28564453125, "rewards/rejected": -0.211669921875, "step": 559 }, { "epoch": 0.4148148148148148, "grad_norm": 1.4328234195709229, "learning_rate": 5.851851851851851e-07, "logits/chosen": 2.4921875, "logits/rejected": 1.9130859375, "logps/chosen": -23.421875, "logps/rejected": -37.90625, "loss": 0.6846, "rewards/accuracies": 0.25, "rewards/chosen": -0.0203094482421875, "rewards/margins": 0.018585205078125, "rewards/rejected": -0.03887939453125, "step": 560 }, { "epoch": 0.41555555555555557, "grad_norm": 1.6191445589065552, "learning_rate": 5.844444444444444e-07, "logits/chosen": 1.060546875, "logits/rejected": 1.7431640625, "logps/chosen": -25.015625, "logps/rejected": -40.90625, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.003124237060546875, "rewards/margins": 0.048828125, "rewards/rejected": -0.045654296875, "step": 561 }, { "epoch": 0.4162962962962963, "grad_norm": 1.6613516807556152, "learning_rate": 5.837037037037036e-07, "logits/chosen": 1.4970703125, "logits/rejected": 1.8681640625, "logps/chosen": -22.28125, "logps/rejected": -29.0, "loss": 0.8594, "rewards/accuracies": 0.0, "rewards/chosen": -0.1138916015625, "rewards/margins": -0.3037109375, "rewards/rejected": 0.1898193359375, "step": 562 }, { "epoch": 0.41703703703703704, "grad_norm": 1.834051251411438, "learning_rate": 5.82962962962963e-07, "logits/chosen": 1.2177734375, "logits/rejected": 1.40234375, "logps/chosen": -26.21875, "logps/rejected": -33.34375, "loss": 0.6973, "rewards/accuracies": 0.25, "rewards/chosen": 0.1527099609375, "rewards/margins": -0.003143310546875, "rewards/rejected": 0.1558837890625, "step": 563 }, { "epoch": 0.4177777777777778, "grad_norm": 1.699149250984192, "learning_rate": 5.822222222222222e-07, "logits/chosen": 1.55859375, "logits/rejected": 2.30859375, "logps/chosen": -23.859375, "logps/rejected": -51.4375, "loss": 0.8184, "rewards/accuracies": 0.25, "rewards/chosen": -0.0758056640625, "rewards/margins": -0.2218017578125, "rewards/rejected": 0.1461181640625, "step": 564 }, { "epoch": 0.4185185185185185, "grad_norm": 1.5084997415542603, "learning_rate": 5.814814814814815e-07, "logits/chosen": 2.0234375, "logits/rejected": 0.8857421875, "logps/chosen": -26.765625, "logps/rejected": -23.0625, "loss": 0.6841, "rewards/accuracies": 0.5, "rewards/chosen": 0.060546875, "rewards/margins": 0.026123046875, "rewards/rejected": 0.034393310546875, "step": 565 }, { "epoch": 0.4192592592592593, "grad_norm": 1.7888669967651367, "learning_rate": 5.807407407407408e-07, "logits/chosen": 1.4794921875, "logits/rejected": 1.7763671875, "logps/chosen": -34.03125, "logps/rejected": -56.46875, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": 0.034393310546875, "rewards/margins": -0.0030975341796875, "rewards/rejected": 0.0374755859375, "step": 566 }, { "epoch": 0.42, "grad_norm": 1.3836296796798706, "learning_rate": 5.8e-07, "logits/chosen": 1.734375, "logits/rejected": 1.33984375, "logps/chosen": -30.65625, "logps/rejected": -29.3125, "loss": 0.749, "rewards/accuracies": 0.25, "rewards/chosen": -0.038299560546875, "rewards/margins": -0.0947265625, "rewards/rejected": 0.056427001953125, "step": 567 }, { "epoch": 0.42074074074074075, "grad_norm": 2.0955233573913574, "learning_rate": 5.792592592592592e-07, "logits/chosen": 2.20703125, "logits/rejected": 1.25, "logps/chosen": -39.5625, "logps/rejected": -47.0, "loss": 0.8369, "rewards/accuracies": 0.25, "rewards/chosen": -0.134765625, "rewards/margins": -0.259765625, "rewards/rejected": 0.125, "step": 568 }, { "epoch": 0.42148148148148146, "grad_norm": 2.0266058444976807, "learning_rate": 5.785185185185185e-07, "logits/chosen": 1.6474609375, "logits/rejected": 1.4775390625, "logps/chosen": -43.09375, "logps/rejected": -42.03125, "loss": 0.751, "rewards/accuracies": 0.25, "rewards/chosen": 0.03631591796875, "rewards/margins": -0.1090087890625, "rewards/rejected": 0.145263671875, "step": 569 }, { "epoch": 0.4222222222222222, "grad_norm": 6.810372829437256, "learning_rate": 5.777777777777777e-07, "logits/chosen": 1.25390625, "logits/rejected": 2.0625, "logps/chosen": -24.03125, "logps/rejected": -40.0625, "loss": 0.8354, "rewards/accuracies": 0.5, "rewards/chosen": 0.03436279296875, "rewards/margins": -0.1737060546875, "rewards/rejected": 0.2078857421875, "step": 570 }, { "epoch": 0.422962962962963, "grad_norm": 3.0939090251922607, "learning_rate": 5.77037037037037e-07, "logits/chosen": 1.353515625, "logits/rejected": 1.7001953125, "logps/chosen": -49.6875, "logps/rejected": -78.4375, "loss": 0.5366, "rewards/accuracies": 1.0, "rewards/chosen": 0.11407470703125, "rewards/margins": 0.36328125, "rewards/rejected": -0.249267578125, "step": 571 }, { "epoch": 0.4237037037037037, "grad_norm": 2.0099618434906006, "learning_rate": 5.762962962962963e-07, "logits/chosen": 1.5595703125, "logits/rejected": 1.99609375, "logps/chosen": -26.734375, "logps/rejected": -55.25, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": 0.07537841796875, "rewards/margins": 0.11053466796875, "rewards/rejected": -0.03515625, "step": 572 }, { "epoch": 0.42444444444444446, "grad_norm": 1.376263976097107, "learning_rate": 5.755555555555555e-07, "logits/chosen": 1.69140625, "logits/rejected": 1.7177734375, "logps/chosen": -21.484375, "logps/rejected": -33.96875, "loss": 0.6699, "rewards/accuracies": 0.75, "rewards/chosen": 0.06402587890625, "rewards/margins": 0.0550537109375, "rewards/rejected": 0.0090179443359375, "step": 573 }, { "epoch": 0.42518518518518517, "grad_norm": 1.3996357917785645, "learning_rate": 5.748148148148147e-07, "logits/chosen": 1.5830078125, "logits/rejected": 1.3359375, "logps/chosen": -24.234375, "logps/rejected": -34.96875, "loss": 0.6655, "rewards/accuracies": 0.5, "rewards/chosen": 0.041015625, "rewards/margins": 0.0703125, "rewards/rejected": -0.029327392578125, "step": 574 }, { "epoch": 0.42592592592592593, "grad_norm": 1.859938144683838, "learning_rate": 5.74074074074074e-07, "logits/chosen": 1.1767578125, "logits/rejected": 1.287109375, "logps/chosen": -29.59375, "logps/rejected": -52.34375, "loss": 0.6118, "rewards/accuracies": 1.0, "rewards/chosen": 0.140625, "rewards/margins": 0.1754150390625, "rewards/rejected": -0.034759521484375, "step": 575 }, { "epoch": 0.4266666666666667, "grad_norm": 2.545891284942627, "learning_rate": 5.733333333333334e-07, "logits/chosen": 2.021484375, "logits/rejected": 1.5859375, "logps/chosen": -40.9375, "logps/rejected": -40.4375, "loss": 0.8027, "rewards/accuracies": 0.75, "rewards/chosen": -0.185546875, "rewards/margins": -0.1265869140625, "rewards/rejected": -0.0589599609375, "step": 576 }, { "epoch": 0.4274074074074074, "grad_norm": 5.727237701416016, "learning_rate": 5.725925925925926e-07, "logits/chosen": 1.6953125, "logits/rejected": 1.7841796875, "logps/chosen": -30.5625, "logps/rejected": -28.90625, "loss": 0.7842, "rewards/accuracies": 0.0, "rewards/chosen": -0.0277252197265625, "rewards/margins": -0.1722412109375, "rewards/rejected": 0.14453125, "step": 577 }, { "epoch": 0.42814814814814817, "grad_norm": 2.1978869438171387, "learning_rate": 5.718518518518518e-07, "logits/chosen": 1.845703125, "logits/rejected": 1.248046875, "logps/chosen": -45.9375, "logps/rejected": -62.5, "loss": 0.6343, "rewards/accuracies": 0.5, "rewards/chosen": -0.0246124267578125, "rewards/margins": 0.2236328125, "rewards/rejected": -0.2484130859375, "step": 578 }, { "epoch": 0.4288888888888889, "grad_norm": 2.279515266418457, "learning_rate": 5.711111111111111e-07, "logits/chosen": 1.744140625, "logits/rejected": 1.416015625, "logps/chosen": -67.6875, "logps/rejected": -58.9375, "loss": 0.7979, "rewards/accuracies": 0.0, "rewards/chosen": -0.2047119140625, "rewards/margins": -0.1953125, "rewards/rejected": -0.0094146728515625, "step": 579 }, { "epoch": 0.42962962962962964, "grad_norm": 3.2038443088531494, "learning_rate": 5.703703703703704e-07, "logits/chosen": 1.57421875, "logits/rejected": 0.96435546875, "logps/chosen": -35.75, "logps/rejected": -48.65625, "loss": 1.3408, "rewards/accuracies": 0.25, "rewards/chosen": -0.1981201171875, "rewards/margins": -0.83203125, "rewards/rejected": 0.6337890625, "step": 580 }, { "epoch": 0.43037037037037035, "grad_norm": 2.77423095703125, "learning_rate": 5.696296296296296e-07, "logits/chosen": 1.4892578125, "logits/rejected": 1.169921875, "logps/chosen": -32.21875, "logps/rejected": -66.6875, "loss": 3.2344, "rewards/accuracies": 0.0, "rewards/chosen": -0.06365966796875, "rewards/margins": -2.873046875, "rewards/rejected": 2.80859375, "step": 581 }, { "epoch": 0.4311111111111111, "grad_norm": 1.4729530811309814, "learning_rate": 5.688888888888889e-07, "logits/chosen": 1.0810546875, "logits/rejected": 1.3388671875, "logps/chosen": -31.953125, "logps/rejected": -39.875, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": -0.03399658203125, "rewards/margins": -0.03662109375, "rewards/rejected": 0.0027008056640625, "step": 582 }, { "epoch": 0.4318518518518519, "grad_norm": 1.7691676616668701, "learning_rate": 5.681481481481481e-07, "logits/chosen": 1.4384765625, "logits/rejected": 1.970703125, "logps/chosen": -32.3125, "logps/rejected": -43.65625, "loss": 0.7373, "rewards/accuracies": 0.25, "rewards/chosen": 1.1444091796875e-05, "rewards/margins": -0.08160400390625, "rewards/rejected": 0.0816650390625, "step": 583 }, { "epoch": 0.4325925925925926, "grad_norm": 1.9803569316864014, "learning_rate": 5.674074074074073e-07, "logits/chosen": 1.4208984375, "logits/rejected": 1.7822265625, "logps/chosen": -20.578125, "logps/rejected": -46.4375, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": -0.0128936767578125, "rewards/margins": -0.02734375, "rewards/rejected": 0.01444244384765625, "step": 584 }, { "epoch": 0.43333333333333335, "grad_norm": 1.644366979598999, "learning_rate": 5.666666666666666e-07, "logits/chosen": 1.236328125, "logits/rejected": 1.220703125, "logps/chosen": -36.59375, "logps/rejected": -20.875, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": 0.0250244140625, "rewards/margins": -0.02874755859375, "rewards/rejected": 0.053741455078125, "step": 585 }, { "epoch": 0.43407407407407406, "grad_norm": 1.8033450841903687, "learning_rate": 5.659259259259259e-07, "logits/chosen": 1.599609375, "logits/rejected": 1.5224609375, "logps/chosen": -35.1875, "logps/rejected": -71.375, "loss": 0.6753, "rewards/accuracies": 0.5, "rewards/chosen": 0.044158935546875, "rewards/margins": 0.044952392578125, "rewards/rejected": -0.0007781982421875, "step": 586 }, { "epoch": 0.4348148148148148, "grad_norm": 1.825880765914917, "learning_rate": 5.651851851851852e-07, "logits/chosen": 0.4755859375, "logits/rejected": 1.3486328125, "logps/chosen": -32.21875, "logps/rejected": -45.96875, "loss": 0.8018, "rewards/accuracies": 0.0, "rewards/chosen": -0.09417724609375, "rewards/margins": -0.201904296875, "rewards/rejected": 0.10784912109375, "step": 587 }, { "epoch": 0.43555555555555553, "grad_norm": 1.6633987426757812, "learning_rate": 5.644444444444443e-07, "logits/chosen": 1.0419921875, "logits/rejected": 1.603515625, "logps/chosen": -30.28125, "logps/rejected": -27.015625, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.0927734375, "rewards/margins": 0.026397705078125, "rewards/rejected": 0.06640625, "step": 588 }, { "epoch": 0.4362962962962963, "grad_norm": 1.5782067775726318, "learning_rate": 5.637037037037037e-07, "logits/chosen": 1.982421875, "logits/rejected": 2.537109375, "logps/chosen": -33.59375, "logps/rejected": -42.3125, "loss": 0.6338, "rewards/accuracies": 0.75, "rewards/chosen": 0.01525115966796875, "rewards/margins": 0.130859375, "rewards/rejected": -0.1156005859375, "step": 589 }, { "epoch": 0.43703703703703706, "grad_norm": 1.6883841753005981, "learning_rate": 5.62962962962963e-07, "logits/chosen": 1.482421875, "logits/rejected": 1.6025390625, "logps/chosen": -49.34375, "logps/rejected": -75.75, "loss": 0.6055, "rewards/accuracies": 0.5, "rewards/chosen": 0.057037353515625, "rewards/margins": 0.2230224609375, "rewards/rejected": -0.166015625, "step": 590 }, { "epoch": 0.43777777777777777, "grad_norm": 2.6128628253936768, "learning_rate": 5.622222222222222e-07, "logits/chosen": 1.337890625, "logits/rejected": 1.822265625, "logps/chosen": -31.3125, "logps/rejected": -64.4375, "loss": 0.7515, "rewards/accuracies": 0.5, "rewards/chosen": 0.005859375, "rewards/margins": -0.06756591796875, "rewards/rejected": 0.073486328125, "step": 591 }, { "epoch": 0.43851851851851853, "grad_norm": 1.3485307693481445, "learning_rate": 5.614814814814815e-07, "logits/chosen": 1.7861328125, "logits/rejected": 1.1416015625, "logps/chosen": -32.21875, "logps/rejected": -21.671875, "loss": 0.7393, "rewards/accuracies": 0.25, "rewards/chosen": 0.0257568359375, "rewards/margins": -0.0816650390625, "rewards/rejected": 0.107421875, "step": 592 }, { "epoch": 0.43925925925925924, "grad_norm": 1.264238953590393, "learning_rate": 5.607407407407408e-07, "logits/chosen": 1.8251953125, "logits/rejected": 1.537109375, "logps/chosen": -33.71875, "logps/rejected": -21.765625, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.015228271484375, "rewards/margins": -0.008209228515625, "rewards/rejected": 0.023406982421875, "step": 593 }, { "epoch": 0.44, "grad_norm": 1.9147392511367798, "learning_rate": 5.6e-07, "logits/chosen": 1.6103515625, "logits/rejected": 1.900390625, "logps/chosen": -29.75, "logps/rejected": -33.0625, "loss": 0.7886, "rewards/accuracies": 0.5, "rewards/chosen": -0.08514404296875, "rewards/margins": -0.167236328125, "rewards/rejected": 0.08203125, "step": 594 }, { "epoch": 0.44074074074074077, "grad_norm": 1.3411505222320557, "learning_rate": 5.592592592592592e-07, "logits/chosen": 1.6083984375, "logits/rejected": 1.8232421875, "logps/chosen": -27.84375, "logps/rejected": -23.921875, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.0191497802734375, "rewards/margins": 0.000396728515625, "rewards/rejected": 0.0187530517578125, "step": 595 }, { "epoch": 0.4414814814814815, "grad_norm": 1.9907143115997314, "learning_rate": 5.585185185185185e-07, "logits/chosen": 1.3310546875, "logits/rejected": 1.818359375, "logps/chosen": -33.78125, "logps/rejected": -60.53125, "loss": 0.5771, "rewards/accuracies": 0.75, "rewards/chosen": 1.013671875, "rewards/margins": 0.8828125, "rewards/rejected": 0.1309814453125, "step": 596 }, { "epoch": 0.44222222222222224, "grad_norm": 4.03506326675415, "learning_rate": 5.577777777777778e-07, "logits/chosen": 2.09765625, "logits/rejected": 1.5791015625, "logps/chosen": -31.40625, "logps/rejected": -47.96875, "loss": 0.7021, "rewards/accuracies": 0.25, "rewards/chosen": -0.02423095703125, "rewards/margins": -0.01641845703125, "rewards/rejected": -0.00780487060546875, "step": 597 }, { "epoch": 0.44296296296296295, "grad_norm": 1.5641202926635742, "learning_rate": 5.57037037037037e-07, "logits/chosen": 1.126953125, "logits/rejected": 1.3056640625, "logps/chosen": -30.3125, "logps/rejected": -36.59375, "loss": 0.7402, "rewards/accuracies": 0.25, "rewards/chosen": 0.01873779296875, "rewards/margins": -0.0711669921875, "rewards/rejected": 0.08984375, "step": 598 }, { "epoch": 0.4437037037037037, "grad_norm": 2.011223554611206, "learning_rate": 5.562962962962962e-07, "logits/chosen": 1.2333984375, "logits/rejected": 1.88671875, "logps/chosen": -27.03125, "logps/rejected": -66.0625, "loss": 0.8027, "rewards/accuracies": 0.25, "rewards/chosen": -0.046295166015625, "rewards/margins": -0.1900634765625, "rewards/rejected": 0.1436767578125, "step": 599 }, { "epoch": 0.4444444444444444, "grad_norm": 1.662796139717102, "learning_rate": 5.555555555555555e-07, "logits/chosen": 1.71484375, "logits/rejected": 1.94921875, "logps/chosen": -26.046875, "logps/rejected": -51.28125, "loss": 0.686, "rewards/accuracies": 0.25, "rewards/chosen": -0.0238037109375, "rewards/margins": 0.0687255859375, "rewards/rejected": -0.09259033203125, "step": 600 }, { "epoch": 0.4451851851851852, "grad_norm": 20.19692039489746, "learning_rate": 5.548148148148147e-07, "logits/chosen": 1.798828125, "logits/rejected": 2.4375, "logps/chosen": -40.15625, "logps/rejected": -63.125, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": 0.226806640625, "rewards/margins": 0.5341796875, "rewards/rejected": -0.30712890625, "step": 601 }, { "epoch": 0.44592592592592595, "grad_norm": 1.8734709024429321, "learning_rate": 5.54074074074074e-07, "logits/chosen": 1.677734375, "logits/rejected": 1.2607421875, "logps/chosen": -21.71875, "logps/rejected": -32.8125, "loss": 0.7075, "rewards/accuracies": 0.25, "rewards/chosen": 0.029296875, "rewards/margins": -0.00933837890625, "rewards/rejected": 0.038665771484375, "step": 602 }, { "epoch": 0.44666666666666666, "grad_norm": 2.53767991065979, "learning_rate": 5.533333333333334e-07, "logits/chosen": 1.904296875, "logits/rejected": 1.8505859375, "logps/chosen": -38.9375, "logps/rejected": -36.6875, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": -0.167724609375, "rewards/margins": 0.034210205078125, "rewards/rejected": -0.2020263671875, "step": 603 }, { "epoch": 0.4474074074074074, "grad_norm": 2.671329975128174, "learning_rate": 5.525925925925926e-07, "logits/chosen": 1.8056640625, "logits/rejected": 1.9765625, "logps/chosen": -33.21875, "logps/rejected": -52.3125, "loss": 0.7852, "rewards/accuracies": 0.0, "rewards/chosen": -0.129638671875, "rewards/margins": -0.171875, "rewards/rejected": 0.04217529296875, "step": 604 }, { "epoch": 0.44814814814814813, "grad_norm": 2.6322474479675293, "learning_rate": 5.518518518518518e-07, "logits/chosen": 1.1357421875, "logits/rejected": 1.48046875, "logps/chosen": -32.46875, "logps/rejected": -57.625, "loss": 1.499, "rewards/accuracies": 0.5, "rewards/chosen": 0.1038818359375, "rewards/margins": -0.93603515625, "rewards/rejected": 1.0400390625, "step": 605 }, { "epoch": 0.4488888888888889, "grad_norm": 1.853480339050293, "learning_rate": 5.511111111111111e-07, "logits/chosen": 1.294921875, "logits/rejected": 1.609375, "logps/chosen": -40.28125, "logps/rejected": -47.1875, "loss": 0.6709, "rewards/accuracies": 0.5, "rewards/chosen": 0.099609375, "rewards/margins": 0.051605224609375, "rewards/rejected": 0.04803466796875, "step": 606 }, { "epoch": 0.44962962962962966, "grad_norm": 2.0646109580993652, "learning_rate": 5.503703703703704e-07, "logits/chosen": 0.88720703125, "logits/rejected": 1.7421875, "logps/chosen": -32.1875, "logps/rejected": -44.15625, "loss": 0.9219, "rewards/accuracies": 0.0, "rewards/chosen": -0.07891845703125, "rewards/margins": -0.37841796875, "rewards/rejected": 0.29931640625, "step": 607 }, { "epoch": 0.45037037037037037, "grad_norm": 3.3660523891448975, "learning_rate": 5.496296296296296e-07, "logits/chosen": 1.443359375, "logits/rejected": 1.548828125, "logps/chosen": -30.0625, "logps/rejected": -42.4375, "loss": 0.6274, "rewards/accuracies": 0.75, "rewards/chosen": 0.04803466796875, "rewards/margins": 0.1409912109375, "rewards/rejected": -0.093017578125, "step": 608 }, { "epoch": 0.45111111111111113, "grad_norm": 1.988142490386963, "learning_rate": 5.488888888888888e-07, "logits/chosen": 2.046875, "logits/rejected": 1.7177734375, "logps/chosen": -32.3125, "logps/rejected": -31.53125, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.11444091796875, "rewards/margins": 0.007415771484375, "rewards/rejected": -0.12188720703125, "step": 609 }, { "epoch": 0.45185185185185184, "grad_norm": 1.7168614864349365, "learning_rate": 5.481481481481481e-07, "logits/chosen": 1.0546875, "logits/rejected": 1.58984375, "logps/chosen": -25.015625, "logps/rejected": -24.359375, "loss": 0.7144, "rewards/accuracies": 0.5, "rewards/chosen": 0.0623779296875, "rewards/margins": -0.0113067626953125, "rewards/rejected": 0.07366943359375, "step": 610 }, { "epoch": 0.4525925925925926, "grad_norm": 1.9067034721374512, "learning_rate": 5.474074074074074e-07, "logits/chosen": 2.404296875, "logits/rejected": 1.8828125, "logps/chosen": -46.21875, "logps/rejected": -33.3125, "loss": 0.647, "rewards/accuracies": 0.75, "rewards/chosen": 0.04022216796875, "rewards/margins": 0.104736328125, "rewards/rejected": -0.064453125, "step": 611 }, { "epoch": 0.4533333333333333, "grad_norm": 1.6319828033447266, "learning_rate": 5.466666666666666e-07, "logits/chosen": 0.89306640625, "logits/rejected": 1.1953125, "logps/chosen": -21.890625, "logps/rejected": -41.875, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.080078125, "rewards/margins": 0.0042724609375, "rewards/rejected": 0.0758056640625, "step": 612 }, { "epoch": 0.4540740740740741, "grad_norm": 1.7181618213653564, "learning_rate": 5.459259259259259e-07, "logits/chosen": 1.1591796875, "logits/rejected": 1.51171875, "logps/chosen": -20.65625, "logps/rejected": -48.8125, "loss": 0.7114, "rewards/accuracies": 0.25, "rewards/chosen": -0.01213836669921875, "rewards/margins": -0.026153564453125, "rewards/rejected": 0.0140533447265625, "step": 613 }, { "epoch": 0.45481481481481484, "grad_norm": 3.1123106479644775, "learning_rate": 5.451851851851851e-07, "logits/chosen": 1.72265625, "logits/rejected": 2.005859375, "logps/chosen": -35.65625, "logps/rejected": -31.265625, "loss": 0.9453, "rewards/accuracies": 0.5, "rewards/chosen": 0.1527099609375, "rewards/margins": -0.319580078125, "rewards/rejected": 0.47216796875, "step": 614 }, { "epoch": 0.45555555555555555, "grad_norm": 2.4917755126953125, "learning_rate": 5.444444444444443e-07, "logits/chosen": 2.123046875, "logits/rejected": 1.7744140625, "logps/chosen": -85.625, "logps/rejected": -55.375, "loss": 0.6445, "rewards/accuracies": 0.75, "rewards/chosen": 0.15625, "rewards/margins": 0.14453125, "rewards/rejected": 0.01165771484375, "step": 615 }, { "epoch": 0.4562962962962963, "grad_norm": 1.864400029182434, "learning_rate": 5.437037037037037e-07, "logits/chosen": 1.1669921875, "logits/rejected": 2.095703125, "logps/chosen": -38.6875, "logps/rejected": -43.15625, "loss": 0.6113, "rewards/accuracies": 0.5, "rewards/chosen": -0.04998779296875, "rewards/margins": 0.1905517578125, "rewards/rejected": -0.2406005859375, "step": 616 }, { "epoch": 0.457037037037037, "grad_norm": 1.6123391389846802, "learning_rate": 5.42962962962963e-07, "logits/chosen": 2.203125, "logits/rejected": 1.3720703125, "logps/chosen": -20.265625, "logps/rejected": -32.5, "loss": 0.79, "rewards/accuracies": 0.75, "rewards/chosen": 0.0999755859375, "rewards/margins": -0.130615234375, "rewards/rejected": 0.23046875, "step": 617 }, { "epoch": 0.4577777777777778, "grad_norm": 1.7711257934570312, "learning_rate": 5.422222222222223e-07, "logits/chosen": 0.63818359375, "logits/rejected": 1.5947265625, "logps/chosen": -32.625, "logps/rejected": -45.59375, "loss": 0.7632, "rewards/accuracies": 0.25, "rewards/chosen": -0.07342529296875, "rewards/margins": -0.12890625, "rewards/rejected": 0.055450439453125, "step": 618 }, { "epoch": 0.4585185185185185, "grad_norm": 1.6801837682724, "learning_rate": 5.414814814814815e-07, "logits/chosen": 1.5341796875, "logits/rejected": 1.904296875, "logps/chosen": -40.25, "logps/rejected": -41.875, "loss": 0.6089, "rewards/accuracies": 0.75, "rewards/chosen": 0.003509521484375, "rewards/margins": 0.1807861328125, "rewards/rejected": -0.1773681640625, "step": 619 }, { "epoch": 0.45925925925925926, "grad_norm": 3.110001802444458, "learning_rate": 5.407407407407407e-07, "logits/chosen": 1.7294921875, "logits/rejected": 1.66796875, "logps/chosen": -41.75, "logps/rejected": -38.375, "loss": 1.3701, "rewards/accuracies": 0.25, "rewards/chosen": -0.796875, "rewards/margins": -0.85888671875, "rewards/rejected": 0.0625, "step": 620 }, { "epoch": 0.46, "grad_norm": 2.050389289855957, "learning_rate": 5.4e-07, "logits/chosen": 1.595703125, "logits/rejected": 1.828125, "logps/chosen": -39.0625, "logps/rejected": -56.875, "loss": 0.6626, "rewards/accuracies": 0.5, "rewards/chosen": -0.045654296875, "rewards/margins": 0.1156005859375, "rewards/rejected": -0.1612548828125, "step": 621 }, { "epoch": 0.46074074074074073, "grad_norm": 4.966048717498779, "learning_rate": 5.392592592592592e-07, "logits/chosen": 1.96484375, "logits/rejected": 2.056640625, "logps/chosen": -20.546875, "logps/rejected": -31.609375, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.041778564453125, "rewards/margins": -0.0195465087890625, "rewards/rejected": 0.06134033203125, "step": 622 }, { "epoch": 0.4614814814814815, "grad_norm": 2.325407028198242, "learning_rate": 5.385185185185185e-07, "logits/chosen": 0.78564453125, "logits/rejected": 1.8271484375, "logps/chosen": -25.15625, "logps/rejected": -54.59375, "loss": 0.6245, "rewards/accuracies": 0.75, "rewards/chosen": 0.0797119140625, "rewards/margins": 0.1573486328125, "rewards/rejected": -0.07769775390625, "step": 623 }, { "epoch": 0.4622222222222222, "grad_norm": 1.616652011871338, "learning_rate": 5.377777777777778e-07, "logits/chosen": 1.7841796875, "logits/rejected": 1.94921875, "logps/chosen": -30.515625, "logps/rejected": -27.828125, "loss": 0.8477, "rewards/accuracies": 0.0, "rewards/chosen": -0.1195068359375, "rewards/margins": -0.285400390625, "rewards/rejected": 0.1658935546875, "step": 624 }, { "epoch": 0.46296296296296297, "grad_norm": 1.8994189500808716, "learning_rate": 5.37037037037037e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.7265625, "logps/chosen": -27.09375, "logps/rejected": -40.5, "loss": 0.8022, "rewards/accuracies": 0.5, "rewards/chosen": -0.108154296875, "rewards/margins": -0.1629638671875, "rewards/rejected": 0.0546875, "step": 625 }, { "epoch": 0.46370370370370373, "grad_norm": 1.5560752153396606, "learning_rate": 5.362962962962962e-07, "logits/chosen": 1.517578125, "logits/rejected": 1.564453125, "logps/chosen": -25.125, "logps/rejected": -32.375, "loss": 0.6807, "rewards/accuracies": 0.75, "rewards/chosen": 0.0972900390625, "rewards/margins": 0.0269775390625, "rewards/rejected": 0.0703125, "step": 626 }, { "epoch": 0.46444444444444444, "grad_norm": 1.1944535970687866, "learning_rate": 5.355555555555555e-07, "logits/chosen": 1.09375, "logits/rejected": 0.94189453125, "logps/chosen": -34.78125, "logps/rejected": -23.59375, "loss": 0.6421, "rewards/accuracies": 0.5, "rewards/chosen": 0.09710693359375, "rewards/margins": 0.134033203125, "rewards/rejected": -0.036895751953125, "step": 627 }, { "epoch": 0.4651851851851852, "grad_norm": 1.380565881729126, "learning_rate": 5.348148148148148e-07, "logits/chosen": 1.494140625, "logits/rejected": 1.7880859375, "logps/chosen": -16.828125, "logps/rejected": -34.65625, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": -0.006053924560546875, "rewards/margins": 0.21533203125, "rewards/rejected": -0.221435546875, "step": 628 }, { "epoch": 0.4659259259259259, "grad_norm": 1.4563472270965576, "learning_rate": 5.340740740740741e-07, "logits/chosen": 1.2255859375, "logits/rejected": 1.2119140625, "logps/chosen": -35.65625, "logps/rejected": -42.0, "loss": 0.5684, "rewards/accuracies": 0.75, "rewards/chosen": 0.1136474609375, "rewards/margins": 0.290283203125, "rewards/rejected": -0.176513671875, "step": 629 }, { "epoch": 0.4666666666666667, "grad_norm": 1.9232321977615356, "learning_rate": 5.333333333333333e-07, "logits/chosen": 1.0556640625, "logits/rejected": 1.638671875, "logps/chosen": -39.5, "logps/rejected": -46.09375, "loss": 0.5947, "rewards/accuracies": 1.0, "rewards/chosen": -0.04144287109375, "rewards/margins": 0.2109375, "rewards/rejected": -0.25244140625, "step": 630 }, { "epoch": 0.4674074074074074, "grad_norm": 3.714301347732544, "learning_rate": 5.325925925925926e-07, "logits/chosen": 1.8564453125, "logits/rejected": 1.28125, "logps/chosen": -22.9375, "logps/rejected": -37.125, "loss": 1.2676, "rewards/accuracies": 0.25, "rewards/chosen": -0.0148468017578125, "rewards/margins": -0.77734375, "rewards/rejected": 0.7626953125, "step": 631 }, { "epoch": 0.46814814814814815, "grad_norm": 1.424956202507019, "learning_rate": 5.318518518518518e-07, "logits/chosen": 1.3720703125, "logits/rejected": 2.03515625, "logps/chosen": -29.5625, "logps/rejected": -43.3125, "loss": 0.6211, "rewards/accuracies": 0.5, "rewards/chosen": 0.1636962890625, "rewards/margins": 0.184814453125, "rewards/rejected": -0.0211029052734375, "step": 632 }, { "epoch": 0.4688888888888889, "grad_norm": 1.3084474802017212, "learning_rate": 5.311111111111111e-07, "logits/chosen": 1.603515625, "logits/rejected": 2.232421875, "logps/chosen": -27.3125, "logps/rejected": -45.78125, "loss": 0.5996, "rewards/accuracies": 1.0, "rewards/chosen": 0.09027099609375, "rewards/margins": 0.2093505859375, "rewards/rejected": -0.119140625, "step": 633 }, { "epoch": 0.4696296296296296, "grad_norm": 2.0474965572357178, "learning_rate": 5.303703703703704e-07, "logits/chosen": 1.904296875, "logits/rejected": 1.71875, "logps/chosen": -52.84375, "logps/rejected": -19.75, "loss": 0.7061, "rewards/accuracies": 0.75, "rewards/chosen": -0.1331787109375, "rewards/margins": -0.0201416015625, "rewards/rejected": -0.11309814453125, "step": 634 }, { "epoch": 0.4703703703703704, "grad_norm": 1.3630985021591187, "learning_rate": 5.296296296296296e-07, "logits/chosen": 1.6337890625, "logits/rejected": 1.8515625, "logps/chosen": -25.90625, "logps/rejected": -48.1875, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.040802001953125, "rewards/margins": 0.05767822265625, "rewards/rejected": -0.016815185546875, "step": 635 }, { "epoch": 0.4711111111111111, "grad_norm": 2.211440324783325, "learning_rate": 5.288888888888888e-07, "logits/chosen": 1.642578125, "logits/rejected": 2.017578125, "logps/chosen": -36.21875, "logps/rejected": -38.09375, "loss": 0.6123, "rewards/accuracies": 0.75, "rewards/chosen": 0.1370849609375, "rewards/margins": 0.18701171875, "rewards/rejected": -0.04998779296875, "step": 636 }, { "epoch": 0.47185185185185186, "grad_norm": 1.4046270847320557, "learning_rate": 5.281481481481481e-07, "logits/chosen": 1.654296875, "logits/rejected": 1.7099609375, "logps/chosen": -19.15625, "logps/rejected": -27.921875, "loss": 0.7266, "rewards/accuracies": 0.25, "rewards/chosen": 0.0377197265625, "rewards/margins": -0.05877685546875, "rewards/rejected": 0.09649658203125, "step": 637 }, { "epoch": 0.4725925925925926, "grad_norm": 2.8028206825256348, "learning_rate": 5.274074074074074e-07, "logits/chosen": 1.517578125, "logits/rejected": 2.015625, "logps/chosen": -28.34375, "logps/rejected": -38.09375, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": 0.192138671875, "rewards/margins": 0.28662109375, "rewards/rejected": -0.09454345703125, "step": 638 }, { "epoch": 0.47333333333333333, "grad_norm": 1.4353030920028687, "learning_rate": 5.266666666666666e-07, "logits/chosen": 1.2294921875, "logits/rejected": 1.07421875, "logps/chosen": -22.34375, "logps/rejected": -31.578125, "loss": 0.6816, "rewards/accuracies": 0.75, "rewards/chosen": 0.0989990234375, "rewards/margins": 0.03314208984375, "rewards/rejected": 0.0657958984375, "step": 639 }, { "epoch": 0.4740740740740741, "grad_norm": 1.6964426040649414, "learning_rate": 5.259259259259258e-07, "logits/chosen": 1.16015625, "logits/rejected": 1.3193359375, "logps/chosen": -23.890625, "logps/rejected": -58.03125, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": 0.06329345703125, "rewards/margins": 0.10784912109375, "rewards/rejected": -0.044525146484375, "step": 640 }, { "epoch": 0.4748148148148148, "grad_norm": 2.259495973587036, "learning_rate": 5.251851851851851e-07, "logits/chosen": 1.4814453125, "logits/rejected": 1.6435546875, "logps/chosen": -33.1875, "logps/rejected": -42.125, "loss": 0.9648, "rewards/accuracies": 0.5, "rewards/chosen": 0.039459228515625, "rewards/margins": -0.38525390625, "rewards/rejected": 0.4248046875, "step": 641 }, { "epoch": 0.47555555555555556, "grad_norm": 1.6477956771850586, "learning_rate": 5.244444444444445e-07, "logits/chosen": 1.2861328125, "logits/rejected": 1.6787109375, "logps/chosen": -26.03125, "logps/rejected": -43.125, "loss": 0.668, "rewards/accuracies": 0.5, "rewards/chosen": 0.063232421875, "rewards/margins": 0.06561279296875, "rewards/rejected": -0.0023651123046875, "step": 642 }, { "epoch": 0.4762962962962963, "grad_norm": 1.880053997039795, "learning_rate": 5.237037037037037e-07, "logits/chosen": 1.3779296875, "logits/rejected": 1.8095703125, "logps/chosen": -32.46875, "logps/rejected": -42.8125, "loss": 0.6685, "rewards/accuracies": 0.25, "rewards/chosen": 0.1370849609375, "rewards/margins": 0.073486328125, "rewards/rejected": 0.06365966796875, "step": 643 }, { "epoch": 0.47703703703703704, "grad_norm": 2.7476911544799805, "learning_rate": 5.22962962962963e-07, "logits/chosen": 0.9443359375, "logits/rejected": 1.3017578125, "logps/chosen": -19.890625, "logps/rejected": -39.34375, "loss": 0.7485, "rewards/accuracies": 0.5, "rewards/chosen": -0.05157470703125, "rewards/margins": -0.09619140625, "rewards/rejected": 0.044525146484375, "step": 644 }, { "epoch": 0.4777777777777778, "grad_norm": 1.4597879648208618, "learning_rate": 5.222222222222223e-07, "logits/chosen": 1.16015625, "logits/rejected": 2.294921875, "logps/chosen": -21.671875, "logps/rejected": -65.1875, "loss": 0.5371, "rewards/accuracies": 1.0, "rewards/chosen": 0.1361083984375, "rewards/margins": 0.423828125, "rewards/rejected": -0.28759765625, "step": 645 }, { "epoch": 0.4785185185185185, "grad_norm": 2.9167613983154297, "learning_rate": 5.214814814814814e-07, "logits/chosen": 1.06640625, "logits/rejected": 1.42578125, "logps/chosen": -22.890625, "logps/rejected": -55.0625, "loss": 0.9214, "rewards/accuracies": 0.75, "rewards/chosen": 0.016204833984375, "rewards/margins": -0.2498779296875, "rewards/rejected": 0.26611328125, "step": 646 }, { "epoch": 0.4792592592592593, "grad_norm": 1.6914663314819336, "learning_rate": 5.207407407407407e-07, "logits/chosen": 1.7236328125, "logits/rejected": 1.7255859375, "logps/chosen": -28.890625, "logps/rejected": -32.90625, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": -0.0191497802734375, "rewards/margins": -0.044921875, "rewards/rejected": 0.025787353515625, "step": 647 }, { "epoch": 0.48, "grad_norm": 1.521723985671997, "learning_rate": 5.2e-07, "logits/chosen": 1.1962890625, "logits/rejected": 1.8359375, "logps/chosen": -24.34375, "logps/rejected": -37.09375, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.039459228515625, "rewards/margins": 0.04571533203125, "rewards/rejected": -0.006267547607421875, "step": 648 }, { "epoch": 0.48074074074074075, "grad_norm": 2.4310526847839355, "learning_rate": 5.192592592592593e-07, "logits/chosen": 1.8134765625, "logits/rejected": 2.068359375, "logps/chosen": -41.25, "logps/rejected": -52.6875, "loss": 0.6919, "rewards/accuracies": 0.25, "rewards/chosen": -0.01088714599609375, "rewards/margins": 0.030548095703125, "rewards/rejected": -0.041412353515625, "step": 649 }, { "epoch": 0.48148148148148145, "grad_norm": 1.499647617340088, "learning_rate": 5.185185185185185e-07, "logits/chosen": 1.3095703125, "logits/rejected": 1.4814453125, "logps/chosen": -24.0625, "logps/rejected": -27.625, "loss": 0.7842, "rewards/accuracies": 0.25, "rewards/chosen": 0.0074462890625, "rewards/margins": -0.155517578125, "rewards/rejected": 0.1629638671875, "step": 650 }, { "epoch": 0.4822222222222222, "grad_norm": 1.9862843751907349, "learning_rate": 5.177777777777777e-07, "logits/chosen": 2.0859375, "logits/rejected": 1.923828125, "logps/chosen": -48.75, "logps/rejected": -46.8125, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": 0.01055908203125, "rewards/margins": -0.044921875, "rewards/rejected": 0.055450439453125, "step": 651 }, { "epoch": 0.482962962962963, "grad_norm": 1.6323633193969727, "learning_rate": 5.17037037037037e-07, "logits/chosen": 2.189453125, "logits/rejected": 1.455078125, "logps/chosen": -25.53125, "logps/rejected": -73.9375, "loss": 0.5601, "rewards/accuracies": 1.0, "rewards/chosen": 0.024627685546875, "rewards/margins": 0.29052734375, "rewards/rejected": -0.26611328125, "step": 652 }, { "epoch": 0.4837037037037037, "grad_norm": 1.542822003364563, "learning_rate": 5.162962962962962e-07, "logits/chosen": 1.796875, "logits/rejected": 1.5859375, "logps/chosen": -38.09375, "logps/rejected": -44.0, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 0.1605224609375, "rewards/margins": 0.195556640625, "rewards/rejected": -0.034942626953125, "step": 653 }, { "epoch": 0.48444444444444446, "grad_norm": 1.9661865234375, "learning_rate": 5.155555555555555e-07, "logits/chosen": 1.7783203125, "logits/rejected": 2.08984375, "logps/chosen": -29.8125, "logps/rejected": -67.125, "loss": 0.7114, "rewards/accuracies": 0.75, "rewards/chosen": 0.09844970703125, "rewards/margins": 0.0960693359375, "rewards/rejected": 0.00234222412109375, "step": 654 }, { "epoch": 0.48518518518518516, "grad_norm": 2.2897138595581055, "learning_rate": 5.148148148148148e-07, "logits/chosen": 1.2978515625, "logits/rejected": 1.41015625, "logps/chosen": -77.125, "logps/rejected": -34.375, "loss": 0.708, "rewards/accuracies": 0.75, "rewards/chosen": -0.08514404296875, "rewards/margins": -0.02069091796875, "rewards/rejected": -0.064453125, "step": 655 }, { "epoch": 0.48592592592592593, "grad_norm": 1.470751404762268, "learning_rate": 5.140740740740741e-07, "logits/chosen": 2.49609375, "logits/rejected": 0.9130859375, "logps/chosen": -22.359375, "logps/rejected": -26.421875, "loss": 0.6411, "rewards/accuracies": 0.75, "rewards/chosen": -0.05548095703125, "rewards/margins": 0.116455078125, "rewards/rejected": -0.171875, "step": 656 }, { "epoch": 0.4866666666666667, "grad_norm": 2.22971248626709, "learning_rate": 5.133333333333333e-07, "logits/chosen": 2.01953125, "logits/rejected": 2.34375, "logps/chosen": -46.6875, "logps/rejected": -60.6875, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": -0.11053466796875, "rewards/margins": 0.60107421875, "rewards/rejected": -0.7119140625, "step": 657 }, { "epoch": 0.4874074074074074, "grad_norm": 1.7591098546981812, "learning_rate": 5.125925925925926e-07, "logits/chosen": 1.5478515625, "logits/rejected": 1.3876953125, "logps/chosen": -46.75, "logps/rejected": -58.6875, "loss": 0.7129, "rewards/accuracies": 0.5, "rewards/chosen": -0.137939453125, "rewards/margins": -0.01678466796875, "rewards/rejected": -0.12109375, "step": 658 }, { "epoch": 0.48814814814814816, "grad_norm": 1.5868374109268188, "learning_rate": 5.118518518518519e-07, "logits/chosen": 1.248046875, "logits/rejected": 1.564453125, "logps/chosen": -18.5625, "logps/rejected": -28.953125, "loss": 0.7104, "rewards/accuracies": 0.25, "rewards/chosen": -0.08203125, "rewards/margins": -0.0062713623046875, "rewards/rejected": -0.07574462890625, "step": 659 }, { "epoch": 0.4888888888888889, "grad_norm": 1.5081945657730103, "learning_rate": 5.111111111111111e-07, "logits/chosen": 2.314453125, "logits/rejected": 1.6953125, "logps/chosen": -23.90625, "logps/rejected": -28.515625, "loss": 0.6611, "rewards/accuracies": 0.75, "rewards/chosen": -0.0718994140625, "rewards/margins": 0.0699462890625, "rewards/rejected": -0.141845703125, "step": 660 }, { "epoch": 0.48962962962962964, "grad_norm": 1.6494272947311401, "learning_rate": 5.103703703703703e-07, "logits/chosen": 1.3173828125, "logits/rejected": 1.396484375, "logps/chosen": -30.609375, "logps/rejected": -29.65625, "loss": 0.7378, "rewards/accuracies": 0.25, "rewards/chosen": -0.1468505859375, "rewards/margins": -0.0726318359375, "rewards/rejected": -0.07421875, "step": 661 }, { "epoch": 0.49037037037037035, "grad_norm": 1.2971428632736206, "learning_rate": 5.096296296296296e-07, "logits/chosen": 1.822265625, "logits/rejected": 1.4345703125, "logps/chosen": -22.640625, "logps/rejected": -30.90625, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": 0.220947265625, "rewards/margins": 0.2220458984375, "rewards/rejected": -0.001220703125, "step": 662 }, { "epoch": 0.4911111111111111, "grad_norm": 1.4383028745651245, "learning_rate": 5.088888888888888e-07, "logits/chosen": 1.2890625, "logits/rejected": 1.9775390625, "logps/chosen": -31.90625, "logps/rejected": -23.5, "loss": 0.6636, "rewards/accuracies": 0.5, "rewards/chosen": 0.1148681640625, "rewards/margins": 0.07147216796875, "rewards/rejected": 0.04339599609375, "step": 663 }, { "epoch": 0.4918518518518519, "grad_norm": 3.9970152378082275, "learning_rate": 5.081481481481481e-07, "logits/chosen": 2.2578125, "logits/rejected": 1.373046875, "logps/chosen": -46.3125, "logps/rejected": -27.703125, "loss": 0.8926, "rewards/accuracies": 0.25, "rewards/chosen": -0.20654296875, "rewards/margins": -0.334716796875, "rewards/rejected": 0.128173828125, "step": 664 }, { "epoch": 0.4925925925925926, "grad_norm": 1.3933091163635254, "learning_rate": 5.074074074074074e-07, "logits/chosen": 0.95703125, "logits/rejected": 1.517578125, "logps/chosen": -19.125, "logps/rejected": -30.8125, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.044342041015625, "rewards/margins": 0.036529541015625, "rewards/rejected": 0.0077972412109375, "step": 665 }, { "epoch": 0.49333333333333335, "grad_norm": 1.8502899408340454, "learning_rate": 5.066666666666667e-07, "logits/chosen": 1.6748046875, "logits/rejected": 1.2939453125, "logps/chosen": -18.0, "logps/rejected": -43.3125, "loss": 0.6646, "rewards/accuracies": 0.75, "rewards/chosen": 0.10174560546875, "rewards/margins": 0.07122802734375, "rewards/rejected": 0.03045654296875, "step": 666 }, { "epoch": 0.49407407407407405, "grad_norm": 2.5901107788085938, "learning_rate": 5.059259259259258e-07, "logits/chosen": 1.412109375, "logits/rejected": 1.521484375, "logps/chosen": -48.25, "logps/rejected": -41.53125, "loss": 0.7285, "rewards/accuracies": 0.25, "rewards/chosen": 0.112548828125, "rewards/margins": -0.06329345703125, "rewards/rejected": 0.17578125, "step": 667 }, { "epoch": 0.4948148148148148, "grad_norm": 2.499746799468994, "learning_rate": 5.051851851851851e-07, "logits/chosen": 2.078125, "logits/rejected": 1.46484375, "logps/chosen": -47.125, "logps/rejected": -36.75, "loss": 0.6982, "rewards/accuracies": 0.25, "rewards/chosen": -0.06951904296875, "rewards/margins": 0.000396728515625, "rewards/rejected": -0.0699462890625, "step": 668 }, { "epoch": 0.4955555555555556, "grad_norm": 2.233760118484497, "learning_rate": 5.044444444444445e-07, "logits/chosen": 1.3681640625, "logits/rejected": 1.189453125, "logps/chosen": -35.46875, "logps/rejected": -53.9375, "loss": 0.8467, "rewards/accuracies": 0.0, "rewards/chosen": -0.1707763671875, "rewards/margins": -0.278564453125, "rewards/rejected": 0.1077880859375, "step": 669 }, { "epoch": 0.4962962962962963, "grad_norm": 1.7941110134124756, "learning_rate": 5.037037037037037e-07, "logits/chosen": 1.8505859375, "logits/rejected": 2.240234375, "logps/chosen": -27.4375, "logps/rejected": -59.40625, "loss": 0.7461, "rewards/accuracies": 0.25, "rewards/chosen": -0.1097412109375, "rewards/margins": -0.10040283203125, "rewards/rejected": -0.009368896484375, "step": 670 }, { "epoch": 0.49703703703703705, "grad_norm": 1.429930567741394, "learning_rate": 5.02962962962963e-07, "logits/chosen": 1.9873046875, "logits/rejected": 1.7216796875, "logps/chosen": -36.03125, "logps/rejected": -36.125, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": 0.1351318359375, "rewards/margins": 0.2098388671875, "rewards/rejected": -0.0745849609375, "step": 671 }, { "epoch": 0.49777777777777776, "grad_norm": 1.8619531393051147, "learning_rate": 5.022222222222222e-07, "logits/chosen": 1.48046875, "logits/rejected": 1.978515625, "logps/chosen": -19.96875, "logps/rejected": -73.9375, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.2298583984375, "rewards/margins": 0.316162109375, "rewards/rejected": -0.0863037109375, "step": 672 }, { "epoch": 0.4985185185185185, "grad_norm": 4.975447177886963, "learning_rate": 5.014814814814815e-07, "logits/chosen": 2.65625, "logits/rejected": 1.9306640625, "logps/chosen": -28.640625, "logps/rejected": -26.109375, "loss": 0.793, "rewards/accuracies": 0.5, "rewards/chosen": -0.1531982421875, "rewards/margins": -0.157958984375, "rewards/rejected": 0.0046844482421875, "step": 673 }, { "epoch": 0.49925925925925924, "grad_norm": 1.8530441522598267, "learning_rate": 5.007407407407407e-07, "logits/chosen": 1.255859375, "logits/rejected": 0.61376953125, "logps/chosen": -31.125, "logps/rejected": -24.703125, "loss": 0.5752, "rewards/accuracies": 1.0, "rewards/chosen": 0.190185546875, "rewards/margins": 0.265625, "rewards/rejected": -0.07537841796875, "step": 674 }, { "epoch": 0.5, "grad_norm": 2.0574753284454346, "learning_rate": 5e-07, "logits/chosen": 1.1865234375, "logits/rejected": 1.1240234375, "logps/chosen": -37.71875, "logps/rejected": -36.125, "loss": 0.7651, "rewards/accuracies": 0.25, "rewards/chosen": -0.06036376953125, "rewards/margins": -0.1256103515625, "rewards/rejected": 0.065185546875, "step": 675 }, { "epoch": 0.5007407407407407, "grad_norm": 2.423372507095337, "learning_rate": 4.992592592592593e-07, "logits/chosen": 1.138671875, "logits/rejected": 1.2626953125, "logps/chosen": -55.4375, "logps/rejected": -49.8125, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": 0.1187744140625, "rewards/margins": 0.089111328125, "rewards/rejected": 0.029693603515625, "step": 676 }, { "epoch": 0.5014814814814815, "grad_norm": 1.6717145442962646, "learning_rate": 4.985185185185185e-07, "logits/chosen": 1.1826171875, "logits/rejected": 0.97021484375, "logps/chosen": -26.640625, "logps/rejected": -56.90625, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": 0.0531005859375, "rewards/margins": 0.044525146484375, "rewards/rejected": 0.008575439453125, "step": 677 }, { "epoch": 0.5022222222222222, "grad_norm": 2.044325590133667, "learning_rate": 4.977777777777777e-07, "logits/chosen": 0.888671875, "logits/rejected": 2.32421875, "logps/chosen": -21.984375, "logps/rejected": -67.125, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": -0.050201416015625, "rewards/margins": 0.03607177734375, "rewards/rejected": -0.0863037109375, "step": 678 }, { "epoch": 0.502962962962963, "grad_norm": 2.2953221797943115, "learning_rate": 4.97037037037037e-07, "logits/chosen": 1.587890625, "logits/rejected": 1.958984375, "logps/chosen": -34.3125, "logps/rejected": -25.765625, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": 0.112548828125, "rewards/margins": 0.2015380859375, "rewards/rejected": -0.08905029296875, "step": 679 }, { "epoch": 0.5037037037037037, "grad_norm": 1.9481828212738037, "learning_rate": 4.962962962962963e-07, "logits/chosen": 1.76171875, "logits/rejected": 1.12109375, "logps/chosen": -36.0625, "logps/rejected": -24.078125, "loss": 0.7295, "rewards/accuracies": 0.5, "rewards/chosen": -0.041534423828125, "rewards/margins": -0.0204620361328125, "rewards/rejected": -0.0211029052734375, "step": 680 }, { "epoch": 0.5044444444444445, "grad_norm": 1.7346090078353882, "learning_rate": 4.955555555555556e-07, "logits/chosen": 2.4140625, "logits/rejected": 1.330078125, "logps/chosen": -31.40625, "logps/rejected": -30.125, "loss": 0.8564, "rewards/accuracies": 0.25, "rewards/chosen": 0.0687255859375, "rewards/margins": -0.2314453125, "rewards/rejected": 0.300048828125, "step": 681 }, { "epoch": 0.5051851851851852, "grad_norm": 1.4069464206695557, "learning_rate": 4.948148148148148e-07, "logits/chosen": 1.55078125, "logits/rejected": 1.31640625, "logps/chosen": -31.609375, "logps/rejected": -28.125, "loss": 0.6729, "rewards/accuracies": 0.75, "rewards/chosen": 0.141357421875, "rewards/margins": 0.045684814453125, "rewards/rejected": 0.095703125, "step": 682 }, { "epoch": 0.5059259259259259, "grad_norm": 2.034707546234131, "learning_rate": 4.94074074074074e-07, "logits/chosen": 1.396484375, "logits/rejected": 1.9892578125, "logps/chosen": -22.5625, "logps/rejected": -38.25, "loss": 0.7246, "rewards/accuracies": 0.25, "rewards/chosen": -0.0007781982421875, "rewards/margins": -0.0531005859375, "rewards/rejected": 0.052337646484375, "step": 683 }, { "epoch": 0.5066666666666667, "grad_norm": 1.852798581123352, "learning_rate": 4.933333333333333e-07, "logits/chosen": 1.89453125, "logits/rejected": 1.666015625, "logps/chosen": -41.4375, "logps/rejected": -45.6875, "loss": 0.6938, "rewards/accuracies": 0.75, "rewards/chosen": -0.05352783203125, "rewards/margins": 0.01409912109375, "rewards/rejected": -0.06756591796875, "step": 684 }, { "epoch": 0.5074074074074074, "grad_norm": 1.7201472520828247, "learning_rate": 4.925925925925926e-07, "logits/chosen": 1.21875, "logits/rejected": 1.4365234375, "logps/chosen": -21.46875, "logps/rejected": -62.21875, "loss": 0.6504, "rewards/accuracies": 0.75, "rewards/chosen": 0.1015625, "rewards/margins": 0.1195068359375, "rewards/rejected": -0.0179901123046875, "step": 685 }, { "epoch": 0.5081481481481481, "grad_norm": 2.476853609085083, "learning_rate": 4.918518518518519e-07, "logits/chosen": 2.03125, "logits/rejected": 1.8173828125, "logps/chosen": -35.875, "logps/rejected": -41.71875, "loss": 1.0254, "rewards/accuracies": 0.25, "rewards/chosen": -0.262939453125, "rewards/margins": -0.49169921875, "rewards/rejected": 0.22900390625, "step": 686 }, { "epoch": 0.5088888888888888, "grad_norm": 1.5021766424179077, "learning_rate": 4.91111111111111e-07, "logits/chosen": 1.8017578125, "logits/rejected": 1.2275390625, "logps/chosen": -32.125, "logps/rejected": -33.34375, "loss": 0.6719, "rewards/accuracies": 0.75, "rewards/chosen": 0.0226593017578125, "rewards/margins": 0.054290771484375, "rewards/rejected": -0.0316162109375, "step": 687 }, { "epoch": 0.5096296296296297, "grad_norm": 2.4465489387512207, "learning_rate": 4.903703703703703e-07, "logits/chosen": 1.4326171875, "logits/rejected": 1.41796875, "logps/chosen": -28.296875, "logps/rejected": -38.375, "loss": 0.96, "rewards/accuracies": 0.25, "rewards/chosen": -0.193115234375, "rewards/margins": -0.421142578125, "rewards/rejected": 0.22802734375, "step": 688 }, { "epoch": 0.5103703703703704, "grad_norm": 2.113250970840454, "learning_rate": 4.896296296296296e-07, "logits/chosen": 1.05859375, "logits/rejected": 1.0966796875, "logps/chosen": -32.6875, "logps/rejected": -48.96875, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": 0.135986328125, "rewards/margins": -0.00933837890625, "rewards/rejected": 0.145263671875, "step": 689 }, { "epoch": 0.5111111111111111, "grad_norm": 1.4770156145095825, "learning_rate": 4.888888888888889e-07, "logits/chosen": 1.4609375, "logits/rejected": 1.333984375, "logps/chosen": -28.828125, "logps/rejected": -57.09375, "loss": 0.6782, "rewards/accuracies": 0.25, "rewards/chosen": 0.140625, "rewards/margins": 0.053192138671875, "rewards/rejected": 0.08746337890625, "step": 690 }, { "epoch": 0.5118518518518519, "grad_norm": 1.7195093631744385, "learning_rate": 4.881481481481482e-07, "logits/chosen": 1.7236328125, "logits/rejected": 1.3359375, "logps/chosen": -33.65625, "logps/rejected": -39.53125, "loss": 0.6528, "rewards/accuracies": 1.0, "rewards/chosen": -0.022674560546875, "rewards/margins": 0.0843505859375, "rewards/rejected": -0.1070556640625, "step": 691 }, { "epoch": 0.5125925925925926, "grad_norm": 1.792357087135315, "learning_rate": 4.874074074074073e-07, "logits/chosen": 1.4287109375, "logits/rejected": 2.126953125, "logps/chosen": -31.78125, "logps/rejected": -24.359375, "loss": 0.7881, "rewards/accuracies": 0.5, "rewards/chosen": 0.043182373046875, "rewards/margins": -0.151611328125, "rewards/rejected": 0.1947021484375, "step": 692 }, { "epoch": 0.5133333333333333, "grad_norm": 1.3114339113235474, "learning_rate": 4.866666666666666e-07, "logits/chosen": 1.9345703125, "logits/rejected": 1.90625, "logps/chosen": -21.96875, "logps/rejected": -32.625, "loss": 0.7358, "rewards/accuracies": 0.5, "rewards/chosen": -0.068359375, "rewards/margins": -0.057830810546875, "rewards/rejected": -0.0105133056640625, "step": 693 }, { "epoch": 0.5140740740740741, "grad_norm": 7.353246212005615, "learning_rate": 4.859259259259259e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.4619140625, "logps/chosen": -46.75, "logps/rejected": -77.5625, "loss": 0.7427, "rewards/accuracies": 0.75, "rewards/chosen": 0.2008056640625, "rewards/margins": 0.010986328125, "rewards/rejected": 0.1898193359375, "step": 694 }, { "epoch": 0.5148148148148148, "grad_norm": 1.8378249406814575, "learning_rate": 4.851851851851852e-07, "logits/chosen": 1.87109375, "logits/rejected": 1.314453125, "logps/chosen": -31.90625, "logps/rejected": -34.21875, "loss": 0.7539, "rewards/accuracies": 0.25, "rewards/chosen": -0.132080078125, "rewards/margins": -0.1129150390625, "rewards/rejected": -0.019134521484375, "step": 695 }, { "epoch": 0.5155555555555555, "grad_norm": 2.1003427505493164, "learning_rate": 4.844444444444445e-07, "logits/chosen": 1.6337890625, "logits/rejected": 1.375, "logps/chosen": -36.375, "logps/rejected": -49.3125, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": 0.0105438232421875, "rewards/margins": -0.10223388671875, "rewards/rejected": 0.11279296875, "step": 696 }, { "epoch": 0.5162962962962963, "grad_norm": 1.8160394430160522, "learning_rate": 4.837037037037037e-07, "logits/chosen": 1.767578125, "logits/rejected": 1.2919921875, "logps/chosen": -35.15625, "logps/rejected": -46.5625, "loss": 0.6572, "rewards/accuracies": 1.0, "rewards/chosen": 0.1356201171875, "rewards/margins": 0.073486328125, "rewards/rejected": 0.062103271484375, "step": 697 }, { "epoch": 0.5170370370370371, "grad_norm": 2.717268943786621, "learning_rate": 4.829629629629629e-07, "logits/chosen": 1.5341796875, "logits/rejected": 1.4921875, "logps/chosen": -26.5625, "logps/rejected": -79.9375, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": 0.1568603515625, "rewards/margins": 0.158935546875, "rewards/rejected": -0.001983642578125, "step": 698 }, { "epoch": 0.5177777777777778, "grad_norm": 1.6106252670288086, "learning_rate": 4.822222222222222e-07, "logits/chosen": 1.8486328125, "logits/rejected": 1.5166015625, "logps/chosen": -36.78125, "logps/rejected": -62.5, "loss": 0.4939, "rewards/accuracies": 1.0, "rewards/chosen": 0.48681640625, "rewards/margins": 0.467529296875, "rewards/rejected": 0.0191497802734375, "step": 699 }, { "epoch": 0.5185185185185185, "grad_norm": 1.3885531425476074, "learning_rate": 4.814814814814814e-07, "logits/chosen": 2.001953125, "logits/rejected": 1.3408203125, "logps/chosen": -31.8125, "logps/rejected": -80.0625, "loss": 0.5654, "rewards/accuracies": 0.5, "rewards/chosen": 0.158203125, "rewards/margins": 0.449951171875, "rewards/rejected": -0.291748046875, "step": 700 }, { "epoch": 0.5192592592592593, "grad_norm": 2.0945327281951904, "learning_rate": 4.807407407407407e-07, "logits/chosen": 1.0400390625, "logits/rejected": 2.037109375, "logps/chosen": -27.28125, "logps/rejected": -34.34375, "loss": 0.7949, "rewards/accuracies": 0.25, "rewards/chosen": -0.021881103515625, "rewards/margins": -0.18115234375, "rewards/rejected": 0.1593017578125, "step": 701 }, { "epoch": 0.52, "grad_norm": 1.6657545566558838, "learning_rate": 4.8e-07, "logits/chosen": 1.759765625, "logits/rejected": 1.626953125, "logps/chosen": -22.375, "logps/rejected": -30.765625, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": -0.10272216796875, "rewards/margins": 0.0445556640625, "rewards/rejected": -0.1473388671875, "step": 702 }, { "epoch": 0.5207407407407407, "grad_norm": 2.392091989517212, "learning_rate": 4.792592592592592e-07, "logits/chosen": 1.107421875, "logits/rejected": 1.658203125, "logps/chosen": -34.65625, "logps/rejected": -51.875, "loss": 0.8081, "rewards/accuracies": 0.5, "rewards/chosen": -0.2421875, "rewards/margins": -0.20263671875, "rewards/rejected": -0.039459228515625, "step": 703 }, { "epoch": 0.5214814814814814, "grad_norm": 1.5902702808380127, "learning_rate": 4.785185185185185e-07, "logits/chosen": 1.96484375, "logits/rejected": 1.3291015625, "logps/chosen": -34.6875, "logps/rejected": -43.90625, "loss": 0.7593, "rewards/accuracies": 0.5, "rewards/chosen": -0.021881103515625, "rewards/margins": -0.11676025390625, "rewards/rejected": 0.09490966796875, "step": 704 }, { "epoch": 0.5222222222222223, "grad_norm": 2.696850061416626, "learning_rate": 4.777777777777778e-07, "logits/chosen": 1.76953125, "logits/rejected": 1.5283203125, "logps/chosen": -32.5, "logps/rejected": -72.875, "loss": 0.7612, "rewards/accuracies": 0.25, "rewards/chosen": 0.021087646484375, "rewards/margins": -0.125, "rewards/rejected": 0.1461181640625, "step": 705 }, { "epoch": 0.522962962962963, "grad_norm": 2.1329751014709473, "learning_rate": 4.77037037037037e-07, "logits/chosen": 1.8505859375, "logits/rejected": 2.294921875, "logps/chosen": -36.5, "logps/rejected": -42.40625, "loss": 0.667, "rewards/accuracies": 0.25, "rewards/chosen": 0.0099639892578125, "rewards/margins": 0.306396484375, "rewards/rejected": -0.29638671875, "step": 706 }, { "epoch": 0.5237037037037037, "grad_norm": 1.3549343347549438, "learning_rate": 4.7629629629629626e-07, "logits/chosen": 1.3544921875, "logits/rejected": 1.0771484375, "logps/chosen": -30.75, "logps/rejected": -23.640625, "loss": 0.7563, "rewards/accuracies": 0.5, "rewards/chosen": -0.058807373046875, "rewards/margins": -0.115478515625, "rewards/rejected": 0.056640625, "step": 707 }, { "epoch": 0.5244444444444445, "grad_norm": 1.9516210556030273, "learning_rate": 4.7555555555555554e-07, "logits/chosen": 1.6689453125, "logits/rejected": 1.830078125, "logps/chosen": -36.96875, "logps/rejected": -60.4375, "loss": 0.6201, "rewards/accuracies": 1.0, "rewards/chosen": 0.07379150390625, "rewards/margins": 0.153076171875, "rewards/rejected": -0.07928466796875, "step": 708 }, { "epoch": 0.5251851851851852, "grad_norm": 4.594093322753906, "learning_rate": 4.7481481481481477e-07, "logits/chosen": 2.24609375, "logits/rejected": 1.958984375, "logps/chosen": -41.75, "logps/rejected": -41.25, "loss": 1.4521, "rewards/accuracies": 0.0, "rewards/chosen": -0.8984375, "rewards/margins": -1.099609375, "rewards/rejected": 0.2008056640625, "step": 709 }, { "epoch": 0.5259259259259259, "grad_norm": 3.4787302017211914, "learning_rate": 4.7407407407407405e-07, "logits/chosen": 1.35546875, "logits/rejected": 1.5908203125, "logps/chosen": -37.28125, "logps/rejected": -82.0625, "loss": 0.813, "rewards/accuracies": 0.5, "rewards/chosen": 0.0211029052734375, "rewards/margins": -0.192138671875, "rewards/rejected": 0.2132568359375, "step": 710 }, { "epoch": 0.5266666666666666, "grad_norm": 2.629810094833374, "learning_rate": 4.733333333333333e-07, "logits/chosen": 1.6845703125, "logits/rejected": 1.6328125, "logps/chosen": -47.25, "logps/rejected": -31.703125, "loss": 1.1543, "rewards/accuracies": 0.25, "rewards/chosen": -0.5703125, "rewards/margins": -0.6865234375, "rewards/rejected": 0.11639404296875, "step": 711 }, { "epoch": 0.5274074074074074, "grad_norm": 1.8143720626831055, "learning_rate": 4.725925925925926e-07, "logits/chosen": 1.8876953125, "logits/rejected": 1.6171875, "logps/chosen": -29.484375, "logps/rejected": -36.875, "loss": 0.8833, "rewards/accuracies": 0.0, "rewards/chosen": -0.2081298828125, "rewards/margins": -0.343017578125, "rewards/rejected": 0.134765625, "step": 712 }, { "epoch": 0.5281481481481481, "grad_norm": 1.8604735136032104, "learning_rate": 4.7185185185185185e-07, "logits/chosen": 1.845703125, "logits/rejected": 2.529296875, "logps/chosen": -25.484375, "logps/rejected": -25.84375, "loss": 0.7993, "rewards/accuracies": 0.25, "rewards/chosen": -0.072265625, "rewards/margins": -0.1917724609375, "rewards/rejected": 0.1195068359375, "step": 713 }, { "epoch": 0.5288888888888889, "grad_norm": 2.163407802581787, "learning_rate": 4.711111111111111e-07, "logits/chosen": 1.1689453125, "logits/rejected": 1.1220703125, "logps/chosen": -27.4375, "logps/rejected": -28.28125, "loss": 0.7314, "rewards/accuracies": 0.25, "rewards/chosen": 0.00390625, "rewards/margins": -0.0628662109375, "rewards/rejected": 0.0667724609375, "step": 714 }, { "epoch": 0.5296296296296297, "grad_norm": 2.911320447921753, "learning_rate": 4.7037037037037036e-07, "logits/chosen": 1.8193359375, "logits/rejected": 1.9853515625, "logps/chosen": -33.6875, "logps/rejected": -49.375, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": -0.06756591796875, "rewards/margins": 0.0234222412109375, "rewards/rejected": -0.09100341796875, "step": 715 }, { "epoch": 0.5303703703703704, "grad_norm": 1.567124843597412, "learning_rate": 4.696296296296296e-07, "logits/chosen": 2.291015625, "logits/rejected": 1.93359375, "logps/chosen": -20.84375, "logps/rejected": -21.8125, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": 0.0230560302734375, "rewards/margins": 0.10101318359375, "rewards/rejected": -0.07794189453125, "step": 716 }, { "epoch": 0.5311111111111111, "grad_norm": 1.282373070716858, "learning_rate": 4.6888888888888887e-07, "logits/chosen": 1.4716796875, "logits/rejected": 1.2646484375, "logps/chosen": -31.09375, "logps/rejected": -24.84375, "loss": 0.5186, "rewards/accuracies": 0.5, "rewards/chosen": 0.64794921875, "rewards/margins": 0.71435546875, "rewards/rejected": -0.06622314453125, "step": 717 }, { "epoch": 0.5318518518518518, "grad_norm": 1.4779692888259888, "learning_rate": 4.681481481481481e-07, "logits/chosen": 1.576171875, "logits/rejected": 1.2158203125, "logps/chosen": -24.796875, "logps/rejected": -48.84375, "loss": 0.6406, "rewards/accuracies": 0.75, "rewards/chosen": 0.0445556640625, "rewards/margins": 0.18212890625, "rewards/rejected": -0.1375732421875, "step": 718 }, { "epoch": 0.5325925925925926, "grad_norm": 1.7630385160446167, "learning_rate": 4.674074074074074e-07, "logits/chosen": 1.76953125, "logits/rejected": 1.7861328125, "logps/chosen": -26.71875, "logps/rejected": -38.1875, "loss": 0.666, "rewards/accuracies": 0.5, "rewards/chosen": -0.01953125, "rewards/margins": 0.061309814453125, "rewards/rejected": -0.08087158203125, "step": 719 }, { "epoch": 0.5333333333333333, "grad_norm": 1.7927309274673462, "learning_rate": 4.6666666666666666e-07, "logits/chosen": 1.1630859375, "logits/rejected": 1.728515625, "logps/chosen": -28.625, "logps/rejected": -60.625, "loss": 0.7852, "rewards/accuracies": 0.5, "rewards/chosen": -0.06781005859375, "rewards/margins": -0.10223388671875, "rewards/rejected": 0.034423828125, "step": 720 }, { "epoch": 0.534074074074074, "grad_norm": 2.059576988220215, "learning_rate": 4.659259259259259e-07, "logits/chosen": 1.509765625, "logits/rejected": 1.880859375, "logps/chosen": -26.328125, "logps/rejected": -47.53125, "loss": 0.7729, "rewards/accuracies": 0.0, "rewards/chosen": -0.01444244384765625, "rewards/margins": -0.150390625, "rewards/rejected": 0.135986328125, "step": 721 }, { "epoch": 0.5348148148148149, "grad_norm": 1.6466984748840332, "learning_rate": 4.651851851851852e-07, "logits/chosen": 1.5458984375, "logits/rejected": 1.609375, "logps/chosen": -29.96875, "logps/rejected": -26.421875, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": -0.12066650390625, "rewards/margins": -0.0933837890625, "rewards/rejected": -0.02734375, "step": 722 }, { "epoch": 0.5355555555555556, "grad_norm": 2.6911230087280273, "learning_rate": 4.644444444444444e-07, "logits/chosen": 2.09375, "logits/rejected": 1.57421875, "logps/chosen": -32.09375, "logps/rejected": -28.4375, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 0.01326751708984375, "rewards/margins": -0.038299560546875, "rewards/rejected": 0.05157470703125, "step": 723 }, { "epoch": 0.5362962962962963, "grad_norm": 2.60774564743042, "learning_rate": 4.637037037037037e-07, "logits/chosen": 1.73828125, "logits/rejected": 1.8349609375, "logps/chosen": -40.625, "logps/rejected": -84.0625, "loss": 0.7524, "rewards/accuracies": 0.5, "rewards/chosen": -0.0816650390625, "rewards/margins": -0.027740478515625, "rewards/rejected": -0.053863525390625, "step": 724 }, { "epoch": 0.5370370370370371, "grad_norm": 5.085193634033203, "learning_rate": 4.6296296296296297e-07, "logits/chosen": 1.68359375, "logits/rejected": 1.884765625, "logps/chosen": -34.96875, "logps/rejected": -57.0, "loss": 0.6787, "rewards/accuracies": 0.75, "rewards/chosen": 0.04296875, "rewards/margins": 0.0289306640625, "rewards/rejected": 0.0140380859375, "step": 725 }, { "epoch": 0.5377777777777778, "grad_norm": 1.8818252086639404, "learning_rate": 4.622222222222222e-07, "logits/chosen": 1.208984375, "logits/rejected": 1.7958984375, "logps/chosen": -31.40625, "logps/rejected": -34.03125, "loss": 0.6685, "rewards/accuracies": 0.25, "rewards/chosen": 0.428955078125, "rewards/margins": 0.2509765625, "rewards/rejected": 0.1781005859375, "step": 726 }, { "epoch": 0.5385185185185185, "grad_norm": 1.810715913772583, "learning_rate": 4.614814814814815e-07, "logits/chosen": 1.71484375, "logits/rejected": 1.5556640625, "logps/chosen": -23.515625, "logps/rejected": -41.6875, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": 0.04376220703125, "rewards/margins": 0.2020263671875, "rewards/rejected": -0.158203125, "step": 727 }, { "epoch": 0.5392592592592592, "grad_norm": 1.5404330492019653, "learning_rate": 4.607407407407407e-07, "logits/chosen": 1.1044921875, "logits/rejected": 1.302734375, "logps/chosen": -28.28125, "logps/rejected": -41.3125, "loss": 0.6445, "rewards/accuracies": 0.5, "rewards/chosen": 0.11444091796875, "rewards/margins": 0.133544921875, "rewards/rejected": -0.019134521484375, "step": 728 }, { "epoch": 0.54, "grad_norm": 2.3713488578796387, "learning_rate": 4.6e-07, "logits/chosen": 0.9619140625, "logits/rejected": 1.6591796875, "logps/chosen": -53.125, "logps/rejected": -41.21875, "loss": 0.8496, "rewards/accuracies": 0.5, "rewards/chosen": -0.1585693359375, "rewards/margins": -0.248779296875, "rewards/rejected": 0.09027099609375, "step": 729 }, { "epoch": 0.5407407407407407, "grad_norm": 2.097318410873413, "learning_rate": 4.592592592592592e-07, "logits/chosen": 1.564453125, "logits/rejected": 1.7451171875, "logps/chosen": -26.5625, "logps/rejected": -34.78125, "loss": 1.082, "rewards/accuracies": 0.5, "rewards/chosen": -0.0136566162109375, "rewards/margins": -0.5361328125, "rewards/rejected": 0.52294921875, "step": 730 }, { "epoch": 0.5414814814814815, "grad_norm": 2.0353000164031982, "learning_rate": 4.5851851851851845e-07, "logits/chosen": 1.9267578125, "logits/rejected": 1.5595703125, "logps/chosen": -33.15625, "logps/rejected": -36.59375, "loss": 0.7246, "rewards/accuracies": 0.25, "rewards/chosen": -0.057403564453125, "rewards/margins": -0.0482177734375, "rewards/rejected": -0.0092010498046875, "step": 731 }, { "epoch": 0.5422222222222223, "grad_norm": 3.595329999923706, "learning_rate": 4.577777777777778e-07, "logits/chosen": 1.9658203125, "logits/rejected": 2.0859375, "logps/chosen": -48.8125, "logps/rejected": -51.25, "loss": 0.6519, "rewards/accuracies": 0.5, "rewards/chosen": 0.02655029296875, "rewards/margins": 0.10003662109375, "rewards/rejected": -0.073486328125, "step": 732 }, { "epoch": 0.542962962962963, "grad_norm": 1.0476272106170654, "learning_rate": 4.57037037037037e-07, "logits/chosen": 1.3251953125, "logits/rejected": 1.7333984375, "logps/chosen": -39.625, "logps/rejected": -38.9375, "loss": 0.377, "rewards/accuracies": 0.75, "rewards/chosen": 1.1650390625, "rewards/margins": 1.189453125, "rewards/rejected": -0.0237884521484375, "step": 733 }, { "epoch": 0.5437037037037037, "grad_norm": 2.341029167175293, "learning_rate": 4.562962962962963e-07, "logits/chosen": 1.78515625, "logits/rejected": 1.404296875, "logps/chosen": -39.4375, "logps/rejected": -27.9375, "loss": 0.7441, "rewards/accuracies": 0.5, "rewards/chosen": 0.0347900390625, "rewards/margins": -0.06298828125, "rewards/rejected": 0.09783935546875, "step": 734 }, { "epoch": 0.5444444444444444, "grad_norm": 1.7097506523132324, "learning_rate": 4.555555555555555e-07, "logits/chosen": 1.28125, "logits/rejected": 1.35546875, "logps/chosen": -21.25, "logps/rejected": -32.1875, "loss": 0.7583, "rewards/accuracies": 0.5, "rewards/chosen": 0.01444244384765625, "rewards/margins": -0.11248779296875, "rewards/rejected": 0.126953125, "step": 735 }, { "epoch": 0.5451851851851852, "grad_norm": 2.278510093688965, "learning_rate": 4.548148148148148e-07, "logits/chosen": 1.306640625, "logits/rejected": 0.837890625, "logps/chosen": -36.96875, "logps/rejected": -22.578125, "loss": 0.7207, "rewards/accuracies": 0.25, "rewards/chosen": -0.04608154296875, "rewards/margins": -0.04217529296875, "rewards/rejected": -0.0039215087890625, "step": 736 }, { "epoch": 0.5459259259259259, "grad_norm": 2.8361587524414062, "learning_rate": 4.5407407407407403e-07, "logits/chosen": 1.587890625, "logits/rejected": 1.8681640625, "logps/chosen": -32.625, "logps/rejected": -90.5, "loss": 0.6797, "rewards/accuracies": 0.5, "rewards/chosen": 0.08734130859375, "rewards/margins": 0.0584716796875, "rewards/rejected": 0.0289154052734375, "step": 737 }, { "epoch": 0.5466666666666666, "grad_norm": 1.9620707035064697, "learning_rate": 4.5333333333333326e-07, "logits/chosen": 1.1103515625, "logits/rejected": 0.86279296875, "logps/chosen": -28.609375, "logps/rejected": -37.625, "loss": 0.791, "rewards/accuracies": 0.25, "rewards/chosen": -0.060546875, "rewards/margins": -0.1741943359375, "rewards/rejected": 0.11370849609375, "step": 738 }, { "epoch": 0.5474074074074075, "grad_norm": 2.704090118408203, "learning_rate": 4.525925925925926e-07, "logits/chosen": 1.3994140625, "logits/rejected": 0.80859375, "logps/chosen": -22.453125, "logps/rejected": -63.375, "loss": 0.6709, "rewards/accuracies": 0.25, "rewards/chosen": -0.0699462890625, "rewards/margins": 0.065185546875, "rewards/rejected": -0.1351318359375, "step": 739 }, { "epoch": 0.5481481481481482, "grad_norm": 3.249873638153076, "learning_rate": 4.5185185185185183e-07, "logits/chosen": 1.5751953125, "logits/rejected": 1.484375, "logps/chosen": -33.46875, "logps/rejected": -29.265625, "loss": 0.9023, "rewards/accuracies": 0.0, "rewards/chosen": -0.328369140625, "rewards/margins": -0.37744140625, "rewards/rejected": 0.049224853515625, "step": 740 }, { "epoch": 0.5488888888888889, "grad_norm": 2.4050920009613037, "learning_rate": 4.511111111111111e-07, "logits/chosen": 1.29296875, "logits/rejected": 1.908203125, "logps/chosen": -26.578125, "logps/rejected": -53.90625, "loss": 0.75, "rewards/accuracies": 0.25, "rewards/chosen": -0.09259033203125, "rewards/margins": -0.08837890625, "rewards/rejected": -0.0042877197265625, "step": 741 }, { "epoch": 0.5496296296296296, "grad_norm": 12.136543273925781, "learning_rate": 4.5037037037037034e-07, "logits/chosen": 1.08203125, "logits/rejected": 1.978515625, "logps/chosen": -28.390625, "logps/rejected": -41.1875, "loss": 0.6938, "rewards/accuracies": 0.25, "rewards/chosen": -0.08123779296875, "rewards/margins": 0.0097503662109375, "rewards/rejected": -0.09100341796875, "step": 742 }, { "epoch": 0.5503703703703704, "grad_norm": 2.5166399478912354, "learning_rate": 4.496296296296296e-07, "logits/chosen": 1.78125, "logits/rejected": 1.7421875, "logps/chosen": -44.6875, "logps/rejected": -64.4375, "loss": 0.7812, "rewards/accuracies": 0.75, "rewards/chosen": -0.1383056640625, "rewards/margins": -0.08502197265625, "rewards/rejected": -0.053192138671875, "step": 743 }, { "epoch": 0.5511111111111111, "grad_norm": 1.9228860139846802, "learning_rate": 4.4888888888888885e-07, "logits/chosen": 1.1630859375, "logits/rejected": 1.1240234375, "logps/chosen": -50.125, "logps/rejected": -37.09375, "loss": 0.7129, "rewards/accuracies": 0.25, "rewards/chosen": 0.119873046875, "rewards/margins": 0.046295166015625, "rewards/rejected": 0.0736083984375, "step": 744 }, { "epoch": 0.5518518518518518, "grad_norm": 1.5194240808486938, "learning_rate": 4.4814814814814813e-07, "logits/chosen": 1.7080078125, "logits/rejected": 1.3203125, "logps/chosen": -44.78125, "logps/rejected": -30.4375, "loss": 0.7383, "rewards/accuracies": 0.25, "rewards/chosen": -0.07086181640625, "rewards/margins": -0.07757568359375, "rewards/rejected": 0.0066375732421875, "step": 745 }, { "epoch": 0.5525925925925926, "grad_norm": 1.784528374671936, "learning_rate": 4.474074074074074e-07, "logits/chosen": 1.744140625, "logits/rejected": 2.3046875, "logps/chosen": -30.890625, "logps/rejected": -36.25, "loss": 0.7715, "rewards/accuracies": 0.25, "rewards/chosen": -0.091796875, "rewards/margins": -0.1448974609375, "rewards/rejected": 0.0531005859375, "step": 746 }, { "epoch": 0.5533333333333333, "grad_norm": 1.907842993736267, "learning_rate": 4.4666666666666664e-07, "logits/chosen": 1.0654296875, "logits/rejected": 1.2158203125, "logps/chosen": -35.78125, "logps/rejected": -38.1875, "loss": 0.8193, "rewards/accuracies": 0.5, "rewards/chosen": -0.051177978515625, "rewards/margins": -0.212890625, "rewards/rejected": 0.1617431640625, "step": 747 }, { "epoch": 0.554074074074074, "grad_norm": 1.90972101688385, "learning_rate": 4.459259259259259e-07, "logits/chosen": 1.4404296875, "logits/rejected": 1.6875, "logps/chosen": -32.625, "logps/rejected": -34.46875, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": 0.0804443359375, "rewards/margins": 0.050384521484375, "rewards/rejected": 0.03009033203125, "step": 748 }, { "epoch": 0.5548148148148148, "grad_norm": 2.06888484954834, "learning_rate": 4.4518518518518515e-07, "logits/chosen": 1.5341796875, "logits/rejected": 2.556640625, "logps/chosen": -38.15625, "logps/rejected": -57.4375, "loss": 0.7646, "rewards/accuracies": 0.0, "rewards/chosen": -0.1722412109375, "rewards/margins": -0.135986328125, "rewards/rejected": -0.036346435546875, "step": 749 }, { "epoch": 0.5555555555555556, "grad_norm": 1.8129616975784302, "learning_rate": 4.444444444444444e-07, "logits/chosen": 0.91015625, "logits/rejected": 1.349609375, "logps/chosen": -33.90625, "logps/rejected": -40.125, "loss": 0.8013, "rewards/accuracies": 0.5, "rewards/chosen": -0.1058349609375, "rewards/margins": -0.1875, "rewards/rejected": 0.0816650390625, "step": 750 }, { "epoch": 0.5562962962962963, "grad_norm": 2.3295557498931885, "learning_rate": 4.4370370370370367e-07, "logits/chosen": 1.48828125, "logits/rejected": 1.3798828125, "logps/chosen": -38.75, "logps/rejected": -56.375, "loss": 0.6641, "rewards/accuracies": 0.75, "rewards/chosen": 0.03167724609375, "rewards/margins": 0.064453125, "rewards/rejected": -0.0328369140625, "step": 751 }, { "epoch": 0.557037037037037, "grad_norm": 1.4498933553695679, "learning_rate": 4.4296296296296295e-07, "logits/chosen": 1.51953125, "logits/rejected": 1.59765625, "logps/chosen": -30.796875, "logps/rejected": -25.90625, "loss": 0.6733, "rewards/accuracies": 1.0, "rewards/chosen": -0.0273284912109375, "rewards/margins": 0.0406494140625, "rewards/rejected": -0.0679931640625, "step": 752 }, { "epoch": 0.5577777777777778, "grad_norm": 2.261164903640747, "learning_rate": 4.4222222222222223e-07, "logits/chosen": 1.255859375, "logits/rejected": 0.921875, "logps/chosen": -48.09375, "logps/rejected": -30.84375, "loss": 0.6279, "rewards/accuracies": 0.5, "rewards/chosen": 0.0992431640625, "rewards/margins": 0.145263671875, "rewards/rejected": -0.04608154296875, "step": 753 }, { "epoch": 0.5585185185185185, "grad_norm": 3.0137109756469727, "learning_rate": 4.4148148148148146e-07, "logits/chosen": 1.283203125, "logits/rejected": 1.12890625, "logps/chosen": -26.390625, "logps/rejected": -62.125, "loss": 0.8135, "rewards/accuracies": 0.25, "rewards/chosen": -0.058990478515625, "rewards/margins": -0.20458984375, "rewards/rejected": 0.145751953125, "step": 754 }, { "epoch": 0.5592592592592592, "grad_norm": 3.131995916366577, "learning_rate": 4.4074074074074074e-07, "logits/chosen": 1.8642578125, "logits/rejected": 1.83984375, "logps/chosen": -26.25, "logps/rejected": -63.5625, "loss": 0.8745, "rewards/accuracies": 0.25, "rewards/chosen": -0.033599853515625, "rewards/margins": -0.315185546875, "rewards/rejected": 0.28173828125, "step": 755 }, { "epoch": 0.56, "grad_norm": 1.5919482707977295, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 1.1806640625, "logits/rejected": 1.203125, "logps/chosen": -26.25, "logps/rejected": -40.90625, "loss": 0.6807, "rewards/accuracies": 0.75, "rewards/chosen": 0.00310516357421875, "rewards/margins": 0.03668212890625, "rewards/rejected": -0.0335693359375, "step": 756 }, { "epoch": 0.5607407407407408, "grad_norm": 3.325421094894409, "learning_rate": 4.392592592592592e-07, "logits/chosen": 2.025390625, "logits/rejected": 1.904296875, "logps/chosen": -52.0625, "logps/rejected": -34.71875, "loss": 0.8589, "rewards/accuracies": 0.25, "rewards/chosen": -0.26953125, "rewards/margins": -0.26513671875, "rewards/rejected": -0.00433349609375, "step": 757 }, { "epoch": 0.5614814814814815, "grad_norm": 1.8396071195602417, "learning_rate": 4.3851851851851853e-07, "logits/chosen": 0.9306640625, "logits/rejected": 1.28125, "logps/chosen": -22.46875, "logps/rejected": -45.90625, "loss": 0.7285, "rewards/accuracies": 0.5, "rewards/chosen": -0.01171875, "rewards/margins": -0.059051513671875, "rewards/rejected": 0.047271728515625, "step": 758 }, { "epoch": 0.5622222222222222, "grad_norm": 2.0081098079681396, "learning_rate": 4.3777777777777776e-07, "logits/chosen": 1.7236328125, "logits/rejected": 1.525390625, "logps/chosen": -43.125, "logps/rejected": -52.25, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.00742340087890625, "rewards/margins": 0.036346435546875, "rewards/rejected": -0.028900146484375, "step": 759 }, { "epoch": 0.562962962962963, "grad_norm": 2.288923978805542, "learning_rate": 4.3703703703703704e-07, "logits/chosen": 1.908203125, "logits/rejected": 1.6982421875, "logps/chosen": -37.625, "logps/rejected": -94.375, "loss": 0.4788, "rewards/accuracies": 0.75, "rewards/chosen": 0.0706787109375, "rewards/margins": 0.55029296875, "rewards/rejected": -0.479736328125, "step": 760 }, { "epoch": 0.5637037037037037, "grad_norm": 1.9725369215011597, "learning_rate": 4.362962962962963e-07, "logits/chosen": 1.5146484375, "logits/rejected": 1.916015625, "logps/chosen": -26.890625, "logps/rejected": -63.6875, "loss": 0.7471, "rewards/accuracies": 0.5, "rewards/chosen": 0.01287841796875, "rewards/margins": -0.072265625, "rewards/rejected": 0.085205078125, "step": 761 }, { "epoch": 0.5644444444444444, "grad_norm": 1.880658745765686, "learning_rate": 4.355555555555555e-07, "logits/chosen": 1.626953125, "logits/rejected": 1.953125, "logps/chosen": -26.171875, "logps/rejected": -53.0625, "loss": 0.7085, "rewards/accuracies": 0.75, "rewards/chosen": -0.003513336181640625, "rewards/margins": -0.0234375, "rewards/rejected": 0.01995849609375, "step": 762 }, { "epoch": 0.5651851851851852, "grad_norm": 1.5504729747772217, "learning_rate": 4.348148148148148e-07, "logits/chosen": 1.822265625, "logits/rejected": 1.7353515625, "logps/chosen": -26.84375, "logps/rejected": -34.40625, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": -0.0121002197265625, "rewards/margins": 0.02264404296875, "rewards/rejected": -0.034759521484375, "step": 763 }, { "epoch": 0.5659259259259259, "grad_norm": 2.5843794345855713, "learning_rate": 4.34074074074074e-07, "logits/chosen": 1.767578125, "logits/rejected": 1.578125, "logps/chosen": -29.59375, "logps/rejected": -47.65625, "loss": 0.7349, "rewards/accuracies": 0.5, "rewards/chosen": 0.106689453125, "rewards/margins": -0.0120849609375, "rewards/rejected": 0.11871337890625, "step": 764 }, { "epoch": 0.5666666666666667, "grad_norm": 1.7402193546295166, "learning_rate": 4.3333333333333335e-07, "logits/chosen": 1.6103515625, "logits/rejected": 1.6826171875, "logps/chosen": -38.3125, "logps/rejected": -40.5, "loss": 0.6362, "rewards/accuracies": 1.0, "rewards/chosen": 0.06134033203125, "rewards/margins": 0.119140625, "rewards/rejected": -0.05780029296875, "step": 765 }, { "epoch": 0.5674074074074074, "grad_norm": 2.6318798065185547, "learning_rate": 4.325925925925926e-07, "logits/chosen": 1.669921875, "logits/rejected": 1.6083984375, "logps/chosen": -23.296875, "logps/rejected": -54.78125, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": 0.178955078125, "rewards/margins": -0.1171875, "rewards/rejected": 0.296142578125, "step": 766 }, { "epoch": 0.5681481481481482, "grad_norm": 1.859459400177002, "learning_rate": 4.3185185185185186e-07, "logits/chosen": 1.513671875, "logits/rejected": 1.458984375, "logps/chosen": -42.6875, "logps/rejected": -30.6875, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.0012054443359375, "rewards/margins": 0.01220703125, "rewards/rejected": -0.01093292236328125, "step": 767 }, { "epoch": 0.5688888888888889, "grad_norm": 1.9788113832473755, "learning_rate": 4.311111111111111e-07, "logits/chosen": 1.3681640625, "logits/rejected": 2.046875, "logps/chosen": -30.78125, "logps/rejected": -37.4375, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": -0.043304443359375, "rewards/margins": -0.00560760498046875, "rewards/rejected": -0.037689208984375, "step": 768 }, { "epoch": 0.5696296296296296, "grad_norm": 2.2392213344573975, "learning_rate": 4.303703703703703e-07, "logits/chosen": 1.3203125, "logits/rejected": 1.822265625, "logps/chosen": -61.15625, "logps/rejected": -44.6875, "loss": 0.7129, "rewards/accuracies": 0.25, "rewards/chosen": -0.0875244140625, "rewards/margins": -0.030487060546875, "rewards/rejected": -0.0570068359375, "step": 769 }, { "epoch": 0.5703703703703704, "grad_norm": 2.0623843669891357, "learning_rate": 4.296296296296296e-07, "logits/chosen": 2.623046875, "logits/rejected": 1.9716796875, "logps/chosen": -34.25, "logps/rejected": -54.5, "loss": 0.7363, "rewards/accuracies": 0.5, "rewards/chosen": -0.1656494140625, "rewards/margins": -0.029327392578125, "rewards/rejected": -0.13623046875, "step": 770 }, { "epoch": 0.5711111111111111, "grad_norm": 1.829289197921753, "learning_rate": 4.2888888888888883e-07, "logits/chosen": 1.9228515625, "logits/rejected": 1.6142578125, "logps/chosen": -32.09375, "logps/rejected": -35.9375, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.0031223297119140625, "rewards/margins": 0.001556396484375, "rewards/rejected": -0.004669189453125, "step": 771 }, { "epoch": 0.5718518518518518, "grad_norm": 2.1200177669525146, "learning_rate": 4.2814814814814816e-07, "logits/chosen": 1.7353515625, "logits/rejected": 1.8583984375, "logps/chosen": -33.5, "logps/rejected": -90.0625, "loss": 0.6875, "rewards/accuracies": 0.25, "rewards/chosen": -0.001556396484375, "rewards/margins": 0.030426025390625, "rewards/rejected": -0.03204345703125, "step": 772 }, { "epoch": 0.5725925925925925, "grad_norm": 1.9873347282409668, "learning_rate": 4.274074074074074e-07, "logits/chosen": 2.009765625, "logits/rejected": 1.9951171875, "logps/chosen": -31.21875, "logps/rejected": -53.8125, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.006267547607421875, "rewards/margins": 0.03399658203125, "rewards/rejected": -0.027740478515625, "step": 773 }, { "epoch": 0.5733333333333334, "grad_norm": 1.3704866170883179, "learning_rate": 4.266666666666667e-07, "logits/chosen": 0.82958984375, "logits/rejected": 2.392578125, "logps/chosen": -29.21875, "logps/rejected": -78.4375, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/chosen": -0.002716064453125, "rewards/margins": 1.3017578125, "rewards/rejected": -1.3046875, "step": 774 }, { "epoch": 0.5740740740740741, "grad_norm": 2.007692337036133, "learning_rate": 4.259259259259259e-07, "logits/chosen": 1.814453125, "logits/rejected": 1.521484375, "logps/chosen": -27.0625, "logps/rejected": -79.625, "loss": 0.6396, "rewards/accuracies": 0.75, "rewards/chosen": 0.07598876953125, "rewards/margins": 0.13623046875, "rewards/rejected": -0.0601806640625, "step": 775 }, { "epoch": 0.5748148148148148, "grad_norm": 1.9734653234481812, "learning_rate": 4.2518518518518513e-07, "logits/chosen": 1.0908203125, "logits/rejected": 1.5634765625, "logps/chosen": -29.515625, "logps/rejected": -66.125, "loss": 0.7842, "rewards/accuracies": 0.25, "rewards/chosen": -0.02423095703125, "rewards/margins": -0.150390625, "rewards/rejected": 0.126220703125, "step": 776 }, { "epoch": 0.5755555555555556, "grad_norm": 2.516965866088867, "learning_rate": 4.244444444444444e-07, "logits/chosen": 1.6171875, "logits/rejected": 1.07421875, "logps/chosen": -49.28125, "logps/rejected": -47.625, "loss": 0.752, "rewards/accuracies": 0.0, "rewards/chosen": -0.055877685546875, "rewards/margins": -0.1129150390625, "rewards/rejected": 0.05706787109375, "step": 777 }, { "epoch": 0.5762962962962963, "grad_norm": 1.5714045763015747, "learning_rate": 4.237037037037037e-07, "logits/chosen": 1.0234375, "logits/rejected": 1.36328125, "logps/chosen": -34.59375, "logps/rejected": -29.96875, "loss": 0.8037, "rewards/accuracies": 0.25, "rewards/chosen": 0.201171875, "rewards/margins": -0.1981201171875, "rewards/rejected": 0.399169921875, "step": 778 }, { "epoch": 0.577037037037037, "grad_norm": 1.7392443418502808, "learning_rate": 4.22962962962963e-07, "logits/chosen": 1.599609375, "logits/rejected": 1.26171875, "logps/chosen": -39.1875, "logps/rejected": -47.65625, "loss": 0.623, "rewards/accuracies": 0.5, "rewards/chosen": 0.0390625, "rewards/margins": 0.218994140625, "rewards/rejected": -0.179931640625, "step": 779 }, { "epoch": 0.5777777777777777, "grad_norm": 1.941686987876892, "learning_rate": 4.222222222222222e-07, "logits/chosen": 1.796875, "logits/rejected": 1.1142578125, "logps/chosen": -47.15625, "logps/rejected": -44.34375, "loss": 0.7549, "rewards/accuracies": 0.0, "rewards/chosen": -0.1195068359375, "rewards/margins": -0.11248779296875, "rewards/rejected": -0.00705718994140625, "step": 780 }, { "epoch": 0.5785185185185185, "grad_norm": 2.1002354621887207, "learning_rate": 4.2148148148148144e-07, "logits/chosen": 1.4541015625, "logits/rejected": 1.89453125, "logps/chosen": -29.625, "logps/rejected": -42.78125, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": 0.07415771484375, "rewards/margins": -0.00278472900390625, "rewards/rejected": 0.07696533203125, "step": 781 }, { "epoch": 0.5792592592592593, "grad_norm": 2.8718326091766357, "learning_rate": 4.207407407407407e-07, "logits/chosen": 2.03125, "logits/rejected": 1.75, "logps/chosen": -45.53125, "logps/rejected": -71.75, "loss": 0.7305, "rewards/accuracies": 0.25, "rewards/chosen": 0.080078125, "rewards/margins": -0.011322021484375, "rewards/rejected": 0.0914306640625, "step": 782 }, { "epoch": 0.58, "grad_norm": 1.6617463827133179, "learning_rate": 4.1999999999999995e-07, "logits/chosen": 1.7080078125, "logits/rejected": 1.849609375, "logps/chosen": -31.03125, "logps/rejected": -37.0625, "loss": 0.6675, "rewards/accuracies": 0.5, "rewards/chosen": -0.0204925537109375, "rewards/margins": 0.058807373046875, "rewards/rejected": -0.07928466796875, "step": 783 }, { "epoch": 0.5807407407407408, "grad_norm": 1.9902186393737793, "learning_rate": 4.1925925925925923e-07, "logits/chosen": 2.142578125, "logits/rejected": 1.50390625, "logps/chosen": -30.453125, "logps/rejected": -36.875, "loss": 0.6343, "rewards/accuracies": 0.5, "rewards/chosen": 0.00588226318359375, "rewards/margins": 0.140625, "rewards/rejected": -0.134765625, "step": 784 }, { "epoch": 0.5814814814814815, "grad_norm": 2.9434404373168945, "learning_rate": 4.185185185185185e-07, "logits/chosen": 2.033203125, "logits/rejected": 1.9326171875, "logps/chosen": -71.125, "logps/rejected": -48.6875, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": 0.055755615234375, "rewards/margins": 0.05108642578125, "rewards/rejected": 0.00467681884765625, "step": 785 }, { "epoch": 0.5822222222222222, "grad_norm": 1.3971874713897705, "learning_rate": 4.177777777777778e-07, "logits/chosen": 1.845703125, "logits/rejected": 1.6650390625, "logps/chosen": -25.0625, "logps/rejected": -38.65625, "loss": 0.6841, "rewards/accuracies": 0.75, "rewards/chosen": 0.039459228515625, "rewards/margins": 0.020721435546875, "rewards/rejected": 0.01873779296875, "step": 786 }, { "epoch": 0.582962962962963, "grad_norm": 2.4093685150146484, "learning_rate": 4.17037037037037e-07, "logits/chosen": 0.935546875, "logits/rejected": 1.7021484375, "logps/chosen": -38.875, "logps/rejected": -33.6875, "loss": 1.5488, "rewards/accuracies": 0.25, "rewards/chosen": 0.08868408203125, "rewards/margins": -1.03125, "rewards/rejected": 1.1201171875, "step": 787 }, { "epoch": 0.5837037037037037, "grad_norm": 1.8859354257583618, "learning_rate": 4.1629629629629625e-07, "logits/chosen": 1.8388671875, "logits/rejected": 1.3935546875, "logps/chosen": -33.46875, "logps/rejected": -56.8125, "loss": 0.6338, "rewards/accuracies": 0.25, "rewards/chosen": 1.1162109375, "rewards/margins": 0.921875, "rewards/rejected": 0.194580078125, "step": 788 }, { "epoch": 0.5844444444444444, "grad_norm": 1.510131597518921, "learning_rate": 4.1555555555555554e-07, "logits/chosen": 1.6884765625, "logits/rejected": 1.3056640625, "logps/chosen": -21.90625, "logps/rejected": -29.234375, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 0.1239013671875, "rewards/margins": 0.0733642578125, "rewards/rejected": 0.05059814453125, "step": 789 }, { "epoch": 0.5851851851851851, "grad_norm": 2.02984356880188, "learning_rate": 4.1481481481481476e-07, "logits/chosen": 2.005859375, "logits/rejected": 2.48046875, "logps/chosen": -33.65625, "logps/rejected": -43.6875, "loss": 0.6396, "rewards/accuracies": 0.75, "rewards/chosen": 0.028106689453125, "rewards/margins": 0.11407470703125, "rewards/rejected": -0.0859375, "step": 790 }, { "epoch": 0.585925925925926, "grad_norm": 1.617429256439209, "learning_rate": 4.140740740740741e-07, "logits/chosen": 1.287109375, "logits/rejected": 1.04296875, "logps/chosen": -31.1875, "logps/rejected": -29.3125, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": -0.0229949951171875, "rewards/margins": -0.03277587890625, "rewards/rejected": 0.0097503662109375, "step": 791 }, { "epoch": 0.5866666666666667, "grad_norm": 1.8557548522949219, "learning_rate": 4.1333333333333333e-07, "logits/chosen": 1.560546875, "logits/rejected": 1.42578125, "logps/chosen": -39.0, "logps/rejected": -35.8125, "loss": 0.5767, "rewards/accuracies": 0.75, "rewards/chosen": 0.317138671875, "rewards/margins": 0.341064453125, "rewards/rejected": -0.023834228515625, "step": 792 }, { "epoch": 0.5874074074074074, "grad_norm": 1.483217716217041, "learning_rate": 4.1259259259259256e-07, "logits/chosen": 1.42578125, "logits/rejected": 1.2587890625, "logps/chosen": -19.265625, "logps/rejected": -25.0, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.12249755859375, "rewards/margins": 0.1025390625, "rewards/rejected": 0.0199432373046875, "step": 793 }, { "epoch": 0.5881481481481482, "grad_norm": 1.8026537895202637, "learning_rate": 4.1185185185185184e-07, "logits/chosen": 2.017578125, "logits/rejected": 2.330078125, "logps/chosen": -20.59375, "logps/rejected": -37.03125, "loss": 0.6128, "rewards/accuracies": 0.5, "rewards/chosen": -0.039794921875, "rewards/margins": 0.2296142578125, "rewards/rejected": -0.26953125, "step": 794 }, { "epoch": 0.5888888888888889, "grad_norm": 2.006551504135132, "learning_rate": 4.1111111111111107e-07, "logits/chosen": 1.080078125, "logits/rejected": 1.4912109375, "logps/chosen": -39.15625, "logps/rejected": -40.375, "loss": 0.7969, "rewards/accuracies": 0.25, "rewards/chosen": -0.132080078125, "rewards/margins": -0.173095703125, "rewards/rejected": 0.041015625, "step": 795 }, { "epoch": 0.5896296296296296, "grad_norm": 3.910236358642578, "learning_rate": 4.1037037037037035e-07, "logits/chosen": 0.9755859375, "logits/rejected": 1.1064453125, "logps/chosen": -33.21875, "logps/rejected": -40.75, "loss": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": 0.1090087890625, "rewards/margins": 0.1578369140625, "rewards/rejected": -0.048797607421875, "step": 796 }, { "epoch": 0.5903703703703703, "grad_norm": 1.0489825010299683, "learning_rate": 4.096296296296296e-07, "logits/chosen": 0.98291015625, "logits/rejected": 1.080078125, "logps/chosen": -21.625, "logps/rejected": -24.984375, "loss": 0.667, "rewards/accuracies": 0.5, "rewards/chosen": 0.20849609375, "rewards/margins": 0.0721435546875, "rewards/rejected": 0.1363525390625, "step": 797 }, { "epoch": 0.5911111111111111, "grad_norm": 1.3622843027114868, "learning_rate": 4.088888888888889e-07, "logits/chosen": 1.236328125, "logits/rejected": 1.3740234375, "logps/chosen": -42.34375, "logps/rejected": -58.15625, "loss": 0.5225, "rewards/accuracies": 0.5, "rewards/chosen": 2.75390625, "rewards/margins": 2.841796875, "rewards/rejected": -0.087890625, "step": 798 }, { "epoch": 0.5918518518518519, "grad_norm": 1.6424837112426758, "learning_rate": 4.0814814814814814e-07, "logits/chosen": 1.1416015625, "logits/rejected": 1.6220703125, "logps/chosen": -25.0, "logps/rejected": -30.671875, "loss": 0.8232, "rewards/accuracies": 0.25, "rewards/chosen": -0.06817626953125, "rewards/margins": -0.23388671875, "rewards/rejected": 0.1656494140625, "step": 799 }, { "epoch": 0.5925925925925926, "grad_norm": 1.6052502393722534, "learning_rate": 4.0740740740740737e-07, "logits/chosen": 2.009765625, "logits/rejected": 2.05859375, "logps/chosen": -31.59375, "logps/rejected": -55.3125, "loss": 0.7012, "rewards/accuracies": 0.75, "rewards/chosen": 0.1195068359375, "rewards/margins": -0.0020294189453125, "rewards/rejected": 0.12152099609375, "step": 800 }, { "epoch": 0.5933333333333334, "grad_norm": 2.5745530128479004, "learning_rate": 4.0666666666666666e-07, "logits/chosen": 1.861328125, "logits/rejected": 1.625, "logps/chosen": -37.5625, "logps/rejected": -52.15625, "loss": 0.7666, "rewards/accuracies": 0.25, "rewards/chosen": -0.0679931640625, "rewards/margins": -0.13671875, "rewards/rejected": 0.0687255859375, "step": 801 }, { "epoch": 0.5940740740740741, "grad_norm": 2.983015537261963, "learning_rate": 4.059259259259259e-07, "logits/chosen": 1.1611328125, "logits/rejected": 1.7568359375, "logps/chosen": -54.84375, "logps/rejected": -65.75, "loss": 0.8037, "rewards/accuracies": 0.75, "rewards/chosen": -0.1727294921875, "rewards/margins": -0.1173095703125, "rewards/rejected": -0.055450439453125, "step": 802 }, { "epoch": 0.5948148148148148, "grad_norm": 1.793632984161377, "learning_rate": 4.0518518518518517e-07, "logits/chosen": 1.08203125, "logits/rejected": 1.412109375, "logps/chosen": -30.125, "logps/rejected": -29.3125, "loss": 0.7109, "rewards/accuracies": 0.5, "rewards/chosen": -0.1441650390625, "rewards/margins": -0.0289306640625, "rewards/rejected": -0.115234375, "step": 803 }, { "epoch": 0.5955555555555555, "grad_norm": 2.5279645919799805, "learning_rate": 4.044444444444444e-07, "logits/chosen": 0.71533203125, "logits/rejected": 1.283203125, "logps/chosen": -21.859375, "logps/rejected": -45.84375, "loss": 0.4839, "rewards/accuracies": 1.0, "rewards/chosen": 0.274169921875, "rewards/margins": 0.49072265625, "rewards/rejected": -0.2164306640625, "step": 804 }, { "epoch": 0.5962962962962963, "grad_norm": 1.9574904441833496, "learning_rate": 4.0370370370370373e-07, "logits/chosen": 1.4541015625, "logits/rejected": 1.28515625, "logps/chosen": -35.5, "logps/rejected": -33.3125, "loss": 0.7158, "rewards/accuracies": 0.25, "rewards/chosen": -0.03948974609375, "rewards/margins": -0.03790283203125, "rewards/rejected": -0.0015583038330078125, "step": 805 }, { "epoch": 0.597037037037037, "grad_norm": 2.124068021774292, "learning_rate": 4.0296296296296296e-07, "logits/chosen": 2.296875, "logits/rejected": 1.212890625, "logps/chosen": -35.59375, "logps/rejected": -29.28125, "loss": 1.1396, "rewards/accuracies": 0.25, "rewards/chosen": -0.56494140625, "rewards/margins": -0.673828125, "rewards/rejected": 0.10894775390625, "step": 806 }, { "epoch": 0.5977777777777777, "grad_norm": 1.662988543510437, "learning_rate": 4.022222222222222e-07, "logits/chosen": 2.064453125, "logits/rejected": 2.4609375, "logps/chosen": -55.4375, "logps/rejected": -68.5625, "loss": 0.5996, "rewards/accuracies": 0.75, "rewards/chosen": 0.0777587890625, "rewards/margins": 0.217529296875, "rewards/rejected": -0.1397705078125, "step": 807 }, { "epoch": 0.5985185185185186, "grad_norm": 1.3315539360046387, "learning_rate": 4.0148148148148147e-07, "logits/chosen": 0.8388671875, "logits/rejected": 1.8173828125, "logps/chosen": -29.546875, "logps/rejected": -43.875, "loss": 0.6494, "rewards/accuracies": 0.75, "rewards/chosen": 0.1441650390625, "rewards/margins": 0.1116943359375, "rewards/rejected": 0.03240966796875, "step": 808 }, { "epoch": 0.5992592592592593, "grad_norm": 1.7363468408584595, "learning_rate": 4.007407407407407e-07, "logits/chosen": 1.875, "logits/rejected": 1.4296875, "logps/chosen": -41.125, "logps/rejected": -31.265625, "loss": 0.5361, "rewards/accuracies": 0.75, "rewards/chosen": 0.423095703125, "rewards/margins": 0.38525390625, "rewards/rejected": 0.03790283203125, "step": 809 }, { "epoch": 0.6, "grad_norm": 5.102403163909912, "learning_rate": 4e-07, "logits/chosen": 1.6240234375, "logits/rejected": 0.896484375, "logps/chosen": -28.375, "logps/rejected": -27.765625, "loss": 0.979, "rewards/accuracies": 0.25, "rewards/chosen": 0.0167999267578125, "rewards/margins": -0.42041015625, "rewards/rejected": 0.437255859375, "step": 810 }, { "epoch": 0.6007407407407407, "grad_norm": 1.6682318449020386, "learning_rate": 3.9925925925925926e-07, "logits/chosen": 1.662109375, "logits/rejected": 0.64599609375, "logps/chosen": -42.28125, "logps/rejected": -42.34375, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": 0.01092529296875, "rewards/margins": -0.05474853515625, "rewards/rejected": 0.06561279296875, "step": 811 }, { "epoch": 0.6014814814814815, "grad_norm": 1.6341618299484253, "learning_rate": 3.985185185185185e-07, "logits/chosen": 1.712890625, "logits/rejected": 1.2626953125, "logps/chosen": -22.59375, "logps/rejected": -50.8125, "loss": 0.7256, "rewards/accuracies": 0.25, "rewards/chosen": -0.041412353515625, "rewards/margins": -0.06170654296875, "rewards/rejected": 0.02032470703125, "step": 812 }, { "epoch": 0.6022222222222222, "grad_norm": 2.366171360015869, "learning_rate": 3.977777777777778e-07, "logits/chosen": 1.255859375, "logits/rejected": 1.4833984375, "logps/chosen": -35.25, "logps/rejected": -33.4375, "loss": 0.8423, "rewards/accuracies": 0.25, "rewards/chosen": -0.137451171875, "rewards/margins": -0.26123046875, "rewards/rejected": 0.12384033203125, "step": 813 }, { "epoch": 0.6029629629629629, "grad_norm": 1.735720157623291, "learning_rate": 3.97037037037037e-07, "logits/chosen": 1.8203125, "logits/rejected": 1.544921875, "logps/chosen": -27.71875, "logps/rejected": -47.3125, "loss": 0.667, "rewards/accuracies": 0.5, "rewards/chosen": 0.129638671875, "rewards/margins": 0.0699462890625, "rewards/rejected": 0.059722900390625, "step": 814 }, { "epoch": 0.6037037037037037, "grad_norm": 5.372259140014648, "learning_rate": 3.962962962962963e-07, "logits/chosen": 2.130859375, "logits/rejected": 2.515625, "logps/chosen": -44.9375, "logps/rejected": -52.21875, "loss": 1.0752, "rewards/accuracies": 0.75, "rewards/chosen": -0.6416015625, "rewards/margins": 0.42041015625, "rewards/rejected": -1.0615234375, "step": 815 }, { "epoch": 0.6044444444444445, "grad_norm": 1.5588780641555786, "learning_rate": 3.955555555555555e-07, "logits/chosen": 1.580078125, "logits/rejected": 1.4091796875, "logps/chosen": -26.421875, "logps/rejected": -54.21875, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": 0.0234375, "rewards/margins": -0.017547607421875, "rewards/rejected": 0.041015625, "step": 816 }, { "epoch": 0.6051851851851852, "grad_norm": 1.7730090618133545, "learning_rate": 3.948148148148148e-07, "logits/chosen": 2.3828125, "logits/rejected": 2.244140625, "logps/chosen": -34.3125, "logps/rejected": -40.9375, "loss": 0.6621, "rewards/accuracies": 0.75, "rewards/chosen": 0.1148681640625, "rewards/margins": 0.074462890625, "rewards/rejected": 0.0404052734375, "step": 817 }, { "epoch": 0.605925925925926, "grad_norm": 1.3063970804214478, "learning_rate": 3.940740740740741e-07, "logits/chosen": 1.5244140625, "logits/rejected": 1.6826171875, "logps/chosen": -27.609375, "logps/rejected": -44.09375, "loss": 0.6699, "rewards/accuracies": 0.5, "rewards/chosen": 0.060943603515625, "rewards/margins": 0.05352783203125, "rewards/rejected": 0.0074005126953125, "step": 818 }, { "epoch": 0.6066666666666667, "grad_norm": 3.907332181930542, "learning_rate": 3.933333333333333e-07, "logits/chosen": 2.029296875, "logits/rejected": 1.9697265625, "logps/chosen": -37.34375, "logps/rejected": -71.5, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 0.11407470703125, "rewards/margins": 0.1917724609375, "rewards/rejected": -0.07769775390625, "step": 819 }, { "epoch": 0.6074074074074074, "grad_norm": 2.182652711868286, "learning_rate": 3.925925925925926e-07, "logits/chosen": 2.025390625, "logits/rejected": 1.8505859375, "logps/chosen": -30.8125, "logps/rejected": -32.34375, "loss": 1.2363, "rewards/accuracies": 0.5, "rewards/chosen": -0.07769775390625, "rewards/margins": -0.6982421875, "rewards/rejected": 0.62060546875, "step": 820 }, { "epoch": 0.6081481481481481, "grad_norm": 2.276838541030884, "learning_rate": 3.918518518518518e-07, "logits/chosen": 1.099609375, "logits/rejected": 1.5263671875, "logps/chosen": -34.96875, "logps/rejected": -27.84375, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": 0.0367431640625, "rewards/margins": 0.0124969482421875, "rewards/rejected": 0.02423095703125, "step": 821 }, { "epoch": 0.6088888888888889, "grad_norm": 1.9969605207443237, "learning_rate": 3.911111111111111e-07, "logits/chosen": 2.021484375, "logits/rejected": 1.6845703125, "logps/chosen": -29.46875, "logps/rejected": -72.125, "loss": 0.6421, "rewards/accuracies": 0.75, "rewards/chosen": 0.044525146484375, "rewards/margins": 0.1405029296875, "rewards/rejected": -0.0960693359375, "step": 822 }, { "epoch": 0.6096296296296296, "grad_norm": 4.745499610900879, "learning_rate": 3.9037037037037033e-07, "logits/chosen": 1.4443359375, "logits/rejected": 1.5322265625, "logps/chosen": -32.875, "logps/rejected": -36.0625, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": -0.005859375, "rewards/margins": -0.0914306640625, "rewards/rejected": 0.08551025390625, "step": 823 }, { "epoch": 0.6103703703703703, "grad_norm": 2.2281248569488525, "learning_rate": 3.8962962962962956e-07, "logits/chosen": 1.8095703125, "logits/rejected": 1.4541015625, "logps/chosen": -37.4375, "logps/rejected": -48.875, "loss": 0.8647, "rewards/accuracies": 0.5, "rewards/chosen": 0.190673828125, "rewards/margins": -0.234619140625, "rewards/rejected": 0.42529296875, "step": 824 }, { "epoch": 0.6111111111111112, "grad_norm": 1.9630359411239624, "learning_rate": 3.888888888888889e-07, "logits/chosen": 2.251953125, "logits/rejected": 1.87890625, "logps/chosen": -44.0, "logps/rejected": -34.84375, "loss": 0.7441, "rewards/accuracies": 0.25, "rewards/chosen": -0.07928466796875, "rewards/margins": -0.09136962890625, "rewards/rejected": 0.01210784912109375, "step": 825 }, { "epoch": 0.6118518518518519, "grad_norm": 2.0065789222717285, "learning_rate": 3.881481481481481e-07, "logits/chosen": 1.6884765625, "logits/rejected": 1.5673828125, "logps/chosen": -52.90625, "logps/rejected": -55.375, "loss": 0.6724, "rewards/accuracies": 0.5, "rewards/chosen": 0.1793212890625, "rewards/margins": 0.04449462890625, "rewards/rejected": 0.134765625, "step": 826 }, { "epoch": 0.6125925925925926, "grad_norm": 3.7927122116088867, "learning_rate": 3.874074074074074e-07, "logits/chosen": 1.3583984375, "logits/rejected": 1.1123046875, "logps/chosen": -29.109375, "logps/rejected": -32.65625, "loss": 0.7812, "rewards/accuracies": 0.0, "rewards/chosen": -0.104248046875, "rewards/margins": -0.164794921875, "rewards/rejected": 0.060546875, "step": 827 }, { "epoch": 0.6133333333333333, "grad_norm": 1.186486840248108, "learning_rate": 3.8666666666666664e-07, "logits/chosen": 1.4833984375, "logits/rejected": 2.44921875, "logps/chosen": -25.046875, "logps/rejected": -41.15625, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.198486328125, "rewards/margins": 0.451171875, "rewards/rejected": -0.252685546875, "step": 828 }, { "epoch": 0.6140740740740741, "grad_norm": 1.8565629720687866, "learning_rate": 3.859259259259259e-07, "logits/chosen": 1.572265625, "logits/rejected": 1.4287109375, "logps/chosen": -42.5625, "logps/rejected": -31.3125, "loss": 0.8184, "rewards/accuracies": 0.0, "rewards/chosen": -0.05078125, "rewards/margins": -0.236328125, "rewards/rejected": 0.185546875, "step": 829 }, { "epoch": 0.6148148148148148, "grad_norm": 1.5664383172988892, "learning_rate": 3.8518518518518515e-07, "logits/chosen": 1.484375, "logits/rejected": 1.9560546875, "logps/chosen": -29.40625, "logps/rejected": -34.09375, "loss": 0.6348, "rewards/accuracies": 0.75, "rewards/chosen": -0.025604248046875, "rewards/margins": 0.124755859375, "rewards/rejected": -0.150390625, "step": 830 }, { "epoch": 0.6155555555555555, "grad_norm": 3.062150478363037, "learning_rate": 3.8444444444444443e-07, "logits/chosen": 0.58447265625, "logits/rejected": 0.671875, "logps/chosen": -23.53125, "logps/rejected": -35.59375, "loss": 0.6128, "rewards/accuracies": 0.75, "rewards/chosen": 0.0128936767578125, "rewards/margins": 0.177978515625, "rewards/rejected": -0.1650390625, "step": 831 }, { "epoch": 0.6162962962962963, "grad_norm": 2.654770612716675, "learning_rate": 3.837037037037037e-07, "logits/chosen": 1.095703125, "logits/rejected": 1.236328125, "logps/chosen": -28.53125, "logps/rejected": -28.765625, "loss": 0.8442, "rewards/accuracies": 0.0, "rewards/chosen": -0.125, "rewards/margins": -0.261962890625, "rewards/rejected": 0.1370849609375, "step": 832 }, { "epoch": 0.617037037037037, "grad_norm": 2.4965693950653076, "learning_rate": 3.8296296296296294e-07, "logits/chosen": 1.357421875, "logits/rejected": 1.7314453125, "logps/chosen": -23.71875, "logps/rejected": -42.0, "loss": 0.8877, "rewards/accuracies": 0.75, "rewards/chosen": -0.033599853515625, "rewards/margins": -0.27978515625, "rewards/rejected": 0.2462158203125, "step": 833 }, { "epoch": 0.6177777777777778, "grad_norm": 2.1012368202209473, "learning_rate": 3.822222222222222e-07, "logits/chosen": 1.1689453125, "logits/rejected": 1.1669921875, "logps/chosen": -52.375, "logps/rejected": -65.3125, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": 0.007843017578125, "rewards/margins": 0.044769287109375, "rewards/rejected": -0.03692626953125, "step": 834 }, { "epoch": 0.6185185185185185, "grad_norm": 2.9580190181732178, "learning_rate": 3.8148148148148145e-07, "logits/chosen": 1.5380859375, "logits/rejected": 1.7158203125, "logps/chosen": -28.59375, "logps/rejected": -118.625, "loss": 0.9092, "rewards/accuracies": 0.25, "rewards/chosen": -0.027740478515625, "rewards/margins": -0.343017578125, "rewards/rejected": 0.315185546875, "step": 835 }, { "epoch": 0.6192592592592593, "grad_norm": 1.5635594129562378, "learning_rate": 3.8074074074074073e-07, "logits/chosen": 1.6806640625, "logits/rejected": 1.6015625, "logps/chosen": -33.46875, "logps/rejected": -43.96875, "loss": 0.7593, "rewards/accuracies": 0.25, "rewards/chosen": 0.0035247802734375, "rewards/margins": -0.1116943359375, "rewards/rejected": 0.115234375, "step": 836 }, { "epoch": 0.62, "grad_norm": 2.4449775218963623, "learning_rate": 3.7999999999999996e-07, "logits/chosen": 1.544921875, "logits/rejected": 1.8369140625, "logps/chosen": -53.4375, "logps/rejected": -65.3125, "loss": 0.8481, "rewards/accuracies": 0.5, "rewards/chosen": 0.11175537109375, "rewards/margins": -0.193359375, "rewards/rejected": 0.304931640625, "step": 837 }, { "epoch": 0.6207407407407407, "grad_norm": 2.4088032245635986, "learning_rate": 3.7925925925925924e-07, "logits/chosen": 1.453125, "logits/rejected": 2.068359375, "logps/chosen": -50.34375, "logps/rejected": -69.375, "loss": 0.7241, "rewards/accuracies": 0.5, "rewards/chosen": -0.10546875, "rewards/margins": -0.055511474609375, "rewards/rejected": -0.04998779296875, "step": 838 }, { "epoch": 0.6214814814814815, "grad_norm": 2.097715377807617, "learning_rate": 3.785185185185185e-07, "logits/chosen": 1.4501953125, "logits/rejected": 1.48828125, "logps/chosen": -25.109375, "logps/rejected": -42.03125, "loss": 0.8486, "rewards/accuracies": 0.0, "rewards/chosen": -0.127197265625, "rewards/margins": -0.28564453125, "rewards/rejected": 0.1585693359375, "step": 839 }, { "epoch": 0.6222222222222222, "grad_norm": 2.605437994003296, "learning_rate": 3.7777777777777775e-07, "logits/chosen": 0.9736328125, "logits/rejected": 1.0244140625, "logps/chosen": -25.75, "logps/rejected": -63.0625, "loss": 0.9053, "rewards/accuracies": 0.25, "rewards/chosen": -0.086669921875, "rewards/margins": -0.323486328125, "rewards/rejected": 0.2366943359375, "step": 840 }, { "epoch": 0.6229629629629629, "grad_norm": 1.9063141345977783, "learning_rate": 3.7703703703703704e-07, "logits/chosen": 1.1904296875, "logits/rejected": 1.373046875, "logps/chosen": -30.390625, "logps/rejected": -38.46875, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": 0.08514404296875, "rewards/margins": 0.40576171875, "rewards/rejected": -0.32080078125, "step": 841 }, { "epoch": 0.6237037037037036, "grad_norm": 1.364068865776062, "learning_rate": 3.7629629629629627e-07, "logits/chosen": 0.96923828125, "logits/rejected": 1.48828125, "logps/chosen": -25.21875, "logps/rejected": -33.59375, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": 0.04022216796875, "rewards/margins": -0.021087646484375, "rewards/rejected": 0.061309814453125, "step": 842 }, { "epoch": 0.6244444444444445, "grad_norm": 1.486285924911499, "learning_rate": 3.755555555555555e-07, "logits/chosen": 1.634765625, "logits/rejected": 1.4921875, "logps/chosen": -31.84375, "logps/rejected": -30.0, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": 0.179931640625, "rewards/margins": 0.27490234375, "rewards/rejected": -0.09490966796875, "step": 843 }, { "epoch": 0.6251851851851852, "grad_norm": 2.1523494720458984, "learning_rate": 3.7481481481481483e-07, "logits/chosen": 1.4033203125, "logits/rejected": 2.125, "logps/chosen": -35.0625, "logps/rejected": -37.25, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.09222412109375, "rewards/margins": -0.003143310546875, "rewards/rejected": 0.0953369140625, "step": 844 }, { "epoch": 0.6259259259259259, "grad_norm": 1.2703070640563965, "learning_rate": 3.7407407407407406e-07, "logits/chosen": 1.3603515625, "logits/rejected": 1.0771484375, "logps/chosen": -31.96875, "logps/rejected": -35.90625, "loss": 0.6396, "rewards/accuracies": 0.5, "rewards/chosen": 0.11444091796875, "rewards/margins": 0.11798095703125, "rewards/rejected": -0.0035247802734375, "step": 845 }, { "epoch": 0.6266666666666667, "grad_norm": 1.6231383085250854, "learning_rate": 3.7333333333333334e-07, "logits/chosen": 1.2373046875, "logits/rejected": 1.818359375, "logps/chosen": -24.3125, "logps/rejected": -29.96875, "loss": 0.6465, "rewards/accuracies": 0.75, "rewards/chosen": 0.11834716796875, "rewards/margins": 0.100830078125, "rewards/rejected": 0.017547607421875, "step": 846 }, { "epoch": 0.6274074074074074, "grad_norm": 2.1436285972595215, "learning_rate": 3.7259259259259257e-07, "logits/chosen": 1.7744140625, "logits/rejected": 2.142578125, "logps/chosen": -31.9375, "logps/rejected": -45.4375, "loss": 0.79, "rewards/accuracies": 0.25, "rewards/chosen": -0.27587890625, "rewards/margins": -0.071044921875, "rewards/rejected": -0.2047119140625, "step": 847 }, { "epoch": 0.6281481481481481, "grad_norm": 2.0909476280212402, "learning_rate": 3.7185185185185185e-07, "logits/chosen": 1.296875, "logits/rejected": 1.7412109375, "logps/chosen": -34.40625, "logps/rejected": -44.21875, "loss": 0.7446, "rewards/accuracies": 0.5, "rewards/chosen": -0.2998046875, "rewards/margins": -0.0775146484375, "rewards/rejected": -0.2222900390625, "step": 848 }, { "epoch": 0.6288888888888889, "grad_norm": 1.391313910484314, "learning_rate": 3.711111111111111e-07, "logits/chosen": 1.8671875, "logits/rejected": 2.33984375, "logps/chosen": -28.03125, "logps/rejected": -54.34375, "loss": 0.5854, "rewards/accuracies": 0.5, "rewards/chosen": -0.0745849609375, "rewards/margins": 0.59375, "rewards/rejected": -0.66845703125, "step": 849 }, { "epoch": 0.6296296296296297, "grad_norm": 3.879032850265503, "learning_rate": 3.703703703703703e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.748046875, "logps/chosen": -49.03125, "logps/rejected": -51.65625, "loss": 0.9214, "rewards/accuracies": 0.0, "rewards/chosen": -0.09490966796875, "rewards/margins": -0.408203125, "rewards/rejected": 0.313232421875, "step": 850 }, { "epoch": 0.6303703703703704, "grad_norm": 1.8258591890335083, "learning_rate": 3.6962962962962965e-07, "logits/chosen": 1.3505859375, "logits/rejected": 1.63671875, "logps/chosen": -32.5, "logps/rejected": -72.625, "loss": 0.6094, "rewards/accuracies": 0.75, "rewards/chosen": 0.0982666015625, "rewards/margins": 0.201416015625, "rewards/rejected": -0.1031494140625, "step": 851 }, { "epoch": 0.6311111111111111, "grad_norm": 6.125811576843262, "learning_rate": 3.688888888888889e-07, "logits/chosen": 1.7080078125, "logits/rejected": 1.06640625, "logps/chosen": -36.0, "logps/rejected": -37.75, "loss": 0.7314, "rewards/accuracies": 0.25, "rewards/chosen": -0.0673828125, "rewards/margins": -0.0673828125, "rewards/rejected": 0.0, "step": 852 }, { "epoch": 0.6318518518518519, "grad_norm": 2.5561141967773438, "learning_rate": 3.6814814814814816e-07, "logits/chosen": 1.515625, "logits/rejected": 1.724609375, "logps/chosen": -39.90625, "logps/rejected": -26.65625, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": -0.016387939453125, "rewards/margins": 0.1396484375, "rewards/rejected": -0.156005859375, "step": 853 }, { "epoch": 0.6325925925925926, "grad_norm": 1.3978261947631836, "learning_rate": 3.674074074074074e-07, "logits/chosen": 1.763671875, "logits/rejected": 1.5087890625, "logps/chosen": -43.53125, "logps/rejected": -30.875, "loss": 0.6523, "rewards/accuracies": 0.5, "rewards/chosen": 0.134765625, "rewards/margins": 0.09844970703125, "rewards/rejected": 0.036346435546875, "step": 854 }, { "epoch": 0.6333333333333333, "grad_norm": 2.738867998123169, "learning_rate": 3.666666666666666e-07, "logits/chosen": 1.8408203125, "logits/rejected": 1.798828125, "logps/chosen": -39.78125, "logps/rejected": -93.8125, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": 0.15380859375, "rewards/margins": 0.2003173828125, "rewards/rejected": -0.0465087890625, "step": 855 }, { "epoch": 0.6340740740740741, "grad_norm": 1.367793083190918, "learning_rate": 3.659259259259259e-07, "logits/chosen": 1.8662109375, "logits/rejected": 1.4521484375, "logps/chosen": -20.84375, "logps/rejected": -45.53125, "loss": 0.665, "rewards/accuracies": 0.25, "rewards/chosen": 0.043548583984375, "rewards/margins": 0.0693359375, "rewards/rejected": -0.025787353515625, "step": 856 }, { "epoch": 0.6348148148148148, "grad_norm": 2.382589340209961, "learning_rate": 3.651851851851851e-07, "logits/chosen": 1.5625, "logits/rejected": 1.2998046875, "logps/chosen": -20.75, "logps/rejected": -46.53125, "loss": 0.8149, "rewards/accuracies": 0.5, "rewards/chosen": -0.0875244140625, "rewards/margins": -0.204345703125, "rewards/rejected": 0.1168212890625, "step": 857 }, { "epoch": 0.6355555555555555, "grad_norm": 1.5667803287506104, "learning_rate": 3.6444444444444446e-07, "logits/chosen": 1.453125, "logits/rejected": 1.4794921875, "logps/chosen": -30.21875, "logps/rejected": -27.96875, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": -0.0546875, "rewards/margins": -0.121826171875, "rewards/rejected": 0.06719970703125, "step": 858 }, { "epoch": 0.6362962962962962, "grad_norm": 1.491284728050232, "learning_rate": 3.637037037037037e-07, "logits/chosen": 1.93359375, "logits/rejected": 1.2578125, "logps/chosen": -22.21875, "logps/rejected": -36.03125, "loss": 0.7393, "rewards/accuracies": 0.25, "rewards/chosen": 0.00820159912109375, "rewards/margins": -0.08514404296875, "rewards/rejected": 0.0933837890625, "step": 859 }, { "epoch": 0.6370370370370371, "grad_norm": 2.6582157611846924, "learning_rate": 3.6296296296296297e-07, "logits/chosen": 1.7177734375, "logits/rejected": 2.09765625, "logps/chosen": -41.4375, "logps/rejected": -25.984375, "loss": 1.1816, "rewards/accuracies": 0.0, "rewards/chosen": -0.1378173828125, "rewards/margins": -0.74365234375, "rewards/rejected": 0.60595703125, "step": 860 }, { "epoch": 0.6377777777777778, "grad_norm": 1.8098505735397339, "learning_rate": 3.622222222222222e-07, "logits/chosen": 1.97265625, "logits/rejected": 1.626953125, "logps/chosen": -55.5625, "logps/rejected": -37.3125, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": 0.0687255859375, "rewards/margins": 0.1644287109375, "rewards/rejected": -0.095703125, "step": 861 }, { "epoch": 0.6385185185185185, "grad_norm": 1.721792221069336, "learning_rate": 3.6148148148148143e-07, "logits/chosen": 2.345703125, "logits/rejected": 2.119140625, "logps/chosen": -26.28125, "logps/rejected": -34.75, "loss": 0.6494, "rewards/accuracies": 0.5, "rewards/chosen": -0.01446533203125, "rewards/margins": 0.09368896484375, "rewards/rejected": -0.10821533203125, "step": 862 }, { "epoch": 0.6392592592592593, "grad_norm": 2.7645912170410156, "learning_rate": 3.607407407407407e-07, "logits/chosen": 1.328125, "logits/rejected": 1.212890625, "logps/chosen": -33.40625, "logps/rejected": -66.5, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 0.03204345703125, "rewards/margins": 0.0228424072265625, "rewards/rejected": 0.00917816162109375, "step": 863 }, { "epoch": 0.64, "grad_norm": 1.7325780391693115, "learning_rate": 3.6e-07, "logits/chosen": 1.2421875, "logits/rejected": 1.0625, "logps/chosen": -37.09375, "logps/rejected": -44.8125, "loss": 0.749, "rewards/accuracies": 0.5, "rewards/chosen": 0.0238189697265625, "rewards/margins": -0.09747314453125, "rewards/rejected": 0.12127685546875, "step": 864 }, { "epoch": 0.6407407407407407, "grad_norm": 1.776251196861267, "learning_rate": 3.592592592592593e-07, "logits/chosen": 1.162109375, "logits/rejected": 1.0634765625, "logps/chosen": -39.375, "logps/rejected": -35.1875, "loss": 0.6953, "rewards/accuracies": 0.75, "rewards/chosen": 0.0160064697265625, "rewards/margins": 0.00077056884765625, "rewards/rejected": 0.015228271484375, "step": 865 }, { "epoch": 0.6414814814814814, "grad_norm": 1.8741190433502197, "learning_rate": 3.585185185185185e-07, "logits/chosen": 1.6591796875, "logits/rejected": 1.8251953125, "logps/chosen": -40.3125, "logps/rejected": -31.3125, "loss": 0.77, "rewards/accuracies": 0.0, "rewards/chosen": -0.09454345703125, "rewards/margins": -0.146484375, "rewards/rejected": 0.05194091796875, "step": 866 }, { "epoch": 0.6422222222222222, "grad_norm": 2.3671114444732666, "learning_rate": 3.5777777777777773e-07, "logits/chosen": 1.7568359375, "logits/rejected": 1.8564453125, "logps/chosen": -50.09375, "logps/rejected": -50.96875, "loss": 0.7246, "rewards/accuracies": 0.25, "rewards/chosen": -0.039459228515625, "rewards/margins": -0.037139892578125, "rewards/rejected": -0.0023193359375, "step": 867 }, { "epoch": 0.642962962962963, "grad_norm": 2.166721820831299, "learning_rate": 3.57037037037037e-07, "logits/chosen": 1.8779296875, "logits/rejected": 1.5205078125, "logps/chosen": -44.90625, "logps/rejected": -48.8125, "loss": 0.8784, "rewards/accuracies": 0.75, "rewards/chosen": 0.0736083984375, "rewards/margins": -0.1962890625, "rewards/rejected": 0.269775390625, "step": 868 }, { "epoch": 0.6437037037037037, "grad_norm": 4.86507511138916, "learning_rate": 3.5629629629629625e-07, "logits/chosen": 1.009765625, "logits/rejected": 1.587890625, "logps/chosen": -42.1875, "logps/rejected": -56.9375, "loss": 0.7891, "rewards/accuracies": 0.5, "rewards/chosen": -0.063720703125, "rewards/margins": -0.1512451171875, "rewards/rejected": 0.0875244140625, "step": 869 }, { "epoch": 0.6444444444444445, "grad_norm": 2.167405843734741, "learning_rate": 3.5555555555555553e-07, "logits/chosen": 1.548828125, "logits/rejected": 2.017578125, "logps/chosen": -30.671875, "logps/rejected": -53.5, "loss": 0.6367, "rewards/accuracies": 0.75, "rewards/chosen": 0.0953369140625, "rewards/margins": 0.141357421875, "rewards/rejected": -0.04608154296875, "step": 870 }, { "epoch": 0.6451851851851852, "grad_norm": 1.2681621313095093, "learning_rate": 3.548148148148148e-07, "logits/chosen": 0.77001953125, "logits/rejected": 1.109375, "logps/chosen": -41.1875, "logps/rejected": -31.3125, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": 0.2081298828125, "rewards/margins": 0.228515625, "rewards/rejected": -0.0203094482421875, "step": 871 }, { "epoch": 0.6459259259259259, "grad_norm": 2.5228500366210938, "learning_rate": 3.540740740740741e-07, "logits/chosen": 2.0, "logits/rejected": 1.2021484375, "logps/chosen": -31.546875, "logps/rejected": -54.71875, "loss": 0.8486, "rewards/accuracies": 0.5, "rewards/chosen": -0.02984619140625, "rewards/margins": -0.2481689453125, "rewards/rejected": 0.21826171875, "step": 872 }, { "epoch": 0.6466666666666666, "grad_norm": 1.814112663269043, "learning_rate": 3.533333333333333e-07, "logits/chosen": 1.4755859375, "logits/rejected": 1.5185546875, "logps/chosen": -22.78125, "logps/rejected": -36.3125, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": -0.00389862060546875, "rewards/margins": -0.02984619140625, "rewards/rejected": 0.025970458984375, "step": 873 }, { "epoch": 0.6474074074074074, "grad_norm": 2.087252378463745, "learning_rate": 3.5259259259259255e-07, "logits/chosen": 1.6044921875, "logits/rejected": 1.478515625, "logps/chosen": -25.734375, "logps/rejected": -38.90625, "loss": 0.8262, "rewards/accuracies": 0.25, "rewards/chosen": 0.0089874267578125, "rewards/margins": -0.216796875, "rewards/rejected": 0.225830078125, "step": 874 }, { "epoch": 0.6481481481481481, "grad_norm": 2.2130837440490723, "learning_rate": 3.5185185185185183e-07, "logits/chosen": 1.2978515625, "logits/rejected": 1.7783203125, "logps/chosen": -43.28125, "logps/rejected": -59.9375, "loss": 0.7749, "rewards/accuracies": 0.0, "rewards/chosen": -0.146484375, "rewards/margins": -0.1558837890625, "rewards/rejected": 0.0093841552734375, "step": 875 }, { "epoch": 0.6488888888888888, "grad_norm": 1.6776313781738281, "learning_rate": 3.5111111111111106e-07, "logits/chosen": 1.8427734375, "logits/rejected": 1.88671875, "logps/chosen": -26.96875, "logps/rejected": -43.6875, "loss": 0.7637, "rewards/accuracies": 0.25, "rewards/chosen": -0.122314453125, "rewards/margins": -0.130859375, "rewards/rejected": 0.00859832763671875, "step": 876 }, { "epoch": 0.6496296296296297, "grad_norm": 1.8195737600326538, "learning_rate": 3.503703703703704e-07, "logits/chosen": 1.2763671875, "logits/rejected": 0.8857421875, "logps/chosen": -39.15625, "logps/rejected": -49.25, "loss": 0.7061, "rewards/accuracies": 0.5, "rewards/chosen": -0.0164031982421875, "rewards/margins": -0.0030975341796875, "rewards/rejected": -0.013275146484375, "step": 877 }, { "epoch": 0.6503703703703704, "grad_norm": 2.4383134841918945, "learning_rate": 3.496296296296296e-07, "logits/chosen": 2.615234375, "logits/rejected": 2.03125, "logps/chosen": -41.6875, "logps/rejected": -32.9375, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": -0.061737060546875, "rewards/margins": 0.02008056640625, "rewards/rejected": -0.081787109375, "step": 878 }, { "epoch": 0.6511111111111111, "grad_norm": 2.6277050971984863, "learning_rate": 3.488888888888889e-07, "logits/chosen": 1.9736328125, "logits/rejected": 1.2041015625, "logps/chosen": -35.34375, "logps/rejected": -31.078125, "loss": 0.6782, "rewards/accuracies": 0.75, "rewards/chosen": 0.106201171875, "rewards/margins": 0.10498046875, "rewards/rejected": 0.001190185546875, "step": 879 }, { "epoch": 0.6518518518518519, "grad_norm": 1.7106379270553589, "learning_rate": 3.4814814814814814e-07, "logits/chosen": 1.861328125, "logits/rejected": 1.4873046875, "logps/chosen": -31.25, "logps/rejected": -48.625, "loss": 0.6333, "rewards/accuracies": 1.0, "rewards/chosen": 0.0300750732421875, "rewards/margins": 0.1273193359375, "rewards/rejected": -0.09722900390625, "step": 880 }, { "epoch": 0.6525925925925926, "grad_norm": 2.4621570110321045, "learning_rate": 3.4740740740740737e-07, "logits/chosen": 1.6552734375, "logits/rejected": 1.2734375, "logps/chosen": -29.625, "logps/rejected": -72.5, "loss": 0.7271, "rewards/accuracies": 0.25, "rewards/chosen": 0.0726318359375, "rewards/margins": -0.063232421875, "rewards/rejected": 0.135986328125, "step": 881 }, { "epoch": 0.6533333333333333, "grad_norm": 1.4527162313461304, "learning_rate": 3.4666666666666665e-07, "logits/chosen": 1.3876953125, "logits/rejected": 1.853515625, "logps/chosen": -25.5, "logps/rejected": -38.65625, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": -0.0335693359375, "rewards/margins": 0.041351318359375, "rewards/rejected": -0.074951171875, "step": 882 }, { "epoch": 0.654074074074074, "grad_norm": 1.8756300210952759, "learning_rate": 3.459259259259259e-07, "logits/chosen": 1.8232421875, "logits/rejected": 1.9609375, "logps/chosen": -31.65625, "logps/rejected": -51.5, "loss": 0.6768, "rewards/accuracies": 0.25, "rewards/chosen": 0.1343994140625, "rewards/margins": 0.0562744140625, "rewards/rejected": 0.078125, "step": 883 }, { "epoch": 0.6548148148148148, "grad_norm": 1.276760458946228, "learning_rate": 3.451851851851852e-07, "logits/chosen": 1.6689453125, "logits/rejected": 1.3896484375, "logps/chosen": -21.90625, "logps/rejected": -25.296875, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": -0.07110595703125, "rewards/margins": 0.04351806640625, "rewards/rejected": -0.1146240234375, "step": 884 }, { "epoch": 0.6555555555555556, "grad_norm": 1.6449166536331177, "learning_rate": 3.4444444444444444e-07, "logits/chosen": 1.6435546875, "logits/rejected": 1.705078125, "logps/chosen": -40.1875, "logps/rejected": -46.1875, "loss": 0.7334, "rewards/accuracies": 0.5, "rewards/chosen": -0.096923828125, "rewards/margins": -0.069580078125, "rewards/rejected": -0.0273284912109375, "step": 885 }, { "epoch": 0.6562962962962963, "grad_norm": 2.9171712398529053, "learning_rate": 3.4370370370370367e-07, "logits/chosen": 1.451171875, "logits/rejected": 1.265625, "logps/chosen": -34.71875, "logps/rejected": -69.8125, "loss": 0.7578, "rewards/accuracies": 0.5, "rewards/chosen": -0.00527191162109375, "rewards/margins": -0.1021728515625, "rewards/rejected": 0.096923828125, "step": 886 }, { "epoch": 0.6570370370370371, "grad_norm": 1.5801424980163574, "learning_rate": 3.4296296296296295e-07, "logits/chosen": 0.96533203125, "logits/rejected": 1.25, "logps/chosen": -31.96875, "logps/rejected": -28.796875, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": 0.25439453125, "rewards/margins": 0.177734375, "rewards/rejected": 0.0765380859375, "step": 887 }, { "epoch": 0.6577777777777778, "grad_norm": 2.168917417526245, "learning_rate": 3.422222222222222e-07, "logits/chosen": 1.630859375, "logits/rejected": 2.126953125, "logps/chosen": -38.40625, "logps/rejected": -53.25, "loss": 0.4875, "rewards/accuracies": 1.0, "rewards/chosen": 0.41064453125, "rewards/margins": 0.53466796875, "rewards/rejected": -0.124267578125, "step": 888 }, { "epoch": 0.6585185185185185, "grad_norm": 1.7953357696533203, "learning_rate": 3.4148148148148146e-07, "logits/chosen": 1.625, "logits/rejected": 1.4150390625, "logps/chosen": -26.9375, "logps/rejected": -36.5625, "loss": 0.8887, "rewards/accuracies": 0.25, "rewards/chosen": -0.019927978515625, "rewards/margins": -0.307861328125, "rewards/rejected": 0.287841796875, "step": 889 }, { "epoch": 0.6592592592592592, "grad_norm": 1.9321461915969849, "learning_rate": 3.407407407407407e-07, "logits/chosen": 1.3408203125, "logits/rejected": 1.4765625, "logps/chosen": -27.59375, "logps/rejected": -42.09375, "loss": 0.7437, "rewards/accuracies": 0.5, "rewards/chosen": 0.00859832763671875, "rewards/margins": -0.0804443359375, "rewards/rejected": 0.08905029296875, "step": 890 }, { "epoch": 0.66, "grad_norm": 2.633348226547241, "learning_rate": 3.4000000000000003e-07, "logits/chosen": 1.3798828125, "logits/rejected": 1.8564453125, "logps/chosen": -24.40625, "logps/rejected": -98.9375, "loss": 0.7686, "rewards/accuracies": 0.5, "rewards/chosen": -0.00312042236328125, "rewards/margins": -0.129638671875, "rewards/rejected": 0.12646484375, "step": 891 }, { "epoch": 0.6607407407407407, "grad_norm": 2.1395363807678223, "learning_rate": 3.3925925925925926e-07, "logits/chosen": 2.13671875, "logits/rejected": 1.7724609375, "logps/chosen": -35.8125, "logps/rejected": -42.21875, "loss": 0.7954, "rewards/accuracies": 0.25, "rewards/chosen": 0.18896484375, "rewards/margins": -0.10858154296875, "rewards/rejected": 0.297607421875, "step": 892 }, { "epoch": 0.6614814814814814, "grad_norm": 1.938617467880249, "learning_rate": 3.385185185185185e-07, "logits/chosen": 0.99072265625, "logits/rejected": 0.7470703125, "logps/chosen": -44.09375, "logps/rejected": -40.9375, "loss": 0.6255, "rewards/accuracies": 0.75, "rewards/chosen": 0.326171875, "rewards/margins": 0.19921875, "rewards/rejected": 0.126953125, "step": 893 }, { "epoch": 0.6622222222222223, "grad_norm": 1.7490407228469849, "learning_rate": 3.3777777777777777e-07, "logits/chosen": 1.705078125, "logits/rejected": 1.3984375, "logps/chosen": -40.1875, "logps/rejected": -25.234375, "loss": 0.7046, "rewards/accuracies": 0.25, "rewards/chosen": 3.0517578125e-05, "rewards/margins": 0.009765625, "rewards/rejected": -0.009765625, "step": 894 }, { "epoch": 0.662962962962963, "grad_norm": 1.2862377166748047, "learning_rate": 3.37037037037037e-07, "logits/chosen": 0.8603515625, "logits/rejected": 1.3759765625, "logps/chosen": -36.1875, "logps/rejected": -36.65625, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": 0.5732421875, "rewards/margins": 0.5634765625, "rewards/rejected": 0.00995635986328125, "step": 895 }, { "epoch": 0.6637037037037037, "grad_norm": 1.6681389808654785, "learning_rate": 3.362962962962963e-07, "logits/chosen": 1.6416015625, "logits/rejected": 1.8447265625, "logps/chosen": -36.96875, "logps/rejected": -38.96875, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.01641845703125, "rewards/margins": 0.05352783203125, "rewards/rejected": -0.037109375, "step": 896 }, { "epoch": 0.6644444444444444, "grad_norm": 5.182636737823486, "learning_rate": 3.3555555555555556e-07, "logits/chosen": 2.01171875, "logits/rejected": 2.814453125, "logps/chosen": -45.375, "logps/rejected": -55.21875, "loss": 1.3281, "rewards/accuracies": 0.25, "rewards/chosen": -0.022247314453125, "rewards/margins": -0.79931640625, "rewards/rejected": 0.77685546875, "step": 897 }, { "epoch": 0.6651851851851852, "grad_norm": 1.6169071197509766, "learning_rate": 3.348148148148148e-07, "logits/chosen": 2.33203125, "logits/rejected": 2.12109375, "logps/chosen": -30.125, "logps/rejected": -47.96875, "loss": 0.7949, "rewards/accuracies": 0.25, "rewards/chosen": -0.1136474609375, "rewards/margins": -0.1898193359375, "rewards/rejected": 0.076171875, "step": 898 }, { "epoch": 0.6659259259259259, "grad_norm": 2.4818341732025146, "learning_rate": 3.3407407407407407e-07, "logits/chosen": 1.1533203125, "logits/rejected": 1.3798828125, "logps/chosen": -36.78125, "logps/rejected": -62.03125, "loss": 0.8828, "rewards/accuracies": 0.25, "rewards/chosen": 0.28466796875, "rewards/margins": -0.25927734375, "rewards/rejected": 0.5439453125, "step": 899 }, { "epoch": 0.6666666666666666, "grad_norm": 3.179344654083252, "learning_rate": 3.333333333333333e-07, "logits/chosen": 1.7197265625, "logits/rejected": 1.5625, "logps/chosen": -68.4375, "logps/rejected": -92.0625, "loss": 0.7007, "rewards/accuracies": 0.5, "rewards/chosen": 0.00433349609375, "rewards/margins": -0.0003662109375, "rewards/rejected": 0.0046844482421875, "step": 900 }, { "epoch": 0.6674074074074074, "grad_norm": 1.2474024295806885, "learning_rate": 3.325925925925926e-07, "logits/chosen": 1.2177734375, "logits/rejected": 1.5654296875, "logps/chosen": -24.859375, "logps/rejected": -26.21875, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": 0.1553955078125, "rewards/margins": 0.0911865234375, "rewards/rejected": 0.06427001953125, "step": 901 }, { "epoch": 0.6681481481481482, "grad_norm": 1.8071002960205078, "learning_rate": 3.318518518518518e-07, "logits/chosen": 1.640625, "logits/rejected": 2.494140625, "logps/chosen": -38.09375, "logps/rejected": -47.3125, "loss": 0.4138, "rewards/accuracies": 1.0, "rewards/chosen": 0.123046875, "rewards/margins": 1.296875, "rewards/rejected": -1.173828125, "step": 902 }, { "epoch": 0.6688888888888889, "grad_norm": 1.546075463294983, "learning_rate": 3.311111111111111e-07, "logits/chosen": 1.72265625, "logits/rejected": 2.228515625, "logps/chosen": -31.46875, "logps/rejected": -51.375, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": 0.2125244140625, "rewards/margins": 0.31298828125, "rewards/rejected": -0.100341796875, "step": 903 }, { "epoch": 0.6696296296296296, "grad_norm": 2.117060661315918, "learning_rate": 3.303703703703704e-07, "logits/chosen": 1.2900390625, "logits/rejected": 1.6240234375, "logps/chosen": -37.6875, "logps/rejected": -69.625, "loss": 0.71, "rewards/accuracies": 0.25, "rewards/chosen": -0.135986328125, "rewards/margins": -0.028900146484375, "rewards/rejected": -0.1070556640625, "step": 904 }, { "epoch": 0.6703703703703704, "grad_norm": 46.48044204711914, "learning_rate": 3.296296296296296e-07, "logits/chosen": 1.845703125, "logits/rejected": 2.310546875, "logps/chosen": -28.0625, "logps/rejected": -39.1875, "loss": 0.6636, "rewards/accuracies": 0.5, "rewards/chosen": 0.061309814453125, "rewards/margins": 0.1397705078125, "rewards/rejected": -0.0784912109375, "step": 905 }, { "epoch": 0.6711111111111111, "grad_norm": 33.575439453125, "learning_rate": 3.288888888888889e-07, "logits/chosen": 2.107421875, "logits/rejected": 1.771484375, "logps/chosen": -77.0, "logps/rejected": -42.09375, "loss": 0.6323, "rewards/accuracies": 0.75, "rewards/chosen": 0.1566162109375, "rewards/margins": 0.181884765625, "rewards/rejected": -0.0253143310546875, "step": 906 }, { "epoch": 0.6718518518518518, "grad_norm": 1.880321979522705, "learning_rate": 3.281481481481481e-07, "logits/chosen": 1.9365234375, "logits/rejected": 1.697265625, "logps/chosen": -41.9375, "logps/rejected": -46.875, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 0.07696533203125, "rewards/margins": -0.017578125, "rewards/rejected": 0.09454345703125, "step": 907 }, { "epoch": 0.6725925925925926, "grad_norm": 1.6603459119796753, "learning_rate": 3.274074074074074e-07, "logits/chosen": 1.349609375, "logits/rejected": 1.5703125, "logps/chosen": -40.1875, "logps/rejected": -26.484375, "loss": 0.7471, "rewards/accuracies": 0.5, "rewards/chosen": -0.0582275390625, "rewards/margins": -0.09814453125, "rewards/rejected": 0.03985595703125, "step": 908 }, { "epoch": 0.6733333333333333, "grad_norm": 1.735735297203064, "learning_rate": 3.2666666666666663e-07, "logits/chosen": 0.6953125, "logits/rejected": 1.673828125, "logps/chosen": -20.6875, "logps/rejected": -55.03125, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": -0.03887939453125, "rewards/margins": 0.05877685546875, "rewards/rejected": -0.09765625, "step": 909 }, { "epoch": 0.674074074074074, "grad_norm": 1.5610203742980957, "learning_rate": 3.2592592592592596e-07, "logits/chosen": 1.6630859375, "logits/rejected": 1.4306640625, "logps/chosen": -21.953125, "logps/rejected": -46.8125, "loss": 0.7427, "rewards/accuracies": 0.25, "rewards/chosen": 0.025787353515625, "rewards/margins": -0.09527587890625, "rewards/rejected": 0.12109375, "step": 910 }, { "epoch": 0.6748148148148149, "grad_norm": 1.8129892349243164, "learning_rate": 3.251851851851852e-07, "logits/chosen": 1.9755859375, "logits/rejected": 2.345703125, "logps/chosen": -30.28125, "logps/rejected": -38.1875, "loss": 0.6621, "rewards/accuracies": 0.5, "rewards/chosen": 0.1292724609375, "rewards/margins": 0.0994873046875, "rewards/rejected": 0.0298919677734375, "step": 911 }, { "epoch": 0.6755555555555556, "grad_norm": 1.8457353115081787, "learning_rate": 3.244444444444444e-07, "logits/chosen": 2.17578125, "logits/rejected": 1.7783203125, "logps/chosen": -35.90625, "logps/rejected": -41.34375, "loss": 0.7256, "rewards/accuracies": 0.5, "rewards/chosen": -0.0706787109375, "rewards/margins": -0.057373046875, "rewards/rejected": -0.013275146484375, "step": 912 }, { "epoch": 0.6762962962962963, "grad_norm": 2.3761990070343018, "learning_rate": 3.237037037037037e-07, "logits/chosen": 2.025390625, "logits/rejected": 2.26171875, "logps/chosen": -28.546875, "logps/rejected": -60.0, "loss": 0.8828, "rewards/accuracies": 0.25, "rewards/chosen": 0.20458984375, "rewards/margins": -0.242919921875, "rewards/rejected": 0.447509765625, "step": 913 }, { "epoch": 0.677037037037037, "grad_norm": 1.870398998260498, "learning_rate": 3.2296296296296293e-07, "logits/chosen": 1.314453125, "logits/rejected": 1.7109375, "logps/chosen": -26.671875, "logps/rejected": -34.875, "loss": 0.7192, "rewards/accuracies": 0.5, "rewards/chosen": 0.064208984375, "rewards/margins": -0.032470703125, "rewards/rejected": 0.0966796875, "step": 914 }, { "epoch": 0.6777777777777778, "grad_norm": 1.850276231765747, "learning_rate": 3.222222222222222e-07, "logits/chosen": 1.60546875, "logits/rejected": 1.6123046875, "logps/chosen": -31.046875, "logps/rejected": -48.9375, "loss": 0.792, "rewards/accuracies": 0.5, "rewards/chosen": 0.04296875, "rewards/margins": -0.146728515625, "rewards/rejected": 0.1898193359375, "step": 915 }, { "epoch": 0.6785185185185185, "grad_norm": 1.5471912622451782, "learning_rate": 3.2148148148148144e-07, "logits/chosen": 1.1337890625, "logits/rejected": 1.5947265625, "logps/chosen": -23.25, "logps/rejected": -46.34375, "loss": 0.7109, "rewards/accuracies": 0.5, "rewards/chosen": 0.0074005126953125, "rewards/margins": -0.000396728515625, "rewards/rejected": 0.00780487060546875, "step": 916 }, { "epoch": 0.6792592592592592, "grad_norm": 1.8474730253219604, "learning_rate": 3.207407407407407e-07, "logits/chosen": 2.166015625, "logits/rejected": 1.78515625, "logps/chosen": -23.328125, "logps/rejected": -35.375, "loss": 0.6548, "rewards/accuracies": 0.5, "rewards/chosen": 0.0540771484375, "rewards/margins": 0.198974609375, "rewards/rejected": -0.14501953125, "step": 917 }, { "epoch": 0.68, "grad_norm": 2.3828227519989014, "learning_rate": 3.2e-07, "logits/chosen": 2.0625, "logits/rejected": 1.7978515625, "logps/chosen": -25.859375, "logps/rejected": -33.78125, "loss": 0.6748, "rewards/accuracies": 0.75, "rewards/chosen": -0.044342041015625, "rewards/margins": 0.040771484375, "rewards/rejected": -0.08514404296875, "step": 918 }, { "epoch": 0.6807407407407408, "grad_norm": 1.9724891185760498, "learning_rate": 3.1925925925925924e-07, "logits/chosen": 1.869140625, "logits/rejected": 1.931640625, "logps/chosen": -33.09375, "logps/rejected": -47.875, "loss": 0.7485, "rewards/accuracies": 0.25, "rewards/chosen": 0.0183563232421875, "rewards/margins": -0.087890625, "rewards/rejected": 0.10626220703125, "step": 919 }, { "epoch": 0.6814814814814815, "grad_norm": 1.6486858129501343, "learning_rate": 3.185185185185185e-07, "logits/chosen": 1.6357421875, "logits/rejected": 2.333984375, "logps/chosen": -43.78125, "logps/rejected": -59.53125, "loss": 0.6953, "rewards/accuracies": 0.75, "rewards/chosen": -0.0226898193359375, "rewards/margins": 0.01287078857421875, "rewards/rejected": -0.035552978515625, "step": 920 }, { "epoch": 0.6822222222222222, "grad_norm": 1.8931868076324463, "learning_rate": 3.1777777777777775e-07, "logits/chosen": 1.2333984375, "logits/rejected": 1.2490234375, "logps/chosen": -41.375, "logps/rejected": -33.5625, "loss": 0.7832, "rewards/accuracies": 0.5, "rewards/chosen": 0.0238037109375, "rewards/margins": -0.0274658203125, "rewards/rejected": 0.05120849609375, "step": 921 }, { "epoch": 0.682962962962963, "grad_norm": 2.2286787033081055, "learning_rate": 3.1703703703703703e-07, "logits/chosen": 1.6376953125, "logits/rejected": 1.7373046875, "logps/chosen": -34.09375, "logps/rejected": -31.265625, "loss": 0.7529, "rewards/accuracies": 0.25, "rewards/chosen": -0.091796875, "rewards/margins": -0.1048583984375, "rewards/rejected": 0.013092041015625, "step": 922 }, { "epoch": 0.6837037037037037, "grad_norm": 1.9443023204803467, "learning_rate": 3.1629629629629626e-07, "logits/chosen": 1.9970703125, "logits/rejected": 1.7802734375, "logps/chosen": -30.578125, "logps/rejected": -47.03125, "loss": 0.7266, "rewards/accuracies": 0.25, "rewards/chosen": 0.0804443359375, "rewards/margins": -0.026519775390625, "rewards/rejected": 0.1070556640625, "step": 923 }, { "epoch": 0.6844444444444444, "grad_norm": 2.2257440090179443, "learning_rate": 3.1555555555555554e-07, "logits/chosen": 1.5087890625, "logits/rejected": 1.8076171875, "logps/chosen": -20.25, "logps/rejected": -61.75, "loss": 0.6655, "rewards/accuracies": 0.5, "rewards/chosen": 0.032440185546875, "rewards/margins": 0.0667724609375, "rewards/rejected": -0.034332275390625, "step": 924 }, { "epoch": 0.6851851851851852, "grad_norm": 2.9656853675842285, "learning_rate": 3.148148148148148e-07, "logits/chosen": 1.3154296875, "logits/rejected": 1.6640625, "logps/chosen": -41.1875, "logps/rejected": -38.5, "loss": 0.8018, "rewards/accuracies": 0.5, "rewards/chosen": -0.07183837890625, "rewards/margins": -0.11328125, "rewards/rejected": 0.0413818359375, "step": 925 }, { "epoch": 0.6859259259259259, "grad_norm": 2.106062412261963, "learning_rate": 3.1407407407407405e-07, "logits/chosen": 1.1142578125, "logits/rejected": 1.99609375, "logps/chosen": -49.125, "logps/rejected": -47.1875, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.09063720703125, "rewards/margins": 0.0406494140625, "rewards/rejected": 0.04998779296875, "step": 926 }, { "epoch": 0.6866666666666666, "grad_norm": 1.9319919347763062, "learning_rate": 3.1333333333333333e-07, "logits/chosen": 1.8427734375, "logits/rejected": 1.412109375, "logps/chosen": -34.0, "logps/rejected": -38.46875, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": -0.078125, "rewards/margins": 0.04803466796875, "rewards/rejected": -0.126220703125, "step": 927 }, { "epoch": 0.6874074074074074, "grad_norm": 1.8502893447875977, "learning_rate": 3.1259259259259256e-07, "logits/chosen": 1.1357421875, "logits/rejected": 0.66064453125, "logps/chosen": -56.90625, "logps/rejected": -32.6875, "loss": 0.7246, "rewards/accuracies": 0.25, "rewards/chosen": 0.10235595703125, "rewards/margins": -0.007415771484375, "rewards/rejected": 0.1097412109375, "step": 928 }, { "epoch": 0.6881481481481482, "grad_norm": 2.7263054847717285, "learning_rate": 3.118518518518518e-07, "logits/chosen": 2.20703125, "logits/rejected": 2.375, "logps/chosen": -34.9375, "logps/rejected": -63.34375, "loss": 0.6753, "rewards/accuracies": 0.5, "rewards/chosen": -0.1556396484375, "rewards/margins": 0.038848876953125, "rewards/rejected": -0.1944580078125, "step": 929 }, { "epoch": 0.6888888888888889, "grad_norm": 2.378528356552124, "learning_rate": 3.111111111111111e-07, "logits/chosen": 1.408203125, "logits/rejected": 0.78076171875, "logps/chosen": -34.96875, "logps/rejected": -36.96875, "loss": 1.002, "rewards/accuracies": 0.25, "rewards/chosen": -0.466552734375, "rewards/margins": -0.47802734375, "rewards/rejected": 0.01174163818359375, "step": 930 }, { "epoch": 0.6896296296296296, "grad_norm": 4.461935997009277, "learning_rate": 3.1037037037037036e-07, "logits/chosen": 1.55078125, "logits/rejected": 1.607421875, "logps/chosen": -19.0625, "logps/rejected": -64.875, "loss": 0.585, "rewards/accuracies": 0.75, "rewards/chosen": 0.041778564453125, "rewards/margins": 0.28662109375, "rewards/rejected": -0.244873046875, "step": 931 }, { "epoch": 0.6903703703703704, "grad_norm": 2.7571873664855957, "learning_rate": 3.0962962962962964e-07, "logits/chosen": 1.521484375, "logits/rejected": 2.1328125, "logps/chosen": -33.125, "logps/rejected": -57.875, "loss": 0.9355, "rewards/accuracies": 0.5, "rewards/chosen": -0.1297607421875, "rewards/margins": -0.34521484375, "rewards/rejected": 0.215576171875, "step": 932 }, { "epoch": 0.6911111111111111, "grad_norm": 2.0705296993255615, "learning_rate": 3.0888888888888887e-07, "logits/chosen": 1.7177734375, "logits/rejected": 1.755859375, "logps/chosen": -38.8125, "logps/rejected": -47.375, "loss": 0.7705, "rewards/accuracies": 0.5, "rewards/chosen": -0.19140625, "rewards/margins": -0.13671875, "rewards/rejected": -0.054656982421875, "step": 933 }, { "epoch": 0.6918518518518518, "grad_norm": 2.167171001434326, "learning_rate": 3.0814814814814815e-07, "logits/chosen": 2.275390625, "logits/rejected": 2.259765625, "logps/chosen": -21.65625, "logps/rejected": -59.09375, "loss": 0.8369, "rewards/accuracies": 0.5, "rewards/chosen": 0.025787353515625, "rewards/margins": -0.185302734375, "rewards/rejected": 0.2109375, "step": 934 }, { "epoch": 0.6925925925925925, "grad_norm": 1.5204890966415405, "learning_rate": 3.074074074074074e-07, "logits/chosen": 1.5400390625, "logits/rejected": 1.26171875, "logps/chosen": -31.734375, "logps/rejected": -38.03125, "loss": 0.7871, "rewards/accuracies": 0.5, "rewards/chosen": -0.12890625, "rewards/margins": -0.1605224609375, "rewards/rejected": 0.03167724609375, "step": 935 }, { "epoch": 0.6933333333333334, "grad_norm": 2.3931596279144287, "learning_rate": 3.066666666666666e-07, "logits/chosen": 1.650390625, "logits/rejected": 1.2529296875, "logps/chosen": -63.4375, "logps/rejected": -81.75, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": -0.0050811767578125, "rewards/margins": 0.032470703125, "rewards/rejected": -0.037506103515625, "step": 936 }, { "epoch": 0.6940740740740741, "grad_norm": 14.519962310791016, "learning_rate": 3.0592592592592594e-07, "logits/chosen": 3.03515625, "logits/rejected": 1.783203125, "logps/chosen": -59.0, "logps/rejected": -72.0625, "loss": 1.8369, "rewards/accuracies": 0.5, "rewards/chosen": -1.3583984375, "rewards/margins": -1.263671875, "rewards/rejected": -0.09375, "step": 937 }, { "epoch": 0.6948148148148148, "grad_norm": 1.4251569509506226, "learning_rate": 3.0518518518518517e-07, "logits/chosen": 2.576171875, "logits/rejected": 2.02734375, "logps/chosen": -28.953125, "logps/rejected": -40.21875, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 0.125244140625, "rewards/margins": 0.243896484375, "rewards/rejected": -0.11871337890625, "step": 938 }, { "epoch": 0.6955555555555556, "grad_norm": 7.103194713592529, "learning_rate": 3.0444444444444445e-07, "logits/chosen": 2.01953125, "logits/rejected": 1.71484375, "logps/chosen": -53.5, "logps/rejected": -32.5, "loss": 1.0264, "rewards/accuracies": 0.25, "rewards/chosen": -0.41162109375, "rewards/margins": -0.474609375, "rewards/rejected": 0.0628662109375, "step": 939 }, { "epoch": 0.6962962962962963, "grad_norm": 1.7151657342910767, "learning_rate": 3.037037037037037e-07, "logits/chosen": 1.7587890625, "logits/rejected": 1.8330078125, "logps/chosen": -28.625, "logps/rejected": -69.5625, "loss": 0.5444, "rewards/accuracies": 0.5, "rewards/chosen": 1.732421875, "rewards/margins": 1.81640625, "rewards/rejected": -0.08349609375, "step": 940 }, { "epoch": 0.697037037037037, "grad_norm": 2.032496929168701, "learning_rate": 3.0296296296296296e-07, "logits/chosen": 1.5615234375, "logits/rejected": 1.34375, "logps/chosen": -27.78125, "logps/rejected": -48.96875, "loss": 0.7441, "rewards/accuracies": 0.25, "rewards/chosen": -0.15625, "rewards/margins": -0.0859375, "rewards/rejected": -0.0703125, "step": 941 }, { "epoch": 0.6977777777777778, "grad_norm": 3.504485607147217, "learning_rate": 3.022222222222222e-07, "logits/chosen": 1.77734375, "logits/rejected": 1.4013671875, "logps/chosen": -23.09375, "logps/rejected": -22.34375, "loss": 0.9194, "rewards/accuracies": 0.25, "rewards/chosen": -0.319091796875, "rewards/margins": -0.36376953125, "rewards/rejected": 0.044525146484375, "step": 942 }, { "epoch": 0.6985185185185185, "grad_norm": 1.9781968593597412, "learning_rate": 3.014814814814814e-07, "logits/chosen": 1.8515625, "logits/rejected": 1.9375, "logps/chosen": -21.53125, "logps/rejected": -74.1875, "loss": 0.6328, "rewards/accuracies": 0.5, "rewards/chosen": 0.06719970703125, "rewards/margins": 0.1484375, "rewards/rejected": -0.0811767578125, "step": 943 }, { "epoch": 0.6992592592592592, "grad_norm": 2.672520637512207, "learning_rate": 3.0074074074074076e-07, "logits/chosen": 1.5029296875, "logits/rejected": 1.5947265625, "logps/chosen": -41.25, "logps/rejected": -66.125, "loss": 1.9473, "rewards/accuracies": 0.0, "rewards/chosen": -0.09375, "rewards/margins": -1.5361328125, "rewards/rejected": 1.4423828125, "step": 944 }, { "epoch": 0.7, "grad_norm": 1.8212379217147827, "learning_rate": 3e-07, "logits/chosen": 1.46875, "logits/rejected": 1.138671875, "logps/chosen": -28.78125, "logps/rejected": -33.9375, "loss": 0.7197, "rewards/accuracies": 0.5, "rewards/chosen": -0.041412353515625, "rewards/margins": -0.03631591796875, "rewards/rejected": -0.00507354736328125, "step": 945 }, { "epoch": 0.7007407407407408, "grad_norm": 1.6803067922592163, "learning_rate": 2.9925925925925927e-07, "logits/chosen": 1.798828125, "logits/rejected": 0.7421875, "logps/chosen": -33.96875, "logps/rejected": -28.453125, "loss": 0.8535, "rewards/accuracies": 0.75, "rewards/chosen": -0.0977783203125, "rewards/margins": -0.1973876953125, "rewards/rejected": 0.09979248046875, "step": 946 }, { "epoch": 0.7014814814814815, "grad_norm": 2.5869626998901367, "learning_rate": 2.985185185185185e-07, "logits/chosen": 1.455078125, "logits/rejected": 2.15234375, "logps/chosen": -30.328125, "logps/rejected": -26.046875, "loss": 0.6484, "rewards/accuracies": 0.5, "rewards/chosen": -0.04022216796875, "rewards/margins": 0.1259765625, "rewards/rejected": -0.166259765625, "step": 947 }, { "epoch": 0.7022222222222222, "grad_norm": 3.599205255508423, "learning_rate": 2.9777777777777773e-07, "logits/chosen": 1.2822265625, "logits/rejected": 1.2265625, "logps/chosen": -45.28125, "logps/rejected": -55.625, "loss": 0.5098, "rewards/accuracies": 0.5, "rewards/chosen": 1.884765625, "rewards/margins": 1.96484375, "rewards/rejected": -0.0804443359375, "step": 948 }, { "epoch": 0.702962962962963, "grad_norm": 2.059014320373535, "learning_rate": 2.97037037037037e-07, "logits/chosen": 1.6220703125, "logits/rejected": 1.724609375, "logps/chosen": -23.375, "logps/rejected": -25.78125, "loss": 0.7197, "rewards/accuracies": 0.25, "rewards/chosen": -0.023834228515625, "rewards/margins": -0.04669189453125, "rewards/rejected": 0.022857666015625, "step": 949 }, { "epoch": 0.7037037037037037, "grad_norm": 1.4243770837783813, "learning_rate": 2.962962962962963e-07, "logits/chosen": 1.548828125, "logits/rejected": 1.4853515625, "logps/chosen": -27.046875, "logps/rejected": -43.4375, "loss": 0.5957, "rewards/accuracies": 1.0, "rewards/chosen": 0.0570068359375, "rewards/margins": 0.21044921875, "rewards/rejected": -0.153564453125, "step": 950 }, { "epoch": 0.7044444444444444, "grad_norm": 2.7220473289489746, "learning_rate": 2.9555555555555557e-07, "logits/chosen": 2.2890625, "logits/rejected": 1.677734375, "logps/chosen": -40.28125, "logps/rejected": -46.53125, "loss": 0.8374, "rewards/accuracies": 0.5, "rewards/chosen": -0.273681640625, "rewards/margins": -0.20654296875, "rewards/rejected": -0.06719970703125, "step": 951 }, { "epoch": 0.7051851851851851, "grad_norm": 1.6745678186416626, "learning_rate": 2.948148148148148e-07, "logits/chosen": 1.66015625, "logits/rejected": 1.3349609375, "logps/chosen": -25.71875, "logps/rejected": -37.125, "loss": 0.7383, "rewards/accuracies": 0.5, "rewards/chosen": -0.0078125, "rewards/margins": -0.08087158203125, "rewards/rejected": 0.07305908203125, "step": 952 }, { "epoch": 0.705925925925926, "grad_norm": 1.4609286785125732, "learning_rate": 2.940740740740741e-07, "logits/chosen": 1.6171875, "logits/rejected": 1.5791015625, "logps/chosen": -40.34375, "logps/rejected": -20.1875, "loss": 0.7432, "rewards/accuracies": 0.25, "rewards/chosen": 0.0234375, "rewards/margins": -0.09417724609375, "rewards/rejected": 0.11761474609375, "step": 953 }, { "epoch": 0.7066666666666667, "grad_norm": 2.255383253097534, "learning_rate": 2.933333333333333e-07, "logits/chosen": 2.205078125, "logits/rejected": 1.4794921875, "logps/chosen": -22.21875, "logps/rejected": -46.53125, "loss": 0.8735, "rewards/accuracies": 0.0, "rewards/chosen": -0.1751708984375, "rewards/margins": -0.326171875, "rewards/rejected": 0.151123046875, "step": 954 }, { "epoch": 0.7074074074074074, "grad_norm": 1.4303722381591797, "learning_rate": 2.9259259259259254e-07, "logits/chosen": 1.6474609375, "logits/rejected": 1.185546875, "logps/chosen": -32.65625, "logps/rejected": -43.34375, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": 0.06640625, "rewards/margins": 0.1234130859375, "rewards/rejected": -0.0570068359375, "step": 955 }, { "epoch": 0.7081481481481482, "grad_norm": 3.276134490966797, "learning_rate": 2.918518518518518e-07, "logits/chosen": 1.24609375, "logits/rejected": 1.5224609375, "logps/chosen": -37.375, "logps/rejected": -77.125, "loss": 0.9346, "rewards/accuracies": 0.5, "rewards/chosen": 0.080810546875, "rewards/margins": -0.37548828125, "rewards/rejected": 0.456298828125, "step": 956 }, { "epoch": 0.7088888888888889, "grad_norm": 1.8886034488677979, "learning_rate": 2.911111111111111e-07, "logits/chosen": 1.8076171875, "logits/rejected": 2.4765625, "logps/chosen": -26.34375, "logps/rejected": -26.65625, "loss": 0.6543, "rewards/accuracies": 0.5, "rewards/chosen": 0.127685546875, "rewards/margins": 0.0921630859375, "rewards/rejected": 0.035552978515625, "step": 957 }, { "epoch": 0.7096296296296296, "grad_norm": 2.077956199645996, "learning_rate": 2.903703703703704e-07, "logits/chosen": 1.0791015625, "logits/rejected": 1.8544921875, "logps/chosen": -33.9375, "logps/rejected": -30.6875, "loss": 0.7861, "rewards/accuracies": 0.25, "rewards/chosen": -0.11480712890625, "rewards/margins": -0.1422119140625, "rewards/rejected": 0.02734375, "step": 958 }, { "epoch": 0.7103703703703703, "grad_norm": 1.9774980545043945, "learning_rate": 2.896296296296296e-07, "logits/chosen": 1.689453125, "logits/rejected": 1.5703125, "logps/chosen": -45.84375, "logps/rejected": -36.40625, "loss": 0.7075, "rewards/accuracies": 0.5, "rewards/chosen": -0.05859375, "rewards/margins": -0.020294189453125, "rewards/rejected": -0.038299560546875, "step": 959 }, { "epoch": 0.7111111111111111, "grad_norm": 3.681986093521118, "learning_rate": 2.8888888888888885e-07, "logits/chosen": 2.044921875, "logits/rejected": 1.50390625, "logps/chosen": -37.5625, "logps/rejected": -81.8125, "loss": 0.9238, "rewards/accuracies": 0.25, "rewards/chosen": -0.298583984375, "rewards/margins": -0.35595703125, "rewards/rejected": 0.057373046875, "step": 960 }, { "epoch": 0.7118518518518518, "grad_norm": 1.7548617124557495, "learning_rate": 2.8814814814814813e-07, "logits/chosen": 1.521484375, "logits/rejected": 1.724609375, "logps/chosen": -34.5625, "logps/rejected": -43.9375, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.11761474609375, "rewards/margins": 0.032440185546875, "rewards/rejected": 0.085205078125, "step": 961 }, { "epoch": 0.7125925925925926, "grad_norm": 1.8512877225875854, "learning_rate": 2.8740740740740736e-07, "logits/chosen": 2.328125, "logits/rejected": 2.173828125, "logps/chosen": -30.484375, "logps/rejected": -55.90625, "loss": 0.7666, "rewards/accuracies": 0.25, "rewards/chosen": 0.0390625, "rewards/margins": -0.13671875, "rewards/rejected": 0.17578125, "step": 962 }, { "epoch": 0.7133333333333334, "grad_norm": 2.3040084838867188, "learning_rate": 2.866666666666667e-07, "logits/chosen": 1.6025390625, "logits/rejected": 1.6953125, "logps/chosen": -25.796875, "logps/rejected": -41.6875, "loss": 0.8315, "rewards/accuracies": 0.0, "rewards/chosen": -0.1500244140625, "rewards/margins": -0.248046875, "rewards/rejected": 0.0980224609375, "step": 963 }, { "epoch": 0.7140740740740741, "grad_norm": 2.542794942855835, "learning_rate": 2.859259259259259e-07, "logits/chosen": 1.2255859375, "logits/rejected": 0.91064453125, "logps/chosen": -52.03125, "logps/rejected": -25.4375, "loss": 0.7217, "rewards/accuracies": 0.5, "rewards/chosen": 0.0589599609375, "rewards/margins": -0.019927978515625, "rewards/rejected": 0.07891845703125, "step": 964 }, { "epoch": 0.7148148148148148, "grad_norm": 1.5819746255874634, "learning_rate": 2.851851851851852e-07, "logits/chosen": 1.779296875, "logits/rejected": 1.89453125, "logps/chosen": -35.15625, "logps/rejected": -38.9375, "loss": 0.603, "rewards/accuracies": 0.75, "rewards/chosen": -0.024627685546875, "rewards/margins": 0.208251953125, "rewards/rejected": -0.2327880859375, "step": 965 }, { "epoch": 0.7155555555555555, "grad_norm": 1.619256615638733, "learning_rate": 2.8444444444444443e-07, "logits/chosen": 1.48828125, "logits/rejected": 1.1083984375, "logps/chosen": -31.40625, "logps/rejected": -23.15625, "loss": 0.7681, "rewards/accuracies": 0.25, "rewards/chosen": -0.146484375, "rewards/margins": -0.1407470703125, "rewards/rejected": -0.0056610107421875, "step": 966 }, { "epoch": 0.7162962962962963, "grad_norm": 2.321901559829712, "learning_rate": 2.8370370370370366e-07, "logits/chosen": 1.130859375, "logits/rejected": 2.013671875, "logps/chosen": -22.484375, "logps/rejected": -72.375, "loss": 0.6323, "rewards/accuracies": 1.0, "rewards/chosen": -0.031646728515625, "rewards/margins": 0.1279296875, "rewards/rejected": -0.1595458984375, "step": 967 }, { "epoch": 0.717037037037037, "grad_norm": 3.121300458908081, "learning_rate": 2.8296296296296294e-07, "logits/chosen": 1.32421875, "logits/rejected": 2.1796875, "logps/chosen": -41.6875, "logps/rejected": -32.09375, "loss": 0.7139, "rewards/accuracies": 0.5, "rewards/chosen": -0.01641845703125, "rewards/margins": -0.027099609375, "rewards/rejected": 0.01070404052734375, "step": 968 }, { "epoch": 0.7177777777777777, "grad_norm": 1.4379807710647583, "learning_rate": 2.8222222222222217e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.716796875, "logps/chosen": -32.1875, "logps/rejected": -22.6875, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": -0.037109375, "rewards/margins": 0.025177001953125, "rewards/rejected": -0.062286376953125, "step": 969 }, { "epoch": 0.7185185185185186, "grad_norm": 1.7148059606552124, "learning_rate": 2.814814814814815e-07, "logits/chosen": 1.5478515625, "logits/rejected": 1.16015625, "logps/chosen": -57.5625, "logps/rejected": -34.03125, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": 0.0765380859375, "rewards/margins": 0.1656494140625, "rewards/rejected": -0.08905029296875, "step": 970 }, { "epoch": 0.7192592592592593, "grad_norm": 2.22796630859375, "learning_rate": 2.8074074074074074e-07, "logits/chosen": 1.3310546875, "logits/rejected": 1.9345703125, "logps/chosen": -51.625, "logps/rejected": -53.4375, "loss": 0.7715, "rewards/accuracies": 0.5, "rewards/chosen": 0.1019287109375, "rewards/margins": -0.1343994140625, "rewards/rejected": 0.236328125, "step": 971 }, { "epoch": 0.72, "grad_norm": 1.3165274858474731, "learning_rate": 2.8e-07, "logits/chosen": 0.96435546875, "logits/rejected": 1.6181640625, "logps/chosen": -33.375, "logps/rejected": -30.34375, "loss": 0.6021, "rewards/accuracies": 0.75, "rewards/chosen": 0.09027099609375, "rewards/margins": 0.243896484375, "rewards/rejected": -0.1536865234375, "step": 972 }, { "epoch": 0.7207407407407408, "grad_norm": 1.9418383836746216, "learning_rate": 2.7925925925925925e-07, "logits/chosen": 1.9052734375, "logits/rejected": 1.7197265625, "logps/chosen": -22.9375, "logps/rejected": -61.46875, "loss": 0.7427, "rewards/accuracies": 0.25, "rewards/chosen": -0.05816650390625, "rewards/margins": -0.08160400390625, "rewards/rejected": 0.0234375, "step": 973 }, { "epoch": 0.7214814814814815, "grad_norm": 1.9271938800811768, "learning_rate": 2.785185185185185e-07, "logits/chosen": 2.072265625, "logits/rejected": 1.2265625, "logps/chosen": -46.21875, "logps/rejected": -33.5625, "loss": 0.853, "rewards/accuracies": 0.25, "rewards/chosen": -0.1702880859375, "rewards/margins": -0.28759765625, "rewards/rejected": 0.1171875, "step": 974 }, { "epoch": 0.7222222222222222, "grad_norm": 1.2062174081802368, "learning_rate": 2.7777777777777776e-07, "logits/chosen": 1.2314453125, "logits/rejected": 1.275390625, "logps/chosen": -26.375, "logps/rejected": -21.34375, "loss": 0.7002, "rewards/accuracies": 0.25, "rewards/chosen": -0.081787109375, "rewards/margins": -0.0136566162109375, "rewards/rejected": -0.068115234375, "step": 975 }, { "epoch": 0.7229629629629629, "grad_norm": 1.9957544803619385, "learning_rate": 2.77037037037037e-07, "logits/chosen": 1.390625, "logits/rejected": 1.94921875, "logps/chosen": -33.875, "logps/rejected": -55.53125, "loss": 0.748, "rewards/accuracies": 0.5, "rewards/chosen": 0.017974853515625, "rewards/margins": -0.0906982421875, "rewards/rejected": 0.1087646484375, "step": 976 }, { "epoch": 0.7237037037037037, "grad_norm": 2.0710177421569824, "learning_rate": 2.762962962962963e-07, "logits/chosen": 1.78125, "logits/rejected": 2.123046875, "logps/chosen": -29.75, "logps/rejected": -49.8125, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": -0.03277587890625, "rewards/margins": 0.03314208984375, "rewards/rejected": -0.06597900390625, "step": 977 }, { "epoch": 0.7244444444444444, "grad_norm": 2.7810218334198, "learning_rate": 2.7555555555555555e-07, "logits/chosen": 1.60546875, "logits/rejected": 1.2470703125, "logps/chosen": -29.140625, "logps/rejected": -38.75, "loss": 0.6685, "rewards/accuracies": 0.75, "rewards/chosen": -0.04864501953125, "rewards/margins": 0.124755859375, "rewards/rejected": -0.1734619140625, "step": 978 }, { "epoch": 0.7251851851851852, "grad_norm": 2.234997272491455, "learning_rate": 2.748148148148148e-07, "logits/chosen": 1.1083984375, "logits/rejected": 1.1845703125, "logps/chosen": -38.40625, "logps/rejected": -47.59375, "loss": 0.6104, "rewards/accuracies": 1.0, "rewards/chosen": 0.0546875, "rewards/margins": 0.177734375, "rewards/rejected": -0.123046875, "step": 979 }, { "epoch": 0.725925925925926, "grad_norm": 1.9534730911254883, "learning_rate": 2.7407407407407406e-07, "logits/chosen": 1.5517578125, "logits/rejected": 2.201171875, "logps/chosen": -33.5625, "logps/rejected": -57.1875, "loss": 0.6865, "rewards/accuracies": 0.75, "rewards/chosen": 0.2366943359375, "rewards/margins": 0.093017578125, "rewards/rejected": 0.143798828125, "step": 980 }, { "epoch": 0.7266666666666667, "grad_norm": 2.518557071685791, "learning_rate": 2.733333333333333e-07, "logits/chosen": 1.6708984375, "logits/rejected": 1.837890625, "logps/chosen": -25.453125, "logps/rejected": -38.0625, "loss": 0.6616, "rewards/accuracies": 0.5, "rewards/chosen": 0.048431396484375, "rewards/margins": 0.07928466796875, "rewards/rejected": -0.030853271484375, "step": 981 }, { "epoch": 0.7274074074074074, "grad_norm": 2.0500993728637695, "learning_rate": 2.725925925925926e-07, "logits/chosen": 1.0947265625, "logits/rejected": 1.2392578125, "logps/chosen": -24.578125, "logps/rejected": -32.84375, "loss": 0.7222, "rewards/accuracies": 0.25, "rewards/chosen": 0.0099639892578125, "rewards/margins": -0.050567626953125, "rewards/rejected": 0.060546875, "step": 982 }, { "epoch": 0.7281481481481481, "grad_norm": 1.5316767692565918, "learning_rate": 2.7185185185185186e-07, "logits/chosen": 0.96533203125, "logits/rejected": 1.1083984375, "logps/chosen": -28.71875, "logps/rejected": -22.03125, "loss": 0.6987, "rewards/accuracies": 0.75, "rewards/chosen": 0.009735107421875, "rewards/margins": -0.00372314453125, "rewards/rejected": 0.01345062255859375, "step": 983 }, { "epoch": 0.7288888888888889, "grad_norm": 5.02484655380249, "learning_rate": 2.7111111111111114e-07, "logits/chosen": 1.5869140625, "logits/rejected": 1.716796875, "logps/chosen": -33.0625, "logps/rejected": -42.875, "loss": 1.001, "rewards/accuracies": 0.5, "rewards/chosen": 0.1064453125, "rewards/margins": -0.3212890625, "rewards/rejected": 0.427978515625, "step": 984 }, { "epoch": 0.7296296296296296, "grad_norm": 2.6260433197021484, "learning_rate": 2.7037037037037037e-07, "logits/chosen": 1.9541015625, "logits/rejected": 1.6435546875, "logps/chosen": -25.4375, "logps/rejected": -60.71875, "loss": 0.7603, "rewards/accuracies": 0.25, "rewards/chosen": -0.1234130859375, "rewards/margins": -0.123779296875, "rewards/rejected": 0.000392913818359375, "step": 985 }, { "epoch": 0.7303703703703703, "grad_norm": 10.327241897583008, "learning_rate": 2.696296296296296e-07, "logits/chosen": 1.2646484375, "logits/rejected": 1.8935546875, "logps/chosen": -27.671875, "logps/rejected": -42.90625, "loss": 0.7397, "rewards/accuracies": 0.25, "rewards/chosen": -0.05645751953125, "rewards/margins": -0.08184814453125, "rewards/rejected": 0.025390625, "step": 986 }, { "epoch": 0.7311111111111112, "grad_norm": 1.4957249164581299, "learning_rate": 2.688888888888889e-07, "logits/chosen": 2.228515625, "logits/rejected": 1.5302734375, "logps/chosen": -25.921875, "logps/rejected": -50.96875, "loss": 0.6592, "rewards/accuracies": 0.75, "rewards/chosen": 0.06683349609375, "rewards/margins": 0.07855224609375, "rewards/rejected": -0.01171875, "step": 987 }, { "epoch": 0.7318518518518519, "grad_norm": 1.6627143621444702, "learning_rate": 2.681481481481481e-07, "logits/chosen": 1.384765625, "logits/rejected": 0.982421875, "logps/chosen": -30.171875, "logps/rejected": -25.5, "loss": 0.8428, "rewards/accuracies": 0.0, "rewards/chosen": -0.2432861328125, "rewards/margins": -0.263671875, "rewards/rejected": 0.0203094482421875, "step": 988 }, { "epoch": 0.7325925925925926, "grad_norm": 2.4718034267425537, "learning_rate": 2.674074074074074e-07, "logits/chosen": 1.07421875, "logits/rejected": 1.7734375, "logps/chosen": -28.921875, "logps/rejected": -57.34375, "loss": 0.7759, "rewards/accuracies": 0.75, "rewards/chosen": 0.317626953125, "rewards/margins": -0.065185546875, "rewards/rejected": 0.3828125, "step": 989 }, { "epoch": 0.7333333333333333, "grad_norm": 2.1991684436798096, "learning_rate": 2.6666666666666667e-07, "logits/chosen": 1.2919921875, "logits/rejected": 1.1357421875, "logps/chosen": -29.84375, "logps/rejected": -45.3125, "loss": 0.7236, "rewards/accuracies": 0.25, "rewards/chosen": 0.290771484375, "rewards/margins": 0.045440673828125, "rewards/rejected": 0.2452392578125, "step": 990 }, { "epoch": 0.7340740740740741, "grad_norm": 1.5445107221603394, "learning_rate": 2.659259259259259e-07, "logits/chosen": 1.33203125, "logits/rejected": 1.2939453125, "logps/chosen": -24.359375, "logps/rejected": -46.46875, "loss": 0.7271, "rewards/accuracies": 0.5, "rewards/chosen": -0.034027099609375, "rewards/margins": -0.042236328125, "rewards/rejected": 0.0081939697265625, "step": 991 }, { "epoch": 0.7348148148148148, "grad_norm": 2.1904642581939697, "learning_rate": 2.651851851851852e-07, "logits/chosen": 1.3740234375, "logits/rejected": 1.3271484375, "logps/chosen": -38.8125, "logps/rejected": -37.4375, "loss": 0.75, "rewards/accuracies": 0.25, "rewards/chosen": -0.1484375, "rewards/margins": -0.103515625, "rewards/rejected": -0.044891357421875, "step": 992 }, { "epoch": 0.7355555555555555, "grad_norm": 1.6442524194717407, "learning_rate": 2.644444444444444e-07, "logits/chosen": 1.244140625, "logits/rejected": 1.990234375, "logps/chosen": -31.53125, "logps/rejected": -52.3125, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": -0.0230560302734375, "rewards/margins": 0.0019683837890625, "rewards/rejected": -0.0250396728515625, "step": 993 }, { "epoch": 0.7362962962962963, "grad_norm": 1.553065538406372, "learning_rate": 2.637037037037037e-07, "logits/chosen": 1.3466796875, "logits/rejected": 2.0, "logps/chosen": -35.625, "logps/rejected": -33.5, "loss": 0.7173, "rewards/accuracies": 0.25, "rewards/chosen": 0.06329345703125, "rewards/margins": -0.045318603515625, "rewards/rejected": 0.10858154296875, "step": 994 }, { "epoch": 0.737037037037037, "grad_norm": 1.8908278942108154, "learning_rate": 2.629629629629629e-07, "logits/chosen": 1.3583984375, "logits/rejected": 1.7646484375, "logps/chosen": -30.234375, "logps/rejected": -44.34375, "loss": 0.7822, "rewards/accuracies": 0.25, "rewards/chosen": -0.1168212890625, "rewards/margins": -0.150390625, "rewards/rejected": 0.033599853515625, "step": 995 }, { "epoch": 0.7377777777777778, "grad_norm": 3.4501399993896484, "learning_rate": 2.6222222222222226e-07, "logits/chosen": 0.68603515625, "logits/rejected": 1.423828125, "logps/chosen": -28.65625, "logps/rejected": -94.6875, "loss": 0.6377, "rewards/accuracies": 0.5, "rewards/chosen": 0.0894775390625, "rewards/margins": 0.141357421875, "rewards/rejected": -0.0518798828125, "step": 996 }, { "epoch": 0.7385185185185185, "grad_norm": 1.9451533555984497, "learning_rate": 2.614814814814815e-07, "logits/chosen": 2.771484375, "logits/rejected": 1.9462890625, "logps/chosen": -28.375, "logps/rejected": -58.53125, "loss": 0.605, "rewards/accuracies": 0.75, "rewards/chosen": 0.2449951171875, "rewards/margins": 0.22314453125, "rewards/rejected": 0.02191162109375, "step": 997 }, { "epoch": 0.7392592592592593, "grad_norm": 1.659032940864563, "learning_rate": 2.607407407407407e-07, "logits/chosen": 1.7822265625, "logits/rejected": 1.7900390625, "logps/chosen": -33.09375, "logps/rejected": -33.34375, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": -0.002716064453125, "rewards/margins": -0.01678466796875, "rewards/rejected": 0.0140838623046875, "step": 998 }, { "epoch": 0.74, "grad_norm": 1.395810604095459, "learning_rate": 2.6e-07, "logits/chosen": 0.64697265625, "logits/rejected": 1.3994140625, "logps/chosen": -24.375, "logps/rejected": -31.734375, "loss": 0.6538, "rewards/accuracies": 0.5, "rewards/chosen": -0.033599853515625, "rewards/margins": 0.115478515625, "rewards/rejected": -0.1490478515625, "step": 999 }, { "epoch": 0.7407407407407407, "grad_norm": 1.8654844760894775, "learning_rate": 2.5925925925925923e-07, "logits/chosen": 1.7314453125, "logits/rejected": 1.9833984375, "logps/chosen": -39.78125, "logps/rejected": -39.875, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": -0.092529296875, "rewards/margins": 0.009765625, "rewards/rejected": -0.102294921875, "step": 1000 }, { "epoch": 0.7414814814814815, "grad_norm": 2.2580008506774902, "learning_rate": 2.585185185185185e-07, "logits/chosen": 1.0634765625, "logits/rejected": 1.3349609375, "logps/chosen": -39.25, "logps/rejected": -50.0, "loss": 0.7217, "rewards/accuracies": 0.25, "rewards/chosen": 0.027740478515625, "rewards/margins": -0.04803466796875, "rewards/rejected": 0.0758056640625, "step": 1001 }, { "epoch": 0.7422222222222222, "grad_norm": 1.3463491201400757, "learning_rate": 2.5777777777777774e-07, "logits/chosen": 1.35546875, "logits/rejected": 1.275390625, "logps/chosen": -27.375, "logps/rejected": -24.484375, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": 0.0609130859375, "rewards/margins": 0.03656005859375, "rewards/rejected": 0.024383544921875, "step": 1002 }, { "epoch": 0.7429629629629629, "grad_norm": 2.1129953861236572, "learning_rate": 2.570370370370371e-07, "logits/chosen": 1.255859375, "logits/rejected": 1.6064453125, "logps/chosen": -26.5, "logps/rejected": -61.0625, "loss": 0.9053, "rewards/accuracies": 0.0, "rewards/chosen": -0.0953369140625, "rewards/margins": -0.370361328125, "rewards/rejected": 0.27490234375, "step": 1003 }, { "epoch": 0.7437037037037038, "grad_norm": 4.826375961303711, "learning_rate": 2.562962962962963e-07, "logits/chosen": 1.43359375, "logits/rejected": 1.65625, "logps/chosen": -39.59375, "logps/rejected": -84.875, "loss": 0.9824, "rewards/accuracies": 0.25, "rewards/chosen": -0.0019378662109375, "rewards/margins": -0.4140625, "rewards/rejected": 0.412109375, "step": 1004 }, { "epoch": 0.7444444444444445, "grad_norm": 1.9931447505950928, "learning_rate": 2.5555555555555553e-07, "logits/chosen": 1.5869140625, "logits/rejected": 1.4560546875, "logps/chosen": -23.828125, "logps/rejected": -68.875, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": 0.068359375, "rewards/margins": 0.014801025390625, "rewards/rejected": 0.053497314453125, "step": 1005 }, { "epoch": 0.7451851851851852, "grad_norm": 2.1166584491729736, "learning_rate": 2.548148148148148e-07, "logits/chosen": 1.5048828125, "logits/rejected": 1.9677734375, "logps/chosen": -50.4375, "logps/rejected": -37.59375, "loss": 0.6357, "rewards/accuracies": 0.5, "rewards/chosen": 0.016021728515625, "rewards/margins": 0.1500244140625, "rewards/rejected": -0.1339111328125, "step": 1006 }, { "epoch": 0.7459259259259259, "grad_norm": 2.058443546295166, "learning_rate": 2.5407407407407404e-07, "logits/chosen": 1.48828125, "logits/rejected": 1.7421875, "logps/chosen": -27.15625, "logps/rejected": -30.71875, "loss": 0.7148, "rewards/accuracies": 0.25, "rewards/chosen": -0.1300048828125, "rewards/margins": -0.04217529296875, "rewards/rejected": -0.087890625, "step": 1007 }, { "epoch": 0.7466666666666667, "grad_norm": 2.3463492393493652, "learning_rate": 2.533333333333333e-07, "logits/chosen": 1.2890625, "logits/rejected": 2.158203125, "logps/chosen": -30.171875, "logps/rejected": -68.9375, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": -0.018768310546875, "rewards/margins": -0.11895751953125, "rewards/rejected": 0.1002197265625, "step": 1008 }, { "epoch": 0.7474074074074074, "grad_norm": 1.580915927886963, "learning_rate": 2.5259259259259255e-07, "logits/chosen": 1.716796875, "logits/rejected": 1.3369140625, "logps/chosen": -34.46875, "logps/rejected": -33.25, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": 0.043701171875, "rewards/margins": 0.1605224609375, "rewards/rejected": -0.1168212890625, "step": 1009 }, { "epoch": 0.7481481481481481, "grad_norm": 3.9784023761749268, "learning_rate": 2.5185185185185184e-07, "logits/chosen": 1.9033203125, "logits/rejected": 1.1767578125, "logps/chosen": -40.40625, "logps/rejected": -35.9375, "loss": 0.7412, "rewards/accuracies": 0.25, "rewards/chosen": -0.07244873046875, "rewards/margins": -0.0745849609375, "rewards/rejected": 0.00214385986328125, "step": 1010 }, { "epoch": 0.7488888888888889, "grad_norm": 1.3903050422668457, "learning_rate": 2.511111111111111e-07, "logits/chosen": 1.87890625, "logits/rejected": 1.677734375, "logps/chosen": -24.6875, "logps/rejected": -36.625, "loss": 0.6909, "rewards/accuracies": 0.75, "rewards/chosen": 0.06304931640625, "rewards/margins": 0.010101318359375, "rewards/rejected": 0.052978515625, "step": 1011 }, { "epoch": 0.7496296296296296, "grad_norm": 2.298724412918091, "learning_rate": 2.5037037037037035e-07, "logits/chosen": 1.6455078125, "logits/rejected": 2.1484375, "logps/chosen": -40.625, "logps/rejected": -67.625, "loss": 0.5635, "rewards/accuracies": 0.75, "rewards/chosen": -0.020294189453125, "rewards/margins": 0.298583984375, "rewards/rejected": -0.31884765625, "step": 1012 }, { "epoch": 0.7503703703703704, "grad_norm": 2.6452434062957764, "learning_rate": 2.4962962962962963e-07, "logits/chosen": 1.388671875, "logits/rejected": 1.6025390625, "logps/chosen": -33.90625, "logps/rejected": -29.6875, "loss": 0.8267, "rewards/accuracies": 0.75, "rewards/chosen": 0.062103271484375, "rewards/margins": -0.2056884765625, "rewards/rejected": 0.267822265625, "step": 1013 }, { "epoch": 0.7511111111111111, "grad_norm": 1.5204989910125732, "learning_rate": 2.4888888888888886e-07, "logits/chosen": 1.1884765625, "logits/rejected": 1.572265625, "logps/chosen": -30.546875, "logps/rejected": -57.5625, "loss": 0.6929, "rewards/accuracies": 0.25, "rewards/chosen": 0.0882568359375, "rewards/margins": 0.0093536376953125, "rewards/rejected": 0.07891845703125, "step": 1014 }, { "epoch": 0.7518518518518519, "grad_norm": 1.5848608016967773, "learning_rate": 2.4814814814814814e-07, "logits/chosen": 1.154296875, "logits/rejected": 1.0126953125, "logps/chosen": -20.03125, "logps/rejected": -22.1875, "loss": 0.6914, "rewards/accuracies": 0.25, "rewards/chosen": -0.0523681640625, "rewards/margins": 0.0426025390625, "rewards/rejected": -0.09490966796875, "step": 1015 }, { "epoch": 0.7525925925925926, "grad_norm": 1.880411148071289, "learning_rate": 2.474074074074074e-07, "logits/chosen": 1.7109375, "logits/rejected": 2.091796875, "logps/chosen": -37.6875, "logps/rejected": -34.5625, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.2086181640625, "rewards/margins": 0.006744384765625, "rewards/rejected": 0.201904296875, "step": 1016 }, { "epoch": 0.7533333333333333, "grad_norm": 4.854467868804932, "learning_rate": 2.4666666666666665e-07, "logits/chosen": 1.8330078125, "logits/rejected": 1.8408203125, "logps/chosen": -32.875, "logps/rejected": -41.4375, "loss": 0.8228, "rewards/accuracies": 0.0, "rewards/chosen": -0.17431640625, "rewards/margins": -0.23681640625, "rewards/rejected": 0.0625, "step": 1017 }, { "epoch": 0.7540740740740741, "grad_norm": 2.7598111629486084, "learning_rate": 2.4592592592592593e-07, "logits/chosen": 1.6748046875, "logits/rejected": 1.685546875, "logps/chosen": -46.3125, "logps/rejected": -41.3125, "loss": 0.7725, "rewards/accuracies": 0.25, "rewards/chosen": -0.32568359375, "rewards/margins": -0.11553955078125, "rewards/rejected": -0.210205078125, "step": 1018 }, { "epoch": 0.7548148148148148, "grad_norm": 1.5368813276290894, "learning_rate": 2.4518518518518516e-07, "logits/chosen": 1.9541015625, "logits/rejected": 2.146484375, "logps/chosen": -32.53125, "logps/rejected": -25.921875, "loss": 0.6646, "rewards/accuracies": 0.5, "rewards/chosen": 0.2491455078125, "rewards/margins": 0.0726318359375, "rewards/rejected": 0.176513671875, "step": 1019 }, { "epoch": 0.7555555555555555, "grad_norm": 2.098513603210449, "learning_rate": 2.4444444444444445e-07, "logits/chosen": 1.7177734375, "logits/rejected": 1.822265625, "logps/chosen": -25.84375, "logps/rejected": -56.75, "loss": 0.7656, "rewards/accuracies": 0.5, "rewards/chosen": 0.0364990234375, "rewards/margins": -0.07635498046875, "rewards/rejected": 0.11285400390625, "step": 1020 }, { "epoch": 0.7562962962962962, "grad_norm": 1.9605960845947266, "learning_rate": 2.437037037037037e-07, "logits/chosen": 1.96875, "logits/rejected": 1.927734375, "logps/chosen": -33.9375, "logps/rejected": -53.75, "loss": 0.7432, "rewards/accuracies": 0.5, "rewards/chosen": -0.03399658203125, "rewards/margins": -0.068359375, "rewards/rejected": 0.03436279296875, "step": 1021 }, { "epoch": 0.7570370370370371, "grad_norm": 1.7460365295410156, "learning_rate": 2.4296296296296296e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.4638671875, "logps/chosen": -23.140625, "logps/rejected": -35.78125, "loss": 0.8555, "rewards/accuracies": 0.0, "rewards/chosen": -0.1031494140625, "rewards/margins": -0.286376953125, "rewards/rejected": 0.18310546875, "step": 1022 }, { "epoch": 0.7577777777777778, "grad_norm": 1.7372430562973022, "learning_rate": 2.4222222222222224e-07, "logits/chosen": 1.14453125, "logits/rejected": 1.255859375, "logps/chosen": -34.75, "logps/rejected": -43.78125, "loss": 0.7119, "rewards/accuracies": 0.5, "rewards/chosen": -0.0914306640625, "rewards/margins": -0.028106689453125, "rewards/rejected": -0.0633544921875, "step": 1023 }, { "epoch": 0.7585185185185185, "grad_norm": 1.8055509328842163, "learning_rate": 2.4148148148148147e-07, "logits/chosen": 1.4453125, "logits/rejected": 2.1328125, "logps/chosen": -34.9375, "logps/rejected": -45.84375, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": 0.019134521484375, "rewards/margins": 0.034698486328125, "rewards/rejected": -0.015625, "step": 1024 }, { "epoch": 0.7592592592592593, "grad_norm": 2.6581201553344727, "learning_rate": 2.407407407407407e-07, "logits/chosen": 1.900390625, "logits/rejected": 2.2890625, "logps/chosen": -21.0, "logps/rejected": -77.125, "loss": 0.6299, "rewards/accuracies": 0.75, "rewards/chosen": 0.0172119140625, "rewards/margins": 0.1390380859375, "rewards/rejected": -0.12188720703125, "step": 1025 }, { "epoch": 0.76, "grad_norm": 1.3157415390014648, "learning_rate": 2.4e-07, "logits/chosen": 1.275390625, "logits/rejected": 1.7685546875, "logps/chosen": -28.0625, "logps/rejected": -30.375, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": 0.07537841796875, "rewards/margins": 0.17578125, "rewards/rejected": -0.1004638671875, "step": 1026 }, { "epoch": 0.7607407407407407, "grad_norm": 1.6472811698913574, "learning_rate": 2.3925925925925926e-07, "logits/chosen": 1.8271484375, "logits/rejected": 0.97265625, "logps/chosen": -32.59375, "logps/rejected": -34.875, "loss": 0.7192, "rewards/accuracies": 0.5, "rewards/chosen": 0.01015472412109375, "rewards/margins": -0.039825439453125, "rewards/rejected": 0.04998779296875, "step": 1027 }, { "epoch": 0.7614814814814815, "grad_norm": 1.3338083028793335, "learning_rate": 2.385185185185185e-07, "logits/chosen": 1.72265625, "logits/rejected": 1.970703125, "logps/chosen": -42.1875, "logps/rejected": -26.0625, "loss": 0.6611, "rewards/accuracies": 0.5, "rewards/chosen": 0.1434326171875, "rewards/margins": 0.109375, "rewards/rejected": 0.034027099609375, "step": 1028 }, { "epoch": 0.7622222222222222, "grad_norm": 1.5228089094161987, "learning_rate": 2.3777777777777777e-07, "logits/chosen": 1.822265625, "logits/rejected": 2.091796875, "logps/chosen": -34.84375, "logps/rejected": -33.8125, "loss": 0.7939, "rewards/accuracies": 0.0, "rewards/chosen": -0.056671142578125, "rewards/margins": -0.18798828125, "rewards/rejected": 0.1312255859375, "step": 1029 }, { "epoch": 0.762962962962963, "grad_norm": 2.5455739498138428, "learning_rate": 2.3703703703703703e-07, "logits/chosen": 1.171875, "logits/rejected": 1.4580078125, "logps/chosen": -40.5, "logps/rejected": -44.15625, "loss": 0.5264, "rewards/accuracies": 1.0, "rewards/chosen": 0.06719970703125, "rewards/margins": 0.373291015625, "rewards/rejected": -0.30615234375, "step": 1030 }, { "epoch": 0.7637037037037037, "grad_norm": 1.7623727321624756, "learning_rate": 2.362962962962963e-07, "logits/chosen": 0.923828125, "logits/rejected": 1.80859375, "logps/chosen": -23.109375, "logps/rejected": -55.25, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.051177978515625, "rewards/margins": 0.029693603515625, "rewards/rejected": 0.0214996337890625, "step": 1031 }, { "epoch": 0.7644444444444445, "grad_norm": 1.645853877067566, "learning_rate": 2.3555555555555554e-07, "logits/chosen": 1.57421875, "logits/rejected": 1.1572265625, "logps/chosen": -30.71875, "logps/rejected": -35.5625, "loss": 0.8301, "rewards/accuracies": 0.25, "rewards/chosen": -0.099609375, "rewards/margins": -0.242919921875, "rewards/rejected": 0.143310546875, "step": 1032 }, { "epoch": 0.7651851851851852, "grad_norm": 2.435769557952881, "learning_rate": 2.348148148148148e-07, "logits/chosen": 1.4365234375, "logits/rejected": 1.3046875, "logps/chosen": -31.875, "logps/rejected": -39.1875, "loss": 0.7397, "rewards/accuracies": 0.5, "rewards/chosen": -0.0771484375, "rewards/margins": -0.086181640625, "rewards/rejected": 0.00897216796875, "step": 1033 }, { "epoch": 0.7659259259259259, "grad_norm": 1.6701709032058716, "learning_rate": 2.3407407407407405e-07, "logits/chosen": 1.708984375, "logits/rejected": 1.435546875, "logps/chosen": -27.40625, "logps/rejected": -75.625, "loss": 0.6387, "rewards/accuracies": 0.75, "rewards/chosen": 0.1038818359375, "rewards/margins": 0.14208984375, "rewards/rejected": -0.038299560546875, "step": 1034 }, { "epoch": 0.7666666666666667, "grad_norm": 1.75874662399292, "learning_rate": 2.3333333333333333e-07, "logits/chosen": 1.515625, "logits/rejected": 1.7138671875, "logps/chosen": -37.46875, "logps/rejected": -53.78125, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.039093017578125, "rewards/margins": 0.12347412109375, "rewards/rejected": -0.0843505859375, "step": 1035 }, { "epoch": 0.7674074074074074, "grad_norm": 4.345081329345703, "learning_rate": 2.325925925925926e-07, "logits/chosen": 1.6435546875, "logits/rejected": 1.6943359375, "logps/chosen": -19.75, "logps/rejected": -52.78125, "loss": 0.812, "rewards/accuracies": 0.5, "rewards/chosen": 0.1644287109375, "rewards/margins": -0.099365234375, "rewards/rejected": 0.263671875, "step": 1036 }, { "epoch": 0.7681481481481481, "grad_norm": 1.6978340148925781, "learning_rate": 2.3185185185185184e-07, "logits/chosen": 1.50390625, "logits/rejected": 1.521484375, "logps/chosen": -26.421875, "logps/rejected": -30.359375, "loss": 0.6191, "rewards/accuracies": 0.75, "rewards/chosen": 0.126220703125, "rewards/margins": 0.158203125, "rewards/rejected": -0.03204345703125, "step": 1037 }, { "epoch": 0.7688888888888888, "grad_norm": 2.335965156555176, "learning_rate": 2.311111111111111e-07, "logits/chosen": 1.0556640625, "logits/rejected": 1.9248046875, "logps/chosen": -27.25, "logps/rejected": -40.25, "loss": 0.668, "rewards/accuracies": 0.5, "rewards/chosen": -0.06719970703125, "rewards/margins": 0.064697265625, "rewards/rejected": -0.1319580078125, "step": 1038 }, { "epoch": 0.7696296296296297, "grad_norm": 1.9121147394180298, "learning_rate": 2.3037037037037035e-07, "logits/chosen": 1.5634765625, "logits/rejected": 1.6748046875, "logps/chosen": -25.875, "logps/rejected": -36.46875, "loss": 0.8174, "rewards/accuracies": 0.25, "rewards/chosen": -0.2318115234375, "rewards/margins": -0.22607421875, "rewards/rejected": -0.005828857421875, "step": 1039 }, { "epoch": 0.7703703703703704, "grad_norm": 1.7327617406845093, "learning_rate": 2.296296296296296e-07, "logits/chosen": 1.021484375, "logits/rejected": 1.486328125, "logps/chosen": -33.53125, "logps/rejected": -42.125, "loss": 0.7686, "rewards/accuracies": 0.0, "rewards/chosen": -0.00116729736328125, "rewards/margins": -0.1409912109375, "rewards/rejected": 0.139892578125, "step": 1040 }, { "epoch": 0.7711111111111111, "grad_norm": 1.3777400255203247, "learning_rate": 2.288888888888889e-07, "logits/chosen": 2.05859375, "logits/rejected": 1.6123046875, "logps/chosen": -39.53125, "logps/rejected": -33.34375, "loss": 0.5942, "rewards/accuracies": 0.5, "rewards/chosen": 0.163330078125, "rewards/margins": 0.2269287109375, "rewards/rejected": -0.06365966796875, "step": 1041 }, { "epoch": 0.7718518518518519, "grad_norm": 1.4900598526000977, "learning_rate": 2.2814814814814815e-07, "logits/chosen": 2.24609375, "logits/rejected": 2.212890625, "logps/chosen": -29.78125, "logps/rejected": -26.6875, "loss": 0.6406, "rewards/accuracies": 0.75, "rewards/chosen": 0.060150146484375, "rewards/margins": 0.116455078125, "rewards/rejected": -0.0562744140625, "step": 1042 }, { "epoch": 0.7725925925925926, "grad_norm": 2.2438442707061768, "learning_rate": 2.274074074074074e-07, "logits/chosen": 1.1259765625, "logits/rejected": 1.599609375, "logps/chosen": -25.828125, "logps/rejected": -46.625, "loss": 0.8174, "rewards/accuracies": 0.25, "rewards/chosen": -0.00467681884765625, "rewards/margins": -0.227294921875, "rewards/rejected": 0.22265625, "step": 1043 }, { "epoch": 0.7733333333333333, "grad_norm": 1.8918286561965942, "learning_rate": 2.2666666666666663e-07, "logits/chosen": 1.9365234375, "logits/rejected": 1.7236328125, "logps/chosen": -21.328125, "logps/rejected": -37.625, "loss": 0.8032, "rewards/accuracies": 0.25, "rewards/chosen": -0.119140625, "rewards/margins": -0.1922607421875, "rewards/rejected": 0.07305908203125, "step": 1044 }, { "epoch": 0.774074074074074, "grad_norm": 2.9135122299194336, "learning_rate": 2.2592592592592591e-07, "logits/chosen": 2.154296875, "logits/rejected": 2.412109375, "logps/chosen": -37.75, "logps/rejected": -70.25, "loss": 0.9463, "rewards/accuracies": 0.25, "rewards/chosen": -0.09100341796875, "rewards/margins": -0.218994140625, "rewards/rejected": 0.127685546875, "step": 1045 }, { "epoch": 0.7748148148148148, "grad_norm": 2.07027530670166, "learning_rate": 2.2518518518518517e-07, "logits/chosen": 1.4072265625, "logits/rejected": 1.21484375, "logps/chosen": -45.09375, "logps/rejected": -69.75, "loss": 0.7334, "rewards/accuracies": 0.5, "rewards/chosen": -0.0250244140625, "rewards/margins": -0.053924560546875, "rewards/rejected": 0.028900146484375, "step": 1046 }, { "epoch": 0.7755555555555556, "grad_norm": 2.99290132522583, "learning_rate": 2.2444444444444442e-07, "logits/chosen": 1.984375, "logits/rejected": 1.412109375, "logps/chosen": -33.5625, "logps/rejected": -50.46875, "loss": 0.7139, "rewards/accuracies": 0.5, "rewards/chosen": -0.020721435546875, "rewards/margins": -0.029296875, "rewards/rejected": 0.00859832763671875, "step": 1047 }, { "epoch": 0.7762962962962963, "grad_norm": 1.5196453332901, "learning_rate": 2.237037037037037e-07, "logits/chosen": 1.2587890625, "logits/rejected": 2.103515625, "logps/chosen": -33.375, "logps/rejected": -44.34375, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 0.060150146484375, "rewards/margins": 0.252685546875, "rewards/rejected": -0.192626953125, "step": 1048 }, { "epoch": 0.7770370370370371, "grad_norm": 1.930935025215149, "learning_rate": 2.2296296296296296e-07, "logits/chosen": 1.1435546875, "logits/rejected": 1.46484375, "logps/chosen": -46.375, "logps/rejected": -55.78125, "loss": 0.6201, "rewards/accuracies": 0.75, "rewards/chosen": 0.09686279296875, "rewards/margins": 0.1729736328125, "rewards/rejected": -0.076171875, "step": 1049 }, { "epoch": 0.7777777777777778, "grad_norm": 1.6319860219955444, "learning_rate": 2.222222222222222e-07, "logits/chosen": 0.81103515625, "logits/rejected": 1.0302734375, "logps/chosen": -28.71875, "logps/rejected": -30.15625, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": -0.00505828857421875, "rewards/margins": -0.0203094482421875, "rewards/rejected": 0.0152435302734375, "step": 1050 }, { "epoch": 0.7785185185185185, "grad_norm": 2.1886024475097656, "learning_rate": 2.2148148148148147e-07, "logits/chosen": 1.7041015625, "logits/rejected": 1.875, "logps/chosen": -26.46875, "logps/rejected": -50.3125, "loss": 0.6797, "rewards/accuracies": 0.5, "rewards/chosen": 0.0004119873046875, "rewards/margins": 0.05474853515625, "rewards/rejected": -0.05426025390625, "step": 1051 }, { "epoch": 0.7792592592592592, "grad_norm": 2.0249106884002686, "learning_rate": 2.2074074074074073e-07, "logits/chosen": 1.4287109375, "logits/rejected": 1.6298828125, "logps/chosen": -26.078125, "logps/rejected": -32.15625, "loss": 0.7417, "rewards/accuracies": 0.25, "rewards/chosen": -0.035186767578125, "rewards/margins": -0.08990478515625, "rewards/rejected": 0.0546875, "step": 1052 }, { "epoch": 0.78, "grad_norm": 1.834521770477295, "learning_rate": 2.1999999999999998e-07, "logits/chosen": 1.724609375, "logits/rejected": 1.46875, "logps/chosen": -31.734375, "logps/rejected": -29.09375, "loss": 0.9976, "rewards/accuracies": 0.5, "rewards/chosen": 0.01953125, "rewards/margins": -0.428466796875, "rewards/rejected": 0.447998046875, "step": 1053 }, { "epoch": 0.7807407407407407, "grad_norm": 1.3174036741256714, "learning_rate": 2.1925925925925927e-07, "logits/chosen": 1.3984375, "logits/rejected": 1.2744140625, "logps/chosen": -39.28125, "logps/rejected": -47.46875, "loss": 0.5747, "rewards/accuracies": 0.75, "rewards/chosen": 0.04449462890625, "rewards/margins": 0.26416015625, "rewards/rejected": -0.2197265625, "step": 1054 }, { "epoch": 0.7814814814814814, "grad_norm": 1.9745562076568604, "learning_rate": 2.1851851851851852e-07, "logits/chosen": 1.3408203125, "logits/rejected": 1.5234375, "logps/chosen": -31.96875, "logps/rejected": -74.5, "loss": 0.5405, "rewards/accuracies": 0.5, "rewards/chosen": -0.11212158203125, "rewards/margins": 0.56298828125, "rewards/rejected": -0.6748046875, "step": 1055 }, { "epoch": 0.7822222222222223, "grad_norm": 2.0671870708465576, "learning_rate": 2.1777777777777775e-07, "logits/chosen": 1.4921875, "logits/rejected": 2.609375, "logps/chosen": -36.90625, "logps/rejected": -56.625, "loss": 0.3877, "rewards/accuracies": 1.0, "rewards/chosen": 0.056640625, "rewards/margins": 1.1513671875, "rewards/rejected": -1.095703125, "step": 1056 }, { "epoch": 0.782962962962963, "grad_norm": 2.06219220161438, "learning_rate": 2.17037037037037e-07, "logits/chosen": 1.982421875, "logits/rejected": 1.9052734375, "logps/chosen": -36.4375, "logps/rejected": -45.3125, "loss": 0.6758, "rewards/accuracies": 0.75, "rewards/chosen": 0.1322021484375, "rewards/margins": 0.05255126953125, "rewards/rejected": 0.07965087890625, "step": 1057 }, { "epoch": 0.7837037037037037, "grad_norm": 4.772838115692139, "learning_rate": 2.162962962962963e-07, "logits/chosen": 1.857421875, "logits/rejected": 1.23828125, "logps/chosen": -34.4375, "logps/rejected": -54.3125, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": 0.159423828125, "rewards/margins": 0.0377197265625, "rewards/rejected": 0.1217041015625, "step": 1058 }, { "epoch": 0.7844444444444445, "grad_norm": 3.577005624771118, "learning_rate": 2.1555555555555554e-07, "logits/chosen": 1.669921875, "logits/rejected": 1.4189453125, "logps/chosen": -31.09375, "logps/rejected": -43.03125, "loss": 0.8809, "rewards/accuracies": 0.25, "rewards/chosen": -0.294189453125, "rewards/margins": -0.271484375, "rewards/rejected": -0.022674560546875, "step": 1059 }, { "epoch": 0.7851851851851852, "grad_norm": 1.776997685432434, "learning_rate": 2.148148148148148e-07, "logits/chosen": 1.7568359375, "logits/rejected": 1.6513671875, "logps/chosen": -29.8125, "logps/rejected": -65.3125, "loss": 0.668, "rewards/accuracies": 0.5, "rewards/chosen": 0.001560211181640625, "rewards/margins": 0.07147216796875, "rewards/rejected": -0.0699462890625, "step": 1060 }, { "epoch": 0.7859259259259259, "grad_norm": 1.4095962047576904, "learning_rate": 2.1407407407407408e-07, "logits/chosen": 1.2978515625, "logits/rejected": 1.4853515625, "logps/chosen": -21.59375, "logps/rejected": -35.6875, "loss": 0.6899, "rewards/accuracies": 0.25, "rewards/chosen": 0.014434814453125, "rewards/margins": 0.0120849609375, "rewards/rejected": 0.002349853515625, "step": 1061 }, { "epoch": 0.7866666666666666, "grad_norm": 1.6514836549758911, "learning_rate": 2.1333333333333334e-07, "logits/chosen": 1.7802734375, "logits/rejected": 1.75390625, "logps/chosen": -31.546875, "logps/rejected": -56.53125, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": 0.1868896484375, "rewards/margins": 0.154052734375, "rewards/rejected": 0.032806396484375, "step": 1062 }, { "epoch": 0.7874074074074074, "grad_norm": 2.606109857559204, "learning_rate": 2.1259259259259257e-07, "logits/chosen": 0.849609375, "logits/rejected": 1.61328125, "logps/chosen": -48.15625, "logps/rejected": -49.03125, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": 0.1514892578125, "rewards/margins": 0.1927490234375, "rewards/rejected": -0.04119873046875, "step": 1063 }, { "epoch": 0.7881481481481482, "grad_norm": 1.5183436870574951, "learning_rate": 2.1185185185185185e-07, "logits/chosen": 0.9111328125, "logits/rejected": 1.572265625, "logps/chosen": -24.265625, "logps/rejected": -27.8125, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": -0.054290771484375, "rewards/margins": -0.00469207763671875, "rewards/rejected": -0.04962158203125, "step": 1064 }, { "epoch": 0.7888888888888889, "grad_norm": 3.3643014430999756, "learning_rate": 2.111111111111111e-07, "logits/chosen": 1.91015625, "logits/rejected": 1.564453125, "logps/chosen": -20.296875, "logps/rejected": -46.5625, "loss": 0.7559, "rewards/accuracies": 0.5, "rewards/chosen": 0.08612060546875, "rewards/margins": -0.11505126953125, "rewards/rejected": 0.201171875, "step": 1065 }, { "epoch": 0.7896296296296297, "grad_norm": 1.8740826845169067, "learning_rate": 2.1037037037037036e-07, "logits/chosen": 2.03125, "logits/rejected": 1.9814453125, "logps/chosen": -38.21875, "logps/rejected": -65.1875, "loss": 0.583, "rewards/accuracies": 1.0, "rewards/chosen": 0.04998779296875, "rewards/margins": 0.2425537109375, "rewards/rejected": -0.1925048828125, "step": 1066 }, { "epoch": 0.7903703703703704, "grad_norm": 1.7087851762771606, "learning_rate": 2.0962962962962962e-07, "logits/chosen": 1.0791015625, "logits/rejected": 1.2138671875, "logps/chosen": -23.453125, "logps/rejected": -67.875, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009613037109375, "rewards/margins": 0.405517578125, "rewards/rejected": -0.406494140625, "step": 1067 }, { "epoch": 0.7911111111111111, "grad_norm": 1.5483951568603516, "learning_rate": 2.088888888888889e-07, "logits/chosen": 1.1533203125, "logits/rejected": 1.96875, "logps/chosen": -24.921875, "logps/rejected": -47.15625, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 0.034759521484375, "rewards/margins": 0.147705078125, "rewards/rejected": -0.1129150390625, "step": 1068 }, { "epoch": 0.7918518518518518, "grad_norm": 3.027921676635742, "learning_rate": 2.0814814814814813e-07, "logits/chosen": 2.30859375, "logits/rejected": 1.8330078125, "logps/chosen": -28.828125, "logps/rejected": -74.1875, "loss": 0.7749, "rewards/accuracies": 0.5, "rewards/chosen": -0.166015625, "rewards/margins": -0.0777587890625, "rewards/rejected": -0.0882568359375, "step": 1069 }, { "epoch": 0.7925925925925926, "grad_norm": 2.2611169815063477, "learning_rate": 2.0740740740740738e-07, "logits/chosen": 1.583984375, "logits/rejected": 1.8525390625, "logps/chosen": -34.75, "logps/rejected": -43.84375, "loss": 0.6641, "rewards/accuracies": 0.75, "rewards/chosen": 0.06048583984375, "rewards/margins": 0.10418701171875, "rewards/rejected": -0.043731689453125, "step": 1070 }, { "epoch": 0.7933333333333333, "grad_norm": 2.0921621322631836, "learning_rate": 2.0666666666666666e-07, "logits/chosen": 1.5751953125, "logits/rejected": 1.599609375, "logps/chosen": -31.125, "logps/rejected": -28.75, "loss": 0.7383, "rewards/accuracies": 0.75, "rewards/chosen": -0.0928955078125, "rewards/margins": -0.023284912109375, "rewards/rejected": -0.0697021484375, "step": 1071 }, { "epoch": 0.794074074074074, "grad_norm": 2.8622045516967773, "learning_rate": 2.0592592592592592e-07, "logits/chosen": 1.1181640625, "logits/rejected": 1.6181640625, "logps/chosen": -20.09375, "logps/rejected": -75.5, "loss": 0.8506, "rewards/accuracies": 0.25, "rewards/chosen": 0.03125, "rewards/margins": -0.27490234375, "rewards/rejected": 0.30615234375, "step": 1072 }, { "epoch": 0.7948148148148149, "grad_norm": 2.4575083255767822, "learning_rate": 2.0518518518518518e-07, "logits/chosen": 1.8896484375, "logits/rejected": 1.6005859375, "logps/chosen": -40.78125, "logps/rejected": -51.0, "loss": 0.8506, "rewards/accuracies": 0.5, "rewards/chosen": -0.0875244140625, "rewards/margins": -0.263916015625, "rewards/rejected": 0.1763916015625, "step": 1073 }, { "epoch": 0.7955555555555556, "grad_norm": 2.3810274600982666, "learning_rate": 2.0444444444444446e-07, "logits/chosen": 0.98974609375, "logits/rejected": 2.306640625, "logps/chosen": -22.4375, "logps/rejected": -63.09375, "loss": 0.5103, "rewards/accuracies": 1.0, "rewards/chosen": 0.11602783203125, "rewards/margins": 0.5146484375, "rewards/rejected": -0.3984375, "step": 1074 }, { "epoch": 0.7962962962962963, "grad_norm": 1.9645600318908691, "learning_rate": 2.0370370370370369e-07, "logits/chosen": 1.4248046875, "logits/rejected": 1.9521484375, "logps/chosen": -22.0, "logps/rejected": -53.5625, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": -0.015625, "rewards/margins": -0.017578125, "rewards/rejected": 0.00196075439453125, "step": 1075 }, { "epoch": 0.797037037037037, "grad_norm": 1.6753545999526978, "learning_rate": 2.0296296296296294e-07, "logits/chosen": 1.412109375, "logits/rejected": 2.146484375, "logps/chosen": -25.078125, "logps/rejected": -65.625, "loss": 0.6787, "rewards/accuracies": 0.5, "rewards/chosen": -0.0648193359375, "rewards/margins": 0.046234130859375, "rewards/rejected": -0.11114501953125, "step": 1076 }, { "epoch": 0.7977777777777778, "grad_norm": 1.7800359725952148, "learning_rate": 2.022222222222222e-07, "logits/chosen": 2.34765625, "logits/rejected": 1.396484375, "logps/chosen": -33.5, "logps/rejected": -26.859375, "loss": 0.7246, "rewards/accuracies": 0.5, "rewards/chosen": 0.05670166015625, "rewards/margins": -0.04229736328125, "rewards/rejected": 0.0989990234375, "step": 1077 }, { "epoch": 0.7985185185185185, "grad_norm": 1.7000658512115479, "learning_rate": 2.0148148148148148e-07, "logits/chosen": 1.7451171875, "logits/rejected": 1.3876953125, "logps/chosen": -26.046875, "logps/rejected": -22.296875, "loss": 0.7783, "rewards/accuracies": 0.0, "rewards/chosen": -0.1566162109375, "rewards/margins": -0.1591796875, "rewards/rejected": 0.002532958984375, "step": 1078 }, { "epoch": 0.7992592592592592, "grad_norm": 1.4804857969284058, "learning_rate": 2.0074074074074074e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.6630859375, "logps/chosen": -18.15625, "logps/rejected": -49.53125, "loss": 0.6699, "rewards/accuracies": 0.25, "rewards/chosen": 0.1363525390625, "rewards/margins": 0.08154296875, "rewards/rejected": 0.0546875, "step": 1079 }, { "epoch": 0.8, "grad_norm": 18.626602172851562, "learning_rate": 2e-07, "logits/chosen": 1.8681640625, "logits/rejected": 1.68359375, "logps/chosen": -31.046875, "logps/rejected": -50.5, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": 0.0941162109375, "rewards/margins": 0.099609375, "rewards/rejected": -0.005474090576171875, "step": 1080 }, { "epoch": 0.8007407407407408, "grad_norm": 1.6188570261001587, "learning_rate": 1.9925925925925925e-07, "logits/chosen": 2.599609375, "logits/rejected": 1.333984375, "logps/chosen": -23.671875, "logps/rejected": -27.1875, "loss": 0.7212, "rewards/accuracies": 0.5, "rewards/chosen": 0.0297088623046875, "rewards/margins": -0.041748046875, "rewards/rejected": 0.07147216796875, "step": 1081 }, { "epoch": 0.8014814814814815, "grad_norm": 1.5158840417861938, "learning_rate": 1.985185185185185e-07, "logits/chosen": 1.59765625, "logits/rejected": 0.88037109375, "logps/chosen": -38.5, "logps/rejected": -31.03125, "loss": 0.6567, "rewards/accuracies": 0.5, "rewards/chosen": 0.00042724609375, "rewards/margins": 0.1051025390625, "rewards/rejected": -0.10467529296875, "step": 1082 }, { "epoch": 0.8022222222222222, "grad_norm": 1.7420251369476318, "learning_rate": 1.9777777777777776e-07, "logits/chosen": 1.830078125, "logits/rejected": 1.970703125, "logps/chosen": -31.359375, "logps/rejected": -83.5625, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": 0.01055908203125, "rewards/margins": 0.322998046875, "rewards/rejected": -0.3125, "step": 1083 }, { "epoch": 0.802962962962963, "grad_norm": 1.768765926361084, "learning_rate": 1.9703703703703704e-07, "logits/chosen": 1.9912109375, "logits/rejected": 2.0546875, "logps/chosen": -32.375, "logps/rejected": -67.3125, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": -0.01580810546875, "rewards/margins": -0.041595458984375, "rewards/rejected": 0.025787353515625, "step": 1084 }, { "epoch": 0.8037037037037037, "grad_norm": 1.522062063217163, "learning_rate": 1.962962962962963e-07, "logits/chosen": 0.8603515625, "logits/rejected": 1.66015625, "logps/chosen": -48.71875, "logps/rejected": -41.40625, "loss": 0.5386, "rewards/accuracies": 0.75, "rewards/chosen": -0.046478271484375, "rewards/margins": 0.5341796875, "rewards/rejected": -0.58056640625, "step": 1085 }, { "epoch": 0.8044444444444444, "grad_norm": 8.052519798278809, "learning_rate": 1.9555555555555555e-07, "logits/chosen": 2.005859375, "logits/rejected": 2.013671875, "logps/chosen": -33.1875, "logps/rejected": -57.1875, "loss": 0.7476, "rewards/accuracies": 0.25, "rewards/chosen": -0.16064453125, "rewards/margins": -0.0782470703125, "rewards/rejected": -0.08233642578125, "step": 1086 }, { "epoch": 0.8051851851851852, "grad_norm": 1.9704253673553467, "learning_rate": 1.9481481481481478e-07, "logits/chosen": 1.8994140625, "logits/rejected": 1.828125, "logps/chosen": -27.1875, "logps/rejected": -53.40625, "loss": 0.7246, "rewards/accuracies": 0.25, "rewards/chosen": 0.016021728515625, "rewards/margins": -0.056640625, "rewards/rejected": 0.0726318359375, "step": 1087 }, { "epoch": 0.8059259259259259, "grad_norm": 1.8889256715774536, "learning_rate": 1.9407407407407406e-07, "logits/chosen": 1.783203125, "logits/rejected": 1.6201171875, "logps/chosen": -55.15625, "logps/rejected": -50.90625, "loss": 0.623, "rewards/accuracies": 0.5, "rewards/chosen": 0.32470703125, "rewards/margins": 0.186767578125, "rewards/rejected": 0.137939453125, "step": 1088 }, { "epoch": 0.8066666666666666, "grad_norm": 1.7614178657531738, "learning_rate": 1.9333333333333332e-07, "logits/chosen": 1.2109375, "logits/rejected": 1.3056640625, "logps/chosen": -48.21875, "logps/rejected": -39.6875, "loss": 0.666, "rewards/accuracies": 0.75, "rewards/chosen": -0.1304931640625, "rewards/margins": 0.0726318359375, "rewards/rejected": -0.203125, "step": 1089 }, { "epoch": 0.8074074074074075, "grad_norm": 1.5402708053588867, "learning_rate": 1.9259259259259257e-07, "logits/chosen": 1.427734375, "logits/rejected": 1.439453125, "logps/chosen": -23.015625, "logps/rejected": -47.03125, "loss": 0.6992, "rewards/accuracies": 0.25, "rewards/chosen": -0.035186767578125, "rewards/margins": 0.00035858154296875, "rewards/rejected": -0.035552978515625, "step": 1090 }, { "epoch": 0.8081481481481482, "grad_norm": 2.14095139503479, "learning_rate": 1.9185185185185186e-07, "logits/chosen": 1.2568359375, "logits/rejected": 1.0810546875, "logps/chosen": -37.15625, "logps/rejected": -35.03125, "loss": 0.7578, "rewards/accuracies": 0.25, "rewards/chosen": -0.05743408203125, "rewards/margins": -0.10040283203125, "rewards/rejected": 0.042999267578125, "step": 1091 }, { "epoch": 0.8088888888888889, "grad_norm": 1.9506940841674805, "learning_rate": 1.911111111111111e-07, "logits/chosen": 1.7138671875, "logits/rejected": 1.0673828125, "logps/chosen": -25.546875, "logps/rejected": -30.265625, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.0265655517578125, "rewards/margins": 0.1090087890625, "rewards/rejected": -0.0823974609375, "step": 1092 }, { "epoch": 0.8096296296296296, "grad_norm": 1.3547577857971191, "learning_rate": 1.9037037037037037e-07, "logits/chosen": 1.71875, "logits/rejected": 1.271484375, "logps/chosen": -22.703125, "logps/rejected": -28.71875, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.0293121337890625, "rewards/margins": 0.1136474609375, "rewards/rejected": -0.0843505859375, "step": 1093 }, { "epoch": 0.8103703703703704, "grad_norm": 1.7479428052902222, "learning_rate": 1.8962962962962962e-07, "logits/chosen": 2.158203125, "logits/rejected": 1.7978515625, "logps/chosen": -29.96875, "logps/rejected": -60.5, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": 0.11328125, "rewards/margins": 0.091796875, "rewards/rejected": 0.021514892578125, "step": 1094 }, { "epoch": 0.8111111111111111, "grad_norm": 2.4858548641204834, "learning_rate": 1.8888888888888888e-07, "logits/chosen": 1.3740234375, "logits/rejected": 2.05859375, "logps/chosen": -26.375, "logps/rejected": -54.71875, "loss": 1.3428, "rewards/accuracies": 0.5, "rewards/chosen": 0.056671142578125, "rewards/margins": -0.71142578125, "rewards/rejected": 0.76806640625, "step": 1095 }, { "epoch": 0.8118518518518518, "grad_norm": 2.283325433731079, "learning_rate": 1.8814814814814813e-07, "logits/chosen": 1.5947265625, "logits/rejected": 0.8740234375, "logps/chosen": -40.5625, "logps/rejected": -31.5625, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": -0.0765380859375, "rewards/margins": -0.041412353515625, "rewards/rejected": -0.03515625, "step": 1096 }, { "epoch": 0.8125925925925926, "grad_norm": 1.8546795845031738, "learning_rate": 1.8740740740740742e-07, "logits/chosen": 1.0341796875, "logits/rejected": 1.17578125, "logps/chosen": -45.875, "logps/rejected": -47.59375, "loss": 0.7695, "rewards/accuracies": 0.25, "rewards/chosen": 0.004261016845703125, "rewards/margins": -0.124267578125, "rewards/rejected": 0.1285400390625, "step": 1097 }, { "epoch": 0.8133333333333334, "grad_norm": 1.8924667835235596, "learning_rate": 1.8666666666666667e-07, "logits/chosen": 1.326171875, "logits/rejected": 1.9091796875, "logps/chosen": -24.328125, "logps/rejected": -68.0, "loss": 0.6689, "rewards/accuracies": 0.5, "rewards/chosen": 0.147216796875, "rewards/margins": 0.06207275390625, "rewards/rejected": 0.08514404296875, "step": 1098 }, { "epoch": 0.8140740740740741, "grad_norm": 3.0807056427001953, "learning_rate": 1.8592592592592593e-07, "logits/chosen": 2.1875, "logits/rejected": 1.2763671875, "logps/chosen": -39.34375, "logps/rejected": -25.15625, "loss": 0.6338, "rewards/accuracies": 0.75, "rewards/chosen": -0.052734375, "rewards/margins": 0.12420654296875, "rewards/rejected": -0.1768798828125, "step": 1099 }, { "epoch": 0.8148148148148148, "grad_norm": 1.8874224424362183, "learning_rate": 1.8518518518518516e-07, "logits/chosen": 1.5849609375, "logits/rejected": 1.6337890625, "logps/chosen": -25.9375, "logps/rejected": -25.421875, "loss": 0.6011, "rewards/accuracies": 0.75, "rewards/chosen": 0.1234130859375, "rewards/margins": 0.207763671875, "rewards/rejected": -0.0843505859375, "step": 1100 }, { "epoch": 0.8155555555555556, "grad_norm": 2.699521064758301, "learning_rate": 1.8444444444444444e-07, "logits/chosen": 1.748046875, "logits/rejected": 1.8134765625, "logps/chosen": -56.53125, "logps/rejected": -88.625, "loss": 0.4775, "rewards/accuracies": 1.0, "rewards/chosen": 0.408203125, "rewards/margins": 0.71142578125, "rewards/rejected": -0.30322265625, "step": 1101 }, { "epoch": 0.8162962962962963, "grad_norm": 2.072744131088257, "learning_rate": 1.837037037037037e-07, "logits/chosen": 2.259765625, "logits/rejected": 1.7138671875, "logps/chosen": -40.75, "logps/rejected": -47.9375, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.034759521484375, "rewards/margins": 0.0159912109375, "rewards/rejected": 0.018768310546875, "step": 1102 }, { "epoch": 0.817037037037037, "grad_norm": 5.528613090515137, "learning_rate": 1.8296296296296295e-07, "logits/chosen": 1.93359375, "logits/rejected": 1.853515625, "logps/chosen": -34.21875, "logps/rejected": -40.15625, "loss": 0.8394, "rewards/accuracies": 0.25, "rewards/chosen": -0.006633758544921875, "rewards/margins": -0.25, "rewards/rejected": 0.243408203125, "step": 1103 }, { "epoch": 0.8177777777777778, "grad_norm": 1.875883936882019, "learning_rate": 1.8222222222222223e-07, "logits/chosen": 1.802734375, "logits/rejected": 1.107421875, "logps/chosen": -45.40625, "logps/rejected": -34.75, "loss": 0.5635, "rewards/accuracies": 1.0, "rewards/chosen": 0.278076171875, "rewards/margins": 0.283935546875, "rewards/rejected": -0.005859375, "step": 1104 }, { "epoch": 0.8185185185185185, "grad_norm": 13.575528144836426, "learning_rate": 1.8148148148148149e-07, "logits/chosen": 2.83203125, "logits/rejected": 1.08984375, "logps/chosen": -39.46875, "logps/rejected": -32.28125, "loss": 1.2451, "rewards/accuracies": 0.5, "rewards/chosen": -0.73095703125, "rewards/margins": -0.677734375, "rewards/rejected": -0.053131103515625, "step": 1105 }, { "epoch": 0.8192592592592592, "grad_norm": 1.8909302949905396, "learning_rate": 1.8074074074074072e-07, "logits/chosen": 1.904296875, "logits/rejected": 2.001953125, "logps/chosen": -32.0, "logps/rejected": -54.1875, "loss": 0.6445, "rewards/accuracies": 0.5, "rewards/chosen": -0.053131103515625, "rewards/margins": 0.1365966796875, "rewards/rejected": -0.1898193359375, "step": 1106 }, { "epoch": 0.82, "grad_norm": 2.1693553924560547, "learning_rate": 1.8e-07, "logits/chosen": 2.140625, "logits/rejected": 1.646484375, "logps/chosen": -39.5625, "logps/rejected": -34.65625, "loss": 0.7964, "rewards/accuracies": 0.25, "rewards/chosen": -0.1507568359375, "rewards/margins": -0.1917724609375, "rewards/rejected": 0.041015625, "step": 1107 }, { "epoch": 0.8207407407407408, "grad_norm": 2.350558280944824, "learning_rate": 1.7925925925925925e-07, "logits/chosen": 1.08984375, "logits/rejected": 1.43359375, "logps/chosen": -45.53125, "logps/rejected": -46.78125, "loss": 0.7944, "rewards/accuracies": 0.5, "rewards/chosen": -0.10394287109375, "rewards/margins": -0.16650390625, "rewards/rejected": 0.0625, "step": 1108 }, { "epoch": 0.8214814814814815, "grad_norm": 1.5286400318145752, "learning_rate": 1.785185185185185e-07, "logits/chosen": 1.2626953125, "logits/rejected": 1.23046875, "logps/chosen": -34.3125, "logps/rejected": -32.4375, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.0124969482421875, "rewards/margins": -0.043365478515625, "rewards/rejected": 0.05584716796875, "step": 1109 }, { "epoch": 0.8222222222222222, "grad_norm": 1.5817409753799438, "learning_rate": 1.7777777777777776e-07, "logits/chosen": 1.439453125, "logits/rejected": 1.888671875, "logps/chosen": -28.875, "logps/rejected": -51.34375, "loss": 0.7158, "rewards/accuracies": 0.75, "rewards/chosen": -0.09228515625, "rewards/margins": -0.01104736328125, "rewards/rejected": -0.08123779296875, "step": 1110 }, { "epoch": 0.822962962962963, "grad_norm": 1.669926404953003, "learning_rate": 1.7703703703703705e-07, "logits/chosen": 1.2548828125, "logits/rejected": 1.75390625, "logps/chosen": -36.46875, "logps/rejected": -37.78125, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": 0.091796875, "rewards/margins": 0.00115966796875, "rewards/rejected": 0.09063720703125, "step": 1111 }, { "epoch": 0.8237037037037037, "grad_norm": 1.812578558921814, "learning_rate": 1.7629629629629627e-07, "logits/chosen": 2.771484375, "logits/rejected": 1.521484375, "logps/chosen": -29.78125, "logps/rejected": -52.40625, "loss": 0.6143, "rewards/accuracies": 1.0, "rewards/chosen": 0.035552978515625, "rewards/margins": 0.166748046875, "rewards/rejected": -0.1312255859375, "step": 1112 }, { "epoch": 0.8244444444444444, "grad_norm": 2.390624523162842, "learning_rate": 1.7555555555555553e-07, "logits/chosen": 1.384765625, "logits/rejected": 1.4921875, "logps/chosen": -25.3125, "logps/rejected": -37.6875, "loss": 0.7988, "rewards/accuracies": 0.25, "rewards/chosen": -0.177001953125, "rewards/margins": -0.165283203125, "rewards/rejected": -0.0117340087890625, "step": 1113 }, { "epoch": 0.8251851851851851, "grad_norm": 1.5191305875778198, "learning_rate": 1.748148148148148e-07, "logits/chosen": 1.2470703125, "logits/rejected": 1.6884765625, "logps/chosen": -29.8125, "logps/rejected": -54.90625, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": 0.018768310546875, "rewards/margins": 0.083251953125, "rewards/rejected": -0.06451416015625, "step": 1114 }, { "epoch": 0.825925925925926, "grad_norm": 2.424330234527588, "learning_rate": 1.7407407407407407e-07, "logits/chosen": 1.884765625, "logits/rejected": 1.7353515625, "logps/chosen": -41.25, "logps/rejected": -32.59375, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": -0.04998779296875, "rewards/margins": -0.014892578125, "rewards/rejected": -0.035125732421875, "step": 1115 }, { "epoch": 0.8266666666666667, "grad_norm": 2.8277781009674072, "learning_rate": 1.7333333333333332e-07, "logits/chosen": 1.6318359375, "logits/rejected": 1.63671875, "logps/chosen": -36.6875, "logps/rejected": -52.65625, "loss": 1.0566, "rewards/accuracies": 0.25, "rewards/chosen": -0.26953125, "rewards/margins": -0.546875, "rewards/rejected": 0.27734375, "step": 1116 }, { "epoch": 0.8274074074074074, "grad_norm": 1.8008240461349487, "learning_rate": 1.725925925925926e-07, "logits/chosen": 2.724609375, "logits/rejected": 2.0625, "logps/chosen": -30.0, "logps/rejected": -44.28125, "loss": 0.5605, "rewards/accuracies": 0.75, "rewards/chosen": -0.0300750732421875, "rewards/margins": 0.33349609375, "rewards/rejected": -0.363525390625, "step": 1117 }, { "epoch": 0.8281481481481482, "grad_norm": 3.543198823928833, "learning_rate": 1.7185185185185183e-07, "logits/chosen": 1.8037109375, "logits/rejected": 2.1328125, "logps/chosen": -27.25, "logps/rejected": -39.1875, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 0.47021484375, "rewards/margins": 0.74609375, "rewards/rejected": -0.27587890625, "step": 1118 }, { "epoch": 0.8288888888888889, "grad_norm": 1.9727433919906616, "learning_rate": 1.711111111111111e-07, "logits/chosen": 1.4326171875, "logits/rejected": 1.755859375, "logps/chosen": -27.59375, "logps/rejected": -55.59375, "loss": 0.8145, "rewards/accuracies": 0.25, "rewards/chosen": 0.0472412109375, "rewards/margins": -0.202392578125, "rewards/rejected": 0.2496337890625, "step": 1119 }, { "epoch": 0.8296296296296296, "grad_norm": 1.6687302589416504, "learning_rate": 1.7037037037037035e-07, "logits/chosen": 1.2734375, "logits/rejected": 1.783203125, "logps/chosen": -27.453125, "logps/rejected": -51.3125, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": -0.08087158203125, "rewards/margins": -0.1015625, "rewards/rejected": 0.0207061767578125, "step": 1120 }, { "epoch": 0.8303703703703704, "grad_norm": 1.9289741516113281, "learning_rate": 1.6962962962962963e-07, "logits/chosen": 2.0703125, "logits/rejected": 1.7958984375, "logps/chosen": -60.46875, "logps/rejected": -35.0, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": 0.06170654296875, "rewards/margins": 0.08941650390625, "rewards/rejected": -0.0277099609375, "step": 1121 }, { "epoch": 0.8311111111111111, "grad_norm": 2.0075247287750244, "learning_rate": 1.6888888888888888e-07, "logits/chosen": 1.48046875, "logits/rejected": 1.365234375, "logps/chosen": -33.96875, "logps/rejected": -35.5, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": -0.052734375, "rewards/margins": -0.00821685791015625, "rewards/rejected": -0.044525146484375, "step": 1122 }, { "epoch": 0.8318518518518518, "grad_norm": 1.6724261045455933, "learning_rate": 1.6814814814814814e-07, "logits/chosen": 1.158203125, "logits/rejected": 1.724609375, "logps/chosen": -31.625, "logps/rejected": -30.59375, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": -0.05035400390625, "rewards/margins": -0.0273590087890625, "rewards/rejected": -0.0230255126953125, "step": 1123 }, { "epoch": 0.8325925925925926, "grad_norm": 1.60440993309021, "learning_rate": 1.674074074074074e-07, "logits/chosen": 1.2900390625, "logits/rejected": 1.5966796875, "logps/chosen": -28.125, "logps/rejected": -30.625, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": -0.1708984375, "rewards/margins": -0.1002197265625, "rewards/rejected": -0.0706787109375, "step": 1124 }, { "epoch": 0.8333333333333334, "grad_norm": 1.5056993961334229, "learning_rate": 1.6666666666666665e-07, "logits/chosen": 1.2724609375, "logits/rejected": 1.498046875, "logps/chosen": -46.8125, "logps/rejected": -62.71875, "loss": 0.6094, "rewards/accuracies": 0.5, "rewards/chosen": 1.1044921875, "rewards/margins": 1.1376953125, "rewards/rejected": -0.033233642578125, "step": 1125 }, { "epoch": 0.8340740740740741, "grad_norm": 1.685770869255066, "learning_rate": 1.659259259259259e-07, "logits/chosen": 1.607421875, "logits/rejected": 1.38671875, "logps/chosen": -30.734375, "logps/rejected": -47.4375, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 0.035552978515625, "rewards/margins": -0.0257720947265625, "rewards/rejected": 0.061309814453125, "step": 1126 }, { "epoch": 0.8348148148148148, "grad_norm": 2.8336095809936523, "learning_rate": 1.651851851851852e-07, "logits/chosen": 1.23046875, "logits/rejected": 1.447265625, "logps/chosen": -29.046875, "logps/rejected": -65.375, "loss": 0.8408, "rewards/accuracies": 0.25, "rewards/chosen": -0.033203125, "rewards/margins": -0.258056640625, "rewards/rejected": 0.2249755859375, "step": 1127 }, { "epoch": 0.8355555555555556, "grad_norm": 1.5640534162521362, "learning_rate": 1.6444444444444444e-07, "logits/chosen": 0.53857421875, "logits/rejected": 0.8232421875, "logps/chosen": -33.875, "logps/rejected": -29.90625, "loss": 0.7559, "rewards/accuracies": 0.25, "rewards/chosen": 0.02105712890625, "rewards/margins": -0.11175537109375, "rewards/rejected": 0.1328125, "step": 1128 }, { "epoch": 0.8362962962962963, "grad_norm": 2.457181215286255, "learning_rate": 1.637037037037037e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.1220703125, "logps/chosen": -56.3125, "logps/rejected": -34.3125, "loss": 0.667, "rewards/accuracies": 0.5, "rewards/chosen": 0.2666015625, "rewards/margins": 0.1583251953125, "rewards/rejected": 0.108154296875, "step": 1129 }, { "epoch": 0.837037037037037, "grad_norm": 1.3275154829025269, "learning_rate": 1.6296296296296298e-07, "logits/chosen": 1.041015625, "logits/rejected": 1.607421875, "logps/chosen": -37.1875, "logps/rejected": -44.09375, "loss": 0.5493, "rewards/accuracies": 0.75, "rewards/chosen": 0.498291015625, "rewards/margins": 0.52685546875, "rewards/rejected": -0.02850341796875, "step": 1130 }, { "epoch": 0.8377777777777777, "grad_norm": 1.8947583436965942, "learning_rate": 1.622222222222222e-07, "logits/chosen": 1.7177734375, "logits/rejected": 1.5263671875, "logps/chosen": -30.84375, "logps/rejected": -38.53125, "loss": 0.7451, "rewards/accuracies": 0.25, "rewards/chosen": 0.041595458984375, "rewards/margins": -0.09906005859375, "rewards/rejected": 0.140625, "step": 1131 }, { "epoch": 0.8385185185185186, "grad_norm": 1.501983642578125, "learning_rate": 1.6148148148148147e-07, "logits/chosen": 1.25, "logits/rejected": 1.5615234375, "logps/chosen": -27.25, "logps/rejected": -31.265625, "loss": 0.7695, "rewards/accuracies": 0.25, "rewards/chosen": 0.044158935546875, "rewards/margins": -0.135498046875, "rewards/rejected": 0.1796875, "step": 1132 }, { "epoch": 0.8392592592592593, "grad_norm": 2.8450770378112793, "learning_rate": 1.6074074074074072e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.5458984375, "logps/chosen": -36.21875, "logps/rejected": -68.125, "loss": 0.6289, "rewards/accuracies": 1.0, "rewards/chosen": -0.0546875, "rewards/margins": 0.1328125, "rewards/rejected": -0.1875, "step": 1133 }, { "epoch": 0.84, "grad_norm": 2.012524127960205, "learning_rate": 1.6e-07, "logits/chosen": 1.5751953125, "logits/rejected": 1.81640625, "logps/chosen": -25.65625, "logps/rejected": -48.375, "loss": 0.5698, "rewards/accuracies": 0.5, "rewards/chosen": 0.52880859375, "rewards/margins": 0.427001953125, "rewards/rejected": 0.10198974609375, "step": 1134 }, { "epoch": 0.8407407407407408, "grad_norm": 3.8514792919158936, "learning_rate": 1.5925925925925926e-07, "logits/chosen": 1.2646484375, "logits/rejected": 0.68212890625, "logps/chosen": -28.984375, "logps/rejected": -74.4375, "loss": 0.6323, "rewards/accuracies": 0.75, "rewards/chosen": 0.09161376953125, "rewards/margins": 0.13134765625, "rewards/rejected": -0.03985595703125, "step": 1135 }, { "epoch": 0.8414814814814815, "grad_norm": 1.5397732257843018, "learning_rate": 1.5851851851851851e-07, "logits/chosen": 1.595703125, "logits/rejected": 1.9814453125, "logps/chosen": -41.53125, "logps/rejected": -33.125, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": -0.04022216796875, "rewards/margins": -0.03912353515625, "rewards/rejected": -0.00116729736328125, "step": 1136 }, { "epoch": 0.8422222222222222, "grad_norm": 4.246097564697266, "learning_rate": 1.5777777777777777e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.046875, "logps/chosen": -53.25, "logps/rejected": -76.25, "loss": 2.9688, "rewards/accuracies": 0.25, "rewards/chosen": -0.389892578125, "rewards/margins": -2.48046875, "rewards/rejected": 2.091796875, "step": 1137 }, { "epoch": 0.8429629629629629, "grad_norm": 2.1967856884002686, "learning_rate": 1.5703703703703703e-07, "logits/chosen": 1.34765625, "logits/rejected": 1.7939453125, "logps/chosen": -27.453125, "logps/rejected": -26.171875, "loss": 0.7505, "rewards/accuracies": 0.5, "rewards/chosen": 0.06402587890625, "rewards/margins": -0.068359375, "rewards/rejected": 0.1324462890625, "step": 1138 }, { "epoch": 0.8437037037037037, "grad_norm": 2.143775463104248, "learning_rate": 1.5629629629629628e-07, "logits/chosen": 1.6220703125, "logits/rejected": 1.8017578125, "logps/chosen": -36.1875, "logps/rejected": -56.65625, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.0511474609375, "rewards/margins": 0.0328369140625, "rewards/rejected": -0.083984375, "step": 1139 }, { "epoch": 0.8444444444444444, "grad_norm": 1.496329426765442, "learning_rate": 1.5555555555555556e-07, "logits/chosen": 1.392578125, "logits/rejected": 1.0732421875, "logps/chosen": -26.296875, "logps/rejected": -37.53125, "loss": 0.772, "rewards/accuracies": 0.25, "rewards/chosen": -0.0355224609375, "rewards/margins": -0.1484375, "rewards/rejected": 0.1129150390625, "step": 1140 }, { "epoch": 0.8451851851851852, "grad_norm": 1.9902766942977905, "learning_rate": 1.5481481481481482e-07, "logits/chosen": 1.75, "logits/rejected": 1.7392578125, "logps/chosen": -50.96875, "logps/rejected": -48.25, "loss": 0.604, "rewards/accuracies": 0.75, "rewards/chosen": 0.0718994140625, "rewards/margins": 0.1995849609375, "rewards/rejected": -0.127685546875, "step": 1141 }, { "epoch": 0.845925925925926, "grad_norm": 1.7072882652282715, "learning_rate": 1.5407407407407407e-07, "logits/chosen": 2.00390625, "logits/rejected": 2.486328125, "logps/chosen": -29.25, "logps/rejected": -36.84375, "loss": 0.7563, "rewards/accuracies": 0.5, "rewards/chosen": -0.019134521484375, "rewards/margins": -0.09918212890625, "rewards/rejected": 0.08013916015625, "step": 1142 }, { "epoch": 0.8466666666666667, "grad_norm": 3.5263824462890625, "learning_rate": 1.533333333333333e-07, "logits/chosen": 2.490234375, "logits/rejected": 1.3251953125, "logps/chosen": -46.0625, "logps/rejected": -68.6875, "loss": 0.9814, "rewards/accuracies": 0.0, "rewards/chosen": -0.30615234375, "rewards/margins": -0.50537109375, "rewards/rejected": 0.19921875, "step": 1143 }, { "epoch": 0.8474074074074074, "grad_norm": 2.7961583137512207, "learning_rate": 1.5259259259259259e-07, "logits/chosen": 1.2509765625, "logits/rejected": 1.4306640625, "logps/chosen": -32.3125, "logps/rejected": -42.40625, "loss": 0.8364, "rewards/accuracies": 0.25, "rewards/chosen": -0.198486328125, "rewards/margins": -0.227783203125, "rewards/rejected": 0.0293121337890625, "step": 1144 }, { "epoch": 0.8481481481481481, "grad_norm": 2.561253070831299, "learning_rate": 1.5185185185185184e-07, "logits/chosen": 1.2109375, "logits/rejected": 1.587890625, "logps/chosen": -28.15625, "logps/rejected": -62.5, "loss": 0.8008, "rewards/accuracies": 0.5, "rewards/chosen": 0.0246124267578125, "rewards/margins": -0.173095703125, "rewards/rejected": 0.19775390625, "step": 1145 }, { "epoch": 0.8488888888888889, "grad_norm": 4.168071269989014, "learning_rate": 1.511111111111111e-07, "logits/chosen": 1.662109375, "logits/rejected": 1.4833984375, "logps/chosen": -26.796875, "logps/rejected": -43.25, "loss": 0.8501, "rewards/accuracies": 0.5, "rewards/chosen": -0.0703125, "rewards/margins": -0.2333984375, "rewards/rejected": 0.1629638671875, "step": 1146 }, { "epoch": 0.8496296296296296, "grad_norm": 2.1559696197509766, "learning_rate": 1.5037037037037038e-07, "logits/chosen": 1.125, "logits/rejected": 1.228515625, "logps/chosen": -37.0625, "logps/rejected": -54.34375, "loss": 0.834, "rewards/accuracies": 0.5, "rewards/chosen": -0.11639404296875, "rewards/margins": -0.21240234375, "rewards/rejected": 0.09600830078125, "step": 1147 }, { "epoch": 0.8503703703703703, "grad_norm": 1.47141695022583, "learning_rate": 1.4962962962962963e-07, "logits/chosen": 1.53125, "logits/rejected": 1.3310546875, "logps/chosen": -34.34375, "logps/rejected": -32.03125, "loss": 0.6631, "rewards/accuracies": 0.5, "rewards/chosen": 0.134765625, "rewards/margins": 0.07421875, "rewards/rejected": 0.060516357421875, "step": 1148 }, { "epoch": 0.8511111111111112, "grad_norm": 2.349705696105957, "learning_rate": 1.4888888888888886e-07, "logits/chosen": 1.11328125, "logits/rejected": 1.693359375, "logps/chosen": -26.578125, "logps/rejected": -63.25, "loss": 0.6387, "rewards/accuracies": 0.5, "rewards/chosen": 0.17041015625, "rewards/margins": 0.139892578125, "rewards/rejected": 0.03045654296875, "step": 1149 }, { "epoch": 0.8518518518518519, "grad_norm": 2.1970629692077637, "learning_rate": 1.4814814814814815e-07, "logits/chosen": 1.3623046875, "logits/rejected": 1.294921875, "logps/chosen": -34.28125, "logps/rejected": -27.859375, "loss": 0.6406, "rewards/accuracies": 0.5, "rewards/chosen": 0.119140625, "rewards/margins": 0.1204833984375, "rewards/rejected": -0.00141143798828125, "step": 1150 }, { "epoch": 0.8525925925925926, "grad_norm": 1.5760210752487183, "learning_rate": 1.474074074074074e-07, "logits/chosen": 1.376953125, "logits/rejected": 1.017578125, "logps/chosen": -31.21875, "logps/rejected": -34.0, "loss": 0.8018, "rewards/accuracies": 0.25, "rewards/chosen": 0.037109375, "rewards/margins": -0.1903076171875, "rewards/rejected": 0.227294921875, "step": 1151 }, { "epoch": 0.8533333333333334, "grad_norm": 2.1017959117889404, "learning_rate": 1.4666666666666666e-07, "logits/chosen": 1.9296875, "logits/rejected": 1.57421875, "logps/chosen": -43.5625, "logps/rejected": -28.546875, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.0055084228515625, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.0007781982421875, "step": 1152 }, { "epoch": 0.8540740740740741, "grad_norm": 1.8537412881851196, "learning_rate": 1.459259259259259e-07, "logits/chosen": 1.8662109375, "logits/rejected": 1.7890625, "logps/chosen": -30.28125, "logps/rejected": -33.0, "loss": 0.6611, "rewards/accuracies": 0.5, "rewards/chosen": 0.00469970703125, "rewards/margins": 0.07623291015625, "rewards/rejected": -0.07147216796875, "step": 1153 }, { "epoch": 0.8548148148148148, "grad_norm": 1.5002022981643677, "learning_rate": 1.451851851851852e-07, "logits/chosen": 1.6943359375, "logits/rejected": 1.4892578125, "logps/chosen": -25.46875, "logps/rejected": -28.125, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": 0.0222625732421875, "rewards/margins": -0.05230712890625, "rewards/rejected": 0.0745849609375, "step": 1154 }, { "epoch": 0.8555555555555555, "grad_norm": 1.6637895107269287, "learning_rate": 1.4444444444444442e-07, "logits/chosen": 1.4697265625, "logits/rejected": 1.494140625, "logps/chosen": -47.125, "logps/rejected": -42.375, "loss": 0.5947, "rewards/accuracies": 0.75, "rewards/chosen": 0.1644287109375, "rewards/margins": 0.233642578125, "rewards/rejected": -0.06915283203125, "step": 1155 }, { "epoch": 0.8562962962962963, "grad_norm": 3.0815162658691406, "learning_rate": 1.4370370370370368e-07, "logits/chosen": 1.7158203125, "logits/rejected": 2.599609375, "logps/chosen": -34.125, "logps/rejected": -65.1875, "loss": 0.8652, "rewards/accuracies": 0.5, "rewards/chosen": 0.03515625, "rewards/margins": -0.25634765625, "rewards/rejected": 0.29150390625, "step": 1156 }, { "epoch": 0.857037037037037, "grad_norm": 3.9430837631225586, "learning_rate": 1.4296296296296296e-07, "logits/chosen": 1.8466796875, "logits/rejected": 1.9697265625, "logps/chosen": -46.40625, "logps/rejected": -50.875, "loss": 0.7861, "rewards/accuracies": 0.25, "rewards/chosen": -0.23046875, "rewards/margins": -0.1527099609375, "rewards/rejected": -0.0777587890625, "step": 1157 }, { "epoch": 0.8577777777777778, "grad_norm": 1.5960257053375244, "learning_rate": 1.4222222222222222e-07, "logits/chosen": 1.18359375, "logits/rejected": 1.4970703125, "logps/chosen": -27.6875, "logps/rejected": -35.8125, "loss": 0.7661, "rewards/accuracies": 0.5, "rewards/chosen": 0.1651611328125, "rewards/margins": -0.08160400390625, "rewards/rejected": 0.246826171875, "step": 1158 }, { "epoch": 0.8585185185185186, "grad_norm": 1.7705096006393433, "learning_rate": 1.4148148148148147e-07, "logits/chosen": 1.7646484375, "logits/rejected": 1.859375, "logps/chosen": -32.375, "logps/rejected": -37.03125, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": 0.00351715087890625, "rewards/margins": -0.041015625, "rewards/rejected": 0.0445556640625, "step": 1159 }, { "epoch": 0.8592592592592593, "grad_norm": 2.0160162448883057, "learning_rate": 1.4074074074074075e-07, "logits/chosen": 1.71484375, "logits/rejected": 1.0673828125, "logps/chosen": -36.375, "logps/rejected": -42.6875, "loss": 0.7026, "rewards/accuracies": 0.25, "rewards/chosen": -0.0171966552734375, "rewards/margins": -0.005462646484375, "rewards/rejected": -0.0117340087890625, "step": 1160 }, { "epoch": 0.86, "grad_norm": 1.960257887840271, "learning_rate": 1.4e-07, "logits/chosen": 1.7109375, "logits/rejected": 1.5458984375, "logps/chosen": -37.8125, "logps/rejected": -57.125, "loss": 0.7476, "rewards/accuracies": 0.25, "rewards/chosen": 0.0179901123046875, "rewards/margins": -0.09686279296875, "rewards/rejected": 0.1148681640625, "step": 1161 }, { "epoch": 0.8607407407407407, "grad_norm": 1.4101052284240723, "learning_rate": 1.3925925925925924e-07, "logits/chosen": 2.15234375, "logits/rejected": 1.310546875, "logps/chosen": -25.4375, "logps/rejected": -34.21875, "loss": 0.6841, "rewards/accuracies": 0.25, "rewards/chosen": -0.001178741455078125, "rewards/margins": 0.0261383056640625, "rewards/rejected": -0.02734375, "step": 1162 }, { "epoch": 0.8614814814814815, "grad_norm": 1.8213801383972168, "learning_rate": 1.385185185185185e-07, "logits/chosen": 1.5458984375, "logits/rejected": 1.6484375, "logps/chosen": -27.046875, "logps/rejected": -50.6875, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.017181396484375, "rewards/margins": 0.0164337158203125, "rewards/rejected": -0.033599853515625, "step": 1163 }, { "epoch": 0.8622222222222222, "grad_norm": 1.7692500352859497, "learning_rate": 1.3777777777777778e-07, "logits/chosen": 1.1650390625, "logits/rejected": 1.453125, "logps/chosen": -29.625, "logps/rejected": -37.28125, "loss": 0.6597, "rewards/accuracies": 0.75, "rewards/chosen": 0.078125, "rewards/margins": 0.06951904296875, "rewards/rejected": 0.0085906982421875, "step": 1164 }, { "epoch": 0.8629629629629629, "grad_norm": 2.029812812805176, "learning_rate": 1.3703703703703703e-07, "logits/chosen": 1.580078125, "logits/rejected": 1.6123046875, "logps/chosen": -30.09375, "logps/rejected": -38.21875, "loss": 0.8174, "rewards/accuracies": 0.5, "rewards/chosen": -0.147705078125, "rewards/margins": -0.20703125, "rewards/rejected": 0.059356689453125, "step": 1165 }, { "epoch": 0.8637037037037038, "grad_norm": 2.0848610401153564, "learning_rate": 1.362962962962963e-07, "logits/chosen": 1.5439453125, "logits/rejected": 2.109375, "logps/chosen": -43.5, "logps/rejected": -44.0, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.052337646484375, "rewards/margins": 0.0207061767578125, "rewards/rejected": 0.031646728515625, "step": 1166 }, { "epoch": 0.8644444444444445, "grad_norm": 1.692430853843689, "learning_rate": 1.3555555555555557e-07, "logits/chosen": 0.9873046875, "logits/rejected": 1.2724609375, "logps/chosen": -29.796875, "logps/rejected": -36.28125, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": 0.219970703125, "rewards/margins": 0.0262451171875, "rewards/rejected": 0.1937255859375, "step": 1167 }, { "epoch": 0.8651851851851852, "grad_norm": 1.1788252592086792, "learning_rate": 1.348148148148148e-07, "logits/chosen": 0.6591796875, "logits/rejected": 1.3671875, "logps/chosen": -54.8125, "logps/rejected": -65.875, "loss": 0.4495, "rewards/accuracies": 0.75, "rewards/chosen": 0.880859375, "rewards/margins": 0.837890625, "rewards/rejected": 0.0426025390625, "step": 1168 }, { "epoch": 0.8659259259259259, "grad_norm": 2.535634756088257, "learning_rate": 1.3407407407407405e-07, "logits/chosen": 1.8984375, "logits/rejected": 1.748046875, "logps/chosen": -31.578125, "logps/rejected": -43.4375, "loss": 0.8916, "rewards/accuracies": 0.0, "rewards/chosen": -0.24169921875, "rewards/margins": -0.35986328125, "rewards/rejected": 0.11834716796875, "step": 1169 }, { "epoch": 0.8666666666666667, "grad_norm": 2.4004762172698975, "learning_rate": 1.3333333333333334e-07, "logits/chosen": 1.306640625, "logits/rejected": 1.296875, "logps/chosen": -26.125, "logps/rejected": -37.4375, "loss": 0.6299, "rewards/accuracies": 1.0, "rewards/chosen": 0.053924560546875, "rewards/margins": 0.132080078125, "rewards/rejected": -0.078125, "step": 1170 }, { "epoch": 0.8674074074074074, "grad_norm": 1.2829416990280151, "learning_rate": 1.325925925925926e-07, "logits/chosen": 1.9599609375, "logits/rejected": 1.7763671875, "logps/chosen": -29.75, "logps/rejected": -34.03125, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": 0.0229949951171875, "rewards/margins": 0.14404296875, "rewards/rejected": -0.12109375, "step": 1171 }, { "epoch": 0.8681481481481481, "grad_norm": 2.983761787414551, "learning_rate": 1.3185185185185185e-07, "logits/chosen": 1.4736328125, "logits/rejected": 1.83984375, "logps/chosen": -33.40625, "logps/rejected": -51.8125, "loss": 1.2871, "rewards/accuracies": 0.0, "rewards/chosen": 0.1156005859375, "rewards/margins": -0.79736328125, "rewards/rejected": 0.9130859375, "step": 1172 }, { "epoch": 0.8688888888888889, "grad_norm": 2.6171982288360596, "learning_rate": 1.3111111111111113e-07, "logits/chosen": 1.630859375, "logits/rejected": 1.5546875, "logps/chosen": -33.71875, "logps/rejected": -87.5, "loss": 0.7578, "rewards/accuracies": 0.25, "rewards/chosen": -0.250732421875, "rewards/margins": -0.12200927734375, "rewards/rejected": -0.128662109375, "step": 1173 }, { "epoch": 0.8696296296296296, "grad_norm": 1.249174952507019, "learning_rate": 1.3037037037037036e-07, "logits/chosen": 1.7021484375, "logits/rejected": 1.3935546875, "logps/chosen": -33.625, "logps/rejected": -26.890625, "loss": 0.5928, "rewards/accuracies": 1.0, "rewards/chosen": 0.126220703125, "rewards/margins": 0.216552734375, "rewards/rejected": -0.0904541015625, "step": 1174 }, { "epoch": 0.8703703703703703, "grad_norm": 2.5749518871307373, "learning_rate": 1.2962962962962961e-07, "logits/chosen": 1.7939453125, "logits/rejected": 1.90625, "logps/chosen": -24.765625, "logps/rejected": -55.15625, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": -0.0113372802734375, "rewards/margins": 0.2227783203125, "rewards/rejected": -0.234130859375, "step": 1175 }, { "epoch": 0.8711111111111111, "grad_norm": 1.8035178184509277, "learning_rate": 1.2888888888888887e-07, "logits/chosen": 1.708984375, "logits/rejected": 1.548828125, "logps/chosen": -29.59375, "logps/rejected": -74.5, "loss": 0.6421, "rewards/accuracies": 0.75, "rewards/chosen": -0.03167724609375, "rewards/margins": 0.1343994140625, "rewards/rejected": -0.166015625, "step": 1176 }, { "epoch": 0.8718518518518519, "grad_norm": 1.3461555242538452, "learning_rate": 1.2814814814814815e-07, "logits/chosen": 1.716796875, "logits/rejected": 1.1376953125, "logps/chosen": -25.84375, "logps/rejected": -26.375, "loss": 0.6216, "rewards/accuracies": 0.75, "rewards/chosen": 0.1358642578125, "rewards/margins": 0.1585693359375, "rewards/rejected": -0.022674560546875, "step": 1177 }, { "epoch": 0.8725925925925926, "grad_norm": 1.9183024168014526, "learning_rate": 1.274074074074074e-07, "logits/chosen": 1.9814453125, "logits/rejected": 2.1953125, "logps/chosen": -24.5625, "logps/rejected": -39.0, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": 0.2276611328125, "rewards/margins": 0.208251953125, "rewards/rejected": 0.01953125, "step": 1178 }, { "epoch": 0.8733333333333333, "grad_norm": 1.4803504943847656, "learning_rate": 1.2666666666666666e-07, "logits/chosen": 1.5009765625, "logits/rejected": 1.46484375, "logps/chosen": -39.1875, "logps/rejected": -41.90625, "loss": 0.6436, "rewards/accuracies": 0.5, "rewards/chosen": 0.0060577392578125, "rewards/margins": 0.1146240234375, "rewards/rejected": -0.10858154296875, "step": 1179 }, { "epoch": 0.8740740740740741, "grad_norm": 16.776721954345703, "learning_rate": 1.2592592592592592e-07, "logits/chosen": 1.0234375, "logits/rejected": 1.0703125, "logps/chosen": -46.3125, "logps/rejected": -39.78125, "loss": 0.7314, "rewards/accuracies": 0.25, "rewards/chosen": -0.135986328125, "rewards/margins": -0.0660400390625, "rewards/rejected": -0.0699462890625, "step": 1180 }, { "epoch": 0.8748148148148148, "grad_norm": 2.0789942741394043, "learning_rate": 1.2518518518518517e-07, "logits/chosen": 1.060546875, "logits/rejected": 0.73095703125, "logps/chosen": -49.5625, "logps/rejected": -51.0625, "loss": 0.7607, "rewards/accuracies": 0.25, "rewards/chosen": -0.06640625, "rewards/margins": -0.11016845703125, "rewards/rejected": 0.04376220703125, "step": 1181 }, { "epoch": 0.8755555555555555, "grad_norm": 1.5289911031723022, "learning_rate": 1.2444444444444443e-07, "logits/chosen": 2.017578125, "logits/rejected": 2.38671875, "logps/chosen": -26.6875, "logps/rejected": -35.1875, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.05078125, "rewards/margins": 0.126953125, "rewards/rejected": -0.076171875, "step": 1182 }, { "epoch": 0.8762962962962964, "grad_norm": 1.976792812347412, "learning_rate": 1.237037037037037e-07, "logits/chosen": 1.3935546875, "logits/rejected": 1.6337890625, "logps/chosen": -43.0, "logps/rejected": -47.46875, "loss": 0.7988, "rewards/accuracies": 0.25, "rewards/chosen": -0.12384033203125, "rewards/margins": -0.160400390625, "rewards/rejected": 0.03668212890625, "step": 1183 }, { "epoch": 0.8770370370370371, "grad_norm": 1.4791673421859741, "learning_rate": 1.2296296296296297e-07, "logits/chosen": 1.7958984375, "logits/rejected": 1.9189453125, "logps/chosen": -44.15625, "logps/rejected": -21.09375, "loss": 0.6709, "rewards/accuracies": 0.75, "rewards/chosen": 0.11480712890625, "rewards/margins": 0.0556640625, "rewards/rejected": 0.059173583984375, "step": 1184 }, { "epoch": 0.8777777777777778, "grad_norm": 1.6169387102127075, "learning_rate": 1.2222222222222222e-07, "logits/chosen": 0.94091796875, "logits/rejected": 1.5869140625, "logps/chosen": -36.8125, "logps/rejected": -29.875, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": 0.1015625, "rewards/margins": 0.11328125, "rewards/rejected": -0.01171112060546875, "step": 1185 }, { "epoch": 0.8785185185185185, "grad_norm": 1.9453134536743164, "learning_rate": 1.2148148148148148e-07, "logits/chosen": 1.3349609375, "logits/rejected": 1.5107421875, "logps/chosen": -34.84375, "logps/rejected": -45.8125, "loss": 0.8057, "rewards/accuracies": 0.25, "rewards/chosen": -0.11053466796875, "rewards/margins": -0.2047119140625, "rewards/rejected": 0.0941162109375, "step": 1186 }, { "epoch": 0.8792592592592593, "grad_norm": 1.0326685905456543, "learning_rate": 1.2074074074074073e-07, "logits/chosen": 1.46875, "logits/rejected": 1.11328125, "logps/chosen": -20.15625, "logps/rejected": -21.625, "loss": 0.6602, "rewards/accuracies": 0.75, "rewards/chosen": 0.065673828125, "rewards/margins": 0.0699462890625, "rewards/rejected": -0.00428009033203125, "step": 1187 }, { "epoch": 0.88, "grad_norm": 2.3732845783233643, "learning_rate": 1.2e-07, "logits/chosen": 1.4375, "logits/rejected": 1.96484375, "logps/chosen": -27.828125, "logps/rejected": -45.96875, "loss": 0.5635, "rewards/accuracies": 0.75, "rewards/chosen": 0.11639404296875, "rewards/margins": 0.319580078125, "rewards/rejected": -0.203125, "step": 1188 }, { "epoch": 0.8807407407407407, "grad_norm": 2.216913938522339, "learning_rate": 1.1925925925925924e-07, "logits/chosen": 0.98828125, "logits/rejected": 1.525390625, "logps/chosen": -24.734375, "logps/rejected": -68.8125, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": 0.0046844482421875, "rewards/margins": 0.241455078125, "rewards/rejected": -0.2366943359375, "step": 1189 }, { "epoch": 0.8814814814814815, "grad_norm": 4.271926403045654, "learning_rate": 1.1851851851851851e-07, "logits/chosen": 1.466796875, "logits/rejected": 2.326171875, "logps/chosen": -24.640625, "logps/rejected": -79.5, "loss": 1.5693, "rewards/accuracies": 0.0, "rewards/chosen": 0.01346588134765625, "rewards/margins": -1.115234375, "rewards/rejected": 1.12890625, "step": 1190 }, { "epoch": 0.8822222222222222, "grad_norm": 1.9725685119628906, "learning_rate": 1.1777777777777777e-07, "logits/chosen": 1.5771484375, "logits/rejected": 2.083984375, "logps/chosen": -43.65625, "logps/rejected": -77.4375, "loss": 0.6074, "rewards/accuracies": 0.5, "rewards/chosen": 0.057830810546875, "rewards/margins": 0.40576171875, "rewards/rejected": -0.34765625, "step": 1191 }, { "epoch": 0.882962962962963, "grad_norm": 2.5998032093048096, "learning_rate": 1.1703703703703702e-07, "logits/chosen": 1.8251953125, "logits/rejected": 2.09375, "logps/chosen": -33.9375, "logps/rejected": -50.03125, "loss": 0.7749, "rewards/accuracies": 0.0, "rewards/chosen": -0.072265625, "rewards/margins": -0.1488037109375, "rewards/rejected": 0.0765380859375, "step": 1192 }, { "epoch": 0.8837037037037037, "grad_norm": 1.949986219406128, "learning_rate": 1.162962962962963e-07, "logits/chosen": 1.294921875, "logits/rejected": 1.0380859375, "logps/chosen": -28.671875, "logps/rejected": -33.4375, "loss": 0.5732, "rewards/accuracies": 0.5, "rewards/chosen": 0.385986328125, "rewards/margins": 0.330078125, "rewards/rejected": 0.05584716796875, "step": 1193 }, { "epoch": 0.8844444444444445, "grad_norm": 19.552040100097656, "learning_rate": 1.1555555555555555e-07, "logits/chosen": 2.03515625, "logits/rejected": 1.6279296875, "logps/chosen": -69.25, "logps/rejected": -61.84375, "loss": 1.7236, "rewards/accuracies": 0.5, "rewards/chosen": -1.3154296875, "rewards/margins": -1.150390625, "rewards/rejected": -0.165283203125, "step": 1194 }, { "epoch": 0.8851851851851852, "grad_norm": 1.329053282737732, "learning_rate": 1.148148148148148e-07, "logits/chosen": 1.6474609375, "logits/rejected": 1.828125, "logps/chosen": -28.640625, "logps/rejected": -30.578125, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": -0.01523590087890625, "rewards/margins": 0.05938720703125, "rewards/rejected": -0.0745849609375, "step": 1195 }, { "epoch": 0.8859259259259259, "grad_norm": 2.459321975708008, "learning_rate": 1.1407407407407407e-07, "logits/chosen": 2.044921875, "logits/rejected": 1.6591796875, "logps/chosen": -35.28125, "logps/rejected": -45.75, "loss": 0.7583, "rewards/accuracies": 0.5, "rewards/chosen": -0.06488037109375, "rewards/margins": -0.1038818359375, "rewards/rejected": 0.039031982421875, "step": 1196 }, { "epoch": 0.8866666666666667, "grad_norm": 1.5701358318328857, "learning_rate": 1.1333333333333332e-07, "logits/chosen": 1.39453125, "logits/rejected": 1.62109375, "logps/chosen": -28.203125, "logps/rejected": -50.25, "loss": 0.7583, "rewards/accuracies": 0.25, "rewards/chosen": -0.12030029296875, "rewards/margins": -0.11444091796875, "rewards/rejected": -0.00586700439453125, "step": 1197 }, { "epoch": 0.8874074074074074, "grad_norm": 2.2585721015930176, "learning_rate": 1.1259259259259258e-07, "logits/chosen": 1.26953125, "logits/rejected": 1.654296875, "logps/chosen": -32.75, "logps/rejected": -49.8125, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 0.07733154296875, "rewards/margins": 0.30224609375, "rewards/rejected": -0.2249755859375, "step": 1198 }, { "epoch": 0.8881481481481481, "grad_norm": 2.2284762859344482, "learning_rate": 1.1185185185185185e-07, "logits/chosen": 1.6025390625, "logits/rejected": 1.609375, "logps/chosen": -29.453125, "logps/rejected": -54.90625, "loss": 0.8408, "rewards/accuracies": 0.5, "rewards/chosen": 0.079833984375, "rewards/margins": -0.20654296875, "rewards/rejected": 0.286376953125, "step": 1199 }, { "epoch": 0.8888888888888888, "grad_norm": 1.7138595581054688, "learning_rate": 1.111111111111111e-07, "logits/chosen": 1.34375, "logits/rejected": 1.732421875, "logps/chosen": -42.875, "logps/rejected": -44.0, "loss": 0.73, "rewards/accuracies": 0.75, "rewards/chosen": -0.03948974609375, "rewards/margins": -0.06292724609375, "rewards/rejected": 0.0234527587890625, "step": 1200 }, { "epoch": 0.8896296296296297, "grad_norm": 1.5469387769699097, "learning_rate": 1.1037037037037036e-07, "logits/chosen": 1.138671875, "logits/rejected": 1.3564453125, "logps/chosen": -31.0625, "logps/rejected": -59.5, "loss": 0.583, "rewards/accuracies": 0.75, "rewards/chosen": 0.2047119140625, "rewards/margins": 0.258544921875, "rewards/rejected": -0.05389404296875, "step": 1201 }, { "epoch": 0.8903703703703704, "grad_norm": 1.703850269317627, "learning_rate": 1.0962962962962963e-07, "logits/chosen": 1.1298828125, "logits/rejected": 1.68359375, "logps/chosen": -35.96875, "logps/rejected": -39.375, "loss": 0.7607, "rewards/accuracies": 0.25, "rewards/chosen": -0.104736328125, "rewards/margins": -0.1278076171875, "rewards/rejected": 0.0230712890625, "step": 1202 }, { "epoch": 0.8911111111111111, "grad_norm": 3.2221179008483887, "learning_rate": 1.0888888888888888e-07, "logits/chosen": 1.994140625, "logits/rejected": 1.623046875, "logps/chosen": -45.125, "logps/rejected": -39.96875, "loss": 0.8623, "rewards/accuracies": 0.75, "rewards/chosen": -0.3857421875, "rewards/margins": -0.2296142578125, "rewards/rejected": -0.15625, "step": 1203 }, { "epoch": 0.8918518518518519, "grad_norm": 1.8718359470367432, "learning_rate": 1.0814814814814814e-07, "logits/chosen": 1.388671875, "logits/rejected": 1.6171875, "logps/chosen": -46.125, "logps/rejected": -41.15625, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.149658203125, "rewards/margins": 0.1676025390625, "rewards/rejected": -0.0179443359375, "step": 1204 }, { "epoch": 0.8925925925925926, "grad_norm": 1.747053623199463, "learning_rate": 1.074074074074074e-07, "logits/chosen": 1.4794921875, "logits/rejected": 1.2451171875, "logps/chosen": -41.4375, "logps/rejected": -25.734375, "loss": 0.6631, "rewards/accuracies": 0.75, "rewards/chosen": 0.03985595703125, "rewards/margins": 0.07147216796875, "rewards/rejected": -0.031646728515625, "step": 1205 }, { "epoch": 0.8933333333333333, "grad_norm": 4.0941057205200195, "learning_rate": 1.0666666666666667e-07, "logits/chosen": 2.263671875, "logits/rejected": 1.5048828125, "logps/chosen": -31.34375, "logps/rejected": -38.1875, "loss": 0.9824, "rewards/accuracies": 0.75, "rewards/chosen": -0.42626953125, "rewards/margins": -0.369873046875, "rewards/rejected": -0.056243896484375, "step": 1206 }, { "epoch": 0.894074074074074, "grad_norm": 4.00205659866333, "learning_rate": 1.0592592592592592e-07, "logits/chosen": 2.345703125, "logits/rejected": 1.7236328125, "logps/chosen": -34.0, "logps/rejected": -35.15625, "loss": 0.8926, "rewards/accuracies": 0.25, "rewards/chosen": -0.346923828125, "rewards/margins": -0.321044921875, "rewards/rejected": -0.025787353515625, "step": 1207 }, { "epoch": 0.8948148148148148, "grad_norm": 2.0077829360961914, "learning_rate": 1.0518518518518518e-07, "logits/chosen": 1.5107421875, "logits/rejected": 1.45703125, "logps/chosen": -48.34375, "logps/rejected": -27.40625, "loss": 0.5747, "rewards/accuracies": 0.5, "rewards/chosen": 0.1375732421875, "rewards/margins": 0.373779296875, "rewards/rejected": -0.236328125, "step": 1208 }, { "epoch": 0.8955555555555555, "grad_norm": 2.6821324825286865, "learning_rate": 1.0444444444444445e-07, "logits/chosen": 1.5029296875, "logits/rejected": 1.6904296875, "logps/chosen": -68.625, "logps/rejected": -57.625, "loss": 0.7983, "rewards/accuracies": 0.25, "rewards/chosen": -0.0784912109375, "rewards/margins": -0.1929931640625, "rewards/rejected": 0.114501953125, "step": 1209 }, { "epoch": 0.8962962962962963, "grad_norm": 1.9878628253936768, "learning_rate": 1.0370370370370369e-07, "logits/chosen": 1.5341796875, "logits/rejected": 1.3154296875, "logps/chosen": -25.203125, "logps/rejected": -29.46875, "loss": 0.5796, "rewards/accuracies": 0.75, "rewards/chosen": 0.0117340087890625, "rewards/margins": 0.25048828125, "rewards/rejected": -0.2386474609375, "step": 1210 }, { "epoch": 0.8970370370370371, "grad_norm": 1.2938557863235474, "learning_rate": 1.0296296296296296e-07, "logits/chosen": 1.599609375, "logits/rejected": 2.1953125, "logps/chosen": -35.90625, "logps/rejected": -41.3125, "loss": 0.5684, "rewards/accuracies": 1.0, "rewards/chosen": 0.09783935546875, "rewards/margins": 0.26806640625, "rewards/rejected": -0.170166015625, "step": 1211 }, { "epoch": 0.8977777777777778, "grad_norm": 1.3492412567138672, "learning_rate": 1.0222222222222223e-07, "logits/chosen": 1.9443359375, "logits/rejected": 1.9453125, "logps/chosen": -26.3125, "logps/rejected": -31.375, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": -0.04962158203125, "rewards/margins": 0.04022216796875, "rewards/rejected": -0.08984375, "step": 1212 }, { "epoch": 0.8985185185185185, "grad_norm": 1.6293039321899414, "learning_rate": 1.0148148148148147e-07, "logits/chosen": 0.7607421875, "logits/rejected": 1.5087890625, "logps/chosen": -24.21875, "logps/rejected": -54.125, "loss": 0.7129, "rewards/accuracies": 0.25, "rewards/chosen": -0.037109375, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.03399658203125, "step": 1213 }, { "epoch": 0.8992592592592593, "grad_norm": 2.284670829772949, "learning_rate": 1.0074074074074074e-07, "logits/chosen": 1.6083984375, "logits/rejected": 0.199462890625, "logps/chosen": -31.609375, "logps/rejected": -38.40625, "loss": 2.0234, "rewards/accuracies": 0.25, "rewards/chosen": -0.02423095703125, "rewards/margins": -1.5546875, "rewards/rejected": 1.53125, "step": 1214 }, { "epoch": 0.9, "grad_norm": 1.5993221998214722, "learning_rate": 1e-07, "logits/chosen": 1.0390625, "logits/rejected": 1.322265625, "logps/chosen": -28.671875, "logps/rejected": -32.65625, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": -0.061309814453125, "rewards/margins": 0.0019683837890625, "rewards/rejected": -0.06329345703125, "step": 1215 }, { "epoch": 0.9007407407407407, "grad_norm": 2.8866140842437744, "learning_rate": 9.925925925925925e-08, "logits/chosen": 1.990234375, "logits/rejected": 2.068359375, "logps/chosen": -49.28125, "logps/rejected": -61.25, "loss": 0.7261, "rewards/accuracies": 0.75, "rewards/chosen": 0.1312255859375, "rewards/margins": -0.01763916015625, "rewards/rejected": 0.14892578125, "step": 1216 }, { "epoch": 0.9014814814814814, "grad_norm": 2.1790285110473633, "learning_rate": 9.851851851851852e-08, "logits/chosen": 2.078125, "logits/rejected": 2.35546875, "logps/chosen": -34.59375, "logps/rejected": -38.8125, "loss": 0.8164, "rewards/accuracies": 0.25, "rewards/chosen": -0.110107421875, "rewards/margins": -0.2054443359375, "rewards/rejected": 0.0953369140625, "step": 1217 }, { "epoch": 0.9022222222222223, "grad_norm": 1.9990448951721191, "learning_rate": 9.777777777777778e-08, "logits/chosen": 1.3671875, "logits/rejected": 1.7421875, "logps/chosen": -20.390625, "logps/rejected": -56.28125, "loss": 0.6172, "rewards/accuracies": 0.75, "rewards/chosen": 0.03594970703125, "rewards/margins": 0.19091796875, "rewards/rejected": -0.15478515625, "step": 1218 }, { "epoch": 0.902962962962963, "grad_norm": 2.997955560684204, "learning_rate": 9.703703703703703e-08, "logits/chosen": 1.8232421875, "logits/rejected": 1.2333984375, "logps/chosen": -41.46875, "logps/rejected": -44.4375, "loss": 0.5903, "rewards/accuracies": 0.5, "rewards/chosen": 0.129638671875, "rewards/margins": 0.29541015625, "rewards/rejected": -0.1656494140625, "step": 1219 }, { "epoch": 0.9037037037037037, "grad_norm": 1.9291967153549194, "learning_rate": 9.629629629629629e-08, "logits/chosen": 2.0, "logits/rejected": 1.46484375, "logps/chosen": -25.5, "logps/rejected": -34.0, "loss": 0.7368, "rewards/accuracies": 0.5, "rewards/chosen": -0.182373046875, "rewards/margins": -0.071533203125, "rewards/rejected": -0.11090087890625, "step": 1220 }, { "epoch": 0.9044444444444445, "grad_norm": 2.103719472885132, "learning_rate": 9.555555555555556e-08, "logits/chosen": 1.11328125, "logits/rejected": 1.4912109375, "logps/chosen": -37.34375, "logps/rejected": -68.375, "loss": 0.729, "rewards/accuracies": 0.25, "rewards/chosen": -0.067626953125, "rewards/margins": -0.06842041015625, "rewards/rejected": 0.00081634521484375, "step": 1221 }, { "epoch": 0.9051851851851852, "grad_norm": 2.6408677101135254, "learning_rate": 9.481481481481481e-08, "logits/chosen": 1.9169921875, "logits/rejected": 1.6455078125, "logps/chosen": -27.8125, "logps/rejected": -66.875, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.08477783203125, "rewards/margins": 0.004364013671875, "rewards/rejected": 0.0804443359375, "step": 1222 }, { "epoch": 0.9059259259259259, "grad_norm": 1.6091299057006836, "learning_rate": 9.407407407407407e-08, "logits/chosen": 1.7412109375, "logits/rejected": 1.728515625, "logps/chosen": -37.03125, "logps/rejected": -69.0625, "loss": 0.4678, "rewards/accuracies": 0.75, "rewards/chosen": 0.2109375, "rewards/margins": 0.9814453125, "rewards/rejected": -0.77001953125, "step": 1223 }, { "epoch": 0.9066666666666666, "grad_norm": 1.4675836563110352, "learning_rate": 9.333333333333334e-08, "logits/chosen": 1.6416015625, "logits/rejected": 1.587890625, "logps/chosen": -26.5625, "logps/rejected": -35.75, "loss": 0.6978, "rewards/accuracies": 0.25, "rewards/chosen": 0.034393310546875, "rewards/margins": -0.0062255859375, "rewards/rejected": 0.040618896484375, "step": 1224 }, { "epoch": 0.9074074074074074, "grad_norm": 5.934322357177734, "learning_rate": 9.259259259259258e-08, "logits/chosen": 1.5732421875, "logits/rejected": 1.7080078125, "logps/chosen": -23.640625, "logps/rejected": -23.53125, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": 0.053558349609375, "rewards/margins": 0.1116943359375, "rewards/rejected": -0.058197021484375, "step": 1225 }, { "epoch": 0.9081481481481481, "grad_norm": 2.6510438919067383, "learning_rate": 9.185185185185185e-08, "logits/chosen": 1.8154296875, "logits/rejected": 1.5341796875, "logps/chosen": -39.25, "logps/rejected": -25.75, "loss": 0.6685, "rewards/accuracies": 0.75, "rewards/chosen": -0.045166015625, "rewards/margins": 0.132080078125, "rewards/rejected": -0.1771240234375, "step": 1226 }, { "epoch": 0.9088888888888889, "grad_norm": 1.6626832485198975, "learning_rate": 9.111111111111112e-08, "logits/chosen": 1.8916015625, "logits/rejected": 2.447265625, "logps/chosen": -30.4375, "logps/rejected": -48.5, "loss": 0.7393, "rewards/accuracies": 0.5, "rewards/chosen": 0.2061767578125, "rewards/margins": -0.04620361328125, "rewards/rejected": 0.25244140625, "step": 1227 }, { "epoch": 0.9096296296296297, "grad_norm": 1.6290862560272217, "learning_rate": 9.037037037037036e-08, "logits/chosen": 1.9990234375, "logits/rejected": 1.634765625, "logps/chosen": -26.90625, "logps/rejected": -53.28125, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.1331787109375, "rewards/margins": 0.1175537109375, "rewards/rejected": 0.01560211181640625, "step": 1228 }, { "epoch": 0.9103703703703704, "grad_norm": 2.135939598083496, "learning_rate": 8.962962962962963e-08, "logits/chosen": 1.095703125, "logits/rejected": 1.7587890625, "logps/chosen": -22.875, "logps/rejected": -52.0625, "loss": 0.6587, "rewards/accuracies": 0.75, "rewards/chosen": 0.0214691162109375, "rewards/margins": 0.09490966796875, "rewards/rejected": -0.07342529296875, "step": 1229 }, { "epoch": 0.9111111111111111, "grad_norm": 2.016975164413452, "learning_rate": 8.888888888888888e-08, "logits/chosen": 2.228515625, "logits/rejected": 2.265625, "logps/chosen": -37.28125, "logps/rejected": -51.0, "loss": 0.752, "rewards/accuracies": 0.25, "rewards/chosen": -0.139404296875, "rewards/margins": -0.112060546875, "rewards/rejected": -0.0273590087890625, "step": 1230 }, { "epoch": 0.9118518518518518, "grad_norm": 1.7169431447982788, "learning_rate": 8.814814814814814e-08, "logits/chosen": 1.4912109375, "logits/rejected": 1.6806640625, "logps/chosen": -25.734375, "logps/rejected": -69.5625, "loss": 0.7358, "rewards/accuracies": 0.5, "rewards/chosen": -0.0214691162109375, "rewards/margins": -0.07928466796875, "rewards/rejected": 0.057830810546875, "step": 1231 }, { "epoch": 0.9125925925925926, "grad_norm": 2.54238224029541, "learning_rate": 8.74074074074074e-08, "logits/chosen": 1.412109375, "logits/rejected": 2.13671875, "logps/chosen": -71.1875, "logps/rejected": -57.9375, "loss": 0.7368, "rewards/accuracies": 0.5, "rewards/chosen": -0.1534423828125, "rewards/margins": -0.0252685546875, "rewards/rejected": -0.128173828125, "step": 1232 }, { "epoch": 0.9133333333333333, "grad_norm": 2.9056813716888428, "learning_rate": 8.666666666666666e-08, "logits/chosen": 1.6025390625, "logits/rejected": 1.87890625, "logps/chosen": -44.875, "logps/rejected": -47.4375, "loss": 0.8364, "rewards/accuracies": 0.5, "rewards/chosen": -0.197265625, "rewards/margins": -0.21923828125, "rewards/rejected": 0.02191162109375, "step": 1233 }, { "epoch": 0.914074074074074, "grad_norm": 3.1883316040039062, "learning_rate": 8.592592592592592e-08, "logits/chosen": 1.7099609375, "logits/rejected": 2.15234375, "logps/chosen": -45.9375, "logps/rejected": -57.09375, "loss": 0.9658, "rewards/accuracies": 0.25, "rewards/chosen": -0.29345703125, "rewards/margins": -0.4423828125, "rewards/rejected": 0.149169921875, "step": 1234 }, { "epoch": 0.9148148148148149, "grad_norm": 1.6328847408294678, "learning_rate": 8.518518518518517e-08, "logits/chosen": 1.4541015625, "logits/rejected": 1.8515625, "logps/chosen": -27.890625, "logps/rejected": -42.40625, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.011322021484375, "rewards/margins": -0.0269622802734375, "rewards/rejected": 0.038330078125, "step": 1235 }, { "epoch": 0.9155555555555556, "grad_norm": 3.1205825805664062, "learning_rate": 8.444444444444444e-08, "logits/chosen": 1.091796875, "logits/rejected": 2.24609375, "logps/chosen": -47.03125, "logps/rejected": -45.9375, "loss": 0.6821, "rewards/accuracies": 0.5, "rewards/chosen": 0.0836181640625, "rewards/margins": 0.036956787109375, "rewards/rejected": 0.046661376953125, "step": 1236 }, { "epoch": 0.9162962962962963, "grad_norm": 2.5165507793426514, "learning_rate": 8.37037037037037e-08, "logits/chosen": 2.3046875, "logits/rejected": 1.810546875, "logps/chosen": -44.78125, "logps/rejected": -70.125, "loss": 0.79, "rewards/accuracies": 0.25, "rewards/chosen": -0.05621337890625, "rewards/margins": -0.170654296875, "rewards/rejected": 0.11444091796875, "step": 1237 }, { "epoch": 0.917037037037037, "grad_norm": 1.9520310163497925, "learning_rate": 8.296296296296295e-08, "logits/chosen": 1.373046875, "logits/rejected": 1.6435546875, "logps/chosen": -22.140625, "logps/rejected": -51.625, "loss": 0.6353, "rewards/accuracies": 0.5, "rewards/chosen": 0.04864501953125, "rewards/margins": 0.1396484375, "rewards/rejected": -0.0909423828125, "step": 1238 }, { "epoch": 0.9177777777777778, "grad_norm": 2.160733699798584, "learning_rate": 8.222222222222222e-08, "logits/chosen": 2.236328125, "logits/rejected": 2.001953125, "logps/chosen": -37.0, "logps/rejected": -45.0, "loss": 0.7012, "rewards/accuracies": 0.5, "rewards/chosen": 0.046478271484375, "rewards/margins": 0.013641357421875, "rewards/rejected": 0.032806396484375, "step": 1239 }, { "epoch": 0.9185185185185185, "grad_norm": 2.943401575088501, "learning_rate": 8.148148148148149e-08, "logits/chosen": 2.162109375, "logits/rejected": 2.083984375, "logps/chosen": -45.59375, "logps/rejected": -44.4375, "loss": 0.8164, "rewards/accuracies": 0.5, "rewards/chosen": -0.221435546875, "rewards/margins": -0.1844482421875, "rewards/rejected": -0.03704833984375, "step": 1240 }, { "epoch": 0.9192592592592592, "grad_norm": 1.4279967546463013, "learning_rate": 8.074074074074073e-08, "logits/chosen": 2.197265625, "logits/rejected": 1.2265625, "logps/chosen": -28.953125, "logps/rejected": -31.984375, "loss": 0.5952, "rewards/accuracies": 1.0, "rewards/chosen": 0.11248779296875, "rewards/margins": 0.210205078125, "rewards/rejected": -0.09765625, "step": 1241 }, { "epoch": 0.92, "grad_norm": 2.299074172973633, "learning_rate": 8e-08, "logits/chosen": 0.96630859375, "logits/rejected": 1.7626953125, "logps/chosen": -38.90625, "logps/rejected": -45.5, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": -0.09259033203125, "rewards/margins": 0.0445556640625, "rewards/rejected": -0.1370849609375, "step": 1242 }, { "epoch": 0.9207407407407407, "grad_norm": 1.2712347507476807, "learning_rate": 7.925925925925926e-08, "logits/chosen": 0.9267578125, "logits/rejected": 1.099609375, "logps/chosen": -33.5625, "logps/rejected": -20.875, "loss": 0.6567, "rewards/accuracies": 0.5, "rewards/chosen": 0.134033203125, "rewards/margins": 0.0833740234375, "rewards/rejected": 0.05059814453125, "step": 1243 }, { "epoch": 0.9214814814814815, "grad_norm": 1.6472797393798828, "learning_rate": 7.851851851851851e-08, "logits/chosen": 1.423828125, "logits/rejected": 1.630859375, "logps/chosen": -27.09375, "logps/rejected": -43.8125, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": 0.060577392578125, "rewards/margins": 0.14599609375, "rewards/rejected": -0.08538818359375, "step": 1244 }, { "epoch": 0.9222222222222223, "grad_norm": 2.184174060821533, "learning_rate": 7.777777777777778e-08, "logits/chosen": 1.4794921875, "logits/rejected": 2.23046875, "logps/chosen": -25.96875, "logps/rejected": -52.875, "loss": 0.7915, "rewards/accuracies": 0.0, "rewards/chosen": -0.10235595703125, "rewards/margins": -0.1800537109375, "rewards/rejected": 0.07769775390625, "step": 1245 }, { "epoch": 0.922962962962963, "grad_norm": 1.9824894666671753, "learning_rate": 7.703703703703704e-08, "logits/chosen": 2.041015625, "logits/rejected": 1.3076171875, "logps/chosen": -26.0625, "logps/rejected": -53.9375, "loss": 0.5869, "rewards/accuracies": 0.75, "rewards/chosen": 0.079345703125, "rewards/margins": 0.234375, "rewards/rejected": -0.155029296875, "step": 1246 }, { "epoch": 0.9237037037037037, "grad_norm": 1.394948959350586, "learning_rate": 7.629629629629629e-08, "logits/chosen": 1.619140625, "logits/rejected": 1.8466796875, "logps/chosen": -22.046875, "logps/rejected": -27.65625, "loss": 0.6108, "rewards/accuracies": 0.75, "rewards/chosen": 0.04644775390625, "rewards/margins": 0.18212890625, "rewards/rejected": -0.1357421875, "step": 1247 }, { "epoch": 0.9244444444444444, "grad_norm": 1.4856292009353638, "learning_rate": 7.555555555555555e-08, "logits/chosen": 2.10546875, "logits/rejected": 1.2392578125, "logps/chosen": -23.09375, "logps/rejected": -28.625, "loss": 0.6836, "rewards/accuracies": 0.5, "rewards/chosen": -0.0207061767578125, "rewards/margins": 0.0269775390625, "rewards/rejected": -0.04766845703125, "step": 1248 }, { "epoch": 0.9251851851851852, "grad_norm": 1.703323483467102, "learning_rate": 7.481481481481482e-08, "logits/chosen": 1.6533203125, "logits/rejected": 1.7294921875, "logps/chosen": -24.625, "logps/rejected": -38.0, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": 0.0191497802734375, "rewards/margins": -0.0413818359375, "rewards/rejected": 0.060546875, "step": 1249 }, { "epoch": 0.9259259259259259, "grad_norm": 1.4671030044555664, "learning_rate": 7.407407407407407e-08, "logits/chosen": 1.140625, "logits/rejected": 1.3779296875, "logps/chosen": -29.9375, "logps/rejected": -52.875, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": 0.1468505859375, "rewards/margins": 0.317138671875, "rewards/rejected": -0.17041015625, "step": 1250 }, { "epoch": 0.9266666666666666, "grad_norm": 1.9548602104187012, "learning_rate": 7.333333333333333e-08, "logits/chosen": 1.521484375, "logits/rejected": 1.6142578125, "logps/chosen": -50.03125, "logps/rejected": -38.5, "loss": 0.6973, "rewards/accuracies": 0.5, "rewards/chosen": 0.050811767578125, "rewards/margins": -0.00152587890625, "rewards/rejected": 0.052337646484375, "step": 1251 }, { "epoch": 0.9274074074074075, "grad_norm": 1.5840626955032349, "learning_rate": 7.25925925925926e-08, "logits/chosen": 1.2998046875, "logits/rejected": 1.375, "logps/chosen": -30.71875, "logps/rejected": -38.0625, "loss": 0.5537, "rewards/accuracies": 0.75, "rewards/chosen": 0.1761474609375, "rewards/margins": 0.335693359375, "rewards/rejected": -0.159423828125, "step": 1252 }, { "epoch": 0.9281481481481482, "grad_norm": 1.0514678955078125, "learning_rate": 7.185185185185184e-08, "logits/chosen": 1.3583984375, "logits/rejected": 1.0048828125, "logps/chosen": -24.421875, "logps/rejected": -21.4375, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": 0.137451171875, "rewards/margins": 0.1058349609375, "rewards/rejected": 0.031646728515625, "step": 1253 }, { "epoch": 0.9288888888888889, "grad_norm": 4.178462028503418, "learning_rate": 7.111111111111111e-08, "logits/chosen": 1.9443359375, "logits/rejected": 1.7021484375, "logps/chosen": -28.4375, "logps/rejected": -98.0625, "loss": 0.8184, "rewards/accuracies": 0.25, "rewards/chosen": 0.16064453125, "rewards/margins": -0.031494140625, "rewards/rejected": 0.192138671875, "step": 1254 }, { "epoch": 0.9296296296296296, "grad_norm": 2.0782482624053955, "learning_rate": 7.037037037037038e-08, "logits/chosen": 1.7607421875, "logits/rejected": 1.1748046875, "logps/chosen": -47.78125, "logps/rejected": -67.1875, "loss": 0.5918, "rewards/accuracies": 1.0, "rewards/chosen": 0.06719970703125, "rewards/margins": 0.21484375, "rewards/rejected": -0.147705078125, "step": 1255 }, { "epoch": 0.9303703703703704, "grad_norm": 2.2950406074523926, "learning_rate": 6.962962962962962e-08, "logits/chosen": 1.68359375, "logits/rejected": 1.57421875, "logps/chosen": -45.28125, "logps/rejected": -84.75, "loss": 0.7217, "rewards/accuracies": 0.25, "rewards/chosen": 0.005462646484375, "rewards/margins": -0.024627685546875, "rewards/rejected": 0.030059814453125, "step": 1256 }, { "epoch": 0.9311111111111111, "grad_norm": 2.52500057220459, "learning_rate": 6.888888888888889e-08, "logits/chosen": 1.484375, "logits/rejected": 1.734375, "logps/chosen": -27.546875, "logps/rejected": -62.90625, "loss": 0.709, "rewards/accuracies": 0.25, "rewards/chosen": 0.0042724609375, "rewards/margins": -0.0168914794921875, "rewards/rejected": 0.0211029052734375, "step": 1257 }, { "epoch": 0.9318518518518518, "grad_norm": 1.3138399124145508, "learning_rate": 6.814814814814814e-08, "logits/chosen": 1.712890625, "logits/rejected": 1.408203125, "logps/chosen": -29.40625, "logps/rejected": -47.8125, "loss": 0.5698, "rewards/accuracies": 0.75, "rewards/chosen": 0.1922607421875, "rewards/margins": 0.345703125, "rewards/rejected": -0.153564453125, "step": 1258 }, { "epoch": 0.9325925925925926, "grad_norm": 1.9169514179229736, "learning_rate": 6.74074074074074e-08, "logits/chosen": 1.474609375, "logits/rejected": 1.076171875, "logps/chosen": -30.5625, "logps/rejected": -30.09375, "loss": 0.833, "rewards/accuracies": 0.25, "rewards/chosen": -0.155517578125, "rewards/margins": -0.25390625, "rewards/rejected": 0.09844970703125, "step": 1259 }, { "epoch": 0.9333333333333333, "grad_norm": 3.9665136337280273, "learning_rate": 6.666666666666667e-08, "logits/chosen": 1.94140625, "logits/rejected": 2.501953125, "logps/chosen": -49.40625, "logps/rejected": -62.21875, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": -0.1605224609375, "rewards/margins": 0.04571533203125, "rewards/rejected": -0.2061767578125, "step": 1260 }, { "epoch": 0.9340740740740741, "grad_norm": 1.6072123050689697, "learning_rate": 6.592592592592592e-08, "logits/chosen": 1.443359375, "logits/rejected": 1.9619140625, "logps/chosen": -27.96875, "logps/rejected": -28.59375, "loss": 0.6416, "rewards/accuracies": 0.75, "rewards/chosen": -0.0037097930908203125, "rewards/margins": 0.1087646484375, "rewards/rejected": -0.11248779296875, "step": 1261 }, { "epoch": 0.9348148148148148, "grad_norm": 2.9702577590942383, "learning_rate": 6.518518518518518e-08, "logits/chosen": 1.5751953125, "logits/rejected": 2.154296875, "logps/chosen": -62.0, "logps/rejected": -53.0625, "loss": 0.5396, "rewards/accuracies": 0.75, "rewards/chosen": 0.4169921875, "rewards/margins": 0.391357421875, "rewards/rejected": 0.0253753662109375, "step": 1262 }, { "epoch": 0.9355555555555556, "grad_norm": 2.3536429405212402, "learning_rate": 6.444444444444443e-08, "logits/chosen": 1.677734375, "logits/rejected": 1.619140625, "logps/chosen": -42.875, "logps/rejected": -34.96875, "loss": 0.6431, "rewards/accuracies": 0.5, "rewards/chosen": -0.0386962890625, "rewards/margins": 0.11712646484375, "rewards/rejected": -0.15576171875, "step": 1263 }, { "epoch": 0.9362962962962963, "grad_norm": 1.9070905447006226, "learning_rate": 6.37037037037037e-08, "logits/chosen": 1.064453125, "logits/rejected": 1.5576171875, "logps/chosen": -27.875, "logps/rejected": -31.125, "loss": 0.7563, "rewards/accuracies": 0.25, "rewards/chosen": -0.012481689453125, "rewards/margins": -0.114013671875, "rewards/rejected": 0.1015625, "step": 1264 }, { "epoch": 0.937037037037037, "grad_norm": 2.184288501739502, "learning_rate": 6.296296296296296e-08, "logits/chosen": 1.0205078125, "logits/rejected": 1.958984375, "logps/chosen": -48.3125, "logps/rejected": -58.46875, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": -0.03790283203125, "rewards/margins": -0.0406494140625, "rewards/rejected": 0.0027313232421875, "step": 1265 }, { "epoch": 0.9377777777777778, "grad_norm": 1.5807157754898071, "learning_rate": 6.222222222222221e-08, "logits/chosen": 1.6650390625, "logits/rejected": 1.708984375, "logps/chosen": -23.4375, "logps/rejected": -31.34375, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.02618408203125, "rewards/margins": 0.0640869140625, "rewards/rejected": -0.03790283203125, "step": 1266 }, { "epoch": 0.9385185185185185, "grad_norm": 2.153454303741455, "learning_rate": 6.148148148148148e-08, "logits/chosen": 1.0615234375, "logits/rejected": 2.02734375, "logps/chosen": -26.75, "logps/rejected": -55.46875, "loss": 0.7627, "rewards/accuracies": 0.25, "rewards/chosen": 0.00823974609375, "rewards/margins": -0.12890625, "rewards/rejected": 0.1370849609375, "step": 1267 }, { "epoch": 0.9392592592592592, "grad_norm": 1.811307668685913, "learning_rate": 6.074074074074074e-08, "logits/chosen": 1.0390625, "logits/rejected": 1.7373046875, "logps/chosen": -28.359375, "logps/rejected": -48.4375, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": 0.007022857666015625, "rewards/margins": 0.02655029296875, "rewards/rejected": -0.0195465087890625, "step": 1268 }, { "epoch": 0.94, "grad_norm": 1.6228883266448975, "learning_rate": 6e-08, "logits/chosen": 1.580078125, "logits/rejected": 1.8095703125, "logps/chosen": -32.3125, "logps/rejected": -50.6875, "loss": 0.5688, "rewards/accuracies": 0.5, "rewards/chosen": 0.031982421875, "rewards/margins": 0.403076171875, "rewards/rejected": -0.37109375, "step": 1269 }, { "epoch": 0.9407407407407408, "grad_norm": 3.232849359512329, "learning_rate": 5.925925925925926e-08, "logits/chosen": 1.8828125, "logits/rejected": 1.0712890625, "logps/chosen": -35.3125, "logps/rejected": -54.46875, "loss": 0.6772, "rewards/accuracies": 0.5, "rewards/chosen": 0.134765625, "rewards/margins": 0.059722900390625, "rewards/rejected": 0.07501220703125, "step": 1270 }, { "epoch": 0.9414814814814815, "grad_norm": 1.8290431499481201, "learning_rate": 5.851851851851851e-08, "logits/chosen": 1.15625, "logits/rejected": 1.30859375, "logps/chosen": -24.140625, "logps/rejected": -48.90625, "loss": 0.6616, "rewards/accuracies": 0.5, "rewards/chosen": 0.08770751953125, "rewards/margins": 0.089599609375, "rewards/rejected": -0.001953125, "step": 1271 }, { "epoch": 0.9422222222222222, "grad_norm": 1.9044137001037598, "learning_rate": 5.7777777777777775e-08, "logits/chosen": 1.771484375, "logits/rejected": 1.298828125, "logps/chosen": -23.375, "logps/rejected": -70.0625, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.0562744140625, "rewards/margins": 0.155517578125, "rewards/rejected": -0.0992431640625, "step": 1272 }, { "epoch": 0.942962962962963, "grad_norm": 2.3848876953125, "learning_rate": 5.703703703703704e-08, "logits/chosen": 1.2177734375, "logits/rejected": 1.771484375, "logps/chosen": -47.59375, "logps/rejected": -87.125, "loss": 0.6523, "rewards/accuracies": 0.5, "rewards/chosen": 0.0367431640625, "rewards/margins": 0.1055908203125, "rewards/rejected": -0.06884765625, "step": 1273 }, { "epoch": 0.9437037037037037, "grad_norm": 2.453953266143799, "learning_rate": 5.629629629629629e-08, "logits/chosen": 1.4130859375, "logits/rejected": 1.9248046875, "logps/chosen": -26.65625, "logps/rejected": -45.34375, "loss": 0.606, "rewards/accuracies": 1.0, "rewards/chosen": 0.09710693359375, "rewards/margins": 0.183837890625, "rewards/rejected": -0.08673095703125, "step": 1274 }, { "epoch": 0.9444444444444444, "grad_norm": 2.26716685295105, "learning_rate": 5.555555555555555e-08, "logits/chosen": 2.15625, "logits/rejected": 1.7958984375, "logps/chosen": -34.46875, "logps/rejected": -53.65625, "loss": 0.8154, "rewards/accuracies": 0.0, "rewards/chosen": -0.201171875, "rewards/margins": -0.2261962890625, "rewards/rejected": 0.024993896484375, "step": 1275 }, { "epoch": 0.9451851851851852, "grad_norm": 1.7311047315597534, "learning_rate": 5.481481481481482e-08, "logits/chosen": 1.353515625, "logits/rejected": 1.4150390625, "logps/chosen": -29.234375, "logps/rejected": -46.5625, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.0960693359375, "rewards/margins": 0.12225341796875, "rewards/rejected": -0.02618408203125, "step": 1276 }, { "epoch": 0.945925925925926, "grad_norm": 3.68677020072937, "learning_rate": 5.407407407407407e-08, "logits/chosen": 1.1689453125, "logits/rejected": 1.9794921875, "logps/chosen": -68.25, "logps/rejected": -55.75, "loss": 0.8564, "rewards/accuracies": 0.5, "rewards/chosen": -0.343994140625, "rewards/margins": -0.268310546875, "rewards/rejected": -0.0758056640625, "step": 1277 }, { "epoch": 0.9466666666666667, "grad_norm": 2.8666152954101562, "learning_rate": 5.3333333333333334e-08, "logits/chosen": 1.48046875, "logits/rejected": 1.5810546875, "logps/chosen": -32.9375, "logps/rejected": -55.0625, "loss": 1.499, "rewards/accuracies": 0.5, "rewards/chosen": 0.048431396484375, "rewards/margins": -0.9833984375, "rewards/rejected": 1.03125, "step": 1278 }, { "epoch": 0.9474074074074074, "grad_norm": 1.407652735710144, "learning_rate": 5.259259259259259e-08, "logits/chosen": 1.818359375, "logits/rejected": 1.662109375, "logps/chosen": -22.40625, "logps/rejected": -27.578125, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.007049560546875, "rewards/margins": 0.017181396484375, "rewards/rejected": -0.02423095703125, "step": 1279 }, { "epoch": 0.9481481481481482, "grad_norm": 2.017878532409668, "learning_rate": 5.1851851851851846e-08, "logits/chosen": 1.55859375, "logits/rejected": 1.5166015625, "logps/chosen": -25.53125, "logps/rejected": -56.09375, "loss": 0.6602, "rewards/accuracies": 0.5, "rewards/chosen": 0.041015625, "rewards/margins": 0.072265625, "rewards/rejected": -0.031219482421875, "step": 1280 }, { "epoch": 0.9488888888888889, "grad_norm": 3.1725783348083496, "learning_rate": 5.1111111111111114e-08, "logits/chosen": 1.48828125, "logits/rejected": 1.6064453125, "logps/chosen": -33.40625, "logps/rejected": -59.9375, "loss": 0.6602, "rewards/accuracies": 0.75, "rewards/chosen": -0.064453125, "rewards/margins": 0.082763671875, "rewards/rejected": -0.147216796875, "step": 1281 }, { "epoch": 0.9496296296296296, "grad_norm": 1.5282577276229858, "learning_rate": 5.037037037037037e-08, "logits/chosen": 1.1728515625, "logits/rejected": 0.75146484375, "logps/chosen": -24.0625, "logps/rejected": -17.390625, "loss": 0.6382, "rewards/accuracies": 0.75, "rewards/chosen": 0.018768310546875, "rewards/margins": 0.126220703125, "rewards/rejected": -0.107421875, "step": 1282 }, { "epoch": 0.9503703703703704, "grad_norm": 2.0128278732299805, "learning_rate": 4.9629629629629626e-08, "logits/chosen": 1.7421875, "logits/rejected": 1.3896484375, "logps/chosen": -40.65625, "logps/rejected": -50.375, "loss": 0.6572, "rewards/accuracies": 0.75, "rewards/chosen": 0.09759521484375, "rewards/margins": 0.139892578125, "rewards/rejected": -0.042205810546875, "step": 1283 }, { "epoch": 0.9511111111111111, "grad_norm": 1.8344544172286987, "learning_rate": 4.888888888888889e-08, "logits/chosen": 1.5419921875, "logits/rejected": 1.55859375, "logps/chosen": -23.890625, "logps/rejected": -35.9375, "loss": 0.7651, "rewards/accuracies": 0.5, "rewards/chosen": -0.0136871337890625, "rewards/margins": -0.107421875, "rewards/rejected": 0.09375, "step": 1284 }, { "epoch": 0.9518518518518518, "grad_norm": 1.4429408311843872, "learning_rate": 4.814814814814814e-08, "logits/chosen": 1.46875, "logits/rejected": 0.7939453125, "logps/chosen": -46.40625, "logps/rejected": -23.703125, "loss": 0.6069, "rewards/accuracies": 0.5, "rewards/chosen": 0.175048828125, "rewards/margins": 0.2188720703125, "rewards/rejected": -0.043731689453125, "step": 1285 }, { "epoch": 0.9525925925925925, "grad_norm": 1.0586721897125244, "learning_rate": 4.7407407407407405e-08, "logits/chosen": 1.51171875, "logits/rejected": 1.5908203125, "logps/chosen": -24.484375, "logps/rejected": -24.25, "loss": 0.6475, "rewards/accuracies": 0.5, "rewards/chosen": 0.1953125, "rewards/margins": 0.1107177734375, "rewards/rejected": 0.0845947265625, "step": 1286 }, { "epoch": 0.9533333333333334, "grad_norm": 2.522719621658325, "learning_rate": 4.666666666666667e-08, "logits/chosen": 1.5751953125, "logits/rejected": 1.2392578125, "logps/chosen": -30.484375, "logps/rejected": -32.5, "loss": 0.7837, "rewards/accuracies": 0.0, "rewards/chosen": -0.270751953125, "rewards/margins": -0.1676025390625, "rewards/rejected": -0.1031494140625, "step": 1287 }, { "epoch": 0.9540740740740741, "grad_norm": 1.1103898286819458, "learning_rate": 4.592592592592592e-08, "logits/chosen": 1.28125, "logits/rejected": 1.5283203125, "logps/chosen": -22.796875, "logps/rejected": -23.34375, "loss": 0.625, "rewards/accuracies": 1.0, "rewards/chosen": 0.0855712890625, "rewards/margins": 0.1456298828125, "rewards/rejected": -0.060150146484375, "step": 1288 }, { "epoch": 0.9548148148148148, "grad_norm": 2.6189584732055664, "learning_rate": 4.518518518518518e-08, "logits/chosen": 1.5556640625, "logits/rejected": 1.666015625, "logps/chosen": -23.390625, "logps/rejected": -59.125, "loss": 0.7119, "rewards/accuracies": 0.75, "rewards/chosen": -0.044189453125, "rewards/margins": -0.01220703125, "rewards/rejected": -0.03204345703125, "step": 1289 }, { "epoch": 0.9555555555555556, "grad_norm": 2.160517692565918, "learning_rate": 4.444444444444444e-08, "logits/chosen": 1.7734375, "logits/rejected": 1.998046875, "logps/chosen": -24.765625, "logps/rejected": -59.5, "loss": 0.8516, "rewards/accuracies": 0.0, "rewards/chosen": -0.173828125, "rewards/margins": -0.2861328125, "rewards/rejected": 0.1123046875, "step": 1290 }, { "epoch": 0.9562962962962963, "grad_norm": 1.8564354181289673, "learning_rate": 4.37037037037037e-08, "logits/chosen": 1.3798828125, "logits/rejected": 1.5048828125, "logps/chosen": -38.4375, "logps/rejected": -34.4375, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": -0.154296875, "rewards/margins": -0.102294921875, "rewards/rejected": -0.051971435546875, "step": 1291 }, { "epoch": 0.957037037037037, "grad_norm": 1.9100217819213867, "learning_rate": 4.296296296296296e-08, "logits/chosen": 1.548828125, "logits/rejected": 1.369140625, "logps/chosen": -26.34375, "logps/rejected": -44.96875, "loss": 0.604, "rewards/accuracies": 0.75, "rewards/chosen": -0.048065185546875, "rewards/margins": 0.1905517578125, "rewards/rejected": -0.2386474609375, "step": 1292 }, { "epoch": 0.9577777777777777, "grad_norm": 2.8836724758148193, "learning_rate": 4.222222222222222e-08, "logits/chosen": 1.75, "logits/rejected": 1.6083984375, "logps/chosen": -53.5625, "logps/rejected": -40.4375, "loss": 0.7549, "rewards/accuracies": 0.25, "rewards/chosen": -0.0960693359375, "rewards/margins": -0.1019287109375, "rewards/rejected": 0.005859375, "step": 1293 }, { "epoch": 0.9585185185185185, "grad_norm": 1.0936033725738525, "learning_rate": 4.1481481481481476e-08, "logits/chosen": 1.4013671875, "logits/rejected": 1.705078125, "logps/chosen": -43.0, "logps/rejected": -24.046875, "loss": 0.498, "rewards/accuracies": 0.5, "rewards/chosen": 1.0029296875, "rewards/margins": 1.0263671875, "rewards/rejected": -0.0231170654296875, "step": 1294 }, { "epoch": 0.9592592592592593, "grad_norm": 6.272210121154785, "learning_rate": 4.0740740740740745e-08, "logits/chosen": 1.27734375, "logits/rejected": 2.04296875, "logps/chosen": -44.71875, "logps/rejected": -55.84375, "loss": 0.626, "rewards/accuracies": 1.0, "rewards/chosen": 0.08441162109375, "rewards/margins": 0.141357421875, "rewards/rejected": -0.0570068359375, "step": 1295 }, { "epoch": 0.96, "grad_norm": 1.4430062770843506, "learning_rate": 4e-08, "logits/chosen": 1.4345703125, "logits/rejected": 1.54296875, "logps/chosen": -25.4375, "logps/rejected": -30.453125, "loss": 0.7261, "rewards/accuracies": 0.25, "rewards/chosen": 0.0218658447265625, "rewards/margins": -0.06207275390625, "rewards/rejected": 0.083984375, "step": 1296 }, { "epoch": 0.9607407407407408, "grad_norm": 1.5129932165145874, "learning_rate": 3.9259259259259256e-08, "logits/chosen": 1.162109375, "logits/rejected": 1.537109375, "logps/chosen": -25.3125, "logps/rejected": -26.0625, "loss": 0.7061, "rewards/accuracies": 0.75, "rewards/chosen": 0.11639404296875, "rewards/margins": 0.0191650390625, "rewards/rejected": 0.09722900390625, "step": 1297 }, { "epoch": 0.9614814814814815, "grad_norm": 1.919368028640747, "learning_rate": 3.851851851851852e-08, "logits/chosen": 1.7177734375, "logits/rejected": 1.767578125, "logps/chosen": -31.125, "logps/rejected": -35.5625, "loss": 0.7676, "rewards/accuracies": 0.0, "rewards/chosen": 0.07733154296875, "rewards/margins": -0.14111328125, "rewards/rejected": 0.2183837890625, "step": 1298 }, { "epoch": 0.9622222222222222, "grad_norm": 29.027856826782227, "learning_rate": 3.7777777777777774e-08, "logits/chosen": 2.166015625, "logits/rejected": 1.685546875, "logps/chosen": -42.5625, "logps/rejected": -40.96875, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.017974853515625, "rewards/margins": 0.0191650390625, "rewards/rejected": -0.037109375, "step": 1299 }, { "epoch": 0.9629629629629629, "grad_norm": 9.408013343811035, "learning_rate": 3.7037037037037036e-08, "logits/chosen": 1.7763671875, "logits/rejected": 1.509765625, "logps/chosen": -47.125, "logps/rejected": -32.21875, "loss": 0.8916, "rewards/accuracies": 0.5, "rewards/chosen": -0.31640625, "rewards/margins": -0.266357421875, "rewards/rejected": -0.04998779296875, "step": 1300 }, { "epoch": 0.9637037037037037, "grad_norm": 1.8811546564102173, "learning_rate": 3.62962962962963e-08, "logits/chosen": 1.787109375, "logits/rejected": 1.9287109375, "logps/chosen": -20.734375, "logps/rejected": -38.875, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.10589599609375, "rewards/margins": 0.01837158203125, "rewards/rejected": 0.0875244140625, "step": 1301 }, { "epoch": 0.9644444444444444, "grad_norm": 2.4342172145843506, "learning_rate": 3.5555555555555554e-08, "logits/chosen": 1.6337890625, "logits/rejected": 1.625, "logps/chosen": -44.28125, "logps/rejected": -28.8125, "loss": 0.8413, "rewards/accuracies": 0.25, "rewards/chosen": -0.0859375, "rewards/margins": -0.246826171875, "rewards/rejected": 0.1610107421875, "step": 1302 }, { "epoch": 0.9651851851851851, "grad_norm": 1.9190845489501953, "learning_rate": 3.481481481481481e-08, "logits/chosen": 1.62890625, "logits/rejected": 1.78125, "logps/chosen": -22.03125, "logps/rejected": -38.6875, "loss": 0.7319, "rewards/accuracies": 0.75, "rewards/chosen": 0.06719970703125, "rewards/margins": -0.03204345703125, "rewards/rejected": 0.0992431640625, "step": 1303 }, { "epoch": 0.965925925925926, "grad_norm": 1.8117953538894653, "learning_rate": 3.407407407407407e-08, "logits/chosen": 1.58984375, "logits/rejected": 1.3466796875, "logps/chosen": -31.84375, "logps/rejected": -33.8125, "loss": 0.7114, "rewards/accuracies": 0.25, "rewards/chosen": 0.0140533447265625, "rewards/margins": -0.0292816162109375, "rewards/rejected": 0.04339599609375, "step": 1304 }, { "epoch": 0.9666666666666667, "grad_norm": 1.6402372121810913, "learning_rate": 3.3333333333333334e-08, "logits/chosen": 0.38037109375, "logits/rejected": 0.87939453125, "logps/chosen": -33.5625, "logps/rejected": -35.28125, "loss": 0.6582, "rewards/accuracies": 0.75, "rewards/chosen": 0.1343994140625, "rewards/margins": 0.087890625, "rewards/rejected": 0.046478271484375, "step": 1305 }, { "epoch": 0.9674074074074074, "grad_norm": 1.8283448219299316, "learning_rate": 3.259259259259259e-08, "logits/chosen": 0.84619140625, "logits/rejected": 1.390625, "logps/chosen": -21.03125, "logps/rejected": -77.5625, "loss": 0.5825, "rewards/accuracies": 1.0, "rewards/chosen": 0.06524658203125, "rewards/margins": 0.2626953125, "rewards/rejected": -0.1976318359375, "step": 1306 }, { "epoch": 0.9681481481481482, "grad_norm": 2.8078808784484863, "learning_rate": 3.185185185185185e-08, "logits/chosen": 1.7119140625, "logits/rejected": 1.6181640625, "logps/chosen": -31.96875, "logps/rejected": -39.4375, "loss": 0.8896, "rewards/accuracies": 0.0, "rewards/chosen": -0.1500244140625, "rewards/margins": -0.34033203125, "rewards/rejected": 0.1904296875, "step": 1307 }, { "epoch": 0.9688888888888889, "grad_norm": 1.3117742538452148, "learning_rate": 3.111111111111111e-08, "logits/chosen": 1.3955078125, "logits/rejected": 1.646484375, "logps/chosen": -24.265625, "logps/rejected": -34.34375, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": 0.11053466796875, "rewards/margins": -0.00897216796875, "rewards/rejected": 0.1195068359375, "step": 1308 }, { "epoch": 0.9696296296296296, "grad_norm": 1.7666571140289307, "learning_rate": 3.037037037037037e-08, "logits/chosen": 1.7080078125, "logits/rejected": 1.578125, "logps/chosen": -39.09375, "logps/rejected": -59.15625, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.021087646484375, "rewards/margins": 0.155029296875, "rewards/rejected": -0.134033203125, "step": 1309 }, { "epoch": 0.9703703703703703, "grad_norm": 2.126194953918457, "learning_rate": 2.962962962962963e-08, "logits/chosen": 1.21484375, "logits/rejected": 1.9453125, "logps/chosen": -48.59375, "logps/rejected": -37.09375, "loss": 0.7705, "rewards/accuracies": 0.25, "rewards/chosen": -0.0816650390625, "rewards/margins": -0.1422119140625, "rewards/rejected": 0.060577392578125, "step": 1310 }, { "epoch": 0.9711111111111111, "grad_norm": 1.8105257749557495, "learning_rate": 2.8888888888888887e-08, "logits/chosen": 1.2548828125, "logits/rejected": 1.6650390625, "logps/chosen": -25.84375, "logps/rejected": -36.71875, "loss": 0.7783, "rewards/accuracies": 0.5, "rewards/chosen": -0.03045654296875, "rewards/margins": -0.150390625, "rewards/rejected": 0.11993408203125, "step": 1311 }, { "epoch": 0.9718518518518519, "grad_norm": 1.7449243068695068, "learning_rate": 2.8148148148148146e-08, "logits/chosen": 0.8896484375, "logits/rejected": 1.1513671875, "logps/chosen": -37.71875, "logps/rejected": -24.609375, "loss": 0.8027, "rewards/accuracies": 0.25, "rewards/chosen": -0.067626953125, "rewards/margins": -0.1934814453125, "rewards/rejected": 0.125732421875, "step": 1312 }, { "epoch": 0.9725925925925926, "grad_norm": 1.9265878200531006, "learning_rate": 2.740740740740741e-08, "logits/chosen": 1.986328125, "logits/rejected": 1.4892578125, "logps/chosen": -21.3125, "logps/rejected": -54.03125, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": -0.023040771484375, "rewards/margins": -0.060882568359375, "rewards/rejected": 0.037872314453125, "step": 1313 }, { "epoch": 0.9733333333333334, "grad_norm": 1.5576473474502563, "learning_rate": 2.6666666666666667e-08, "logits/chosen": 1.6298828125, "logits/rejected": 1.171875, "logps/chosen": -33.625, "logps/rejected": -39.46875, "loss": 0.5967, "rewards/accuracies": 0.75, "rewards/chosen": 0.246826171875, "rewards/margins": 0.218017578125, "rewards/rejected": 0.0288848876953125, "step": 1314 }, { "epoch": 0.9740740740740741, "grad_norm": 19.330554962158203, "learning_rate": 2.5925925925925923e-08, "logits/chosen": 1.962890625, "logits/rejected": 2.296875, "logps/chosen": -26.90625, "logps/rejected": -56.28125, "loss": 0.4673, "rewards/accuracies": 1.0, "rewards/chosen": 0.07305908203125, "rewards/margins": 0.650390625, "rewards/rejected": -0.5771484375, "step": 1315 }, { "epoch": 0.9748148148148148, "grad_norm": 2.030810594558716, "learning_rate": 2.5185185185185185e-08, "logits/chosen": 1.779296875, "logits/rejected": 1.568359375, "logps/chosen": -38.1875, "logps/rejected": -46.0, "loss": 0.7617, "rewards/accuracies": 0.25, "rewards/chosen": -0.0062713623046875, "rewards/margins": -0.11492919921875, "rewards/rejected": 0.108642578125, "step": 1316 }, { "epoch": 0.9755555555555555, "grad_norm": 3.243661642074585, "learning_rate": 2.4444444444444444e-08, "logits/chosen": 1.765625, "logits/rejected": 2.01171875, "logps/chosen": -56.8125, "logps/rejected": -25.078125, "loss": 0.8428, "rewards/accuracies": 0.25, "rewards/chosen": -0.1917724609375, "rewards/margins": -0.2744140625, "rewards/rejected": 0.08258056640625, "step": 1317 }, { "epoch": 0.9762962962962963, "grad_norm": 2.0314221382141113, "learning_rate": 2.3703703703703703e-08, "logits/chosen": 1.2255859375, "logits/rejected": 1.6904296875, "logps/chosen": -27.828125, "logps/rejected": -70.5625, "loss": 0.8145, "rewards/accuracies": 0.25, "rewards/chosen": -0.02655029296875, "rewards/margins": -0.2242431640625, "rewards/rejected": 0.1976318359375, "step": 1318 }, { "epoch": 0.977037037037037, "grad_norm": 2.112006902694702, "learning_rate": 2.296296296296296e-08, "logits/chosen": 1.1083984375, "logits/rejected": 1.40234375, "logps/chosen": -23.84375, "logps/rejected": -24.96875, "loss": 0.7959, "rewards/accuracies": 0.25, "rewards/chosen": -0.11212158203125, "rewards/margins": -0.1651611328125, "rewards/rejected": 0.053131103515625, "step": 1319 }, { "epoch": 0.9777777777777777, "grad_norm": 1.8511836528778076, "learning_rate": 2.222222222222222e-08, "logits/chosen": 1.7236328125, "logits/rejected": 1.677734375, "logps/chosen": -30.625, "logps/rejected": -35.78125, "loss": 0.6455, "rewards/accuracies": 0.75, "rewards/chosen": -0.056243896484375, "rewards/margins": 0.1038818359375, "rewards/rejected": -0.16015625, "step": 1320 }, { "epoch": 0.9785185185185186, "grad_norm": 2.0372910499572754, "learning_rate": 2.148148148148148e-08, "logits/chosen": 0.72314453125, "logits/rejected": 1.35546875, "logps/chosen": -26.421875, "logps/rejected": -35.0, "loss": 0.8076, "rewards/accuracies": 0.0, "rewards/chosen": -0.0712890625, "rewards/margins": -0.2138671875, "rewards/rejected": 0.142578125, "step": 1321 }, { "epoch": 0.9792592592592593, "grad_norm": 1.6871439218521118, "learning_rate": 2.0740740740740738e-08, "logits/chosen": 2.017578125, "logits/rejected": 1.6689453125, "logps/chosen": -40.75, "logps/rejected": -71.625, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.1549072265625, "rewards/margins": 0.31689453125, "rewards/rejected": -0.162109375, "step": 1322 }, { "epoch": 0.98, "grad_norm": 2.1053085327148438, "learning_rate": 2e-08, "logits/chosen": 2.123046875, "logits/rejected": 1.771484375, "logps/chosen": -45.84375, "logps/rejected": -36.875, "loss": 0.9458, "rewards/accuracies": 0.0, "rewards/chosen": -0.1754150390625, "rewards/margins": -0.431396484375, "rewards/rejected": 0.255859375, "step": 1323 }, { "epoch": 0.9807407407407407, "grad_norm": 1.430292010307312, "learning_rate": 1.925925925925926e-08, "logits/chosen": 1.44921875, "logits/rejected": 1.7353515625, "logps/chosen": -22.9375, "logps/rejected": -40.8125, "loss": 0.6621, "rewards/accuracies": 0.75, "rewards/chosen": 0.0941162109375, "rewards/margins": 0.08172607421875, "rewards/rejected": 0.01247406005859375, "step": 1324 }, { "epoch": 0.9814814814814815, "grad_norm": 2.0143930912017822, "learning_rate": 1.8518518518518518e-08, "logits/chosen": 1.8583984375, "logits/rejected": 1.4609375, "logps/chosen": -24.296875, "logps/rejected": -52.25, "loss": 0.7188, "rewards/accuracies": 0.75, "rewards/chosen": 0.0187530517578125, "rewards/margins": -0.039886474609375, "rewards/rejected": 0.05865478515625, "step": 1325 }, { "epoch": 0.9822222222222222, "grad_norm": 2.9405465126037598, "learning_rate": 1.7777777777777777e-08, "logits/chosen": 1.7041015625, "logits/rejected": 1.8857421875, "logps/chosen": -31.03125, "logps/rejected": -54.84375, "loss": 0.7583, "rewards/accuracies": 0.25, "rewards/chosen": -0.06640625, "rewards/margins": -0.1090087890625, "rewards/rejected": 0.0426025390625, "step": 1326 }, { "epoch": 0.9829629629629629, "grad_norm": 1.7142229080200195, "learning_rate": 1.7037037037037036e-08, "logits/chosen": 1.087890625, "logits/rejected": 2.390625, "logps/chosen": -29.0625, "logps/rejected": -58.96875, "loss": 0.6729, "rewards/accuracies": 0.75, "rewards/chosen": 0.07794189453125, "rewards/margins": 0.05255126953125, "rewards/rejected": 0.025390625, "step": 1327 }, { "epoch": 0.9837037037037037, "grad_norm": 1.5674097537994385, "learning_rate": 1.6296296296296295e-08, "logits/chosen": 1.2919921875, "logits/rejected": 1.7099609375, "logps/chosen": -44.4375, "logps/rejected": -51.9375, "loss": 0.5625, "rewards/accuracies": 0.75, "rewards/chosen": 0.143310546875, "rewards/margins": 0.29150390625, "rewards/rejected": -0.1480712890625, "step": 1328 }, { "epoch": 0.9844444444444445, "grad_norm": 2.016116142272949, "learning_rate": 1.5555555555555554e-08, "logits/chosen": 0.75927734375, "logits/rejected": 1.646484375, "logps/chosen": -57.0625, "logps/rejected": -52.25, "loss": 0.7568, "rewards/accuracies": 0.25, "rewards/chosen": -0.12225341796875, "rewards/margins": -0.1217041015625, "rewards/rejected": -0.0005826950073242188, "step": 1329 }, { "epoch": 0.9851851851851852, "grad_norm": 1.4160914421081543, "learning_rate": 1.4814814814814814e-08, "logits/chosen": 2.1328125, "logits/rejected": 1.296875, "logps/chosen": -28.578125, "logps/rejected": -28.625, "loss": 0.6538, "rewards/accuracies": 0.75, "rewards/chosen": 0.028106689453125, "rewards/margins": 0.083984375, "rewards/rejected": -0.05584716796875, "step": 1330 }, { "epoch": 0.9859259259259259, "grad_norm": 1.9385197162628174, "learning_rate": 1.4074074074074073e-08, "logits/chosen": 1.9541015625, "logits/rejected": 1.8359375, "logps/chosen": -51.625, "logps/rejected": -27.796875, "loss": 0.7007, "rewards/accuracies": 0.25, "rewards/chosen": -0.10699462890625, "rewards/margins": -0.00580596923828125, "rewards/rejected": -0.1011962890625, "step": 1331 }, { "epoch": 0.9866666666666667, "grad_norm": 2.391014814376831, "learning_rate": 1.3333333333333334e-08, "logits/chosen": 1.0966796875, "logits/rejected": 1.28515625, "logps/chosen": -23.0, "logps/rejected": -59.1875, "loss": 0.5693, "rewards/accuracies": 0.5, "rewards/chosen": 0.146484375, "rewards/margins": 0.351318359375, "rewards/rejected": -0.204833984375, "step": 1332 }, { "epoch": 0.9874074074074074, "grad_norm": 2.6977813243865967, "learning_rate": 1.2592592592592592e-08, "logits/chosen": 1.0712890625, "logits/rejected": 1.4189453125, "logps/chosen": -44.4375, "logps/rejected": -43.21875, "loss": 0.9131, "rewards/accuracies": 0.0, "rewards/chosen": -0.307373046875, "rewards/margins": -0.380859375, "rewards/rejected": 0.07342529296875, "step": 1333 }, { "epoch": 0.9881481481481481, "grad_norm": 6.245678424835205, "learning_rate": 1.1851851851851851e-08, "logits/chosen": 1.7939453125, "logits/rejected": 2.39453125, "logps/chosen": -30.703125, "logps/rejected": -95.1875, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": 0.08477783203125, "rewards/margins": 0.171142578125, "rewards/rejected": -0.08636474609375, "step": 1334 }, { "epoch": 0.9888888888888889, "grad_norm": 1.3540430068969727, "learning_rate": 1.111111111111111e-08, "logits/chosen": 1.7568359375, "logits/rejected": 1.2041015625, "logps/chosen": -35.65625, "logps/rejected": -36.28125, "loss": 0.6118, "rewards/accuracies": 0.75, "rewards/chosen": 0.1395263671875, "rewards/margins": 0.1793212890625, "rewards/rejected": -0.03985595703125, "step": 1335 }, { "epoch": 0.9896296296296296, "grad_norm": 1.5270277261734009, "learning_rate": 1.0370370370370369e-08, "logits/chosen": 1.4423828125, "logits/rejected": 1.8896484375, "logps/chosen": -37.3125, "logps/rejected": -45.125, "loss": 0.5503, "rewards/accuracies": 0.75, "rewards/chosen": 0.1851806640625, "rewards/margins": 0.39990234375, "rewards/rejected": -0.21484375, "step": 1336 }, { "epoch": 0.9903703703703703, "grad_norm": 1.3816496133804321, "learning_rate": 9.62962962962963e-09, "logits/chosen": 1.2646484375, "logits/rejected": 0.99169921875, "logps/chosen": -19.0625, "logps/rejected": -37.0, "loss": 0.7178, "rewards/accuracies": 0.25, "rewards/chosen": 0.01348876953125, "rewards/margins": -0.02557373046875, "rewards/rejected": 0.0390625, "step": 1337 }, { "epoch": 0.9911111111111112, "grad_norm": 5.089969635009766, "learning_rate": 8.888888888888889e-09, "logits/chosen": 1.708984375, "logits/rejected": 1.53125, "logps/chosen": -52.25, "logps/rejected": -55.59375, "loss": 0.8047, "rewards/accuracies": 0.5, "rewards/chosen": -0.2120361328125, "rewards/margins": -0.1346435546875, "rewards/rejected": -0.077392578125, "step": 1338 }, { "epoch": 0.9918518518518519, "grad_norm": 1.5096096992492676, "learning_rate": 8.148148148148147e-09, "logits/chosen": 1.642578125, "logits/rejected": 1.591796875, "logps/chosen": -25.078125, "logps/rejected": -30.40625, "loss": 0.7256, "rewards/accuracies": 0.5, "rewards/chosen": -0.06951904296875, "rewards/margins": -0.057464599609375, "rewards/rejected": -0.01210784912109375, "step": 1339 }, { "epoch": 0.9925925925925926, "grad_norm": 1.953653335571289, "learning_rate": 7.407407407407407e-09, "logits/chosen": 1.4091796875, "logits/rejected": 1.7431640625, "logps/chosen": -34.9375, "logps/rejected": -33.875, "loss": 0.7881, "rewards/accuracies": 0.0, "rewards/chosen": -0.09844970703125, "rewards/margins": -0.180419921875, "rewards/rejected": 0.08203125, "step": 1340 }, { "epoch": 0.9933333333333333, "grad_norm": 4.7146100997924805, "learning_rate": 6.666666666666667e-09, "logits/chosen": 1.1572265625, "logits/rejected": 1.46484375, "logps/chosen": -27.6875, "logps/rejected": -62.21875, "loss": 0.9878, "rewards/accuracies": 0.0, "rewards/chosen": -0.1177978515625, "rewards/margins": -0.47265625, "rewards/rejected": 0.35498046875, "step": 1341 }, { "epoch": 0.9940740740740741, "grad_norm": 1.487478494644165, "learning_rate": 5.925925925925926e-09, "logits/chosen": 1.7333984375, "logits/rejected": 1.6318359375, "logps/chosen": -30.375, "logps/rejected": -29.90625, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": -0.153564453125, "rewards/margins": -0.05157470703125, "rewards/rejected": -0.1019287109375, "step": 1342 }, { "epoch": 0.9948148148148148, "grad_norm": 2.5732593536376953, "learning_rate": 5.1851851851851846e-09, "logits/chosen": 1.9130859375, "logits/rejected": 1.27734375, "logps/chosen": -46.90625, "logps/rejected": -44.375, "loss": 0.709, "rewards/accuracies": 0.25, "rewards/chosen": -0.1781005859375, "rewards/margins": -0.029693603515625, "rewards/rejected": -0.1484375, "step": 1343 }, { "epoch": 0.9955555555555555, "grad_norm": 1.9419912099838257, "learning_rate": 4.444444444444444e-09, "logits/chosen": 1.9833984375, "logits/rejected": 1.94140625, "logps/chosen": -29.9375, "logps/rejected": -56.84375, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": -0.015625, "rewards/margins": -0.0242156982421875, "rewards/rejected": 0.0085601806640625, "step": 1344 }, { "epoch": 0.9962962962962963, "grad_norm": 19.50688934326172, "learning_rate": 3.7037037037037036e-09, "logits/chosen": 1.2314453125, "logits/rejected": 1.408203125, "logps/chosen": -32.75, "logps/rejected": -36.78125, "loss": 0.6021, "rewards/accuracies": 0.75, "rewards/chosen": 0.10467529296875, "rewards/margins": 0.196044921875, "rewards/rejected": -0.0914306640625, "step": 1345 }, { "epoch": 0.997037037037037, "grad_norm": 1.5584927797317505, "learning_rate": 2.962962962962963e-09, "logits/chosen": 1.2529296875, "logits/rejected": 1.892578125, "logps/chosen": -21.890625, "logps/rejected": -37.0625, "loss": 0.5894, "rewards/accuracies": 0.75, "rewards/chosen": 0.00666046142578125, "rewards/margins": 0.29052734375, "rewards/rejected": -0.283935546875, "step": 1346 }, { "epoch": 0.9977777777777778, "grad_norm": 2.347027540206909, "learning_rate": 2.222222222222222e-09, "logits/chosen": 1.9892578125, "logits/rejected": 1.466796875, "logps/chosen": -27.609375, "logps/rejected": -45.875, "loss": 0.7344, "rewards/accuracies": 0.5, "rewards/chosen": -0.1097412109375, "rewards/margins": -0.06756591796875, "rewards/rejected": -0.042205810546875, "step": 1347 }, { "epoch": 0.9985185185185185, "grad_norm": 2.3896875381469727, "learning_rate": 1.4814814814814814e-09, "logits/chosen": 0.8701171875, "logits/rejected": 2.2265625, "logps/chosen": -41.40625, "logps/rejected": -40.1875, "loss": 0.7119, "rewards/accuracies": 0.5, "rewards/chosen": 0.1558837890625, "rewards/margins": -0.0179595947265625, "rewards/rejected": 0.173828125, "step": 1348 }, { "epoch": 0.9992592592592593, "grad_norm": 2.030752420425415, "learning_rate": 7.407407407407407e-10, "logits/chosen": 1.310546875, "logits/rejected": 1.55859375, "logps/chosen": -23.1875, "logps/rejected": -44.9375, "loss": 0.8408, "rewards/accuracies": 0.0, "rewards/chosen": -0.061737060546875, "rewards/margins": -0.2646484375, "rewards/rejected": 0.2027587890625, "step": 1349 }, { "epoch": 1.0, "grad_norm": 2.1538290977478027, "learning_rate": 0.0, "logits/chosen": 1.7392578125, "logits/rejected": 2.2734375, "logps/chosen": -27.375, "logps/rejected": -36.0, "loss": 0.7764, "rewards/accuracies": 0.25, "rewards/chosen": -0.1597900390625, "rewards/margins": -0.154296875, "rewards/rejected": -0.00548553466796875, "step": 1350 } ], "logging_steps": 1, "max_steps": 1350, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7713718274451046e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }