{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5293, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018895554820728424, "grad_norm": 3.571836040630624, "learning_rate": 0.0, "logits/chosen": 1.505859375, "logits/rejected": 2.2568359375, "logps/chosen": -681.0, "logps/rejected": -779.5, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0003779110964145685, "grad_norm": 6.596523433895283, "learning_rate": 1.8867924528301888e-09, "logits/chosen": 2.001953125, "logits/rejected": 2.568359375, "logps/chosen": -927.0, "logps/rejected": -1550.5, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0005668666446218527, "grad_norm": 3.5988224418426107, "learning_rate": 3.7735849056603775e-09, "logits/chosen": 2.55078125, "logits/rejected": 2.921875, "logps/chosen": -785.5, "logps/rejected": -856.5, "loss": 1.0018, "rewards/accuracies": 0.15625, "rewards/chosen": -0.014057159423828125, "rewards/margins": -0.009349822998046875, "rewards/rejected": -0.00466156005859375, "step": 3 }, { "epoch": 0.000755822192829137, "grad_norm": 3.631646626167623, "learning_rate": 5.660377358490565e-09, "logits/chosen": 2.41796875, "logits/rejected": 3.21875, "logps/chosen": -928.0, "logps/rejected": -1004.0, "loss": 0.9885, "rewards/accuracies": 0.25, "rewards/chosen": 0.021549224853515625, "rewards/margins": 0.048828125, "rewards/rejected": -0.02727508544921875, "step": 4 }, { "epoch": 0.0009447777410364212, "grad_norm": 7.568821364532047, "learning_rate": 7.547169811320755e-09, "logits/chosen": 3.03515625, "logits/rejected": 4.6328125, "logps/chosen": -859.0, "logps/rejected": -1704.0, "loss": 1.0059, "rewards/accuracies": 0.28125, "rewards/chosen": -0.020084381103515625, "rewards/margins": -0.02558135986328125, "rewards/rejected": 0.005462646484375, "step": 5 }, { "epoch": 0.0011337332892437054, "grad_norm": 2.839460365129992, "learning_rate": 9.433962264150943e-09, "logits/chosen": 2.0322265625, "logits/rejected": 2.833984375, "logps/chosen": -550.0, "logps/rejected": -786.0, "loss": 1.0046, "rewards/accuracies": 0.21875, "rewards/chosen": -0.0074329376220703125, "rewards/margins": -0.02093505859375, "rewards/rejected": 0.01348876953125, "step": 6 }, { "epoch": 0.0013226888374509897, "grad_norm": 3.1986383462527304, "learning_rate": 1.132075471698113e-08, "logits/chosen": 3.39453125, "logits/rejected": 3.69921875, "logps/chosen": -897.0, "logps/rejected": -664.5, "loss": 0.995, "rewards/accuracies": 0.25, "rewards/chosen": 0.01339578628540039, "rewards/margins": 0.020305633544921875, "rewards/rejected": -0.006940364837646484, "step": 7 }, { "epoch": 0.001511644385658274, "grad_norm": 3.385409513310567, "learning_rate": 1.320754716981132e-08, "logits/chosen": 3.3046875, "logits/rejected": 3.3828125, "logps/chosen": -831.0, "logps/rejected": -635.5, "loss": 0.9943, "rewards/accuracies": 0.28125, "rewards/chosen": 0.018772125244140625, "rewards/margins": 0.023769378662109375, "rewards/rejected": -0.004985809326171875, "step": 8 }, { "epoch": 0.0017005999338655581, "grad_norm": 3.0496695959671953, "learning_rate": 1.509433962264151e-08, "logits/chosen": 2.68359375, "logits/rejected": 3.515625, "logps/chosen": -677.5, "logps/rejected": -683.0, "loss": 0.9999, "rewards/accuracies": 0.21875, "rewards/chosen": -0.016872406005859375, "rewards/margins": -0.00075531005859375, "rewards/rejected": -0.01602935791015625, "step": 9 }, { "epoch": 0.0018895554820728424, "grad_norm": 4.102491465166911, "learning_rate": 1.69811320754717e-08, "logits/chosen": 2.5234375, "logits/rejected": 2.68359375, "logps/chosen": -1107.0, "logps/rejected": -940.5, "loss": 0.9957, "rewards/accuracies": 0.125, "rewards/chosen": -0.001567840576171875, "rewards/margins": 0.01800537109375, "rewards/rejected": -0.019550323486328125, "step": 10 }, { "epoch": 0.0020785110302801264, "grad_norm": 4.744289543553102, "learning_rate": 1.8867924528301887e-08, "logits/chosen": 2.29296875, "logits/rejected": 2.8515625, "logps/chosen": -1158.0, "logps/rejected": -1112.0, "loss": 0.9854, "rewards/accuracies": 0.21875, "rewards/chosen": 0.0148773193359375, "rewards/margins": 0.06103515625, "rewards/rejected": -0.046131134033203125, "step": 11 }, { "epoch": 0.002267466578487411, "grad_norm": 3.5641256383500894, "learning_rate": 2.0754716981132072e-08, "logits/chosen": 2.416015625, "logits/rejected": 2.59375, "logps/chosen": -778.0, "logps/rejected": -827.0, "loss": 1.0042, "rewards/accuracies": 0.1875, "rewards/chosen": 0.011507987976074219, "rewards/margins": -0.0189208984375, "rewards/rejected": 0.03055572509765625, "step": 12 }, { "epoch": 0.002456422126694695, "grad_norm": 7.6440575350013145, "learning_rate": 2.264150943396226e-08, "logits/chosen": 2.19140625, "logits/rejected": 3.31640625, "logps/chosen": -999.0, "logps/rejected": -2015.0, "loss": 0.9971, "rewards/accuracies": 0.28125, "rewards/chosen": 0.017803192138671875, "rewards/margins": 0.009918212890625, "rewards/rejected": 0.007817268371582031, "step": 13 }, { "epoch": 0.0026453776749019793, "grad_norm": 4.455697605574527, "learning_rate": 2.4528301886792452e-08, "logits/chosen": 2.8203125, "logits/rejected": 3.796875, "logps/chosen": -1001.0, "logps/rejected": -1168.0, "loss": 1.0072, "rewards/accuracies": 0.09375, "rewards/chosen": 0.0017671585083007812, "rewards/margins": -0.03170013427734375, "rewards/rejected": 0.03347015380859375, "step": 14 }, { "epoch": 0.0028343332231092633, "grad_norm": 4.134706129764918, "learning_rate": 2.641509433962264e-08, "logits/chosen": 2.201171875, "logits/rejected": 2.98828125, "logps/chosen": -1020.0, "logps/rejected": -942.0, "loss": 0.985, "rewards/accuracies": 0.25, "rewards/chosen": 0.06223106384277344, "rewards/margins": 0.07037353515625, "rewards/rejected": -0.0081939697265625, "step": 15 }, { "epoch": 0.003023288771316548, "grad_norm": 4.041524455572265, "learning_rate": 2.830188679245283e-08, "logits/chosen": 2.921875, "logits/rejected": 3.78515625, "logps/chosen": -1122.0, "logps/rejected": -1041.0, "loss": 1.0037, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00018310546875, "rewards/margins": -0.01667022705078125, "rewards/rejected": 0.0168304443359375, "step": 16 }, { "epoch": 0.003212244319523832, "grad_norm": 3.294656765997145, "learning_rate": 3.018867924528302e-08, "logits/chosen": 2.740234375, "logits/rejected": 2.93359375, "logps/chosen": -909.0, "logps/rejected": -961.0, "loss": 0.9932, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00409698486328125, "rewards/margins": 0.0211181640625, "rewards/rejected": -0.01702880859375, "step": 17 }, { "epoch": 0.0034011998677311163, "grad_norm": 5.088300406827722, "learning_rate": 3.207547169811321e-08, "logits/chosen": 2.95703125, "logits/rejected": 3.6484375, "logps/chosen": -672.0, "logps/rejected": -1179.0, "loss": 0.9993, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0042724609375, "rewards/margins": 0.00299835205078125, "rewards/rejected": 0.001392364501953125, "step": 18 }, { "epoch": 0.0035901554159384003, "grad_norm": 3.2197865875352227, "learning_rate": 3.39622641509434e-08, "logits/chosen": 2.365234375, "logits/rejected": 2.58203125, "logps/chosen": -708.5, "logps/rejected": -709.5, "loss": 1.0048, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0014801025390625, "rewards/margins": -0.01898956298828125, "rewards/rejected": 0.017482757568359375, "step": 19 }, { "epoch": 0.0037791109641456848, "grad_norm": 3.288180769413972, "learning_rate": 3.5849056603773585e-08, "logits/chosen": 1.75, "logits/rejected": 2.5234375, "logps/chosen": -932.0, "logps/rejected": -774.0, "loss": 0.9855, "rewards/accuracies": 0.15625, "rewards/chosen": 0.048736572265625, "rewards/margins": 0.07285308837890625, "rewards/rejected": -0.0238037109375, "step": 20 }, { "epoch": 0.003968066512352969, "grad_norm": 8.10368685723322, "learning_rate": 3.7735849056603774e-08, "logits/chosen": 1.900390625, "logits/rejected": 2.671875, "logps/chosen": -996.5, "logps/rejected": -1301.5, "loss": 0.9973, "rewards/accuracies": 0.4375, "rewards/chosen": 0.01019287109375, "rewards/margins": 0.0098114013671875, "rewards/rejected": 0.00032806396484375, "step": 21 }, { "epoch": 0.004157022060560253, "grad_norm": 2.638037224996867, "learning_rate": 3.962264150943396e-08, "logits/chosen": 2.43359375, "logits/rejected": 2.720703125, "logps/chosen": -584.0, "logps/rejected": -602.0, "loss": 1.0011, "rewards/accuracies": 0.34375, "rewards/chosen": 0.001560211181640625, "rewards/margins": -0.005645751953125, "rewards/rejected": 0.007213592529296875, "step": 22 }, { "epoch": 0.004345977608767537, "grad_norm": 6.64284820318849, "learning_rate": 4.1509433962264144e-08, "logits/chosen": 2.333984375, "logits/rejected": 2.853515625, "logps/chosen": -813.5, "logps/rejected": -1743.0, "loss": 0.9995, "rewards/accuracies": 0.25, "rewards/chosen": 0.009979248046875, "rewards/margins": 0.0005941390991210938, "rewards/rejected": 0.0093994140625, "step": 23 }, { "epoch": 0.004534933156974822, "grad_norm": 12.909410899305641, "learning_rate": 4.339622641509433e-08, "logits/chosen": 2.7578125, "logits/rejected": 3.4609375, "logps/chosen": -749.0, "logps/rejected": -676.0, "loss": 0.9994, "rewards/accuracies": 0.28125, "rewards/chosen": -0.007038116455078125, "rewards/margins": 0.001983642578125, "rewards/rejected": -0.009002685546875, "step": 24 }, { "epoch": 0.004723888705182106, "grad_norm": 5.217146753974185, "learning_rate": 4.528301886792452e-08, "logits/chosen": 2.62109375, "logits/rejected": 2.953125, "logps/chosen": -831.0, "logps/rejected": -922.0, "loss": 0.9968, "rewards/accuracies": 0.25, "rewards/chosen": 0.02532958984375, "rewards/margins": 0.012653350830078125, "rewards/rejected": 0.012706756591796875, "step": 25 }, { "epoch": 0.00491284425338939, "grad_norm": 2.9656822504904, "learning_rate": 4.7169811320754715e-08, "logits/chosen": 3.138671875, "logits/rejected": 3.69921875, "logps/chosen": -1004.5, "logps/rejected": -843.0, "loss": 0.9642, "rewards/accuracies": 0.3125, "rewards/chosen": 0.8279228210449219, "rewards/margins": 0.9262237548828125, "rewards/rejected": -0.097747802734375, "step": 26 }, { "epoch": 0.005101799801596674, "grad_norm": 4.550749704609593, "learning_rate": 4.9056603773584904e-08, "logits/chosen": 2.8828125, "logits/rejected": 3.6328125, "logps/chosen": -840.5, "logps/rejected": -1035.5, "loss": 1.0045, "rewards/accuracies": 0.15625, "rewards/chosen": -0.020748138427734375, "rewards/margins": -0.01874542236328125, "rewards/rejected": -0.0019683837890625, "step": 27 }, { "epoch": 0.005290755349803959, "grad_norm": 3.323014242942387, "learning_rate": 5.094339622641509e-08, "logits/chosen": 3.1171875, "logits/rejected": 3.390625, "logps/chosen": -718.5, "logps/rejected": -551.5, "loss": 1.0029, "rewards/accuracies": 0.28125, "rewards/chosen": -0.029693603515625, "rewards/margins": -0.0130615234375, "rewards/rejected": -0.0165863037109375, "step": 28 }, { "epoch": 0.005479710898011243, "grad_norm": 4.404734674075751, "learning_rate": 5.283018867924528e-08, "logits/chosen": 3.375, "logits/rejected": 3.546875, "logps/chosen": -939.0, "logps/rejected": -951.0, "loss": 1.0089, "rewards/accuracies": 0.28125, "rewards/chosen": 0.0023345947265625, "rewards/margins": -0.044891357421875, "rewards/rejected": 0.0473480224609375, "step": 29 }, { "epoch": 0.005668666446218527, "grad_norm": 3.660515712830508, "learning_rate": 5.471698113207547e-08, "logits/chosen": 2.953125, "logits/rejected": 3.50390625, "logps/chosen": -889.0, "logps/rejected": -1062.0, "loss": 1.0206, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0226898193359375, "rewards/margins": -0.0849609375, "rewards/rejected": 0.06243896484375, "step": 30 }, { "epoch": 0.005857621994425811, "grad_norm": 11.301772998000239, "learning_rate": 5.660377358490566e-08, "logits/chosen": 1.4560546875, "logits/rejected": 2.8828125, "logps/chosen": -687.0, "logps/rejected": -2241.0, "loss": 0.9819, "rewards/accuracies": 0.25, "rewards/chosen": 0.0093536376953125, "rewards/margins": 0.080596923828125, "rewards/rejected": -0.07114410400390625, "step": 31 }, { "epoch": 0.006046577542633096, "grad_norm": 2.993979347423391, "learning_rate": 5.8490566037735845e-08, "logits/chosen": 3.36328125, "logits/rejected": 3.44921875, "logps/chosen": -528.5, "logps/rejected": -571.0, "loss": 0.9983, "rewards/accuracies": 0.25, "rewards/chosen": 0.005096435546875, "rewards/margins": 0.0068359375, "rewards/rejected": -0.00176239013671875, "step": 32 }, { "epoch": 0.00623553309084038, "grad_norm": 4.496250426045665, "learning_rate": 6.037735849056604e-08, "logits/chosen": 1.880859375, "logits/rejected": 2.712890625, "logps/chosen": -1019.5, "logps/rejected": -1534.0, "loss": 0.9946, "rewards/accuracies": 0.25, "rewards/chosen": 0.014862060546875, "rewards/margins": 0.019561767578125, "rewards/rejected": -0.00469970703125, "step": 33 }, { "epoch": 0.006424488639047664, "grad_norm": 3.3504759292881254, "learning_rate": 6.226415094339623e-08, "logits/chosen": 1.8583984375, "logits/rejected": 2.09765625, "logps/chosen": -842.0, "logps/rejected": -697.0, "loss": 1.0006, "rewards/accuracies": 0.28125, "rewards/chosen": 0.0078277587890625, "rewards/margins": -0.00312042236328125, "rewards/rejected": 0.01096343994140625, "step": 34 }, { "epoch": 0.006613444187254948, "grad_norm": 5.019558425214804, "learning_rate": 6.415094339622642e-08, "logits/chosen": 2.390625, "logits/rejected": 3.0390625, "logps/chosen": -1029.0, "logps/rejected": -2026.0, "loss": 0.9768, "rewards/accuracies": 0.28125, "rewards/chosen": -0.00034332275390625, "rewards/margins": 0.1114501953125, "rewards/rejected": -0.11181640625, "step": 35 }, { "epoch": 0.0068023997354622326, "grad_norm": 3.407084167899378, "learning_rate": 6.60377358490566e-08, "logits/chosen": 2.3447265625, "logits/rejected": 3.03125, "logps/chosen": -972.0, "logps/rejected": -634.0, "loss": 0.9867, "rewards/accuracies": 0.34375, "rewards/chosen": 0.05495452880859375, "rewards/margins": 0.0636444091796875, "rewards/rejected": -0.00861358642578125, "step": 36 }, { "epoch": 0.006991355283669517, "grad_norm": 3.9581391686059217, "learning_rate": 6.79245283018868e-08, "logits/chosen": 2.796875, "logits/rejected": 3.291015625, "logps/chosen": -623.5, "logps/rejected": -836.5, "loss": 1.0088, "rewards/accuracies": 0.25, "rewards/chosen": 0.01507568359375, "rewards/margins": -0.035247802734375, "rewards/rejected": 0.05035400390625, "step": 37 }, { "epoch": 0.007180310831876801, "grad_norm": 3.1322460145908484, "learning_rate": 6.981132075471698e-08, "logits/chosen": 3.14453125, "logits/rejected": 3.7421875, "logps/chosen": -703.5, "logps/rejected": -745.0, "loss": 1.0016, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0055084228515625, "rewards/margins": -0.0107574462890625, "rewards/rejected": 0.0052642822265625, "step": 38 }, { "epoch": 0.007369266380084085, "grad_norm": 5.437311080435267, "learning_rate": 7.169811320754717e-08, "logits/chosen": 2.9375, "logits/rejected": 3.298828125, "logps/chosen": -959.0, "logps/rejected": -1631.0, "loss": 1.007, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0025482177734375, "rewards/margins": -0.0286865234375, "rewards/rejected": 0.026214599609375, "step": 39 }, { "epoch": 0.0075582219282913695, "grad_norm": 4.774530991761315, "learning_rate": 7.358490566037736e-08, "logits/chosen": 2.95703125, "logits/rejected": 3.37109375, "logps/chosen": -1062.0, "logps/rejected": -1168.0, "loss": 1.0073, "rewards/accuracies": 0.09375, "rewards/chosen": -0.015178680419921875, "rewards/margins": -0.027587890625, "rewards/rejected": 0.01242828369140625, "step": 40 }, { "epoch": 0.007747177476498654, "grad_norm": 7786.03930718343, "learning_rate": 7.547169811320755e-08, "logits/chosen": 2.9765625, "logits/rejected": 3.140625, "logps/chosen": -1119.0, "logps/rejected": -17691.0, "loss": 0.9996, "rewards/accuracies": 0.21875, "rewards/chosen": 0.008037567138671875, "rewards/margins": 0.00177764892578125, "rewards/rejected": 0.006244659423828125, "step": 41 }, { "epoch": 0.007936133024705938, "grad_norm": 2.9370793898453993, "learning_rate": 7.735849056603774e-08, "logits/chosen": 2.62890625, "logits/rejected": 3.2265625, "logps/chosen": -776.0, "logps/rejected": -692.0, "loss": 0.9991, "rewards/accuracies": 0.40625, "rewards/chosen": 0.008209228515625, "rewards/margins": 0.001708984375, "rewards/rejected": 0.00634765625, "step": 42 }, { "epoch": 0.008125088572913223, "grad_norm": 4.930304236068891, "learning_rate": 7.924528301886792e-08, "logits/chosen": 2.9765625, "logits/rejected": 3.98046875, "logps/chosen": -1269.0, "logps/rejected": -1161.0, "loss": 0.9921, "rewards/accuracies": 0.34375, "rewards/chosen": 0.032684326171875, "rewards/margins": 0.03668212890625, "rewards/rejected": -0.00392913818359375, "step": 43 }, { "epoch": 0.008314044121120506, "grad_norm": 2.7494444095325328, "learning_rate": 8.11320754716981e-08, "logits/chosen": 2.41796875, "logits/rejected": 2.99609375, "logps/chosen": -638.0, "logps/rejected": -597.0, "loss": 0.9928, "rewards/accuracies": 0.34375, "rewards/chosen": 0.02016925811767578, "rewards/margins": 0.02801513671875, "rewards/rejected": -0.00792694091796875, "step": 44 }, { "epoch": 0.00850299966932779, "grad_norm": 2.1958033646084902, "learning_rate": 8.301886792452829e-08, "logits/chosen": 4.08984375, "logits/rejected": 4.671875, "logps/chosen": -371.75, "logps/rejected": -460.75, "loss": 1.0024, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0009765625, "rewards/margins": -0.01053619384765625, "rewards/rejected": 0.00958251953125, "step": 45 }, { "epoch": 0.008691955217535074, "grad_norm": 3.539234273877629, "learning_rate": 8.490566037735848e-08, "logits/chosen": 1.78662109375, "logits/rejected": 2.8740234375, "logps/chosen": -873.0, "logps/rejected": -1047.0, "loss": 0.9785, "rewards/accuracies": 0.15625, "rewards/chosen": -0.0124053955078125, "rewards/margins": 0.11163330078125, "rewards/rejected": -0.12371826171875, "step": 46 }, { "epoch": 0.008880910765742359, "grad_norm": 2.419914647038902, "learning_rate": 8.679245283018866e-08, "logits/chosen": 2.017578125, "logits/rejected": 2.44140625, "logps/chosen": -541.5, "logps/rejected": -495.75, "loss": 1.0052, "rewards/accuracies": 0.15625, "rewards/chosen": -0.0072479248046875, "rewards/margins": -0.0205078125, "rewards/rejected": 0.01331329345703125, "step": 47 }, { "epoch": 0.009069866313949643, "grad_norm": 3.1579071427109224, "learning_rate": 8.867924528301885e-08, "logits/chosen": 2.642578125, "logits/rejected": 3.40625, "logps/chosen": -725.0, "logps/rejected": -815.0, "loss": 1.0011, "rewards/accuracies": 0.28125, "rewards/chosen": 0.01059722900390625, "rewards/margins": -0.005157470703125, "rewards/rejected": 0.0157928466796875, "step": 48 }, { "epoch": 0.009258821862156928, "grad_norm": 3.3512314832858876, "learning_rate": 9.056603773584904e-08, "logits/chosen": 3.18359375, "logits/rejected": 3.78515625, "logps/chosen": -766.5, "logps/rejected": -859.5, "loss": 0.9915, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0431365966796875, "rewards/margins": 0.03818511962890625, "rewards/rejected": 0.0047607421875, "step": 49 }, { "epoch": 0.009447777410364212, "grad_norm": 3.3018185738189985, "learning_rate": 9.245283018867923e-08, "logits/chosen": 2.1484375, "logits/rejected": 2.28515625, "logps/chosen": -814.5, "logps/rejected": -633.0, "loss": 0.9985, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0411376953125, "rewards/margins": 0.012005805969238281, "rewards/rejected": 0.028909683227539062, "step": 50 }, { "epoch": 0.009636732958571497, "grad_norm": 2.994907939530964, "learning_rate": 9.433962264150943e-08, "logits/chosen": 2.2861328125, "logits/rejected": 2.884765625, "logps/chosen": -800.0, "logps/rejected": -695.5, "loss": 0.9987, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0324249267578125, "rewards/margins": 0.004730224609375, "rewards/rejected": -0.03717041015625, "step": 51 }, { "epoch": 0.00982568850677878, "grad_norm": 3.964767870754139, "learning_rate": 9.622641509433962e-08, "logits/chosen": 2.8984375, "logits/rejected": 3.14453125, "logps/chosen": -828.5, "logps/rejected": -994.5, "loss": 1.0193, "rewards/accuracies": 0.09375, "rewards/chosen": -0.02545166015625, "rewards/margins": -0.0762939453125, "rewards/rejected": 0.050872802734375, "step": 52 }, { "epoch": 0.010014644054986064, "grad_norm": 3.072513227576804, "learning_rate": 9.811320754716981e-08, "logits/chosen": 2.986328125, "logits/rejected": 3.85546875, "logps/chosen": -547.5, "logps/rejected": -895.5, "loss": 1.0067, "rewards/accuracies": 0.375, "rewards/chosen": 0.0050048828125, "rewards/margins": -0.035247802734375, "rewards/rejected": 0.04028034210205078, "step": 53 }, { "epoch": 0.010203599603193348, "grad_norm": 6.133522152348281, "learning_rate": 1e-07, "logits/chosen": 3.04296875, "logits/rejected": 3.75390625, "logps/chosen": -660.5, "logps/rejected": -1453.0, "loss": 1.0001, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01763153076171875, "rewards/margins": -0.001739501953125, "rewards/rejected": -0.015960693359375, "step": 54 }, { "epoch": 0.010392555151400633, "grad_norm": 3.1406486039127355, "learning_rate": 1.0188679245283018e-07, "logits/chosen": 2.37890625, "logits/rejected": 3.18359375, "logps/chosen": -672.5, "logps/rejected": -744.0, "loss": 1.007, "rewards/accuracies": 0.25, "rewards/chosen": 0.00543975830078125, "rewards/margins": -0.0295257568359375, "rewards/rejected": 0.035003662109375, "step": 55 }, { "epoch": 0.010581510699607917, "grad_norm": 3.6519045025257744, "learning_rate": 1.0377358490566037e-07, "logits/chosen": 2.546875, "logits/rejected": 3.2421875, "logps/chosen": -656.5, "logps/rejected": -698.5, "loss": 1.012, "rewards/accuracies": 0.28125, "rewards/chosen": 0.02032470703125, "rewards/margins": -0.0583648681640625, "rewards/rejected": 0.07860565185546875, "step": 56 }, { "epoch": 0.010770466247815202, "grad_norm": 4.1489486976039975, "learning_rate": 1.0566037735849056e-07, "logits/chosen": 2.310546875, "logits/rejected": 2.74609375, "logps/chosen": -966.0, "logps/rejected": -664.0, "loss": 1.004, "rewards/accuracies": 0.25, "rewards/chosen": -0.005084991455078125, "rewards/margins": -0.01605224609375, "rewards/rejected": 0.0109405517578125, "step": 57 }, { "epoch": 0.010959421796022486, "grad_norm": 2.52452913993745, "learning_rate": 1.0754716981132075e-07, "logits/chosen": 3.18359375, "logits/rejected": 3.3515625, "logps/chosen": -17529.5, "logps/rejected": -712.5, "loss": 1.0161, "rewards/accuracies": 0.375, "rewards/chosen": -6.417572021484375, "rewards/margins": -6.40155029296875, "rewards/rejected": -0.0207366943359375, "step": 58 }, { "epoch": 0.01114837734422977, "grad_norm": 4.295064900003871, "learning_rate": 1.0943396226415094e-07, "logits/chosen": 2.8251953125, "logits/rejected": 3.4375, "logps/chosen": -852.0, "logps/rejected": -1332.0, "loss": 1.0062, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00354766845703125, "rewards/margins": -0.02703857421875, "rewards/rejected": 0.0235137939453125, "step": 59 }, { "epoch": 0.011337332892437053, "grad_norm": 2.7847533149205357, "learning_rate": 1.1132075471698113e-07, "logits/chosen": 2.6796875, "logits/rejected": 2.748046875, "logps/chosen": -649.5, "logps/rejected": -533.0, "loss": 0.9966, "rewards/accuracies": 0.28125, "rewards/chosen": 0.032257080078125, "rewards/margins": 0.0134735107421875, "rewards/rejected": 0.01879119873046875, "step": 60 }, { "epoch": 0.011526288440644338, "grad_norm": 3.8051570366703613, "learning_rate": 1.1320754716981131e-07, "logits/chosen": 2.119140625, "logits/rejected": 2.6875, "logps/chosen": -1018.0, "logps/rejected": -788.0, "loss": 0.9951, "rewards/accuracies": 0.28125, "rewards/chosen": 0.00507354736328125, "rewards/margins": 0.02008056640625, "rewards/rejected": -0.0149688720703125, "step": 61 }, { "epoch": 0.011715243988851622, "grad_norm": 15.236583163100672, "learning_rate": 1.150943396226415e-07, "logits/chosen": 2.74609375, "logits/rejected": 3.111328125, "logps/chosen": -1111.0, "logps/rejected": -1894.0, "loss": 0.9905, "rewards/accuracies": 0.3125, "rewards/chosen": 0.03179931640625, "rewards/margins": 0.040435791015625, "rewards/rejected": -0.008792877197265625, "step": 62 }, { "epoch": 0.011904199537058907, "grad_norm": 8.670828808787295, "learning_rate": 1.1698113207547169e-07, "logits/chosen": 3.53515625, "logits/rejected": 3.98046875, "logps/chosen": -642.0, "logps/rejected": -1184.5, "loss": 1.006, "rewards/accuracies": 0.25, "rewards/chosen": -0.005481719970703125, "rewards/margins": -0.0234222412109375, "rewards/rejected": 0.017974853515625, "step": 63 }, { "epoch": 0.012093155085266191, "grad_norm": 5.54290261818015, "learning_rate": 1.1886792452830188e-07, "logits/chosen": 2.576171875, "logits/rejected": 3.556640625, "logps/chosen": -812.0, "logps/rejected": -1225.25, "loss": 1.0051, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0011444091796875, "rewards/margins": -0.023651123046875, "rewards/rejected": 0.022491455078125, "step": 64 }, { "epoch": 0.012282110633473476, "grad_norm": 4.709418805466466, "learning_rate": 1.2075471698113208e-07, "logits/chosen": 2.09375, "logits/rejected": 2.234375, "logps/chosen": -1045.0, "logps/rejected": -1696.0, "loss": 1.0007, "rewards/accuracies": 0.21875, "rewards/chosen": -0.032108306884765625, "rewards/margins": 0.130340576171875, "rewards/rejected": -0.1631011962890625, "step": 65 }, { "epoch": 0.01247106618168076, "grad_norm": 3.9357524317494788, "learning_rate": 1.2264150943396226e-07, "logits/chosen": 2.802734375, "logits/rejected": 3.146484375, "logps/chosen": -680.0, "logps/rejected": -936.0, "loss": 1.006, "rewards/accuracies": 0.15625, "rewards/chosen": -0.02967071533203125, "rewards/margins": -0.0231781005859375, "rewards/rejected": -0.00643157958984375, "step": 66 }, { "epoch": 0.012660021729888045, "grad_norm": 3.752244069811938, "learning_rate": 1.2452830188679246e-07, "logits/chosen": 2.73046875, "logits/rejected": 3.71484375, "logps/chosen": -738.5, "logps/rejected": -910.0, "loss": 1.0079, "rewards/accuracies": 0.21875, "rewards/chosen": -0.01726531982421875, "rewards/margins": -0.03389739990234375, "rewards/rejected": 0.0166015625, "step": 67 }, { "epoch": 0.012848977278095327, "grad_norm": 3.8051166389126547, "learning_rate": 1.2641509433962263e-07, "logits/chosen": 2.1611328125, "logits/rejected": 2.796875, "logps/chosen": -1153.0, "logps/rejected": -828.0, "loss": 1.0077, "rewards/accuracies": 0.21875, "rewards/chosen": -0.025577545166015625, "rewards/margins": -0.03430938720703125, "rewards/rejected": 0.0088043212890625, "step": 68 }, { "epoch": 0.013037932826302612, "grad_norm": 3.231031102610451, "learning_rate": 1.2830188679245283e-07, "logits/chosen": 1.97314453125, "logits/rejected": 1.9951171875, "logps/chosen": -768.0, "logps/rejected": -740.0, "loss": 0.9893, "rewards/accuracies": 0.375, "rewards/chosen": 0.0288543701171875, "rewards/margins": 0.04437255859375, "rewards/rejected": -0.015453338623046875, "step": 69 }, { "epoch": 0.013226888374509896, "grad_norm": 2.9161128828048817, "learning_rate": 1.30188679245283e-07, "logits/chosen": 1.95947265625, "logits/rejected": 2.8046875, "logps/chosen": -647.0, "logps/rejected": -905.5, "loss": 1.0034, "rewards/accuracies": 0.25, "rewards/chosen": 0.011562347412109375, "rewards/margins": -0.0147857666015625, "rewards/rejected": 0.026275634765625, "step": 70 }, { "epoch": 0.01341584392271718, "grad_norm": 3.5063578708333387, "learning_rate": 1.320754716981132e-07, "logits/chosen": 1.8369140625, "logits/rejected": 3.091796875, "logps/chosen": -1030.0, "logps/rejected": -830.0, "loss": 1.0007, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0341796875, "rewards/margins": -0.0048828125, "rewards/rejected": -0.0293121337890625, "step": 71 }, { "epoch": 0.013604799470924465, "grad_norm": 5.163340282870066, "learning_rate": 1.3396226415094339e-07, "logits/chosen": 1.552734375, "logits/rejected": 2.083984375, "logps/chosen": -961.0, "logps/rejected": -1370.5, "loss": 0.9971, "rewards/accuracies": 0.28125, "rewards/chosen": -0.02016448974609375, "rewards/margins": 0.01812744140625, "rewards/rejected": -0.038387298583984375, "step": 72 }, { "epoch": 0.01379375501913175, "grad_norm": 4.0937852352258695, "learning_rate": 1.358490566037736e-07, "logits/chosen": 2.068359375, "logits/rejected": 2.83984375, "logps/chosen": -1188.0, "logps/rejected": -888.0, "loss": 0.9788, "rewards/accuracies": 0.3125, "rewards/chosen": 0.015081405639648438, "rewards/margins": 0.094207763671875, "rewards/rejected": -0.0791168212890625, "step": 73 }, { "epoch": 0.013982710567339034, "grad_norm": 9.645966065980605, "learning_rate": 1.3773584905660376e-07, "logits/chosen": 1.806640625, "logits/rejected": 2.5234375, "logps/chosen": -941.0, "logps/rejected": -3062.0, "loss": 1.0122, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02838134765625, "rewards/margins": -0.0537109375, "rewards/rejected": 0.0254974365234375, "step": 74 }, { "epoch": 0.014171666115546318, "grad_norm": 3.5687147627636833, "learning_rate": 1.3962264150943396e-07, "logits/chosen": 3.041015625, "logits/rejected": 3.60546875, "logps/chosen": -942.0, "logps/rejected": -769.5, "loss": 1.0004, "rewards/accuracies": 0.3125, "rewards/chosen": 0.06396484375, "rewards/margins": 0.03765869140625, "rewards/rejected": 0.02618408203125, "step": 75 }, { "epoch": 0.014360621663753601, "grad_norm": 3.548883516182886, "learning_rate": 1.4150943396226414e-07, "logits/chosen": 1.505859375, "logits/rejected": 1.9150390625, "logps/chosen": -794.0, "logps/rejected": -748.0, "loss": 1.0049, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0473480224609375, "rewards/margins": -0.0231170654296875, "rewards/rejected": -0.024250030517578125, "step": 76 }, { "epoch": 0.014549577211960886, "grad_norm": 4.699785858127177, "learning_rate": 1.4339622641509434e-07, "logits/chosen": 2.630859375, "logits/rejected": 3.109375, "logps/chosen": -1153.0, "logps/rejected": -1083.0, "loss": 0.9968, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0037212371826171875, "rewards/margins": 0.012378692626953125, "rewards/rejected": -0.00867462158203125, "step": 77 }, { "epoch": 0.01473853276016817, "grad_norm": 3.326663879000665, "learning_rate": 1.4528301886792452e-07, "logits/chosen": 2.80078125, "logits/rejected": 3.0703125, "logps/chosen": -569.5, "logps/rejected": -779.5, "loss": 1.0062, "rewards/accuracies": 0.375, "rewards/chosen": -0.02874755859375, "rewards/margins": -0.03436279296875, "rewards/rejected": 0.005584716796875, "step": 78 }, { "epoch": 0.014927488308375455, "grad_norm": 4.189091503192095, "learning_rate": 1.4716981132075472e-07, "logits/chosen": 2.171875, "logits/rejected": 3.171875, "logps/chosen": -1025.0, "logps/rejected": -1410.0, "loss": 0.9946, "rewards/accuracies": 0.21875, "rewards/chosen": 0.041290283203125, "rewards/margins": 0.025604248046875, "rewards/rejected": 0.015552520751953125, "step": 79 }, { "epoch": 0.015116443856582739, "grad_norm": 5.407304617986776, "learning_rate": 1.490566037735849e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.806640625, "logps/chosen": -1277.0, "logps/rejected": -1661.5, "loss": 0.9645, "rewards/accuracies": 0.25, "rewards/chosen": 0.037448883056640625, "rewards/margins": 0.20022010803222656, "rewards/rejected": -0.16275596618652344, "step": 80 }, { "epoch": 0.015305399404790023, "grad_norm": 3.967321990212605, "learning_rate": 1.509433962264151e-07, "logits/chosen": 2.6484375, "logits/rejected": 3.83984375, "logps/chosen": -856.0, "logps/rejected": -1064.0, "loss": 0.9833, "rewards/accuracies": 0.3125, "rewards/chosen": 0.005889892578125, "rewards/margins": 0.10845947265625, "rewards/rejected": -0.1027374267578125, "step": 81 }, { "epoch": 0.015494354952997308, "grad_norm": 3.625343725732847, "learning_rate": 1.5283018867924527e-07, "logits/chosen": 3.0859375, "logits/rejected": 3.61328125, "logps/chosen": -1024.5, "logps/rejected": -841.5, "loss": 0.9999, "rewards/accuracies": 0.3125, "rewards/chosen": -0.007808685302734375, "rewards/margins": -0.0029172897338867188, "rewards/rejected": -0.0048828125, "step": 82 }, { "epoch": 0.01568331050120459, "grad_norm": 3.003095901005325, "learning_rate": 1.5471698113207547e-07, "logits/chosen": 2.69921875, "logits/rejected": 3.58203125, "logps/chosen": -897.0, "logps/rejected": -902.0, "loss": 1.0017, "rewards/accuracies": 0.375, "rewards/chosen": 0.025543212890625, "rewards/margins": -0.042572021484375, "rewards/rejected": 0.06817626953125, "step": 83 }, { "epoch": 0.015872266049411877, "grad_norm": 8.278920484059752, "learning_rate": 1.5660377358490565e-07, "logits/chosen": 2.61328125, "logits/rejected": 3.17578125, "logps/chosen": -1905.0, "logps/rejected": -1323.0, "loss": 0.9957, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0440216064453125, "rewards/margins": 0.019287109375, "rewards/rejected": 0.02487945556640625, "step": 84 }, { "epoch": 0.01606122159761916, "grad_norm": 2.7881248683007387, "learning_rate": 1.5849056603773585e-07, "logits/chosen": 3.123046875, "logits/rejected": 3.83203125, "logps/chosen": -624.0, "logps/rejected": -673.0, "loss": 0.9968, "rewards/accuracies": 0.3125, "rewards/chosen": -0.013117790222167969, "rewards/margins": 0.0121612548828125, "rewards/rejected": -0.0252685546875, "step": 85 }, { "epoch": 0.016250177145826446, "grad_norm": 3.2199343734279733, "learning_rate": 1.6037735849056602e-07, "logits/chosen": 1.8515625, "logits/rejected": 2.5263671875, "logps/chosen": -584.0, "logps/rejected": -591.0, "loss": 1.0035, "rewards/accuracies": 0.28125, "rewards/chosen": -0.01508331298828125, "rewards/margins": -0.0152130126953125, "rewards/rejected": 0.00019073486328125, "step": 86 }, { "epoch": 0.01643913269403373, "grad_norm": 3.533962296200102, "learning_rate": 1.622641509433962e-07, "logits/chosen": 3.14453125, "logits/rejected": 3.328125, "logps/chosen": -644.5, "logps/rejected": -704.0, "loss": 1.0109, "rewards/accuracies": 0.3125, "rewards/chosen": -0.03680419921875, "rewards/margins": -0.0472259521484375, "rewards/rejected": 0.010162353515625, "step": 87 }, { "epoch": 0.01662808824224101, "grad_norm": 3.240657244235567, "learning_rate": 1.641509433962264e-07, "logits/chosen": 1.578125, "logits/rejected": 2.373046875, "logps/chosen": -653.5, "logps/rejected": -612.75, "loss": 0.9946, "rewards/accuracies": 0.21875, "rewards/chosen": 0.00313568115234375, "rewards/margins": 0.020172119140625, "rewards/rejected": -0.0170135498046875, "step": 88 }, { "epoch": 0.016817043790448297, "grad_norm": 3.834039445066889, "learning_rate": 1.6603773584905657e-07, "logits/chosen": 2.2861328125, "logits/rejected": 2.58837890625, "logps/chosen": -726.0, "logps/rejected": -855.5, "loss": 1.0006, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0078125, "rewards/margins": -0.0011730194091796875, "rewards/rejected": -0.0066680908203125, "step": 89 }, { "epoch": 0.01700599933865558, "grad_norm": 4.283004688732675, "learning_rate": 1.6792452830188678e-07, "logits/chosen": 3.3046875, "logits/rejected": 3.58984375, "logps/chosen": -1073.5, "logps/rejected": -992.0, "loss": 1.0054, "rewards/accuracies": 0.40625, "rewards/chosen": -0.04085540771484375, "rewards/margins": -0.02880859375, "rewards/rejected": -0.01202392578125, "step": 90 }, { "epoch": 0.017194954886862866, "grad_norm": 2.9585905916897164, "learning_rate": 1.6981132075471695e-07, "logits/chosen": 2.474609375, "logits/rejected": 2.720703125, "logps/chosen": -857.0, "logps/rejected": -674.5, "loss": 0.9967, "rewards/accuracies": 0.28125, "rewards/chosen": -0.007816314697265625, "rewards/margins": 0.012115478515625, "rewards/rejected": -0.01995849609375, "step": 91 }, { "epoch": 0.01738391043507015, "grad_norm": 2.468289037915394, "learning_rate": 1.7169811320754715e-07, "logits/chosen": 2.6455078125, "logits/rejected": 3.31640625, "logps/chosen": -396.5, "logps/rejected": -470.0, "loss": 0.9994, "rewards/accuracies": 0.375, "rewards/chosen": -0.01678466796875, "rewards/margins": 0.002349853515625, "rewards/rejected": -0.0191650390625, "step": 92 }, { "epoch": 0.017572865983277435, "grad_norm": 4.387106882097757, "learning_rate": 1.7358490566037733e-07, "logits/chosen": 2.447265625, "logits/rejected": 2.8203125, "logps/chosen": -963.0, "logps/rejected": -1312.0, "loss": 1.0068, "rewards/accuracies": 0.21875, "rewards/chosen": -0.028167724609375, "rewards/margins": -0.0294342041015625, "rewards/rejected": 0.0011749267578125, "step": 93 }, { "epoch": 0.017761821531484718, "grad_norm": 3.200062953184985, "learning_rate": 1.7547169811320753e-07, "logits/chosen": 1.947265625, "logits/rejected": 3.279296875, "logps/chosen": -763.5, "logps/rejected": -879.0, "loss": 1.0149, "rewards/accuracies": 0.25, "rewards/chosen": -0.024639129638671875, "rewards/margins": -0.06369781494140625, "rewards/rejected": 0.039031982421875, "step": 94 }, { "epoch": 0.017950777079692004, "grad_norm": 8.170639579159637, "learning_rate": 1.773584905660377e-07, "logits/chosen": 2.494140625, "logits/rejected": 3.1171875, "logps/chosen": -642.5, "logps/rejected": -1222.5, "loss": 0.9944, "rewards/accuracies": 0.375, "rewards/chosen": 0.01702880859375, "rewards/margins": 0.02070903778076172, "rewards/rejected": -0.00372314453125, "step": 95 }, { "epoch": 0.018139732627899287, "grad_norm": 3.937552232423435, "learning_rate": 1.792452830188679e-07, "logits/chosen": 3.18359375, "logits/rejected": 3.9765625, "logps/chosen": -896.0, "logps/rejected": -960.0, "loss": 0.9922, "rewards/accuracies": 0.15625, "rewards/chosen": 0.01792144775390625, "rewards/margins": 0.03875732421875, "rewards/rejected": -0.020721435546875, "step": 96 }, { "epoch": 0.01832868817610657, "grad_norm": 3.31220303429093, "learning_rate": 1.8113207547169808e-07, "logits/chosen": 1.3544921875, "logits/rejected": 2.75390625, "logps/chosen": -820.0, "logps/rejected": -827.0, "loss": 0.9958, "rewards/accuracies": 0.3125, "rewards/chosen": 0.008419036865234375, "rewards/margins": 0.015827178955078125, "rewards/rejected": -0.007442474365234375, "step": 97 }, { "epoch": 0.018517643724313856, "grad_norm": 3.6347841105709455, "learning_rate": 1.8301886792452828e-07, "logits/chosen": 1.677734375, "logits/rejected": 1.700927734375, "logps/chosen": -995.0, "logps/rejected": -688.0, "loss": 0.9878, "rewards/accuracies": 0.34375, "rewards/chosen": 0.009851455688476562, "rewards/margins": 0.04937744140625, "rewards/rejected": -0.0394287109375, "step": 98 }, { "epoch": 0.01870659927252114, "grad_norm": 3.1568591181282626, "learning_rate": 1.8490566037735846e-07, "logits/chosen": 2.03515625, "logits/rejected": 2.390625, "logps/chosen": -826.0, "logps/rejected": -610.0, "loss": 0.9933, "rewards/accuracies": 0.3125, "rewards/chosen": 0.014605522155761719, "rewards/margins": 0.028125762939453125, "rewards/rejected": -0.0134735107421875, "step": 99 }, { "epoch": 0.018895554820728425, "grad_norm": 2.4406228760455644, "learning_rate": 1.8679245283018866e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.18359375, "logps/chosen": -602.0, "logps/rejected": -506.5, "loss": 0.9988, "rewards/accuracies": 0.34375, "rewards/chosen": 0.004502296447753906, "rewards/margins": 0.00408935546875, "rewards/rejected": 0.00042724609375, "step": 100 }, { "epoch": 0.019084510368935707, "grad_norm": 3.48969835405886, "learning_rate": 1.8867924528301886e-07, "logits/chosen": 2.267578125, "logits/rejected": 2.458984375, "logps/chosen": -756.25, "logps/rejected": -665.5, "loss": 0.9933, "rewards/accuracies": 0.28125, "rewards/chosen": -0.013092041015625, "rewards/margins": 0.02858734130859375, "rewards/rejected": -0.041778564453125, "step": 101 }, { "epoch": 0.019273465917142994, "grad_norm": 5.0672703741536544, "learning_rate": 1.9056603773584906e-07, "logits/chosen": 2.75390625, "logits/rejected": 3.890625, "logps/chosen": -1349.0, "logps/rejected": -1549.0, "loss": 0.9712, "rewards/accuracies": 0.46875, "rewards/chosen": 0.017009735107421875, "rewards/margins": 0.1581573486328125, "rewards/rejected": -0.140899658203125, "step": 102 }, { "epoch": 0.019462421465350276, "grad_norm": 6.706484935227984, "learning_rate": 1.9245283018867924e-07, "logits/chosen": 2.484375, "logits/rejected": 3.00390625, "logps/chosen": -884.5, "logps/rejected": -1450.0, "loss": 0.9856, "rewards/accuracies": 0.4375, "rewards/chosen": 0.01968669891357422, "rewards/margins": 0.057861328125, "rewards/rejected": -0.038116455078125, "step": 103 }, { "epoch": 0.01965137701355756, "grad_norm": 7.21275799916235, "learning_rate": 1.9433962264150944e-07, "logits/chosen": 2.255859375, "logits/rejected": 3.29296875, "logps/chosen": -1182.5, "logps/rejected": -2166.0, "loss": 0.9796, "rewards/accuracies": 0.34375, "rewards/chosen": 0.029022216796875, "rewards/margins": 0.0919189453125, "rewards/rejected": -0.0630950927734375, "step": 104 }, { "epoch": 0.019840332561764845, "grad_norm": 2.9539363795321467, "learning_rate": 1.9622641509433961e-07, "logits/chosen": 1.078125, "logits/rejected": 2.060546875, "logps/chosen": -726.5, "logps/rejected": -580.5, "loss": 0.9855, "rewards/accuracies": 0.3125, "rewards/chosen": 0.02448272705078125, "rewards/margins": 0.0589447021484375, "rewards/rejected": -0.0344390869140625, "step": 105 }, { "epoch": 0.020029288109972128, "grad_norm": 7.946308427607685, "learning_rate": 1.9811320754716982e-07, "logits/chosen": 3.1328125, "logits/rejected": 4.1015625, "logps/chosen": -1139.0, "logps/rejected": -1719.0, "loss": 0.9772, "rewards/accuracies": 0.25, "rewards/chosen": -0.00782012939453125, "rewards/margins": 0.13480758666992188, "rewards/rejected": -0.1424713134765625, "step": 106 }, { "epoch": 0.020218243658179414, "grad_norm": 2.8884058929316416, "learning_rate": 2e-07, "logits/chosen": 2.87109375, "logits/rejected": 3.419921875, "logps/chosen": -694.5, "logps/rejected": -536.0, "loss": 1.0072, "rewards/accuracies": 0.3125, "rewards/chosen": -0.04241943359375, "rewards/margins": -0.033477783203125, "rewards/rejected": -0.0090179443359375, "step": 107 }, { "epoch": 0.020407199206386697, "grad_norm": 3.1497767800417473, "learning_rate": 2.018867924528302e-07, "logits/chosen": 2.77734375, "logits/rejected": 3.20703125, "logps/chosen": -914.0, "logps/rejected": -756.0, "loss": 0.9878, "rewards/accuracies": 0.34375, "rewards/chosen": -0.001956939697265625, "rewards/margins": 0.049591064453125, "rewards/rejected": -0.0514984130859375, "step": 108 }, { "epoch": 0.020596154754593983, "grad_norm": 4.327172943672276, "learning_rate": 2.0377358490566037e-07, "logits/chosen": 2.861328125, "logits/rejected": 2.88671875, "logps/chosen": -1058.5, "logps/rejected": -886.0, "loss": 0.9784, "rewards/accuracies": 0.375, "rewards/chosen": 0.03529071807861328, "rewards/margins": 0.096588134765625, "rewards/rejected": -0.06131744384765625, "step": 109 }, { "epoch": 0.020785110302801266, "grad_norm": 2.9979902379828336, "learning_rate": 2.0566037735849057e-07, "logits/chosen": 1.697265625, "logits/rejected": 2.05224609375, "logps/chosen": -701.0, "logps/rejected": -672.0, "loss": 0.9984, "rewards/accuracies": 0.3125, "rewards/chosen": -0.04964590072631836, "rewards/margins": 0.0137176513671875, "rewards/rejected": -0.0632476806640625, "step": 110 }, { "epoch": 0.020974065851008552, "grad_norm": 4.0535522182178445, "learning_rate": 2.0754716981132074e-07, "logits/chosen": 2.90234375, "logits/rejected": 3.5234375, "logps/chosen": -965.0, "logps/rejected": -764.0, "loss": 0.989, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0228271484375, "rewards/margins": 0.04235076904296875, "rewards/rejected": -0.0195770263671875, "step": 111 }, { "epoch": 0.021163021399215835, "grad_norm": 3.714253523437312, "learning_rate": 2.0943396226415095e-07, "logits/chosen": 2.69921875, "logits/rejected": 3.39453125, "logps/chosen": -717.0, "logps/rejected": -734.0, "loss": 1.0057, "rewards/accuracies": 0.21875, "rewards/chosen": -0.0400848388671875, "rewards/margins": -0.024370193481445312, "rewards/rejected": -0.015655517578125, "step": 112 }, { "epoch": 0.021351976947423117, "grad_norm": 4.155842866764129, "learning_rate": 2.1132075471698112e-07, "logits/chosen": 2.646484375, "logits/rejected": 3.490234375, "logps/chosen": -1099.0, "logps/rejected": -1090.0, "loss": 0.9904, "rewards/accuracies": 0.3125, "rewards/chosen": 0.01761627197265625, "rewards/margins": 0.0369415283203125, "rewards/rejected": -0.01934051513671875, "step": 113 }, { "epoch": 0.021540932495630404, "grad_norm": 5.2774440719490645, "learning_rate": 2.1320754716981132e-07, "logits/chosen": 3.0234375, "logits/rejected": 3.53125, "logps/chosen": -699.5, "logps/rejected": -1282.5, "loss": 1.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.021484375, "rewards/margins": -0.00115966796875, "rewards/rejected": -0.02036285400390625, "step": 114 }, { "epoch": 0.021729888043837686, "grad_norm": 3.170137427668298, "learning_rate": 2.150943396226415e-07, "logits/chosen": 1.9453125, "logits/rejected": 2.265625, "logps/chosen": -708.5, "logps/rejected": -775.5, "loss": 0.9873, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0092315673828125, "rewards/margins": 0.0506591796875, "rewards/rejected": -0.041412353515625, "step": 115 }, { "epoch": 0.021918843592044972, "grad_norm": 3.4645140088401556, "learning_rate": 2.169811320754717e-07, "logits/chosen": 2.171875, "logits/rejected": 2.373046875, "logps/chosen": -916.5, "logps/rejected": -666.0, "loss": 1.0029, "rewards/accuracies": 0.4375, "rewards/chosen": -0.027557373046875, "rewards/margins": -0.01515960693359375, "rewards/rejected": -0.012310028076171875, "step": 116 }, { "epoch": 0.022107799140252255, "grad_norm": 3.683528452006128, "learning_rate": 2.1886792452830187e-07, "logits/chosen": 2.99609375, "logits/rejected": 3.75, "logps/chosen": -610.5, "logps/rejected": -1124.0, "loss": 0.9878, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0224761962890625, "rewards/margins": 0.0487060546875, "rewards/rejected": -0.07131195068359375, "step": 117 }, { "epoch": 0.02229675468845954, "grad_norm": 4.499905363486643, "learning_rate": 2.2075471698113208e-07, "logits/chosen": 1.7294921875, "logits/rejected": 1.86572265625, "logps/chosen": -1486.0, "logps/rejected": -803.5, "loss": 0.9902, "rewards/accuracies": 0.40625, "rewards/chosen": 0.008600234985351562, "rewards/margins": 0.036712646484375, "rewards/rejected": -0.028106689453125, "step": 118 }, { "epoch": 0.022485710236666824, "grad_norm": 3.482324046645368, "learning_rate": 2.2264150943396225e-07, "logits/chosen": 1.8603515625, "logits/rejected": 3.08984375, "logps/chosen": -678.5, "logps/rejected": -702.0, "loss": 0.9805, "rewards/accuracies": 0.40625, "rewards/chosen": 0.000797271728515625, "rewards/margins": 0.08551025390625, "rewards/rejected": -0.0848388671875, "step": 119 }, { "epoch": 0.022674665784874107, "grad_norm": 3.214934333956976, "learning_rate": 2.2452830188679245e-07, "logits/chosen": 2.158203125, "logits/rejected": 2.72265625, "logps/chosen": -813.0, "logps/rejected": -1270.0, "loss": 0.9833, "rewards/accuracies": 0.28125, "rewards/chosen": -0.0281829833984375, "rewards/margins": 0.110748291015625, "rewards/rejected": -0.138946533203125, "step": 120 }, { "epoch": 0.022863621333081393, "grad_norm": 4.10051732253273, "learning_rate": 2.2641509433962263e-07, "logits/chosen": 2.99609375, "logits/rejected": 3.125, "logps/chosen": -968.0, "logps/rejected": -655.5, "loss": 1.002, "rewards/accuracies": 0.4375, "rewards/chosen": -0.059417724609375, "rewards/margins": -0.0103607177734375, "rewards/rejected": -0.049041748046875, "step": 121 }, { "epoch": 0.023052576881288676, "grad_norm": 3.4969433417643336, "learning_rate": 2.2830188679245283e-07, "logits/chosen": 2.7578125, "logits/rejected": 3.484375, "logps/chosen": -595.5, "logps/rejected": -879.0, "loss": 0.9647, "rewards/accuracies": 0.5, "rewards/chosen": 0.0435791015625, "rewards/margins": 0.15313720703125, "rewards/rejected": -0.1096038818359375, "step": 122 }, { "epoch": 0.023241532429495962, "grad_norm": 4.235278367078347, "learning_rate": 2.30188679245283e-07, "logits/chosen": 2.345703125, "logits/rejected": 3.095703125, "logps/chosen": -921.0, "logps/rejected": -917.0, "loss": 0.9761, "rewards/accuracies": 0.40625, "rewards/chosen": -0.022500991821289062, "rewards/margins": 0.09796142578125, "rewards/rejected": -0.12060546875, "step": 123 }, { "epoch": 0.023430487977703245, "grad_norm": 3.2746804574346373, "learning_rate": 2.320754716981132e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.4296875, "logps/chosen": -519.0, "logps/rejected": -657.0, "loss": 0.9805, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0103912353515625, "rewards/margins": 0.0756072998046875, "rewards/rejected": -0.086029052734375, "step": 124 }, { "epoch": 0.02361944352591053, "grad_norm": 2.7220909372771183, "learning_rate": 2.3396226415094338e-07, "logits/chosen": 2.78125, "logits/rejected": 3.3125, "logps/chosen": -715.0, "logps/rejected": -608.5, "loss": 0.9922, "rewards/accuracies": 0.375, "rewards/chosen": -0.011264801025390625, "rewards/margins": 0.03202056884765625, "rewards/rejected": -0.04311370849609375, "step": 125 }, { "epoch": 0.023808399074117814, "grad_norm": 4.384602715158752, "learning_rate": 2.3584905660377358e-07, "logits/chosen": 2.455078125, "logits/rejected": 2.80859375, "logps/chosen": -717.5, "logps/rejected": -926.0, "loss": 0.9851, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0133819580078125, "rewards/margins": 0.0620574951171875, "rewards/rejected": -0.04852294921875, "step": 126 }, { "epoch": 0.0239973546223251, "grad_norm": 2.851073904107619, "learning_rate": 2.3773584905660376e-07, "logits/chosen": 2.126953125, "logits/rejected": 2.5078125, "logps/chosen": -553.0, "logps/rejected": -567.0, "loss": 0.9869, "rewards/accuracies": 0.4375, "rewards/chosen": 0.001934051513671875, "rewards/margins": 0.052890777587890625, "rewards/rejected": -0.05084228515625, "step": 127 }, { "epoch": 0.024186310170532382, "grad_norm": 2.93585578944786, "learning_rate": 2.3962264150943396e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.96875, "logps/chosen": -682.5, "logps/rejected": -754.0, "loss": 0.9937, "rewards/accuracies": 0.40625, "rewards/chosen": -0.02777099609375, "rewards/margins": 0.0238800048828125, "rewards/rejected": -0.051666259765625, "step": 128 }, { "epoch": 0.024375265718739665, "grad_norm": 4.308196010003208, "learning_rate": 2.4150943396226416e-07, "logits/chosen": 2.4873046875, "logits/rejected": 3.37890625, "logps/chosen": -1144.0, "logps/rejected": -1096.0, "loss": 1.0015, "rewards/accuracies": 0.34375, "rewards/chosen": -0.05377197265625, "rewards/margins": -0.0078125, "rewards/rejected": -0.045867919921875, "step": 129 }, { "epoch": 0.02456422126694695, "grad_norm": 4.785327553678186, "learning_rate": 2.433962264150943e-07, "logits/chosen": 2.1259765625, "logits/rejected": 3.080078125, "logps/chosen": -770.0, "logps/rejected": -1021.0, "loss": 0.9685, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04998779296875, "rewards/margins": 0.12896347045898438, "rewards/rejected": -0.0789794921875, "step": 130 }, { "epoch": 0.024753176815154234, "grad_norm": 3.8525923783223583, "learning_rate": 2.452830188679245e-07, "logits/chosen": 3.6953125, "logits/rejected": 4.53125, "logps/chosen": -808.5, "logps/rejected": -1418.0, "loss": 0.9642, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00244140625, "rewards/margins": 0.18658447265625, "rewards/rejected": -0.189453125, "step": 131 }, { "epoch": 0.02494213236336152, "grad_norm": 4.140511408246227, "learning_rate": 2.471698113207547e-07, "logits/chosen": 2.3984375, "logits/rejected": 2.857421875, "logps/chosen": -891.0, "logps/rejected": -1202.5, "loss": 0.9772, "rewards/accuracies": 0.4375, "rewards/chosen": -0.02008056640625, "rewards/margins": 0.09433555603027344, "rewards/rejected": -0.1146240234375, "step": 132 }, { "epoch": 0.025131087911568803, "grad_norm": 3.011789221922201, "learning_rate": 2.490566037735849e-07, "logits/chosen": 2.21875, "logits/rejected": 2.95703125, "logps/chosen": -742.5, "logps/rejected": -1930.0, "loss": 0.9863, "rewards/accuracies": 0.375, "rewards/chosen": -0.037578582763671875, "rewards/margins": -0.045166015625, "rewards/rejected": 0.008392333984375, "step": 133 }, { "epoch": 0.02532004345977609, "grad_norm": 3.0173551682312385, "learning_rate": 2.5094339622641506e-07, "logits/chosen": 2.404296875, "logits/rejected": 2.984375, "logps/chosen": -744.0, "logps/rejected": -563.5, "loss": 0.9766, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0179901123046875, "rewards/margins": 0.09674072265625, "rewards/rejected": -0.1147003173828125, "step": 134 }, { "epoch": 0.025508999007983372, "grad_norm": 3.60078116272245, "learning_rate": 2.5283018867924526e-07, "logits/chosen": 2.09375, "logits/rejected": 2.150390625, "logps/chosen": -980.0, "logps/rejected": -823.0, "loss": 1.0035, "rewards/accuracies": 0.375, "rewards/chosen": -0.05887603759765625, "rewards/margins": -0.0180816650390625, "rewards/rejected": -0.04085540771484375, "step": 135 }, { "epoch": 0.025697954556190655, "grad_norm": 3.138850160359396, "learning_rate": 2.5471698113207547e-07, "logits/chosen": 2.603515625, "logits/rejected": 3.341796875, "logps/chosen": -785.5, "logps/rejected": -802.0, "loss": 0.9545, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0296783447265625, "rewards/margins": 0.19627761840820312, "rewards/rejected": -0.166656494140625, "step": 136 }, { "epoch": 0.02588691010439794, "grad_norm": 3.0357584605120658, "learning_rate": 2.5660377358490567e-07, "logits/chosen": 2.327880859375, "logits/rejected": 3.0439453125, "logps/chosen": -671.5, "logps/rejected": -849.0, "loss": 0.9689, "rewards/accuracies": 0.5, "rewards/chosen": 0.042388916015625, "rewards/margins": 0.135223388671875, "rewards/rejected": -0.092803955078125, "step": 137 }, { "epoch": 0.026075865652605223, "grad_norm": 2.965854885596104, "learning_rate": 2.584905660377358e-07, "logits/chosen": 2.650390625, "logits/rejected": 2.857421875, "logps/chosen": -774.5, "logps/rejected": -699.0, "loss": 0.9835, "rewards/accuracies": 0.53125, "rewards/chosen": -0.02374267578125, "rewards/margins": 0.0640869140625, "rewards/rejected": -0.087860107421875, "step": 138 }, { "epoch": 0.02626482120081251, "grad_norm": 3.8383975005575217, "learning_rate": 2.60377358490566e-07, "logits/chosen": 2.96875, "logits/rejected": 3.1640625, "logps/chosen": -5896.0, "logps/rejected": -874.5, "loss": 0.928, "rewards/accuracies": 0.59375, "rewards/chosen": 1.6230926513671875, "rewards/margins": 1.857330322265625, "rewards/rejected": -0.231109619140625, "step": 139 }, { "epoch": 0.026453776749019792, "grad_norm": 3.2505308915412, "learning_rate": 2.622641509433962e-07, "logits/chosen": 2.5078125, "logits/rejected": 3.2265625, "logps/chosen": -650.0, "logps/rejected": -1011.0, "loss": 0.9709, "rewards/accuracies": 0.4375, "rewards/chosen": 0.008544921875, "rewards/margins": 0.11904144287109375, "rewards/rejected": -0.110107421875, "step": 140 }, { "epoch": 0.02664273229722708, "grad_norm": 3.7781254911148188, "learning_rate": 2.641509433962264e-07, "logits/chosen": 2.1201171875, "logits/rejected": 2.34326171875, "logps/chosen": -950.0, "logps/rejected": -838.0, "loss": 0.9915, "rewards/accuracies": 0.5, "rewards/chosen": -0.06573486328125, "rewards/margins": 0.029998779296875, "rewards/rejected": -0.09576416015625, "step": 141 }, { "epoch": 0.02683168784543436, "grad_norm": 7778.7115327585125, "learning_rate": 2.6603773584905657e-07, "logits/chosen": 2.45703125, "logits/rejected": 2.0859375, "logps/chosen": -644.5, "logps/rejected": -17310.5, "loss": 0.986, "rewards/accuracies": 0.4375, "rewards/chosen": -0.012899398803710938, "rewards/margins": 0.0559234619140625, "rewards/rejected": -0.06878662109375, "step": 142 }, { "epoch": 0.027020643393641647, "grad_norm": 3.3017110398282474, "learning_rate": 2.6792452830188677e-07, "logits/chosen": 2.763671875, "logits/rejected": 3.44140625, "logps/chosen": -730.5, "logps/rejected": -941.0, "loss": 0.9639, "rewards/accuracies": 0.5, "rewards/chosen": 0.01434326171875, "rewards/margins": 0.15167236328125, "rewards/rejected": -0.1372528076171875, "step": 143 }, { "epoch": 0.02720959894184893, "grad_norm": 3.6909707608424283, "learning_rate": 2.6981132075471697e-07, "logits/chosen": 0.7177734375, "logits/rejected": 1.744140625, "logps/chosen": -881.5, "logps/rejected": -818.0, "loss": 0.95, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0163726806640625, "rewards/margins": 0.204345703125, "rewards/rejected": -0.187744140625, "step": 144 }, { "epoch": 0.027398554490056213, "grad_norm": 2.294748596773968, "learning_rate": 2.716981132075472e-07, "logits/chosen": 1.9921875, "logits/rejected": 2.86328125, "logps/chosen": -492.5, "logps/rejected": -423.75, "loss": 0.9863, "rewards/accuracies": 0.5, "rewards/chosen": -0.027435302734375, "rewards/margins": 0.05492591857910156, "rewards/rejected": -0.08258056640625, "step": 145 }, { "epoch": 0.0275875100382635, "grad_norm": 3.0479112780346904, "learning_rate": 2.735849056603773e-07, "logits/chosen": 1.923828125, "logits/rejected": 2.037109375, "logps/chosen": -708.0, "logps/rejected": -569.5, "loss": 0.9841, "rewards/accuracies": 0.375, "rewards/chosen": -0.03516387939453125, "rewards/margins": 0.06591796875, "rewards/rejected": -0.10113525390625, "step": 146 }, { "epoch": 0.027776465586470782, "grad_norm": 3.4682223517583366, "learning_rate": 2.754716981132075e-07, "logits/chosen": 3.017578125, "logits/rejected": 3.259765625, "logps/chosen": -872.5, "logps/rejected": -770.5, "loss": 0.9667, "rewards/accuracies": 0.53125, "rewards/chosen": -0.021697998046875, "rewards/margins": 0.1357421875, "rewards/rejected": -0.1571044921875, "step": 147 }, { "epoch": 0.027965421134678068, "grad_norm": 3.713847265387797, "learning_rate": 2.773584905660377e-07, "logits/chosen": 2.61328125, "logits/rejected": 2.751953125, "logps/chosen": -864.0, "logps/rejected": -907.0, "loss": 0.9685, "rewards/accuracies": 0.5, "rewards/chosen": -0.03397369384765625, "rewards/margins": 0.127685546875, "rewards/rejected": -0.1619873046875, "step": 148 }, { "epoch": 0.02815437668288535, "grad_norm": 3.1351656339045495, "learning_rate": 2.7924528301886793e-07, "logits/chosen": 1.83203125, "logits/rejected": 2.373046875, "logps/chosen": -848.5, "logps/rejected": -817.5, "loss": 0.9568, "rewards/accuracies": 0.4375, "rewards/chosen": 0.029693603515625, "rewards/margins": 0.17633056640625, "rewards/rejected": -0.1463623046875, "step": 149 }, { "epoch": 0.028343332231092637, "grad_norm": 5.594364793476688, "learning_rate": 2.811320754716981e-07, "logits/chosen": 3.265625, "logits/rejected": 3.494140625, "logps/chosen": -1173.5, "logps/rejected": -668.0, "loss": 0.9918, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0463714599609375, "rewards/margins": 0.03387451171875, "rewards/rejected": -0.08013916015625, "step": 150 }, { "epoch": 0.02853228777929992, "grad_norm": 3.1979270497028187, "learning_rate": 2.830188679245283e-07, "logits/chosen": 2.322265625, "logits/rejected": 3.140625, "logps/chosen": -812.0, "logps/rejected": -831.5, "loss": 0.9707, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0111083984375, "rewards/margins": 0.1256103515625, "rewards/rejected": -0.11431884765625, "step": 151 }, { "epoch": 0.028721243327507202, "grad_norm": 3.306512096617463, "learning_rate": 2.849056603773585e-07, "logits/chosen": 1.611328125, "logits/rejected": 2.2197265625, "logps/chosen": -876.0, "logps/rejected": -855.5, "loss": 0.9479, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0089111328125, "rewards/margins": 0.22259521484375, "rewards/rejected": -0.230712890625, "step": 152 }, { "epoch": 0.02891019887571449, "grad_norm": 3.263080256917803, "learning_rate": 2.867924528301887e-07, "logits/chosen": 2.625, "logits/rejected": 3.142578125, "logps/chosen": -947.5, "logps/rejected": -669.0, "loss": 0.9613, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00213623046875, "rewards/margins": 0.16745758056640625, "rewards/rejected": -0.16986083984375, "step": 153 }, { "epoch": 0.02909915442392177, "grad_norm": 2.2421526732368364, "learning_rate": 2.8867924528301883e-07, "logits/chosen": 2.654296875, "logits/rejected": 3.529296875, "logps/chosen": -487.75, "logps/rejected": -473.5, "loss": 0.9703, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00225830078125, "rewards/margins": 0.12078857421875, "rewards/rejected": -0.1226806640625, "step": 154 }, { "epoch": 0.029288109972129057, "grad_norm": 2.823753398737972, "learning_rate": 2.9056603773584903e-07, "logits/chosen": 2.193359375, "logits/rejected": 2.900390625, "logps/chosen": -711.0, "logps/rejected": -652.5, "loss": 0.9696, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0354461669921875, "rewards/margins": 0.1270751953125, "rewards/rejected": -0.1622314453125, "step": 155 }, { "epoch": 0.02947706552033634, "grad_norm": 2.7359789988371226, "learning_rate": 2.9245283018867923e-07, "logits/chosen": 2.5546875, "logits/rejected": 2.4296875, "logps/chosen": -581.0, "logps/rejected": -453.5, "loss": 0.9945, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0772705078125, "rewards/margins": 0.0203857421875, "rewards/rejected": -0.0975341796875, "step": 156 }, { "epoch": 0.029666021068543626, "grad_norm": 2.3924247287408136, "learning_rate": 2.9433962264150943e-07, "logits/chosen": 1.73583984375, "logits/rejected": 2.39453125, "logps/chosen": -507.5, "logps/rejected": -524.5, "loss": 0.979, "rewards/accuracies": 0.5, "rewards/chosen": -0.04657173156738281, "rewards/margins": 0.0814208984375, "rewards/rejected": -0.1279296875, "step": 157 }, { "epoch": 0.02985497661675091, "grad_norm": 3.4091762611564267, "learning_rate": 2.962264150943396e-07, "logits/chosen": 2.0703125, "logits/rejected": 2.5009765625, "logps/chosen": -860.0, "logps/rejected": -658.0, "loss": 0.9822, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0816650390625, "rewards/margins": 0.0738372802734375, "rewards/rejected": -0.155517578125, "step": 158 }, { "epoch": 0.030043932164958195, "grad_norm": 3.8316094927376563, "learning_rate": 2.981132075471698e-07, "logits/chosen": 2.361328125, "logits/rejected": 3.23828125, "logps/chosen": -1008.0, "logps/rejected": -1183.0, "loss": 0.975, "rewards/accuracies": 0.40625, "rewards/chosen": -0.07281494140625, "rewards/margins": 0.10650634765625, "rewards/rejected": -0.1788330078125, "step": 159 }, { "epoch": 0.030232887713165478, "grad_norm": 6.638130740391131, "learning_rate": 3e-07, "logits/chosen": 2.7109375, "logits/rejected": 3.21875, "logps/chosen": -605.0, "logps/rejected": -1481.0, "loss": 0.973, "rewards/accuracies": 0.5, "rewards/chosen": -0.0540771484375, "rewards/margins": 0.108795166015625, "rewards/rejected": -0.16314697265625, "step": 160 }, { "epoch": 0.03042184326137276, "grad_norm": 16.211739772932493, "learning_rate": 3.018867924528302e-07, "logits/chosen": 3.3515625, "logits/rejected": 4.4140625, "logps/chosen": -942.0, "logps/rejected": -1392.0, "loss": 0.9746, "rewards/accuracies": 0.46875, "rewards/chosen": -0.04840230941772461, "rewards/margins": 0.111968994140625, "rewards/rejected": -0.160888671875, "step": 161 }, { "epoch": 0.030610798809580047, "grad_norm": 4.059191917823587, "learning_rate": 3.0377358490566034e-07, "logits/chosen": 2.072265625, "logits/rejected": 3.154296875, "logps/chosen": -634.0, "logps/rejected": -1112.5, "loss": 0.9855, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08941650390625, "rewards/margins": 0.058837890625, "rewards/rejected": -0.148193359375, "step": 162 }, { "epoch": 0.03079975435778733, "grad_norm": 3.364683443382378, "learning_rate": 3.0566037735849054e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.7412109375, "logps/chosen": -825.5, "logps/rejected": -17742.0, "loss": 0.9482, "rewards/accuracies": 0.5, "rewards/chosen": -0.056427001953125, "rewards/margins": 12.980911254882812, "rewards/rejected": -13.032958984375, "step": 163 }, { "epoch": 0.030988709905994616, "grad_norm": 3.727443557340476, "learning_rate": 3.0754716981132074e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.44140625, "logps/chosen": -923.0, "logps/rejected": -1022.0, "loss": 0.9724, "rewards/accuracies": 0.46875, "rewards/chosen": -0.097320556640625, "rewards/margins": 0.113037109375, "rewards/rejected": -0.210693359375, "step": 164 }, { "epoch": 0.0311776654542019, "grad_norm": 3.8514139450544627, "learning_rate": 3.0943396226415094e-07, "logits/chosen": 2.59375, "logits/rejected": 3.19921875, "logps/chosen": -1083.0, "logps/rejected": -856.0, "loss": 0.9523, "rewards/accuracies": 0.625, "rewards/chosen": -0.01959228515625, "rewards/margins": 0.197357177734375, "rewards/rejected": -0.2166748046875, "step": 165 }, { "epoch": 0.03136662100240918, "grad_norm": 2.9274778495703004, "learning_rate": 3.113207547169811e-07, "logits/chosen": 2.548828125, "logits/rejected": 2.666015625, "logps/chosen": -658.0, "logps/rejected": -512.5, "loss": 0.9712, "rewards/accuracies": 0.5, "rewards/chosen": -0.0732421875, "rewards/margins": 0.1217041015625, "rewards/rejected": -0.1949462890625, "step": 166 }, { "epoch": 0.03155557655061647, "grad_norm": 2.9247303179614694, "learning_rate": 3.132075471698113e-07, "logits/chosen": 1.9609375, "logits/rejected": 2.64453125, "logps/chosen": -761.0, "logps/rejected": -622.0, "loss": 0.9631, "rewards/accuracies": 0.625, "rewards/chosen": -0.0682220458984375, "rewards/margins": 0.1724853515625, "rewards/rejected": -0.240966796875, "step": 167 }, { "epoch": 0.031744532098823754, "grad_norm": 2.72609009560235, "learning_rate": 3.150943396226415e-07, "logits/chosen": 2.185546875, "logits/rejected": 2.376953125, "logps/chosen": -812.0, "logps/rejected": -603.0, "loss": 0.9441, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06201171875, "rewards/margins": 0.2408447265625, "rewards/rejected": -0.3017578125, "step": 168 }, { "epoch": 0.031933487647031036, "grad_norm": 2.7554529927129034, "learning_rate": 3.169811320754717e-07, "logits/chosen": 1.818359375, "logits/rejected": 2.4375, "logps/chosen": -758.0, "logps/rejected": -758.0, "loss": 0.96, "rewards/accuracies": 0.5, "rewards/chosen": -0.0143890380859375, "rewards/margins": 0.181915283203125, "rewards/rejected": -0.1962890625, "step": 169 }, { "epoch": 0.03212244319523832, "grad_norm": 3.11654752605198, "learning_rate": 3.1886792452830184e-07, "logits/chosen": 1.9222412109375, "logits/rejected": 2.494140625, "logps/chosen": -735.0, "logps/rejected": -863.0, "loss": 0.9531, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0322265625, "rewards/margins": 0.2134246826171875, "rewards/rejected": -0.24566650390625, "step": 170 }, { "epoch": 0.0323113987434456, "grad_norm": 2.4418974038847563, "learning_rate": 3.2075471698113204e-07, "logits/chosen": 2.365234375, "logits/rejected": 2.693359375, "logps/chosen": -524.5, "logps/rejected": -581.5, "loss": 0.9669, "rewards/accuracies": 0.625, "rewards/chosen": -0.036927223205566406, "rewards/margins": 0.1348876953125, "rewards/rejected": -0.171630859375, "step": 171 }, { "epoch": 0.03250035429165289, "grad_norm": 2.138205615394506, "learning_rate": 3.2264150943396225e-07, "logits/chosen": 1.89453125, "logits/rejected": 1.6318359375, "logps/chosen": -467.25, "logps/rejected": -396.75, "loss": 0.9685, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0360565185546875, "rewards/margins": 0.1287841796875, "rewards/rejected": -0.164794921875, "step": 172 }, { "epoch": 0.032689309839860174, "grad_norm": 3.2988001048726936, "learning_rate": 3.245283018867924e-07, "logits/chosen": 1.7353515625, "logits/rejected": 1.58203125, "logps/chosen": -756.0, "logps/rejected": -990.0, "loss": 0.9701, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0307464599609375, "rewards/margins": 0.1231689453125, "rewards/rejected": -0.1536865234375, "step": 173 }, { "epoch": 0.03287826538806746, "grad_norm": 2.3726304625546137, "learning_rate": 3.264150943396226e-07, "logits/chosen": 1.697265625, "logits/rejected": 1.986328125, "logps/chosen": -555.5, "logps/rejected": -573.5, "loss": 0.9673, "rewards/accuracies": 0.53125, "rewards/chosen": -0.041412353515625, "rewards/margins": 0.1376190185546875, "rewards/rejected": -0.17889404296875, "step": 174 }, { "epoch": 0.03306722093627474, "grad_norm": 4.183360760201133, "learning_rate": 3.283018867924528e-07, "logits/chosen": 2.0927734375, "logits/rejected": 2.96484375, "logps/chosen": -958.5, "logps/rejected": -991.5, "loss": 0.9574, "rewards/accuracies": 0.71875, "rewards/chosen": -0.007293701171875, "rewards/margins": 0.188232421875, "rewards/rejected": -0.19580078125, "step": 175 }, { "epoch": 0.03325617648448202, "grad_norm": 2.750352650243821, "learning_rate": 3.30188679245283e-07, "logits/chosen": 2.3720703125, "logits/rejected": 2.75146484375, "logps/chosen": -724.0, "logps/rejected": -511.5, "loss": 0.9624, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06331634521484375, "rewards/margins": 0.150390625, "rewards/rejected": -0.2137451171875, "step": 176 }, { "epoch": 0.03344513203268931, "grad_norm": 3.401759339790889, "learning_rate": 3.3207547169811315e-07, "logits/chosen": 2.52197265625, "logits/rejected": 2.85986328125, "logps/chosen": -961.0, "logps/rejected": -1146.0, "loss": 0.9725, "rewards/accuracies": 0.5, "rewards/chosen": -0.01531982421875, "rewards/margins": 0.11492919921875, "rewards/rejected": -0.130126953125, "step": 177 }, { "epoch": 0.033634087580896595, "grad_norm": 3.8964872902444663, "learning_rate": 3.3396226415094335e-07, "logits/chosen": 2.9638671875, "logits/rejected": 3.64453125, "logps/chosen": -551.0, "logps/rejected": -820.0, "loss": 0.9674, "rewards/accuracies": 0.5, "rewards/chosen": -0.05153799057006836, "rewards/margins": 0.1432342529296875, "rewards/rejected": -0.1949462890625, "step": 178 }, { "epoch": 0.03382304312910388, "grad_norm": 4.5248248658723025, "learning_rate": 3.3584905660377355e-07, "logits/chosen": 2.3115234375, "logits/rejected": 3.93359375, "logps/chosen": -966.0, "logps/rejected": -2718.0, "loss": 0.8967, "rewards/accuracies": 0.71875, "rewards/chosen": -0.068389892578125, "rewards/margins": 0.7548828125, "rewards/rejected": -0.8232421875, "step": 179 }, { "epoch": 0.03401199867731116, "grad_norm": 2.6323122820693077, "learning_rate": 3.3773584905660375e-07, "logits/chosen": 2.96484375, "logits/rejected": 3.625, "logps/chosen": -665.0, "logps/rejected": -759.0, "loss": 0.9563, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0047760009765625, "rewards/margins": 0.196441650390625, "rewards/rejected": -0.191650390625, "step": 180 }, { "epoch": 0.03420095422551845, "grad_norm": 5.241678672823782, "learning_rate": 3.396226415094339e-07, "logits/chosen": 1.978515625, "logits/rejected": 2.78125, "logps/chosen": -930.0, "logps/rejected": -1693.5, "loss": 0.9465, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01934814453125, "rewards/margins": 0.277587890625, "rewards/rejected": -0.296875, "step": 181 }, { "epoch": 0.03438990977372573, "grad_norm": 3.0127206610831854, "learning_rate": 3.415094339622641e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.70703125, "logps/chosen": -741.5, "logps/rejected": -647.5, "loss": 0.9556, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0809326171875, "rewards/margins": 0.1993408203125, "rewards/rejected": -0.27978515625, "step": 182 }, { "epoch": 0.034578865321933015, "grad_norm": 3.859909885296323, "learning_rate": 3.433962264150943e-07, "logits/chosen": 2.7421875, "logits/rejected": 3.02734375, "logps/chosen": -841.0, "logps/rejected": -850.5, "loss": 0.9862, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1422119140625, "rewards/margins": 0.04974365234375, "rewards/rejected": -0.1923828125, "step": 183 }, { "epoch": 0.0347678208701403, "grad_norm": 3.417947184740284, "learning_rate": 3.452830188679245e-07, "logits/chosen": 1.5517578125, "logits/rejected": 1.892578125, "logps/chosen": -780.0, "logps/rejected": -1031.0, "loss": 0.9265, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05987548828125, "rewards/margins": 0.355224609375, "rewards/rejected": -0.41455078125, "step": 184 }, { "epoch": 0.03495677641834758, "grad_norm": 2.0153449153991967, "learning_rate": 3.4716981132075466e-07, "logits/chosen": 2.099609375, "logits/rejected": 2.5859375, "logps/chosen": -393.25, "logps/rejected": -358.5, "loss": 0.9706, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0474853515625, "rewards/margins": 0.119903564453125, "rewards/rejected": -0.1671142578125, "step": 185 }, { "epoch": 0.03514573196655487, "grad_norm": 3.555202531747825, "learning_rate": 3.4905660377358486e-07, "logits/chosen": 1.3828125, "logits/rejected": 1.751953125, "logps/chosen": -868.5, "logps/rejected": -798.0, "loss": 0.936, "rewards/accuracies": 0.65625, "rewards/chosen": 0.042999267578125, "rewards/margins": 0.390869140625, "rewards/rejected": -0.348876953125, "step": 186 }, { "epoch": 0.03533468751476215, "grad_norm": 2.9010706259157795, "learning_rate": 3.5094339622641506e-07, "logits/chosen": 1.36962890625, "logits/rejected": 1.8671875, "logps/chosen": -672.5, "logps/rejected": -746.5, "loss": 0.9509, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1455078125, "rewards/margins": 0.23077392578125, "rewards/rejected": -0.376708984375, "step": 187 }, { "epoch": 0.035523643062969436, "grad_norm": 2.4814648076163475, "learning_rate": 3.5283018867924526e-07, "logits/chosen": 2.07421875, "logits/rejected": 2.80078125, "logps/chosen": -749.0, "logps/rejected": -989.5, "loss": 0.9362, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05301475524902344, "rewards/margins": 0.312255859375, "rewards/rejected": -0.365234375, "step": 188 }, { "epoch": 0.03571259861117672, "grad_norm": 2.8180553240801838, "learning_rate": 3.547169811320754e-07, "logits/chosen": 2.21484375, "logits/rejected": 2.69921875, "logps/chosen": -728.0, "logps/rejected": -701.0, "loss": 0.9463, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0986328125, "rewards/margins": 0.23291015625, "rewards/rejected": -0.33154296875, "step": 189 }, { "epoch": 0.03590155415938401, "grad_norm": 3.006335316069373, "learning_rate": 3.566037735849056e-07, "logits/chosen": 2.671875, "logits/rejected": 4.09375, "logps/chosen": -722.0, "logps/rejected": -1358.5, "loss": 0.9529, "rewards/accuracies": 0.53125, "rewards/chosen": -0.162353515625, "rewards/margins": 0.385498046875, "rewards/rejected": -0.5474853515625, "step": 190 }, { "epoch": 0.03609050970759129, "grad_norm": 3.6035589184096466, "learning_rate": 3.584905660377358e-07, "logits/chosen": 2.51953125, "logits/rejected": 2.6484375, "logps/chosen": -1206.0, "logps/rejected": -1541.5, "loss": 0.9177, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0549468994140625, "rewards/margins": 0.4879150390625, "rewards/rejected": -0.541015625, "step": 191 }, { "epoch": 0.036279465255798574, "grad_norm": 3.053738633361808, "learning_rate": 3.60377358490566e-07, "logits/chosen": 1.7646484375, "logits/rejected": 1.808349609375, "logps/chosen": -1107.0, "logps/rejected": -932.5, "loss": 0.9279, "rewards/accuracies": 0.65625, "rewards/chosen": -0.020263671875, "rewards/margins": 0.37274169921875, "rewards/rejected": -0.3924560546875, "step": 192 }, { "epoch": 0.036468420804005856, "grad_norm": 2.944652884982271, "learning_rate": 3.6226415094339616e-07, "logits/chosen": 1.89453125, "logits/rejected": 2.85546875, "logps/chosen": -968.5, "logps/rejected": -837.5, "loss": 0.9598, "rewards/accuracies": 0.59375, "rewards/chosen": -0.199462890625, "rewards/margins": 0.188232421875, "rewards/rejected": -0.388671875, "step": 193 }, { "epoch": 0.03665737635221314, "grad_norm": 3.3334677350495348, "learning_rate": 3.6415094339622636e-07, "logits/chosen": 1.7421875, "logits/rejected": 2.29296875, "logps/chosen": -949.5, "logps/rejected": -950.0, "loss": 0.9185, "rewards/accuracies": 0.625, "rewards/chosen": -0.02752685546875, "rewards/margins": 0.40875244140625, "rewards/rejected": -0.4361572265625, "step": 194 }, { "epoch": 0.03684633190042043, "grad_norm": 2.5574492438120746, "learning_rate": 3.6603773584905657e-07, "logits/chosen": 1.8857421875, "logits/rejected": 3.255859375, "logps/chosen": -451.0, "logps/rejected": -508.0, "loss": 0.9418, "rewards/accuracies": 0.71875, "rewards/chosen": -0.089111328125, "rewards/margins": 0.2744140625, "rewards/rejected": -0.363037109375, "step": 195 }, { "epoch": 0.03703528744862771, "grad_norm": 2.662118478841741, "learning_rate": 3.6792452830188677e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.41015625, "logps/chosen": -681.5, "logps/rejected": -503.0, "loss": 0.9736, "rewards/accuracies": 0.5, "rewards/chosen": -0.03125, "rewards/margins": 0.15489959716796875, "rewards/rejected": -0.1859130859375, "step": 196 }, { "epoch": 0.037224242996834994, "grad_norm": 3.03228186151406, "learning_rate": 3.698113207547169e-07, "logits/chosen": 2.6171875, "logits/rejected": 3.3046875, "logps/chosen": -706.5, "logps/rejected": -554.5, "loss": 0.9602, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1629638671875, "rewards/margins": 0.17470550537109375, "rewards/rejected": -0.337646484375, "step": 197 }, { "epoch": 0.03741319854504228, "grad_norm": 5.657278156842082, "learning_rate": 3.716981132075471e-07, "logits/chosen": 2.521484375, "logits/rejected": 2.64453125, "logps/chosen": -562.0, "logps/rejected": -1516.5, "loss": 0.9452, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09108734130859375, "rewards/margins": 0.230224609375, "rewards/rejected": -0.321533203125, "step": 198 }, { "epoch": 0.03760215409324956, "grad_norm": 3.0262332993450367, "learning_rate": 3.735849056603773e-07, "logits/chosen": 1.720703125, "logits/rejected": 2.693359375, "logps/chosen": -998.0, "logps/rejected": -719.0, "loss": 0.9596, "rewards/accuracies": 0.625, "rewards/chosen": -0.10107421875, "rewards/margins": 0.184814453125, "rewards/rejected": -0.2852783203125, "step": 199 }, { "epoch": 0.03779110964145685, "grad_norm": 2.3172825069379748, "learning_rate": 3.7547169811320757e-07, "logits/chosen": 2.122802734375, "logits/rejected": 2.9716796875, "logps/chosen": -650.5, "logps/rejected": -745.5, "loss": 0.9304, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0771484375, "rewards/margins": 0.44677734375, "rewards/rejected": -0.525390625, "step": 200 }, { "epoch": 0.03798006518966413, "grad_norm": 3.478253680172613, "learning_rate": 3.773584905660377e-07, "logits/chosen": 2.37109375, "logits/rejected": 3.515625, "logps/chosen": -938.0, "logps/rejected": -1214.0, "loss": 0.8984, "rewards/accuracies": 0.71875, "rewards/chosen": -0.053466796875, "rewards/margins": 0.50146484375, "rewards/rejected": -0.5546875, "step": 201 }, { "epoch": 0.038169020737871415, "grad_norm": 2.799609681803864, "learning_rate": 3.792452830188679e-07, "logits/chosen": 1.40625, "logits/rejected": 2.56640625, "logps/chosen": -646.5, "logps/rejected": -1259.0, "loss": 0.9467, "rewards/accuracies": 0.625, "rewards/chosen": -0.13433837890625, "rewards/margins": 0.2822265625, "rewards/rejected": -0.415771484375, "step": 202 }, { "epoch": 0.0383579762860787, "grad_norm": 2.9211520784770073, "learning_rate": 3.811320754716981e-07, "logits/chosen": 2.177734375, "logits/rejected": 3.5859375, "logps/chosen": -586.5, "logps/rejected": -1153.5, "loss": 0.924, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10134506225585938, "rewards/margins": 0.3603515625, "rewards/rejected": -0.46240234375, "step": 203 }, { "epoch": 0.03854693183428599, "grad_norm": 2.2904878864745757, "learning_rate": 3.8301886792452833e-07, "logits/chosen": 2.515625, "logits/rejected": 3.29296875, "logps/chosen": -474.0, "logps/rejected": -596.5, "loss": 0.9512, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09307861328125, "rewards/margins": 0.22369384765625, "rewards/rejected": -0.316650390625, "step": 204 }, { "epoch": 0.03873588738249327, "grad_norm": 6.0791473451228, "learning_rate": 3.849056603773585e-07, "logits/chosen": 1.886932373046875, "logits/rejected": 2.592041015625, "logps/chosen": -864.0, "logps/rejected": -1273.0, "loss": 0.9651, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1385498046875, "rewards/margins": 0.16387939453125, "rewards/rejected": -0.302490234375, "step": 205 }, { "epoch": 0.03892484293070055, "grad_norm": 2.9683957809080326, "learning_rate": 3.867924528301887e-07, "logits/chosen": 1.6181640625, "logits/rejected": 1.687744140625, "logps/chosen": -793.0, "logps/rejected": -774.0, "loss": 0.9023, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1417236328125, "rewards/margins": 0.443359375, "rewards/rejected": -0.58447265625, "step": 206 }, { "epoch": 0.039113798478907835, "grad_norm": 2.973594471604034, "learning_rate": 3.886792452830189e-07, "logits/chosen": 1.626953125, "logits/rejected": 2.662109375, "logps/chosen": -965.5, "logps/rejected": -1104.5, "loss": 0.9379, "rewards/accuracies": 0.4375, "rewards/chosen": -0.160400390625, "rewards/margins": 0.3248291015625, "rewards/rejected": -0.4853515625, "step": 207 }, { "epoch": 0.03930275402711512, "grad_norm": 2.2438898270926324, "learning_rate": 3.905660377358491e-07, "logits/chosen": 3.064453125, "logits/rejected": 3.3984375, "logps/chosen": -512.75, "logps/rejected": -548.0, "loss": 0.965, "rewards/accuracies": 0.5, "rewards/chosen": -0.0992431640625, "rewards/margins": 0.1668701171875, "rewards/rejected": -0.265869140625, "step": 208 }, { "epoch": 0.03949170957532241, "grad_norm": 2.913637948706713, "learning_rate": 3.9245283018867923e-07, "logits/chosen": 1.54736328125, "logits/rejected": 2.3271484375, "logps/chosen": -799.0, "logps/rejected": -998.0, "loss": 0.9182, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04888916015625, "rewards/margins": 0.4178466796875, "rewards/rejected": -0.4658203125, "step": 209 }, { "epoch": 0.03968066512352969, "grad_norm": 2.6851929724844603, "learning_rate": 3.9433962264150943e-07, "logits/chosen": 1.6484375, "logits/rejected": 2.671875, "logps/chosen": -813.0, "logps/rejected": -710.0, "loss": 0.9299, "rewards/accuracies": 0.6875, "rewards/chosen": 0.029296875, "rewards/margins": 0.3463134765625, "rewards/rejected": -0.317138671875, "step": 210 }, { "epoch": 0.03986962067173697, "grad_norm": 3.1245364024204325, "learning_rate": 3.9622641509433963e-07, "logits/chosen": 2.43359375, "logits/rejected": 2.494140625, "logps/chosen": -774.5, "logps/rejected": -7387.5, "loss": 0.985, "rewards/accuracies": 0.40625, "rewards/chosen": -0.02557373046875, "rewards/margins": -3.05743408203125, "rewards/rejected": 3.044921875, "step": 211 }, { "epoch": 0.040058576219944256, "grad_norm": 2.562433898621542, "learning_rate": 3.9811320754716983e-07, "logits/chosen": 1.962890625, "logits/rejected": 2.609375, "logps/chosen": -749.0, "logps/rejected": -754.0, "loss": 0.9189, "rewards/accuracies": 0.65625, "rewards/chosen": -0.042633056640625, "rewards/margins": 0.366943359375, "rewards/rejected": -0.40966796875, "step": 212 }, { "epoch": 0.040247531768151545, "grad_norm": 2.9617643447696715, "learning_rate": 4e-07, "logits/chosen": 2.9375, "logits/rejected": 3.69140625, "logps/chosen": -670.5, "logps/rejected": -726.0, "loss": 0.9266, "rewards/accuracies": 0.65625, "rewards/chosen": -0.136474609375, "rewards/margins": 0.3751220703125, "rewards/rejected": -0.512451171875, "step": 213 }, { "epoch": 0.04043648731635883, "grad_norm": 2.6488432167331615, "learning_rate": 4.018867924528302e-07, "logits/chosen": 1.623046875, "logits/rejected": 1.84375, "logps/chosen": -503.0, "logps/rejected": -555.5, "loss": 0.9612, "rewards/accuracies": 0.625, "rewards/chosen": -0.068115234375, "rewards/margins": 0.177001953125, "rewards/rejected": -0.244873046875, "step": 214 }, { "epoch": 0.04062544286456611, "grad_norm": 2.9928044744958933, "learning_rate": 4.037735849056604e-07, "logits/chosen": 0.69873046875, "logits/rejected": 1.27685546875, "logps/chosen": -624.0, "logps/rejected": -623.0, "loss": 0.9172, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0924072265625, "rewards/margins": 0.38525390625, "rewards/rejected": -0.4775390625, "step": 215 }, { "epoch": 0.040814398412773394, "grad_norm": 2.8729510319159313, "learning_rate": 4.056603773584906e-07, "logits/chosen": 1.8623046875, "logits/rejected": 2.564453125, "logps/chosen": -786.0, "logps/rejected": -753.0, "loss": 0.9023, "rewards/accuracies": 0.65625, "rewards/chosen": -0.079498291015625, "rewards/margins": 0.46142578125, "rewards/rejected": -0.5400390625, "step": 216 }, { "epoch": 0.041003353960980676, "grad_norm": 3.848739453796428, "learning_rate": 4.0754716981132074e-07, "logits/chosen": 3.08984375, "logits/rejected": 3.625, "logps/chosen": -943.5, "logps/rejected": -1197.0, "loss": 0.9603, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26611328125, "rewards/margins": 0.3182373046875, "rewards/rejected": -0.58447265625, "step": 217 }, { "epoch": 0.041192309509187966, "grad_norm": 2.4223444496408373, "learning_rate": 4.0943396226415094e-07, "logits/chosen": 1.211181640625, "logits/rejected": 1.820556640625, "logps/chosen": -880.75, "logps/rejected": -742.0, "loss": 0.9167, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0115966796875, "rewards/margins": 0.44091796875, "rewards/rejected": -0.4521484375, "step": 218 }, { "epoch": 0.04138126505739525, "grad_norm": 2.6931593789483306, "learning_rate": 4.1132075471698114e-07, "logits/chosen": 2.234375, "logits/rejected": 1.9888916015625, "logps/chosen": -668.5, "logps/rejected": -467.5, "loss": 0.9541, "rewards/accuracies": 0.625, "rewards/chosen": -0.11431884765625, "rewards/margins": 0.205078125, "rewards/rejected": -0.3197021484375, "step": 219 }, { "epoch": 0.04157022060560253, "grad_norm": 2.427318166602575, "learning_rate": 4.1320754716981134e-07, "logits/chosen": 2.529296875, "logits/rejected": 3.44921875, "logps/chosen": -494.0, "logps/rejected": -582.75, "loss": 0.9536, "rewards/accuracies": 0.59375, "rewards/chosen": 0.013378143310546875, "rewards/margins": 0.223388671875, "rewards/rejected": -0.210205078125, "step": 220 }, { "epoch": 0.041759176153809814, "grad_norm": 2.586402941212883, "learning_rate": 4.150943396226415e-07, "logits/chosen": 1.84423828125, "logits/rejected": 2.73828125, "logps/chosen": -628.0, "logps/rejected": -764.0, "loss": 0.931, "rewards/accuracies": 0.6875, "rewards/chosen": -0.018218994140625, "rewards/margins": 0.35888671875, "rewards/rejected": -0.377197265625, "step": 221 }, { "epoch": 0.041948131702017104, "grad_norm": 2.423854452079881, "learning_rate": 4.169811320754717e-07, "logits/chosen": 1.4267578125, "logits/rejected": 2.033203125, "logps/chosen": -525.5, "logps/rejected": -518.0, "loss": 0.9426, "rewards/accuracies": 0.71875, "rewards/chosen": -0.09161376953125, "rewards/margins": 0.258056640625, "rewards/rejected": -0.349609375, "step": 222 }, { "epoch": 0.042137087250224387, "grad_norm": 3.480988960473042, "learning_rate": 4.188679245283019e-07, "logits/chosen": 1.140380859375, "logits/rejected": 1.845703125, "logps/chosen": -851.0, "logps/rejected": -1012.0, "loss": 0.9037, "rewards/accuracies": 0.75, "rewards/chosen": -0.121795654296875, "rewards/margins": 0.468017578125, "rewards/rejected": -0.58935546875, "step": 223 }, { "epoch": 0.04232604279843167, "grad_norm": 3.03651585779056, "learning_rate": 4.207547169811321e-07, "logits/chosen": 2.0263671875, "logits/rejected": 3.181640625, "logps/chosen": -902.0, "logps/rejected": -1016.5, "loss": 0.8739, "rewards/accuracies": 0.78125, "rewards/chosen": 0.02426910400390625, "rewards/margins": 0.65087890625, "rewards/rejected": -0.6279296875, "step": 224 }, { "epoch": 0.04251499834663895, "grad_norm": 2.465110630516761, "learning_rate": 4.2264150943396224e-07, "logits/chosen": 2.373046875, "logits/rejected": 3.23828125, "logps/chosen": -1347.0, "logps/rejected": -1035.0, "loss": 0.8703, "rewards/accuracies": 0.6875, "rewards/chosen": 0.163330078125, "rewards/margins": 0.81005859375, "rewards/rejected": -0.64501953125, "step": 225 }, { "epoch": 0.042703953894846235, "grad_norm": 3.4271046815604422, "learning_rate": 4.2452830188679244e-07, "logits/chosen": 1.8876953125, "logits/rejected": 2.39453125, "logps/chosen": -1111.0, "logps/rejected": -1022.0, "loss": 0.9118, "rewards/accuracies": 0.625, "rewards/chosen": -0.00927734375, "rewards/margins": 0.46551513671875, "rewards/rejected": -0.474365234375, "step": 226 }, { "epoch": 0.042892909443053524, "grad_norm": 2.6675286921305905, "learning_rate": 4.2641509433962265e-07, "logits/chosen": 0.87109375, "logits/rejected": 2.0390625, "logps/chosen": -732.5, "logps/rejected": -619.5, "loss": 0.9108, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1261749267578125, "rewards/margins": 0.391845703125, "rewards/rejected": -0.51806640625, "step": 227 }, { "epoch": 0.04308186499126081, "grad_norm": 2.55094897879031, "learning_rate": 4.2830188679245285e-07, "logits/chosen": 1.7578125, "logits/rejected": 2.0625, "logps/chosen": -613.5, "logps/rejected": -628.5, "loss": 0.9236, "rewards/accuracies": 0.71875, "rewards/chosen": -0.141357421875, "rewards/margins": 0.369384765625, "rewards/rejected": -0.51171875, "step": 228 }, { "epoch": 0.04327082053946809, "grad_norm": 2.4837761861327303, "learning_rate": 4.30188679245283e-07, "logits/chosen": 2.306640625, "logits/rejected": 3.34765625, "logps/chosen": -662.25, "logps/rejected": -799.5, "loss": 0.9231, "rewards/accuracies": 0.625, "rewards/chosen": -0.1766357421875, "rewards/margins": 0.451904296875, "rewards/rejected": -0.627685546875, "step": 229 }, { "epoch": 0.04345977608767537, "grad_norm": 3.0011785441501497, "learning_rate": 4.320754716981132e-07, "logits/chosen": 1.35888671875, "logits/rejected": 2.46875, "logps/chosen": -738.0, "logps/rejected": -840.0, "loss": 0.907, "rewards/accuracies": 0.78125, "rewards/chosen": -0.113525390625, "rewards/margins": 0.41650390625, "rewards/rejected": -0.52978515625, "step": 230 }, { "epoch": 0.043648731635882655, "grad_norm": 2.578546368783491, "learning_rate": 4.339622641509434e-07, "logits/chosen": 1.53448486328125, "logits/rejected": 2.3203125, "logps/chosen": -591.5, "logps/rejected": -585.5, "loss": 0.9117, "rewards/accuracies": 0.71875, "rewards/chosen": -0.068359375, "rewards/margins": 0.45703125, "rewards/rejected": -0.525390625, "step": 231 }, { "epoch": 0.043837687184089945, "grad_norm": 3.565578070372061, "learning_rate": 4.358490566037736e-07, "logits/chosen": 2.8359375, "logits/rejected": 3.671875, "logps/chosen": -1082.5, "logps/rejected": -1100.0, "loss": 0.8845, "rewards/accuracies": 0.71875, "rewards/chosen": -0.081787109375, "rewards/margins": 0.649169921875, "rewards/rejected": -0.7294921875, "step": 232 }, { "epoch": 0.04402664273229723, "grad_norm": 2.8642956489316878, "learning_rate": 4.3773584905660375e-07, "logits/chosen": 1.9453125, "logits/rejected": 2.53125, "logps/chosen": -873.5, "logps/rejected": -799.0, "loss": 0.9253, "rewards/accuracies": 0.5, "rewards/chosen": -0.1728515625, "rewards/margins": 0.487762451171875, "rewards/rejected": -0.6591796875, "step": 233 }, { "epoch": 0.04421559828050451, "grad_norm": 2.5610137898681615, "learning_rate": 4.3962264150943395e-07, "logits/chosen": 1.226806640625, "logits/rejected": 1.791015625, "logps/chosen": -914.5, "logps/rejected": -764.0, "loss": 0.8965, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0850830078125, "rewards/margins": 0.570068359375, "rewards/rejected": -0.655029296875, "step": 234 }, { "epoch": 0.04440455382871179, "grad_norm": 2.61557743648572, "learning_rate": 4.4150943396226415e-07, "logits/chosen": 1.318359375, "logits/rejected": 1.802734375, "logps/chosen": -637.0, "logps/rejected": -505.5, "loss": 0.9731, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13873291015625, "rewards/margins": 0.109130859375, "rewards/rejected": -0.248291015625, "step": 235 }, { "epoch": 0.04459350937691908, "grad_norm": 2.451785940473782, "learning_rate": 4.4339622641509435e-07, "logits/chosen": 1.8271484375, "logits/rejected": 2.5205078125, "logps/chosen": -727.5, "logps/rejected": -768.5, "loss": 0.9076, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1390380859375, "rewards/margins": 0.50830078125, "rewards/rejected": -0.6478271484375, "step": 236 }, { "epoch": 0.044782464925126365, "grad_norm": 2.369519000781259, "learning_rate": 4.452830188679245e-07, "logits/chosen": 1.76171875, "logits/rejected": 2.5078125, "logps/chosen": -1061.5, "logps/rejected": -806.5, "loss": 0.9335, "rewards/accuracies": 0.40625, "rewards/chosen": -0.125244140625, "rewards/margins": 0.3731689453125, "rewards/rejected": -0.49658203125, "step": 237 }, { "epoch": 0.04497142047333365, "grad_norm": 2.5513263572452773, "learning_rate": 4.471698113207547e-07, "logits/chosen": 1.4443359375, "logits/rejected": 2.4375, "logps/chosen": -643.0, "logps/rejected": -632.5, "loss": 0.9167, "rewards/accuracies": 0.8125, "rewards/chosen": -0.071136474609375, "rewards/margins": 0.387939453125, "rewards/rejected": -0.45947265625, "step": 238 }, { "epoch": 0.04516037602154093, "grad_norm": 3.641172757200689, "learning_rate": 4.490566037735849e-07, "logits/chosen": 1.912109375, "logits/rejected": 2.703125, "logps/chosen": -738.0, "logps/rejected": -774.0, "loss": 0.9216, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0911865234375, "rewards/margins": 0.5107421875, "rewards/rejected": -0.6015625, "step": 239 }, { "epoch": 0.045349331569748214, "grad_norm": 4.431237471078401, "learning_rate": 4.509433962264151e-07, "logits/chosen": 2.46484375, "logits/rejected": 3.17578125, "logps/chosen": -1040.5, "logps/rejected": -1679.5, "loss": 0.8726, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3592071533203125, "rewards/margins": 0.8909912109375, "rewards/rejected": -0.531982421875, "step": 240 }, { "epoch": 0.0455382871179555, "grad_norm": 4.588851597682168, "learning_rate": 4.5283018867924526e-07, "logits/chosen": 1.9208984375, "logits/rejected": 2.5615234375, "logps/chosen": -739.0, "logps/rejected": -1690.0, "loss": 0.9327, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1566162109375, "rewards/margins": 0.46533203125, "rewards/rejected": -0.62158203125, "step": 241 }, { "epoch": 0.045727242666162786, "grad_norm": 2.8319931461598866, "learning_rate": 4.5471698113207546e-07, "logits/chosen": 1.427734375, "logits/rejected": 1.76171875, "logps/chosen": -893.5, "logps/rejected": -648.0, "loss": 0.9097, "rewards/accuracies": 0.65625, "rewards/chosen": -0.173065185546875, "rewards/margins": 0.444580078125, "rewards/rejected": -0.61767578125, "step": 242 }, { "epoch": 0.04591619821437007, "grad_norm": 3.0449218915193095, "learning_rate": 4.5660377358490566e-07, "logits/chosen": 2.609375, "logits/rejected": 2.80078125, "logps/chosen": -1081.0, "logps/rejected": -1245.5, "loss": 0.8505, "rewards/accuracies": 0.78125, "rewards/chosen": -0.011962890625, "rewards/margins": 0.935546875, "rewards/rejected": -0.94677734375, "step": 243 }, { "epoch": 0.04610515376257735, "grad_norm": 2.146462796377429, "learning_rate": 4.5849056603773586e-07, "logits/chosen": 1.6875, "logits/rejected": 2.5390625, "logps/chosen": -767.0, "logps/rejected": -655.0, "loss": 0.9106, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2198486328125, "rewards/margins": 0.5858154296875, "rewards/rejected": -0.80419921875, "step": 244 }, { "epoch": 0.04629410931078464, "grad_norm": 2.2882357846308845, "learning_rate": 4.60377358490566e-07, "logits/chosen": 2.162109375, "logits/rejected": 2.50390625, "logps/chosen": -547.5, "logps/rejected": -572.0, "loss": 0.8989, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04510498046875, "rewards/margins": 0.5029296875, "rewards/rejected": -0.5478515625, "step": 245 }, { "epoch": 0.046483064858991924, "grad_norm": 3.0963469174326175, "learning_rate": 4.622641509433962e-07, "logits/chosen": 0.8583984375, "logits/rejected": 1.4580078125, "logps/chosen": -1097.0, "logps/rejected": -2100.0, "loss": 0.8651, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14361572265625, "rewards/margins": 1.1572265625, "rewards/rejected": -1.3017578125, "step": 246 }, { "epoch": 0.046672020407199206, "grad_norm": 2.8568646283064743, "learning_rate": 4.641509433962264e-07, "logits/chosen": 1.22216796875, "logits/rejected": 1.537109375, "logps/chosen": -727.5, "logps/rejected": -702.5, "loss": 0.9209, "rewards/accuracies": 0.6875, "rewards/chosen": -0.246826171875, "rewards/margins": 0.39990234375, "rewards/rejected": -0.64697265625, "step": 247 }, { "epoch": 0.04686097595540649, "grad_norm": 4.6087620865167, "learning_rate": 4.660377358490566e-07, "logits/chosen": 1.69921875, "logits/rejected": 2.5830078125, "logps/chosen": -1163.0, "logps/rejected": -2302.0, "loss": 0.8739, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24029541015625, "rewards/margins": 0.7890625, "rewards/rejected": -1.0283203125, "step": 248 }, { "epoch": 0.04704993150361377, "grad_norm": 2.916292929384207, "learning_rate": 4.6792452830188676e-07, "logits/chosen": 2.2421875, "logits/rejected": 2.5390625, "logps/chosen": -699.0, "logps/rejected": -941.0, "loss": 0.9014, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04156494140625, "rewards/margins": 0.51220703125, "rewards/rejected": -0.470703125, "step": 249 }, { "epoch": 0.04723888705182106, "grad_norm": 2.6133799493563026, "learning_rate": 4.6981132075471696e-07, "logits/chosen": 1.5234375, "logits/rejected": 1.833984375, "logps/chosen": -726.0, "logps/rejected": -639.0, "loss": 0.9296, "rewards/accuracies": 0.75, "rewards/chosen": -0.210662841796875, "rewards/margins": 0.52294921875, "rewards/rejected": -0.7314453125, "step": 250 }, { "epoch": 0.047427842600028344, "grad_norm": 4.021764851920127, "learning_rate": 4.7169811320754717e-07, "logits/chosen": 0.9404296875, "logits/rejected": 1.7464599609375, "logps/chosen": -938.0, "logps/rejected": -955.0, "loss": 0.8669, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0081787109375, "rewards/margins": 0.6767578125, "rewards/rejected": -0.6839599609375, "step": 251 }, { "epoch": 0.04761679814823563, "grad_norm": 2.591994846435553, "learning_rate": 4.7358490566037737e-07, "logits/chosen": 2.46484375, "logits/rejected": 3.375, "logps/chosen": -789.0, "logps/rejected": -1336.0, "loss": 0.9015, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22412109375, "rewards/margins": 0.57568359375, "rewards/rejected": -0.80078125, "step": 252 }, { "epoch": 0.04780575369644291, "grad_norm": 2.7669714549665776, "learning_rate": 4.754716981132075e-07, "logits/chosen": 0.76220703125, "logits/rejected": 1.12744140625, "logps/chosen": -1276.0, "logps/rejected": -1055.0, "loss": 0.8335, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1265869140625, "rewards/margins": 0.90185546875, "rewards/rejected": -1.0283203125, "step": 253 }, { "epoch": 0.0479947092446502, "grad_norm": 2.727319380795776, "learning_rate": 4.773584905660378e-07, "logits/chosen": 2.3427734375, "logits/rejected": 2.96484375, "logps/chosen": -946.0, "logps/rejected": -727.5, "loss": 0.869, "rewards/accuracies": 0.65625, "rewards/chosen": 0.100341796875, "rewards/margins": 0.6881103515625, "rewards/rejected": -0.586181640625, "step": 254 }, { "epoch": 0.04818366479285748, "grad_norm": 3.17550269751791, "learning_rate": 4.792452830188679e-07, "logits/chosen": 1.818359375, "logits/rejected": 2.310546875, "logps/chosen": -990.0, "logps/rejected": -1123.0, "loss": 0.8654, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05712890625, "rewards/margins": 0.89404296875, "rewards/rejected": -0.9501953125, "step": 255 }, { "epoch": 0.048372620341064765, "grad_norm": 2.127025615036947, "learning_rate": 4.811320754716981e-07, "logits/chosen": 1.232421875, "logits/rejected": 1.679931640625, "logps/chosen": -517.0, "logps/rejected": -625.5, "loss": 0.8992, "rewards/accuracies": 0.84375, "rewards/chosen": -0.100921630859375, "rewards/margins": 0.59326171875, "rewards/rejected": -0.693359375, "step": 256 }, { "epoch": 0.04856157588927205, "grad_norm": 2.800247037234231, "learning_rate": 4.830188679245283e-07, "logits/chosen": 2.345703125, "logits/rejected": 3.5546875, "logps/chosen": -672.0, "logps/rejected": -1080.0, "loss": 0.9196, "rewards/accuracies": 0.53125, "rewards/chosen": -0.048095703125, "rewards/margins": 0.4775390625, "rewards/rejected": -0.52294921875, "step": 257 }, { "epoch": 0.04875053143747933, "grad_norm": 2.854205662660026, "learning_rate": 4.849056603773585e-07, "logits/chosen": 2.376953125, "logits/rejected": 2.748046875, "logps/chosen": -775.0, "logps/rejected": -614.5, "loss": 0.903, "rewards/accuracies": 0.78125, "rewards/chosen": -0.03863525390625, "rewards/margins": 0.618408203125, "rewards/rejected": -0.6552734375, "step": 258 }, { "epoch": 0.04893948698568662, "grad_norm": 2.1751214811435626, "learning_rate": 4.867924528301886e-07, "logits/chosen": 1.9462890625, "logits/rejected": 2.3525390625, "logps/chosen": -527.5, "logps/rejected": -17879.5, "loss": 0.8634, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02777099609375, "rewards/margins": 13.494140625, "rewards/rejected": -13.5078125, "step": 259 }, { "epoch": 0.0491284425338939, "grad_norm": 2.4080180758100163, "learning_rate": 4.886792452830189e-07, "logits/chosen": 0.70068359375, "logits/rejected": 1.3798828125, "logps/chosen": -676.5, "logps/rejected": -642.0, "loss": 0.87, "rewards/accuracies": 0.75, "rewards/chosen": -0.076934814453125, "rewards/margins": 0.703125, "rewards/rejected": -0.78125, "step": 260 }, { "epoch": 0.049317398082101185, "grad_norm": 2.2869667848868875, "learning_rate": 4.90566037735849e-07, "logits/chosen": 1.28759765625, "logits/rejected": 1.875, "logps/chosen": -813.5, "logps/rejected": -1510.0, "loss": 0.8603, "rewards/accuracies": 0.78125, "rewards/chosen": -0.00140380859375, "rewards/margins": 1.1630859375, "rewards/rejected": -1.1630859375, "step": 261 }, { "epoch": 0.04950635363030847, "grad_norm": 2.5887374400336047, "learning_rate": 4.924528301886793e-07, "logits/chosen": 2.734375, "logits/rejected": 3.75, "logps/chosen": -663.5, "logps/rejected": -972.0, "loss": 0.8888, "rewards/accuracies": 0.78125, "rewards/chosen": -0.05442047119140625, "rewards/margins": 0.5439453125, "rewards/rejected": -0.59912109375, "step": 262 }, { "epoch": 0.04969530917851575, "grad_norm": 2.623108175308275, "learning_rate": 4.943396226415094e-07, "logits/chosen": 1.5634765625, "logits/rejected": 2.26953125, "logps/chosen": -1154.5, "logps/rejected": -1246.75, "loss": 0.8058, "rewards/accuracies": 0.875, "rewards/chosen": 0.1309814453125, "rewards/margins": 1.333984375, "rewards/rejected": -1.2021484375, "step": 263 }, { "epoch": 0.04988426472672304, "grad_norm": 2.3426297459274497, "learning_rate": 4.962264150943396e-07, "logits/chosen": 2.0009765625, "logits/rejected": 3.05859375, "logps/chosen": -1063.5, "logps/rejected": -1078.0, "loss": 0.8368, "rewards/accuracies": 0.71875, "rewards/chosen": 0.32489013671875, "rewards/margins": 1.2236328125, "rewards/rejected": -0.8974609375, "step": 264 }, { "epoch": 0.05007322027493032, "grad_norm": 2.339347369635007, "learning_rate": 4.981132075471698e-07, "logits/chosen": 1.83984375, "logits/rejected": 3.42578125, "logps/chosen": -808.5, "logps/rejected": -1445.0, "loss": 0.8477, "rewards/accuracies": 0.84375, "rewards/chosen": -0.040374755859375, "rewards/margins": 0.92578125, "rewards/rejected": -0.96484375, "step": 265 }, { "epoch": 0.050262175823137606, "grad_norm": 2.3871090621387063, "learning_rate": 5e-07, "logits/chosen": 1.80859375, "logits/rejected": 2.7421875, "logps/chosen": -1178.0, "logps/rejected": -1046.0, "loss": 0.8732, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0400390625, "rewards/margins": 0.8759765625, "rewards/rejected": -0.8349609375, "step": 266 }, { "epoch": 0.05045113137134489, "grad_norm": 2.2988334935440724, "learning_rate": 5.018867924528301e-07, "logits/chosen": 1.798828125, "logits/rejected": 2.671875, "logps/chosen": -553.5, "logps/rejected": -568.0, "loss": 0.8693, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0489501953125, "rewards/margins": 0.594970703125, "rewards/rejected": -0.544677734375, "step": 267 }, { "epoch": 0.05064008691955218, "grad_norm": 2.36458623714157, "learning_rate": 5.037735849056604e-07, "logits/chosen": 1.6875, "logits/rejected": 2.697265625, "logps/chosen": -705.5, "logps/rejected": -843.5, "loss": 0.8693, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06103515625, "rewards/margins": 0.689453125, "rewards/rejected": -0.751953125, "step": 268 }, { "epoch": 0.05082904246775946, "grad_norm": 3.394342722224443, "learning_rate": 5.056603773584905e-07, "logits/chosen": 0.9208984375, "logits/rejected": 1.8515625, "logps/chosen": -697.0, "logps/rejected": -791.0, "loss": 0.8668, "rewards/accuracies": 0.65625, "rewards/chosen": 0.02972412109375, "rewards/margins": 0.65478515625, "rewards/rejected": -0.62548828125, "step": 269 }, { "epoch": 0.051017998015966744, "grad_norm": 2.314124844337064, "learning_rate": 5.075471698113207e-07, "logits/chosen": 1.6826171875, "logits/rejected": 2.353515625, "logps/chosen": -864.5, "logps/rejected": -852.0, "loss": 0.884, "rewards/accuracies": 0.71875, "rewards/chosen": -0.175048828125, "rewards/margins": 0.64697265625, "rewards/rejected": -0.81884765625, "step": 270 }, { "epoch": 0.051206953564174026, "grad_norm": 2.4941625667940324, "learning_rate": 5.094339622641509e-07, "logits/chosen": 1.60205078125, "logits/rejected": 2.541015625, "logps/chosen": -788.0, "logps/rejected": -968.0, "loss": 0.8436, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12640380859375, "rewards/margins": 0.87353515625, "rewards/rejected": -0.74755859375, "step": 271 }, { "epoch": 0.05139590911238131, "grad_norm": 2.337721996834013, "learning_rate": 5.113207547169811e-07, "logits/chosen": 0.912109375, "logits/rejected": 1.611328125, "logps/chosen": -722.0, "logps/rejected": -567.0, "loss": 0.8627, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0771484375, "rewards/margins": 0.682373046875, "rewards/rejected": -0.60400390625, "step": 272 }, { "epoch": 0.0515848646605886, "grad_norm": 2.7298871017288007, "learning_rate": 5.132075471698113e-07, "logits/chosen": 1.568359375, "logits/rejected": 1.919921875, "logps/chosen": -1055.0, "logps/rejected": -917.0, "loss": 0.8468, "rewards/accuracies": 0.75, "rewards/chosen": 0.1083984375, "rewards/margins": 0.92529296875, "rewards/rejected": -0.81640625, "step": 273 }, { "epoch": 0.05177382020879588, "grad_norm": 2.523436832538183, "learning_rate": 5.150943396226415e-07, "logits/chosen": 0.73828125, "logits/rejected": 1.19140625, "logps/chosen": -1011.0, "logps/rejected": -1092.0, "loss": 0.8085, "rewards/accuracies": 0.84375, "rewards/chosen": 0.185302734375, "rewards/margins": 1.169921875, "rewards/rejected": -0.986328125, "step": 274 }, { "epoch": 0.051962775757003164, "grad_norm": 2.7737330174987407, "learning_rate": 5.169811320754716e-07, "logits/chosen": 1.939453125, "logits/rejected": 1.8203125, "logps/chosen": -829.5, "logps/rejected": -14986.0, "loss": 0.8101, "rewards/accuracies": 0.75, "rewards/chosen": 0.276611328125, "rewards/margins": 13.912109375, "rewards/rejected": -13.6513671875, "step": 275 }, { "epoch": 0.05215173130521045, "grad_norm": 2.42736150970675, "learning_rate": 5.188679245283019e-07, "logits/chosen": 2.01171875, "logits/rejected": 3.046875, "logps/chosen": -792.0, "logps/rejected": -571.5, "loss": 0.9283, "rewards/accuracies": 0.53125, "rewards/chosen": 0.248199462890625, "rewards/margins": 0.486572265625, "rewards/rejected": -0.2393798828125, "step": 276 }, { "epoch": 0.05234068685341774, "grad_norm": 2.404072191031683, "learning_rate": 5.20754716981132e-07, "logits/chosen": 1.287109375, "logits/rejected": 1.94140625, "logps/chosen": -2139.5, "logps/rejected": -781.0, "loss": 0.8624, "rewards/accuracies": 0.75, "rewards/chosen": -0.01953125, "rewards/margins": 0.671875, "rewards/rejected": -0.6904296875, "step": 277 }, { "epoch": 0.05252964240162502, "grad_norm": 2.95391352182971, "learning_rate": 5.226415094339622e-07, "logits/chosen": 1.314453125, "logits/rejected": 1.6875, "logps/chosen": -999.0, "logps/rejected": -731.0, "loss": 0.868, "rewards/accuracies": 0.75, "rewards/chosen": -0.0899658203125, "rewards/margins": 0.6982421875, "rewards/rejected": -0.7880859375, "step": 278 }, { "epoch": 0.0527185979498323, "grad_norm": 2.550164506471429, "learning_rate": 5.245283018867924e-07, "logits/chosen": 1.3232421875, "logits/rejected": 2.47265625, "logps/chosen": -597.0, "logps/rejected": -955.5, "loss": 0.8577, "rewards/accuracies": 0.75, "rewards/chosen": 0.0162353515625, "rewards/margins": 0.7606048583984375, "rewards/rejected": -0.7430419921875, "step": 279 }, { "epoch": 0.052907553498039585, "grad_norm": 2.5307444486450295, "learning_rate": 5.264150943396226e-07, "logits/chosen": 2.333984375, "logits/rejected": 2.828125, "logps/chosen": -862.0, "logps/rejected": -1464.0, "loss": 0.842, "rewards/accuracies": 0.71875, "rewards/chosen": 0.345367431640625, "rewards/margins": 1.517578125, "rewards/rejected": -1.1708984375, "step": 280 }, { "epoch": 0.05309650904624687, "grad_norm": 1.9655023476428881, "learning_rate": 5.283018867924528e-07, "logits/chosen": 1.79296875, "logits/rejected": 2.814453125, "logps/chosen": -580.75, "logps/rejected": -708.5, "loss": 0.8824, "rewards/accuracies": 0.78125, "rewards/chosen": 0.112396240234375, "rewards/margins": 0.72705078125, "rewards/rejected": -0.614501953125, "step": 281 }, { "epoch": 0.05328546459445416, "grad_norm": 2.672435575039001, "learning_rate": 5.30188679245283e-07, "logits/chosen": 1.2294921875, "logits/rejected": 1.919921875, "logps/chosen": -1036.5, "logps/rejected": -1279.0, "loss": 0.8867, "rewards/accuracies": 0.8125, "rewards/chosen": -0.177886962890625, "rewards/margins": 0.89013671875, "rewards/rejected": -1.0673828125, "step": 282 }, { "epoch": 0.05347442014266144, "grad_norm": 2.4218791583726462, "learning_rate": 5.320754716981131e-07, "logits/chosen": 1.640625, "logits/rejected": 2.541015625, "logps/chosen": -1028.5, "logps/rejected": -799.0, "loss": 0.8651, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0130615234375, "rewards/margins": 0.7294921875, "rewards/rejected": -0.7421875, "step": 283 }, { "epoch": 0.05366337569086872, "grad_norm": 4.019682338964653, "learning_rate": 5.339622641509434e-07, "logits/chosen": 2.068359375, "logits/rejected": 2.880859375, "logps/chosen": -762.0, "logps/rejected": -887.5, "loss": 0.9019, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0174560546875, "rewards/margins": 0.47998046875, "rewards/rejected": -0.462890625, "step": 284 }, { "epoch": 0.053852331239076005, "grad_norm": 2.42650055414299, "learning_rate": 5.358490566037735e-07, "logits/chosen": 2.2890625, "logits/rejected": 2.7109375, "logps/chosen": -1091.0, "logps/rejected": -1374.0, "loss": 0.8723, "rewards/accuracies": 0.75, "rewards/chosen": 0.1119384765625, "rewards/margins": 0.962890625, "rewards/rejected": -0.85107421875, "step": 285 }, { "epoch": 0.054041286787283295, "grad_norm": 2.3489267468203923, "learning_rate": 5.377358490566037e-07, "logits/chosen": 2.43359375, "logits/rejected": 3.35546875, "logps/chosen": -769.0, "logps/rejected": -1133.0, "loss": 0.9003, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05010986328125, "rewards/margins": 0.5732421875, "rewards/rejected": -0.623291015625, "step": 286 }, { "epoch": 0.05423024233549058, "grad_norm": 2.730617434799706, "learning_rate": 5.396226415094339e-07, "logits/chosen": 2.0146484375, "logits/rejected": 2.8115234375, "logps/chosen": -782.5, "logps/rejected": -602.0, "loss": 0.8614, "rewards/accuracies": 0.75, "rewards/chosen": -0.026611328125, "rewards/margins": 0.7138671875, "rewards/rejected": -0.7412109375, "step": 287 }, { "epoch": 0.05441919788369786, "grad_norm": 2.4011515606036733, "learning_rate": 5.415094339622641e-07, "logits/chosen": 0.412200927734375, "logits/rejected": 0.5078125, "logps/chosen": -820.0, "logps/rejected": -580.0, "loss": 0.8859, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09112548828125, "rewards/margins": 0.71875, "rewards/rejected": -0.626953125, "step": 288 }, { "epoch": 0.05460815343190514, "grad_norm": 2.0874649710142386, "learning_rate": 5.433962264150943e-07, "logits/chosen": 2.291015625, "logits/rejected": 2.513671875, "logps/chosen": -468.0, "logps/rejected": -499.0, "loss": 0.9172, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12103271484375, "rewards/margins": 0.419952392578125, "rewards/rejected": -0.299560546875, "step": 289 }, { "epoch": 0.054797108980112426, "grad_norm": 2.867076624781252, "learning_rate": 5.452830188679245e-07, "logits/chosen": 0.94580078125, "logits/rejected": 1.6328125, "logps/chosen": -728.0, "logps/rejected": -1054.0, "loss": 0.8496, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0128173828125, "rewards/margins": 0.96875, "rewards/rejected": -0.953125, "step": 290 }, { "epoch": 0.054986064528319716, "grad_norm": 2.3081917026342906, "learning_rate": 5.471698113207546e-07, "logits/chosen": 1.669921875, "logits/rejected": 2.48046875, "logps/chosen": -827.0, "logps/rejected": -2275.0, "loss": 0.8191, "rewards/accuracies": 0.84375, "rewards/chosen": 0.149627685546875, "rewards/margins": 1.3173828125, "rewards/rejected": -1.1669921875, "step": 291 }, { "epoch": 0.055175020076527, "grad_norm": 2.6354769541039764, "learning_rate": 5.490566037735849e-07, "logits/chosen": 1.04296875, "logits/rejected": 1.8017578125, "logps/chosen": -759.25, "logps/rejected": -655.5, "loss": 0.856, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1617431640625, "rewards/margins": 0.81500244140625, "rewards/rejected": -0.6533203125, "step": 292 }, { "epoch": 0.05536397562473428, "grad_norm": 2.1312530650069585, "learning_rate": 5.50943396226415e-07, "logits/chosen": 1.927734375, "logits/rejected": 2.2783203125, "logps/chosen": -503.5, "logps/rejected": -768.5, "loss": 0.8533, "rewards/accuracies": 0.78125, "rewards/chosen": 0.176055908203125, "rewards/margins": 1.060546875, "rewards/rejected": -0.8828125, "step": 293 }, { "epoch": 0.055552931172941564, "grad_norm": 2.567177072328469, "learning_rate": 5.528301886792452e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.875, "logps/chosen": -1436.0, "logps/rejected": -1917.0, "loss": 0.7539, "rewards/accuracies": 0.75, "rewards/chosen": 0.3956298828125, "rewards/margins": 2.1298828125, "rewards/rejected": -1.7353515625, "step": 294 }, { "epoch": 0.055741886721148846, "grad_norm": 2.0366545463503454, "learning_rate": 5.547169811320755e-07, "logits/chosen": 1.5439453125, "logits/rejected": 1.849609375, "logps/chosen": -531.0, "logps/rejected": -635.0, "loss": 0.8458, "rewards/accuracies": 0.75, "rewards/chosen": 0.1815185546875, "rewards/margins": 0.783203125, "rewards/rejected": -0.599609375, "step": 295 }, { "epoch": 0.055930842269356136, "grad_norm": 2.5200763561075816, "learning_rate": 5.566037735849056e-07, "logits/chosen": 1.4853515625, "logits/rejected": 1.4013671875, "logps/chosen": -663.0, "logps/rejected": -868.0, "loss": 0.8646, "rewards/accuracies": 0.78125, "rewards/chosen": 0.195281982421875, "rewards/margins": 0.69921875, "rewards/rejected": -0.50537109375, "step": 296 }, { "epoch": 0.05611979781756342, "grad_norm": 2.028527166746289, "learning_rate": 5.584905660377359e-07, "logits/chosen": 0.92041015625, "logits/rejected": 1.78125, "logps/chosen": -792.0, "logps/rejected": -769.5, "loss": 0.8373, "rewards/accuracies": 0.71875, "rewards/chosen": 0.24755859375, "rewards/margins": 0.9169921875, "rewards/rejected": -0.671875, "step": 297 }, { "epoch": 0.0563087533657707, "grad_norm": 2.420541002745267, "learning_rate": 5.60377358490566e-07, "logits/chosen": 1.4384765625, "logits/rejected": 2.4208984375, "logps/chosen": -576.0, "logps/rejected": -770.0, "loss": 0.8815, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0538330078125, "rewards/margins": 0.75732421875, "rewards/rejected": -0.7041015625, "step": 298 }, { "epoch": 0.056497708913977984, "grad_norm": 2.7607582942762514, "learning_rate": 5.622641509433962e-07, "logits/chosen": 0.45977783203125, "logits/rejected": 0.4542236328125, "logps/chosen": -732.0, "logps/rejected": -505.0, "loss": 0.8979, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0093994140625, "rewards/margins": 0.6019287109375, "rewards/rejected": -0.611083984375, "step": 299 }, { "epoch": 0.056686664462185274, "grad_norm": 2.8522298536826414, "learning_rate": 5.641509433962264e-07, "logits/chosen": 2.494140625, "logits/rejected": 2.611328125, "logps/chosen": -945.0, "logps/rejected": -849.0, "loss": 0.8431, "rewards/accuracies": 0.90625, "rewards/chosen": 0.228515625, "rewards/margins": 0.91748046875, "rewards/rejected": -0.6884765625, "step": 300 }, { "epoch": 0.05687562001039256, "grad_norm": 2.269770813735564, "learning_rate": 5.660377358490566e-07, "logits/chosen": 0.74609375, "logits/rejected": 1.4437255859375, "logps/chosen": -930.5, "logps/rejected": -782.5, "loss": 0.8438, "rewards/accuracies": 0.84375, "rewards/chosen": 0.19158935546875, "rewards/margins": 1.02392578125, "rewards/rejected": -0.83251953125, "step": 301 }, { "epoch": 0.05706457555859984, "grad_norm": 2.2615854732087093, "learning_rate": 5.679245283018867e-07, "logits/chosen": 1.698486328125, "logits/rejected": 2.23291015625, "logps/chosen": -647.0, "logps/rejected": -15524.0, "loss": 0.8069, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3564453125, "rewards/margins": 16.95703125, "rewards/rejected": -16.66748046875, "step": 302 }, { "epoch": 0.05725353110680712, "grad_norm": 2.6447029199105696, "learning_rate": 5.69811320754717e-07, "logits/chosen": 1.5185546875, "logits/rejected": 1.931640625, "logps/chosen": -1170.0, "logps/rejected": -1869.0, "loss": 0.7741, "rewards/accuracies": 0.875, "rewards/chosen": 0.341796875, "rewards/margins": 1.62109375, "rewards/rejected": -1.2802734375, "step": 303 }, { "epoch": 0.057442486655014405, "grad_norm": 2.299071829462304, "learning_rate": 5.716981132075471e-07, "logits/chosen": 0.849609375, "logits/rejected": 1.2001953125, "logps/chosen": -708.0, "logps/rejected": -502.5, "loss": 0.8898, "rewards/accuracies": 0.875, "rewards/chosen": -0.04833984375, "rewards/margins": 0.55712890625, "rewards/rejected": -0.6068115234375, "step": 304 }, { "epoch": 0.057631442203221694, "grad_norm": 3.4458872244628176, "learning_rate": 5.735849056603774e-07, "logits/chosen": 1.199951171875, "logits/rejected": 1.27783203125, "logps/chosen": -957.0, "logps/rejected": -788.5, "loss": 0.8708, "rewards/accuracies": 0.75, "rewards/chosen": 0.075439453125, "rewards/margins": 0.85498046875, "rewards/rejected": -0.779296875, "step": 305 }, { "epoch": 0.05782039775142898, "grad_norm": 2.8660234841977252, "learning_rate": 5.754716981132075e-07, "logits/chosen": 1.38916015625, "logits/rejected": 2.0234375, "logps/chosen": -809.5, "logps/rejected": -768.0, "loss": 0.9052, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02880859375, "rewards/margins": 0.45947265625, "rewards/rejected": -0.4296875, "step": 306 }, { "epoch": 0.05800935329963626, "grad_norm": 2.4988216813660156, "learning_rate": 5.773584905660377e-07, "logits/chosen": 1.931640625, "logits/rejected": 2.427734375, "logps/chosen": -711.0, "logps/rejected": -891.0, "loss": 0.9027, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0975341796875, "rewards/margins": 0.5611572265625, "rewards/rejected": -0.6575927734375, "step": 307 }, { "epoch": 0.05819830884784354, "grad_norm": 2.4372043555121867, "learning_rate": 5.792452830188679e-07, "logits/chosen": 0.58984375, "logits/rejected": 1.12109375, "logps/chosen": -1027.0, "logps/rejected": -677.0, "loss": 0.8444, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3338623046875, "rewards/margins": 0.91748046875, "rewards/rejected": -0.584716796875, "step": 308 }, { "epoch": 0.05838726439605083, "grad_norm": 2.1671180782700796, "learning_rate": 5.811320754716981e-07, "logits/chosen": 0.789306640625, "logits/rejected": 1.07769775390625, "logps/chosen": -747.0, "logps/rejected": -906.75, "loss": 0.8784, "rewards/accuracies": 0.75, "rewards/chosen": 0.093017578125, "rewards/margins": 0.889404296875, "rewards/rejected": -0.797119140625, "step": 309 }, { "epoch": 0.058576219944258115, "grad_norm": 2.5766627567148035, "learning_rate": 5.830188679245282e-07, "logits/chosen": 1.796875, "logits/rejected": 2.421875, "logps/chosen": -1075.0, "logps/rejected": -1083.0, "loss": 0.7765, "rewards/accuracies": 0.78125, "rewards/chosen": 0.313720703125, "rewards/margins": 1.5126953125, "rewards/rejected": -1.1982421875, "step": 310 }, { "epoch": 0.0587651754924654, "grad_norm": 2.244642443776483, "learning_rate": 5.849056603773585e-07, "logits/chosen": 1.079345703125, "logits/rejected": 1.29541015625, "logps/chosen": -877.0, "logps/rejected": -770.0, "loss": 0.8289, "rewards/accuracies": 0.71875, "rewards/chosen": 0.306640625, "rewards/margins": 0.9365234375, "rewards/rejected": -0.629150390625, "step": 311 }, { "epoch": 0.05895413104067268, "grad_norm": 2.7262652407433694, "learning_rate": 5.867924528301886e-07, "logits/chosen": 1.38330078125, "logits/rejected": 1.9169921875, "logps/chosen": -967.0, "logps/rejected": -831.5, "loss": 0.8513, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0476837158203125, "rewards/margins": 0.977783203125, "rewards/rejected": -0.92822265625, "step": 312 }, { "epoch": 0.05914308658887996, "grad_norm": 4.581906002711697, "learning_rate": 5.886792452830189e-07, "logits/chosen": 1.633544921875, "logits/rejected": 2.3951416015625, "logps/chosen": -794.0, "logps/rejected": -906.0, "loss": 0.855, "rewards/accuracies": 0.75, "rewards/chosen": 0.2352294921875, "rewards/margins": 0.826171875, "rewards/rejected": -0.59130859375, "step": 313 }, { "epoch": 0.05933204213708725, "grad_norm": 4.549909552867715, "learning_rate": 5.90566037735849e-07, "logits/chosen": 2.078125, "logits/rejected": 2.408203125, "logps/chosen": -546.0, "logps/rejected": -759.5, "loss": 0.8689, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07958984375, "rewards/margins": 0.6943359375, "rewards/rejected": -0.61376953125, "step": 314 }, { "epoch": 0.059520997685294535, "grad_norm": 2.151032589100128, "learning_rate": 5.924528301886792e-07, "logits/chosen": 1.3505859375, "logits/rejected": 1.533203125, "logps/chosen": -928.5, "logps/rejected": -777.0, "loss": 0.8389, "rewards/accuracies": 0.71875, "rewards/chosen": 0.07269287109375, "rewards/margins": 0.97607421875, "rewards/rejected": -0.904296875, "step": 315 }, { "epoch": 0.05970995323350182, "grad_norm": 2.18403829515087, "learning_rate": 5.943396226415094e-07, "logits/chosen": 1.56103515625, "logits/rejected": 1.6474609375, "logps/chosen": -1006.0, "logps/rejected": -982.0, "loss": 0.8204, "rewards/accuracies": 0.625, "rewards/chosen": 0.323486328125, "rewards/margins": 1.21923828125, "rewards/rejected": -0.8935546875, "step": 316 }, { "epoch": 0.0598989087817091, "grad_norm": 2.269719046128395, "learning_rate": 5.962264150943396e-07, "logits/chosen": 0.864990234375, "logits/rejected": 1.4609375, "logps/chosen": -744.0, "logps/rejected": -722.0, "loss": 0.8687, "rewards/accuracies": 0.75, "rewards/chosen": 0.214080810546875, "rewards/margins": 0.6767578125, "rewards/rejected": -0.46240234375, "step": 317 }, { "epoch": 0.06008786432991639, "grad_norm": 2.310451112276074, "learning_rate": 5.981132075471697e-07, "logits/chosen": 0.4757080078125, "logits/rejected": 0.955078125, "logps/chosen": -783.0, "logps/rejected": -561.5, "loss": 0.8732, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0281982421875, "rewards/margins": 0.699737548828125, "rewards/rejected": -0.72705078125, "step": 318 }, { "epoch": 0.06027681987812367, "grad_norm": 2.5689442298665326, "learning_rate": 6e-07, "logits/chosen": 1.1748046875, "logits/rejected": 2.24609375, "logps/chosen": -691.0, "logps/rejected": -627.5, "loss": 0.8956, "rewards/accuracies": 0.625, "rewards/chosen": 0.015625, "rewards/margins": 0.6123046875, "rewards/rejected": -0.596435546875, "step": 319 }, { "epoch": 0.060465775426330956, "grad_norm": 2.426813855388781, "learning_rate": 6.018867924528301e-07, "logits/chosen": 0.82861328125, "logits/rejected": 1.888671875, "logps/chosen": -1210.0, "logps/rejected": -1070.0, "loss": 0.847, "rewards/accuracies": 0.78125, "rewards/chosen": -0.249267578125, "rewards/margins": 1.138671875, "rewards/rejected": -1.3876953125, "step": 320 }, { "epoch": 0.06065473097453824, "grad_norm": 2.499383118490723, "learning_rate": 6.037735849056604e-07, "logits/chosen": 0.38232421875, "logits/rejected": 1.61181640625, "logps/chosen": -712.0, "logps/rejected": -1373.0, "loss": 0.8074, "rewards/accuracies": 0.75, "rewards/chosen": 0.3193359375, "rewards/margins": 1.37744140625, "rewards/rejected": -1.058837890625, "step": 321 }, { "epoch": 0.06084368652274552, "grad_norm": 2.472064509999575, "learning_rate": 6.056603773584905e-07, "logits/chosen": 0.625457763671875, "logits/rejected": 1.1046142578125, "logps/chosen": -890.0, "logps/rejected": -888.0, "loss": 0.8915, "rewards/accuracies": 0.78125, "rewards/chosen": -0.145721435546875, "rewards/margins": 0.771484375, "rewards/rejected": -0.9169921875, "step": 322 }, { "epoch": 0.06103264207095281, "grad_norm": 2.9412771707700496, "learning_rate": 6.075471698113207e-07, "logits/chosen": 0.818359375, "logits/rejected": 1.56085205078125, "logps/chosen": -899.0, "logps/rejected": -950.5, "loss": 0.8312, "rewards/accuracies": 0.84375, "rewards/chosen": -0.01251220703125, "rewards/margins": 1.23828125, "rewards/rejected": -1.2509765625, "step": 323 }, { "epoch": 0.061221597619160094, "grad_norm": 3.247880340348031, "learning_rate": 6.094339622641509e-07, "logits/chosen": 1.2412109375, "logits/rejected": 2.07421875, "logps/chosen": -938.0, "logps/rejected": -1152.0, "loss": 0.825, "rewards/accuracies": 0.84375, "rewards/chosen": 0.29443359375, "rewards/margins": 1.2265625, "rewards/rejected": -0.9326171875, "step": 324 }, { "epoch": 0.06141055316736738, "grad_norm": 2.125117397273372, "learning_rate": 6.113207547169811e-07, "logits/chosen": 3.03125, "logits/rejected": 3.39453125, "logps/chosen": -693.5, "logps/rejected": -1157.0, "loss": 0.8443, "rewards/accuracies": 0.65625, "rewards/chosen": 0.25335693359375, "rewards/margins": 0.98779296875, "rewards/rejected": -0.73583984375, "step": 325 }, { "epoch": 0.06159950871557466, "grad_norm": 2.681452711755122, "learning_rate": 6.132075471698112e-07, "logits/chosen": 0.826171875, "logits/rejected": 1.01025390625, "logps/chosen": -1186.0, "logps/rejected": -970.5, "loss": 0.8182, "rewards/accuracies": 0.71875, "rewards/chosen": 0.373779296875, "rewards/margins": 1.349609375, "rewards/rejected": -0.9755859375, "step": 326 }, { "epoch": 0.06178846426378194, "grad_norm": 2.8745540181094418, "learning_rate": 6.150943396226415e-07, "logits/chosen": 1.294921875, "logits/rejected": 2.34765625, "logps/chosen": -849.0, "logps/rejected": -909.0, "loss": 0.8251, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23828125, "rewards/margins": 0.93359375, "rewards/rejected": -0.6923828125, "step": 327 }, { "epoch": 0.06197741981198923, "grad_norm": 2.134347161412294, "learning_rate": 6.169811320754716e-07, "logits/chosen": 1.4306640625, "logits/rejected": 1.2861328125, "logps/chosen": -593.0, "logps/rejected": -527.75, "loss": 0.8902, "rewards/accuracies": 0.71875, "rewards/chosen": 0.051025390625, "rewards/margins": 0.627685546875, "rewards/rejected": -0.575836181640625, "step": 328 }, { "epoch": 0.062166375360196514, "grad_norm": 2.6685604818090103, "learning_rate": 6.188679245283019e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.55078125, "logps/chosen": -1081.0, "logps/rejected": -1244.0, "loss": 0.7687, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5498046875, "rewards/margins": 1.52197265625, "rewards/rejected": -0.9716796875, "step": 329 }, { "epoch": 0.0623553309084038, "grad_norm": 2.4581801779781975, "learning_rate": 6.20754716981132e-07, "logits/chosen": 0.6748046875, "logits/rejected": 1.6246337890625, "logps/chosen": -675.0, "logps/rejected": -1130.0, "loss": 0.8502, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0360107421875, "rewards/margins": 1.109375, "rewards/rejected": -1.142578125, "step": 330 }, { "epoch": 0.06254428645661109, "grad_norm": 2.4166684325046, "learning_rate": 6.226415094339622e-07, "logits/chosen": 0.00189208984375, "logits/rejected": 0.708984375, "logps/chosen": -643.75, "logps/rejected": -626.5, "loss": 0.875, "rewards/accuracies": 0.84375, "rewards/chosen": -0.09637451171875, "rewards/margins": 0.8680419921875, "rewards/rejected": -0.962158203125, "step": 331 }, { "epoch": 0.06273324200481836, "grad_norm": 2.6355564210206786, "learning_rate": 6.245283018867924e-07, "logits/chosen": 1.0169677734375, "logits/rejected": 1.58837890625, "logps/chosen": -825.5, "logps/rejected": -1079.5, "loss": 0.8312, "rewards/accuracies": 0.84375, "rewards/chosen": -0.102325439453125, "rewards/margins": 0.99017333984375, "rewards/rejected": -1.09375, "step": 332 }, { "epoch": 0.06292219755302565, "grad_norm": 2.5350045664631855, "learning_rate": 6.264150943396226e-07, "logits/chosen": 0.223876953125, "logits/rejected": 1.21044921875, "logps/chosen": -1263.5, "logps/rejected": -1757.0, "loss": 0.7539, "rewards/accuracies": 0.875, "rewards/chosen": 0.4105224609375, "rewards/margins": 2.353515625, "rewards/rejected": -1.943359375, "step": 333 }, { "epoch": 0.06311115310123294, "grad_norm": 2.1751325866360367, "learning_rate": 6.283018867924527e-07, "logits/chosen": 0.782470703125, "logits/rejected": 1.36328125, "logps/chosen": -634.0, "logps/rejected": -678.75, "loss": 0.8574, "rewards/accuracies": 0.78125, "rewards/chosen": 0.072998046875, "rewards/margins": 1.0166015625, "rewards/rejected": -0.944091796875, "step": 334 }, { "epoch": 0.06330010864944022, "grad_norm": 3.0947113025868953, "learning_rate": 6.30188679245283e-07, "logits/chosen": 1.120361328125, "logits/rejected": 1.916015625, "logps/chosen": -1010.5, "logps/rejected": -1712.0, "loss": 0.8453, "rewards/accuracies": 0.625, "rewards/chosen": -0.1256103515625, "rewards/margins": 1.31201171875, "rewards/rejected": -1.43701171875, "step": 335 }, { "epoch": 0.06348906419764751, "grad_norm": 2.4897157526821374, "learning_rate": 6.320754716981131e-07, "logits/chosen": 2.154296875, "logits/rejected": 2.373046875, "logps/chosen": -1206.0, "logps/rejected": -1206.0, "loss": 0.81, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39776611328125, "rewards/margins": 1.544921875, "rewards/rejected": -1.146728515625, "step": 336 }, { "epoch": 0.06367801974585478, "grad_norm": 2.3218710702299803, "learning_rate": 6.339622641509434e-07, "logits/chosen": 1.096923828125, "logits/rejected": 2.0859375, "logps/chosen": -640.0, "logps/rejected": -732.5, "loss": 0.8399, "rewards/accuracies": 0.75, "rewards/chosen": 0.20074462890625, "rewards/margins": 0.9306640625, "rewards/rejected": -0.73046875, "step": 337 }, { "epoch": 0.06386697529406207, "grad_norm": 2.5463974161153917, "learning_rate": 6.358490566037735e-07, "logits/chosen": 1.44580078125, "logits/rejected": 2.087890625, "logps/chosen": -1293.0, "logps/rejected": -899.0, "loss": 0.8275, "rewards/accuracies": 0.625, "rewards/chosen": 0.300048828125, "rewards/margins": 1.46875, "rewards/rejected": -1.169921875, "step": 338 }, { "epoch": 0.06405593084226936, "grad_norm": 2.1536035567993794, "learning_rate": 6.377358490566037e-07, "logits/chosen": 1.763671875, "logits/rejected": 2.275390625, "logps/chosen": -579.0, "logps/rejected": -705.0, "loss": 0.881, "rewards/accuracies": 0.6875, "rewards/chosen": 0.287109375, "rewards/margins": 0.614013671875, "rewards/rejected": -0.327484130859375, "step": 339 }, { "epoch": 0.06424488639047664, "grad_norm": 2.1619465191958525, "learning_rate": 6.396226415094339e-07, "logits/chosen": 0.88671875, "logits/rejected": 1.33984375, "logps/chosen": -921.5, "logps/rejected": -731.0, "loss": 0.8101, "rewards/accuracies": 0.75, "rewards/chosen": 0.427001953125, "rewards/margins": 1.1419677734375, "rewards/rejected": -0.7159423828125, "step": 340 }, { "epoch": 0.06443384193868393, "grad_norm": 2.3583347279241913, "learning_rate": 6.415094339622641e-07, "logits/chosen": 1.0517578125, "logits/rejected": 1.34423828125, "logps/chosen": -1248.0, "logps/rejected": -902.0, "loss": 0.824, "rewards/accuracies": 0.84375, "rewards/chosen": 0.27679443359375, "rewards/margins": 1.294921875, "rewards/rejected": -1.017578125, "step": 341 }, { "epoch": 0.0646227974868912, "grad_norm": 2.7085461752576947, "learning_rate": 6.433962264150942e-07, "logits/chosen": 0.7470703125, "logits/rejected": 1.1064453125, "logps/chosen": -707.5, "logps/rejected": -1151.0, "loss": 0.8235, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01746368408203125, "rewards/margins": 1.095703125, "rewards/rejected": -1.1142578125, "step": 342 }, { "epoch": 0.0648117530350985, "grad_norm": 2.3508634802297825, "learning_rate": 6.452830188679245e-07, "logits/chosen": 1.5771484375, "logits/rejected": 2.921875, "logps/chosen": -1082.5, "logps/rejected": -1342.0, "loss": 0.7895, "rewards/accuracies": 0.84375, "rewards/chosen": 0.134033203125, "rewards/margins": 1.537109375, "rewards/rejected": -1.40234375, "step": 343 }, { "epoch": 0.06500070858330578, "grad_norm": 2.4553854498362946, "learning_rate": 6.471698113207546e-07, "logits/chosen": 0.4412841796875, "logits/rejected": 0.6856689453125, "logps/chosen": -721.0, "logps/rejected": -1100.0, "loss": 0.8293, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10662841796875, "rewards/margins": 1.23046875, "rewards/rejected": -1.123046875, "step": 344 }, { "epoch": 0.06518966413151306, "grad_norm": 2.2786035826233784, "learning_rate": 6.490566037735848e-07, "logits/chosen": 1.619140625, "logits/rejected": 1.767578125, "logps/chosen": -817.0, "logps/rejected": -707.0, "loss": 0.8744, "rewards/accuracies": 0.75, "rewards/chosen": -0.069091796875, "rewards/margins": 0.779296875, "rewards/rejected": -0.84619140625, "step": 345 }, { "epoch": 0.06537861967972035, "grad_norm": 2.622923536525408, "learning_rate": 6.50943396226415e-07, "logits/chosen": 1.1123046875, "logits/rejected": 1.771484375, "logps/chosen": -875.5, "logps/rejected": -1113.0, "loss": 0.7777, "rewards/accuracies": 0.71875, "rewards/chosen": 0.32647705078125, "rewards/margins": 1.3828125, "rewards/rejected": -1.0546875, "step": 346 }, { "epoch": 0.06556757522792762, "grad_norm": 2.487519253138874, "learning_rate": 6.528301886792452e-07, "logits/chosen": 1.7548828125, "logits/rejected": 2.31640625, "logps/chosen": -1037.0, "logps/rejected": -809.0, "loss": 0.8722, "rewards/accuracies": 0.75, "rewards/chosen": -0.052001953125, "rewards/margins": 0.9765625, "rewards/rejected": -1.02734375, "step": 347 }, { "epoch": 0.06575653077613491, "grad_norm": 2.2587357935623946, "learning_rate": 6.547169811320754e-07, "logits/chosen": 0.934326171875, "logits/rejected": 1.3525390625, "logps/chosen": -521.5, "logps/rejected": -627.0, "loss": 0.8755, "rewards/accuracies": 0.78125, "rewards/chosen": -0.026611328125, "rewards/margins": 0.657470703125, "rewards/rejected": -0.68408203125, "step": 348 }, { "epoch": 0.0659454863243422, "grad_norm": 2.0588916796433905, "learning_rate": 6.566037735849056e-07, "logits/chosen": 0.94384765625, "logits/rejected": 1.3642578125, "logps/chosen": -468.5, "logps/rejected": -537.5, "loss": 0.882, "rewards/accuracies": 0.65625, "rewards/chosen": -0.018310546875, "rewards/margins": 0.6328125, "rewards/rejected": -0.65185546875, "step": 349 }, { "epoch": 0.06613444187254948, "grad_norm": 1.8341584223294023, "learning_rate": 6.584905660377357e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.8505859375, "logps/chosen": -530.5, "logps/rejected": -465.0, "loss": 0.8657, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2350921630859375, "rewards/margins": 0.7413330078125, "rewards/rejected": -0.50537109375, "step": 350 }, { "epoch": 0.06632339742075677, "grad_norm": 2.521889788229066, "learning_rate": 6.60377358490566e-07, "logits/chosen": 1.53125, "logits/rejected": 1.56298828125, "logps/chosen": -696.0, "logps/rejected": -536.5, "loss": 0.8717, "rewards/accuracies": 0.78125, "rewards/chosen": 0.062255859375, "rewards/margins": 0.58203125, "rewards/rejected": -0.51904296875, "step": 351 }, { "epoch": 0.06651235296896404, "grad_norm": 2.075238857936553, "learning_rate": 6.622641509433961e-07, "logits/chosen": 0.3994140625, "logits/rejected": 0.421875, "logps/chosen": -734.5, "logps/rejected": -680.0, "loss": 0.8514, "rewards/accuracies": 0.875, "rewards/chosen": -0.138671875, "rewards/margins": 0.8701171875, "rewards/rejected": -1.01171875, "step": 352 }, { "epoch": 0.06670130851717133, "grad_norm": 2.516558908042944, "learning_rate": 6.641509433962263e-07, "logits/chosen": 0.04052734375, "logits/rejected": 0.8837890625, "logps/chosen": -729.0, "logps/rejected": -779.0, "loss": 0.8628, "rewards/accuracies": 0.78125, "rewards/chosen": 0.036376953125, "rewards/margins": 0.8310546875, "rewards/rejected": -0.79541015625, "step": 353 }, { "epoch": 0.06689026406537862, "grad_norm": 2.6783176791612853, "learning_rate": 6.660377358490566e-07, "logits/chosen": 1.09423828125, "logits/rejected": 1.357421875, "logps/chosen": -1069.0, "logps/rejected": -728.5, "loss": 0.8943, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1966705322265625, "rewards/margins": 0.677978515625, "rewards/rejected": -0.873779296875, "step": 354 }, { "epoch": 0.0670792196135859, "grad_norm": 2.2084304012847666, "learning_rate": 6.679245283018867e-07, "logits/chosen": 1.2431640625, "logits/rejected": 0.887939453125, "logps/chosen": -772.5, "logps/rejected": -513.0, "loss": 0.8993, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0138702392578125, "rewards/margins": 0.6201171875, "rewards/rejected": -0.6064453125, "step": 355 }, { "epoch": 0.06726817516179319, "grad_norm": 3.1592653029583375, "learning_rate": 6.69811320754717e-07, "logits/chosen": 1.6826171875, "logits/rejected": 1.78857421875, "logps/chosen": -1081.0, "logps/rejected": -1186.0, "loss": 0.8234, "rewards/accuracies": 0.75, "rewards/chosen": 0.3701171875, "rewards/margins": 1.42431640625, "rewards/rejected": -1.0537109375, "step": 356 }, { "epoch": 0.06745713071000048, "grad_norm": 2.456049002587352, "learning_rate": 6.716981132075471e-07, "logits/chosen": 1.0179443359375, "logits/rejected": 2.244140625, "logps/chosen": -1096.0, "logps/rejected": -1066.0, "loss": 0.7675, "rewards/accuracies": 0.84375, "rewards/chosen": -0.03955078125, "rewards/margins": 1.515625, "rewards/rejected": -1.55859375, "step": 357 }, { "epoch": 0.06764608625820775, "grad_norm": 2.5156251684216233, "learning_rate": 6.735849056603773e-07, "logits/chosen": 1.2109375, "logits/rejected": 1.36669921875, "logps/chosen": -608.0, "logps/rejected": -772.0, "loss": 0.7962, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0472412109375, "rewards/margins": 1.140625, "rewards/rejected": -1.091796875, "step": 358 }, { "epoch": 0.06783504180641504, "grad_norm": 2.240312659704891, "learning_rate": 6.754716981132075e-07, "logits/chosen": 1.6220703125, "logits/rejected": 2.32421875, "logps/chosen": -768.0, "logps/rejected": -910.0, "loss": 0.8303, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1495819091796875, "rewards/margins": 1.125, "rewards/rejected": -0.9755859375, "step": 359 }, { "epoch": 0.06802399735462232, "grad_norm": 2.791735477110591, "learning_rate": 6.773584905660377e-07, "logits/chosen": 0.8472900390625, "logits/rejected": 2.0234375, "logps/chosen": -858.0, "logps/rejected": -962.0, "loss": 0.8209, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15350341796875, "rewards/margins": 1.2265625, "rewards/rejected": -1.072021484375, "step": 360 }, { "epoch": 0.06821295290282961, "grad_norm": 2.17047219702688, "learning_rate": 6.792452830188678e-07, "logits/chosen": 0.5009765625, "logits/rejected": 1.73828125, "logps/chosen": -741.0, "logps/rejected": -881.0, "loss": 0.814, "rewards/accuracies": 0.75, "rewards/chosen": 0.1759033203125, "rewards/margins": 1.3232421875, "rewards/rejected": -1.1474609375, "step": 361 }, { "epoch": 0.0684019084510369, "grad_norm": 2.0642363927223735, "learning_rate": 6.811320754716981e-07, "logits/chosen": 1.796875, "logits/rejected": 1.599609375, "logps/chosen": -2107.0, "logps/rejected": -1638.5, "loss": 0.8862, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25634765625, "rewards/margins": 1.12939453125, "rewards/rejected": -1.38427734375, "step": 362 }, { "epoch": 0.06859086399924418, "grad_norm": 1.9397380483713198, "learning_rate": 6.830188679245282e-07, "logits/chosen": 1.35797119140625, "logits/rejected": 2.4560546875, "logps/chosen": -463.0, "logps/rejected": -569.5, "loss": 0.8818, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04425048828125, "rewards/margins": 0.6435546875, "rewards/rejected": -0.599853515625, "step": 363 }, { "epoch": 0.06877981954745147, "grad_norm": 2.326505902760136, "learning_rate": 6.849056603773585e-07, "logits/chosen": 0.65087890625, "logits/rejected": 1.70947265625, "logps/chosen": -578.0, "logps/rejected": -751.5, "loss": 0.8584, "rewards/accuracies": 0.6875, "rewards/chosen": 0.021270751953125, "rewards/margins": 0.920654296875, "rewards/rejected": -0.89697265625, "step": 364 }, { "epoch": 0.06896877509565874, "grad_norm": 2.1867432974418057, "learning_rate": 6.867924528301886e-07, "logits/chosen": 0.484375, "logits/rejected": 1.30078125, "logps/chosen": -558.0, "logps/rejected": -543.5, "loss": 0.8289, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2486572265625, "rewards/margins": 0.887939453125, "rewards/rejected": -0.63818359375, "step": 365 }, { "epoch": 0.06915773064386603, "grad_norm": 2.601113872818613, "learning_rate": 6.886792452830188e-07, "logits/chosen": 1.515625, "logits/rejected": 2.23046875, "logps/chosen": -773.0, "logps/rejected": -577.0, "loss": 0.8646, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17626953125, "rewards/margins": 0.7646484375, "rewards/rejected": -0.589111328125, "step": 366 }, { "epoch": 0.06934668619207332, "grad_norm": 2.2903799910172022, "learning_rate": 6.90566037735849e-07, "logits/chosen": 1.162109375, "logits/rejected": 1.393310546875, "logps/chosen": -642.0, "logps/rejected": -659.0, "loss": 0.8722, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04010009765625, "rewards/margins": 0.8564453125, "rewards/rejected": -0.815185546875, "step": 367 }, { "epoch": 0.0695356417402806, "grad_norm": 2.2463576568923695, "learning_rate": 6.924528301886792e-07, "logits/chosen": 0.41943359375, "logits/rejected": 1.349609375, "logps/chosen": -680.0, "logps/rejected": -1553.5, "loss": 0.8046, "rewards/accuracies": 0.84375, "rewards/chosen": 0.241729736328125, "rewards/margins": 1.2060546875, "rewards/rejected": -0.964111328125, "step": 368 }, { "epoch": 0.06972459728848789, "grad_norm": 2.181997532940946, "learning_rate": 6.943396226415093e-07, "logits/chosen": 1.5594482421875, "logits/rejected": 2.41015625, "logps/chosen": -461.5, "logps/rejected": -562.5, "loss": 0.877, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1280517578125, "rewards/margins": 0.7138671875, "rewards/rejected": -0.841796875, "step": 369 }, { "epoch": 0.06991355283669516, "grad_norm": 2.7201029545713737, "learning_rate": 6.962264150943396e-07, "logits/chosen": 0.72119140625, "logits/rejected": 1.4100341796875, "logps/chosen": -746.0, "logps/rejected": -892.5, "loss": 0.8525, "rewards/accuracies": 0.71875, "rewards/chosen": -0.115966796875, "rewards/margins": 1.083984375, "rewards/rejected": -1.201171875, "step": 370 }, { "epoch": 0.07010250838490245, "grad_norm": 2.4509283283206087, "learning_rate": 6.981132075471697e-07, "logits/chosen": 0.31005859375, "logits/rejected": 0.25830078125, "logps/chosen": -708.5, "logps/rejected": -503.0, "loss": 0.868, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0164794921875, "rewards/margins": 0.7021484375, "rewards/rejected": -0.7197265625, "step": 371 }, { "epoch": 0.07029146393310974, "grad_norm": 2.835655609324951, "learning_rate": 7e-07, "logits/chosen": 1.94140625, "logits/rejected": 2.12890625, "logps/chosen": -751.0, "logps/rejected": -1082.0, "loss": 0.835, "rewards/accuracies": 0.75, "rewards/chosen": 0.2958984375, "rewards/margins": 1.23828125, "rewards/rejected": -0.9423828125, "step": 372 }, { "epoch": 0.07048041948131702, "grad_norm": 2.4222946637435547, "learning_rate": 7.018867924528301e-07, "logits/chosen": 0.12451171875, "logits/rejected": 0.6572265625, "logps/chosen": -926.5, "logps/rejected": -811.0, "loss": 0.8132, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1307373046875, "rewards/margins": 1.2568359375, "rewards/rejected": -1.1259765625, "step": 373 }, { "epoch": 0.0706693750295243, "grad_norm": 2.212525994592712, "learning_rate": 7.037735849056603e-07, "logits/chosen": 0.7908935546875, "logits/rejected": 0.78369140625, "logps/chosen": -596.5, "logps/rejected": -737.0, "loss": 0.834, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0147705078125, "rewards/margins": 1.06640625, "rewards/rejected": -1.05029296875, "step": 374 }, { "epoch": 0.07085833057773158, "grad_norm": 2.9016567739323538, "learning_rate": 7.056603773584905e-07, "logits/chosen": 1.0546875, "logits/rejected": 2.033203125, "logps/chosen": -987.0, "logps/rejected": -1188.0, "loss": 0.8141, "rewards/accuracies": 0.84375, "rewards/chosen": 0.101959228515625, "rewards/margins": 1.427734375, "rewards/rejected": -1.32763671875, "step": 375 }, { "epoch": 0.07104728612593887, "grad_norm": 2.121998365951719, "learning_rate": 7.075471698113207e-07, "logits/chosen": 1.2001953125, "logits/rejected": 1.1798095703125, "logps/chosen": -1016.0, "logps/rejected": -770.5, "loss": 0.7973, "rewards/accuracies": 0.75, "rewards/chosen": 0.35113525390625, "rewards/margins": 1.2626953125, "rewards/rejected": -0.9111328125, "step": 376 }, { "epoch": 0.07123624167414616, "grad_norm": 2.3193917391754573, "learning_rate": 7.094339622641508e-07, "logits/chosen": 1.1142578125, "logits/rejected": 1.425048828125, "logps/chosen": -581.0, "logps/rejected": -577.5, "loss": 0.858, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0072021484375, "rewards/margins": 0.7861328125, "rewards/rejected": -0.779296875, "step": 377 }, { "epoch": 0.07142519722235344, "grad_norm": 2.630684239216432, "learning_rate": 7.113207547169811e-07, "logits/chosen": 0.6544189453125, "logits/rejected": 1.669921875, "logps/chosen": -808.0, "logps/rejected": -841.0, "loss": 0.8181, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2371826171875, "rewards/margins": 1.189453125, "rewards/rejected": -0.9521484375, "step": 378 }, { "epoch": 0.07161415277056073, "grad_norm": 2.3457717622359877, "learning_rate": 7.132075471698112e-07, "logits/chosen": 0.3115234375, "logits/rejected": 1.236328125, "logps/chosen": -581.5, "logps/rejected": -548.5, "loss": 0.8644, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0516357421875, "rewards/margins": 0.9462890625, "rewards/rejected": -0.896240234375, "step": 379 }, { "epoch": 0.07180310831876802, "grad_norm": 2.383817737854493, "learning_rate": 7.150943396226415e-07, "logits/chosen": 1.0322265625, "logits/rejected": 2.04345703125, "logps/chosen": -813.5, "logps/rejected": -883.0, "loss": 0.804, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1071014404296875, "rewards/margins": 1.2216796875, "rewards/rejected": -1.115234375, "step": 380 }, { "epoch": 0.07199206386697529, "grad_norm": 2.2605233995704737, "learning_rate": 7.169811320754716e-07, "logits/chosen": 1.26416015625, "logits/rejected": 2.060546875, "logps/chosen": -610.0, "logps/rejected": -1849.0, "loss": 0.853, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0245361328125, "rewards/margins": 1.09375, "rewards/rejected": -1.1162109375, "step": 381 }, { "epoch": 0.07218101941518258, "grad_norm": 2.2306187969774656, "learning_rate": 7.188679245283018e-07, "logits/chosen": 1.2255859375, "logits/rejected": 0.85546875, "logps/chosen": -668.0, "logps/rejected": -573.0, "loss": 0.8513, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0416259765625, "rewards/margins": 0.9560546875, "rewards/rejected": -0.99755859375, "step": 382 }, { "epoch": 0.07236997496338986, "grad_norm": 2.0158227736410512, "learning_rate": 7.20754716981132e-07, "logits/chosen": 1.3115234375, "logits/rejected": 1.6209716796875, "logps/chosen": -834.5, "logps/rejected": -718.5, "loss": 0.9002, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0997314453125, "rewards/margins": 0.76171875, "rewards/rejected": -0.659423828125, "step": 383 }, { "epoch": 0.07255893051159715, "grad_norm": 2.3179449080283945, "learning_rate": 7.226415094339622e-07, "logits/chosen": 1.6875, "logits/rejected": 2.3046875, "logps/chosen": -675.5, "logps/rejected": -915.0, "loss": 0.8094, "rewards/accuracies": 0.875, "rewards/chosen": 0.11871337890625, "rewards/margins": 1.33203125, "rewards/rejected": -1.2158203125, "step": 384 }, { "epoch": 0.07274788605980444, "grad_norm": 2.4714799017332347, "learning_rate": 7.245283018867923e-07, "logits/chosen": 1.238525390625, "logits/rejected": 1.8515625, "logps/chosen": -1085.0, "logps/rejected": -2270.0, "loss": 0.8178, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05181884765625, "rewards/margins": 1.8935546875, "rewards/rejected": -1.83984375, "step": 385 }, { "epoch": 0.07293684160801171, "grad_norm": 1.9388564222192048, "learning_rate": 7.264150943396226e-07, "logits/chosen": 0.64013671875, "logits/rejected": 1.2607421875, "logps/chosen": -484.0, "logps/rejected": -607.5, "loss": 0.8314, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2214202880859375, "rewards/margins": 0.98828125, "rewards/rejected": -0.7666015625, "step": 386 }, { "epoch": 0.073125797156219, "grad_norm": 2.021704295455657, "learning_rate": 7.283018867924527e-07, "logits/chosen": 1.472900390625, "logits/rejected": 1.646484375, "logps/chosen": -646.0, "logps/rejected": -615.5, "loss": 0.8241, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2562255859375, "rewards/margins": 0.96728515625, "rewards/rejected": -0.712890625, "step": 387 }, { "epoch": 0.07331475270442628, "grad_norm": 2.2462371201455773, "learning_rate": 7.30188679245283e-07, "logits/chosen": 0.4912109375, "logits/rejected": 1.078125, "logps/chosen": -620.0, "logps/rejected": -541.5, "loss": 0.884, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00140380859375, "rewards/margins": 0.63037109375, "rewards/rejected": -0.62939453125, "step": 388 }, { "epoch": 0.07350370825263357, "grad_norm": 2.394855410138797, "learning_rate": 7.320754716981131e-07, "logits/chosen": 0.5511474609375, "logits/rejected": 1.3583984375, "logps/chosen": -1040.0, "logps/rejected": -1365.0, "loss": 0.7511, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4635009765625, "rewards/margins": 1.763671875, "rewards/rejected": -1.298828125, "step": 389 }, { "epoch": 0.07369266380084086, "grad_norm": 2.746541969906855, "learning_rate": 7.339622641509433e-07, "logits/chosen": 0.72265625, "logits/rejected": 1.1962890625, "logps/chosen": -741.5, "logps/rejected": -1014.5, "loss": 0.8574, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0234375, "rewards/margins": 0.8974609375, "rewards/rejected": -0.919921875, "step": 390 }, { "epoch": 0.07388161934904813, "grad_norm": 2.2060365126579904, "learning_rate": 7.358490566037735e-07, "logits/chosen": 0.478515625, "logits/rejected": 0.5146484375, "logps/chosen": -534.5, "logps/rejected": -530.5, "loss": 0.901, "rewards/accuracies": 0.71875, "rewards/chosen": 0.040130615234375, "rewards/margins": 0.462890625, "rewards/rejected": -0.42236328125, "step": 391 }, { "epoch": 0.07407057489725542, "grad_norm": 2.0514757286265133, "learning_rate": 7.377358490566037e-07, "logits/chosen": 0.3291015625, "logits/rejected": 0.7724609375, "logps/chosen": -510.5, "logps/rejected": -502.5, "loss": 0.8358, "rewards/accuracies": 0.84375, "rewards/chosen": -0.070556640625, "rewards/margins": 0.874755859375, "rewards/rejected": -0.9443359375, "step": 392 }, { "epoch": 0.0742595304454627, "grad_norm": 2.3636341843857815, "learning_rate": 7.396226415094338e-07, "logits/chosen": 1.11181640625, "logits/rejected": 1.150390625, "logps/chosen": -777.0, "logps/rejected": -783.0, "loss": 0.8143, "rewards/accuracies": 0.78125, "rewards/chosen": 0.203369140625, "rewards/margins": 1.2421875, "rewards/rejected": -1.041015625, "step": 393 }, { "epoch": 0.07444848599366999, "grad_norm": 2.963696381915579, "learning_rate": 7.415094339622641e-07, "logits/chosen": 0.5703125, "logits/rejected": 1.1070098876953125, "logps/chosen": -891.0, "logps/rejected": -1184.0, "loss": 0.8215, "rewards/accuracies": 0.75, "rewards/chosen": -0.0245361328125, "rewards/margins": 1.2607421875, "rewards/rejected": -1.2880859375, "step": 394 }, { "epoch": 0.07463744154187728, "grad_norm": 2.528628006332879, "learning_rate": 7.433962264150942e-07, "logits/chosen": 1.36181640625, "logits/rejected": 1.99609375, "logps/chosen": -962.0, "logps/rejected": -1221.0, "loss": 0.8303, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0782470703125, "rewards/margins": 1.40234375, "rewards/rejected": -1.3232421875, "step": 395 }, { "epoch": 0.07482639709008455, "grad_norm": 2.277729422348157, "learning_rate": 7.452830188679245e-07, "logits/chosen": 0.4541015625, "logits/rejected": 0.767578125, "logps/chosen": -620.5, "logps/rejected": -628.0, "loss": 0.8661, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0230712890625, "rewards/margins": 0.962890625, "rewards/rejected": -0.9384765625, "step": 396 }, { "epoch": 0.07501535263829184, "grad_norm": 2.492748604821275, "learning_rate": 7.471698113207546e-07, "logits/chosen": 1.68994140625, "logits/rejected": 0.956298828125, "logps/chosen": -755.5, "logps/rejected": -643.0, "loss": 0.9164, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2193603515625, "rewards/margins": 0.4501953125, "rewards/rejected": -0.6708984375, "step": 397 }, { "epoch": 0.07520430818649912, "grad_norm": 2.1808079997240952, "learning_rate": 7.490566037735848e-07, "logits/chosen": 1.5126953125, "logits/rejected": 1.7392578125, "logps/chosen": -960.0, "logps/rejected": -883.0, "loss": 0.7804, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2369384765625, "rewards/margins": 1.5107421875, "rewards/rejected": -1.2734375, "step": 398 }, { "epoch": 0.07539326373470641, "grad_norm": 2.100988730370486, "learning_rate": 7.509433962264151e-07, "logits/chosen": 1.53857421875, "logits/rejected": 1.58984375, "logps/chosen": -729.0, "logps/rejected": -554.5, "loss": 0.8165, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3680877685546875, "rewards/margins": 1.18603515625, "rewards/rejected": -0.81787109375, "step": 399 }, { "epoch": 0.0755822192829137, "grad_norm": 2.4082059045927218, "learning_rate": 7.528301886792453e-07, "logits/chosen": 0.991943359375, "logits/rejected": 1.947265625, "logps/chosen": -702.5, "logps/rejected": -1115.5, "loss": 0.8226, "rewards/accuracies": 0.75, "rewards/chosen": 0.0023193359375, "rewards/margins": 1.4873046875, "rewards/rejected": -1.4853515625, "step": 400 }, { "epoch": 0.07577117483112097, "grad_norm": 2.2651357585850556, "learning_rate": 7.547169811320754e-07, "logits/chosen": 0.52978515625, "logits/rejected": 0.791015625, "logps/chosen": -710.5, "logps/rejected": -682.5, "loss": 0.8237, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06744384765625, "rewards/margins": 1.125, "rewards/rejected": -1.0576171875, "step": 401 }, { "epoch": 0.07596013037932826, "grad_norm": 2.3562287549381153, "learning_rate": 7.566037735849057e-07, "logits/chosen": 0.9359130859375, "logits/rejected": 1.70263671875, "logps/chosen": -694.0, "logps/rejected": -499.5, "loss": 0.8577, "rewards/accuracies": 0.65625, "rewards/chosen": -0.061798095703125, "rewards/margins": 0.8291015625, "rewards/rejected": -0.8896484375, "step": 402 }, { "epoch": 0.07614908592753555, "grad_norm": 2.0850814279111574, "learning_rate": 7.584905660377358e-07, "logits/chosen": 1.220703125, "logits/rejected": 1.93603515625, "logps/chosen": -750.75, "logps/rejected": -582.0, "loss": 0.8109, "rewards/accuracies": 0.75, "rewards/chosen": 0.2747802734375, "rewards/margins": 1.298828125, "rewards/rejected": -1.021484375, "step": 403 }, { "epoch": 0.07633804147574283, "grad_norm": 2.9454799287909985, "learning_rate": 7.603773584905661e-07, "logits/chosen": 1.05029296875, "logits/rejected": 1.0396728515625, "logps/chosen": -819.0, "logps/rejected": -675.5, "loss": 0.7861, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3798828125, "rewards/margins": 1.55078125, "rewards/rejected": -1.1728515625, "step": 404 }, { "epoch": 0.07652699702395012, "grad_norm": 2.0761354355686286, "learning_rate": 7.622641509433962e-07, "logits/chosen": 1.212890625, "logits/rejected": 1.7376708984375, "logps/chosen": -1266.0, "logps/rejected": -1172.0, "loss": 0.7017, "rewards/accuracies": 0.9375, "rewards/chosen": 0.62109375, "rewards/margins": 2.111328125, "rewards/rejected": -1.494140625, "step": 405 }, { "epoch": 0.0767159525721574, "grad_norm": 2.178169900869585, "learning_rate": 7.641509433962264e-07, "logits/chosen": 0.74658203125, "logits/rejected": 1.77392578125, "logps/chosen": -909.0, "logps/rejected": -1279.0, "loss": 0.7885, "rewards/accuracies": 0.90625, "rewards/chosen": -0.00701904296875, "rewards/margins": 1.455078125, "rewards/rejected": -1.462890625, "step": 406 }, { "epoch": 0.07690490812036468, "grad_norm": 2.248318227379087, "learning_rate": 7.660377358490567e-07, "logits/chosen": 1.095703125, "logits/rejected": 1.47998046875, "logps/chosen": -729.5, "logps/rejected": -867.0, "loss": 0.8403, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00390625, "rewards/margins": 0.9638671875, "rewards/rejected": -0.9677734375, "step": 407 }, { "epoch": 0.07709386366857197, "grad_norm": 2.481271867376669, "learning_rate": 7.679245283018868e-07, "logits/chosen": 2.01348876953125, "logits/rejected": 2.3125, "logps/chosen": -1001.0, "logps/rejected": -801.25, "loss": 0.879, "rewards/accuracies": 0.625, "rewards/chosen": 0.18115234375, "rewards/margins": 0.9072265625, "rewards/rejected": -0.726318359375, "step": 408 }, { "epoch": 0.07728281921677925, "grad_norm": 2.362531286315215, "learning_rate": 7.69811320754717e-07, "logits/chosen": 0.511474609375, "logits/rejected": 1.205078125, "logps/chosen": -920.0, "logps/rejected": -1177.0, "loss": 0.7554, "rewards/accuracies": 0.78125, "rewards/chosen": 0.51080322265625, "rewards/margins": 1.533203125, "rewards/rejected": -1.0234375, "step": 409 }, { "epoch": 0.07747177476498654, "grad_norm": 2.31963778499756, "learning_rate": 7.716981132075472e-07, "logits/chosen": 0.718994140625, "logits/rejected": 1.183837890625, "logps/chosen": -658.5, "logps/rejected": -663.0, "loss": 0.8555, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0101165771484375, "rewards/margins": 0.74169921875, "rewards/rejected": -0.730712890625, "step": 410 }, { "epoch": 0.07766073031319382, "grad_norm": 2.7503330648256927, "learning_rate": 7.735849056603774e-07, "logits/chosen": 2.35546875, "logits/rejected": 2.494140625, "logps/chosen": -1028.0, "logps/rejected": -948.0, "loss": 0.8679, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0979156494140625, "rewards/margins": 0.912109375, "rewards/rejected": -0.81201171875, "step": 411 }, { "epoch": 0.0778496858614011, "grad_norm": 2.4447969993831657, "learning_rate": 7.754716981132076e-07, "logits/chosen": 0.900390625, "logits/rejected": 1.15869140625, "logps/chosen": -1018.5, "logps/rejected": -700.0, "loss": 0.8148, "rewards/accuracies": 0.78125, "rewards/chosen": 0.37255859375, "rewards/margins": 1.408203125, "rewards/rejected": -1.03466796875, "step": 412 }, { "epoch": 0.0780386414096084, "grad_norm": 2.2222908341833842, "learning_rate": 7.773584905660378e-07, "logits/chosen": 1.298828125, "logits/rejected": 1.583984375, "logps/chosen": -513.75, "logps/rejected": -558.75, "loss": 0.8401, "rewards/accuracies": 0.8125, "rewards/chosen": 0.054443359375, "rewards/margins": 0.85791015625, "rewards/rejected": -0.80322265625, "step": 413 }, { "epoch": 0.07822759695781567, "grad_norm": 1.9798043704825967, "learning_rate": 7.792452830188679e-07, "logits/chosen": 1.06103515625, "logits/rejected": 1.264892578125, "logps/chosen": -662.0, "logps/rejected": -749.0, "loss": 0.7891, "rewards/accuracies": 0.75, "rewards/chosen": 0.378173828125, "rewards/margins": 1.23974609375, "rewards/rejected": -0.85693359375, "step": 414 }, { "epoch": 0.07841655250602296, "grad_norm": 2.2588494566050694, "learning_rate": 7.811320754716982e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.318603515625, "logps/chosen": -860.0, "logps/rejected": -828.0, "loss": 0.8058, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4246826171875, "rewards/margins": 1.246429443359375, "rewards/rejected": -0.82080078125, "step": 415 }, { "epoch": 0.07860550805423024, "grad_norm": 2.216973773830069, "learning_rate": 7.830188679245283e-07, "logits/chosen": 1.1826171875, "logits/rejected": 1.8095703125, "logps/chosen": -639.0, "logps/rejected": -635.5, "loss": 0.8304, "rewards/accuracies": 0.84375, "rewards/chosen": 0.128509521484375, "rewards/margins": 1.22119140625, "rewards/rejected": -1.09228515625, "step": 416 }, { "epoch": 0.07879446360243753, "grad_norm": 2.1148093539395365, "learning_rate": 7.849056603773585e-07, "logits/chosen": 0.4625244140625, "logits/rejected": 0.686767578125, "logps/chosen": -789.0, "logps/rejected": -932.0, "loss": 0.8289, "rewards/accuracies": 0.59375, "rewards/chosen": 0.40191650390625, "rewards/margins": 1.357421875, "rewards/rejected": -0.9560546875, "step": 417 }, { "epoch": 0.07898341915064482, "grad_norm": 2.0390287728013425, "learning_rate": 7.867924528301887e-07, "logits/chosen": 0.92724609375, "logits/rejected": 1.673828125, "logps/chosen": -713.0, "logps/rejected": -898.0, "loss": 0.8505, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3440093994140625, "rewards/margins": 1.205078125, "rewards/rejected": -0.85888671875, "step": 418 }, { "epoch": 0.07917237469885209, "grad_norm": 2.4411173446739953, "learning_rate": 7.886792452830189e-07, "logits/chosen": 1.0703125, "logits/rejected": 1.802734375, "logps/chosen": -757.0, "logps/rejected": -963.0, "loss": 0.8137, "rewards/accuracies": 0.8125, "rewards/chosen": 0.00732421875, "rewards/margins": 1.038330078125, "rewards/rejected": -1.03125, "step": 419 }, { "epoch": 0.07936133024705938, "grad_norm": 2.5514619435710992, "learning_rate": 7.905660377358491e-07, "logits/chosen": 0.767578125, "logits/rejected": 1.40966796875, "logps/chosen": -1225.0, "logps/rejected": -1247.0, "loss": 0.7262, "rewards/accuracies": 0.875, "rewards/chosen": 0.291015625, "rewards/margins": 2.30078125, "rewards/rejected": -2.0029296875, "step": 420 }, { "epoch": 0.07955028579526667, "grad_norm": 2.1158248514796747, "learning_rate": 7.924528301886793e-07, "logits/chosen": 1.666015625, "logits/rejected": 2.150390625, "logps/chosen": -946.5, "logps/rejected": -770.0, "loss": 0.7783, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4189453125, "rewards/margins": 1.416015625, "rewards/rejected": -0.9931640625, "step": 421 }, { "epoch": 0.07973924134347395, "grad_norm": 2.257083307833097, "learning_rate": 7.943396226415094e-07, "logits/chosen": 1.144775390625, "logits/rejected": 1.970916748046875, "logps/chosen": -699.0, "logps/rejected": -1151.0, "loss": 0.7799, "rewards/accuracies": 0.75, "rewards/chosen": 0.35205078125, "rewards/margins": 1.3828125, "rewards/rejected": -1.03125, "step": 422 }, { "epoch": 0.07992819689168124, "grad_norm": 1.9764591648877963, "learning_rate": 7.962264150943397e-07, "logits/chosen": 1.57421875, "logits/rejected": 1.802734375, "logps/chosen": -598.5, "logps/rejected": -559.0, "loss": 0.833, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0890960693359375, "rewards/margins": 0.9326171875, "rewards/rejected": -0.8427734375, "step": 423 }, { "epoch": 0.08011715243988851, "grad_norm": 2.115702847302277, "learning_rate": 7.981132075471698e-07, "logits/chosen": 1.20947265625, "logits/rejected": 1.80078125, "logps/chosen": -661.0, "logps/rejected": -816.75, "loss": 0.8181, "rewards/accuracies": 0.875, "rewards/chosen": 0.256591796875, "rewards/margins": 1.037109375, "rewards/rejected": -0.77984619140625, "step": 424 }, { "epoch": 0.0803061079880958, "grad_norm": 2.458845836073191, "learning_rate": 8e-07, "logits/chosen": 0.763671875, "logits/rejected": 1.6044921875, "logps/chosen": -851.0, "logps/rejected": -1256.0, "loss": 0.8405, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02252197265625, "rewards/margins": 0.892578125, "rewards/rejected": -0.916015625, "step": 425 }, { "epoch": 0.08049506353630309, "grad_norm": 2.1971796072261367, "learning_rate": 8.018867924528302e-07, "logits/chosen": 1.122802734375, "logits/rejected": 1.857421875, "logps/chosen": -649.5, "logps/rejected": -581.5, "loss": 0.8663, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02520751953125, "rewards/margins": 0.794921875, "rewards/rejected": -0.7705078125, "step": 426 }, { "epoch": 0.08068401908451037, "grad_norm": 2.798069754619274, "learning_rate": 8.037735849056604e-07, "logits/chosen": 1.1005859375, "logits/rejected": 1.6748046875, "logps/chosen": -1080.0, "logps/rejected": -1023.5, "loss": 0.7827, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1331787109375, "rewards/margins": 1.3447265625, "rewards/rejected": -1.2119140625, "step": 427 }, { "epoch": 0.08087297463271766, "grad_norm": 2.3571014144349993, "learning_rate": 8.056603773584906e-07, "logits/chosen": 0.939697265625, "logits/rejected": 1.4990234375, "logps/chosen": -567.5, "logps/rejected": -546.0, "loss": 0.8595, "rewards/accuracies": 0.78125, "rewards/chosen": 0.206787109375, "rewards/margins": 0.654296875, "rewards/rejected": -0.44580078125, "step": 428 }, { "epoch": 0.08106193018092493, "grad_norm": 1.9257229817813653, "learning_rate": 8.075471698113208e-07, "logits/chosen": 1.05908203125, "logits/rejected": 1.775390625, "logps/chosen": -575.5, "logps/rejected": -559.0, "loss": 0.8501, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10693359375, "rewards/margins": 0.88037109375, "rewards/rejected": -0.7724609375, "step": 429 }, { "epoch": 0.08125088572913222, "grad_norm": 2.373759072537923, "learning_rate": 8.094339622641509e-07, "logits/chosen": 1.14990234375, "logits/rejected": 1.50927734375, "logps/chosen": -791.0, "logps/rejected": -1027.0, "loss": 0.788, "rewards/accuracies": 0.75, "rewards/chosen": 0.38482666015625, "rewards/margins": 1.4228515625, "rewards/rejected": -1.041015625, "step": 430 }, { "epoch": 0.08143984127733951, "grad_norm": 2.5256727423834078, "learning_rate": 8.113207547169812e-07, "logits/chosen": 0.64202880859375, "logits/rejected": 2.17578125, "logps/chosen": -694.5, "logps/rejected": -1935.5, "loss": 0.7948, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1490478515625, "rewards/margins": 1.7763671875, "rewards/rejected": -1.625, "step": 431 }, { "epoch": 0.08162879682554679, "grad_norm": 2.459967509117028, "learning_rate": 8.132075471698113e-07, "logits/chosen": 1.216796875, "logits/rejected": 1.6513671875, "logps/chosen": -701.0, "logps/rejected": -691.0, "loss": 0.8253, "rewards/accuracies": 0.71875, "rewards/chosen": 0.222412109375, "rewards/margins": 1.1629638671875, "rewards/rejected": -0.9410400390625, "step": 432 }, { "epoch": 0.08181775237375408, "grad_norm": 2.725519226868126, "learning_rate": 8.150943396226415e-07, "logits/chosen": 0.33935546875, "logits/rejected": 0.4752349853515625, "logps/chosen": -854.0, "logps/rejected": -794.0, "loss": 0.7718, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1522216796875, "rewards/margins": 1.2119140625, "rewards/rejected": -1.0615234375, "step": 433 }, { "epoch": 0.08200670792196135, "grad_norm": 2.7248002494075974, "learning_rate": 8.169811320754717e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.90087890625, "logps/chosen": -733.5, "logps/rejected": -663.0, "loss": 0.8336, "rewards/accuracies": 0.625, "rewards/chosen": 0.06353759765625, "rewards/margins": 0.98583984375, "rewards/rejected": -0.92333984375, "step": 434 }, { "epoch": 0.08219566347016864, "grad_norm": 4.100731708075386, "learning_rate": 8.188679245283019e-07, "logits/chosen": 0.443359375, "logits/rejected": 0.67578125, "logps/chosen": -850.5, "logps/rejected": -824.5, "loss": 0.8191, "rewards/accuracies": 0.75, "rewards/chosen": 0.134033203125, "rewards/margins": 1.041015625, "rewards/rejected": -0.9052734375, "step": 435 }, { "epoch": 0.08238461901837593, "grad_norm": 2.3640567212218238, "learning_rate": 8.207547169811321e-07, "logits/chosen": 0.935546875, "logits/rejected": 1.16552734375, "logps/chosen": -884.0, "logps/rejected": -1174.0, "loss": 0.803, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0377197265625, "rewards/margins": 1.30517578125, "rewards/rejected": -1.2666015625, "step": 436 }, { "epoch": 0.08257357456658321, "grad_norm": 2.2834238704225855, "learning_rate": 8.226415094339623e-07, "logits/chosen": 0.6328125, "logits/rejected": 1.021484375, "logps/chosen": -608.0, "logps/rejected": -1109.0, "loss": 0.7761, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2769775390625, "rewards/margins": 1.47509765625, "rewards/rejected": -1.197265625, "step": 437 }, { "epoch": 0.0827625301147905, "grad_norm": 2.162170236258507, "learning_rate": 8.245283018867924e-07, "logits/chosen": 1.41796875, "logits/rejected": 1.90234375, "logps/chosen": -982.0, "logps/rejected": -1494.0, "loss": 0.71, "rewards/accuracies": 0.875, "rewards/chosen": 0.4915771484375, "rewards/margins": 2.099609375, "rewards/rejected": -1.603515625, "step": 438 }, { "epoch": 0.08295148566299777, "grad_norm": 1.9504567332327178, "learning_rate": 8.264150943396227e-07, "logits/chosen": 1.48095703125, "logits/rejected": 1.85595703125, "logps/chosen": -627.5, "logps/rejected": -600.5, "loss": 0.7895, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05126953125, "rewards/margins": 1.3017578125, "rewards/rejected": -1.251953125, "step": 439 }, { "epoch": 0.08314044121120506, "grad_norm": 2.3861102394981013, "learning_rate": 8.283018867924528e-07, "logits/chosen": 2.1875, "logits/rejected": 2.2265625, "logps/chosen": -745.0, "logps/rejected": -871.0, "loss": 0.7713, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2166748046875, "rewards/margins": 1.26416015625, "rewards/rejected": -1.0439453125, "step": 440 }, { "epoch": 0.08332939675941235, "grad_norm": 2.223542030571409, "learning_rate": 8.30188679245283e-07, "logits/chosen": 0.96484375, "logits/rejected": 1.4600830078125, "logps/chosen": -954.0, "logps/rejected": -934.5, "loss": 0.798, "rewards/accuracies": 0.875, "rewards/chosen": 0.8046875, "rewards/margins": 1.474609375, "rewards/rejected": -0.6669921875, "step": 441 }, { "epoch": 0.08351835230761963, "grad_norm": 2.494529847770344, "learning_rate": 8.320754716981132e-07, "logits/chosen": 1.247802734375, "logits/rejected": 1.34375, "logps/chosen": -657.5, "logps/rejected": -660.5, "loss": 0.8214, "rewards/accuracies": 0.84375, "rewards/chosen": -0.039306640625, "rewards/margins": 1.0205078125, "rewards/rejected": -1.0595703125, "step": 442 }, { "epoch": 0.08370730785582692, "grad_norm": 2.2379142330834063, "learning_rate": 8.339622641509434e-07, "logits/chosen": 0.734375, "logits/rejected": 1.03125, "logps/chosen": -908.0, "logps/rejected": -664.0, "loss": 0.7722, "rewards/accuracies": 0.875, "rewards/chosen": 0.21209716796875, "rewards/margins": 1.3388671875, "rewards/rejected": -1.125, "step": 443 }, { "epoch": 0.08389626340403421, "grad_norm": 2.0802420335157668, "learning_rate": 8.358490566037736e-07, "logits/chosen": 1.111328125, "logits/rejected": 1.79736328125, "logps/chosen": -892.0, "logps/rejected": -2369.0, "loss": 0.7407, "rewards/accuracies": 0.75, "rewards/chosen": 0.35498046875, "rewards/margins": 2.9443359375, "rewards/rejected": -2.591796875, "step": 444 }, { "epoch": 0.08408521895224148, "grad_norm": 2.5253632815772704, "learning_rate": 8.377358490566038e-07, "logits/chosen": 0.572265625, "logits/rejected": 1.852783203125, "logps/chosen": -1148.0, "logps/rejected": -1136.0, "loss": 0.7099, "rewards/accuracies": 0.90625, "rewards/chosen": 0.21533203125, "rewards/margins": 2.072265625, "rewards/rejected": -1.85546875, "step": 445 }, { "epoch": 0.08427417450044877, "grad_norm": 2.089574529143971, "learning_rate": 8.396226415094339e-07, "logits/chosen": 0.533203125, "logits/rejected": 0.97265625, "logps/chosen": -558.5, "logps/rejected": -556.5, "loss": 0.7917, "rewards/accuracies": 0.875, "rewards/chosen": 0.0718841552734375, "rewards/margins": 1.11328125, "rewards/rejected": -1.04296875, "step": 446 }, { "epoch": 0.08446313004865605, "grad_norm": 1.913068398720654, "learning_rate": 8.415094339622642e-07, "logits/chosen": 0.28369140625, "logits/rejected": 1.068359375, "logps/chosen": -620.5, "logps/rejected": -663.25, "loss": 0.7846, "rewards/accuracies": 0.875, "rewards/chosen": 0.2877197265625, "rewards/margins": 1.39892578125, "rewards/rejected": -1.1083984375, "step": 447 }, { "epoch": 0.08465208559686334, "grad_norm": 2.132780441981967, "learning_rate": 8.433962264150943e-07, "logits/chosen": 1.7275390625, "logits/rejected": 2.4013671875, "logps/chosen": -941.0, "logps/rejected": -964.25, "loss": 0.8499, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2987060546875, "rewards/margins": 1.1466064453125, "rewards/rejected": -1.447265625, "step": 448 }, { "epoch": 0.08484104114507063, "grad_norm": 2.032213058020042, "learning_rate": 8.452830188679245e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.71392822265625, "logps/chosen": -782.0, "logps/rejected": -597.0, "loss": 0.84, "rewards/accuracies": 0.75, "rewards/chosen": -0.0557403564453125, "rewards/margins": 0.970703125, "rewards/rejected": -1.0263671875, "step": 449 }, { "epoch": 0.0850299966932779, "grad_norm": 2.206958835254166, "learning_rate": 8.471698113207547e-07, "logits/chosen": 0.3203125, "logits/rejected": 0.8232421875, "logps/chosen": -867.0, "logps/rejected": -859.5, "loss": 0.7509, "rewards/accuracies": 0.84375, "rewards/chosen": 0.18701171875, "rewards/margins": 1.6689453125, "rewards/rejected": -1.4814453125, "step": 450 }, { "epoch": 0.0852189522414852, "grad_norm": 2.166227761164564, "learning_rate": 8.490566037735849e-07, "logits/chosen": 0.681640625, "logits/rejected": 1.67578125, "logps/chosen": -611.25, "logps/rejected": -607.75, "loss": 0.7719, "rewards/accuracies": 0.84375, "rewards/chosen": -0.002685546875, "rewards/margins": 1.404296875, "rewards/rejected": -1.40234375, "step": 451 }, { "epoch": 0.08540790778969247, "grad_norm": 2.398049762811901, "learning_rate": 8.509433962264151e-07, "logits/chosen": 0.24609375, "logits/rejected": 1.056640625, "logps/chosen": -759.0, "logps/rejected": -730.0, "loss": 0.8035, "rewards/accuracies": 0.875, "rewards/chosen": 0.0708770751953125, "rewards/margins": 1.0849609375, "rewards/rejected": -1.013671875, "step": 452 }, { "epoch": 0.08559686333789976, "grad_norm": 2.3262307907783826, "learning_rate": 8.528301886792453e-07, "logits/chosen": 1.6923828125, "logits/rejected": 1.79296875, "logps/chosen": -732.0, "logps/rejected": -576.5, "loss": 0.7911, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1597900390625, "rewards/margins": 1.1455078125, "rewards/rejected": -0.986328125, "step": 453 }, { "epoch": 0.08578581888610705, "grad_norm": 2.306831794092483, "learning_rate": 8.547169811320754e-07, "logits/chosen": -0.062744140625, "logits/rejected": 0.013671875, "logps/chosen": -640.0, "logps/rejected": -694.0, "loss": 0.804, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1146240234375, "rewards/margins": 1.1162109375, "rewards/rejected": -1.00048828125, "step": 454 }, { "epoch": 0.08597477443431432, "grad_norm": 1.9954479462899504, "learning_rate": 8.566037735849057e-07, "logits/chosen": 1.2041015625, "logits/rejected": 2.0703125, "logps/chosen": -721.0, "logps/rejected": -1613.0, "loss": 0.7645, "rewards/accuracies": 0.875, "rewards/chosen": 0.203125, "rewards/margins": 1.9580078125, "rewards/rejected": -1.7568359375, "step": 455 }, { "epoch": 0.08616372998252161, "grad_norm": 2.200585596362537, "learning_rate": 8.584905660377358e-07, "logits/chosen": 0.62939453125, "logits/rejected": 1.138671875, "logps/chosen": -707.0, "logps/rejected": -696.0, "loss": 0.7911, "rewards/accuracies": 0.875, "rewards/chosen": 0.003173828125, "rewards/margins": 1.1123046875, "rewards/rejected": -1.109375, "step": 456 }, { "epoch": 0.08635268553072889, "grad_norm": 2.320939354629429, "learning_rate": 8.60377358490566e-07, "logits/chosen": 1.654296875, "logits/rejected": 1.693359375, "logps/chosen": -842.0, "logps/rejected": -8561.0, "loss": 0.8349, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07257080078125, "rewards/margins": -82.28515625, "rewards/rejected": 82.2431640625, "step": 457 }, { "epoch": 0.08654164107893618, "grad_norm": 2.0321344625055096, "learning_rate": 8.622641509433962e-07, "logits/chosen": 0.8232421875, "logits/rejected": 2.2060546875, "logps/chosen": -543.5, "logps/rejected": -1994.0, "loss": 0.7978, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0882568359375, "rewards/margins": 2.03955078125, "rewards/rejected": -1.95538330078125, "step": 458 }, { "epoch": 0.08673059662714347, "grad_norm": 6658.997381024349, "learning_rate": 8.641509433962264e-07, "logits/chosen": 1.56689453125, "logits/rejected": 1.4404296875, "logps/chosen": -613.5, "logps/rejected": -14182.25, "loss": 0.9048, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0124969482421875, "rewards/margins": 0.51513671875, "rewards/rejected": -0.5029296875, "step": 459 }, { "epoch": 0.08691955217535074, "grad_norm": 2.1530010102187096, "learning_rate": 8.660377358490565e-07, "logits/chosen": 0.44793701171875, "logits/rejected": 0.5224609375, "logps/chosen": -628.0, "logps/rejected": -1014.0, "loss": 0.7772, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2061767578125, "rewards/margins": 1.7119140625, "rewards/rejected": -1.5078125, "step": 460 }, { "epoch": 0.08710850772355803, "grad_norm": 2.4928624092702685, "learning_rate": 8.679245283018868e-07, "logits/chosen": 0.177734375, "logits/rejected": 0.83477783203125, "logps/chosen": -851.0, "logps/rejected": -1277.5, "loss": 0.8145, "rewards/accuracies": 0.75, "rewards/chosen": 0.1490478515625, "rewards/margins": 1.42578125, "rewards/rejected": -1.2724609375, "step": 461 }, { "epoch": 0.08729746327176531, "grad_norm": 2.5359819833018697, "learning_rate": 8.698113207547169e-07, "logits/chosen": 1.1904296875, "logits/rejected": 1.912109375, "logps/chosen": -971.0, "logps/rejected": -960.0, "loss": 0.7376, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5003662109375, "rewards/margins": 1.6640625, "rewards/rejected": -1.1669921875, "step": 462 }, { "epoch": 0.0874864188199726, "grad_norm": 1.94462430782965, "learning_rate": 8.716981132075472e-07, "logits/chosen": 1.31005859375, "logits/rejected": 1.80322265625, "logps/chosen": -612.5, "logps/rejected": -557.5, "loss": 0.7965, "rewards/accuracies": 0.75, "rewards/chosen": 0.210845947265625, "rewards/margins": 1.16796875, "rewards/rejected": -0.9541015625, "step": 463 }, { "epoch": 0.08767537436817989, "grad_norm": 1.876570189570718, "learning_rate": 8.735849056603773e-07, "logits/chosen": 1.392578125, "logits/rejected": 1.487060546875, "logps/chosen": -439.5, "logps/rejected": -908.5, "loss": 0.7888, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3221435546875, "rewards/margins": 1.4931640625, "rewards/rejected": -1.169677734375, "step": 464 }, { "epoch": 0.08786432991638717, "grad_norm": 2.541580515846274, "learning_rate": 8.754716981132075e-07, "logits/chosen": 1.185546875, "logits/rejected": 1.4619140625, "logps/chosen": -866.0, "logps/rejected": -1341.5, "loss": 0.7336, "rewards/accuracies": 0.84375, "rewards/chosen": 0.17926025390625, "rewards/margins": 1.4375, "rewards/rejected": -1.25390625, "step": 465 }, { "epoch": 0.08805328546459446, "grad_norm": 2.8499626350009035, "learning_rate": 8.773584905660378e-07, "logits/chosen": 0.677734375, "logits/rejected": 1.447265625, "logps/chosen": -1002.0, "logps/rejected": -1246.0, "loss": 0.7778, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1226806640625, "rewards/margins": 1.375, "rewards/rejected": -1.25390625, "step": 466 }, { "epoch": 0.08824224101280174, "grad_norm": 2.364094921173164, "learning_rate": 8.792452830188679e-07, "logits/chosen": 0.63916015625, "logits/rejected": 0.5703125, "logps/chosen": -985.0, "logps/rejected": -10871.0, "loss": 0.7776, "rewards/accuracies": 0.75, "rewards/chosen": 0.04962158203125, "rewards/margins": 23.7783203125, "rewards/rejected": -23.6669921875, "step": 467 }, { "epoch": 0.08843119656100902, "grad_norm": 2.3736802905535366, "learning_rate": 8.81132075471698e-07, "logits/chosen": 1.15087890625, "logits/rejected": 1.33349609375, "logps/chosen": -660.5, "logps/rejected": -634.0, "loss": 0.832, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04193115234375, "rewards/margins": 0.95458984375, "rewards/rejected": -0.91015625, "step": 468 }, { "epoch": 0.08862015210921631, "grad_norm": 2.3679891821987464, "learning_rate": 8.830188679245283e-07, "logits/chosen": 0.87158203125, "logits/rejected": 0.691650390625, "logps/chosen": -571.0, "logps/rejected": -890.5, "loss": 0.8025, "rewards/accuracies": 0.75, "rewards/chosen": 0.05914306640625, "rewards/margins": 0.99609375, "rewards/rejected": -0.9365234375, "step": 469 }, { "epoch": 0.08880910765742359, "grad_norm": 1.8771800826981118, "learning_rate": 8.849056603773585e-07, "logits/chosen": 1.185546875, "logits/rejected": 1.5205078125, "logps/chosen": -752.5, "logps/rejected": -1070.5, "loss": 0.7374, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3511962890625, "rewards/margins": 2.029296875, "rewards/rejected": -1.6806640625, "step": 470 }, { "epoch": 0.08899806320563088, "grad_norm": 2.0112030222424235, "learning_rate": 8.867924528301887e-07, "logits/chosen": 0.320068359375, "logits/rejected": 1.1484375, "logps/chosen": -524.0, "logps/rejected": -476.0, "loss": 0.7888, "rewards/accuracies": 0.78125, "rewards/chosen": 0.045501708984375, "rewards/margins": 1.16015625, "rewards/rejected": -1.1142578125, "step": 471 }, { "epoch": 0.08918701875383817, "grad_norm": 2.0005233042654815, "learning_rate": 8.886792452830189e-07, "logits/chosen": 0.076171875, "logits/rejected": 0.85009765625, "logps/chosen": -461.75, "logps/rejected": -405.5, "loss": 0.7982, "rewards/accuracies": 0.875, "rewards/chosen": 0.135345458984375, "rewards/margins": 0.9521484375, "rewards/rejected": -0.8173828125, "step": 472 }, { "epoch": 0.08937597430204544, "grad_norm": 2.1812044395373524, "learning_rate": 8.90566037735849e-07, "logits/chosen": 0.3896484375, "logits/rejected": 0.626220703125, "logps/chosen": -636.5, "logps/rejected": -591.0, "loss": 0.7885, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11492919921875, "rewards/margins": 1.1806640625, "rewards/rejected": -1.0625, "step": 473 }, { "epoch": 0.08956492985025273, "grad_norm": 2.0171591486106513, "learning_rate": 8.924528301886793e-07, "logits/chosen": 0.97265625, "logits/rejected": 1.84375, "logps/chosen": -571.5, "logps/rejected": -656.0, "loss": 0.8239, "rewards/accuracies": 0.75, "rewards/chosen": 0.304290771484375, "rewards/margins": 1.1728515625, "rewards/rejected": -0.8681640625, "step": 474 }, { "epoch": 0.08975388539846, "grad_norm": 2.744026697493932, "learning_rate": 8.943396226415094e-07, "logits/chosen": 1.8876953125, "logits/rejected": 2.8466796875, "logps/chosen": -604.0, "logps/rejected": -1805.0, "loss": 0.7519, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1485595703125, "rewards/margins": 1.5908203125, "rewards/rejected": -1.4384765625, "step": 475 }, { "epoch": 0.0899428409466673, "grad_norm": 2.083456525913322, "learning_rate": 8.962264150943396e-07, "logits/chosen": 0.791748046875, "logits/rejected": 1.23583984375, "logps/chosen": -592.5, "logps/rejected": -634.5, "loss": 0.8764, "rewards/accuracies": 0.59375, "rewards/chosen": 0.021728515625, "rewards/margins": 0.822021484375, "rewards/rejected": -0.802459716796875, "step": 476 }, { "epoch": 0.09013179649487459, "grad_norm": 2.038123038963144, "learning_rate": 8.981132075471698e-07, "logits/chosen": 1.010986328125, "logits/rejected": 2.126953125, "logps/chosen": -994.0, "logps/rejected": -1191.5, "loss": 0.7484, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47235107421875, "rewards/margins": 2.171875, "rewards/rejected": -1.69384765625, "step": 477 }, { "epoch": 0.09032075204308186, "grad_norm": 2.1107543145570133, "learning_rate": 9e-07, "logits/chosen": 0.005859375, "logits/rejected": 0.97802734375, "logps/chosen": -665.0, "logps/rejected": -1317.0, "loss": 0.7663, "rewards/accuracies": 0.84375, "rewards/chosen": 0.01393890380859375, "rewards/margins": 1.55078125, "rewards/rejected": -1.5361328125, "step": 478 }, { "epoch": 0.09050970759128915, "grad_norm": 2.2597900097141577, "learning_rate": 9.018867924528302e-07, "logits/chosen": 1.50390625, "logits/rejected": 2.10546875, "logps/chosen": -827.5, "logps/rejected": -1468.5, "loss": 0.78, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1473388671875, "rewards/margins": 1.744140625, "rewards/rejected": -1.5908203125, "step": 479 }, { "epoch": 0.09069866313949643, "grad_norm": 2.230878604361403, "learning_rate": 9.037735849056604e-07, "logits/chosen": 0.910400390625, "logits/rejected": 1.74951171875, "logps/chosen": -1055.0, "logps/rejected": -1227.0, "loss": 0.7206, "rewards/accuracies": 0.875, "rewards/chosen": 0.55303955078125, "rewards/margins": 2.1748046875, "rewards/rejected": -1.6201171875, "step": 480 }, { "epoch": 0.09088761868770372, "grad_norm": 3.7377681559034754, "learning_rate": 9.056603773584905e-07, "logits/chosen": 0.104248046875, "logits/rejected": 0.96533203125, "logps/chosen": -1048.0, "logps/rejected": -1218.5, "loss": 0.7706, "rewards/accuracies": 0.71875, "rewards/chosen": 0.291015625, "rewards/margins": 2.013671875, "rewards/rejected": -1.724609375, "step": 481 }, { "epoch": 0.091076574235911, "grad_norm": 2.056273953279526, "learning_rate": 9.075471698113208e-07, "logits/chosen": 0.916259765625, "logits/rejected": 0.98431396484375, "logps/chosen": -964.0, "logps/rejected": -863.0, "loss": 0.75, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6400146484375, "rewards/margins": 1.8173828125, "rewards/rejected": -1.1806640625, "step": 482 }, { "epoch": 0.09126552978411828, "grad_norm": 2.1301620798432075, "learning_rate": 9.094339622641509e-07, "logits/chosen": 1.05517578125, "logits/rejected": 1.08544921875, "logps/chosen": -655.0, "logps/rejected": -527.5, "loss": 0.8394, "rewards/accuracies": 0.65625, "rewards/chosen": 0.09136962890625, "rewards/margins": 0.818359375, "rewards/rejected": -0.7265625, "step": 483 }, { "epoch": 0.09145448533232557, "grad_norm": 2.2522058771899847, "learning_rate": 9.113207547169811e-07, "logits/chosen": 1.900390625, "logits/rejected": 2.748046875, "logps/chosen": -962.0, "logps/rejected": -1569.0, "loss": 0.7401, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4632568359375, "rewards/margins": 2.2626953125, "rewards/rejected": -1.80078125, "step": 484 }, { "epoch": 0.09164344088053286, "grad_norm": 2.462736139770739, "learning_rate": 9.132075471698113e-07, "logits/chosen": 0.953125, "logits/rejected": 1.1708984375, "logps/chosen": -1148.0, "logps/rejected": -920.0, "loss": 0.7053, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4884033203125, "rewards/margins": 2.369140625, "rewards/rejected": -1.8779296875, "step": 485 }, { "epoch": 0.09183239642874014, "grad_norm": 2.319491229845784, "learning_rate": 9.150943396226415e-07, "logits/chosen": 0.2529296875, "logits/rejected": 0.5311279296875, "logps/chosen": -785.25, "logps/rejected": -734.5, "loss": 0.7436, "rewards/accuracies": 0.875, "rewards/chosen": 0.27880859375, "rewards/margins": 1.53125, "rewards/rejected": -1.251953125, "step": 486 }, { "epoch": 0.09202135197694743, "grad_norm": 2.2925848545701437, "learning_rate": 9.169811320754717e-07, "logits/chosen": 0.90673828125, "logits/rejected": 1.3369140625, "logps/chosen": -988.5, "logps/rejected": -1133.0, "loss": 0.7925, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4595947265625, "rewards/margins": 1.83984375, "rewards/rejected": -1.37890625, "step": 487 }, { "epoch": 0.0922103075251547, "grad_norm": 2.0403071686422187, "learning_rate": 9.188679245283019e-07, "logits/chosen": 1.166015625, "logits/rejected": 1.5390625, "logps/chosen": -852.5, "logps/rejected": -674.0, "loss": 0.8043, "rewards/accuracies": 0.6875, "rewards/chosen": 0.165771484375, "rewards/margins": 1.0458984375, "rewards/rejected": -0.87939453125, "step": 488 }, { "epoch": 0.09239926307336199, "grad_norm": 2.433563021766201, "learning_rate": 9.20754716981132e-07, "logits/chosen": 1.72021484375, "logits/rejected": 2.2666015625, "logps/chosen": -1147.0, "logps/rejected": -931.0, "loss": 0.7752, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0203857421875, "rewards/margins": 1.3388671875, "rewards/rejected": -1.3212890625, "step": 489 }, { "epoch": 0.09258821862156928, "grad_norm": 2.323144195532946, "learning_rate": 9.226415094339623e-07, "logits/chosen": 1.49609375, "logits/rejected": 2.46630859375, "logps/chosen": -719.0, "logps/rejected": -987.0, "loss": 0.756, "rewards/accuracies": 0.90625, "rewards/chosen": 0.00823974609375, "rewards/margins": 1.4267578125, "rewards/rejected": -1.41796875, "step": 490 }, { "epoch": 0.09277717416977656, "grad_norm": 2.5914339228804018, "learning_rate": 9.245283018867924e-07, "logits/chosen": 1.119384765625, "logits/rejected": 2.09228515625, "logps/chosen": -776.5, "logps/rejected": -819.0, "loss": 0.7459, "rewards/accuracies": 0.75, "rewards/chosen": 0.22711181640625, "rewards/margins": 1.5087890625, "rewards/rejected": -1.28125, "step": 491 }, { "epoch": 0.09296612971798385, "grad_norm": 1.986294034387935, "learning_rate": 9.264150943396226e-07, "logits/chosen": -0.026611328125, "logits/rejected": -0.4244384765625, "logps/chosen": -508.0, "logps/rejected": -465.0, "loss": 0.7974, "rewards/accuracies": 0.78125, "rewards/chosen": 0.155517578125, "rewards/margins": 0.9873046875, "rewards/rejected": -0.83203125, "step": 492 }, { "epoch": 0.09315508526619112, "grad_norm": 2.0969968589558547, "learning_rate": 9.283018867924528e-07, "logits/chosen": 1.927001953125, "logits/rejected": 2.2080078125, "logps/chosen": -758.0, "logps/rejected": -689.5, "loss": 0.8229, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2501220703125, "rewards/margins": 1.08251953125, "rewards/rejected": -0.8291015625, "step": 493 }, { "epoch": 0.09334404081439841, "grad_norm": 2.3100008446352667, "learning_rate": 9.30188679245283e-07, "logits/chosen": 1.27508544921875, "logits/rejected": 1.291259765625, "logps/chosen": -910.5, "logps/rejected": -760.0, "loss": 0.7798, "rewards/accuracies": 0.78125, "rewards/chosen": 0.49273681640625, "rewards/margins": 1.30810546875, "rewards/rejected": -0.818359375, "step": 494 }, { "epoch": 0.0935329963626057, "grad_norm": 2.510163959033215, "learning_rate": 9.320754716981132e-07, "logits/chosen": 0.41082763671875, "logits/rejected": 1.6728515625, "logps/chosen": -1032.0, "logps/rejected": -1291.0, "loss": 0.7049, "rewards/accuracies": 0.875, "rewards/chosen": 0.40869140625, "rewards/margins": 2.099609375, "rewards/rejected": -1.69140625, "step": 495 }, { "epoch": 0.09372195191081298, "grad_norm": 2.3871772260573767, "learning_rate": 9.339622641509434e-07, "logits/chosen": 0.6025390625, "logits/rejected": 1.02734375, "logps/chosen": -747.0, "logps/rejected": -1300.0, "loss": 0.7304, "rewards/accuracies": 0.84375, "rewards/chosen": 0.231201171875, "rewards/margins": 1.8125, "rewards/rejected": -1.5810546875, "step": 496 }, { "epoch": 0.09391090745902027, "grad_norm": 2.479351732501454, "learning_rate": 9.358490566037735e-07, "logits/chosen": 0.37890625, "logits/rejected": 0.52587890625, "logps/chosen": -741.0, "logps/rejected": -674.0, "loss": 0.7599, "rewards/accuracies": 0.875, "rewards/chosen": 0.1869964599609375, "rewards/margins": 1.302734375, "rewards/rejected": -1.1162109375, "step": 497 }, { "epoch": 0.09409986300722754, "grad_norm": 2.0237532830877303, "learning_rate": 9.377358490566038e-07, "logits/chosen": 1.25927734375, "logits/rejected": 1.490234375, "logps/chosen": -646.5, "logps/rejected": -603.0, "loss": 0.9144, "rewards/accuracies": 0.625, "rewards/chosen": -0.19207763671875, "rewards/margins": 0.611572265625, "rewards/rejected": -0.8037109375, "step": 498 }, { "epoch": 0.09428881855543483, "grad_norm": 2.11977211351733, "learning_rate": 9.396226415094339e-07, "logits/chosen": 1.4482421875, "logits/rejected": 1.62890625, "logps/chosen": -869.0, "logps/rejected": -825.0, "loss": 0.7593, "rewards/accuracies": 0.90625, "rewards/chosen": 0.24560546875, "rewards/margins": 1.48046875, "rewards/rejected": -1.234375, "step": 499 }, { "epoch": 0.09447777410364212, "grad_norm": 2.011074912573527, "learning_rate": 9.415094339622641e-07, "logits/chosen": 0.53271484375, "logits/rejected": 1.287109375, "logps/chosen": -723.0, "logps/rejected": -888.0, "loss": 0.7995, "rewards/accuracies": 0.75, "rewards/chosen": 0.26495361328125, "rewards/margins": 1.841796875, "rewards/rejected": -1.578125, "step": 500 }, { "epoch": 0.0946667296518494, "grad_norm": 2.2948926741886724, "learning_rate": 9.433962264150943e-07, "logits/chosen": -0.271240234375, "logits/rejected": -0.36279296875, "logps/chosen": -913.0, "logps/rejected": -646.0, "loss": 0.7775, "rewards/accuracies": 0.75, "rewards/chosen": 0.2567138671875, "rewards/margins": 1.404296875, "rewards/rejected": -1.1474609375, "step": 501 }, { "epoch": 0.09485568520005669, "grad_norm": 2.3466528306575656, "learning_rate": 9.452830188679245e-07, "logits/chosen": 0.3447265625, "logits/rejected": 0.6893310546875, "logps/chosen": -624.5, "logps/rejected": -743.5, "loss": 0.8436, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1352996826171875, "rewards/margins": 1.16455078125, "rewards/rejected": -1.0283203125, "step": 502 }, { "epoch": 0.09504464074826396, "grad_norm": 2.292908369633121, "learning_rate": 9.471698113207547e-07, "logits/chosen": 0.013671875, "logits/rejected": 0.5330810546875, "logps/chosen": -792.5, "logps/rejected": -713.5, "loss": 0.8229, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15869140625, "rewards/margins": 1.1591796875, "rewards/rejected": -1.0009765625, "step": 503 }, { "epoch": 0.09523359629647125, "grad_norm": 2.2770245185930906, "learning_rate": 9.490566037735849e-07, "logits/chosen": 1.17431640625, "logits/rejected": 0.932373046875, "logps/chosen": -624.0, "logps/rejected": -526.5, "loss": 0.8594, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13525390625, "rewards/margins": 0.79296875, "rewards/rejected": -0.658203125, "step": 504 }, { "epoch": 0.09542255184467854, "grad_norm": 2.6084509458357665, "learning_rate": 9.50943396226415e-07, "logits/chosen": 0.1962890625, "logits/rejected": 0.21856689453125, "logps/chosen": -1116.0, "logps/rejected": -1086.0, "loss": 0.7136, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7518310546875, "rewards/margins": 2.0234375, "rewards/rejected": -1.26953125, "step": 505 }, { "epoch": 0.09561150739288582, "grad_norm": 2.4496740690179513, "learning_rate": 9.528301886792453e-07, "logits/chosen": 1.0048828125, "logits/rejected": 1.029296875, "logps/chosen": -1079.0, "logps/rejected": -875.0, "loss": 0.7702, "rewards/accuracies": 0.75, "rewards/chosen": 0.5615234375, "rewards/margins": 1.7724609375, "rewards/rejected": -1.209716796875, "step": 506 }, { "epoch": 0.09580046294109311, "grad_norm": 2.1639203505861646, "learning_rate": 9.547169811320755e-07, "logits/chosen": 0.916015625, "logits/rejected": 0.831787109375, "logps/chosen": -812.0, "logps/rejected": -628.0, "loss": 0.7507, "rewards/accuracies": 0.875, "rewards/chosen": 0.2193450927734375, "rewards/margins": 1.337890625, "rewards/rejected": -1.119140625, "step": 507 }, { "epoch": 0.0959894184893004, "grad_norm": 2.1382203901570316, "learning_rate": 9.566037735849056e-07, "logits/chosen": 0.7587890625, "logits/rejected": 1.001953125, "logps/chosen": -948.0, "logps/rejected": -1009.5, "loss": 0.6845, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6260986328125, "rewards/margins": 2.14453125, "rewards/rejected": -1.515625, "step": 508 }, { "epoch": 0.09617837403750767, "grad_norm": 2.325488500413768, "learning_rate": 9.584905660377358e-07, "logits/chosen": 1.28515625, "logits/rejected": 1.078125, "logps/chosen": -639.0, "logps/rejected": -551.0, "loss": 0.8315, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19287109375, "rewards/margins": 0.9228515625, "rewards/rejected": -0.73095703125, "step": 509 }, { "epoch": 0.09636732958571496, "grad_norm": 1.977610531723706, "learning_rate": 9.60377358490566e-07, "logits/chosen": 1.4853515625, "logits/rejected": 2.23828125, "logps/chosen": -677.0, "logps/rejected": -939.0, "loss": 0.758, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3087158203125, "rewards/margins": 1.6376953125, "rewards/rejected": -1.32421875, "step": 510 }, { "epoch": 0.09655628513392224, "grad_norm": 1.8292940889546307, "learning_rate": 9.622641509433961e-07, "logits/chosen": 0.6513671875, "logits/rejected": 0.519775390625, "logps/chosen": -634.5, "logps/rejected": -640.5, "loss": 0.8029, "rewards/accuracies": 0.84375, "rewards/chosen": 0.22991943359375, "rewards/margins": 1.31689453125, "rewards/rejected": -1.087890625, "step": 511 }, { "epoch": 0.09674524068212953, "grad_norm": 2.4595706019244363, "learning_rate": 9.641509433962264e-07, "logits/chosen": 0.52880859375, "logits/rejected": 0.861328125, "logps/chosen": -1009.0, "logps/rejected": -913.0, "loss": 0.7592, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2442626953125, "rewards/margins": 1.98046875, "rewards/rejected": -1.7373046875, "step": 512 }, { "epoch": 0.09693419623033682, "grad_norm": 5.2659720866111375, "learning_rate": 9.660377358490566e-07, "logits/chosen": 0.8946533203125, "logits/rejected": 1.3759765625, "logps/chosen": -932.5, "logps/rejected": -1311.5, "loss": 0.7623, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01312255859375, "rewards/margins": 1.974609375, "rewards/rejected": -1.96484375, "step": 513 }, { "epoch": 0.0971231517785441, "grad_norm": 1.8751305802953542, "learning_rate": 9.679245283018867e-07, "logits/chosen": 1.06640625, "logits/rejected": 1.38720703125, "logps/chosen": -1075.5, "logps/rejected": -611.5, "loss": 0.8021, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5807037353515625, "rewards/margins": 0.65283203125, "rewards/rejected": -1.2333984375, "step": 514 }, { "epoch": 0.09731210732675138, "grad_norm": 2.280578684672403, "learning_rate": 9.69811320754717e-07, "logits/chosen": -0.072021484375, "logits/rejected": 0.417724609375, "logps/chosen": -699.5, "logps/rejected": -885.0, "loss": 0.7466, "rewards/accuracies": 0.75, "rewards/chosen": 0.059814453125, "rewards/margins": 1.806640625, "rewards/rejected": -1.74609375, "step": 515 }, { "epoch": 0.09750106287495866, "grad_norm": 2.4923967854438525, "learning_rate": 9.716981132075472e-07, "logits/chosen": 0.0546875, "logits/rejected": 0.78759765625, "logps/chosen": -1074.0, "logps/rejected": -1571.0, "loss": 0.7293, "rewards/accuracies": 0.875, "rewards/chosen": 0.406005859375, "rewards/margins": 2.244140625, "rewards/rejected": -1.83984375, "step": 516 }, { "epoch": 0.09769001842316595, "grad_norm": 1.9050224247627028, "learning_rate": 9.735849056603772e-07, "logits/chosen": 1.00341796875, "logits/rejected": 1.1494140625, "logps/chosen": -914.0, "logps/rejected": -537.5, "loss": 0.8971, "rewards/accuracies": 0.6875, "rewards/chosen": -0.500732421875, "rewards/margins": 0.22998046875, "rewards/rejected": -0.732666015625, "step": 517 }, { "epoch": 0.09787897397137324, "grad_norm": 2.2521289197165633, "learning_rate": 9.754716981132075e-07, "logits/chosen": 0.34893798828125, "logits/rejected": 0.8524169921875, "logps/chosen": -642.0, "logps/rejected": -768.0, "loss": 0.7736, "rewards/accuracies": 0.84375, "rewards/chosen": 0.186767578125, "rewards/margins": 1.2548828125, "rewards/rejected": -1.0703125, "step": 518 }, { "epoch": 0.09806792951958052, "grad_norm": 1.9976241539041752, "learning_rate": 9.773584905660377e-07, "logits/chosen": 0.781982421875, "logits/rejected": 1.44189453125, "logps/chosen": -627.0, "logps/rejected": -719.0, "loss": 0.7722, "rewards/accuracies": 0.875, "rewards/chosen": 0.1065673828125, "rewards/margins": 1.40234375, "rewards/rejected": -1.298828125, "step": 519 }, { "epoch": 0.0982568850677878, "grad_norm": 2.149268206166489, "learning_rate": 9.792452830188678e-07, "logits/chosen": 0.6044921875, "logits/rejected": 1.0126953125, "logps/chosen": -713.5, "logps/rejected": -750.0, "loss": 0.7494, "rewards/accuracies": 0.75, "rewards/chosen": 0.299560546875, "rewards/margins": 1.66796875, "rewards/rejected": -1.369140625, "step": 520 }, { "epoch": 0.09844584061599508, "grad_norm": 1.928496208342038, "learning_rate": 9.81132075471698e-07, "logits/chosen": 0.7971038818359375, "logits/rejected": 0.677734375, "logps/chosen": -592.5, "logps/rejected": -493.5, "loss": 0.832, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10601806640625, "rewards/margins": 0.97802734375, "rewards/rejected": -0.870849609375, "step": 521 }, { "epoch": 0.09863479616420237, "grad_norm": 2.3928622916226523, "learning_rate": 9.830188679245283e-07, "logits/chosen": 1.26513671875, "logits/rejected": 2.259765625, "logps/chosen": -1031.0, "logps/rejected": -1709.0, "loss": 0.7368, "rewards/accuracies": 0.78125, "rewards/chosen": 0.292236328125, "rewards/margins": 2.42578125, "rewards/rejected": -2.1328125, "step": 522 }, { "epoch": 0.09882375171240966, "grad_norm": 1.9338899010354549, "learning_rate": 9.849056603773586e-07, "logits/chosen": 1.093994140625, "logits/rejected": 1.5009765625, "logps/chosen": -663.75, "logps/rejected": -723.0, "loss": 0.8716, "rewards/accuracies": 0.78125, "rewards/chosen": -0.15716552734375, "rewards/margins": 1.0445556640625, "rewards/rejected": -1.19970703125, "step": 523 }, { "epoch": 0.09901270726061694, "grad_norm": 2.229311075853531, "learning_rate": 9.867924528301886e-07, "logits/chosen": 1.4658203125, "logits/rejected": 1.5146484375, "logps/chosen": -838.0, "logps/rejected": -758.0, "loss": 0.7001, "rewards/accuracies": 0.96875, "rewards/chosen": 0.0308837890625, "rewards/margins": 2.048828125, "rewards/rejected": -2.01953125, "step": 524 }, { "epoch": 0.09920166280882423, "grad_norm": 2.498572695481409, "learning_rate": 9.886792452830189e-07, "logits/chosen": 0.6300048828125, "logits/rejected": 1.18017578125, "logps/chosen": -1086.0, "logps/rejected": -1075.5, "loss": 0.7462, "rewards/accuracies": 0.8125, "rewards/chosen": 0.502197265625, "rewards/margins": 1.98828125, "rewards/rejected": -1.484375, "step": 525 }, { "epoch": 0.0993906183570315, "grad_norm": 2.0019522783062604, "learning_rate": 9.90566037735849e-07, "logits/chosen": 1.14532470703125, "logits/rejected": 0.90283203125, "logps/chosen": -642.5, "logps/rejected": -423.0, "loss": 0.8508, "rewards/accuracies": 0.78125, "rewards/chosen": 0.102783203125, "rewards/margins": 0.76318359375, "rewards/rejected": -0.660614013671875, "step": 526 }, { "epoch": 0.09957957390523879, "grad_norm": 2.036645952708192, "learning_rate": 9.924528301886791e-07, "logits/chosen": 0.2161865234375, "logits/rejected": 0.2177734375, "logps/chosen": -687.0, "logps/rejected": -534.0, "loss": 0.7886, "rewards/accuracies": 0.75, "rewards/chosen": 0.2261962890625, "rewards/margins": 1.189453125, "rewards/rejected": -0.966064453125, "step": 527 }, { "epoch": 0.09976852945344608, "grad_norm": 2.149251452040631, "learning_rate": 9.943396226415094e-07, "logits/chosen": 0.9853515625, "logits/rejected": 1.73828125, "logps/chosen": -872.0, "logps/rejected": -905.0, "loss": 0.7371, "rewards/accuracies": 0.875, "rewards/chosen": 0.2532958984375, "rewards/margins": 1.8203125, "rewards/rejected": -1.5654296875, "step": 528 }, { "epoch": 0.09995748500165336, "grad_norm": 2.4702034755206244, "learning_rate": 9.962264150943397e-07, "logits/chosen": 0.7874755859375, "logits/rejected": 1.78619384765625, "logps/chosen": -1069.0, "logps/rejected": -1148.0, "loss": 0.7937, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27130126953125, "rewards/margins": 1.548828125, "rewards/rejected": -1.27587890625, "step": 529 }, { "epoch": 0.10014644054986065, "grad_norm": 2.116394256749048, "learning_rate": 9.981132075471697e-07, "logits/chosen": 0.79833984375, "logits/rejected": 0.5299072265625, "logps/chosen": -779.5, "logps/rejected": -608.0, "loss": 0.7128, "rewards/accuracies": 0.875, "rewards/chosen": 0.22393798828125, "rewards/margins": 1.736328125, "rewards/rejected": -1.513671875, "step": 530 }, { "epoch": 0.10033539609806794, "grad_norm": 2.0402103677676178, "learning_rate": 1e-06, "logits/chosen": 1.303466796875, "logits/rejected": 1.20068359375, "logps/chosen": -577.0, "logps/rejected": -558.5, "loss": 0.7842, "rewards/accuracies": 0.8125, "rewards/chosen": 0.319580078125, "rewards/margins": 1.3203125, "rewards/rejected": -1.0, "step": 531 }, { "epoch": 0.10052435164627521, "grad_norm": 2.0018269415972285, "learning_rate": 9.999999021138866e-07, "logits/chosen": 1.5509033203125, "logits/rejected": 1.760009765625, "logps/chosen": -683.0, "logps/rejected": -923.0, "loss": 0.8061, "rewards/accuracies": 0.6875, "rewards/chosen": 0.180999755859375, "rewards/margins": 1.431640625, "rewards/rejected": -1.248046875, "step": 532 }, { "epoch": 0.1007133071944825, "grad_norm": 2.0550477027957426, "learning_rate": 9.999996084555892e-07, "logits/chosen": 0.98193359375, "logits/rejected": 1.03662109375, "logps/chosen": -458.0, "logps/rejected": -395.25, "loss": 0.8891, "rewards/accuracies": 0.625, "rewards/chosen": -0.052764892578125, "rewards/margins": 0.55322265625, "rewards/rejected": -0.603515625, "step": 533 }, { "epoch": 0.10090226274268978, "grad_norm": 2.7769222971382463, "learning_rate": 9.999991190252355e-07, "logits/chosen": 0.5650634765625, "logits/rejected": 0.986572265625, "logps/chosen": -1019.0, "logps/rejected": -871.0, "loss": 0.7667, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4716796875, "rewards/margins": 1.544921875, "rewards/rejected": -1.0712890625, "step": 534 }, { "epoch": 0.10109121829089707, "grad_norm": 2.688085654111965, "learning_rate": 9.999984338230383e-07, "logits/chosen": 0.4833984375, "logits/rejected": 1.081298828125, "logps/chosen": -438.5, "logps/rejected": -504.5, "loss": 0.8016, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15679931640625, "rewards/margins": 1.013671875, "rewards/rejected": -0.85791015625, "step": 535 }, { "epoch": 0.10128017383910436, "grad_norm": 1.97818190772725, "learning_rate": 9.99997552849296e-07, "logits/chosen": 0.9105224609375, "logits/rejected": 0.9833984375, "logps/chosen": -669.5, "logps/rejected": -946.0, "loss": 0.7305, "rewards/accuracies": 0.84375, "rewards/chosen": 0.199951171875, "rewards/margins": 1.828125, "rewards/rejected": -1.62890625, "step": 536 }, { "epoch": 0.10146912938731163, "grad_norm": 2.3716742883047495, "learning_rate": 9.999964761043916e-07, "logits/chosen": 0.125732421875, "logits/rejected": 0.6240234375, "logps/chosen": -865.0, "logps/rejected": -1505.0, "loss": 0.7212, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2606201171875, "rewards/margins": 2.2392578125, "rewards/rejected": -1.9775390625, "step": 537 }, { "epoch": 0.10165808493551892, "grad_norm": 1.909635674194512, "learning_rate": 9.999952035887936e-07, "logits/chosen": 0.619140625, "logits/rejected": 1.15478515625, "logps/chosen": -842.5, "logps/rejected": -759.0, "loss": 0.7384, "rewards/accuracies": 0.75, "rewards/chosen": 0.5452880859375, "rewards/margins": 1.9228515625, "rewards/rejected": -1.376953125, "step": 538 }, { "epoch": 0.1018470404837262, "grad_norm": 2.196619856010926, "learning_rate": 9.999937353030555e-07, "logits/chosen": 0.162353515625, "logits/rejected": 1.396484375, "logps/chosen": -807.5, "logps/rejected": -1099.0, "loss": 0.748, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4190673828125, "rewards/margins": 1.849609375, "rewards/rejected": -1.4296875, "step": 539 }, { "epoch": 0.10203599603193349, "grad_norm": 2.2999176757330955, "learning_rate": 9.999920712478163e-07, "logits/chosen": 0.267578125, "logits/rejected": 0.54296875, "logps/chosen": -964.5, "logps/rejected": -812.0, "loss": 0.7287, "rewards/accuracies": 0.875, "rewards/chosen": 0.365478515625, "rewards/margins": 1.71484375, "rewards/rejected": -1.3505859375, "step": 540 }, { "epoch": 0.10222495158014078, "grad_norm": 5.313177656409165, "learning_rate": 9.999902114237997e-07, "logits/chosen": 1.4677734375, "logits/rejected": 2.345703125, "logps/chosen": -950.0, "logps/rejected": -2022.0, "loss": 0.8107, "rewards/accuracies": 0.75, "rewards/chosen": 0.2347412109375, "rewards/margins": 1.8037109375, "rewards/rejected": -1.572265625, "step": 541 }, { "epoch": 0.10241390712834805, "grad_norm": 2.237271397038368, "learning_rate": 9.999881558318152e-07, "logits/chosen": 0.123779296875, "logits/rejected": 0.38134765625, "logps/chosen": -875.0, "logps/rejected": -18558.0, "loss": 0.7524, "rewards/accuracies": 0.84375, "rewards/chosen": 0.094482421875, "rewards/margins": 52.51953125, "rewards/rejected": -52.3994140625, "step": 542 }, { "epoch": 0.10260286267655534, "grad_norm": 1.95575769069871, "learning_rate": 9.999859044727567e-07, "logits/chosen": 0.7001953125, "logits/rejected": 1.1435546875, "logps/chosen": -579.0, "logps/rejected": -635.0, "loss": 0.8145, "rewards/accuracies": 0.71875, "rewards/chosen": 0.196533203125, "rewards/margins": 0.9873046875, "rewards/rejected": -0.7900390625, "step": 543 }, { "epoch": 0.10279181822476262, "grad_norm": 2.8015606140764917, "learning_rate": 9.999834573476039e-07, "logits/chosen": 0.274169921875, "logits/rejected": 0.79931640625, "logps/chosen": -550.5, "logps/rejected": -987.0, "loss": 0.83, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0091552734375, "rewards/margins": 1.01416015625, "rewards/rejected": -1.0234375, "step": 544 }, { "epoch": 0.10298077377296991, "grad_norm": 2.302820728677797, "learning_rate": 9.999808144574213e-07, "logits/chosen": 0.23046875, "logits/rejected": 0.9755859375, "logps/chosen": -769.0, "logps/rejected": -852.0, "loss": 0.7455, "rewards/accuracies": 0.78125, "rewards/chosen": 0.300048828125, "rewards/margins": 1.63720703125, "rewards/rejected": -1.33740234375, "step": 545 }, { "epoch": 0.1031697293211772, "grad_norm": 2.330850900398744, "learning_rate": 9.999779758033584e-07, "logits/chosen": 1.2679443359375, "logits/rejected": 1.3896484375, "logps/chosen": -1018.0, "logps/rejected": -923.0, "loss": 0.7288, "rewards/accuracies": 0.78125, "rewards/chosen": 0.597900390625, "rewards/margins": 1.7548828125, "rewards/rejected": -1.1552734375, "step": 546 }, { "epoch": 0.10335868486938447, "grad_norm": 2.021192468606078, "learning_rate": 9.999749413866509e-07, "logits/chosen": 0.6123046875, "logits/rejected": 0.68988037109375, "logps/chosen": -720.5, "logps/rejected": -986.0, "loss": 0.795, "rewards/accuracies": 0.8125, "rewards/chosen": -0.024169921875, "rewards/margins": 1.333984375, "rewards/rejected": -1.3583984375, "step": 547 }, { "epoch": 0.10354764041759176, "grad_norm": 2.124268295978602, "learning_rate": 9.999717112086182e-07, "logits/chosen": 0.4840087890625, "logits/rejected": 0.65869140625, "logps/chosen": -969.5, "logps/rejected": -668.0, "loss": 0.7597, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4903564453125, "rewards/margins": 1.6201171875, "rewards/rejected": -1.1298828125, "step": 548 }, { "epoch": 0.10373659596579904, "grad_norm": 1.8815053931546006, "learning_rate": 9.99968285270666e-07, "logits/chosen": 0.857421875, "logits/rejected": 0.689453125, "logps/chosen": -903.5, "logps/rejected": -852.5, "loss": 0.7392, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5546875, "rewards/margins": 1.951171875, "rewards/rejected": -1.396484375, "step": 549 }, { "epoch": 0.10392555151400633, "grad_norm": 2.252573323780509, "learning_rate": 9.999646635742845e-07, "logits/chosen": 0.95458984375, "logits/rejected": 0.705810546875, "logps/chosen": -705.0, "logps/rejected": -519.5, "loss": 0.7894, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0535736083984375, "rewards/margins": 1.177734375, "rewards/rejected": -1.232421875, "step": 550 }, { "epoch": 0.10411450706221362, "grad_norm": 2.781849231505212, "learning_rate": 9.999608461210495e-07, "logits/chosen": 1.0958251953125, "logits/rejected": 2.48046875, "logps/chosen": -1007.0, "logps/rejected": -1910.0, "loss": 0.6637, "rewards/accuracies": 0.90625, "rewards/chosen": 0.54296875, "rewards/margins": 2.7177734375, "rewards/rejected": -2.1796875, "step": 551 }, { "epoch": 0.1043034626104209, "grad_norm": 3.1299239119329823, "learning_rate": 9.999568329126217e-07, "logits/chosen": 0.388671875, "logits/rejected": 0.8955078125, "logps/chosen": -517.5, "logps/rejected": -420.0, "loss": 0.8358, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1900634765625, "rewards/margins": 0.8486328125, "rewards/rejected": -0.66015625, "step": 552 }, { "epoch": 0.10449241815862818, "grad_norm": 2.446846696711488, "learning_rate": 9.99952623950747e-07, "logits/chosen": 1.23828125, "logits/rejected": 2.029296875, "logps/chosen": -816.5, "logps/rejected": -731.0, "loss": 0.7554, "rewards/accuracies": 0.84375, "rewards/chosen": -0.00244140625, "rewards/margins": 1.453125, "rewards/rejected": -1.45703125, "step": 553 }, { "epoch": 0.10468137370683547, "grad_norm": 2.2502620357714154, "learning_rate": 9.999482192372565e-07, "logits/chosen": 1.25537109375, "logits/rejected": 1.41448974609375, "logps/chosen": -1259.0, "logps/rejected": -1451.0, "loss": 0.6491, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8017578125, "rewards/margins": 2.748046875, "rewards/rejected": -1.9423828125, "step": 554 }, { "epoch": 0.10487032925504275, "grad_norm": 2.2898146599494984, "learning_rate": 9.999436187740668e-07, "logits/chosen": 0.8427734375, "logits/rejected": 1.2509765625, "logps/chosen": -722.0, "logps/rejected": -748.5, "loss": 0.8005, "rewards/accuracies": 0.75, "rewards/chosen": -0.020751953125, "rewards/margins": 1.1279296875, "rewards/rejected": -1.1494140625, "step": 555 }, { "epoch": 0.10505928480325004, "grad_norm": 2.2019230111881605, "learning_rate": 9.99938822563179e-07, "logits/chosen": 0.796142578125, "logits/rejected": 0.52392578125, "logps/chosen": -848.5, "logps/rejected": -1160.0, "loss": 0.7581, "rewards/accuracies": 0.78125, "rewards/chosen": 0.135009765625, "rewards/margins": 1.982421875, "rewards/rejected": -1.84765625, "step": 556 }, { "epoch": 0.10524824035145731, "grad_norm": 2.1350450027011476, "learning_rate": 9.999338306066796e-07, "logits/chosen": 0.73779296875, "logits/rejected": 1.24853515625, "logps/chosen": -714.5, "logps/rejected": -662.5, "loss": 0.767, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1455078125, "rewards/margins": 1.4375, "rewards/rejected": -1.2919921875, "step": 557 }, { "epoch": 0.1054371958996646, "grad_norm": 2.0792143212561416, "learning_rate": 9.999286429067406e-07, "logits/chosen": 0.316650390625, "logits/rejected": 0.6982421875, "logps/chosen": -594.0, "logps/rejected": -461.0, "loss": 0.7678, "rewards/accuracies": 0.8125, "rewards/chosen": 0.124755859375, "rewards/margins": 1.3623046875, "rewards/rejected": -1.2353515625, "step": 558 }, { "epoch": 0.1056261514478719, "grad_norm": 2.4817479717542, "learning_rate": 9.999232594656186e-07, "logits/chosen": 1.1953125, "logits/rejected": 1.44189453125, "logps/chosen": -912.0, "logps/rejected": -1220.5, "loss": 0.7604, "rewards/accuracies": 0.71875, "rewards/chosen": 0.237060546875, "rewards/margins": 1.7392578125, "rewards/rejected": -1.5029296875, "step": 559 }, { "epoch": 0.10581510699607917, "grad_norm": 2.2556916343069617, "learning_rate": 9.99917680285656e-07, "logits/chosen": 1.074462890625, "logits/rejected": 1.181640625, "logps/chosen": -979.0, "logps/rejected": -823.0, "loss": 0.6654, "rewards/accuracies": 0.875, "rewards/chosen": 0.4913330078125, "rewards/margins": 1.966796875, "rewards/rejected": -1.474609375, "step": 560 }, { "epoch": 0.10600406254428646, "grad_norm": 2.3441595454930026, "learning_rate": 9.9991190536928e-07, "logits/chosen": 1.0596923828125, "logits/rejected": 1.892578125, "logps/chosen": -738.0, "logps/rejected": -1760.0, "loss": 0.7696, "rewards/accuracies": 0.71875, "rewards/chosen": 0.13531494140625, "rewards/margins": 2.302734375, "rewards/rejected": -2.169921875, "step": 561 }, { "epoch": 0.10619301809249374, "grad_norm": 1.8901981871522182, "learning_rate": 9.999059347190027e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.23681640625, "logps/chosen": -551.5, "logps/rejected": -855.0, "loss": 0.7693, "rewards/accuracies": 0.75, "rewards/chosen": 0.2400970458984375, "rewards/margins": 1.4248046875, "rewards/rejected": -1.18359375, "step": 562 }, { "epoch": 0.10638197364070102, "grad_norm": 2.084548370727907, "learning_rate": 9.998997683374217e-07, "logits/chosen": 0.25830078125, "logits/rejected": 1.3759765625, "logps/chosen": -795.5, "logps/rejected": -1862.0, "loss": 0.6531, "rewards/accuracies": 0.9375, "rewards/chosen": 0.664306640625, "rewards/margins": 3.24609375, "rewards/rejected": -2.58203125, "step": 563 }, { "epoch": 0.10657092918890831, "grad_norm": 2.2368150339225226, "learning_rate": 9.9989340622722e-07, "logits/chosen": 0.276641845703125, "logits/rejected": 0.7841796875, "logps/chosen": -802.0, "logps/rejected": -15489.0, "loss": 0.7395, "rewards/accuracies": 0.875, "rewards/chosen": 0.2220458984375, "rewards/margins": 33.681640625, "rewards/rejected": -33.6328125, "step": 564 }, { "epoch": 0.10675988473711559, "grad_norm": 2.2165721713105744, "learning_rate": 9.998868483911652e-07, "logits/chosen": 1.119140625, "logits/rejected": 0.5556640625, "logps/chosen": -1475.0, "logps/rejected": -901.5, "loss": 0.5934, "rewards/accuracies": 1.0, "rewards/chosen": 0.669921875, "rewards/margins": 2.455078125, "rewards/rejected": -1.78125, "step": 565 }, { "epoch": 0.10694884028532288, "grad_norm": 2.217617211667235, "learning_rate": 9.9988009483211e-07, "logits/chosen": 0.1221923828125, "logits/rejected": 0.5915069580078125, "logps/chosen": -695.0, "logps/rejected": -669.5, "loss": 0.7861, "rewards/accuracies": 0.75, "rewards/chosen": 0.083740234375, "rewards/margins": 1.3291015625, "rewards/rejected": -1.24609375, "step": 566 }, { "epoch": 0.10713779583353016, "grad_norm": 2.3763063335516916, "learning_rate": 9.998731455529931e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.80810546875, "logps/chosen": -784.0, "logps/rejected": -743.0, "loss": 0.7869, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2598876953125, "rewards/margins": 1.451171875, "rewards/rejected": -1.193359375, "step": 567 }, { "epoch": 0.10732675138173745, "grad_norm": 2.1232647104048556, "learning_rate": 9.998660005568374e-07, "logits/chosen": 0.831787109375, "logits/rejected": 1.5341796875, "logps/chosen": -751.0, "logps/rejected": -799.5, "loss": 0.8005, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1556396484375, "rewards/margins": 1.4287109375, "rewards/rejected": -1.2734375, "step": 568 }, { "epoch": 0.10751570692994473, "grad_norm": 2.0519392295034313, "learning_rate": 9.998586598467515e-07, "logits/chosen": 0.896484375, "logits/rejected": 0.9931640625, "logps/chosen": -641.5, "logps/rejected": -514.5, "loss": 0.7855, "rewards/accuracies": 0.84375, "rewards/chosen": 0.11572265625, "rewards/margins": 1.236328125, "rewards/rejected": -1.1171875, "step": 569 }, { "epoch": 0.10770466247815201, "grad_norm": 2.605035116000079, "learning_rate": 9.998511234259286e-07, "logits/chosen": 0.64404296875, "logits/rejected": 1.14453125, "logps/chosen": -711.5, "logps/rejected": -1082.5, "loss": 0.7294, "rewards/accuracies": 0.84375, "rewards/chosen": 0.28265380859375, "rewards/margins": 1.865234375, "rewards/rejected": -1.58203125, "step": 570 }, { "epoch": 0.1078936180263593, "grad_norm": 1.8371368305402918, "learning_rate": 9.998433912976481e-07, "logits/chosen": 1.359375, "logits/rejected": 1.6650390625, "logps/chosen": -1023.0, "logps/rejected": -1237.5, "loss": 0.7284, "rewards/accuracies": 0.875, "rewards/chosen": 0.580047607421875, "rewards/margins": 2.46484375, "rewards/rejected": -1.8896484375, "step": 571 }, { "epoch": 0.10808257357456659, "grad_norm": 2.1289053693801985, "learning_rate": 9.998354634652732e-07, "logits/chosen": 1.365234375, "logits/rejected": 1.517578125, "logps/chosen": -655.0, "logps/rejected": -580.5, "loss": 0.7904, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2249755859375, "rewards/margins": 1.23974609375, "rewards/rejected": -1.0126953125, "step": 572 }, { "epoch": 0.10827152912277387, "grad_norm": 2.2469438767965566, "learning_rate": 9.998273399322533e-07, "logits/chosen": 1.951171875, "logits/rejected": 0.8505859375, "logps/chosen": -705.0, "logps/rejected": -783.0, "loss": 0.8063, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21240234375, "rewards/margins": 1.22705078125, "rewards/rejected": -1.013671875, "step": 573 }, { "epoch": 0.10846048467098116, "grad_norm": 1.984051890698667, "learning_rate": 9.998190207021222e-07, "logits/chosen": 0.84814453125, "logits/rejected": 1.584228515625, "logps/chosen": -681.5, "logps/rejected": -774.0, "loss": 0.7327, "rewards/accuracies": 0.75, "rewards/chosen": 0.170989990234375, "rewards/margins": 1.748046875, "rewards/rejected": -1.57421875, "step": 574 }, { "epoch": 0.10864944021918843, "grad_norm": 3.5063291120470823, "learning_rate": 9.998105057784995e-07, "logits/chosen": 0.49169921875, "logits/rejected": 1.1552734375, "logps/chosen": -629.5, "logps/rejected": -665.75, "loss": 0.8233, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0595703125, "rewards/margins": 1.26171875, "rewards/rejected": -1.3203125, "step": 575 }, { "epoch": 0.10883839576739572, "grad_norm": 2.3146063594200608, "learning_rate": 9.998017951650895e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.00390625, "logps/chosen": -793.0, "logps/rejected": -718.5, "loss": 0.7754, "rewards/accuracies": 0.8125, "rewards/chosen": 0.077392578125, "rewards/margins": 1.396484375, "rewards/rejected": -1.3203125, "step": 576 }, { "epoch": 0.10902735131560301, "grad_norm": 2.1954388212094775, "learning_rate": 9.997928888656816e-07, "logits/chosen": 1.154296875, "logits/rejected": 1.6279296875, "logps/chosen": -633.0, "logps/rejected": -1158.5, "loss": 0.766, "rewards/accuracies": 0.8125, "rewards/chosen": 0.173553466796875, "rewards/margins": 1.82421875, "rewards/rejected": -1.654296875, "step": 577 }, { "epoch": 0.10921630686381029, "grad_norm": 1.939683881570826, "learning_rate": 9.997837868841508e-07, "logits/chosen": 1.638671875, "logits/rejected": 2.5185546875, "logps/chosen": -499.0, "logps/rejected": -1315.0, "loss": 0.7613, "rewards/accuracies": 0.8125, "rewards/chosen": 0.152099609375, "rewards/margins": 2.201171875, "rewards/rejected": -2.0498046875, "step": 578 }, { "epoch": 0.10940526241201758, "grad_norm": 2.079138048358559, "learning_rate": 9.997744892244568e-07, "logits/chosen": 0.333984375, "logits/rejected": 0.278564453125, "logps/chosen": -524.5, "logps/rejected": -596.0, "loss": 0.7782, "rewards/accuracies": 0.84375, "rewards/chosen": -0.028961181640625, "rewards/margins": 1.2294921875, "rewards/rejected": -1.2578125, "step": 579 }, { "epoch": 0.10959421796022485, "grad_norm": 1.9039265583073928, "learning_rate": 9.99764995890644e-07, "logits/chosen": 0.464599609375, "logits/rejected": 1.03369140625, "logps/chosen": -896.5, "logps/rejected": -962.5, "loss": 0.8489, "rewards/accuracies": 0.5625, "rewards/chosen": -0.072906494140625, "rewards/margins": 1.5341796875, "rewards/rejected": -1.6083984375, "step": 580 }, { "epoch": 0.10978317350843214, "grad_norm": 1.9214276485044444, "learning_rate": 9.997553068868435e-07, "logits/chosen": 0.697998046875, "logits/rejected": 0.93603515625, "logps/chosen": -891.0, "logps/rejected": -965.5, "loss": 0.6979, "rewards/accuracies": 0.875, "rewards/chosen": 0.388916015625, "rewards/margins": 2.1328125, "rewards/rejected": -1.7421875, "step": 581 }, { "epoch": 0.10997212905663943, "grad_norm": 2.475318664943981, "learning_rate": 9.997454222172696e-07, "logits/chosen": 0.791015625, "logits/rejected": 1.029296875, "logps/chosen": -514.0, "logps/rejected": -1919.0, "loss": 0.7571, "rewards/accuracies": 0.8125, "rewards/chosen": 0.076171875, "rewards/margins": 2.9482421875, "rewards/rejected": -2.87890625, "step": 582 }, { "epoch": 0.1101610846048467, "grad_norm": 2.0146035426245805, "learning_rate": 9.997353418862232e-07, "logits/chosen": 1.009765625, "logits/rejected": 1.61328125, "logps/chosen": -731.5, "logps/rejected": -1283.0, "loss": 0.7802, "rewards/accuracies": 0.84375, "rewards/chosen": -0.066314697265625, "rewards/margins": 2.09375, "rewards/rejected": -2.16015625, "step": 583 }, { "epoch": 0.110350040153054, "grad_norm": 2.1360045726556867, "learning_rate": 9.997250658980895e-07, "logits/chosen": 0.864990234375, "logits/rejected": 0.2509765625, "logps/chosen": -821.0, "logps/rejected": -636.0, "loss": 0.802, "rewards/accuracies": 0.78125, "rewards/chosen": 0.24468994140625, "rewards/margins": 1.12548828125, "rewards/rejected": -0.880859375, "step": 584 }, { "epoch": 0.11053899570126127, "grad_norm": 2.382242413516729, "learning_rate": 9.997145942573387e-07, "logits/chosen": 0.500732421875, "logits/rejected": 0.7640380859375, "logps/chosen": -747.0, "logps/rejected": -875.0, "loss": 0.7614, "rewards/accuracies": 0.78125, "rewards/chosen": 0.087158203125, "rewards/margins": 1.5654296875, "rewards/rejected": -1.478515625, "step": 585 }, { "epoch": 0.11072795124946856, "grad_norm": 1.681327367716575, "learning_rate": 9.99703926968527e-07, "logits/chosen": 1.513671875, "logits/rejected": 2.07421875, "logps/chosen": -792.0, "logps/rejected": -1138.0, "loss": 0.7011, "rewards/accuracies": 0.84375, "rewards/chosen": 0.91552734375, "rewards/margins": 2.42578125, "rewards/rejected": -1.51220703125, "step": 586 }, { "epoch": 0.11091690679767585, "grad_norm": 2.011130051277241, "learning_rate": 9.996930640362951e-07, "logits/chosen": 0.420166015625, "logits/rejected": 0.8692626953125, "logps/chosen": -612.0, "logps/rejected": -574.0, "loss": 0.8367, "rewards/accuracies": 0.65625, "rewards/chosen": 0.135589599609375, "rewards/margins": 0.859375, "rewards/rejected": -0.72265625, "step": 587 }, { "epoch": 0.11110586234588313, "grad_norm": 2.258672681906892, "learning_rate": 9.996820054653688e-07, "logits/chosen": 0.517578125, "logits/rejected": 0.9736328125, "logps/chosen": -1049.0, "logps/rejected": -1095.0, "loss": 0.6481, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5369873046875, "rewards/margins": 2.39453125, "rewards/rejected": -1.859375, "step": 588 }, { "epoch": 0.11129481789409042, "grad_norm": 1.9140246062250783, "learning_rate": 9.99670751260559e-07, "logits/chosen": 1.000244140625, "logits/rejected": 1.270263671875, "logps/chosen": -600.5, "logps/rejected": -775.0, "loss": 0.7213, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39306640625, "rewards/margins": 1.8837890625, "rewards/rejected": -1.490234375, "step": 589 }, { "epoch": 0.11148377344229769, "grad_norm": 2.2502522607734186, "learning_rate": 9.996593014267625e-07, "logits/chosen": 1.00054931640625, "logits/rejected": 1.16357421875, "logps/chosen": -849.0, "logps/rejected": -832.0, "loss": 0.7631, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3795166015625, "rewards/margins": 1.3935546875, "rewards/rejected": -1.013671875, "step": 590 }, { "epoch": 0.11167272899050498, "grad_norm": 2.2434726756726326, "learning_rate": 9.996476559689596e-07, "logits/chosen": 0.900390625, "logits/rejected": 1.05126953125, "logps/chosen": -946.5, "logps/rejected": -1004.5, "loss": 0.7524, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6826171875, "rewards/margins": 2.078125, "rewards/rejected": -1.39453125, "step": 591 }, { "epoch": 0.11186168453871227, "grad_norm": 2.0155705138535622, "learning_rate": 9.996358148922173e-07, "logits/chosen": 1.94921875, "logits/rejected": 1.4306640625, "logps/chosen": -719.5, "logps/rejected": -708.5, "loss": 0.8082, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2344970703125, "rewards/margins": 1.1484375, "rewards/rejected": -0.9140625, "step": 592 }, { "epoch": 0.11205064008691955, "grad_norm": 2.2218478817386256, "learning_rate": 9.996237782016868e-07, "logits/chosen": 1.50390625, "logits/rejected": 1.904296875, "logps/chosen": -794.0, "logps/rejected": -594.5, "loss": 0.8707, "rewards/accuracies": 0.625, "rewards/chosen": -0.1099853515625, "rewards/margins": 0.818359375, "rewards/rejected": -0.927734375, "step": 593 }, { "epoch": 0.11223959563512684, "grad_norm": 2.059239136525736, "learning_rate": 9.996115459026049e-07, "logits/chosen": 0.8265380859375, "logits/rejected": 1.70361328125, "logps/chosen": -585.5, "logps/rejected": -671.5, "loss": 0.7203, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3065185546875, "rewards/margins": 1.8642578125, "rewards/rejected": -1.5576171875, "step": 594 }, { "epoch": 0.11242855118333413, "grad_norm": 2.508333394488228, "learning_rate": 9.995991180002926e-07, "logits/chosen": 0.9853515625, "logits/rejected": 1.01123046875, "logps/chosen": -740.0, "logps/rejected": -664.0, "loss": 0.738, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4189453125, "rewards/margins": 1.681640625, "rewards/rejected": -1.259765625, "step": 595 }, { "epoch": 0.1126175067315414, "grad_norm": 1.8659533759836824, "learning_rate": 9.995864945001573e-07, "logits/chosen": 0.8824462890625, "logits/rejected": 1.14013671875, "logps/chosen": -572.5, "logps/rejected": -709.5, "loss": 0.6698, "rewards/accuracies": 0.875, "rewards/chosen": 0.2904052734375, "rewards/margins": 1.9140625, "rewards/rejected": -1.623046875, "step": 596 }, { "epoch": 0.11280646227974869, "grad_norm": 2.4755188814866793, "learning_rate": 9.995736754076908e-07, "logits/chosen": -0.0703125, "logits/rejected": 0.019775390625, "logps/chosen": -821.0, "logps/rejected": -901.5, "loss": 0.7572, "rewards/accuracies": 0.75, "rewards/chosen": 0.2003173828125, "rewards/margins": 1.3251953125, "rewards/rejected": -1.12890625, "step": 597 }, { "epoch": 0.11299541782795597, "grad_norm": 2.2005519037379146, "learning_rate": 9.995606607284699e-07, "logits/chosen": 0.943115234375, "logits/rejected": 0.8095703125, "logps/chosen": -826.5, "logps/rejected": -600.5, "loss": 0.7352, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1080322265625, "rewards/margins": 1.533203125, "rewards/rejected": -1.423828125, "step": 598 }, { "epoch": 0.11318437337616326, "grad_norm": 2.0663971567698836, "learning_rate": 9.995474504681567e-07, "logits/chosen": 0.62158203125, "logits/rejected": 1.2529296875, "logps/chosen": -928.5, "logps/rejected": -852.0, "loss": 0.697, "rewards/accuracies": 0.78125, "rewards/chosen": 0.292144775390625, "rewards/margins": 2.08984375, "rewards/rejected": -1.802734375, "step": 599 }, { "epoch": 0.11337332892437055, "grad_norm": 1.8552465865569734, "learning_rate": 9.995340446324979e-07, "logits/chosen": 1.123046875, "logits/rejected": 1.431640625, "logps/chosen": -897.0, "logps/rejected": -1266.5, "loss": 0.8072, "rewards/accuracies": 0.71875, "rewards/chosen": 0.007080078125, "rewards/margins": 1.87646484375, "rewards/rejected": -1.873046875, "step": 600 }, { "epoch": 0.11356228447257782, "grad_norm": 1.7848777483011553, "learning_rate": 9.995204432273261e-07, "logits/chosen": 1.3486328125, "logits/rejected": 1.29052734375, "logps/chosen": -738.0, "logps/rejected": -597.5, "loss": 0.8052, "rewards/accuracies": 0.65625, "rewards/chosen": 0.419677734375, "rewards/margins": 1.4365234375, "rewards/rejected": -1.02197265625, "step": 601 }, { "epoch": 0.11375124002078511, "grad_norm": 2.314987119822365, "learning_rate": 9.995066462585588e-07, "logits/chosen": 0.96405029296875, "logits/rejected": 1.888671875, "logps/chosen": -970.0, "logps/rejected": -770.0, "loss": 0.8376, "rewards/accuracies": 0.71875, "rewards/chosen": 0.31964111328125, "rewards/margins": 1.302734375, "rewards/rejected": -0.98046875, "step": 602 }, { "epoch": 0.11394019556899239, "grad_norm": 2.1085571082112904, "learning_rate": 9.99492653732198e-07, "logits/chosen": 1.91796875, "logits/rejected": 1.88330078125, "logps/chosen": -886.0, "logps/rejected": -15962.0, "loss": 0.8214, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1431884765625, "rewards/margins": 71.90234375, "rewards/rejected": -72.03515625, "step": 603 }, { "epoch": 0.11412915111719968, "grad_norm": 2.555043598326098, "learning_rate": 9.99478465654331e-07, "logits/chosen": 1.11572265625, "logits/rejected": 1.318359375, "logps/chosen": -1188.5, "logps/rejected": -1060.5, "loss": 0.7031, "rewards/accuracies": 0.875, "rewards/chosen": 0.47698974609375, "rewards/margins": 1.9990234375, "rewards/rejected": -1.5224609375, "step": 604 }, { "epoch": 0.11431810666540697, "grad_norm": 2.1008705342440055, "learning_rate": 9.99464082031131e-07, "logits/chosen": 1.66015625, "logits/rejected": 2.22265625, "logps/chosen": -1131.0, "logps/rejected": -1216.0, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": 0.95703125, "rewards/margins": 3.0625, "rewards/rejected": -2.107421875, "step": 605 }, { "epoch": 0.11450706221361424, "grad_norm": 1.8813996048533983, "learning_rate": 9.994495028688548e-07, "logits/chosen": 0.806640625, "logits/rejected": 1.3408203125, "logps/chosen": -462.5, "logps/rejected": -609.5, "loss": 0.7365, "rewards/accuracies": 0.84375, "rewards/chosen": 0.38916015625, "rewards/margins": 1.63671875, "rewards/rejected": -1.248046875, "step": 606 }, { "epoch": 0.11469601776182153, "grad_norm": 2.4798464847478265, "learning_rate": 9.994347281738451e-07, "logits/chosen": 0.014892578125, "logits/rejected": -0.1729736328125, "logps/chosen": -882.0, "logps/rejected": -1047.5, "loss": 0.6572, "rewards/accuracies": 0.96875, "rewards/chosen": 0.30126953125, "rewards/margins": 2.220703125, "rewards/rejected": -1.916015625, "step": 607 }, { "epoch": 0.11488497331002881, "grad_norm": 1.8791680021699197, "learning_rate": 9.994197579525306e-07, "logits/chosen": 0.7864990234375, "logits/rejected": 1.2607421875, "logps/chosen": -650.5, "logps/rejected": -859.0, "loss": 0.76, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5343017578125, "rewards/margins": 1.8916015625, "rewards/rejected": -1.35400390625, "step": 608 }, { "epoch": 0.1150739288582361, "grad_norm": 2.277420230987325, "learning_rate": 9.994045922114229e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.56640625, "logps/chosen": -657.5, "logps/rejected": -483.25, "loss": 0.8334, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1634521484375, "rewards/margins": 1.0205078125, "rewards/rejected": -0.8564453125, "step": 609 }, { "epoch": 0.11526288440644339, "grad_norm": 2.268195186852684, "learning_rate": 9.993892309571206e-07, "logits/chosen": 1.3984375, "logits/rejected": 1.970703125, "logps/chosen": -1037.0, "logps/rejected": -1430.0, "loss": 0.6932, "rewards/accuracies": 0.875, "rewards/chosen": 0.30621337890625, "rewards/margins": 2.365234375, "rewards/rejected": -2.056640625, "step": 610 }, { "epoch": 0.11545183995465066, "grad_norm": 2.3107459472465313, "learning_rate": 9.99373674196306e-07, "logits/chosen": 1.33056640625, "logits/rejected": 1.7646484375, "logps/chosen": -798.5, "logps/rejected": -757.5, "loss": 0.7533, "rewards/accuracies": 0.8125, "rewards/chosen": 0.390869140625, "rewards/margins": 1.6484375, "rewards/rejected": -1.26171875, "step": 611 }, { "epoch": 0.11564079550285795, "grad_norm": 1.9208250214474705, "learning_rate": 9.993579219357476e-07, "logits/chosen": 0.23486328125, "logits/rejected": 0.50244140625, "logps/chosen": -532.5, "logps/rejected": -530.5, "loss": 0.7851, "rewards/accuracies": 0.75, "rewards/chosen": 0.133636474609375, "rewards/margins": 1.181640625, "rewards/rejected": -1.0498046875, "step": 612 }, { "epoch": 0.11582975105106523, "grad_norm": 2.624415291604501, "learning_rate": 9.993419741822983e-07, "logits/chosen": 0.8525390625, "logits/rejected": 0.675048828125, "logps/chosen": -888.5, "logps/rejected": -578.5, "loss": 0.7106, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4686279296875, "rewards/margins": 1.5966796875, "rewards/rejected": -1.129150390625, "step": 613 }, { "epoch": 0.11601870659927252, "grad_norm": 1.9526199413771579, "learning_rate": 9.993258309428961e-07, "logits/chosen": 0.59228515625, "logits/rejected": 0.9581298828125, "logps/chosen": -930.0, "logps/rejected": -900.5, "loss": 0.6945, "rewards/accuracies": 0.84375, "rewards/chosen": 0.567138671875, "rewards/margins": 2.216796875, "rewards/rejected": -1.6484375, "step": 614 }, { "epoch": 0.11620766214747981, "grad_norm": 1.9014506399013673, "learning_rate": 9.99309492224564e-07, "logits/chosen": 1.4013671875, "logits/rejected": 1.6376953125, "logps/chosen": -566.0, "logps/rejected": -561.5, "loss": 0.8367, "rewards/accuracies": 0.65625, "rewards/chosen": -0.059722900390625, "rewards/margins": 0.90625, "rewards/rejected": -0.9638671875, "step": 615 }, { "epoch": 0.11639661769568709, "grad_norm": 2.3276153299046287, "learning_rate": 9.9929295803441e-07, "logits/chosen": 0.599609375, "logits/rejected": 1.5364990234375, "logps/chosen": -935.0, "logps/rejected": -1047.0, "loss": 0.7201, "rewards/accuracies": 0.84375, "rewards/chosen": 0.367431640625, "rewards/margins": 1.9296875, "rewards/rejected": -1.5625, "step": 616 }, { "epoch": 0.11658557324389437, "grad_norm": 1.7367518391771797, "learning_rate": 9.992762283796277e-07, "logits/chosen": 1.474609375, "logits/rejected": 1.21343994140625, "logps/chosen": -845.0, "logps/rejected": -682.0, "loss": 0.7186, "rewards/accuracies": 0.8125, "rewards/chosen": 0.474609375, "rewards/margins": 1.873046875, "rewards/rejected": -1.3994140625, "step": 617 }, { "epoch": 0.11677452879210166, "grad_norm": 2.1996414264655213, "learning_rate": 9.992593032674953e-07, "logits/chosen": 0.78125, "logits/rejected": 0.712158203125, "logps/chosen": -822.0, "logps/rejected": -861.5, "loss": 0.7085, "rewards/accuracies": 0.84375, "rewards/chosen": 0.427490234375, "rewards/margins": 2.1357421875, "rewards/rejected": -1.703125, "step": 618 }, { "epoch": 0.11696348434030894, "grad_norm": 2.185051866486618, "learning_rate": 9.992421827053758e-07, "logits/chosen": 0.287811279296875, "logits/rejected": 1.76171875, "logps/chosen": -747.0, "logps/rejected": -2053.0, "loss": 0.7502, "rewards/accuracies": 0.75, "rewards/chosen": 0.19720458984375, "rewards/margins": 2.7109375, "rewards/rejected": -2.513671875, "step": 619 }, { "epoch": 0.11715243988851623, "grad_norm": 2.016537571505156, "learning_rate": 9.992248667007176e-07, "logits/chosen": 0.3857421875, "logits/rejected": 1.0009765625, "logps/chosen": -641.5, "logps/rejected": -628.0, "loss": 0.7146, "rewards/accuracies": 0.8125, "rewards/chosen": 0.307403564453125, "rewards/margins": 1.6201171875, "rewards/rejected": -1.3134765625, "step": 620 }, { "epoch": 0.1173413954367235, "grad_norm": 1.8918374248342829, "learning_rate": 9.992073552610539e-07, "logits/chosen": 1.3651123046875, "logits/rejected": 2.0927734375, "logps/chosen": -696.5, "logps/rejected": -867.0, "loss": 0.7575, "rewards/accuracies": 0.75, "rewards/chosen": 0.351806640625, "rewards/margins": 1.5966796875, "rewards/rejected": -1.244140625, "step": 621 }, { "epoch": 0.1175303509849308, "grad_norm": 1.9222476615289077, "learning_rate": 9.991896483940033e-07, "logits/chosen": 0.31689453125, "logits/rejected": 0.05712890625, "logps/chosen": -967.0, "logps/rejected": -896.5, "loss": 0.6702, "rewards/accuracies": 0.84375, "rewards/chosen": 0.857177734375, "rewards/margins": 2.373046875, "rewards/rejected": -1.51171875, "step": 622 }, { "epoch": 0.11771930653313809, "grad_norm": 1.9530551204918085, "learning_rate": 9.99171746107269e-07, "logits/chosen": 0.74847412109375, "logits/rejected": 0.9296875, "logps/chosen": -811.5, "logps/rejected": -666.0, "loss": 0.708, "rewards/accuracies": 0.8125, "rewards/chosen": 0.455078125, "rewards/margins": 1.796875, "rewards/rejected": -1.33984375, "step": 623 }, { "epoch": 0.11790826208134536, "grad_norm": 2.2132266580471365, "learning_rate": 9.991536484086393e-07, "logits/chosen": 1.2666015625, "logits/rejected": 1.5078125, "logps/chosen": -617.0, "logps/rejected": -567.0, "loss": 0.7475, "rewards/accuracies": 0.875, "rewards/chosen": 0.21600341796875, "rewards/margins": 1.3583984375, "rewards/rejected": -1.1416015625, "step": 624 }, { "epoch": 0.11809721762955265, "grad_norm": 2.1897305876459923, "learning_rate": 9.991353553059876e-07, "logits/chosen": 0.0859375, "logits/rejected": 0.4140625, "logps/chosen": -779.0, "logps/rejected": -924.0, "loss": 0.6957, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5159912109375, "rewards/margins": 1.900390625, "rewards/rejected": -1.37890625, "step": 625 }, { "epoch": 0.11828617317775993, "grad_norm": 2.1905076275183726, "learning_rate": 9.991168668072727e-07, "logits/chosen": 0.7451171875, "logits/rejected": 1.3046875, "logps/chosen": -783.0, "logps/rejected": -761.0, "loss": 0.7547, "rewards/accuracies": 0.78125, "rewards/chosen": 0.13494873046875, "rewards/margins": 1.501953125, "rewards/rejected": -1.3671875, "step": 626 }, { "epoch": 0.11847512872596722, "grad_norm": 2.1675165017129707, "learning_rate": 9.990981829205377e-07, "logits/chosen": 1.0537109375, "logits/rejected": 0.841064453125, "logps/chosen": -796.0, "logps/rejected": -696.0, "loss": 0.7585, "rewards/accuracies": 0.75, "rewards/chosen": 0.207275390625, "rewards/margins": 1.81640625, "rewards/rejected": -1.609375, "step": 627 }, { "epoch": 0.1186640842741745, "grad_norm": 1.8840399370743455, "learning_rate": 9.990793036539106e-07, "logits/chosen": 1.5888671875, "logits/rejected": 2.0360107421875, "logps/chosen": -990.0, "logps/rejected": -1165.0, "loss": 0.6648, "rewards/accuracies": 0.875, "rewards/chosen": 0.609039306640625, "rewards/margins": 2.3876953125, "rewards/rejected": -1.7724609375, "step": 628 }, { "epoch": 0.11885303982238178, "grad_norm": 2.426076183240464, "learning_rate": 9.990602290156057e-07, "logits/chosen": 0.18231201171875, "logits/rejected": 0.9873046875, "logps/chosen": -963.0, "logps/rejected": -629.0, "loss": 0.785, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0740966796875, "rewards/margins": 1.3515625, "rewards/rejected": -1.27734375, "step": 629 }, { "epoch": 0.11904199537058907, "grad_norm": 2.100797810098865, "learning_rate": 9.990409590139205e-07, "logits/chosen": 0.602783203125, "logits/rejected": 1.07958984375, "logps/chosen": -781.0, "logps/rejected": -1156.0, "loss": 0.7292, "rewards/accuracies": 0.90625, "rewards/chosen": -0.108642578125, "rewards/margins": 1.90625, "rewards/rejected": -2.015625, "step": 630 }, { "epoch": 0.11923095091879635, "grad_norm": 1.6869776222057162, "learning_rate": 9.990214936572391e-07, "logits/chosen": 1.052734375, "logits/rejected": 1.5322265625, "logps/chosen": -836.5, "logps/rejected": -837.0, "loss": 0.7626, "rewards/accuracies": 0.6875, "rewards/chosen": 0.378173828125, "rewards/margins": 1.865234375, "rewards/rejected": -1.48828125, "step": 631 }, { "epoch": 0.11941990646700364, "grad_norm": 2.1837428430280847, "learning_rate": 9.990018329540295e-07, "logits/chosen": 0.987548828125, "logits/rejected": 1.2060546875, "logps/chosen": -671.0, "logps/rejected": -1779.0, "loss": 0.8055, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0076904296875, "rewards/margins": 1.7568359375, "rewards/rejected": -1.75048828125, "step": 632 }, { "epoch": 0.11960886201521093, "grad_norm": 1.9380573618088925, "learning_rate": 9.989819769128454e-07, "logits/chosen": 0.3992919921875, "logits/rejected": 0.072265625, "logps/chosen": -896.0, "logps/rejected": -706.5, "loss": 0.6444, "rewards/accuracies": 0.90625, "rewards/chosen": 0.31494140625, "rewards/margins": 2.224609375, "rewards/rejected": -1.908203125, "step": 633 }, { "epoch": 0.1197978175634182, "grad_norm": 2.1953045398989035, "learning_rate": 9.989619255423248e-07, "logits/chosen": 0.799102783203125, "logits/rejected": 1.1396484375, "logps/chosen": -501.0, "logps/rejected": -452.0, "loss": 0.7767, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0743408203125, "rewards/margins": 1.353515625, "rewards/rejected": -1.423828125, "step": 634 }, { "epoch": 0.11998677311162549, "grad_norm": 2.269515775142207, "learning_rate": 9.989416788511913e-07, "logits/chosen": -0.004638671875, "logits/rejected": 0.656005859375, "logps/chosen": -962.0, "logps/rejected": -726.0, "loss": 0.6949, "rewards/accuracies": 0.875, "rewards/chosen": 0.169769287109375, "rewards/margins": 2.013671875, "rewards/rejected": -1.84375, "step": 635 }, { "epoch": 0.12017572865983278, "grad_norm": 1.9137508918620438, "learning_rate": 9.989212368482531e-07, "logits/chosen": 0.6572265625, "logits/rejected": 0.720703125, "logps/chosen": -1258.5, "logps/rejected": -800.5, "loss": 0.6734, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0419921875, "rewards/margins": 1.7281494140625, "rewards/rejected": -1.68359375, "step": 636 }, { "epoch": 0.12036468420804006, "grad_norm": 2.109093515174504, "learning_rate": 9.989005995424035e-07, "logits/chosen": 1.3701171875, "logits/rejected": 1.8828125, "logps/chosen": -694.0, "logps/rejected": -1911.0, "loss": 0.7344, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2291259765625, "rewards/margins": 2.576171875, "rewards/rejected": -2.353515625, "step": 637 }, { "epoch": 0.12055363975624735, "grad_norm": 1.9370311403758689, "learning_rate": 9.988797669426206e-07, "logits/chosen": 1.20751953125, "logits/rejected": 1.44677734375, "logps/chosen": -683.0, "logps/rejected": -637.5, "loss": 0.7751, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2757568359375, "rewards/margins": 1.484375, "rewards/rejected": -1.205078125, "step": 638 }, { "epoch": 0.12074259530445462, "grad_norm": 2.1599399417595535, "learning_rate": 9.98858739057968e-07, "logits/chosen": 0.5048828125, "logits/rejected": 1.2705078125, "logps/chosen": -967.0, "logps/rejected": -1150.0, "loss": 0.743, "rewards/accuracies": 0.84375, "rewards/chosen": 0.45703125, "rewards/margins": 2.2841796875, "rewards/rejected": -1.8251953125, "step": 639 }, { "epoch": 0.12093155085266191, "grad_norm": 2.003033176313918, "learning_rate": 9.988375158975935e-07, "logits/chosen": 0.80859375, "logits/rejected": 1.91015625, "logps/chosen": -769.5, "logps/rejected": -1101.0, "loss": 0.7652, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4525146484375, "rewards/margins": 1.9697265625, "rewards/rejected": -1.51318359375, "step": 640 }, { "epoch": 0.1211205064008692, "grad_norm": 2.032124656422597, "learning_rate": 9.988160974707302e-07, "logits/chosen": 0.4931640625, "logits/rejected": 1.3974609375, "logps/chosen": -855.5, "logps/rejected": -960.5, "loss": 0.6847, "rewards/accuracies": 0.875, "rewards/chosen": 0.6497802734375, "rewards/margins": 2.3935546875, "rewards/rejected": -1.744140625, "step": 641 }, { "epoch": 0.12130946194907648, "grad_norm": 2.2347325674065, "learning_rate": 9.987944837866965e-07, "logits/chosen": 1.3558349609375, "logits/rejected": 1.59765625, "logps/chosen": -836.0, "logps/rejected": -690.0, "loss": 0.7324, "rewards/accuracies": 0.875, "rewards/chosen": 0.1383056640625, "rewards/margins": 2.322265625, "rewards/rejected": -2.177734375, "step": 642 }, { "epoch": 0.12149841749728377, "grad_norm": 1.747399154126785, "learning_rate": 9.987726748548952e-07, "logits/chosen": 0.7080078125, "logits/rejected": 1.384765625, "logps/chosen": -681.5, "logps/rejected": -949.0, "loss": 0.7485, "rewards/accuracies": 0.78125, "rewards/chosen": -0.068115234375, "rewards/margins": 2.16015625, "rewards/rejected": -2.2265625, "step": 643 }, { "epoch": 0.12168737304549104, "grad_norm": 1.747100422349436, "learning_rate": 9.987506706848143e-07, "logits/chosen": 0.00335693359375, "logits/rejected": 0.670867919921875, "logps/chosen": -473.0, "logps/rejected": -530.5, "loss": 0.7786, "rewards/accuracies": 0.8125, "rewards/chosen": 0.033447265625, "rewards/margins": 1.271484375, "rewards/rejected": -1.23828125, "step": 644 }, { "epoch": 0.12187632859369833, "grad_norm": 1.9299838523339727, "learning_rate": 9.987284712860268e-07, "logits/chosen": 1.58203125, "logits/rejected": 1.962890625, "logps/chosen": -855.0, "logps/rejected": -715.5, "loss": 0.6962, "rewards/accuracies": 0.8125, "rewards/chosen": 0.45135498046875, "rewards/margins": 2.06640625, "rewards/rejected": -1.615234375, "step": 645 }, { "epoch": 0.12206528414190562, "grad_norm": 1.546121988579207, "learning_rate": 9.987060766681903e-07, "logits/chosen": 1.1533203125, "logits/rejected": 1.66015625, "logps/chosen": -491.5, "logps/rejected": -554.0, "loss": 0.7627, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3092041015625, "rewards/margins": 1.751953125, "rewards/rejected": -1.4453125, "step": 646 }, { "epoch": 0.1222542396901129, "grad_norm": 1.775598535775058, "learning_rate": 9.986834868410477e-07, "logits/chosen": 0.76806640625, "logits/rejected": 0.380615234375, "logps/chosen": -501.5, "logps/rejected": -379.5, "loss": 0.728, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12744140625, "rewards/margins": 1.3642578125, "rewards/rejected": -1.23828125, "step": 647 }, { "epoch": 0.12244319523832019, "grad_norm": 2.7413806134630683, "learning_rate": 9.986607018144268e-07, "logits/chosen": 1.556640625, "logits/rejected": 2.69921875, "logps/chosen": -859.0, "logps/rejected": -1702.0, "loss": 0.6844, "rewards/accuracies": 0.78125, "rewards/chosen": 0.41796875, "rewards/margins": 2.83984375, "rewards/rejected": -2.42578125, "step": 648 }, { "epoch": 0.12263215078652746, "grad_norm": 2.0521593447673063, "learning_rate": 9.9863772159824e-07, "logits/chosen": 0.9091796875, "logits/rejected": 1.638671875, "logps/chosen": -784.0, "logps/rejected": -1742.0, "loss": 0.6664, "rewards/accuracies": 0.78125, "rewards/chosen": 0.43408203125, "rewards/margins": 3.2490234375, "rewards/rejected": -2.8232421875, "step": 649 }, { "epoch": 0.12282110633473475, "grad_norm": 2.1435578708844747, "learning_rate": 9.986145462024848e-07, "logits/chosen": 1.0576171875, "logits/rejected": 1.2236328125, "logps/chosen": -644.5, "logps/rejected": -674.5, "loss": 0.7158, "rewards/accuracies": 0.875, "rewards/chosen": 0.425872802734375, "rewards/margins": 1.8818359375, "rewards/rejected": -1.45703125, "step": 650 }, { "epoch": 0.12301006188294204, "grad_norm": 7.6657372910946044, "learning_rate": 9.98591175637244e-07, "logits/chosen": 1.796875, "logits/rejected": 1.80078125, "logps/chosen": -641.0, "logps/rejected": -626.0, "loss": 0.7213, "rewards/accuracies": 0.84375, "rewards/chosen": 0.301025390625, "rewards/margins": 1.6474609375, "rewards/rejected": -1.34521484375, "step": 651 }, { "epoch": 0.12319901743114932, "grad_norm": 2.231773603829115, "learning_rate": 9.985676099126848e-07, "logits/chosen": -0.17431640625, "logits/rejected": 0.7841796875, "logps/chosen": -773.0, "logps/rejected": -691.0, "loss": 0.7946, "rewards/accuracies": 0.75, "rewards/chosen": 0.083740234375, "rewards/margins": 1.30853271484375, "rewards/rejected": -1.227783203125, "step": 652 }, { "epoch": 0.12338797297935661, "grad_norm": 1.9842266071549484, "learning_rate": 9.985438490390592e-07, "logits/chosen": 0.580078125, "logits/rejected": 1.46240234375, "logps/chosen": -835.0, "logps/rejected": -992.0, "loss": 0.7303, "rewards/accuracies": 0.84375, "rewards/chosen": 0.446044921875, "rewards/margins": 1.94384765625, "rewards/rejected": -1.4951171875, "step": 653 }, { "epoch": 0.12357692852756388, "grad_norm": 2.3146708292585836, "learning_rate": 9.985198930267045e-07, "logits/chosen": 0.6376953125, "logits/rejected": 1.14013671875, "logps/chosen": -907.0, "logps/rejected": -937.5, "loss": 0.7087, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3115234375, "rewards/margins": 2.013671875, "rewards/rejected": -1.701171875, "step": 654 }, { "epoch": 0.12376588407577117, "grad_norm": 2.1797769351568466, "learning_rate": 9.98495741886043e-07, "logits/chosen": 1.6142578125, "logits/rejected": 2.16015625, "logps/chosen": -1347.0, "logps/rejected": -1120.0, "loss": 0.6147, "rewards/accuracies": 0.9375, "rewards/chosen": 0.739501953125, "rewards/margins": 2.90625, "rewards/rejected": -2.173828125, "step": 655 }, { "epoch": 0.12395483962397846, "grad_norm": 1.8145146708602544, "learning_rate": 9.984713956275813e-07, "logits/chosen": 1.3056640625, "logits/rejected": 1.3388671875, "logps/chosen": -458.0, "logps/rejected": -496.25, "loss": 0.7884, "rewards/accuracies": 0.75, "rewards/chosen": -0.069580078125, "rewards/margins": 1.376953125, "rewards/rejected": -1.4443359375, "step": 656 }, { "epoch": 0.12414379517218574, "grad_norm": 2.0799615701554353, "learning_rate": 9.984468542619112e-07, "logits/chosen": 0.8134765625, "logits/rejected": 1.5166015625, "logps/chosen": -643.0, "logps/rejected": -1147.5, "loss": 0.808, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2325439453125, "rewards/margins": 1.712890625, "rewards/rejected": -1.484375, "step": 657 }, { "epoch": 0.12433275072039303, "grad_norm": 1.9268463269498048, "learning_rate": 9.984221177997096e-07, "logits/chosen": 0.103515625, "logits/rejected": -0.135498046875, "logps/chosen": -620.0, "logps/rejected": -534.0, "loss": 0.7142, "rewards/accuracies": 0.78125, "rewards/chosen": 0.112548828125, "rewards/margins": 1.634765625, "rewards/rejected": -1.521484375, "step": 658 }, { "epoch": 0.12452170626860032, "grad_norm": 2.08536342438001, "learning_rate": 9.983971862517382e-07, "logits/chosen": 1.173828125, "logits/rejected": 1.7852783203125, "logps/chosen": -722.5, "logps/rejected": -756.5, "loss": 0.6975, "rewards/accuracies": 0.84375, "rewards/chosen": 0.12451171875, "rewards/margins": 1.84765625, "rewards/rejected": -1.720703125, "step": 659 }, { "epoch": 0.1247106618168076, "grad_norm": 1.968770501540993, "learning_rate": 9.983720596288432e-07, "logits/chosen": 0.934814453125, "logits/rejected": 1.38916015625, "logps/chosen": -700.5, "logps/rejected": -1231.0, "loss": 0.7045, "rewards/accuracies": 0.875, "rewards/chosen": 0.4482421875, "rewards/margins": 2.5546875, "rewards/rejected": -2.107421875, "step": 660 }, { "epoch": 0.12489961736501488, "grad_norm": 2.2648940339504695, "learning_rate": 9.98346737941956e-07, "logits/chosen": 1.13720703125, "logits/rejected": 0.59857177734375, "logps/chosen": -896.5, "logps/rejected": -888.0, "loss": 0.7314, "rewards/accuracies": 0.78125, "rewards/chosen": 0.43280029296875, "rewards/margins": 1.673828125, "rewards/rejected": -1.244140625, "step": 661 }, { "epoch": 0.12508857291322217, "grad_norm": 1.9333490646797324, "learning_rate": 9.983212212020928e-07, "logits/chosen": 1.5107421875, "logits/rejected": 1.861328125, "logps/chosen": -819.0, "logps/rejected": -801.0, "loss": 0.7052, "rewards/accuracies": 0.75, "rewards/chosen": 0.861328125, "rewards/margins": 1.96875, "rewards/rejected": -1.10888671875, "step": 662 }, { "epoch": 0.12527752846142945, "grad_norm": 1.7366767899056206, "learning_rate": 9.982955094203547e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.7247314453125, "logps/chosen": -648.0, "logps/rejected": -715.0, "loss": 0.7174, "rewards/accuracies": 0.84375, "rewards/chosen": 0.332275390625, "rewards/margins": 1.8349609375, "rewards/rejected": -1.4990234375, "step": 663 }, { "epoch": 0.12546648400963673, "grad_norm": 2.3414807713180883, "learning_rate": 9.982696026079273e-07, "logits/chosen": 0.833984375, "logits/rejected": 1.60595703125, "logps/chosen": -610.5, "logps/rejected": -639.0, "loss": 0.7425, "rewards/accuracies": 0.875, "rewards/chosen": 0.303436279296875, "rewards/margins": 1.646484375, "rewards/rejected": -1.34375, "step": 664 }, { "epoch": 0.12565543955784403, "grad_norm": 2.086254664610521, "learning_rate": 9.98243500776082e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.951416015625, "logps/chosen": -993.5, "logps/rejected": -952.5, "loss": 0.6664, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4853515625, "rewards/margins": 2.3427734375, "rewards/rejected": -1.857421875, "step": 665 }, { "epoch": 0.1258443951060513, "grad_norm": 2.2548068640212615, "learning_rate": 9.982172039361735e-07, "logits/chosen": 1.10546875, "logits/rejected": 0.88134765625, "logps/chosen": -759.0, "logps/rejected": -739.0, "loss": 0.7605, "rewards/accuracies": 0.78125, "rewards/chosen": -0.00531005859375, "rewards/margins": 1.3203125, "rewards/rejected": -1.3251953125, "step": 666 }, { "epoch": 0.12603335065425858, "grad_norm": 2.2112648692128887, "learning_rate": 9.981907120996428e-07, "logits/chosen": 1.09765625, "logits/rejected": 1.744140625, "logps/chosen": -1079.0, "logps/rejected": -1229.0, "loss": 0.6982, "rewards/accuracies": 0.78125, "rewards/chosen": 0.37933349609375, "rewards/margins": 2.40625, "rewards/rejected": -2.01953125, "step": 667 }, { "epoch": 0.12622230620246588, "grad_norm": 2.658779225746724, "learning_rate": 9.981640252780154e-07, "logits/chosen": 0.0986328125, "logits/rejected": 0.27685546875, "logps/chosen": -676.0, "logps/rejected": -775.0, "loss": 0.7733, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0216064453125, "rewards/margins": 1.259765625, "rewards/rejected": -1.2373046875, "step": 668 }, { "epoch": 0.12641126175067316, "grad_norm": 2.025855305931663, "learning_rate": 9.981371434829007e-07, "logits/chosen": 0.60986328125, "logits/rejected": 0.596923828125, "logps/chosen": -910.0, "logps/rejected": -957.0, "loss": 0.6912, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09088134765625, "rewards/margins": 2.2265625, "rewards/rejected": -2.13671875, "step": 669 }, { "epoch": 0.12660021729888044, "grad_norm": 1.949943088016308, "learning_rate": 9.981100667259942e-07, "logits/chosen": 0.61767578125, "logits/rejected": 1.26220703125, "logps/chosen": -877.0, "logps/rejected": -731.0, "loss": 0.6986, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3153076171875, "rewards/margins": 1.888671875, "rewards/rejected": -1.572265625, "step": 670 }, { "epoch": 0.1267891728470877, "grad_norm": 1.9517077789426318, "learning_rate": 9.98082795019075e-07, "logits/chosen": 0.634765625, "logits/rejected": 1.5556640625, "logps/chosen": -754.0, "logps/rejected": -2398.0, "loss": 0.7839, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3326416015625, "rewards/margins": 2.3046875, "rewards/rejected": -2.6328125, "step": 671 }, { "epoch": 0.12697812839529501, "grad_norm": 2.627446713613108, "learning_rate": 9.980553283740082e-07, "logits/chosen": 1.2333984375, "logits/rejected": 1.173828125, "logps/chosen": -715.5, "logps/rejected": -686.5, "loss": 0.7613, "rewards/accuracies": 0.75, "rewards/chosen": 0.10546875, "rewards/margins": 1.744140625, "rewards/rejected": -1.63671875, "step": 672 }, { "epoch": 0.1271670839435023, "grad_norm": 2.1582385733799088, "learning_rate": 9.98027666802743e-07, "logits/chosen": 0.84033203125, "logits/rejected": 1.4794921875, "logps/chosen": -908.0, "logps/rejected": -981.5, "loss": 0.7109, "rewards/accuracies": 0.78125, "rewards/chosen": 0.40625, "rewards/margins": 2.15625, "rewards/rejected": -1.75390625, "step": 673 }, { "epoch": 0.12735603949170957, "grad_norm": 1.87127596960193, "learning_rate": 9.979998103173133e-07, "logits/chosen": 1.373046875, "logits/rejected": 2.091796875, "logps/chosen": -586.0, "logps/rejected": -724.0, "loss": 0.72, "rewards/accuracies": 0.8125, "rewards/chosen": 0.246826171875, "rewards/margins": 1.775390625, "rewards/rejected": -1.53125, "step": 674 }, { "epoch": 0.12754499503991687, "grad_norm": 2.0965534441039, "learning_rate": 9.979717589298385e-07, "logits/chosen": 0.56402587890625, "logits/rejected": 0.4091796875, "logps/chosen": -781.0, "logps/rejected": -973.0, "loss": 0.6357, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31884765625, "rewards/margins": 2.025390625, "rewards/rejected": -1.705078125, "step": 675 }, { "epoch": 0.12773395058812415, "grad_norm": 2.0658818496747875, "learning_rate": 9.979435126525219e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.7080078125, "logps/chosen": -657.5, "logps/rejected": -724.5, "loss": 0.7565, "rewards/accuracies": 0.875, "rewards/chosen": -0.1251220703125, "rewards/margins": 1.728515625, "rewards/rejected": -1.853515625, "step": 676 }, { "epoch": 0.12792290613633142, "grad_norm": 2.8147045318885247, "learning_rate": 9.979150714976523e-07, "logits/chosen": 1.513671875, "logits/rejected": 1.49072265625, "logps/chosen": -1210.0, "logps/rejected": -1162.0, "loss": 0.6066, "rewards/accuracies": 0.875, "rewards/chosen": 0.873046875, "rewards/margins": 2.853515625, "rewards/rejected": -1.9736328125, "step": 677 }, { "epoch": 0.12811186168453872, "grad_norm": 2.301115059599815, "learning_rate": 9.97886435477603e-07, "logits/chosen": 0.345458984375, "logits/rejected": 0.029296875, "logps/chosen": -869.0, "logps/rejected": -16556.0, "loss": 0.6641, "rewards/accuracies": 0.84375, "rewards/chosen": 0.203369140625, "rewards/margins": 79.69921875, "rewards/rejected": -79.5625, "step": 678 }, { "epoch": 0.128300817232746, "grad_norm": 2.21590708761389, "learning_rate": 9.978576046048319e-07, "logits/chosen": 1.3857421875, "logits/rejected": 1.1806640625, "logps/chosen": -604.5, "logps/rejected": -636.5, "loss": 0.7298, "rewards/accuracies": 0.75, "rewards/chosen": 0.05712890625, "rewards/margins": 1.533203125, "rewards/rejected": -1.474609375, "step": 679 }, { "epoch": 0.12848977278095328, "grad_norm": 2.3104059054811814, "learning_rate": 9.97828578891882e-07, "logits/chosen": 1.06396484375, "logits/rejected": 1.345703125, "logps/chosen": -993.0, "logps/rejected": -1777.0, "loss": 0.5751, "rewards/accuracies": 0.9375, "rewards/chosen": 0.74658203125, "rewards/margins": 3.5546875, "rewards/rejected": -2.810546875, "step": 680 }, { "epoch": 0.12867872832916055, "grad_norm": 2.157061288599104, "learning_rate": 9.977993583513808e-07, "logits/chosen": 1.0615234375, "logits/rejected": 1.73577880859375, "logps/chosen": -760.0, "logps/rejected": -1277.0, "loss": 0.6767, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4241943359375, "rewards/margins": 2.482421875, "rewards/rejected": -2.060546875, "step": 681 }, { "epoch": 0.12886768387736786, "grad_norm": 1.7506228949545026, "learning_rate": 9.977699429960407e-07, "logits/chosen": 0.822265625, "logits/rejected": 1.3671875, "logps/chosen": -816.0, "logps/rejected": -910.0, "loss": 0.7144, "rewards/accuracies": 0.78125, "rewards/chosen": 0.66278076171875, "rewards/margins": 2.0234375, "rewards/rejected": -1.3642578125, "step": 682 }, { "epoch": 0.12905663942557513, "grad_norm": 2.207964672479215, "learning_rate": 9.97740332838659e-07, "logits/chosen": 1.06005859375, "logits/rejected": 0.7421875, "logps/chosen": -846.0, "logps/rejected": -807.0, "loss": 0.7558, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4957275390625, "rewards/margins": 1.4638671875, "rewards/rejected": -0.966796875, "step": 683 }, { "epoch": 0.1292455949737824, "grad_norm": 1.9210003629487518, "learning_rate": 9.977105278921175e-07, "logits/chosen": 0.91455078125, "logits/rejected": 1.3916015625, "logps/chosen": -628.5, "logps/rejected": -1510.5, "loss": 0.7399, "rewards/accuracies": 0.78125, "rewards/chosen": 0.265380859375, "rewards/margins": 2.064453125, "rewards/rejected": -1.8046875, "step": 684 }, { "epoch": 0.1294345505219897, "grad_norm": 2.0866641486311575, "learning_rate": 9.976805281693825e-07, "logits/chosen": 0.6204833984375, "logits/rejected": 1.03662109375, "logps/chosen": -797.75, "logps/rejected": -1526.0, "loss": 0.6729, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65283203125, "rewards/margins": 2.0205078125, "rewards/rejected": -1.3720703125, "step": 685 }, { "epoch": 0.129623506070197, "grad_norm": 1.7475605967893018, "learning_rate": 9.97650333683506e-07, "logits/chosen": 1.001220703125, "logits/rejected": 1.5087890625, "logps/chosen": -680.5, "logps/rejected": -688.0, "loss": 0.7061, "rewards/accuracies": 0.8125, "rewards/chosen": 0.475830078125, "rewards/margins": 1.908203125, "rewards/rejected": -1.431640625, "step": 686 }, { "epoch": 0.12981246161840426, "grad_norm": 1.9988481239968345, "learning_rate": 9.976199444476238e-07, "logits/chosen": 1.01513671875, "logits/rejected": 1.1806640625, "logps/chosen": -716.0, "logps/rejected": -18179.0, "loss": 0.6977, "rewards/accuracies": 0.875, "rewards/chosen": 0.6884765625, "rewards/margins": 33.8359375, "rewards/rejected": -33.259765625, "step": 687 }, { "epoch": 0.13000141716661157, "grad_norm": 2.090736265778818, "learning_rate": 9.975893604749565e-07, "logits/chosen": 0.64215087890625, "logits/rejected": 0.9521484375, "logps/chosen": -775.5, "logps/rejected": -747.0, "loss": 0.7846, "rewards/accuracies": 0.84375, "rewards/chosen": 0.21038818359375, "rewards/margins": 1.4580078125, "rewards/rejected": -1.2490234375, "step": 688 }, { "epoch": 0.13019037271481884, "grad_norm": 1.959103289115642, "learning_rate": 9.975585817788097e-07, "logits/chosen": 2.11328125, "logits/rejected": 2.318359375, "logps/chosen": -446.0, "logps/rejected": -517.0, "loss": 0.8074, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21844482421875, "rewards/margins": 1.13525390625, "rewards/rejected": -0.9173583984375, "step": 689 }, { "epoch": 0.13037932826302612, "grad_norm": 2.172108169000739, "learning_rate": 9.97527608372574e-07, "logits/chosen": 1.94921875, "logits/rejected": 2.8046875, "logps/chosen": -1401.0, "logps/rejected": -1618.0, "loss": 0.6835, "rewards/accuracies": 0.8125, "rewards/chosen": 0.50732421875, "rewards/margins": 2.734375, "rewards/rejected": -2.224609375, "step": 690 }, { "epoch": 0.13056828381123342, "grad_norm": 2.089378095597156, "learning_rate": 9.974964402697239e-07, "logits/chosen": 2.138671875, "logits/rejected": 2.19140625, "logps/chosen": -1283.0, "logps/rejected": -1169.0, "loss": 0.637, "rewards/accuracies": 0.875, "rewards/chosen": 1.01171875, "rewards/margins": 2.953125, "rewards/rejected": -1.939453125, "step": 691 }, { "epoch": 0.1307572393594407, "grad_norm": 1.935787971392424, "learning_rate": 9.974650774838193e-07, "logits/chosen": 0.44769287109375, "logits/rejected": 0.9677734375, "logps/chosen": -1131.0, "logps/rejected": -1147.5, "loss": 0.6443, "rewards/accuracies": 0.8125, "rewards/chosen": 1.076171875, "rewards/margins": 2.595703125, "rewards/rejected": -1.525390625, "step": 692 }, { "epoch": 0.13094619490764797, "grad_norm": 2.0819036916526055, "learning_rate": 9.974335200285046e-07, "logits/chosen": -0.00408935546875, "logits/rejected": 0.7734375, "logps/chosen": -775.0, "logps/rejected": -1350.0, "loss": 0.7743, "rewards/accuracies": 0.6875, "rewards/chosen": 0.360321044921875, "rewards/margins": 1.859375, "rewards/rejected": -1.498046875, "step": 693 }, { "epoch": 0.13113515045585525, "grad_norm": 1.931847785749339, "learning_rate": 9.974017679175088e-07, "logits/chosen": 0.661865234375, "logits/rejected": 0.533447265625, "logps/chosen": -920.0, "logps/rejected": -1033.0, "loss": 0.7247, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7088623046875, "rewards/margins": 2.392578125, "rewards/rejected": -1.6796875, "step": 694 }, { "epoch": 0.13132410600406255, "grad_norm": 5.631922990919145, "learning_rate": 9.973698211646457e-07, "logits/chosen": 1.398681640625, "logits/rejected": 1.36083984375, "logps/chosen": -909.5, "logps/rejected": -862.0, "loss": 0.8124, "rewards/accuracies": 0.84375, "rewards/chosen": -0.043701171875, "rewards/margins": 1.427734375, "rewards/rejected": -1.470703125, "step": 695 }, { "epoch": 0.13151306155226983, "grad_norm": 2.1152136229922425, "learning_rate": 9.973376797838135e-07, "logits/chosen": 1.87939453125, "logits/rejected": 1.017822265625, "logps/chosen": -1066.0, "logps/rejected": -723.0, "loss": 0.8089, "rewards/accuracies": 0.71875, "rewards/chosen": -0.078857421875, "rewards/margins": 1.2421875, "rewards/rejected": -1.3212890625, "step": 696 }, { "epoch": 0.1317020171004771, "grad_norm": 2.276673643960669, "learning_rate": 9.973053437889955e-07, "logits/chosen": 1.3466796875, "logits/rejected": 1.36181640625, "logps/chosen": -912.0, "logps/rejected": -735.0, "loss": 0.7264, "rewards/accuracies": 0.84375, "rewards/chosen": 0.572509765625, "rewards/margins": 1.966796875, "rewards/rejected": -1.396484375, "step": 697 }, { "epoch": 0.1318909726486844, "grad_norm": 1.7048878091016249, "learning_rate": 9.972728131942593e-07, "logits/chosen": 0.3944091796875, "logits/rejected": 0.90234375, "logps/chosen": -887.0, "logps/rejected": -1220.5, "loss": 0.681, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4943695068359375, "rewards/margins": 3.04296875, "rewards/rejected": -2.544921875, "step": 698 }, { "epoch": 0.13207992819689168, "grad_norm": 2.145214893942157, "learning_rate": 9.972400880137576e-07, "logits/chosen": 0.5400390625, "logits/rejected": 1.03515625, "logps/chosen": -636.0, "logps/rejected": -790.0, "loss": 0.7497, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26531982421875, "rewards/margins": 1.65234375, "rewards/rejected": -1.3876953125, "step": 699 }, { "epoch": 0.13226888374509896, "grad_norm": 2.1146418324560177, "learning_rate": 9.972071682617272e-07, "logits/chosen": 0.55938720703125, "logits/rejected": 0.4931640625, "logps/chosen": -1868.0, "logps/rejected": -762.0, "loss": 0.7325, "rewards/accuracies": 0.78125, "rewards/chosen": 0.091552734375, "rewards/margins": 1.2431640625, "rewards/rejected": -1.154296875, "step": 700 }, { "epoch": 0.13245783929330626, "grad_norm": 2.1920786086834005, "learning_rate": 9.971740539524898e-07, "logits/chosen": 0.6025390625, "logits/rejected": 0.58740234375, "logps/chosen": -602.5, "logps/rejected": -776.0, "loss": 0.7211, "rewards/accuracies": 0.8125, "rewards/chosen": 0.116943359375, "rewards/margins": 2.0546875, "rewards/rejected": -1.935546875, "step": 701 }, { "epoch": 0.13264679484151354, "grad_norm": 1.5707637074017007, "learning_rate": 9.97140745100452e-07, "logits/chosen": 0.7119140625, "logits/rejected": 1.815185546875, "logps/chosen": -973.0, "logps/rejected": -1086.0, "loss": 0.7008, "rewards/accuracies": 0.71875, "rewards/chosen": 1.140869140625, "rewards/margins": 2.9609375, "rewards/rejected": -1.828125, "step": 702 }, { "epoch": 0.1328357503897208, "grad_norm": 1.9813467742659518, "learning_rate": 9.971072417201047e-07, "logits/chosen": -0.13671875, "logits/rejected": 0.23486328125, "logps/chosen": -805.5, "logps/rejected": -976.5, "loss": 0.7018, "rewards/accuracies": 0.78125, "rewards/chosen": 0.759521484375, "rewards/margins": 2.552734375, "rewards/rejected": -1.7939453125, "step": 703 }, { "epoch": 0.1330247059379281, "grad_norm": 1.9446871750068457, "learning_rate": 9.970735438260234e-07, "logits/chosen": 0.91015625, "logits/rejected": 1.01708984375, "logps/chosen": -810.0, "logps/rejected": -578.5, "loss": 0.7631, "rewards/accuracies": 0.78125, "rewards/chosen": 0.20965576171875, "rewards/margins": 1.5029296875, "rewards/rejected": -1.2900390625, "step": 704 }, { "epoch": 0.1332136614861354, "grad_norm": 1.5570115562721434, "learning_rate": 9.970396514328682e-07, "logits/chosen": 0.787109375, "logits/rejected": 0.2685546875, "logps/chosen": -367.25, "logps/rejected": -353.75, "loss": 0.7495, "rewards/accuracies": 0.875, "rewards/chosen": 0.102783203125, "rewards/margins": 1.2958984375, "rewards/rejected": -1.1923828125, "step": 705 }, { "epoch": 0.13340261703434267, "grad_norm": 1.9726493463292483, "learning_rate": 9.970055645553843e-07, "logits/chosen": -0.0218505859375, "logits/rejected": 0.330810546875, "logps/chosen": -524.0, "logps/rejected": -655.0, "loss": 0.7266, "rewards/accuracies": 0.875, "rewards/chosen": 0.34503173828125, "rewards/margins": 1.720703125, "rewards/rejected": -1.3779296875, "step": 706 }, { "epoch": 0.13359157258254994, "grad_norm": 2.4246038324341153, "learning_rate": 9.96971283208401e-07, "logits/chosen": 0.279541015625, "logits/rejected": 0.445556640625, "logps/chosen": -809.5, "logps/rejected": -963.0, "loss": 0.7532, "rewards/accuracies": 0.78125, "rewards/chosen": 0.00146484375, "rewards/margins": 2.123046875, "rewards/rejected": -2.119140625, "step": 707 }, { "epoch": 0.13378052813075725, "grad_norm": 2.0437193442423185, "learning_rate": 9.969368074068328e-07, "logits/chosen": 1.0244140625, "logits/rejected": 1.4267578125, "logps/chosen": -975.0, "logps/rejected": -893.0, "loss": 0.6939, "rewards/accuracies": 0.875, "rewards/chosen": 0.68701171875, "rewards/margins": 2.4609375, "rewards/rejected": -1.7734375, "step": 708 }, { "epoch": 0.13396948367896452, "grad_norm": 2.13135110374336, "learning_rate": 9.96902137165678e-07, "logits/chosen": 0.7783203125, "logits/rejected": 0.53369140625, "logps/chosen": -721.5, "logps/rejected": -686.5, "loss": 0.7374, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1334228515625, "rewards/margins": 1.947265625, "rewards/rejected": -2.080078125, "step": 709 }, { "epoch": 0.1341584392271718, "grad_norm": 1.9879751136864037, "learning_rate": 9.968672725000194e-07, "logits/chosen": 0.1561279296875, "logits/rejected": -0.147216796875, "logps/chosen": -1053.5, "logps/rejected": -712.0, "loss": 0.7028, "rewards/accuracies": 0.84375, "rewards/chosen": 0.292633056640625, "rewards/margins": 2.24609375, "rewards/rejected": -1.94921875, "step": 710 }, { "epoch": 0.1343473947753791, "grad_norm": 1.9671661699636207, "learning_rate": 9.968322134250257e-07, "logits/chosen": 0.6220703125, "logits/rejected": 0.7646484375, "logps/chosen": -894.5, "logps/rejected": -700.0, "loss": 0.7201, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1339111328125, "rewards/margins": 1.75390625, "rewards/rejected": -1.890625, "step": 711 }, { "epoch": 0.13453635032358638, "grad_norm": 2.0494444094771795, "learning_rate": 9.96796959955949e-07, "logits/chosen": 0.764404296875, "logits/rejected": 0.87939453125, "logps/chosen": -894.0, "logps/rejected": -1132.0, "loss": 0.6907, "rewards/accuracies": 0.78125, "rewards/chosen": 0.46038818359375, "rewards/margins": 2.353515625, "rewards/rejected": -1.8935546875, "step": 712 }, { "epoch": 0.13472530587179365, "grad_norm": 2.1071652828335066, "learning_rate": 9.96761512108126e-07, "logits/chosen": 0.5478515625, "logits/rejected": 0.3056640625, "logps/chosen": -714.5, "logps/rejected": -680.0, "loss": 0.7673, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1151123046875, "rewards/margins": 1.595703125, "rewards/rejected": -1.4755859375, "step": 713 }, { "epoch": 0.13491426142000096, "grad_norm": 1.9293680163342564, "learning_rate": 9.967258698969787e-07, "logits/chosen": 1.0718994140625, "logits/rejected": 1.1025390625, "logps/chosen": -1014.0, "logps/rejected": -903.25, "loss": 0.7257, "rewards/accuracies": 0.84375, "rewards/chosen": 0.05133056640625, "rewards/margins": 1.986328125, "rewards/rejected": -1.9345703125, "step": 714 }, { "epoch": 0.13510321696820823, "grad_norm": 2.1138177117834167, "learning_rate": 9.96690033338013e-07, "logits/chosen": 0.22802734375, "logits/rejected": -0.083984375, "logps/chosen": -599.0, "logps/rejected": -600.0, "loss": 0.8083, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25213623046875, "rewards/margins": 1.447021484375, "rewards/rejected": -1.69921875, "step": 715 }, { "epoch": 0.1352921725164155, "grad_norm": 1.8828050404307901, "learning_rate": 9.966540024468195e-07, "logits/chosen": 0.7406005859375, "logits/rejected": 0.78564453125, "logps/chosen": -1133.0, "logps/rejected": -551.5, "loss": 0.7258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.408203125, "rewards/margins": 0.9921875, "rewards/rejected": -1.4013671875, "step": 716 }, { "epoch": 0.13548112806462279, "grad_norm": 2.154452112348546, "learning_rate": 9.96617777239074e-07, "logits/chosen": 0.60498046875, "logits/rejected": 0.84228515625, "logps/chosen": -740.5, "logps/rejected": -726.0, "loss": 0.7213, "rewards/accuracies": 0.78125, "rewards/chosen": 0.14605712890625, "rewards/margins": 1.79296875, "rewards/rejected": -1.646484375, "step": 717 }, { "epoch": 0.1356700836128301, "grad_norm": 1.9620916845889473, "learning_rate": 9.965813577305354e-07, "logits/chosen": 0.3447265625, "logits/rejected": 1.2421875, "logps/chosen": -874.0, "logps/rejected": -2181.0, "loss": 0.7135, "rewards/accuracies": 0.8125, "rewards/chosen": 0.254150390625, "rewards/margins": 2.935546875, "rewards/rejected": -2.677734375, "step": 718 }, { "epoch": 0.13585903916103736, "grad_norm": 2.107929151244693, "learning_rate": 9.965447439370485e-07, "logits/chosen": 0.12060546875, "logits/rejected": 0.3701934814453125, "logps/chosen": -1010.5, "logps/rejected": -1314.0, "loss": 0.6353, "rewards/accuracies": 0.875, "rewards/chosen": 0.73382568359375, "rewards/margins": 2.84375, "rewards/rejected": -2.111328125, "step": 719 }, { "epoch": 0.13604799470924464, "grad_norm": 2.1004711187830782, "learning_rate": 9.96507935874542e-07, "logits/chosen": 0.2353515625, "logits/rejected": 0.390380859375, "logps/chosen": -669.0, "logps/rejected": -2104.5, "loss": 0.718, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1680908203125, "rewards/margins": 3.1318359375, "rewards/rejected": -2.966796875, "step": 720 }, { "epoch": 0.13623695025745194, "grad_norm": 2.2482161617746157, "learning_rate": 9.96470933559029e-07, "logits/chosen": 1.12890625, "logits/rejected": 1.625, "logps/chosen": -728.0, "logps/rejected": -1178.5, "loss": 0.6185, "rewards/accuracies": 0.78125, "rewards/chosen": 0.935546875, "rewards/margins": 2.3515625, "rewards/rejected": -1.4150390625, "step": 721 }, { "epoch": 0.13642590580565922, "grad_norm": 1.9179045027790398, "learning_rate": 9.96433737006608e-07, "logits/chosen": 0.59326171875, "logits/rejected": 0.7279052734375, "logps/chosen": -748.0, "logps/rejected": -616.0, "loss": 0.694, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4581298828125, "rewards/margins": 1.79296875, "rewards/rejected": -1.3349609375, "step": 722 }, { "epoch": 0.1366148613538665, "grad_norm": 2.370780359090241, "learning_rate": 9.963963462334605e-07, "logits/chosen": 0.8603515625, "logits/rejected": 0.57275390625, "logps/chosen": -758.0, "logps/rejected": -759.0, "loss": 0.7418, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4931640625, "rewards/margins": 1.3037109375, "rewards/rejected": -0.81005859375, "step": 723 }, { "epoch": 0.1368038169020738, "grad_norm": 2.012048990042175, "learning_rate": 9.963587612558538e-07, "logits/chosen": 0.8017578125, "logits/rejected": 1.41015625, "logps/chosen": -634.0, "logps/rejected": -940.0, "loss": 0.7057, "rewards/accuracies": 0.78125, "rewards/chosen": 0.46875, "rewards/margins": 2.267578125, "rewards/rejected": -1.798828125, "step": 724 }, { "epoch": 0.13699277245028108, "grad_norm": 2.222888931916807, "learning_rate": 9.963209820901393e-07, "logits/chosen": 1.630859375, "logits/rejected": 1.74609375, "logps/chosen": -844.0, "logps/rejected": -831.5, "loss": 0.7687, "rewards/accuracies": 0.75, "rewards/chosen": 0.8101806640625, "rewards/margins": 1.692138671875, "rewards/rejected": -0.8818359375, "step": 725 }, { "epoch": 0.13718172799848835, "grad_norm": 1.9682696217309346, "learning_rate": 9.962830087527525e-07, "logits/chosen": 0.440185546875, "logits/rejected": 1.0206298828125, "logps/chosen": -489.5, "logps/rejected": -650.5, "loss": 0.7568, "rewards/accuracies": 0.875, "rewards/chosen": 0.187530517578125, "rewards/margins": 1.6484375, "rewards/rejected": -1.4609375, "step": 726 }, { "epoch": 0.13737068354669563, "grad_norm": 2.1006484770526668, "learning_rate": 9.962448412602138e-07, "logits/chosen": 0.722412109375, "logits/rejected": 0.9803466796875, "logps/chosen": -896.5, "logps/rejected": -814.0, "loss": 0.7391, "rewards/accuracies": 0.78125, "rewards/chosen": 0.51318359375, "rewards/margins": 1.7998046875, "rewards/rejected": -1.28515625, "step": 727 }, { "epoch": 0.13755963909490293, "grad_norm": 1.6523129577146247, "learning_rate": 9.962064796291282e-07, "logits/chosen": 1.677734375, "logits/rejected": 1.62890625, "logps/chosen": -625.0, "logps/rejected": -728.5, "loss": 0.7439, "rewards/accuracies": 0.78125, "rewards/chosen": 0.28546142578125, "rewards/margins": 1.668212890625, "rewards/rejected": -1.3837890625, "step": 728 }, { "epoch": 0.1377485946431102, "grad_norm": 1.7923578335615933, "learning_rate": 9.961679238761845e-07, "logits/chosen": 0.9794921875, "logits/rejected": 1.7919921875, "logps/chosen": -771.0, "logps/rejected": -1266.0, "loss": 0.7532, "rewards/accuracies": 0.75, "rewards/chosen": 0.533935546875, "rewards/margins": 2.52734375, "rewards/rejected": -1.990234375, "step": 729 }, { "epoch": 0.13793755019131748, "grad_norm": 1.998013593230295, "learning_rate": 9.961291740181566e-07, "logits/chosen": 0.6072998046875, "logits/rejected": 1.232177734375, "logps/chosen": -418.5, "logps/rejected": -778.0, "loss": 0.8866, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12188720703125, "rewards/margins": 0.5849609375, "rewards/rejected": -0.462890625, "step": 730 }, { "epoch": 0.13812650573952479, "grad_norm": 2.029320756536463, "learning_rate": 9.960902300719023e-07, "logits/chosen": 0.3360595703125, "logits/rejected": 0.91162109375, "logps/chosen": -1187.0, "logps/rejected": -1314.0, "loss": 0.689, "rewards/accuracies": 0.875, "rewards/chosen": 0.761474609375, "rewards/margins": 2.8564453125, "rewards/rejected": -2.0966796875, "step": 731 }, { "epoch": 0.13831546128773206, "grad_norm": 2.3175997491692715, "learning_rate": 9.960510920543648e-07, "logits/chosen": 1.69091796875, "logits/rejected": 1.2880859375, "logps/chosen": -1350.0, "logps/rejected": -798.0, "loss": 0.6572, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1546630859375, "rewards/margins": 2.00244140625, "rewards/rejected": -1.845703125, "step": 732 }, { "epoch": 0.13850441683593934, "grad_norm": 2.0065044663439635, "learning_rate": 9.960117599825704e-07, "logits/chosen": 1.1279296875, "logits/rejected": 1.23046875, "logps/chosen": -932.0, "logps/rejected": -770.5, "loss": 0.7425, "rewards/accuracies": 0.78125, "rewards/chosen": 0.36334228515625, "rewards/margins": 1.541015625, "rewards/rejected": -1.1748046875, "step": 733 }, { "epoch": 0.13869337238414664, "grad_norm": 1.8518393802126965, "learning_rate": 9.959722338736309e-07, "logits/chosen": 0.32080078125, "logits/rejected": 0.9462890625, "logps/chosen": -817.0, "logps/rejected": -953.5, "loss": 0.7047, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5385284423828125, "rewards/margins": 2.7412109375, "rewards/rejected": -2.20703125, "step": 734 }, { "epoch": 0.13888232793235392, "grad_norm": 2.0418047810711917, "learning_rate": 9.959325137447416e-07, "logits/chosen": 1.39453125, "logits/rejected": 1.283203125, "logps/chosen": -775.0, "logps/rejected": -1350.5, "loss": 0.7284, "rewards/accuracies": 0.84375, "rewards/chosen": 0.779541015625, "rewards/margins": 2.412109375, "rewards/rejected": -1.6337890625, "step": 735 }, { "epoch": 0.1390712834805612, "grad_norm": 1.9198255476931287, "learning_rate": 9.958925996131833e-07, "logits/chosen": 0.951171875, "logits/rejected": 1.484375, "logps/chosen": -1155.0, "logps/rejected": -967.0, "loss": 0.6903, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8017578125, "rewards/margins": 2.56640625, "rewards/rejected": -1.7646484375, "step": 736 }, { "epoch": 0.1392602390287685, "grad_norm": 1.912972178707006, "learning_rate": 9.958524914963204e-07, "logits/chosen": 1.3798828125, "logits/rejected": 1.779296875, "logps/chosen": -550.0, "logps/rejected": -578.0, "loss": 0.8152, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2183837890625, "rewards/margins": 1.20263671875, "rewards/rejected": -0.98583984375, "step": 737 }, { "epoch": 0.13944919457697577, "grad_norm": 1.543908950749828, "learning_rate": 9.958121894116018e-07, "logits/chosen": 1.83203125, "logits/rejected": 1.60205078125, "logps/chosen": -1789.0, "logps/rejected": -966.0, "loss": 0.6113, "rewards/accuracies": 0.84375, "rewards/chosen": -0.147216796875, "rewards/margins": 2.353515625, "rewards/rejected": -2.505859375, "step": 738 }, { "epoch": 0.13963815012518305, "grad_norm": 2.0548150196879416, "learning_rate": 9.95771693376561e-07, "logits/chosen": 0.51123046875, "logits/rejected": 0.469146728515625, "logps/chosen": -744.5, "logps/rejected": -795.0, "loss": 0.7025, "rewards/accuracies": 0.875, "rewards/chosen": 0.5029296875, "rewards/margins": 4.1171875, "rewards/rejected": -3.6083984375, "step": 739 }, { "epoch": 0.13982710567339032, "grad_norm": 1.9548637177020844, "learning_rate": 9.95731003408816e-07, "logits/chosen": 0.85888671875, "logits/rejected": 0.79608154296875, "logps/chosen": -534.0, "logps/rejected": -557.0, "loss": 0.7404, "rewards/accuracies": 0.84375, "rewards/chosen": 0.199554443359375, "rewards/margins": 1.494140625, "rewards/rejected": -1.29296875, "step": 740 }, { "epoch": 0.14001606122159763, "grad_norm": 2.0997293062620748, "learning_rate": 9.956901195260683e-07, "logits/chosen": 0.98126220703125, "logits/rejected": 1.11767578125, "logps/chosen": -1299.0, "logps/rejected": -1480.0, "loss": 0.7199, "rewards/accuracies": 0.78125, "rewards/chosen": 0.13555908203125, "rewards/margins": 2.115234375, "rewards/rejected": -1.9765625, "step": 741 }, { "epoch": 0.1402050167698049, "grad_norm": 2.499977150276758, "learning_rate": 9.956490417461051e-07, "logits/chosen": 0.587890625, "logits/rejected": 0.990234375, "logps/chosen": -1172.0, "logps/rejected": -817.5, "loss": 0.6204, "rewards/accuracies": 0.875, "rewards/chosen": 0.5985107421875, "rewards/margins": 2.3359375, "rewards/rejected": -1.740234375, "step": 742 }, { "epoch": 0.14039397231801218, "grad_norm": 1.8141745445304283, "learning_rate": 9.95607770086797e-07, "logits/chosen": 1.1875, "logits/rejected": 1.31640625, "logps/chosen": -579.5, "logps/rejected": -1559.0, "loss": 0.751, "rewards/accuracies": 0.78125, "rewards/chosen": 0.251708984375, "rewards/margins": 2.0576171875, "rewards/rejected": -1.8076171875, "step": 743 }, { "epoch": 0.14058292786621948, "grad_norm": 1.7007306677376781, "learning_rate": 9.955663045660992e-07, "logits/chosen": 0.326171875, "logits/rejected": 0.32080078125, "logps/chosen": -600.5, "logps/rejected": -722.5, "loss": 0.7098, "rewards/accuracies": 0.84375, "rewards/chosen": 0.16064453125, "rewards/margins": 2.029296875, "rewards/rejected": -1.869140625, "step": 744 }, { "epoch": 0.14077188341442676, "grad_norm": 2.1278954360216495, "learning_rate": 9.955246452020512e-07, "logits/chosen": 0.5810546875, "logits/rejected": 1.25048828125, "logps/chosen": -989.0, "logps/rejected": -1060.0, "loss": 0.5757, "rewards/accuracies": 0.9375, "rewards/chosen": 0.765380859375, "rewards/margins": 3.1953125, "rewards/rejected": -2.423828125, "step": 745 }, { "epoch": 0.14096083896263403, "grad_norm": 2.062948852776286, "learning_rate": 9.95482792012777e-07, "logits/chosen": 1.0439453125, "logits/rejected": 1.6337890625, "logps/chosen": -791.0, "logps/rejected": -846.0, "loss": 0.6583, "rewards/accuracies": 0.8125, "rewards/chosen": 0.54541015625, "rewards/margins": 2.748046875, "rewards/rejected": -2.201171875, "step": 746 }, { "epoch": 0.14114979451084134, "grad_norm": 1.8047774474575977, "learning_rate": 9.954407450164847e-07, "logits/chosen": 0.94921875, "logits/rejected": 2.100830078125, "logps/chosen": -611.0, "logps/rejected": -1690.5, "loss": 0.6387, "rewards/accuracies": 0.90625, "rewards/chosen": 0.59521484375, "rewards/margins": 3.30859375, "rewards/rejected": -2.712890625, "step": 747 }, { "epoch": 0.1413387500590486, "grad_norm": 1.8498282338239116, "learning_rate": 9.953985042314665e-07, "logits/chosen": 0.85003662109375, "logits/rejected": 0.2872314453125, "logps/chosen": -736.0, "logps/rejected": -690.0, "loss": 0.6142, "rewards/accuracies": 0.9375, "rewards/chosen": 0.60205078125, "rewards/margins": 2.51171875, "rewards/rejected": -1.90234375, "step": 748 }, { "epoch": 0.1415277056072559, "grad_norm": 2.7723202731672907, "learning_rate": 9.953560696761e-07, "logits/chosen": 1.3291015625, "logits/rejected": 1.60205078125, "logps/chosen": -736.0, "logps/rejected": -946.0, "loss": 0.7165, "rewards/accuracies": 0.84375, "rewards/chosen": 0.381134033203125, "rewards/margins": 1.943359375, "rewards/rejected": -1.560546875, "step": 749 }, { "epoch": 0.14171666115546316, "grad_norm": 2.0468312965024555, "learning_rate": 9.953134413688458e-07, "logits/chosen": 0.78759765625, "logits/rejected": 0.6484375, "logps/chosen": -934.0, "logps/rejected": -1119.5, "loss": 0.6547, "rewards/accuracies": 0.90625, "rewards/chosen": 0.64599609375, "rewards/margins": 2.51171875, "rewards/rejected": -1.8671875, "step": 750 }, { "epoch": 0.14190561670367047, "grad_norm": 1.624216424497592, "learning_rate": 9.952706193282495e-07, "logits/chosen": 1.224609375, "logits/rejected": 1.220703125, "logps/chosen": -793.5, "logps/rejected": -889.5, "loss": 0.7505, "rewards/accuracies": 0.625, "rewards/chosen": 0.47119140625, "rewards/margins": 2.2841796875, "rewards/rejected": -1.818359375, "step": 751 }, { "epoch": 0.14209457225187774, "grad_norm": 1.8562123736355807, "learning_rate": 9.952276035729406e-07, "logits/chosen": 0.51171875, "logits/rejected": 1.0947265625, "logps/chosen": -728.5, "logps/rejected": -1773.0, "loss": 0.711, "rewards/accuracies": 0.78125, "rewards/chosen": 0.306640625, "rewards/margins": 2.640625, "rewards/rejected": -2.337890625, "step": 752 }, { "epoch": 0.14228352780008502, "grad_norm": 87.39699228211265, "learning_rate": 9.951843941216332e-07, "logits/chosen": 1.0615234375, "logits/rejected": 1.5625, "logps/chosen": -1248.5, "logps/rejected": -932.0, "loss": 0.7478, "rewards/accuracies": 0.75, "rewards/chosen": 0.226715087890625, "rewards/margins": 1.998046875, "rewards/rejected": -1.7734375, "step": 753 }, { "epoch": 0.14247248334829232, "grad_norm": 1.6619659515434948, "learning_rate": 9.951409909931256e-07, "logits/chosen": 0.63134765625, "logits/rejected": 1.248046875, "logps/chosen": -801.5, "logps/rejected": -1140.5, "loss": 0.6451, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4512176513671875, "rewards/margins": 2.802734375, "rewards/rejected": -2.3515625, "step": 754 }, { "epoch": 0.1426614388964996, "grad_norm": 1.7679423049306386, "learning_rate": 9.950973942063003e-07, "logits/chosen": 0.2425537109375, "logits/rejected": 0.7469482421875, "logps/chosen": -964.0, "logps/rejected": -1171.5, "loss": 0.7102, "rewards/accuracies": 0.6875, "rewards/chosen": 0.55426025390625, "rewards/margins": 2.421875, "rewards/rejected": -1.876953125, "step": 755 }, { "epoch": 0.14285039444470687, "grad_norm": 2.619726357971259, "learning_rate": 9.95053603780124e-07, "logits/chosen": 0.5484619140625, "logits/rejected": 0.596923828125, "logps/chosen": -870.0, "logps/rejected": -727.0, "loss": 0.7772, "rewards/accuracies": 0.78125, "rewards/chosen": 0.572265625, "rewards/margins": 1.53173828125, "rewards/rejected": -0.9593505859375, "step": 756 }, { "epoch": 0.14303934999291418, "grad_norm": 1.909810816440544, "learning_rate": 9.950096197336477e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.4237060546875, "logps/chosen": -746.0, "logps/rejected": -754.5, "loss": 0.6703, "rewards/accuracies": 0.875, "rewards/chosen": 0.1671142578125, "rewards/margins": 2.236328125, "rewards/rejected": -2.07421875, "step": 757 }, { "epoch": 0.14322830554112145, "grad_norm": 2.2050297612044565, "learning_rate": 9.949654420860067e-07, "logits/chosen": 0.149658203125, "logits/rejected": 0.2802734375, "logps/chosen": -1010.0, "logps/rejected": -607.5, "loss": 0.726, "rewards/accuracies": 0.78125, "rewards/chosen": 0.23870849609375, "rewards/margins": 2.03125, "rewards/rejected": -1.79296875, "step": 758 }, { "epoch": 0.14341726108932873, "grad_norm": 2.231096810338041, "learning_rate": 9.949210708564204e-07, "logits/chosen": 0.966796875, "logits/rejected": 1.724609375, "logps/chosen": -588.0, "logps/rejected": -872.5, "loss": 0.7579, "rewards/accuracies": 0.71875, "rewards/chosen": 0.294921875, "rewards/margins": 1.560546875, "rewards/rejected": -1.2626953125, "step": 759 }, { "epoch": 0.14360621663753603, "grad_norm": 1.914985813972015, "learning_rate": 9.948765060641923e-07, "logits/chosen": 0.0830078125, "logits/rejected": 0.193603515625, "logps/chosen": -649.0, "logps/rejected": -667.0, "loss": 0.6275, "rewards/accuracies": 0.875, "rewards/chosen": 0.37646484375, "rewards/margins": 2.36328125, "rewards/rejected": -1.98828125, "step": 760 }, { "epoch": 0.1437951721857433, "grad_norm": 2.0954834269215383, "learning_rate": 9.948317477287106e-07, "logits/chosen": 0.269287109375, "logits/rejected": 0.364013671875, "logps/chosen": -1100.0, "logps/rejected": -742.0, "loss": 0.64, "rewards/accuracies": 0.90625, "rewards/chosen": 0.75537109375, "rewards/margins": 2.609375, "rewards/rejected": -1.8515625, "step": 761 }, { "epoch": 0.14398412773395058, "grad_norm": 1.8947656468841365, "learning_rate": 9.947867958694474e-07, "logits/chosen": 0.65380859375, "logits/rejected": 1.1875, "logps/chosen": -635.5, "logps/rejected": -708.5, "loss": 0.6742, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1986083984375, "rewards/margins": 2.224609375, "rewards/rejected": -2.02734375, "step": 762 }, { "epoch": 0.14417308328215786, "grad_norm": 2.02893474863704, "learning_rate": 9.947416505059587e-07, "logits/chosen": 0.81396484375, "logits/rejected": 0.6611328125, "logps/chosen": -876.0, "logps/rejected": -971.5, "loss": 0.6472, "rewards/accuracies": 0.96875, "rewards/chosen": 0.500762939453125, "rewards/margins": 2.365234375, "rewards/rejected": -1.8671875, "step": 763 }, { "epoch": 0.14436203883036516, "grad_norm": 1.8870606921541595, "learning_rate": 9.946963116578848e-07, "logits/chosen": 0.79345703125, "logits/rejected": 0.87939453125, "logps/chosen": -598.0, "logps/rejected": -807.0, "loss": 0.707, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1771240234375, "rewards/margins": 1.853515625, "rewards/rejected": -1.6767578125, "step": 764 }, { "epoch": 0.14455099437857244, "grad_norm": 1.8369492835190555, "learning_rate": 9.94650779344951e-07, "logits/chosen": 0.318359375, "logits/rejected": 0.753021240234375, "logps/chosen": -845.5, "logps/rejected": -1262.0, "loss": 0.6781, "rewards/accuracies": 0.78125, "rewards/chosen": 0.32275390625, "rewards/margins": 2.962890625, "rewards/rejected": -2.638671875, "step": 765 }, { "epoch": 0.14473994992677972, "grad_norm": 2.017872519957772, "learning_rate": 9.946050535869657e-07, "logits/chosen": 0.3824462890625, "logits/rejected": 0.180908203125, "logps/chosen": -630.0, "logps/rejected": -682.0, "loss": 0.6585, "rewards/accuracies": 0.90625, "rewards/chosen": 0.273681640625, "rewards/margins": 1.880859375, "rewards/rejected": -1.607421875, "step": 766 }, { "epoch": 0.14492890547498702, "grad_norm": 2.130331758592837, "learning_rate": 9.945591344038219e-07, "logits/chosen": 1.3974609375, "logits/rejected": 1.802734375, "logps/chosen": -817.0, "logps/rejected": -1073.0, "loss": 0.7035, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6103515625, "rewards/margins": 2.0859375, "rewards/rejected": -1.474609375, "step": 767 }, { "epoch": 0.1451178610231943, "grad_norm": 2.052427476969908, "learning_rate": 9.945130218154965e-07, "logits/chosen": 0.949951171875, "logits/rejected": 1.3935546875, "logps/chosen": -893.0, "logps/rejected": -757.0, "loss": 0.7324, "rewards/accuracies": 0.6875, "rewards/chosen": 0.65087890625, "rewards/margins": 2.02734375, "rewards/rejected": -1.376953125, "step": 768 }, { "epoch": 0.14530681657140157, "grad_norm": 1.9808296067973379, "learning_rate": 9.944667158420512e-07, "logits/chosen": 0.8056640625, "logits/rejected": 1.3818359375, "logps/chosen": -696.0, "logps/rejected": -2355.0, "loss": 0.7679, "rewards/accuracies": 0.8125, "rewards/chosen": -0.001220703125, "rewards/margins": 2.484375, "rewards/rejected": -2.4833984375, "step": 769 }, { "epoch": 0.14549577211960887, "grad_norm": 1.746736329909086, "learning_rate": 9.94420216503631e-07, "logits/chosen": 0.75, "logits/rejected": 1.8173828125, "logps/chosen": -638.5, "logps/rejected": -2065.5, "loss": 0.716, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1966552734375, "rewards/margins": 3.4072265625, "rewards/rejected": -3.203125, "step": 770 }, { "epoch": 0.14568472766781615, "grad_norm": 2.1629220206041184, "learning_rate": 9.943735238204657e-07, "logits/chosen": 0.91064453125, "logits/rejected": 1.238037109375, "logps/chosen": -686.0, "logps/rejected": -1404.5, "loss": 0.7592, "rewards/accuracies": 0.75, "rewards/chosen": 0.290283203125, "rewards/margins": 1.3818359375, "rewards/rejected": -1.09033203125, "step": 771 }, { "epoch": 0.14587368321602343, "grad_norm": 2.181249705068788, "learning_rate": 9.943266378128688e-07, "logits/chosen": 1.6376953125, "logits/rejected": 1.31689453125, "logps/chosen": -833.5, "logps/rejected": -959.5, "loss": 0.6642, "rewards/accuracies": 0.875, "rewards/chosen": 0.5467529296875, "rewards/margins": 2.25, "rewards/rejected": -1.705078125, "step": 772 }, { "epoch": 0.1460626387642307, "grad_norm": 1.8856534212324043, "learning_rate": 9.942795585012378e-07, "logits/chosen": 0.7601318359375, "logits/rejected": 0.55615234375, "logps/chosen": -909.0, "logps/rejected": -719.0, "loss": 0.6722, "rewards/accuracies": 0.78125, "rewards/chosen": 0.648193359375, "rewards/margins": 2.2998046875, "rewards/rejected": -1.65625, "step": 773 }, { "epoch": 0.146251594312438, "grad_norm": 1.6381247312130676, "learning_rate": 9.942322859060551e-07, "logits/chosen": 1.868896484375, "logits/rejected": 1.402099609375, "logps/chosen": -959.0, "logps/rejected": -884.0, "loss": 0.7254, "rewards/accuracies": 0.8125, "rewards/chosen": 0.32568359375, "rewards/margins": 2.216796875, "rewards/rejected": -1.890625, "step": 774 }, { "epoch": 0.14644054986064528, "grad_norm": 1.8997275356324064, "learning_rate": 9.94184820047886e-07, "logits/chosen": 0.9466552734375, "logits/rejected": 0.730712890625, "logps/chosen": -646.0, "logps/rejected": -552.0, "loss": 0.7402, "rewards/accuracies": 0.8125, "rewards/chosen": 0.43023681640625, "rewards/margins": 1.5498046875, "rewards/rejected": -1.120819091796875, "step": 775 }, { "epoch": 0.14662950540885256, "grad_norm": 2.3860004381325277, "learning_rate": 9.941371609473808e-07, "logits/chosen": 1.3408203125, "logits/rejected": 1.466796875, "logps/chosen": -1006.0, "logps/rejected": -1236.5, "loss": 0.6893, "rewards/accuracies": 0.8125, "rewards/chosen": 0.214599609375, "rewards/margins": 2.263671875, "rewards/rejected": -2.05078125, "step": 776 }, { "epoch": 0.14681846095705986, "grad_norm": 2.0910858338360714, "learning_rate": 9.940893086252735e-07, "logits/chosen": 0.3359375, "logits/rejected": 0.98095703125, "logps/chosen": -895.5, "logps/rejected": -835.5, "loss": 0.7296, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08746337890625, "rewards/margins": 1.763671875, "rewards/rejected": -1.67578125, "step": 777 }, { "epoch": 0.14700741650526714, "grad_norm": 2.0566091130495505, "learning_rate": 9.940412631023821e-07, "logits/chosen": 0.920654296875, "logits/rejected": 1.06787109375, "logps/chosen": -1147.0, "logps/rejected": -930.0, "loss": 0.687, "rewards/accuracies": 0.8125, "rewards/chosen": 0.55224609375, "rewards/margins": 2.73828125, "rewards/rejected": -2.185546875, "step": 778 }, { "epoch": 0.1471963720534744, "grad_norm": 2.2980329680694225, "learning_rate": 9.93993024399609e-07, "logits/chosen": 0.4129638671875, "logits/rejected": 0.955322265625, "logps/chosen": -633.5, "logps/rejected": -617.0, "loss": 0.7548, "rewards/accuracies": 0.75, "rewards/chosen": 0.2958984375, "rewards/margins": 1.72265625, "rewards/rejected": -1.42578125, "step": 779 }, { "epoch": 0.14738532760168171, "grad_norm": 2.005015273507875, "learning_rate": 9.939445925379406e-07, "logits/chosen": 0.3896484375, "logits/rejected": 0.422607421875, "logps/chosen": -787.0, "logps/rejected": -833.0, "loss": 0.7183, "rewards/accuracies": 0.84375, "rewards/chosen": 0.326904296875, "rewards/margins": 2.099609375, "rewards/rejected": -1.767578125, "step": 780 }, { "epoch": 0.147574283149889, "grad_norm": 2.2118258187806266, "learning_rate": 9.938959675384466e-07, "logits/chosen": 0.5244140625, "logits/rejected": 0.370361328125, "logps/chosen": -856.5, "logps/rejected": -912.5, "loss": 0.6752, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5233154296875, "rewards/margins": 2.2060546875, "rewards/rejected": -1.6796875, "step": 781 }, { "epoch": 0.14776323869809627, "grad_norm": 2.7565406399419805, "learning_rate": 9.938471494222814e-07, "logits/chosen": 1.955078125, "logits/rejected": 1.56640625, "logps/chosen": -1780.0, "logps/rejected": -1976.0, "loss": 0.654, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0615234375, "rewards/margins": 2.57421875, "rewards/rejected": -2.5078125, "step": 782 }, { "epoch": 0.14795219424630357, "grad_norm": 2.0853254747581977, "learning_rate": 9.937981382106837e-07, "logits/chosen": 0.735595703125, "logits/rejected": 0.5604248046875, "logps/chosen": -569.5, "logps/rejected": -665.0, "loss": 0.6669, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0213623046875, "rewards/margins": 2.7109375, "rewards/rejected": -2.68359375, "step": 783 }, { "epoch": 0.14814114979451085, "grad_norm": 1.9616445417169304, "learning_rate": 9.937489339249757e-07, "logits/chosen": 1.0673828125, "logits/rejected": 0.8759765625, "logps/chosen": -651.0, "logps/rejected": -638.0, "loss": 0.6972, "rewards/accuracies": 0.875, "rewards/chosen": 0.131103515625, "rewards/margins": 1.8271484375, "rewards/rejected": -1.697265625, "step": 784 }, { "epoch": 0.14833010534271812, "grad_norm": 1.6097837474888754, "learning_rate": 9.936995365865635e-07, "logits/chosen": -0.2066650390625, "logits/rejected": 0.23193359375, "logps/chosen": -531.0, "logps/rejected": -651.0, "loss": 0.6714, "rewards/accuracies": 0.84375, "rewards/chosen": 0.184173583984375, "rewards/margins": 2.609375, "rewards/rejected": -2.4296875, "step": 785 }, { "epoch": 0.1485190608909254, "grad_norm": 1.6814872542193744, "learning_rate": 9.93649946216937e-07, "logits/chosen": 0.9267578125, "logits/rejected": 1.623046875, "logps/chosen": -573.0, "logps/rejected": -830.5, "loss": 0.6968, "rewards/accuracies": 0.84375, "rewards/chosen": 0.43890380859375, "rewards/margins": 2.1669921875, "rewards/rejected": -1.7275390625, "step": 786 }, { "epoch": 0.1487080164391327, "grad_norm": 1.8515222199486807, "learning_rate": 9.936001628376712e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.84765625, "logps/chosen": -796.5, "logps/rejected": -837.0, "loss": 0.7776, "rewards/accuracies": 0.71875, "rewards/chosen": 0.257568359375, "rewards/margins": 1.880859375, "rewards/rejected": -1.620361328125, "step": 787 }, { "epoch": 0.14889697198733998, "grad_norm": 2.224934971198902, "learning_rate": 9.93550186470424e-07, "logits/chosen": -0.1484375, "logits/rejected": 0.320556640625, "logps/chosen": -786.0, "logps/rejected": -688.0, "loss": 0.7736, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37890625, "rewards/margins": 1.2841796875, "rewards/rejected": -0.90673828125, "step": 788 }, { "epoch": 0.14908592753554725, "grad_norm": 1.7119662377005704, "learning_rate": 9.935000171369378e-07, "logits/chosen": 0.499267578125, "logits/rejected": 0.81591796875, "logps/chosen": -659.5, "logps/rejected": -756.5, "loss": 0.7794, "rewards/accuracies": 0.75, "rewards/chosen": 0.470703125, "rewards/margins": 1.505859375, "rewards/rejected": -1.0361328125, "step": 789 }, { "epoch": 0.14927488308375456, "grad_norm": 2.155169448080607, "learning_rate": 9.93449654859038e-07, "logits/chosen": 0.862548828125, "logits/rejected": 1.0986328125, "logps/chosen": -2027.0, "logps/rejected": -1150.0, "loss": 0.6887, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4844970703125, "rewards/margins": 2.21875, "rewards/rejected": -1.734375, "step": 790 }, { "epoch": 0.14946383863196183, "grad_norm": 1.806292043786328, "learning_rate": 9.933990996586356e-07, "logits/chosen": 1.2587890625, "logits/rejected": 2.1728515625, "logps/chosen": -498.0, "logps/rejected": -640.5, "loss": 0.7255, "rewards/accuracies": 0.8125, "rewards/chosen": 0.416015625, "rewards/margins": 1.7099609375, "rewards/rejected": -1.2939453125, "step": 791 }, { "epoch": 0.1496527941801691, "grad_norm": 1.772191778783199, "learning_rate": 9.933483515577242e-07, "logits/chosen": 0.1192626953125, "logits/rejected": 0.5400390625, "logps/chosen": -659.5, "logps/rejected": -835.0, "loss": 0.7623, "rewards/accuracies": 0.65625, "rewards/chosen": 0.44384765625, "rewards/margins": 1.9833984375, "rewards/rejected": -1.5400390625, "step": 792 }, { "epoch": 0.1498417497283764, "grad_norm": 1.7688139360636341, "learning_rate": 9.932974105783816e-07, "logits/chosen": 1.22314453125, "logits/rejected": 1.5302734375, "logps/chosen": -949.0, "logps/rejected": -1035.5, "loss": 0.6182, "rewards/accuracies": 0.8125, "rewards/chosen": 0.800537109375, "rewards/margins": 2.96484375, "rewards/rejected": -2.16796875, "step": 793 }, { "epoch": 0.1500307052765837, "grad_norm": 1.9757046327867245, "learning_rate": 9.9324627674277e-07, "logits/chosen": 0.4547119140625, "logits/rejected": 0.79620361328125, "logps/chosen": -716.0, "logps/rejected": -1268.0, "loss": 0.7701, "rewards/accuracies": 0.75, "rewards/chosen": 0.13372802734375, "rewards/margins": 1.94091796875, "rewards/rejected": -1.804443359375, "step": 794 }, { "epoch": 0.15021966082479096, "grad_norm": 1.971261572119007, "learning_rate": 9.931949500731347e-07, "logits/chosen": 0.57470703125, "logits/rejected": 0.075927734375, "logps/chosen": -596.0, "logps/rejected": -496.5, "loss": 0.7688, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0667724609375, "rewards/margins": 1.4150390625, "rewards/rejected": -1.345947265625, "step": 795 }, { "epoch": 0.15040861637299824, "grad_norm": 1.9863188843991244, "learning_rate": 9.931434305918059e-07, "logits/chosen": 1.720703125, "logits/rejected": 1.98828125, "logps/chosen": -978.5, "logps/rejected": -2083.5, "loss": 0.6564, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6478271484375, "rewards/margins": 3.2275390625, "rewards/rejected": -2.578125, "step": 796 }, { "epoch": 0.15059757192120554, "grad_norm": 1.624116431074634, "learning_rate": 9.930917183211965e-07, "logits/chosen": 1.84375, "logits/rejected": 1.982421875, "logps/chosen": -765.5, "logps/rejected": -3356.0, "loss": 0.7354, "rewards/accuracies": 0.75, "rewards/chosen": 0.591552734375, "rewards/margins": 26.7452392578125, "rewards/rejected": -26.150390625, "step": 797 }, { "epoch": 0.15078652746941282, "grad_norm": 1.891701134295093, "learning_rate": 9.930398132838043e-07, "logits/chosen": 0.3773193359375, "logits/rejected": 0.64404296875, "logps/chosen": -788.0, "logps/rejected": -701.0, "loss": 0.6614, "rewards/accuracies": 0.9375, "rewards/chosen": 0.466796875, "rewards/margins": 2.2109375, "rewards/rejected": -1.748046875, "step": 798 }, { "epoch": 0.1509754830176201, "grad_norm": 1.6714104192468717, "learning_rate": 9.929877155022106e-07, "logits/chosen": 1.2587890625, "logits/rejected": 1.5704345703125, "logps/chosen": -977.0, "logps/rejected": -1120.0, "loss": 0.5546, "rewards/accuracies": 0.90625, "rewards/chosen": 1.068359375, "rewards/margins": 3.458984375, "rewards/rejected": -2.388671875, "step": 799 }, { "epoch": 0.1511644385658274, "grad_norm": 1.827501350121353, "learning_rate": 9.9293542499908e-07, "logits/chosen": 1.0068359375, "logits/rejected": 1.2314453125, "logps/chosen": -570.0, "logps/rejected": -506.5, "loss": 0.6644, "rewards/accuracies": 0.875, "rewards/chosen": 0.2705078125, "rewards/margins": 1.892578125, "rewards/rejected": -1.623046875, "step": 800 }, { "epoch": 0.15135339411403467, "grad_norm": 1.9676168248560812, "learning_rate": 9.92882941797162e-07, "logits/chosen": 1.0950927734375, "logits/rejected": 0.73193359375, "logps/chosen": -1063.0, "logps/rejected": -1146.0, "loss": 0.7357, "rewards/accuracies": 0.75, "rewards/chosen": 0.88623046875, "rewards/margins": 2.66015625, "rewards/rejected": -1.775390625, "step": 801 }, { "epoch": 0.15154234966224195, "grad_norm": 1.659681405385668, "learning_rate": 9.928302659192893e-07, "logits/chosen": 1.07861328125, "logits/rejected": 1.646484375, "logps/chosen": -656.0, "logps/rejected": -886.5, "loss": 0.704, "rewards/accuracies": 0.875, "rewards/chosen": 0.53466796875, "rewards/margins": 2.060546875, "rewards/rejected": -1.5244140625, "step": 802 }, { "epoch": 0.15173130521044925, "grad_norm": 1.8524300805691893, "learning_rate": 9.927773973883783e-07, "logits/chosen": 0.69580078125, "logits/rejected": 1.478515625, "logps/chosen": -982.0, "logps/rejected": -1107.0, "loss": 0.5605, "rewards/accuracies": 0.9375, "rewards/chosen": 0.89111328125, "rewards/margins": 3.265625, "rewards/rejected": -2.376953125, "step": 803 }, { "epoch": 0.15192026075865653, "grad_norm": 2.100373679621216, "learning_rate": 9.927243362274296e-07, "logits/chosen": 0.8984375, "logits/rejected": 1.478515625, "logps/chosen": -672.0, "logps/rejected": -706.5, "loss": 0.7271, "rewards/accuracies": 0.8125, "rewards/chosen": 0.35308837890625, "rewards/margins": 1.640625, "rewards/rejected": -1.2880859375, "step": 804 }, { "epoch": 0.1521092163068638, "grad_norm": 1.996661746614586, "learning_rate": 9.926710824595272e-07, "logits/chosen": 0.220947265625, "logits/rejected": 0.666748046875, "logps/chosen": -888.5, "logps/rejected": -804.0, "loss": 0.6759, "rewards/accuracies": 0.84375, "rewards/chosen": 0.60595703125, "rewards/margins": 3.001953125, "rewards/rejected": -2.400390625, "step": 805 }, { "epoch": 0.1522981718550711, "grad_norm": 2.3072936386250595, "learning_rate": 9.926176361078394e-07, "logits/chosen": 0.708984375, "logits/rejected": 0.451416015625, "logps/chosen": -1182.5, "logps/rejected": -837.5, "loss": 0.692, "rewards/accuracies": 0.875, "rewards/chosen": 0.083984375, "rewards/margins": 2.34375, "rewards/rejected": -2.26171875, "step": 806 }, { "epoch": 0.15248712740327838, "grad_norm": 1.8047503004413177, "learning_rate": 9.925639971956176e-07, "logits/chosen": 0.67724609375, "logits/rejected": 1.123779296875, "logps/chosen": -978.0, "logps/rejected": -1691.0, "loss": 0.6462, "rewards/accuracies": 0.9375, "rewards/chosen": 0.483154296875, "rewards/margins": 3.4296875, "rewards/rejected": -2.94140625, "step": 807 }, { "epoch": 0.15267608295148566, "grad_norm": 1.7214370775685868, "learning_rate": 9.92510165746198e-07, "logits/chosen": 1.009765625, "logits/rejected": 0.914794921875, "logps/chosen": -918.75, "logps/rejected": -630.0, "loss": 0.6685, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3837890625, "rewards/margins": 2.517578125, "rewards/rejected": -2.134765625, "step": 808 }, { "epoch": 0.15286503849969293, "grad_norm": 2.174086652398026, "learning_rate": 9.924561417829994e-07, "logits/chosen": 0.275634765625, "logits/rejected": 0.06982421875, "logps/chosen": -1237.0, "logps/rejected": -1138.0, "loss": 0.6358, "rewards/accuracies": 0.875, "rewards/chosen": 0.573883056640625, "rewards/margins": 3.21875, "rewards/rejected": -2.642578125, "step": 809 }, { "epoch": 0.15305399404790024, "grad_norm": 1.9411789161594915, "learning_rate": 9.924019253295252e-07, "logits/chosen": 0.5435791015625, "logits/rejected": 1.065673828125, "logps/chosen": -589.5, "logps/rejected": -685.0, "loss": 0.7741, "rewards/accuracies": 0.75, "rewards/chosen": -0.1748046875, "rewards/margins": 1.205078125, "rewards/rejected": -1.37890625, "step": 810 }, { "epoch": 0.1532429495961075, "grad_norm": 2.4082059784096224, "learning_rate": 9.92347516409362e-07, "logits/chosen": 0.6708984375, "logits/rejected": 0.94384765625, "logps/chosen": -767.0, "logps/rejected": -822.0, "loss": 0.731, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1494140625, "rewards/margins": 2.267578125, "rewards/rejected": -2.115234375, "step": 811 }, { "epoch": 0.1534319051443148, "grad_norm": 2.0404610437447377, "learning_rate": 9.922929150461805e-07, "logits/chosen": 0.6396484375, "logits/rejected": 1.50390625, "logps/chosen": -686.0, "logps/rejected": -2452.0, "loss": 0.7239, "rewards/accuracies": 0.8125, "rewards/chosen": -0.322021484375, "rewards/margins": 3.478515625, "rewards/rejected": -3.8046875, "step": 812 }, { "epoch": 0.1536208606925221, "grad_norm": 2.533693384217985, "learning_rate": 9.92238121263735e-07, "logits/chosen": 0.3544921875, "logits/rejected": 0.647705078125, "logps/chosen": -840.0, "logps/rejected": -921.0, "loss": 0.6226, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0364990234375, "rewards/margins": 3.02734375, "rewards/rejected": -2.99609375, "step": 813 }, { "epoch": 0.15380981624072937, "grad_norm": 1.6305755489823677, "learning_rate": 9.921831350858634e-07, "logits/chosen": 0.8753662109375, "logits/rejected": 0.99609375, "logps/chosen": -864.0, "logps/rejected": -1062.5, "loss": 0.6486, "rewards/accuracies": 0.8125, "rewards/chosen": 0.412109375, "rewards/margins": 2.916015625, "rewards/rejected": -2.501953125, "step": 814 }, { "epoch": 0.15399877178893664, "grad_norm": 1.7176212147559922, "learning_rate": 9.921279565364873e-07, "logits/chosen": 1.5654296875, "logits/rejected": 2.0224609375, "logps/chosen": -794.5, "logps/rejected": -837.0, "loss": 0.6997, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5078125, "rewards/margins": 2.626953125, "rewards/rejected": -2.115234375, "step": 815 }, { "epoch": 0.15418772733714395, "grad_norm": 2.0981073907328733, "learning_rate": 9.920725856396125e-07, "logits/chosen": 0.88134765625, "logits/rejected": 1.374755859375, "logps/chosen": -834.0, "logps/rejected": -889.0, "loss": 0.7514, "rewards/accuracies": 0.75, "rewards/chosen": 0.028076171875, "rewards/margins": 2.0234375, "rewards/rejected": -1.994140625, "step": 816 }, { "epoch": 0.15437668288535122, "grad_norm": 1.9503961704356307, "learning_rate": 9.92017022419328e-07, "logits/chosen": 0.4036865234375, "logits/rejected": 0.3271484375, "logps/chosen": -702.0, "logps/rejected": -618.5, "loss": 0.7594, "rewards/accuracies": 0.75, "rewards/chosen": 0.16162109375, "rewards/margins": 1.3955078125, "rewards/rejected": -1.2333984375, "step": 817 }, { "epoch": 0.1545656384335585, "grad_norm": 1.6855382976552427, "learning_rate": 9.919612668998062e-07, "logits/chosen": 0.9306640625, "logits/rejected": 0.78277587890625, "logps/chosen": -478.75, "logps/rejected": -539.0, "loss": 0.6561, "rewards/accuracies": 0.84375, "rewards/chosen": 0.348876953125, "rewards/margins": 2.044921875, "rewards/rejected": -1.693359375, "step": 818 }, { "epoch": 0.1547545939817658, "grad_norm": 1.9432615760139482, "learning_rate": 9.919053191053037e-07, "logits/chosen": -0.34130859375, "logits/rejected": 0.03173828125, "logps/chosen": -696.25, "logps/rejected": -686.0, "loss": 0.7278, "rewards/accuracies": 0.75, "rewards/chosen": 0.20538330078125, "rewards/margins": 2.009765625, "rewards/rejected": -1.806640625, "step": 819 }, { "epoch": 0.15494354952997308, "grad_norm": 1.7708327393892236, "learning_rate": 9.918491790601607e-07, "logits/chosen": 0.5845947265625, "logits/rejected": 1.26025390625, "logps/chosen": -456.0, "logps/rejected": -504.5, "loss": 0.7552, "rewards/accuracies": 0.75, "rewards/chosen": 0.02325439453125, "rewards/margins": 1.623046875, "rewards/rejected": -1.599609375, "step": 820 }, { "epoch": 0.15513250507818035, "grad_norm": 1.9392692262231193, "learning_rate": 9.917928467888003e-07, "logits/chosen": 0.981689453125, "logits/rejected": 0.99609375, "logps/chosen": -537.0, "logps/rejected": -572.5, "loss": 0.7408, "rewards/accuracies": 0.78125, "rewards/chosen": -0.009521484375, "rewards/margins": 1.5693359375, "rewards/rejected": -1.578125, "step": 821 }, { "epoch": 0.15532146062638763, "grad_norm": 1.7952642504883287, "learning_rate": 9.917363223157307e-07, "logits/chosen": 1.3974609375, "logits/rejected": 1.599609375, "logps/chosen": -869.0, "logps/rejected": -1527.0, "loss": 0.6273, "rewards/accuracies": 0.90625, "rewards/chosen": 0.82666015625, "rewards/margins": 3.216796875, "rewards/rejected": -2.392578125, "step": 822 }, { "epoch": 0.15551041617459493, "grad_norm": 1.904399318579612, "learning_rate": 9.916796056655424e-07, "logits/chosen": 1.3203125, "logits/rejected": 1.4619140625, "logps/chosen": -688.5, "logps/rejected": -634.5, "loss": 0.686, "rewards/accuracies": 0.875, "rewards/chosen": 0.302734375, "rewards/margins": 1.939453125, "rewards/rejected": -1.640625, "step": 823 }, { "epoch": 0.1556993717228022, "grad_norm": 1.7684216931052947, "learning_rate": 9.916226968629096e-07, "logits/chosen": 1.286865234375, "logits/rejected": 1.58984375, "logps/chosen": -679.0, "logps/rejected": -719.0, "loss": 0.6211, "rewards/accuracies": 0.875, "rewards/chosen": 0.515625, "rewards/margins": 2.53515625, "rewards/rejected": -2.015625, "step": 824 }, { "epoch": 0.15588832727100949, "grad_norm": 1.7641689155864353, "learning_rate": 9.91565595932591e-07, "logits/chosen": 0.99365234375, "logits/rejected": 1.326171875, "logps/chosen": -803.0, "logps/rejected": -767.0, "loss": 0.7065, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4605865478515625, "rewards/margins": 1.998046875, "rewards/rejected": -1.5400390625, "step": 825 }, { "epoch": 0.1560772828192168, "grad_norm": 1.8780672071906417, "learning_rate": 9.915083028994283e-07, "logits/chosen": 0.257080078125, "logits/rejected": 0.29931640625, "logps/chosen": -766.5, "logps/rejected": -862.0, "loss": 0.7658, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23486328125, "rewards/margins": 2.06640625, "rewards/rejected": -1.83203125, "step": 826 }, { "epoch": 0.15626623836742407, "grad_norm": 1.8536726031456372, "learning_rate": 9.914508177883462e-07, "logits/chosen": 0.4990234375, "logits/rejected": 1.1103515625, "logps/chosen": -598.0, "logps/rejected": -796.5, "loss": 0.6334, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5546875, "rewards/margins": 2.46484375, "rewards/rejected": -1.908203125, "step": 827 }, { "epoch": 0.15645519391563134, "grad_norm": 1.933880354583341, "learning_rate": 9.913931406243544e-07, "logits/chosen": 1.689453125, "logits/rejected": 1.42578125, "logps/chosen": -674.5, "logps/rejected": -644.0, "loss": 0.6889, "rewards/accuracies": 0.78125, "rewards/chosen": 0.153564453125, "rewards/margins": 2.46484375, "rewards/rejected": -2.30859375, "step": 828 }, { "epoch": 0.15664414946383864, "grad_norm": 1.8708008794267372, "learning_rate": 9.913352714325444e-07, "logits/chosen": 1.59765625, "logits/rejected": 1.635498046875, "logps/chosen": -813.0, "logps/rejected": -638.5, "loss": 0.634, "rewards/accuracies": 0.8125, "rewards/chosen": 0.482421875, "rewards/margins": 2.646484375, "rewards/rejected": -2.1640625, "step": 829 }, { "epoch": 0.15683310501204592, "grad_norm": 2.0612574744821437, "learning_rate": 9.91277210238093e-07, "logits/chosen": 0.179931640625, "logits/rejected": 0.090179443359375, "logps/chosen": -566.0, "logps/rejected": -11907.5, "loss": 0.7117, "rewards/accuracies": 0.75, "rewards/chosen": 0.1890869140625, "rewards/margins": 94.82421875, "rewards/rejected": -94.810546875, "step": 830 }, { "epoch": 0.1570220605602532, "grad_norm": 2.029642981454956, "learning_rate": 9.912189570662593e-07, "logits/chosen": 1.927734375, "logits/rejected": 2.220703125, "logps/chosen": -909.5, "logps/rejected": -723.0, "loss": 0.7033, "rewards/accuracies": 0.75, "rewards/chosen": 0.71636962890625, "rewards/margins": 2.22265625, "rewards/rejected": -1.501953125, "step": 831 }, { "epoch": 0.15721101610846047, "grad_norm": 2.1381752718718774, "learning_rate": 9.911605119423861e-07, "logits/chosen": 1.15380859375, "logits/rejected": 0.6181640625, "logps/chosen": -650.5, "logps/rejected": -516.5, "loss": 0.7292, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23974609375, "rewards/margins": 1.6552734375, "rewards/rejected": -1.416015625, "step": 832 }, { "epoch": 0.15739997165666778, "grad_norm": 2.52081339812544, "learning_rate": 9.911018748919004e-07, "logits/chosen": 1.27734375, "logits/rejected": 1.459228515625, "logps/chosen": -896.0, "logps/rejected": -1812.0, "loss": 0.6958, "rewards/accuracies": 0.75, "rewards/chosen": 0.36865234375, "rewards/margins": 2.7421875, "rewards/rejected": -2.376953125, "step": 833 }, { "epoch": 0.15758892720487505, "grad_norm": 1.9720420582586105, "learning_rate": 9.91043045940312e-07, "logits/chosen": 0.273193359375, "logits/rejected": 0.4334716796875, "logps/chosen": -636.5, "logps/rejected": -521.0, "loss": 0.7283, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2447509765625, "rewards/margins": 1.84765625, "rewards/rejected": -1.60546875, "step": 834 }, { "epoch": 0.15777788275308233, "grad_norm": 2.059276441679853, "learning_rate": 9.90984025113214e-07, "logits/chosen": 0.546875, "logits/rejected": 0.471435546875, "logps/chosen": -985.5, "logps/rejected": -975.5, "loss": 0.6398, "rewards/accuracies": 0.84375, "rewards/chosen": 0.58270263671875, "rewards/margins": 2.583984375, "rewards/rejected": -2.0009765625, "step": 835 }, { "epoch": 0.15796683830128963, "grad_norm": 1.526026043955114, "learning_rate": 9.909248124362841e-07, "logits/chosen": 1.1767578125, "logits/rejected": 1.333740234375, "logps/chosen": -897.0, "logps/rejected": -908.5, "loss": 0.6539, "rewards/accuracies": 0.875, "rewards/chosen": 1.0244140625, "rewards/margins": 3.107421875, "rewards/rejected": -2.0830078125, "step": 836 }, { "epoch": 0.1581557938494969, "grad_norm": 1.8038087267854481, "learning_rate": 9.908654079352822e-07, "logits/chosen": 1.236572265625, "logits/rejected": 1.5302734375, "logps/chosen": -891.0, "logps/rejected": -1372.0, "loss": 0.6835, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7176513671875, "rewards/margins": 3.4296875, "rewards/rejected": -2.7060546875, "step": 837 }, { "epoch": 0.15834474939770418, "grad_norm": 2.3120595755201077, "learning_rate": 9.908058116360524e-07, "logits/chosen": -0.091064453125, "logits/rejected": 0.229248046875, "logps/chosen": -664.0, "logps/rejected": -754.5, "loss": 0.7451, "rewards/accuracies": 0.8125, "rewards/chosen": 0.073486328125, "rewards/margins": 1.466796875, "rewards/rejected": -1.39453125, "step": 838 }, { "epoch": 0.15853370494591149, "grad_norm": 2.026525452881206, "learning_rate": 9.907460235645221e-07, "logits/chosen": 0.64599609375, "logits/rejected": 1.130859375, "logps/chosen": -626.5, "logps/rejected": -664.0, "loss": 0.7037, "rewards/accuracies": 0.8125, "rewards/chosen": 0.247314453125, "rewards/margins": 2.537109375, "rewards/rejected": -2.29296875, "step": 839 }, { "epoch": 0.15872266049411876, "grad_norm": 1.8498683055333753, "learning_rate": 9.906860437467019e-07, "logits/chosen": 0.431884765625, "logits/rejected": 0.77294921875, "logps/chosen": -1051.0, "logps/rejected": -864.5, "loss": 0.6735, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8509521484375, "rewards/margins": 3.357421875, "rewards/rejected": -2.5078125, "step": 840 }, { "epoch": 0.15891161604232604, "grad_norm": 1.8249373274346705, "learning_rate": 9.906258722086862e-07, "logits/chosen": 1.25390625, "logits/rejected": 1.28564453125, "logps/chosen": -943.0, "logps/rejected": -982.0, "loss": 0.6309, "rewards/accuracies": 0.90625, "rewards/chosen": 0.91455078125, "rewards/margins": 3.09765625, "rewards/rejected": -2.1796875, "step": 841 }, { "epoch": 0.15910057159053334, "grad_norm": 1.9982101312938436, "learning_rate": 9.905655089766522e-07, "logits/chosen": 1.0830078125, "logits/rejected": 1.2578125, "logps/chosen": -723.0, "logps/rejected": -757.0, "loss": 0.6962, "rewards/accuracies": 0.90625, "rewards/chosen": 0.42401123046875, "rewards/margins": 2.1875, "rewards/rejected": -1.76171875, "step": 842 }, { "epoch": 0.15928952713874062, "grad_norm": 2.1217976353021597, "learning_rate": 9.90504954076861e-07, "logits/chosen": 0.302001953125, "logits/rejected": -0.0478515625, "logps/chosen": -812.0, "logps/rejected": -709.0, "loss": 0.7158, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1749267578125, "rewards/margins": 1.751953125, "rewards/rejected": -1.576171875, "step": 843 }, { "epoch": 0.1594784826869479, "grad_norm": 2.0517885754606198, "learning_rate": 9.904442075356575e-07, "logits/chosen": 1.0650634765625, "logits/rejected": 0.720703125, "logps/chosen": -1184.5, "logps/rejected": -1161.0, "loss": 0.6253, "rewards/accuracies": 0.84375, "rewards/chosen": 0.658935546875, "rewards/margins": 4.05078125, "rewards/rejected": -3.384765625, "step": 844 }, { "epoch": 0.15966743823515517, "grad_norm": 2.1147542870864533, "learning_rate": 9.903832693794691e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.2548828125, "logps/chosen": -1146.0, "logps/rejected": -1053.0, "loss": 0.6203, "rewards/accuracies": 0.875, "rewards/chosen": 0.60302734375, "rewards/margins": 3.18359375, "rewards/rejected": -2.57421875, "step": 845 }, { "epoch": 0.15985639378336247, "grad_norm": 1.7926236134324811, "learning_rate": 9.903221396348065e-07, "logits/chosen": 0.5416259765625, "logits/rejected": 1.020263671875, "logps/chosen": -460.5, "logps/rejected": -629.5, "loss": 0.7516, "rewards/accuracies": 0.78125, "rewards/chosen": 0.250335693359375, "rewards/margins": 1.841796875, "rewards/rejected": -1.595703125, "step": 846 }, { "epoch": 0.16004534933156975, "grad_norm": 2.221388807084187, "learning_rate": 9.902608183282647e-07, "logits/chosen": 0.2900390625, "logits/rejected": 0.44677734375, "logps/chosen": -941.0, "logps/rejected": -989.0, "loss": 0.6453, "rewards/accuracies": 0.90625, "rewards/chosen": 0.63818359375, "rewards/margins": 2.490234375, "rewards/rejected": -1.8515625, "step": 847 }, { "epoch": 0.16023430487977702, "grad_norm": 1.9863033878322203, "learning_rate": 9.90199305486521e-07, "logits/chosen": 1.400390625, "logits/rejected": 1.4765625, "logps/chosen": -1371.0, "logps/rejected": -4099.0, "loss": 0.6481, "rewards/accuracies": 0.90625, "rewards/chosen": 0.783203125, "rewards/margins": 0.76171875, "rewards/rejected": 0.013671875, "step": 848 }, { "epoch": 0.16042326042798433, "grad_norm": 2.270297413083386, "learning_rate": 9.901376011363373e-07, "logits/chosen": 1.04296875, "logits/rejected": 1.008544921875, "logps/chosen": -956.0, "logps/rejected": -933.0, "loss": 0.7091, "rewards/accuracies": 0.75, "rewards/chosen": 0.00067138671875, "rewards/margins": 2.177734375, "rewards/rejected": -2.17578125, "step": 849 }, { "epoch": 0.1606122159761916, "grad_norm": 1.8298925458417035, "learning_rate": 9.900757053045574e-07, "logits/chosen": 0.3974609375, "logits/rejected": 0.26171875, "logps/chosen": -842.0, "logps/rejected": -689.5, "loss": 0.6872, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1923828125, "rewards/margins": 2.376953125, "rewards/rejected": -2.181640625, "step": 850 }, { "epoch": 0.16080117152439888, "grad_norm": 1.6948985194039103, "learning_rate": 9.900136180181093e-07, "logits/chosen": 0.39892578125, "logits/rejected": 1.4150390625, "logps/chosen": -479.75, "logps/rejected": -582.0, "loss": 0.7453, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0078125, "rewards/margins": 1.904541015625, "rewards/rejected": -1.91455078125, "step": 851 }, { "epoch": 0.16099012707260618, "grad_norm": 2.046383097789378, "learning_rate": 9.899513393040038e-07, "logits/chosen": 2.074462890625, "logits/rejected": 1.72705078125, "logps/chosen": -699.0, "logps/rejected": -665.0, "loss": 0.6793, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4140625, "rewards/margins": 2.107421875, "rewards/rejected": -1.693359375, "step": 852 }, { "epoch": 0.16117908262081346, "grad_norm": 2.114941397815625, "learning_rate": 9.898888691893354e-07, "logits/chosen": 0.52398681640625, "logits/rejected": 0.712890625, "logps/chosen": -778.0, "logps/rejected": -1031.0, "loss": 0.629, "rewards/accuracies": 0.84375, "rewards/chosen": 0.25787353515625, "rewards/margins": 2.47265625, "rewards/rejected": -2.212890625, "step": 853 }, { "epoch": 0.16136803816902073, "grad_norm": 2.2916731316627783, "learning_rate": 9.898262077012816e-07, "logits/chosen": 1.861328125, "logits/rejected": 2.51318359375, "logps/chosen": -720.5, "logps/rejected": -1026.0, "loss": 0.5699, "rewards/accuracies": 0.9375, "rewards/chosen": 0.34332275390625, "rewards/margins": 3.8515625, "rewards/rejected": -3.5, "step": 854 }, { "epoch": 0.161556993717228, "grad_norm": 1.9603671827987232, "learning_rate": 9.897633548671035e-07, "logits/chosen": 1.02880859375, "logits/rejected": 1.370849609375, "logps/chosen": -613.5, "logps/rejected": -675.0, "loss": 0.7767, "rewards/accuracies": 0.75, "rewards/chosen": 0.15185546875, "rewards/margins": 2.689453125, "rewards/rejected": -2.5380859375, "step": 855 }, { "epoch": 0.1617459492654353, "grad_norm": 2.30447945723107, "learning_rate": 9.89700310714145e-07, "logits/chosen": 1.544921875, "logits/rejected": 0.81103515625, "logps/chosen": -753.5, "logps/rejected": -554.0, "loss": 0.7937, "rewards/accuracies": 0.71875, "rewards/chosen": 0.40447998046875, "rewards/margins": 1.236328125, "rewards/rejected": -0.83251953125, "step": 856 }, { "epoch": 0.1619349048136426, "grad_norm": 2.111784739934014, "learning_rate": 9.896370752698333e-07, "logits/chosen": 1.48046875, "logits/rejected": 1.822265625, "logps/chosen": -967.0, "logps/rejected": -1899.0, "loss": 0.744, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3818359375, "rewards/margins": 3.032470703125, "rewards/rejected": -2.65234375, "step": 857 }, { "epoch": 0.16212386036184986, "grad_norm": 2.1480756820356683, "learning_rate": 9.89573648561679e-07, "logits/chosen": 0.790679931640625, "logits/rejected": 0.53662109375, "logps/chosen": -678.0, "logps/rejected": -552.0, "loss": 0.7513, "rewards/accuracies": 0.78125, "rewards/chosen": 0.19970703125, "rewards/margins": 2.083984375, "rewards/rejected": -1.880859375, "step": 858 }, { "epoch": 0.16231281591005717, "grad_norm": 1.6479606824920923, "learning_rate": 9.895100306172759e-07, "logits/chosen": 0.80419921875, "logits/rejected": 1.4404296875, "logps/chosen": -753.5, "logps/rejected": -782.0, "loss": 0.711, "rewards/accuracies": 0.71875, "rewards/chosen": 0.496337890625, "rewards/margins": 2.1005859375, "rewards/rejected": -1.603515625, "step": 859 }, { "epoch": 0.16250177145826444, "grad_norm": 1.6142518685071092, "learning_rate": 9.89446221464301e-07, "logits/chosen": 1.75390625, "logits/rejected": 1.6015625, "logps/chosen": -592.0, "logps/rejected": -560.5, "loss": 0.6699, "rewards/accuracies": 0.90625, "rewards/chosen": 0.306640625, "rewards/margins": 1.951171875, "rewards/rejected": -1.646484375, "step": 860 }, { "epoch": 0.16269072700647172, "grad_norm": 2.0648960592664145, "learning_rate": 9.893822211305144e-07, "logits/chosen": 1.228515625, "logits/rejected": 0.9013671875, "logps/chosen": -753.5, "logps/rejected": -745.0, "loss": 0.6441, "rewards/accuracies": 0.875, "rewards/chosen": 0.29998779296875, "rewards/margins": 2.5078125, "rewards/rejected": -2.208984375, "step": 861 }, { "epoch": 0.16287968255467902, "grad_norm": 1.591420844269556, "learning_rate": 9.893180296437593e-07, "logits/chosen": 1.66796875, "logits/rejected": 1.54638671875, "logps/chosen": -981.75, "logps/rejected": -1437.0, "loss": 0.6708, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6414794921875, "rewards/margins": 3.029296875, "rewards/rejected": -2.3837890625, "step": 862 }, { "epoch": 0.1630686381028863, "grad_norm": 1.9568426065300886, "learning_rate": 9.892536470319622e-07, "logits/chosen": 0.89453125, "logits/rejected": 1.009765625, "logps/chosen": -689.0, "logps/rejected": -553.5, "loss": 0.7196, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25048828125, "rewards/margins": 1.623046875, "rewards/rejected": -1.37109375, "step": 863 }, { "epoch": 0.16325759365109357, "grad_norm": 2.0848273550490872, "learning_rate": 9.89189073323133e-07, "logits/chosen": 0.58331298828125, "logits/rejected": 1.646484375, "logps/chosen": -643.0, "logps/rejected": -1463.0, "loss": 0.6327, "rewards/accuracies": 0.875, "rewards/chosen": 0.56085205078125, "rewards/margins": 3.00390625, "rewards/rejected": -2.44140625, "step": 864 }, { "epoch": 0.16344654919930088, "grad_norm": 1.7538334716273714, "learning_rate": 9.891243085453641e-07, "logits/chosen": 1.29913330078125, "logits/rejected": 1.1640625, "logps/chosen": -793.0, "logps/rejected": -946.0, "loss": 0.7026, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4774169921875, "rewards/margins": 2.271484375, "rewards/rejected": -1.7890625, "step": 865 }, { "epoch": 0.16363550474750815, "grad_norm": 1.7781843286306935, "learning_rate": 9.890593527268317e-07, "logits/chosen": 1.9013671875, "logits/rejected": 2.05859375, "logps/chosen": -653.5, "logps/rejected": -840.0, "loss": 0.6456, "rewards/accuracies": 0.8125, "rewards/chosen": 0.607421875, "rewards/margins": 2.0472412109375, "rewards/rejected": -1.435546875, "step": 866 }, { "epoch": 0.16382446029571543, "grad_norm": 1.922803827605944, "learning_rate": 9.889942058957943e-07, "logits/chosen": 0.60791015625, "logits/rejected": 0.689453125, "logps/chosen": -755.0, "logps/rejected": -817.0, "loss": 0.6387, "rewards/accuracies": 0.8125, "rewards/chosen": 0.58056640625, "rewards/margins": 2.234375, "rewards/rejected": -1.658203125, "step": 867 }, { "epoch": 0.1640134158439227, "grad_norm": 1.8002165369051606, "learning_rate": 9.889288680805944e-07, "logits/chosen": 0.940185546875, "logits/rejected": 0.980712890625, "logps/chosen": -878.0, "logps/rejected": -1046.5, "loss": 0.6024, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6446533203125, "rewards/margins": 3.17578125, "rewards/rejected": -2.537109375, "step": 868 }, { "epoch": 0.16420237139213, "grad_norm": 2.438867149106808, "learning_rate": 9.88863339309657e-07, "logits/chosen": 1.460205078125, "logits/rejected": 1.181640625, "logps/chosen": -1448.0, "logps/rejected": -993.0, "loss": 0.6912, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15777587890625, "rewards/margins": 2.32421875, "rewards/rejected": -2.169921875, "step": 869 }, { "epoch": 0.16439132694033728, "grad_norm": 2.2792351402511795, "learning_rate": 9.887976196114906e-07, "logits/chosen": 2.271484375, "logits/rejected": 2.275390625, "logps/chosen": -694.0, "logps/rejected": -662.0, "loss": 0.7001, "rewards/accuracies": 0.8125, "rewards/chosen": 0.35052490234375, "rewards/margins": 2.5078125, "rewards/rejected": -2.15625, "step": 870 }, { "epoch": 0.16458028248854456, "grad_norm": 2.3000866471742407, "learning_rate": 9.887317090146864e-07, "logits/chosen": 0.896484375, "logits/rejected": 1.51171875, "logps/chosen": -1020.0, "logps/rejected": -19408.0, "loss": 0.7375, "rewards/accuracies": 0.78125, "rewards/chosen": 0.263427734375, "rewards/margins": 97.6953125, "rewards/rejected": -97.859375, "step": 871 }, { "epoch": 0.16476923803675186, "grad_norm": 1.6012119081597662, "learning_rate": 9.886656075479183e-07, "logits/chosen": 1.560546875, "logits/rejected": 1.8828125, "logps/chosen": -626.0, "logps/rejected": -1001.5, "loss": 0.6414, "rewards/accuracies": 0.78125, "rewards/chosen": 0.18701171875, "rewards/margins": 2.826171875, "rewards/rejected": -2.6328125, "step": 872 }, { "epoch": 0.16495819358495914, "grad_norm": 2.5672971648067393, "learning_rate": 9.885993152399444e-07, "logits/chosen": 0.51416015625, "logits/rejected": 0.978759765625, "logps/chosen": -683.5, "logps/rejected": -628.0, "loss": 0.708, "rewards/accuracies": 0.78125, "rewards/chosen": 0.05078125, "rewards/margins": 2.2421875, "rewards/rejected": -2.1953125, "step": 873 }, { "epoch": 0.16514714913316642, "grad_norm": 2.288599535576642, "learning_rate": 9.885328321196047e-07, "logits/chosen": 1.0458984375, "logits/rejected": 0.9158935546875, "logps/chosen": -1063.5, "logps/rejected": -733.5, "loss": 0.7331, "rewards/accuracies": 0.6875, "rewards/chosen": 0.402099609375, "rewards/margins": 2.0966796875, "rewards/rejected": -1.69384765625, "step": 874 }, { "epoch": 0.16533610468137372, "grad_norm": 1.8549982144906227, "learning_rate": 9.884661582158227e-07, "logits/chosen": 0.72314453125, "logits/rejected": 0.81884765625, "logps/chosen": -798.0, "logps/rejected": -803.0, "loss": 0.7408, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47052001953125, "rewards/margins": 2.23046875, "rewards/rejected": -1.767578125, "step": 875 }, { "epoch": 0.165525060229581, "grad_norm": 2.132434451874567, "learning_rate": 9.88399293557605e-07, "logits/chosen": 0.31787109375, "logits/rejected": 0.72021484375, "logps/chosen": -553.0, "logps/rejected": -852.5, "loss": 0.726, "rewards/accuracies": 0.75, "rewards/chosen": 0.267578125, "rewards/margins": 1.9619140625, "rewards/rejected": -1.695068359375, "step": 876 }, { "epoch": 0.16571401577778827, "grad_norm": 2.0078762056503487, "learning_rate": 9.883322381740409e-07, "logits/chosen": 2.109375, "logits/rejected": 3.30078125, "logps/chosen": -632.0, "logps/rejected": -1606.0, "loss": 0.7481, "rewards/accuracies": 0.75, "rewards/chosen": 0.26171875, "rewards/margins": 2.94921875, "rewards/rejected": -2.6845703125, "step": 877 }, { "epoch": 0.16590297132599555, "grad_norm": 1.8648611531599502, "learning_rate": 9.882649920943027e-07, "logits/chosen": 1.33984375, "logits/rejected": 1.56640625, "logps/chosen": -931.0, "logps/rejected": -836.0, "loss": 0.657, "rewards/accuracies": 0.75, "rewards/chosen": 0.5103759765625, "rewards/margins": 2.68359375, "rewards/rejected": -2.169921875, "step": 878 }, { "epoch": 0.16609192687420285, "grad_norm": 1.8367793573812015, "learning_rate": 9.88197555347646e-07, "logits/chosen": 1.33935546875, "logits/rejected": 1.271484375, "logps/chosen": -624.0, "logps/rejected": -504.0, "loss": 0.6579, "rewards/accuracies": 0.78125, "rewards/chosen": 1.064453125, "rewards/margins": 2.048828125, "rewards/rejected": -0.98583984375, "step": 879 }, { "epoch": 0.16628088242241013, "grad_norm": 2.2389737022504055, "learning_rate": 9.881299279634093e-07, "logits/chosen": 0.0986328125, "logits/rejected": 0.2353515625, "logps/chosen": -893.5, "logps/rejected": -744.0, "loss": 0.6865, "rewards/accuracies": 0.875, "rewards/chosen": 0.520751953125, "rewards/margins": 2.029296875, "rewards/rejected": -1.505859375, "step": 880 }, { "epoch": 0.1664698379706174, "grad_norm": 2.179180388113368, "learning_rate": 9.880621099710132e-07, "logits/chosen": 0.87890625, "logits/rejected": 1.25732421875, "logps/chosen": -590.0, "logps/rejected": -585.0, "loss": 0.7498, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7236328125, "rewards/margins": 1.798828125, "rewards/rejected": -1.076171875, "step": 881 }, { "epoch": 0.1666587935188247, "grad_norm": 2.1434675433904387, "learning_rate": 9.879941013999624e-07, "logits/chosen": 1.27587890625, "logits/rejected": 0.87109375, "logps/chosen": -677.0, "logps/rejected": -616.5, "loss": 0.5963, "rewards/accuracies": 0.90625, "rewards/chosen": 0.809326171875, "rewards/margins": 2.61328125, "rewards/rejected": -1.80078125, "step": 882 }, { "epoch": 0.16684774906703198, "grad_norm": 1.9041854104541167, "learning_rate": 9.879259022798438e-07, "logits/chosen": 0.9140625, "logits/rejected": 1.3421630859375, "logps/chosen": -948.0, "logps/rejected": -1044.0, "loss": 0.6663, "rewards/accuracies": 0.84375, "rewards/chosen": 0.69384765625, "rewards/margins": 2.9072265625, "rewards/rejected": -2.2119140625, "step": 883 }, { "epoch": 0.16703670461523926, "grad_norm": 1.9791295828720026, "learning_rate": 9.878575126403274e-07, "logits/chosen": 0.6470947265625, "logits/rejected": 0.6669921875, "logps/chosen": -630.5, "logps/rejected": -672.0, "loss": 0.7391, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5150146484375, "rewards/margins": 1.9306640625, "rewards/rejected": -1.4130859375, "step": 884 }, { "epoch": 0.16722566016344656, "grad_norm": 2.0291725385713266, "learning_rate": 9.877889325111661e-07, "logits/chosen": 1.2294921875, "logits/rejected": 1.41845703125, "logps/chosen": -873.5, "logps/rejected": -930.0, "loss": 0.5658, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0263671875, "rewards/margins": 3.10546875, "rewards/rejected": -2.07421875, "step": 885 }, { "epoch": 0.16741461571165384, "grad_norm": 1.8993616759209098, "learning_rate": 9.877201619221957e-07, "logits/chosen": 1.6923828125, "logits/rejected": 1.3046875, "logps/chosen": -700.0, "logps/rejected": -812.5, "loss": 0.6021, "rewards/accuracies": 0.875, "rewards/chosen": 0.470703125, "rewards/margins": 2.755859375, "rewards/rejected": -2.28515625, "step": 886 }, { "epoch": 0.1676035712598611, "grad_norm": 2.26224778266052, "learning_rate": 9.876512009033347e-07, "logits/chosen": 1.44921875, "logits/rejected": 1.2158203125, "logps/chosen": -614.0, "logps/rejected": -599.0, "loss": 0.7336, "rewards/accuracies": 0.78125, "rewards/chosen": 0.380126953125, "rewards/margins": 1.4501953125, "rewards/rejected": -1.0712890625, "step": 887 }, { "epoch": 0.16779252680806842, "grad_norm": 1.9296165687892206, "learning_rate": 9.875820494845847e-07, "logits/chosen": 1.3115234375, "logits/rejected": 2.1650390625, "logps/chosen": -585.0, "logps/rejected": -1120.0, "loss": 0.7733, "rewards/accuracies": 0.8125, "rewards/chosen": 0.418701171875, "rewards/margins": 1.19140625, "rewards/rejected": -0.77099609375, "step": 888 }, { "epoch": 0.1679814823562757, "grad_norm": 2.2610804370136726, "learning_rate": 9.875127076960297e-07, "logits/chosen": 0.877685546875, "logits/rejected": 1.537109375, "logps/chosen": -460.75, "logps/rejected": -340.5, "loss": 0.7846, "rewards/accuracies": 0.75, "rewards/chosen": 0.36920166015625, "rewards/margins": 1.173828125, "rewards/rejected": -0.80255126953125, "step": 889 }, { "epoch": 0.16817043790448297, "grad_norm": 1.957478592254571, "learning_rate": 9.87443175567837e-07, "logits/chosen": 2.0703125, "logits/rejected": 1.6064453125, "logps/chosen": -694.5, "logps/rejected": -604.25, "loss": 0.6264, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8173828125, "rewards/margins": 2.43359375, "rewards/rejected": -1.6162109375, "step": 890 }, { "epoch": 0.16835939345269024, "grad_norm": 1.906445907822118, "learning_rate": 9.873734531302565e-07, "logits/chosen": 0.31494140625, "logits/rejected": 1.28955078125, "logps/chosen": -541.0, "logps/rejected": -944.5, "loss": 0.7401, "rewards/accuracies": 0.78125, "rewards/chosen": 0.317626953125, "rewards/margins": 3.0546875, "rewards/rejected": -2.73046875, "step": 891 }, { "epoch": 0.16854834900089755, "grad_norm": 1.7329550232732247, "learning_rate": 9.87303540413621e-07, "logits/chosen": 0.620361328125, "logits/rejected": 0.75677490234375, "logps/chosen": -887.0, "logps/rejected": -913.0, "loss": 0.606, "rewards/accuracies": 0.9375, "rewards/chosen": 0.65380859375, "rewards/margins": 3.27734375, "rewards/rejected": -2.623046875, "step": 892 }, { "epoch": 0.16873730454910482, "grad_norm": 2.173198122643494, "learning_rate": 9.872334374483457e-07, "logits/chosen": 1.117919921875, "logits/rejected": 1.0859375, "logps/chosen": -893.0, "logps/rejected": -955.0, "loss": 0.7211, "rewards/accuracies": 0.8125, "rewards/chosen": 0.287109375, "rewards/margins": 1.955078125, "rewards/rejected": -1.669921875, "step": 893 }, { "epoch": 0.1689262600973121, "grad_norm": 2.0417958522423842, "learning_rate": 9.87163144264929e-07, "logits/chosen": 1.1962890625, "logits/rejected": 1.4521484375, "logps/chosen": -872.5, "logps/rejected": -1469.0, "loss": 0.6958, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0283203125, "rewards/margins": 2.8671875, "rewards/rejected": -1.83984375, "step": 894 }, { "epoch": 0.1691152156455194, "grad_norm": 1.8538095742873566, "learning_rate": 9.870926608939521e-07, "logits/chosen": 0.74560546875, "logits/rejected": 1.058837890625, "logps/chosen": -482.5, "logps/rejected": -603.0, "loss": 0.6898, "rewards/accuracies": 0.84375, "rewards/chosen": 0.08544921875, "rewards/margins": 2.337890625, "rewards/rejected": -2.25, "step": 895 }, { "epoch": 0.16930417119372668, "grad_norm": 1.6239292753715886, "learning_rate": 9.870219873660786e-07, "logits/chosen": 1.1875, "logits/rejected": 1.6533203125, "logps/chosen": -620.5, "logps/rejected": -675.5, "loss": 0.6786, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3726806640625, "rewards/margins": 2.228515625, "rewards/rejected": -1.857421875, "step": 896 }, { "epoch": 0.16949312674193395, "grad_norm": 1.9381439772493867, "learning_rate": 9.869511237120547e-07, "logits/chosen": 1.355712890625, "logits/rejected": 1.9912109375, "logps/chosen": -1010.0, "logps/rejected": -986.0, "loss": 0.7145, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5692138671875, "rewards/margins": 2.828125, "rewards/rejected": -2.257080078125, "step": 897 }, { "epoch": 0.16968208229014126, "grad_norm": 1.9410814244023258, "learning_rate": 9.868800699627102e-07, "logits/chosen": 0.332275390625, "logits/rejected": 0.44873046875, "logps/chosen": -726.5, "logps/rejected": -610.5, "loss": 0.6617, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2333221435546875, "rewards/margins": 2.236328125, "rewards/rejected": -1.99609375, "step": 898 }, { "epoch": 0.16987103783834853, "grad_norm": 1.6398750270078621, "learning_rate": 9.868088261489565e-07, "logits/chosen": 0.54052734375, "logits/rejected": 0.891357421875, "logps/chosen": -548.5, "logps/rejected": -1422.0, "loss": 0.728, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07098388671875, "rewards/margins": 2.666015625, "rewards/rejected": -2.5966796875, "step": 899 }, { "epoch": 0.1700599933865558, "grad_norm": 1.6502818364363077, "learning_rate": 9.86737392301788e-07, "logits/chosen": 1.0048828125, "logits/rejected": 0.6474609375, "logps/chosen": -753.5, "logps/rejected": -785.0, "loss": 0.6662, "rewards/accuracies": 0.84375, "rewards/chosen": 0.54931640625, "rewards/margins": 2.5380859375, "rewards/rejected": -1.990234375, "step": 900 }, { "epoch": 0.17024894893476308, "grad_norm": 2.8894590472859454, "learning_rate": 9.866657684522827e-07, "logits/chosen": 1.2041015625, "logits/rejected": 0.961669921875, "logps/chosen": -1798.0, "logps/rejected": -1352.0, "loss": 0.553, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9208984375, "rewards/margins": 4.18359375, "rewards/rejected": -3.263671875, "step": 901 }, { "epoch": 0.1704379044829704, "grad_norm": 2.0605312173356487, "learning_rate": 9.865939546315997e-07, "logits/chosen": 0.76806640625, "logits/rejected": 0.9736328125, "logps/chosen": -684.0, "logps/rejected": -1034.0, "loss": 0.7222, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03271484375, "rewards/margins": 2.294921875, "rewards/rejected": -2.26171875, "step": 902 }, { "epoch": 0.17062686003117766, "grad_norm": 2.0682526746597474, "learning_rate": 9.865219508709824e-07, "logits/chosen": 1.16168212890625, "logits/rejected": 1.2860107421875, "logps/chosen": -762.0, "logps/rejected": -832.5, "loss": 0.7144, "rewards/accuracies": 0.875, "rewards/chosen": 0.406494140625, "rewards/margins": 2.060546875, "rewards/rejected": -1.6494140625, "step": 903 }, { "epoch": 0.17081581557938494, "grad_norm": 1.7332587292133597, "learning_rate": 9.864497572017552e-07, "logits/chosen": 1.3916015625, "logits/rejected": 1.29150390625, "logps/chosen": -718.0, "logps/rejected": -720.5, "loss": 0.662, "rewards/accuracies": 0.84375, "rewards/chosen": 0.631103515625, "rewards/margins": 2.0283203125, "rewards/rejected": -1.4013671875, "step": 904 }, { "epoch": 0.17100477112759224, "grad_norm": 1.7391794197230255, "learning_rate": 9.863773736553263e-07, "logits/chosen": 2.076171875, "logits/rejected": 2.001953125, "logps/chosen": -909.0, "logps/rejected": -883.0, "loss": 0.5891, "rewards/accuracies": 0.90625, "rewards/chosen": 1.15087890625, "rewards/margins": 2.96875, "rewards/rejected": -1.814453125, "step": 905 }, { "epoch": 0.17119372667579952, "grad_norm": 1.608851666292957, "learning_rate": 9.86304800263186e-07, "logits/chosen": 1.7919921875, "logits/rejected": 2.25244140625, "logps/chosen": -688.5, "logps/rejected": -675.0, "loss": 0.6805, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66015625, "rewards/margins": 2.775390625, "rewards/rejected": -2.109375, "step": 906 }, { "epoch": 0.1713826822240068, "grad_norm": 2.2801713756163693, "learning_rate": 9.862320370569075e-07, "logits/chosen": 0.377685546875, "logits/rejected": 1.1640625, "logps/chosen": -781.0, "logps/rejected": -2119.0, "loss": 0.6306, "rewards/accuracies": 0.875, "rewards/chosen": 0.330902099609375, "rewards/margins": 4.470703125, "rewards/rejected": -4.142578125, "step": 907 }, { "epoch": 0.1715716377722141, "grad_norm": 2.721992704492997, "learning_rate": 9.861590840681463e-07, "logits/chosen": 0.696533203125, "logits/rejected": 0.6025390625, "logps/chosen": -924.0, "logps/rejected": -19725.0, "loss": 0.797, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3460693359375, "rewards/margins": 110.8330078125, "rewards/rejected": -110.287109375, "step": 908 }, { "epoch": 0.17176059332042137, "grad_norm": 1.7136127080432189, "learning_rate": 9.860859413286402e-07, "logits/chosen": 0.88482666015625, "logits/rejected": 1.02685546875, "logps/chosen": -913.5, "logps/rejected": -814.0, "loss": 0.7457, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6585693359375, "rewards/margins": 2.1416015625, "rewards/rejected": -1.4873046875, "step": 909 }, { "epoch": 0.17194954886862865, "grad_norm": 2.005596625834556, "learning_rate": 9.860126088702105e-07, "logits/chosen": 1.966552734375, "logits/rejected": 1.644775390625, "logps/chosen": -838.0, "logps/rejected": -890.0, "loss": 0.7759, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1640625, "rewards/margins": 2.05078125, "rewards/rejected": -1.884765625, "step": 910 }, { "epoch": 0.17213850441683595, "grad_norm": 1.7468695050576875, "learning_rate": 9.859390867247602e-07, "logits/chosen": 1.296875, "logits/rejected": 1.5380859375, "logps/chosen": -1051.5, "logps/rejected": -1680.5, "loss": 0.7647, "rewards/accuracies": 0.625, "rewards/chosen": 0.611572265625, "rewards/margins": 2.8349609375, "rewards/rejected": -2.222900390625, "step": 911 }, { "epoch": 0.17232745996504323, "grad_norm": 2.24432907437477, "learning_rate": 9.858653749242749e-07, "logits/chosen": 1.74267578125, "logits/rejected": 1.53173828125, "logps/chosen": -764.0, "logps/rejected": -538.5, "loss": 0.8243, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0670318603515625, "rewards/margins": 1.3447265625, "rewards/rejected": -1.412109375, "step": 912 }, { "epoch": 0.1725164155132505, "grad_norm": 1.8513881789857398, "learning_rate": 9.857914735008229e-07, "logits/chosen": 0.951171875, "logits/rejected": 1.232421875, "logps/chosen": -558.0, "logps/rejected": -838.5, "loss": 0.7709, "rewards/accuracies": 0.75, "rewards/chosen": 0.156494140625, "rewards/margins": 1.8125, "rewards/rejected": -1.65625, "step": 913 }, { "epoch": 0.17270537106145778, "grad_norm": 2.2705997635977058, "learning_rate": 9.85717382486555e-07, "logits/chosen": 0.5574951171875, "logits/rejected": 0.869140625, "logps/chosen": -607.5, "logps/rejected": -558.0, "loss": 0.7003, "rewards/accuracies": 0.8125, "rewards/chosen": 0.15643310546875, "rewards/margins": 2.1953125, "rewards/rejected": -2.041015625, "step": 914 }, { "epoch": 0.17289432660966508, "grad_norm": 1.8257580302042595, "learning_rate": 9.856431019137047e-07, "logits/chosen": 1.15283203125, "logits/rejected": 1.17919921875, "logps/chosen": -849.0, "logps/rejected": -946.0, "loss": 0.6345, "rewards/accuracies": 0.875, "rewards/chosen": 0.638671875, "rewards/margins": 2.81640625, "rewards/rejected": -2.17578125, "step": 915 }, { "epoch": 0.17308328215787236, "grad_norm": 1.8037298852208024, "learning_rate": 9.855686318145875e-07, "logits/chosen": 0.98828125, "logits/rejected": 1.14453125, "logps/chosen": -374.5, "logps/rejected": -653.0, "loss": 0.7838, "rewards/accuracies": 0.875, "rewards/chosen": 0.45654296875, "rewards/margins": 1.3994140625, "rewards/rejected": -0.9404296875, "step": 916 }, { "epoch": 0.17327223770607963, "grad_norm": 1.861126930240531, "learning_rate": 9.85493972221602e-07, "logits/chosen": 1.0361328125, "logits/rejected": 0.98504638671875, "logps/chosen": -648.0, "logps/rejected": -828.0, "loss": 0.5538, "rewards/accuracies": 0.90625, "rewards/chosen": 0.49853515625, "rewards/margins": 3.3359375, "rewards/rejected": -2.84375, "step": 917 }, { "epoch": 0.17346119325428694, "grad_norm": 1.8751356993886772, "learning_rate": 9.854191231672277e-07, "logits/chosen": 1.8525390625, "logits/rejected": 1.8828125, "logps/chosen": -720.0, "logps/rejected": -909.0, "loss": 0.5905, "rewards/accuracies": 0.90625, "rewards/chosen": 0.474609375, "rewards/margins": 2.97265625, "rewards/rejected": -2.50390625, "step": 918 }, { "epoch": 0.17365014880249421, "grad_norm": 1.6667139426955808, "learning_rate": 9.853440846840288e-07, "logits/chosen": 2.021484375, "logits/rejected": 2.314453125, "logps/chosen": -1153.0, "logps/rejected": -1524.0, "loss": 0.5954, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3994140625, "rewards/margins": 8.76171875, "rewards/rejected": -7.369140625, "step": 919 }, { "epoch": 0.1738391043507015, "grad_norm": 1.520010842546849, "learning_rate": 9.852688568046502e-07, "logits/chosen": 1.009765625, "logits/rejected": 2.18359375, "logps/chosen": -830.5, "logps/rejected": -1059.5, "loss": 0.7036, "rewards/accuracies": 0.65625, "rewards/chosen": 0.969818115234375, "rewards/margins": 3.583984375, "rewards/rejected": -2.61328125, "step": 920 }, { "epoch": 0.1740280598989088, "grad_norm": 1.8128787901115353, "learning_rate": 9.851934395618197e-07, "logits/chosen": 1.41650390625, "logits/rejected": 1.130859375, "logps/chosen": -688.0, "logps/rejected": -1293.5, "loss": 0.6374, "rewards/accuracies": 0.9375, "rewards/chosen": 0.36083984375, "rewards/margins": 3.232421875, "rewards/rejected": -2.86328125, "step": 921 }, { "epoch": 0.17421701544711607, "grad_norm": 1.600480671643121, "learning_rate": 9.851178329883479e-07, "logits/chosen": 1.1201171875, "logits/rejected": 1.513671875, "logps/chosen": -520.0, "logps/rejected": -641.0, "loss": 0.6884, "rewards/accuracies": 0.875, "rewards/chosen": 0.034912109375, "rewards/margins": 2.15625, "rewards/rejected": -2.126953125, "step": 922 }, { "epoch": 0.17440597099532335, "grad_norm": 1.7626275743348836, "learning_rate": 9.850420371171269e-07, "logits/chosen": 1.5050048828125, "logits/rejected": 1.61865234375, "logps/chosen": -618.75, "logps/rejected": -783.75, "loss": 0.6922, "rewards/accuracies": 0.8125, "rewards/chosen": 0.474761962890625, "rewards/margins": 2.396484375, "rewards/rejected": -1.921875, "step": 923 }, { "epoch": 0.17459492654353062, "grad_norm": 2.0795696961051786, "learning_rate": 9.849660519811321e-07, "logits/chosen": 1.3828125, "logits/rejected": 1.7900390625, "logps/chosen": -1103.0, "logps/rejected": -862.0, "loss": 0.7367, "rewards/accuracies": 0.75, "rewards/chosen": -0.0316162109375, "rewards/margins": 1.626953125, "rewards/rejected": -1.6572265625, "step": 924 }, { "epoch": 0.17478388209173792, "grad_norm": 2.1681457322765296, "learning_rate": 9.848898776134205e-07, "logits/chosen": 0.9810791015625, "logits/rejected": 1.6396484375, "logps/chosen": -778.0, "logps/rejected": -862.0, "loss": 0.8185, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23968505859375, "rewards/margins": 1.6201171875, "rewards/rejected": -1.3798828125, "step": 925 }, { "epoch": 0.1749728376399452, "grad_norm": 1.7933239088886428, "learning_rate": 9.848135140471319e-07, "logits/chosen": 1.3876953125, "logits/rejected": 0.8369140625, "logps/chosen": -860.0, "logps/rejected": -547.5, "loss": 0.5891, "rewards/accuracies": 0.90625, "rewards/chosen": 0.50732421875, "rewards/margins": 2.7734375, "rewards/rejected": -2.263671875, "step": 926 }, { "epoch": 0.17516179318815248, "grad_norm": 1.7652562246384804, "learning_rate": 9.84736961315488e-07, "logits/chosen": 1.515625, "logits/rejected": 1.333740234375, "logps/chosen": -510.0, "logps/rejected": -679.25, "loss": 0.773, "rewards/accuracies": 0.78125, "rewards/chosen": 0.457427978515625, "rewards/margins": 1.783203125, "rewards/rejected": -1.326171875, "step": 927 }, { "epoch": 0.17535074873635978, "grad_norm": 2.202712757799589, "learning_rate": 9.846602194517933e-07, "logits/chosen": 2.76953125, "logits/rejected": 2.9375, "logps/chosen": -786.0, "logps/rejected": -552.5, "loss": 0.5992, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5224609375, "rewards/margins": 2.734375, "rewards/rejected": -2.2109375, "step": 928 }, { "epoch": 0.17553970428456706, "grad_norm": 2.0691648916619054, "learning_rate": 9.84583288489434e-07, "logits/chosen": 1.2421875, "logits/rejected": 1.177490234375, "logps/chosen": -760.0, "logps/rejected": -841.5, "loss": 0.7244, "rewards/accuracies": 0.75, "rewards/chosen": 0.582275390625, "rewards/margins": 2.11328125, "rewards/rejected": -1.533203125, "step": 929 }, { "epoch": 0.17572865983277433, "grad_norm": 3.0326487336244616, "learning_rate": 9.845061684618792e-07, "logits/chosen": 1.5838623046875, "logits/rejected": 1.786956787109375, "logps/chosen": -816.0, "logps/rejected": -907.0, "loss": 0.7277, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2275390625, "rewards/margins": 2.2578125, "rewards/rejected": -2.02734375, "step": 930 }, { "epoch": 0.17591761538098163, "grad_norm": 1.751725645148846, "learning_rate": 9.844288594026797e-07, "logits/chosen": 1.0859375, "logits/rejected": 0.93408203125, "logps/chosen": -493.5, "logps/rejected": -635.5, "loss": 0.6915, "rewards/accuracies": 0.78125, "rewards/chosen": 0.133880615234375, "rewards/margins": 2.126953125, "rewards/rejected": -1.99609375, "step": 931 }, { "epoch": 0.1761065709291889, "grad_norm": 1.7661440619592748, "learning_rate": 9.843513613454686e-07, "logits/chosen": 0.4605712890625, "logits/rejected": 0.569580078125, "logps/chosen": -751.5, "logps/rejected": -695.5, "loss": 0.6603, "rewards/accuracies": 0.875, "rewards/chosen": 0.26708984375, "rewards/margins": 2.759765625, "rewards/rejected": -2.498046875, "step": 932 }, { "epoch": 0.17629552647739619, "grad_norm": 1.9285581749662108, "learning_rate": 9.84273674323962e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.3583984375, "logps/chosen": -629.0, "logps/rejected": -590.0, "loss": 0.7073, "rewards/accuracies": 0.75, "rewards/chosen": 0.0748291015625, "rewards/margins": 2.0869140625, "rewards/rejected": -2.013671875, "step": 933 }, { "epoch": 0.1764844820256035, "grad_norm": 2.0600644819037193, "learning_rate": 9.841957983719568e-07, "logits/chosen": 1.400390625, "logits/rejected": 1.639404296875, "logps/chosen": -545.5, "logps/rejected": -844.5, "loss": 0.7118, "rewards/accuracies": 0.84375, "rewards/chosen": 0.242919921875, "rewards/margins": 2.388671875, "rewards/rejected": -2.146484375, "step": 934 }, { "epoch": 0.17667343757381077, "grad_norm": 2.6294924602761647, "learning_rate": 9.841177335233333e-07, "logits/chosen": 1.33123779296875, "logits/rejected": 1.359375, "logps/chosen": -1046.0, "logps/rejected": -1185.0, "loss": 0.6588, "rewards/accuracies": 0.875, "rewards/chosen": 0.30572509765625, "rewards/margins": 2.587890625, "rewards/rejected": -2.28125, "step": 935 }, { "epoch": 0.17686239312201804, "grad_norm": 2.093751915751472, "learning_rate": 9.840394798120536e-07, "logits/chosen": 0.607421875, "logits/rejected": 0.67138671875, "logps/chosen": -808.0, "logps/rejected": -834.0, "loss": 0.6782, "rewards/accuracies": 0.84375, "rewards/chosen": 0.50537109375, "rewards/margins": 2.8984375, "rewards/rejected": -2.388671875, "step": 936 }, { "epoch": 0.17705134867022532, "grad_norm": 1.5237205253018897, "learning_rate": 9.839610372721619e-07, "logits/chosen": 1.681640625, "logits/rejected": 1.56298828125, "logps/chosen": -1076.5, "logps/rejected": -18519.0, "loss": 0.7203, "rewards/accuracies": 0.75, "rewards/chosen": 0.34814453125, "rewards/margins": 123.2451171875, "rewards/rejected": -123.1279296875, "step": 937 }, { "epoch": 0.17724030421843262, "grad_norm": 1.9100124591916106, "learning_rate": 9.838824059377845e-07, "logits/chosen": 0.8433837890625, "logits/rejected": 1.20263671875, "logps/chosen": -516.5, "logps/rejected": -788.0, "loss": 0.6744, "rewards/accuracies": 0.78125, "rewards/chosen": 0.498046875, "rewards/margins": 2.5234375, "rewards/rejected": -2.0234375, "step": 938 }, { "epoch": 0.1774292597666399, "grad_norm": 2.1235380007457274, "learning_rate": 9.838035858431298e-07, "logits/chosen": 0.548828125, "logits/rejected": 0.869140625, "logps/chosen": -19785.0, "logps/rejected": -1313.0, "loss": 0.6514, "rewards/accuracies": 0.84375, "rewards/chosen": -114.1845703125, "rewards/margins": -112.21875, "rewards/rejected": -2.244140625, "step": 939 }, { "epoch": 0.17761821531484717, "grad_norm": 2.102406212833626, "learning_rate": 9.837245770224888e-07, "logits/chosen": 1.0205078125, "logits/rejected": 1.06640625, "logps/chosen": -547.5, "logps/rejected": -1508.0, "loss": 0.7685, "rewards/accuracies": 0.75, "rewards/chosen": 0.344482421875, "rewards/margins": 1.634765625, "rewards/rejected": -1.291015625, "step": 940 }, { "epoch": 0.17780717086305448, "grad_norm": 2.1999410245264586, "learning_rate": 9.83645379510234e-07, "logits/chosen": 0.2890625, "logits/rejected": 0.591796875, "logps/chosen": -20077.0, "logps/rejected": -899.0, "loss": 0.5783, "rewards/accuracies": 0.90625, "rewards/chosen": -113.7861328125, "rewards/margins": -112.08984375, "rewards/rejected": -2.013671875, "step": 941 }, { "epoch": 0.17799612641126175, "grad_norm": 2.0216978000731807, "learning_rate": 9.835659933408202e-07, "logits/chosen": 1.3841552734375, "logits/rejected": 1.490966796875, "logps/chosen": -833.0, "logps/rejected": -805.0, "loss": 0.7375, "rewards/accuracies": 0.71875, "rewards/chosen": 0.555816650390625, "rewards/margins": 1.7470703125, "rewards/rejected": -1.1943359375, "step": 942 }, { "epoch": 0.17818508195946903, "grad_norm": 2.459125470199289, "learning_rate": 9.834864185487842e-07, "logits/chosen": 1.735595703125, "logits/rejected": 1.8359375, "logps/chosen": -655.0, "logps/rejected": -782.0, "loss": 0.6184, "rewards/accuracies": 0.84375, "rewards/chosen": 0.94921875, "rewards/margins": 2.6015625, "rewards/rejected": -1.6513671875, "step": 943 }, { "epoch": 0.17837403750767633, "grad_norm": 2.2967600879849215, "learning_rate": 9.834066551687454e-07, "logits/chosen": 0.70947265625, "logits/rejected": 0.8330078125, "logps/chosen": -686.0, "logps/rejected": -842.0, "loss": 0.6754, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5126953125, "rewards/margins": 2.154296875, "rewards/rejected": -1.638671875, "step": 944 }, { "epoch": 0.1785629930558836, "grad_norm": 2.157310971225129, "learning_rate": 9.833267032354044e-07, "logits/chosen": 1.244140625, "logits/rejected": 0.962890625, "logps/chosen": -787.0, "logps/rejected": -657.0, "loss": 0.6254, "rewards/accuracies": 0.875, "rewards/chosen": 0.74951171875, "rewards/margins": 2.693359375, "rewards/rejected": -1.93896484375, "step": 945 }, { "epoch": 0.17875194860409088, "grad_norm": 1.7011382386952592, "learning_rate": 9.832465627835443e-07, "logits/chosen": 1.30810546875, "logits/rejected": 1.5693359375, "logps/chosen": -499.0, "logps/rejected": -705.0, "loss": 0.6977, "rewards/accuracies": 0.8125, "rewards/chosen": 0.60888671875, "rewards/margins": 2.484375, "rewards/rejected": -1.8759765625, "step": 946 }, { "epoch": 0.17894090415229816, "grad_norm": 2.050794700467854, "learning_rate": 9.831662338480303e-07, "logits/chosen": 1.5732421875, "logits/rejected": 1.58154296875, "logps/chosen": -767.5, "logps/rejected": -690.0, "loss": 0.7521, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24072265625, "rewards/margins": 1.708984375, "rewards/rejected": -1.46875, "step": 947 }, { "epoch": 0.17912985970050546, "grad_norm": 2.2950311195244177, "learning_rate": 9.830857164638094e-07, "logits/chosen": 1.478515625, "logits/rejected": 1.57763671875, "logps/chosen": -1035.0, "logps/rejected": -1205.5, "loss": 0.6479, "rewards/accuracies": 0.8125, "rewards/chosen": 0.797119140625, "rewards/margins": 3.00390625, "rewards/rejected": -2.21240234375, "step": 948 }, { "epoch": 0.17931881524871274, "grad_norm": 2.1231996992206525, "learning_rate": 9.830050106659105e-07, "logits/chosen": 0.13037109375, "logits/rejected": 0.8558349609375, "logps/chosen": -672.0, "logps/rejected": -773.0, "loss": 0.7773, "rewards/accuracies": 0.65625, "rewards/chosen": 0.395263671875, "rewards/margins": 1.763671875, "rewards/rejected": -1.369140625, "step": 949 }, { "epoch": 0.17950777079692, "grad_norm": 1.8050407022662827, "learning_rate": 9.829241164894448e-07, "logits/chosen": 0.40869140625, "logits/rejected": 0.12451171875, "logps/chosen": -639.5, "logps/rejected": -618.0, "loss": 0.6689, "rewards/accuracies": 0.875, "rewards/chosen": 0.26251220703125, "rewards/margins": 2.77734375, "rewards/rejected": -2.515625, "step": 950 }, { "epoch": 0.17969672634512732, "grad_norm": 2.008697265464324, "learning_rate": 9.828430339696053e-07, "logits/chosen": 1.61328125, "logits/rejected": 1.560546875, "logps/chosen": -1041.0, "logps/rejected": -1116.0, "loss": 0.7032, "rewards/accuracies": 0.78125, "rewards/chosen": 0.639892578125, "rewards/margins": 2.990234375, "rewards/rejected": -2.3515625, "step": 951 }, { "epoch": 0.1798856818933346, "grad_norm": 1.9137702756434554, "learning_rate": 9.827617631416665e-07, "logits/chosen": 0.744140625, "logits/rejected": 0.923828125, "logps/chosen": -864.0, "logps/rejected": -1211.5, "loss": 0.6136, "rewards/accuracies": 0.875, "rewards/chosen": 0.56640625, "rewards/margins": 3.0703125, "rewards/rejected": -2.49609375, "step": 952 }, { "epoch": 0.18007463744154187, "grad_norm": 1.6893521035326997, "learning_rate": 9.826803040409859e-07, "logits/chosen": 1.37890625, "logits/rejected": 1.927734375, "logps/chosen": -804.0, "logps/rejected": -1358.0, "loss": 0.5809, "rewards/accuracies": 0.8125, "rewards/chosen": 0.847900390625, "rewards/margins": 3.833984375, "rewards/rejected": -2.990234375, "step": 953 }, { "epoch": 0.18026359298974917, "grad_norm": 2.831116926456701, "learning_rate": 9.825986567030017e-07, "logits/chosen": 1.333984375, "logits/rejected": 0.9453125, "logps/chosen": -595.0, "logps/rejected": -691.0, "loss": 0.6648, "rewards/accuracies": 0.84375, "rewards/chosen": 0.121337890625, "rewards/margins": 2.306640625, "rewards/rejected": -2.1875, "step": 954 }, { "epoch": 0.18045254853795645, "grad_norm": 2.14293750642228, "learning_rate": 9.825168211632345e-07, "logits/chosen": 0.407470703125, "logits/rejected": 1.1865234375, "logps/chosen": -1125.0, "logps/rejected": -1103.0, "loss": 0.5892, "rewards/accuracies": 0.8125, "rewards/chosen": 0.38525390625, "rewards/margins": 3.65625, "rewards/rejected": -3.26953125, "step": 955 }, { "epoch": 0.18064150408616372, "grad_norm": 1.9185772748230974, "learning_rate": 9.824347974572873e-07, "logits/chosen": 0.9317626953125, "logits/rejected": 0.5185546875, "logps/chosen": -724.0, "logps/rejected": -660.0, "loss": 0.6422, "rewards/accuracies": 0.75, "rewards/chosen": 0.5869140625, "rewards/margins": 2.255859375, "rewards/rejected": -1.669921875, "step": 956 }, { "epoch": 0.18083045963437103, "grad_norm": 2.2506066197897012, "learning_rate": 9.82352585620844e-07, "logits/chosen": 1.408935546875, "logits/rejected": 1.5654296875, "logps/chosen": -844.5, "logps/rejected": -975.0, "loss": 0.603, "rewards/accuracies": 0.875, "rewards/chosen": 0.35009765625, "rewards/margins": 3.93359375, "rewards/rejected": -3.583984375, "step": 957 }, { "epoch": 0.1810194151825783, "grad_norm": 2.045743051976816, "learning_rate": 9.822701856896708e-07, "logits/chosen": 0.564697265625, "logits/rejected": 1.1650390625, "logps/chosen": -809.0, "logps/rejected": -1887.0, "loss": 0.6884, "rewards/accuracies": 0.84375, "rewards/chosen": 0.368408203125, "rewards/margins": 3.5, "rewards/rejected": -3.130859375, "step": 958 }, { "epoch": 0.18120837073078558, "grad_norm": 2.2114362227010034, "learning_rate": 9.82187597699616e-07, "logits/chosen": 1.08203125, "logits/rejected": 0.6912841796875, "logps/chosen": -770.5, "logps/rejected": -598.0, "loss": 0.7296, "rewards/accuracies": 0.6875, "rewards/chosen": 0.412109375, "rewards/margins": 1.794921875, "rewards/rejected": -1.3828125, "step": 959 }, { "epoch": 0.18139732627899285, "grad_norm": 1.9378890021348507, "learning_rate": 9.821048216866093e-07, "logits/chosen": 1.3740234375, "logits/rejected": 1.6796875, "logps/chosen": -627.0, "logps/rejected": -723.0, "loss": 0.6433, "rewards/accuracies": 0.78125, "rewards/chosen": 0.55517578125, "rewards/margins": 2.8203125, "rewards/rejected": -2.263671875, "step": 960 }, { "epoch": 0.18158628182720016, "grad_norm": 1.6856896488373, "learning_rate": 9.820218576866624e-07, "logits/chosen": 1.66796875, "logits/rejected": 1.5810546875, "logps/chosen": -587.5, "logps/rejected": -608.5, "loss": 0.6106, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2138671875, "rewards/margins": 2.599609375, "rewards/rejected": -2.38671875, "step": 961 }, { "epoch": 0.18177523737540743, "grad_norm": 1.7064170995126926, "learning_rate": 9.819387057358687e-07, "logits/chosen": 0.88720703125, "logits/rejected": 0.5478515625, "logps/chosen": -689.5, "logps/rejected": -835.5, "loss": 0.5498, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6171875, "rewards/margins": 3.287109375, "rewards/rejected": -2.677734375, "step": 962 }, { "epoch": 0.1819641929236147, "grad_norm": 1.842157176636128, "learning_rate": 9.818553658704035e-07, "logits/chosen": 0.715576171875, "logits/rejected": 0.7275390625, "logps/chosen": -479.5, "logps/rejected": -475.0, "loss": 0.7438, "rewards/accuracies": 0.75, "rewards/chosen": 0.19384765625, "rewards/margins": 1.771484375, "rewards/rejected": -1.578125, "step": 963 }, { "epoch": 0.182153148471822, "grad_norm": 1.8546486160716016, "learning_rate": 9.817718381265238e-07, "logits/chosen": 1.310546875, "logits/rejected": 0.69140625, "logps/chosen": -800.5, "logps/rejected": -662.5, "loss": 0.6316, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66259765625, "rewards/margins": 2.6328125, "rewards/rejected": -1.966796875, "step": 964 }, { "epoch": 0.1823421040200293, "grad_norm": 1.8525866650158334, "learning_rate": 9.816881225405681e-07, "logits/chosen": 1.078125, "logits/rejected": 0.489990234375, "logps/chosen": -636.5, "logps/rejected": -581.0, "loss": 0.706, "rewards/accuracies": 0.875, "rewards/chosen": 0.01995849609375, "rewards/margins": 2.228515625, "rewards/rejected": -2.205078125, "step": 965 }, { "epoch": 0.18253105956823656, "grad_norm": 1.5964299794032233, "learning_rate": 9.816042191489569e-07, "logits/chosen": 1.556640625, "logits/rejected": 1.53125, "logps/chosen": -1076.0, "logps/rejected": -1004.0, "loss": 0.6472, "rewards/accuracies": 0.78125, "rewards/chosen": 0.24365234375, "rewards/margins": 3.26953125, "rewards/rejected": -3.0234375, "step": 966 }, { "epoch": 0.18272001511644387, "grad_norm": 402.87872631528484, "learning_rate": 9.815201279881924e-07, "logits/chosen": 1.78515625, "logits/rejected": 2.283203125, "logps/chosen": -8864.0, "logps/rejected": -4259.5, "loss": 0.7062, "rewards/accuracies": 0.8125, "rewards/chosen": -97.0849609375, "rewards/margins": -95.3984375, "rewards/rejected": -1.6259765625, "step": 967 }, { "epoch": 0.18290897066465114, "grad_norm": 2.0711483308031458, "learning_rate": 9.814358490948581e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.4296875, "logps/chosen": -804.0, "logps/rejected": -746.0, "loss": 0.6275, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6875, "rewards/margins": 3.099609375, "rewards/rejected": -2.412109375, "step": 968 }, { "epoch": 0.18309792621285842, "grad_norm": 2.141426845426683, "learning_rate": 9.8135138250562e-07, "logits/chosen": 1.11932373046875, "logits/rejected": 1.4990234375, "logps/chosen": -741.0, "logps/rejected": -732.0, "loss": 0.6419, "rewards/accuracies": 0.90625, "rewards/chosen": 0.40869140625, "rewards/margins": 2.384765625, "rewards/rejected": -1.978515625, "step": 969 }, { "epoch": 0.18328688176106572, "grad_norm": 2.123237898755207, "learning_rate": 9.812667282572249e-07, "logits/chosen": 1.73828125, "logits/rejected": 1.01025390625, "logps/chosen": -863.0, "logps/rejected": -734.0, "loss": 0.6695, "rewards/accuracies": 0.8125, "rewards/chosen": 0.43896484375, "rewards/margins": 2.125, "rewards/rejected": -1.685546875, "step": 970 }, { "epoch": 0.183475837309273, "grad_norm": 1.7919220698289657, "learning_rate": 9.811818863865015e-07, "logits/chosen": 2.2333984375, "logits/rejected": 2.044921875, "logps/chosen": -755.0, "logps/rejected": -787.5, "loss": 0.6532, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7646484375, "rewards/margins": 2.64453125, "rewards/rejected": -1.87890625, "step": 971 }, { "epoch": 0.18366479285748027, "grad_norm": 2.0203626354609954, "learning_rate": 9.810968569303603e-07, "logits/chosen": 1.62353515625, "logits/rejected": 2.6611328125, "logps/chosen": -841.0, "logps/rejected": -2520.0, "loss": 0.5892, "rewards/accuracies": 0.875, "rewards/chosen": 0.77099609375, "rewards/margins": 4.078125, "rewards/rejected": -3.30078125, "step": 972 }, { "epoch": 0.18385374840568755, "grad_norm": 2.4163445202514984, "learning_rate": 9.810116399257937e-07, "logits/chosen": 2.55859375, "logits/rejected": 2.98828125, "logps/chosen": -565.25, "logps/rejected": -744.5, "loss": 0.7415, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6474609375, "rewards/margins": 1.865234375, "rewards/rejected": -1.2236328125, "step": 973 }, { "epoch": 0.18404270395389485, "grad_norm": 2.2130481263643533, "learning_rate": 9.809262354098746e-07, "logits/chosen": 1.513427734375, "logits/rejected": 1.5023193359375, "logps/chosen": -997.0, "logps/rejected": -782.0, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": 0.65576171875, "rewards/margins": 2.74609375, "rewards/rejected": -2.0859375, "step": 974 }, { "epoch": 0.18423165950210213, "grad_norm": 1.6901988651761268, "learning_rate": 9.808406434197585e-07, "logits/chosen": 1.5205078125, "logits/rejected": 1.7744140625, "logps/chosen": -910.0, "logps/rejected": -847.0, "loss": 0.5406, "rewards/accuracies": 0.84375, "rewards/chosen": 0.708740234375, "rewards/margins": 3.26171875, "rewards/rejected": -2.546875, "step": 975 }, { "epoch": 0.1844206150503094, "grad_norm": 1.9612488884706767, "learning_rate": 9.807548639926824e-07, "logits/chosen": 2.109375, "logits/rejected": 2.0927734375, "logps/chosen": -1000.5, "logps/rejected": -850.5, "loss": 0.5914, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7138671875, "rewards/margins": 2.85546875, "rewards/rejected": -2.138671875, "step": 976 }, { "epoch": 0.1846095705985167, "grad_norm": 1.9983765973190328, "learning_rate": 9.806688971659644e-07, "logits/chosen": 1.79345703125, "logits/rejected": 1.6591796875, "logps/chosen": -577.0, "logps/rejected": -588.5, "loss": 0.7832, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0263671875, "rewards/margins": 1.8017578125, "rewards/rejected": -1.7783203125, "step": 977 }, { "epoch": 0.18479852614672398, "grad_norm": 1.9904304954657845, "learning_rate": 9.80582742977004e-07, "logits/chosen": 1.9208984375, "logits/rejected": 2.61328125, "logps/chosen": -573.0, "logps/rejected": -806.5, "loss": 0.679, "rewards/accuracies": 0.875, "rewards/chosen": 0.44140625, "rewards/margins": 2.2890625, "rewards/rejected": -1.841796875, "step": 978 }, { "epoch": 0.18498748169493126, "grad_norm": 2.3281858860756928, "learning_rate": 9.804964014632833e-07, "logits/chosen": 1.54296875, "logits/rejected": 2.4140625, "logps/chosen": -473.25, "logps/rejected": -970.0, "loss": 0.7604, "rewards/accuracies": 0.8125, "rewards/chosen": -0.273193359375, "rewards/margins": 2.30859375, "rewards/rejected": -2.583984375, "step": 979 }, { "epoch": 0.18517643724313856, "grad_norm": 2.148602148688885, "learning_rate": 9.804098726623643e-07, "logits/chosen": 1.51171875, "logits/rejected": 2.2529296875, "logps/chosen": -460.25, "logps/rejected": -670.0, "loss": 0.7126, "rewards/accuracies": 0.8125, "rewards/chosen": 0.188720703125, "rewards/margins": 2.12109375, "rewards/rejected": -1.93603515625, "step": 980 }, { "epoch": 0.18536539279134584, "grad_norm": 2.305133090411001, "learning_rate": 9.803231566118916e-07, "logits/chosen": 1.348876953125, "logits/rejected": 1.88525390625, "logps/chosen": -1014.0, "logps/rejected": -1226.0, "loss": 0.6558, "rewards/accuracies": 0.8125, "rewards/chosen": 0.47314453125, "rewards/margins": 3.544921875, "rewards/rejected": -3.05859375, "step": 981 }, { "epoch": 0.18555434833955312, "grad_norm": 2.604727672119204, "learning_rate": 9.80236253349591e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.4344482421875, "logps/chosen": -698.0, "logps/rejected": -694.0, "loss": 0.7153, "rewards/accuracies": 0.71875, "rewards/chosen": 0.34912109375, "rewards/margins": 2.46875, "rewards/rejected": -2.123046875, "step": 982 }, { "epoch": 0.1857433038877604, "grad_norm": 1.7903436624473377, "learning_rate": 9.8014916291327e-07, "logits/chosen": 1.14453125, "logits/rejected": 1.482421875, "logps/chosen": -609.5, "logps/rejected": -577.0, "loss": 0.619, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66943359375, "rewards/margins": 2.775390625, "rewards/rejected": -2.1025390625, "step": 983 }, { "epoch": 0.1859322594359677, "grad_norm": 2.7015087801724724, "learning_rate": 9.80061885340817e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.2578125, "logps/chosen": -859.0, "logps/rejected": -723.5, "loss": 0.7054, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22015380859375, "rewards/margins": 2.447265625, "rewards/rejected": -2.23046875, "step": 984 }, { "epoch": 0.18612121498417497, "grad_norm": 2.0587564963688085, "learning_rate": 9.799744206702016e-07, "logits/chosen": 1.52587890625, "logits/rejected": 1.129150390625, "logps/chosen": -693.5, "logps/rejected": -650.5, "loss": 0.6249, "rewards/accuracies": 0.875, "rewards/chosen": 0.61572265625, "rewards/margins": 2.48828125, "rewards/rejected": -1.875, "step": 985 }, { "epoch": 0.18631017053238225, "grad_norm": 1.5184719333588408, "learning_rate": 9.798867689394758e-07, "logits/chosen": 2.509765625, "logits/rejected": 2.4375, "logps/chosen": -707.75, "logps/rejected": -775.25, "loss": 0.5977, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0439453125, "rewards/margins": 2.53515625, "rewards/rejected": -1.4912109375, "step": 986 }, { "epoch": 0.18649912608058955, "grad_norm": 1.9219355566183822, "learning_rate": 9.797989301867723e-07, "logits/chosen": 1.9189453125, "logits/rejected": 2.712890625, "logps/chosen": -969.5, "logps/rejected": -1379.0, "loss": 0.6391, "rewards/accuracies": 0.875, "rewards/chosen": 0.93939208984375, "rewards/margins": 3.630859375, "rewards/rejected": -2.685546875, "step": 987 }, { "epoch": 0.18668808162879683, "grad_norm": 2.275852293334765, "learning_rate": 9.797109044503055e-07, "logits/chosen": 2.361328125, "logits/rejected": 2.373046875, "logps/chosen": -986.0, "logps/rejected": -1153.0, "loss": 0.6622, "rewards/accuracies": 0.8125, "rewards/chosen": 1.17919921875, "rewards/margins": 3.09326171875, "rewards/rejected": -1.9150390625, "step": 988 }, { "epoch": 0.1868770371770041, "grad_norm": 1.8073382841691603, "learning_rate": 9.796226917683706e-07, "logits/chosen": 2.01953125, "logits/rejected": 2.109375, "logps/chosen": -746.0, "logps/rejected": -575.0, "loss": 0.758, "rewards/accuracies": 0.75, "rewards/chosen": 0.6221923828125, "rewards/margins": 1.7001953125, "rewards/rejected": -1.07891845703125, "step": 989 }, { "epoch": 0.1870659927252114, "grad_norm": 2.4046193166662526, "learning_rate": 9.795342921793444e-07, "logits/chosen": 1.666015625, "logits/rejected": 2.177734375, "logps/chosen": -1061.5, "logps/rejected": -1031.0, "loss": 0.6088, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0302734375, "rewards/margins": 2.765625, "rewards/rejected": -1.734375, "step": 990 }, { "epoch": 0.18725494827341868, "grad_norm": 2.1366559596941532, "learning_rate": 9.794457057216858e-07, "logits/chosen": 2.55859375, "logits/rejected": 3.2890625, "logps/chosen": -1056.5, "logps/rejected": -2087.0, "loss": 0.7584, "rewards/accuracies": 0.6875, "rewards/chosen": -0.260009765625, "rewards/margins": 1.99365234375, "rewards/rejected": -2.26025390625, "step": 991 }, { "epoch": 0.18744390382162596, "grad_norm": 2.170270750055544, "learning_rate": 9.793569324339332e-07, "logits/chosen": 2.09765625, "logits/rejected": 2.3671875, "logps/chosen": -1135.0, "logps/rejected": -1008.0, "loss": 0.6817, "rewards/accuracies": 0.8125, "rewards/chosen": 1.37548828125, "rewards/margins": 2.966796875, "rewards/rejected": -1.5966796875, "step": 992 }, { "epoch": 0.18763285936983326, "grad_norm": 2.220738661157399, "learning_rate": 9.792679723547083e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.4375, "logps/chosen": -1123.75, "logps/rejected": -1301.0, "loss": 0.6999, "rewards/accuracies": 0.75, "rewards/chosen": 1.01611328125, "rewards/margins": 2.763671875, "rewards/rejected": -1.748046875, "step": 993 }, { "epoch": 0.18782181491804054, "grad_norm": 1.6978769804612925, "learning_rate": 9.791788255227128e-07, "logits/chosen": 1.867919921875, "logits/rejected": 2.5625, "logps/chosen": -756.0, "logps/rejected": -1898.0, "loss": 0.6162, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8583984375, "rewards/margins": 3.6796875, "rewards/rejected": -2.822265625, "step": 994 }, { "epoch": 0.1880107704662478, "grad_norm": 2.081263530341672, "learning_rate": 9.790894919767298e-07, "logits/chosen": 2.658203125, "logits/rejected": 2.501953125, "logps/chosen": -886.0, "logps/rejected": -801.0, "loss": 0.6701, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7353515625, "rewards/margins": 2.255859375, "rewards/rejected": -1.521484375, "step": 995 }, { "epoch": 0.1881997260144551, "grad_norm": 1.6730752014056838, "learning_rate": 9.78999971755624e-07, "logits/chosen": 1.763671875, "logits/rejected": 1.763427734375, "logps/chosen": -813.0, "logps/rejected": -721.5, "loss": 0.602, "rewards/accuracies": 0.875, "rewards/chosen": 0.913330078125, "rewards/margins": 2.95703125, "rewards/rejected": -2.044921875, "step": 996 }, { "epoch": 0.1883886815626624, "grad_norm": 2.011888319778643, "learning_rate": 9.789102648983411e-07, "logits/chosen": 0.9404296875, "logits/rejected": 1.20458984375, "logps/chosen": -995.5, "logps/rejected": -1330.0, "loss": 0.7006, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5618896484375, "rewards/margins": 2.5390625, "rewards/rejected": -1.98046875, "step": 997 }, { "epoch": 0.18857763711086967, "grad_norm": 1.881705042619157, "learning_rate": 9.788203714439079e-07, "logits/chosen": 2.154296875, "logits/rejected": 1.362548828125, "logps/chosen": -554.5, "logps/rejected": -706.0, "loss": 0.6693, "rewards/accuracies": 0.71875, "rewards/chosen": 0.46533203125, "rewards/margins": 2.6015625, "rewards/rejected": -2.13671875, "step": 998 }, { "epoch": 0.18876659265907694, "grad_norm": 1.832705995280525, "learning_rate": 9.787302914314328e-07, "logits/chosen": 1.67236328125, "logits/rejected": 1.54541015625, "logps/chosen": -657.0, "logps/rejected": -923.0, "loss": 0.6671, "rewards/accuracies": 0.875, "rewards/chosen": 0.625732421875, "rewards/margins": 2.859375, "rewards/rejected": -2.234375, "step": 999 }, { "epoch": 0.18895554820728425, "grad_norm": 2.3230406764414164, "learning_rate": 9.786400249001044e-07, "logits/chosen": 1.9423828125, "logits/rejected": 2.748046875, "logps/chosen": -580.0, "logps/rejected": -1124.0, "loss": 0.7031, "rewards/accuracies": 0.78125, "rewards/chosen": 0.092041015625, "rewards/margins": 2.966796875, "rewards/rejected": -2.873046875, "step": 1000 }, { "epoch": 0.18914450375549152, "grad_norm": 1.7751571071019558, "learning_rate": 9.78549571889194e-07, "logits/chosen": 1.1669921875, "logits/rejected": 1.890625, "logps/chosen": -703.5, "logps/rejected": -775.0, "loss": 0.7574, "rewards/accuracies": 0.75, "rewards/chosen": 0.356201171875, "rewards/margins": 1.69287109375, "rewards/rejected": -1.3359375, "step": 1001 }, { "epoch": 0.1893334593036988, "grad_norm": 1.7465202003102094, "learning_rate": 9.784589324380525e-07, "logits/chosen": 1.66015625, "logits/rejected": 1.599609375, "logps/chosen": -749.0, "logps/rejected": -666.5, "loss": 0.627, "rewards/accuracies": 0.875, "rewards/chosen": 0.5041351318359375, "rewards/margins": 2.484375, "rewards/rejected": -1.98046875, "step": 1002 }, { "epoch": 0.1895224148519061, "grad_norm": 1.789140134581503, "learning_rate": 9.783681065861127e-07, "logits/chosen": 1.9482421875, "logits/rejected": 1.583984375, "logps/chosen": -1012.0, "logps/rejected": -1068.5, "loss": 0.6654, "rewards/accuracies": 0.71875, "rewards/chosen": 0.52191162109375, "rewards/margins": 3.1318359375, "rewards/rejected": -2.60546875, "step": 1003 }, { "epoch": 0.18971137040011338, "grad_norm": 2.032713694776223, "learning_rate": 9.782770943728882e-07, "logits/chosen": 0.668701171875, "logits/rejected": 0.822265625, "logps/chosen": -676.0, "logps/rejected": -779.0, "loss": 0.6999, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6904296875, "rewards/margins": 2.232421875, "rewards/rejected": -1.537109375, "step": 1004 }, { "epoch": 0.18990032594832065, "grad_norm": 1.9127914821527705, "learning_rate": 9.781858958379741e-07, "logits/chosen": 1.9453125, "logits/rejected": 1.4111328125, "logps/chosen": -610.5, "logps/rejected": -690.5, "loss": 0.6149, "rewards/accuracies": 0.8125, "rewards/chosen": 0.809326171875, "rewards/margins": 2.9453125, "rewards/rejected": -2.13671875, "step": 1005 }, { "epoch": 0.19008928149652793, "grad_norm": 1.83003395404457, "learning_rate": 9.78094511021046e-07, "logits/chosen": 1.787109375, "logits/rejected": 1.490234375, "logps/chosen": -819.5, "logps/rejected": -854.0, "loss": 0.5367, "rewards/accuracies": 0.875, "rewards/chosen": 1.10546875, "rewards/margins": 3.1875, "rewards/rejected": -2.076171875, "step": 1006 }, { "epoch": 0.19027823704473523, "grad_norm": 2.1828855107775045, "learning_rate": 9.780029399618611e-07, "logits/chosen": 1.3697509765625, "logits/rejected": 1.720947265625, "logps/chosen": -792.0, "logps/rejected": -982.0, "loss": 0.682, "rewards/accuracies": 0.8125, "rewards/chosen": 0.826171875, "rewards/margins": 1.8974609375, "rewards/rejected": -1.06866455078125, "step": 1007 }, { "epoch": 0.1904671925929425, "grad_norm": 1.6166583403045431, "learning_rate": 9.77911182700257e-07, "logits/chosen": 1.31005859375, "logits/rejected": 1.4169921875, "logps/chosen": -552.0, "logps/rejected": -510.5, "loss": 0.6943, "rewards/accuracies": 0.78125, "rewards/chosen": 0.318359375, "rewards/margins": 2.1494140625, "rewards/rejected": -1.83203125, "step": 1008 }, { "epoch": 0.19065614814114978, "grad_norm": 1.713005671325991, "learning_rate": 9.778192392761529e-07, "logits/chosen": 1.5087890625, "logits/rejected": 1.7381591796875, "logps/chosen": -949.0, "logps/rejected": -1874.5, "loss": 0.6183, "rewards/accuracies": 0.84375, "rewards/chosen": 0.909912109375, "rewards/margins": 3.55078125, "rewards/rejected": -2.62890625, "step": 1009 }, { "epoch": 0.1908451036893571, "grad_norm": 1.9823455252967175, "learning_rate": 9.777271097295484e-07, "logits/chosen": 1.6982421875, "logits/rejected": 1.4384765625, "logps/chosen": -779.5, "logps/rejected": -851.0, "loss": 0.7374, "rewards/accuracies": 0.84375, "rewards/chosen": 0.583251953125, "rewards/margins": 1.896484375, "rewards/rejected": -1.31396484375, "step": 1010 }, { "epoch": 0.19103405923756436, "grad_norm": 1.8859542018429833, "learning_rate": 9.776347941005248e-07, "logits/chosen": 1.0146484375, "logits/rejected": 1.28515625, "logps/chosen": -544.0, "logps/rejected": -408.0, "loss": 0.7054, "rewards/accuracies": 0.8125, "rewards/chosen": 0.374267578125, "rewards/margins": 1.84765625, "rewards/rejected": -1.4716796875, "step": 1011 }, { "epoch": 0.19122301478577164, "grad_norm": 1.699768218665709, "learning_rate": 9.775422924292437e-07, "logits/chosen": 2.291015625, "logits/rejected": 1.744140625, "logps/chosen": -1016.0, "logps/rejected": -917.0, "loss": 0.6067, "rewards/accuracies": 0.71875, "rewards/chosen": 1.15625, "rewards/margins": 2.89453125, "rewards/rejected": -1.73486328125, "step": 1012 }, { "epoch": 0.19141197033397894, "grad_norm": 2.1697698875230995, "learning_rate": 9.77449604755948e-07, "logits/chosen": 2.0595703125, "logits/rejected": 1.97265625, "logps/chosen": -597.5, "logps/rejected": -494.0, "loss": 0.6053, "rewards/accuracies": 0.90625, "rewards/chosen": 0.81640625, "rewards/margins": 2.87890625, "rewards/rejected": -2.064453125, "step": 1013 }, { "epoch": 0.19160092588218622, "grad_norm": 1.9282279276168812, "learning_rate": 9.773567311209616e-07, "logits/chosen": 1.619140625, "logits/rejected": 2.041015625, "logps/chosen": -1123.0, "logps/rejected": -900.0, "loss": 0.6737, "rewards/accuracies": 0.78125, "rewards/chosen": 0.628173828125, "rewards/margins": 2.58984375, "rewards/rejected": -1.9609375, "step": 1014 }, { "epoch": 0.1917898814303935, "grad_norm": 1.794300328216302, "learning_rate": 9.772636715646887e-07, "logits/chosen": 1.791015625, "logits/rejected": 1.841796875, "logps/chosen": -552.0, "logps/rejected": -609.0, "loss": 0.6851, "rewards/accuracies": 0.75, "rewards/chosen": 0.5595703125, "rewards/margins": 2.31640625, "rewards/rejected": -1.7578125, "step": 1015 }, { "epoch": 0.1919788369786008, "grad_norm": 1.9826143886296095, "learning_rate": 9.771704261276148e-07, "logits/chosen": 1.193603515625, "logits/rejected": 1.55029296875, "logps/chosen": -593.0, "logps/rejected": -554.5, "loss": 0.6905, "rewards/accuracies": 0.8125, "rewards/chosen": 0.296142578125, "rewards/margins": 2.1015625, "rewards/rejected": -1.802734375, "step": 1016 }, { "epoch": 0.19216779252680807, "grad_norm": 1.9428620541020056, "learning_rate": 9.770769948503068e-07, "logits/chosen": 1.447265625, "logits/rejected": 1.10986328125, "logps/chosen": -499.5, "logps/rejected": -416.5, "loss": 0.7455, "rewards/accuracies": 0.6875, "rewards/chosen": 0.580078125, "rewards/margins": 1.4970703125, "rewards/rejected": -0.92041015625, "step": 1017 }, { "epoch": 0.19235674807501535, "grad_norm": 1.8411157947717203, "learning_rate": 9.769833777734115e-07, "logits/chosen": 1.9609375, "logits/rejected": 1.958984375, "logps/chosen": -664.0, "logps/rejected": -791.0, "loss": 0.609, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6052627563476562, "rewards/margins": 3.294921875, "rewards/rejected": -2.6875, "step": 1018 }, { "epoch": 0.19254570362322262, "grad_norm": 2.5460832502117188, "learning_rate": 9.768895749376572e-07, "logits/chosen": 1.5380859375, "logits/rejected": 1.640625, "logps/chosen": -678.5, "logps/rejected": -692.0, "loss": 0.6936, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0703125, "rewards/margins": 2.37890625, "rewards/rejected": -2.455078125, "step": 1019 }, { "epoch": 0.19273465917142993, "grad_norm": 2.224921658001625, "learning_rate": 9.767955863838524e-07, "logits/chosen": 1.78271484375, "logits/rejected": 1.3251953125, "logps/chosen": -1329.0, "logps/rejected": -1123.0, "loss": 0.6397, "rewards/accuracies": 0.8125, "rewards/chosen": 0.65478515625, "rewards/margins": 3.642578125, "rewards/rejected": -2.9921875, "step": 1020 }, { "epoch": 0.1929236147196372, "grad_norm": 2.3557312515080815, "learning_rate": 9.767014121528873e-07, "logits/chosen": 0.9150390625, "logits/rejected": 0.5478515625, "logps/chosen": -725.0, "logps/rejected": -620.5, "loss": 0.6341, "rewards/accuracies": 0.96875, "rewards/chosen": -0.014404296875, "rewards/margins": 2.80859375, "rewards/rejected": -2.826171875, "step": 1021 }, { "epoch": 0.19311257026784448, "grad_norm": 1.6746124094696624, "learning_rate": 9.766070522857318e-07, "logits/chosen": 1.15625, "logits/rejected": 1.244873046875, "logps/chosen": -654.0, "logps/rejected": -1421.0, "loss": 0.6872, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4102783203125, "rewards/margins": 3.669921875, "rewards/rejected": -3.263671875, "step": 1022 }, { "epoch": 0.19330152581605178, "grad_norm": 3.191522527561377, "learning_rate": 9.765125068234373e-07, "logits/chosen": 1.15380859375, "logits/rejected": 0.91943359375, "logps/chosen": -829.0, "logps/rejected": -877.5, "loss": 0.6061, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8193359375, "rewards/margins": 2.880859375, "rewards/rejected": -2.05859375, "step": 1023 }, { "epoch": 0.19349048136425906, "grad_norm": 1.8106432769110414, "learning_rate": 9.764177758071359e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.9970703125, "logps/chosen": -536.0, "logps/rejected": -533.0, "loss": 0.6932, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0068359375, "rewards/margins": 2.2421875, "rewards/rejected": -2.2275390625, "step": 1024 }, { "epoch": 0.19367943691246634, "grad_norm": 2.167447462186591, "learning_rate": 9.763228592780404e-07, "logits/chosen": 2.44140625, "logits/rejected": 2.0419921875, "logps/chosen": -904.0, "logps/rejected": -762.5, "loss": 0.6746, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39300537109375, "rewards/margins": 2.171875, "rewards/rejected": -1.7802734375, "step": 1025 }, { "epoch": 0.19386839246067364, "grad_norm": 1.9343441957683258, "learning_rate": 9.762277572774434e-07, "logits/chosen": 1.470703125, "logits/rejected": 2.0693359375, "logps/chosen": -1120.0, "logps/rejected": -1909.0, "loss": 0.6844, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2943115234375, "rewards/margins": 3.888671875, "rewards/rejected": -3.591796875, "step": 1026 }, { "epoch": 0.19405734800888091, "grad_norm": 2.1046356043941326, "learning_rate": 9.761324698467198e-07, "logits/chosen": 1.185546875, "logits/rejected": 1.90478515625, "logps/chosen": -814.5, "logps/rejected": -860.0, "loss": 0.7662, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2532958984375, "rewards/margins": 1.779296875, "rewards/rejected": -1.525390625, "step": 1027 }, { "epoch": 0.1942463035570882, "grad_norm": 3.1921873793186877, "learning_rate": 9.76036997027324e-07, "logits/chosen": 1.505859375, "logits/rejected": 2.0478515625, "logps/chosen": -545.0, "logps/rejected": -817.0, "loss": 0.7266, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1259765625, "rewards/margins": 2.55419921875, "rewards/rejected": -2.42822265625, "step": 1028 }, { "epoch": 0.19443525910529547, "grad_norm": 1.7383082878260754, "learning_rate": 9.759413388607914e-07, "logits/chosen": 1.234375, "logits/rejected": 1.4697265625, "logps/chosen": -947.0, "logps/rejected": -1103.0, "loss": 0.5841, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3564453125, "rewards/margins": 4.13671875, "rewards/rejected": -2.78125, "step": 1029 }, { "epoch": 0.19462421465350277, "grad_norm": 1.792680129080254, "learning_rate": 9.758454953887382e-07, "logits/chosen": 1.12109375, "logits/rejected": 1.3983154296875, "logps/chosen": -647.0, "logps/rejected": -713.0, "loss": 0.7191, "rewards/accuracies": 0.75, "rewards/chosen": 0.526123046875, "rewards/margins": 2.05859375, "rewards/rejected": -1.5341796875, "step": 1030 }, { "epoch": 0.19481317020171005, "grad_norm": 4.0060183232995135, "learning_rate": 9.757494666528607e-07, "logits/chosen": 1.359375, "logits/rejected": 1.765625, "logps/chosen": -544.5, "logps/rejected": -719.5, "loss": 0.6972, "rewards/accuracies": 0.8125, "rewards/chosen": 0.685546875, "rewards/margins": 2.228515625, "rewards/rejected": -1.54296875, "step": 1031 }, { "epoch": 0.19500212574991732, "grad_norm": 2.6683090733920922, "learning_rate": 9.756532526949364e-07, "logits/chosen": 1.5855712890625, "logits/rejected": 1.86279296875, "logps/chosen": -1063.0, "logps/rejected": -1008.0, "loss": 0.6003, "rewards/accuracies": 0.84375, "rewards/chosen": 1.02001953125, "rewards/margins": 3.1484375, "rewards/rejected": -2.138671875, "step": 1032 }, { "epoch": 0.19519108129812462, "grad_norm": 1.8496485810687364, "learning_rate": 9.755568535568233e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.3310546875, "logps/chosen": -768.5, "logps/rejected": -680.5, "loss": 0.664, "rewards/accuracies": 0.75, "rewards/chosen": 0.824462890625, "rewards/margins": 2.517578125, "rewards/rejected": -1.6953125, "step": 1033 }, { "epoch": 0.1953800368463319, "grad_norm": 2.330305809327464, "learning_rate": 9.754602692804592e-07, "logits/chosen": 1.85546875, "logits/rejected": 2.193359375, "logps/chosen": -1215.0, "logps/rejected": -1119.0, "loss": 0.612, "rewards/accuracies": 0.875, "rewards/chosen": 1.033935546875, "rewards/margins": 4.390625, "rewards/rejected": -3.359375, "step": 1034 }, { "epoch": 0.19556899239453918, "grad_norm": 1.8796586688529364, "learning_rate": 9.753634999078635e-07, "logits/chosen": 1.423828125, "logits/rejected": 1.708984375, "logps/chosen": -608.5, "logps/rejected": -1594.0, "loss": 0.6418, "rewards/accuracies": 0.75, "rewards/chosen": 0.6982421875, "rewards/margins": 4.30859375, "rewards/rejected": -3.609375, "step": 1035 }, { "epoch": 0.19575794794274648, "grad_norm": 1.5590279866390493, "learning_rate": 9.752665454811356e-07, "logits/chosen": 1.318359375, "logits/rejected": 2.04296875, "logps/chosen": -1197.0, "logps/rejected": -1656.0, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.2138671875, "rewards/margins": 6.6015625, "rewards/rejected": -5.3828125, "step": 1036 }, { "epoch": 0.19594690349095376, "grad_norm": 1.5573502912718542, "learning_rate": 9.751694060424554e-07, "logits/chosen": 1.271484375, "logits/rejected": 0.9306640625, "logps/chosen": -1045.0, "logps/rejected": -1006.5, "loss": 0.6192, "rewards/accuracies": 0.75, "rewards/chosen": 0.7734375, "rewards/margins": 3.703125, "rewards/rejected": -2.921875, "step": 1037 }, { "epoch": 0.19613585903916103, "grad_norm": 1.8165643832654956, "learning_rate": 9.750720816340832e-07, "logits/chosen": 1.7314453125, "logits/rejected": 1.365234375, "logps/chosen": -1444.0, "logps/rejected": -1281.0, "loss": 0.5278, "rewards/accuracies": 0.875, "rewards/chosen": 1.13671875, "rewards/margins": 4.17578125, "rewards/rejected": -3.03515625, "step": 1038 }, { "epoch": 0.19632481458736833, "grad_norm": 1.989511227594767, "learning_rate": 9.749745722983602e-07, "logits/chosen": 1.103515625, "logits/rejected": 0.82275390625, "logps/chosen": -1065.5, "logps/rejected": -973.0, "loss": 0.6583, "rewards/accuracies": 0.78125, "rewards/chosen": 0.640411376953125, "rewards/margins": 3.927734375, "rewards/rejected": -3.29296875, "step": 1039 }, { "epoch": 0.1965137701355756, "grad_norm": 2.0657941213523086, "learning_rate": 9.748768780777078e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.9140625, "logps/chosen": -718.0, "logps/rejected": -715.0, "loss": 0.604, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0029296875, "rewards/margins": 2.5078125, "rewards/rejected": -2.509765625, "step": 1040 }, { "epoch": 0.1967027256837829, "grad_norm": 2.7468133358252516, "learning_rate": 9.747789990146275e-07, "logits/chosen": 2.087890625, "logits/rejected": 2.025390625, "logps/chosen": -917.0, "logps/rejected": -722.0, "loss": 0.6628, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5810546875, "rewards/margins": 2.478515625, "rewards/rejected": -1.900390625, "step": 1041 }, { "epoch": 0.19689168123199016, "grad_norm": 2.025385336402997, "learning_rate": 9.746809351517018e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.6650390625, "logps/chosen": -719.5, "logps/rejected": -970.5, "loss": 0.7124, "rewards/accuracies": 0.8125, "rewards/chosen": 0.802734375, "rewards/margins": 2.34375, "rewards/rejected": -1.54229736328125, "step": 1042 }, { "epoch": 0.19708063678019747, "grad_norm": 2.2389946618657746, "learning_rate": 9.74582686531593e-07, "logits/chosen": 1.310546875, "logits/rejected": 1.3642578125, "logps/chosen": -769.0, "logps/rejected": -875.0, "loss": 0.7101, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7158203125, "rewards/margins": 2.693359375, "rewards/rejected": -1.978515625, "step": 1043 }, { "epoch": 0.19726959232840474, "grad_norm": 2.0526623695159034, "learning_rate": 9.744842531970447e-07, "logits/chosen": 1.18359375, "logits/rejected": 1.3984375, "logps/chosen": -783.0, "logps/rejected": -845.0, "loss": 0.6686, "rewards/accuracies": 0.71875, "rewards/chosen": 0.560546875, "rewards/margins": 2.6484375, "rewards/rejected": -2.087890625, "step": 1044 }, { "epoch": 0.19745854787661202, "grad_norm": 1.3279778833574174, "learning_rate": 9.743856351908797e-07, "logits/chosen": 0.95361328125, "logits/rejected": 0.9453125, "logps/chosen": -891.0, "logps/rejected": -777.0, "loss": 0.6019, "rewards/accuracies": 0.8125, "rewards/chosen": 0.86669921875, "rewards/margins": 4.06640625, "rewards/rejected": -3.19921875, "step": 1045 }, { "epoch": 0.19764750342481932, "grad_norm": 1.6697279854612905, "learning_rate": 9.742868325560018e-07, "logits/chosen": 1.7421875, "logits/rejected": 2.11474609375, "logps/chosen": -565.0, "logps/rejected": -478.0, "loss": 0.7207, "rewards/accuracies": 0.84375, "rewards/chosen": 0.72265625, "rewards/margins": 2.2421875, "rewards/rejected": -1.5185546875, "step": 1046 }, { "epoch": 0.1978364589730266, "grad_norm": 1.7218774298634845, "learning_rate": 9.741878453353954e-07, "logits/chosen": 1.330078125, "logits/rejected": 1.1884765625, "logps/chosen": -655.5, "logps/rejected": -671.0, "loss": 0.6462, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8115234375, "rewards/margins": 2.40625, "rewards/rejected": -1.595703125, "step": 1047 }, { "epoch": 0.19802541452123387, "grad_norm": 1.8803645255391874, "learning_rate": 9.740886735721242e-07, "logits/chosen": 1.726318359375, "logits/rejected": 2.4453125, "logps/chosen": -667.5, "logps/rejected": -1563.0, "loss": 0.6921, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6220703125, "rewards/margins": 3.373046875, "rewards/rejected": -2.7509765625, "step": 1048 }, { "epoch": 0.19821437006944118, "grad_norm": 1.828218881758045, "learning_rate": 9.739893173093335e-07, "logits/chosen": 1.5478515625, "logits/rejected": 1.57421875, "logps/chosen": -769.25, "logps/rejected": -1459.0, "loss": 0.6844, "rewards/accuracies": 0.84375, "rewards/chosen": 0.77685546875, "rewards/margins": 2.625, "rewards/rejected": -1.8486328125, "step": 1049 }, { "epoch": 0.19840332561764845, "grad_norm": 2.3485565021804016, "learning_rate": 9.738897765902476e-07, "logits/chosen": 2.375, "logits/rejected": 2.2275390625, "logps/chosen": -732.5, "logps/rejected": -733.0, "loss": 0.6822, "rewards/accuracies": 0.71875, "rewards/chosen": 0.755615234375, "rewards/margins": 2.326171875, "rewards/rejected": -1.572265625, "step": 1050 }, { "epoch": 0.19859228116585573, "grad_norm": 1.9705465357161853, "learning_rate": 9.737900514581719e-07, "logits/chosen": 1.36328125, "logits/rejected": 0.9580078125, "logps/chosen": -996.0, "logps/rejected": -1032.5, "loss": 0.6109, "rewards/accuracies": 0.78125, "rewards/chosen": 0.998046875, "rewards/margins": 3.3125, "rewards/rejected": -2.3121337890625, "step": 1051 }, { "epoch": 0.198781236714063, "grad_norm": 4.262457983994995, "learning_rate": 9.736901419564919e-07, "logits/chosen": 1.42578125, "logits/rejected": 2.083984375, "logps/chosen": -679.5, "logps/rejected": -762.0, "loss": 0.6709, "rewards/accuracies": 0.90625, "rewards/chosen": 0.62158203125, "rewards/margins": 2.830078125, "rewards/rejected": -2.208984375, "step": 1052 }, { "epoch": 0.1989701922622703, "grad_norm": 2.1865204394671856, "learning_rate": 9.735900481286728e-07, "logits/chosen": 1.26416015625, "logits/rejected": 1.37384033203125, "logps/chosen": -586.0, "logps/rejected": -815.0, "loss": 0.6361, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5909423828125, "rewards/margins": 2.52734375, "rewards/rejected": -1.93359375, "step": 1053 }, { "epoch": 0.19915914781047758, "grad_norm": 2.220868933217563, "learning_rate": 9.734897700182606e-07, "logits/chosen": 1.708984375, "logits/rejected": 1.666015625, "logps/chosen": -1365.0, "logps/rejected": -1372.0, "loss": 0.5494, "rewards/accuracies": 0.84375, "rewards/chosen": 0.996337890625, "rewards/margins": 4.31640625, "rewards/rejected": -3.3203125, "step": 1054 }, { "epoch": 0.19934810335868486, "grad_norm": 1.866624201553837, "learning_rate": 9.733893076688813e-07, "logits/chosen": 1.70703125, "logits/rejected": 2.193359375, "logps/chosen": -1007.0, "logps/rejected": -1428.5, "loss": 0.6818, "rewards/accuracies": 0.78125, "rewards/chosen": 0.52880859375, "rewards/margins": 3.181640625, "rewards/rejected": -2.654296875, "step": 1055 }, { "epoch": 0.19953705890689216, "grad_norm": 2.2162501686733256, "learning_rate": 9.732886611242408e-07, "logits/chosen": 1.3037109375, "logits/rejected": 1.69384765625, "logps/chosen": -691.0, "logps/rejected": -814.0, "loss": 0.6001, "rewards/accuracies": 0.84375, "rewards/chosen": 0.34228515625, "rewards/margins": 2.966796875, "rewards/rejected": -2.62109375, "step": 1056 }, { "epoch": 0.19972601445509944, "grad_norm": 2.5428133326617997, "learning_rate": 9.731878304281256e-07, "logits/chosen": 1.47265625, "logits/rejected": 1.99609375, "logps/chosen": -617.5, "logps/rejected": -1518.0, "loss": 0.6639, "rewards/accuracies": 0.90625, "rewards/chosen": -0.030517578125, "rewards/margins": 3.6259765625, "rewards/rejected": -3.662109375, "step": 1057 }, { "epoch": 0.1999149700033067, "grad_norm": 2.252885546821093, "learning_rate": 9.730868156244013e-07, "logits/chosen": 0.812255859375, "logits/rejected": 0.906494140625, "logps/chosen": -544.0, "logps/rejected": -550.5, "loss": 0.7614, "rewards/accuracies": 0.71875, "rewards/chosen": -0.34912109375, "rewards/margins": 2.0087890625, "rewards/rejected": -2.357421875, "step": 1058 }, { "epoch": 0.20010392555151402, "grad_norm": 1.982895478145567, "learning_rate": 9.729856167570153e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.2412109375, "logps/chosen": -641.5, "logps/rejected": -773.0, "loss": 0.7, "rewards/accuracies": 0.8125, "rewards/chosen": 0.064453125, "rewards/margins": 2.287109375, "rewards/rejected": -2.220703125, "step": 1059 }, { "epoch": 0.2002928810997213, "grad_norm": 2.3125145392895337, "learning_rate": 9.728842338699936e-07, "logits/chosen": 1.181640625, "logits/rejected": 0.6572265625, "logps/chosen": -933.0, "logps/rejected": -845.5, "loss": 0.609, "rewards/accuracies": 0.78125, "rewards/chosen": 0.494140625, "rewards/margins": 3.509765625, "rewards/rejected": -3.0078125, "step": 1060 }, { "epoch": 0.20048183664792857, "grad_norm": 1.7465024452303124, "learning_rate": 9.727826670074428e-07, "logits/chosen": 1.927734375, "logits/rejected": 2.3203125, "logps/chosen": -793.0, "logps/rejected": -763.0, "loss": 0.7879, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47265625, "rewards/margins": 1.96240234375, "rewards/rejected": -2.43359375, "step": 1061 }, { "epoch": 0.20067079219613587, "grad_norm": 3.6556602019543374, "learning_rate": 9.726809162135493e-07, "logits/chosen": 1.41796875, "logits/rejected": 1.701171875, "logps/chosen": -902.5, "logps/rejected": -978.0, "loss": 0.6574, "rewards/accuracies": 0.78125, "rewards/chosen": -0.097412109375, "rewards/margins": 2.900390625, "rewards/rejected": -3.00390625, "step": 1062 }, { "epoch": 0.20085974774434315, "grad_norm": 1.7634079376348135, "learning_rate": 9.725789815325803e-07, "logits/chosen": 1.66015625, "logits/rejected": 1.6796875, "logps/chosen": -606.0, "logps/rejected": -676.5, "loss": 0.7259, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009033203125, "rewards/margins": 1.974609375, "rewards/rejected": -1.982421875, "step": 1063 }, { "epoch": 0.20104870329255042, "grad_norm": 2.0491339692671815, "learning_rate": 9.72476863008882e-07, "logits/chosen": 2.2109375, "logits/rejected": 2.7265625, "logps/chosen": -920.5, "logps/rejected": -1667.5, "loss": 0.6658, "rewards/accuracies": 0.78125, "rewards/chosen": 0.662841796875, "rewards/margins": 2.9775390625, "rewards/rejected": -2.3212890625, "step": 1064 }, { "epoch": 0.2012376588407577, "grad_norm": 2.0292816921165833, "learning_rate": 9.723745606868809e-07, "logits/chosen": 2.0263671875, "logits/rejected": 1.7001953125, "logps/chosen": -627.5, "logps/rejected": -880.0, "loss": 0.7066, "rewards/accuracies": 0.78125, "rewards/chosen": 0.49188232421875, "rewards/margins": 1.904296875, "rewards/rejected": -1.41015625, "step": 1065 }, { "epoch": 0.201426614388965, "grad_norm": 2.0924305971225685, "learning_rate": 9.722720746110838e-07, "logits/chosen": 1.4375, "logits/rejected": 1.98046875, "logps/chosen": -821.5, "logps/rejected": -2235.0, "loss": 0.5656, "rewards/accuracies": 0.90625, "rewards/chosen": 0.94921875, "rewards/margins": 3.54296875, "rewards/rejected": -2.59765625, "step": 1066 }, { "epoch": 0.20161556993717228, "grad_norm": 1.9393257256549614, "learning_rate": 9.721694048260773e-07, "logits/chosen": 1.537109375, "logits/rejected": 0.631591796875, "logps/chosen": -616.75, "logps/rejected": -575.5, "loss": 0.6643, "rewards/accuracies": 0.84375, "rewards/chosen": 0.604248046875, "rewards/margins": 2.263671875, "rewards/rejected": -1.6591796875, "step": 1067 }, { "epoch": 0.20180452548537955, "grad_norm": 2.5141625788506303, "learning_rate": 9.720665513765274e-07, "logits/chosen": 2.3091583251953125, "logits/rejected": 1.97607421875, "logps/chosen": -616.0, "logps/rejected": -651.0, "loss": 0.5734, "rewards/accuracies": 0.9375, "rewards/chosen": 0.82080078125, "rewards/margins": 2.8515625, "rewards/rejected": -2.03125, "step": 1068 }, { "epoch": 0.20199348103358686, "grad_norm": 2.1191677289751523, "learning_rate": 9.71963514307181e-07, "logits/chosen": 1.635986328125, "logits/rejected": 2.2021484375, "logps/chosen": -736.5, "logps/rejected": -899.0, "loss": 0.802, "rewards/accuracies": 0.65625, "rewards/chosen": 0.89990234375, "rewards/margins": 1.7734375, "rewards/rejected": -0.873046875, "step": 1069 }, { "epoch": 0.20218243658179413, "grad_norm": 2.74429294929098, "learning_rate": 9.718602936628639e-07, "logits/chosen": 1.7030029296875, "logits/rejected": 2.173828125, "logps/chosen": -699.5, "logps/rejected": -754.5, "loss": 0.7615, "rewards/accuracies": 0.71875, "rewards/chosen": 0.596923828125, "rewards/margins": 2.6396484375, "rewards/rejected": -2.0421142578125, "step": 1070 }, { "epoch": 0.2023713921300014, "grad_norm": 1.5637928785981834, "learning_rate": 9.717568894884822e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.3720703125, "logps/chosen": -722.5, "logps/rejected": -1269.5, "loss": 0.6155, "rewards/accuracies": 0.90625, "rewards/chosen": 0.927734375, "rewards/margins": 3.15625, "rewards/rejected": -2.22265625, "step": 1071 }, { "epoch": 0.2025603476782087, "grad_norm": 2.04511377472539, "learning_rate": 9.71653301829022e-07, "logits/chosen": 1.1290283203125, "logits/rejected": 1.69384765625, "logps/chosen": -1153.5, "logps/rejected": -1106.0, "loss": 0.53, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3984375, "rewards/margins": 3.55078125, "rewards/rejected": -2.15625, "step": 1072 }, { "epoch": 0.202749303226416, "grad_norm": 1.8222741221601413, "learning_rate": 9.715495307295489e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.703125, "logps/chosen": -573.5, "logps/rejected": -517.0, "loss": 0.6717, "rewards/accuracies": 0.8125, "rewards/chosen": 0.597412109375, "rewards/margins": 2.0029296875, "rewards/rejected": -1.400390625, "step": 1073 }, { "epoch": 0.20293825877462326, "grad_norm": 2.007484326851157, "learning_rate": 9.714455762352084e-07, "logits/chosen": 1.3603515625, "logits/rejected": 1.392333984375, "logps/chosen": -826.0, "logps/rejected": -1134.5, "loss": 0.6752, "rewards/accuracies": 0.84375, "rewards/chosen": 0.49951171875, "rewards/margins": 2.365234375, "rewards/rejected": -1.865234375, "step": 1074 }, { "epoch": 0.20312721432283054, "grad_norm": 1.9832485475063004, "learning_rate": 9.713414383912262e-07, "logits/chosen": 1.0185546875, "logits/rejected": 1.791015625, "logps/chosen": -765.5, "logps/rejected": -983.5, "loss": 0.6751, "rewards/accuracies": 0.875, "rewards/chosen": 0.600341796875, "rewards/margins": 2.40234375, "rewards/rejected": -1.7958984375, "step": 1075 }, { "epoch": 0.20331616987103784, "grad_norm": 2.662219032691642, "learning_rate": 9.71237117242907e-07, "logits/chosen": 2.2275390625, "logits/rejected": 1.32470703125, "logps/chosen": -1086.0, "logps/rejected": -1176.0, "loss": 0.5946, "rewards/accuracies": 0.875, "rewards/chosen": 0.8759765625, "rewards/margins": 3.41015625, "rewards/rejected": -2.5390625, "step": 1076 }, { "epoch": 0.20350512541924512, "grad_norm": 2.421509704647638, "learning_rate": 9.71132612835636e-07, "logits/chosen": 1.5419921875, "logits/rejected": 2.275390625, "logps/chosen": -992.0, "logps/rejected": -1473.0, "loss": 0.4842, "rewards/accuracies": 0.90625, "rewards/chosen": 1.078125, "rewards/margins": 4.63671875, "rewards/rejected": -3.5625, "step": 1077 }, { "epoch": 0.2036940809674524, "grad_norm": 1.9694952057176915, "learning_rate": 9.710279252148775e-07, "logits/chosen": 1.66015625, "logits/rejected": 2.45703125, "logps/chosen": -687.0, "logps/rejected": -762.0, "loss": 0.6293, "rewards/accuracies": 0.9375, "rewards/chosen": 0.787353515625, "rewards/margins": 2.833984375, "rewards/rejected": -2.048828125, "step": 1078 }, { "epoch": 0.2038830365156597, "grad_norm": 1.8542830831860362, "learning_rate": 9.709230544261758e-07, "logits/chosen": 1.822265625, "logits/rejected": 1.9912109375, "logps/chosen": -691.0, "logps/rejected": -761.0, "loss": 0.6661, "rewards/accuracies": 0.84375, "rewards/chosen": 0.246337890625, "rewards/margins": 2.568359375, "rewards/rejected": -2.32421875, "step": 1079 }, { "epoch": 0.20407199206386697, "grad_norm": 2.3462795271108567, "learning_rate": 9.70818000515155e-07, "logits/chosen": 1.1611328125, "logits/rejected": 1.3984375, "logps/chosen": -768.5, "logps/rejected": -756.0, "loss": 0.6318, "rewards/accuracies": 0.84375, "rewards/chosen": 0.717041015625, "rewards/margins": 2.7109375, "rewards/rejected": -1.9931640625, "step": 1080 }, { "epoch": 0.20426094761207425, "grad_norm": 2.017562877680244, "learning_rate": 9.707127635275185e-07, "logits/chosen": 2.16796875, "logits/rejected": 2.33203125, "logps/chosen": -1016.5, "logps/rejected": -903.5, "loss": 0.6766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5750045776367188, "rewards/margins": 3.216796875, "rewards/rejected": -2.6328125, "step": 1081 }, { "epoch": 0.20444990316028155, "grad_norm": 1.93683410448868, "learning_rate": 9.7060734350905e-07, "logits/chosen": 1.1796875, "logits/rejected": 0.4169921875, "logps/chosen": -730.5, "logps/rejected": -597.5, "loss": 0.6527, "rewards/accuracies": 0.75, "rewards/chosen": 0.353759765625, "rewards/margins": 2.556640625, "rewards/rejected": -2.20703125, "step": 1082 }, { "epoch": 0.20463885870848883, "grad_norm": 2.180182929142591, "learning_rate": 9.705017405056122e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.29296875, "logps/chosen": -966.0, "logps/rejected": -852.0, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": 0.3330078125, "rewards/margins": 2.634765625, "rewards/rejected": -2.30078125, "step": 1083 }, { "epoch": 0.2048278142566961, "grad_norm": 3.0797224387503306, "learning_rate": 9.703959545631475e-07, "logits/chosen": 1.486083984375, "logits/rejected": 2.04296875, "logps/chosen": -893.0, "logps/rejected": -1467.0, "loss": 0.6932, "rewards/accuracies": 0.90625, "rewards/chosen": 0.024169921875, "rewards/margins": 5.62890625, "rewards/rejected": -5.6171875, "step": 1084 }, { "epoch": 0.2050167698049034, "grad_norm": 3.6710922385933347, "learning_rate": 9.702899857276782e-07, "logits/chosen": 1.92578125, "logits/rejected": 2.318359375, "logps/chosen": -1078.0, "logps/rejected": -1859.0, "loss": 0.6588, "rewards/accuracies": 0.78125, "rewards/chosen": 0.652557373046875, "rewards/margins": 3.87109375, "rewards/rejected": -3.2109375, "step": 1085 }, { "epoch": 0.20520572535311069, "grad_norm": 1.9990713679084202, "learning_rate": 9.701838340453062e-07, "logits/chosen": 1.7509765625, "logits/rejected": 1.72265625, "logps/chosen": -937.0, "logps/rejected": -738.0, "loss": 0.6235, "rewards/accuracies": 0.84375, "rewards/chosen": 0.336181640625, "rewards/margins": 2.716796875, "rewards/rejected": -2.37890625, "step": 1086 }, { "epoch": 0.20539468090131796, "grad_norm": 2.1986592511609397, "learning_rate": 9.70077499562212e-07, "logits/chosen": 1.623046875, "logits/rejected": 1.4814453125, "logps/chosen": -662.5, "logps/rejected": -557.0, "loss": 0.6817, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1513671875, "rewards/margins": 2.134765625, "rewards/rejected": -1.982421875, "step": 1087 }, { "epoch": 0.20558363644952524, "grad_norm": 1.59543067212938, "learning_rate": 9.69970982324657e-07, "logits/chosen": 1.5743408203125, "logits/rejected": 2.3486328125, "logps/chosen": -1312.0, "logps/rejected": -1163.0, "loss": 0.5481, "rewards/accuracies": 0.875, "rewards/chosen": 1.1767578125, "rewards/margins": 3.79296875, "rewards/rejected": -2.6171875, "step": 1088 }, { "epoch": 0.20577259199773254, "grad_norm": 1.9376578455855828, "learning_rate": 9.69864282378981e-07, "logits/chosen": 1.33154296875, "logits/rejected": 1.69140625, "logps/chosen": -650.0, "logps/rejected": -781.0, "loss": 0.654, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8330078125, "rewards/margins": 2.8173828125, "rewards/rejected": -1.98046875, "step": 1089 }, { "epoch": 0.20596154754593982, "grad_norm": 1.7875255845275029, "learning_rate": 9.697573997716042e-07, "logits/chosen": 1.8486328125, "logits/rejected": 1.34765625, "logps/chosen": -934.5, "logps/rejected": -978.0, "loss": 0.5334, "rewards/accuracies": 0.96875, "rewards/chosen": 0.709716796875, "rewards/margins": 3.85546875, "rewards/rejected": -3.16015625, "step": 1090 }, { "epoch": 0.2061505030941471, "grad_norm": 1.7254601354461465, "learning_rate": 9.696503345490254e-07, "logits/chosen": 1.0791015625, "logits/rejected": 1.470703125, "logps/chosen": -765.5, "logps/rejected": -1337.5, "loss": 0.6601, "rewards/accuracies": 0.75, "rewards/chosen": 0.97216796875, "rewards/margins": 3.0810546875, "rewards/rejected": -2.1142578125, "step": 1091 }, { "epoch": 0.2063394586423544, "grad_norm": 1.8859940470897745, "learning_rate": 9.695430867578239e-07, "logits/chosen": 1.466796875, "logits/rejected": 1.2705078125, "logps/chosen": -791.0, "logps/rejected": -787.0, "loss": 0.7238, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4609375, "rewards/margins": 2.451171875, "rewards/rejected": -1.990234375, "step": 1092 }, { "epoch": 0.20652841419056167, "grad_norm": 1.8627709829066987, "learning_rate": 9.694356564446569e-07, "logits/chosen": 1.5849609375, "logits/rejected": 1.525390625, "logps/chosen": -1747.0, "logps/rejected": -2061.5, "loss": 0.6092, "rewards/accuracies": 0.9375, "rewards/chosen": 0.829345703125, "rewards/margins": 4.1171875, "rewards/rejected": -3.283203125, "step": 1093 }, { "epoch": 0.20671736973876895, "grad_norm": 2.0594485081155227, "learning_rate": 9.693280436562624e-07, "logits/chosen": 1.744140625, "logits/rejected": 1.37841796875, "logps/chosen": -534.25, "logps/rejected": -665.0, "loss": 0.6858, "rewards/accuracies": 0.875, "rewards/chosen": 0.1611328125, "rewards/margins": 2.013671875, "rewards/rejected": -1.8515625, "step": 1094 }, { "epoch": 0.20690632528697625, "grad_norm": 1.7721568920522364, "learning_rate": 9.692202484394574e-07, "logits/chosen": 2.7265625, "logits/rejected": 2.07177734375, "logps/chosen": -1442.0, "logps/rejected": -1172.0, "loss": 0.5884, "rewards/accuracies": 0.875, "rewards/chosen": 1.662109375, "rewards/margins": 3.7900390625, "rewards/rejected": -2.125, "step": 1095 }, { "epoch": 0.20709528083518353, "grad_norm": 1.9756833393664792, "learning_rate": 9.691122708411378e-07, "logits/chosen": 0.78662109375, "logits/rejected": 1.3173828125, "logps/chosen": -852.5, "logps/rejected": -907.0, "loss": 0.54, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9091796875, "rewards/margins": 3.77734375, "rewards/rejected": -2.86328125, "step": 1096 }, { "epoch": 0.2072842363833908, "grad_norm": 1.8706362299913375, "learning_rate": 9.690041109082794e-07, "logits/chosen": 0.630859375, "logits/rejected": 1.184814453125, "logps/chosen": -896.0, "logps/rejected": -1000.0, "loss": 0.6214, "rewards/accuracies": 0.84375, "rewards/chosen": 1.005859375, "rewards/margins": 3.19921875, "rewards/rejected": -2.193359375, "step": 1097 }, { "epoch": 0.20747319193159808, "grad_norm": 2.400337174214887, "learning_rate": 9.688957686879371e-07, "logits/chosen": 1.037109375, "logits/rejected": 1.392822265625, "logps/chosen": -529.5, "logps/rejected": -559.5, "loss": 0.6653, "rewards/accuracies": 0.84375, "rewards/chosen": 0.35546875, "rewards/margins": 2.3125, "rewards/rejected": -1.9619140625, "step": 1098 }, { "epoch": 0.20766214747980538, "grad_norm": 2.10338046676577, "learning_rate": 9.687872442272452e-07, "logits/chosen": 0.725341796875, "logits/rejected": 1.39794921875, "logps/chosen": -864.0, "logps/rejected": -1517.0, "loss": 0.7109, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22705078125, "rewards/margins": 3.0, "rewards/rejected": -2.775390625, "step": 1099 }, { "epoch": 0.20785110302801266, "grad_norm": 1.886886415682989, "learning_rate": 9.68678537573417e-07, "logits/chosen": 1.3779296875, "logits/rejected": 1.3095703125, "logps/chosen": -722.25, "logps/rejected": -1041.0, "loss": 0.6374, "rewards/accuracies": 0.8125, "rewards/chosen": 0.662109375, "rewards/margins": 3.333984375, "rewards/rejected": -2.67578125, "step": 1100 }, { "epoch": 0.20804005857621993, "grad_norm": 1.9292054869706927, "learning_rate": 9.685696487737453e-07, "logits/chosen": -0.11962890625, "logits/rejected": -0.291015625, "logps/chosen": -702.0, "logps/rejected": -754.0, "loss": 0.5252, "rewards/accuracies": 0.90625, "rewards/chosen": 0.64208984375, "rewards/margins": 3.3984375, "rewards/rejected": -2.7578125, "step": 1101 }, { "epoch": 0.20822901412442724, "grad_norm": 2.2409617116978553, "learning_rate": 9.68460577875602e-07, "logits/chosen": 1.73828125, "logits/rejected": 1.802734375, "logps/chosen": -909.0, "logps/rejected": -880.5, "loss": 0.6869, "rewards/accuracies": 0.84375, "rewards/chosen": -1.06640625, "rewards/margins": 1.1591796875, "rewards/rejected": -2.2236328125, "step": 1102 }, { "epoch": 0.2084179696726345, "grad_norm": 1.641059534841198, "learning_rate": 9.683513249264386e-07, "logits/chosen": 1.5625, "logits/rejected": 1.768798828125, "logps/chosen": -577.0, "logps/rejected": -1080.5, "loss": 0.5926, "rewards/accuracies": 0.875, "rewards/chosen": 0.86572265625, "rewards/margins": 3.828125, "rewards/rejected": -2.966796875, "step": 1103 }, { "epoch": 0.2086069252208418, "grad_norm": 2.0266057623173683, "learning_rate": 9.682418899737856e-07, "logits/chosen": 0.48419189453125, "logits/rejected": 0.2685546875, "logps/chosen": -906.0, "logps/rejected": -977.0, "loss": 0.6336, "rewards/accuracies": 0.875, "rewards/chosen": 0.89404296875, "rewards/margins": 3.2265625, "rewards/rejected": -2.333984375, "step": 1104 }, { "epoch": 0.2087958807690491, "grad_norm": 2.2634382488907847, "learning_rate": 9.681322730652522e-07, "logits/chosen": 1.3408203125, "logits/rejected": 1.98046875, "logps/chosen": -861.0, "logps/rejected": -840.0, "loss": 0.5497, "rewards/accuracies": 0.84375, "rewards/chosen": 0.53076171875, "rewards/margins": 3.76953125, "rewards/rejected": -3.23828125, "step": 1105 }, { "epoch": 0.20898483631725637, "grad_norm": 2.340329621416479, "learning_rate": 9.680224742485274e-07, "logits/chosen": 1.1396484375, "logits/rejected": 0.612060546875, "logps/chosen": -736.5, "logps/rejected": -681.0, "loss": 0.6561, "rewards/accuracies": 0.84375, "rewards/chosen": 0.708648681640625, "rewards/margins": 2.43359375, "rewards/rejected": -1.7197265625, "step": 1106 }, { "epoch": 0.20917379186546364, "grad_norm": 1.6579945747444622, "learning_rate": 9.679124935713792e-07, "logits/chosen": 1.57373046875, "logits/rejected": 2.208984375, "logps/chosen": -427.5, "logps/rejected": -596.5, "loss": 0.6412, "rewards/accuracies": 0.875, "rewards/chosen": 0.45654296875, "rewards/margins": 3.11328125, "rewards/rejected": -2.65625, "step": 1107 }, { "epoch": 0.20936274741367095, "grad_norm": 1.9856975799631236, "learning_rate": 9.678023310816544e-07, "logits/chosen": 1.1328125, "logits/rejected": 1.0703125, "logps/chosen": -974.5, "logps/rejected": -925.0, "loss": 0.5308, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1313934326171875, "rewards/margins": 3.91015625, "rewards/rejected": -2.7734375, "step": 1108 }, { "epoch": 0.20955170296187822, "grad_norm": 1.861603304034774, "learning_rate": 9.676919868272793e-07, "logits/chosen": 1.68359375, "logits/rejected": 1.11865234375, "logps/chosen": -865.0, "logps/rejected": -835.0, "loss": 0.5701, "rewards/accuracies": 0.875, "rewards/chosen": 1.1400146484375, "rewards/margins": 3.51171875, "rewards/rejected": -2.373046875, "step": 1109 }, { "epoch": 0.2097406585100855, "grad_norm": 1.963679699297157, "learning_rate": 9.67581460856259e-07, "logits/chosen": 0.734375, "logits/rejected": 0.72265625, "logps/chosen": -942.0, "logps/rejected": -994.0, "loss": 0.6311, "rewards/accuracies": 0.75, "rewards/chosen": 1.00048828125, "rewards/margins": 3.794921875, "rewards/rejected": -2.794921875, "step": 1110 }, { "epoch": 0.20992961405829277, "grad_norm": 2.3469202696372546, "learning_rate": 9.67470753216678e-07, "logits/chosen": 1.5732421875, "logits/rejected": 1.60546875, "logps/chosen": -944.0, "logps/rejected": -895.5, "loss": 0.7118, "rewards/accuracies": 0.78125, "rewards/chosen": 0.783447265625, "rewards/margins": 2.94921875, "rewards/rejected": -2.1708984375, "step": 1111 }, { "epoch": 0.21011856960650008, "grad_norm": 2.0702047076448222, "learning_rate": 9.673598639566992e-07, "logits/chosen": 1.3427734375, "logits/rejected": 1.08056640625, "logps/chosen": -935.5, "logps/rejected": -736.5, "loss": 0.7017, "rewards/accuracies": 0.78125, "rewards/chosen": 0.31585693359375, "rewards/margins": 2.271484375, "rewards/rejected": -1.95703125, "step": 1112 }, { "epoch": 0.21030752515470735, "grad_norm": 2.197426975331438, "learning_rate": 9.672487931245652e-07, "logits/chosen": 1.25244140625, "logits/rejected": 0.8507080078125, "logps/chosen": -720.5, "logps/rejected": -649.5, "loss": 0.6187, "rewards/accuracies": 0.8125, "rewards/chosen": 1.02587890625, "rewards/margins": 2.7578125, "rewards/rejected": -1.724609375, "step": 1113 }, { "epoch": 0.21049648070291463, "grad_norm": 2.0799557996977227, "learning_rate": 9.67137540768597e-07, "logits/chosen": 1.4111328125, "logits/rejected": 1.6796875, "logps/chosen": -461.5, "logps/rejected": -1107.0, "loss": 0.6757, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4453125, "rewards/margins": 2.5859375, "rewards/rejected": -2.138671875, "step": 1114 }, { "epoch": 0.21068543625112193, "grad_norm": 1.6754989721302618, "learning_rate": 9.670261069371952e-07, "logits/chosen": 1.1905517578125, "logits/rejected": 1.29931640625, "logps/chosen": -746.0, "logps/rejected": -598.5, "loss": 0.7161, "rewards/accuracies": 0.75, "rewards/chosen": 1.04541015625, "rewards/margins": 2.37890625, "rewards/rejected": -1.3291015625, "step": 1115 }, { "epoch": 0.2108743917993292, "grad_norm": 1.6859900510338, "learning_rate": 9.66914491678839e-07, "logits/chosen": 1.255859375, "logits/rejected": 1.3603515625, "logps/chosen": -636.0, "logps/rejected": -734.0, "loss": 0.6565, "rewards/accuracies": 0.8125, "rewards/chosen": 0.935546875, "rewards/margins": 2.701171875, "rewards/rejected": -1.7626953125, "step": 1116 }, { "epoch": 0.21106334734753648, "grad_norm": 1.905510851612246, "learning_rate": 9.668026950420862e-07, "logits/chosen": 1.49609375, "logits/rejected": 1.388671875, "logps/chosen": -593.0, "logps/rejected": -711.0, "loss": 0.6428, "rewards/accuracies": 0.84375, "rewards/chosen": 0.499267578125, "rewards/margins": 2.671875, "rewards/rejected": -2.169921875, "step": 1117 }, { "epoch": 0.2112523028957438, "grad_norm": 2.169532709001254, "learning_rate": 9.666907170755745e-07, "logits/chosen": 1.98828125, "logits/rejected": 2.14453125, "logps/chosen": -740.0, "logps/rejected": -864.5, "loss": 0.7037, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5101318359375, "rewards/margins": 2.22265625, "rewards/rejected": -1.71484375, "step": 1118 }, { "epoch": 0.21144125844395106, "grad_norm": 1.5738920336255813, "learning_rate": 9.665785578280192e-07, "logits/chosen": 1.91015625, "logits/rejected": 1.69921875, "logps/chosen": -1024.0, "logps/rejected": -1717.0, "loss": 0.5298, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2896728515625, "rewards/margins": 4.73046875, "rewards/rejected": -3.4375, "step": 1119 }, { "epoch": 0.21163021399215834, "grad_norm": 2.057686602583024, "learning_rate": 9.664662173482154e-07, "logits/chosen": 1.45166015625, "logits/rejected": 1.1640625, "logps/chosen": -372.25, "logps/rejected": -376.0, "loss": 0.67, "rewards/accuracies": 0.84375, "rewards/chosen": 0.70166015625, "rewards/margins": 1.900390625, "rewards/rejected": -1.19921875, "step": 1120 }, { "epoch": 0.21181916954036564, "grad_norm": 1.7398881047853947, "learning_rate": 9.663536956850367e-07, "logits/chosen": 1.69140625, "logits/rejected": 1.802734375, "logps/chosen": -885.5, "logps/rejected": -807.5, "loss": 0.622, "rewards/accuracies": 0.8125, "rewards/chosen": 1.255859375, "rewards/margins": 3.326171875, "rewards/rejected": -2.06884765625, "step": 1121 }, { "epoch": 0.21200812508857292, "grad_norm": 1.7148818779395676, "learning_rate": 9.662409928874354e-07, "logits/chosen": 2.2021484375, "logits/rejected": 2.4296875, "logps/chosen": -859.0, "logps/rejected": -1595.0, "loss": 0.6399, "rewards/accuracies": 0.90625, "rewards/chosen": 0.808349609375, "rewards/margins": 3.8828125, "rewards/rejected": -3.068359375, "step": 1122 }, { "epoch": 0.2121970806367802, "grad_norm": 1.7407562483054757, "learning_rate": 9.661281090044433e-07, "logits/chosen": 1.5869140625, "logits/rejected": 2.134765625, "logps/chosen": -908.5, "logps/rejected": -1500.0, "loss": 0.579, "rewards/accuracies": 0.875, "rewards/chosen": 1.2998046875, "rewards/margins": 3.68359375, "rewards/rejected": -2.392578125, "step": 1123 }, { "epoch": 0.21238603618498747, "grad_norm": 2.2727255454656734, "learning_rate": 9.660150440851699e-07, "logits/chosen": 1.546875, "logits/rejected": 1.341796875, "logps/chosen": -902.0, "logps/rejected": -725.0, "loss": 0.6793, "rewards/accuracies": 0.75, "rewards/chosen": 0.8486328125, "rewards/margins": 2.1650390625, "rewards/rejected": -1.31396484375, "step": 1124 }, { "epoch": 0.21257499173319477, "grad_norm": 1.7755259288209793, "learning_rate": 9.659017981788043e-07, "logits/chosen": 1.2890625, "logits/rejected": 1.544921875, "logps/chosen": -676.0, "logps/rejected": -682.5, "loss": 0.7565, "rewards/accuracies": 0.75, "rewards/chosen": 0.27685546875, "rewards/margins": 2.05078125, "rewards/rejected": -1.77392578125, "step": 1125 }, { "epoch": 0.21276394728140205, "grad_norm": 1.6292468774837274, "learning_rate": 9.65788371334614e-07, "logits/chosen": 0.8955078125, "logits/rejected": 1.4794921875, "logps/chosen": -735.0, "logps/rejected": -1641.0, "loss": 0.6943, "rewards/accuracies": 0.75, "rewards/chosen": 0.7733154296875, "rewards/margins": 3.34375, "rewards/rejected": -2.576171875, "step": 1126 }, { "epoch": 0.21295290282960933, "grad_norm": 1.8091719389863732, "learning_rate": 9.656747636019456e-07, "logits/chosen": 1.2958984375, "logits/rejected": 1.369140625, "logps/chosen": -702.25, "logps/rejected": -815.0, "loss": 0.7158, "rewards/accuracies": 0.8125, "rewards/chosen": 0.53662109375, "rewards/margins": 2.525390625, "rewards/rejected": -1.9879150390625, "step": 1127 }, { "epoch": 0.21314185837781663, "grad_norm": 1.8363774272195068, "learning_rate": 9.655609750302235e-07, "logits/chosen": 1.015869140625, "logits/rejected": 1.083984375, "logps/chosen": -588.0, "logps/rejected": -457.0, "loss": 0.6556, "rewards/accuracies": 0.8125, "rewards/chosen": 0.69921875, "rewards/margins": 2.24609375, "rewards/rejected": -1.546875, "step": 1128 }, { "epoch": 0.2133308139260239, "grad_norm": 1.9967474072313245, "learning_rate": 9.654470056689517e-07, "logits/chosen": 1.08154296875, "logits/rejected": 0.9248046875, "logps/chosen": -883.5, "logps/rejected": -736.0, "loss": 0.6051, "rewards/accuracies": 0.875, "rewards/chosen": 0.732177734375, "rewards/margins": 3.197265625, "rewards/rejected": -2.46484375, "step": 1129 }, { "epoch": 0.21351976947423118, "grad_norm": 1.8514653009595423, "learning_rate": 9.653328555677126e-07, "logits/chosen": 0.98876953125, "logits/rejected": 1.6767578125, "logps/chosen": -978.0, "logps/rejected": -1591.5, "loss": 0.6476, "rewards/accuracies": 0.75, "rewards/chosen": 0.902099609375, "rewards/margins": 3.41796875, "rewards/rejected": -2.51171875, "step": 1130 }, { "epoch": 0.21370872502243848, "grad_norm": 1.8316489813413919, "learning_rate": 9.652185247761667e-07, "logits/chosen": 0.51593017578125, "logits/rejected": 0.521240234375, "logps/chosen": -1027.0, "logps/rejected": -897.0, "loss": 0.6049, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6767578125, "rewards/margins": 2.75, "rewards/rejected": -2.078125, "step": 1131 }, { "epoch": 0.21389768057064576, "grad_norm": 2.176009597843429, "learning_rate": 9.65104013344054e-07, "logits/chosen": 2.0625, "logits/rejected": 2.324951171875, "logps/chosen": -1222.0, "logps/rejected": -1681.0, "loss": 0.6195, "rewards/accuracies": 0.84375, "rewards/chosen": 1.05889892578125, "rewards/margins": 4.0859375, "rewards/rejected": -3.021484375, "step": 1132 }, { "epoch": 0.21408663611885304, "grad_norm": 1.9240793652845383, "learning_rate": 9.649893213211923e-07, "logits/chosen": 1.23193359375, "logits/rejected": 0.984130859375, "logps/chosen": -819.5, "logps/rejected": -687.5, "loss": 0.6292, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7548828125, "rewards/margins": 2.5625, "rewards/rejected": -1.810546875, "step": 1133 }, { "epoch": 0.2142755916670603, "grad_norm": 1.7581395727071925, "learning_rate": 9.648744487574786e-07, "logits/chosen": 2.22265625, "logits/rejected": 2.599609375, "logps/chosen": -955.0, "logps/rejected": -738.0, "loss": 0.7197, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7197265625, "rewards/margins": 2.3076171875, "rewards/rejected": -1.5927734375, "step": 1134 }, { "epoch": 0.21446454721526761, "grad_norm": 2.008946859436948, "learning_rate": 9.647593957028879e-07, "logits/chosen": 1.00048828125, "logits/rejected": 1.3193359375, "logps/chosen": -844.5, "logps/rejected": -1829.0, "loss": 0.6155, "rewards/accuracies": 0.875, "rewards/chosen": 0.76708984375, "rewards/margins": 3.939453125, "rewards/rejected": -3.1689453125, "step": 1135 }, { "epoch": 0.2146535027634749, "grad_norm": 1.6124996109528416, "learning_rate": 9.64644162207474e-07, "logits/chosen": 0.7109375, "logits/rejected": 1.3818359375, "logps/chosen": -425.0, "logps/rejected": -1306.0, "loss": 0.6906, "rewards/accuracies": 0.71875, "rewards/chosen": 0.186767578125, "rewards/margins": 3.30078125, "rewards/rejected": -3.11328125, "step": 1136 }, { "epoch": 0.21484245831168217, "grad_norm": 1.907838288925997, "learning_rate": 9.645287483213693e-07, "logits/chosen": 1.6083984375, "logits/rejected": 1.965087890625, "logps/chosen": -769.0, "logps/rejected": -657.25, "loss": 0.6715, "rewards/accuracies": 0.78125, "rewards/chosen": 0.65869140625, "rewards/margins": 2.529296875, "rewards/rejected": -1.87060546875, "step": 1137 }, { "epoch": 0.21503141385988947, "grad_norm": 2.3560945051142297, "learning_rate": 9.644131540947844e-07, "logits/chosen": 1.3896484375, "logits/rejected": 1.837890625, "logps/chosen": -881.0, "logps/rejected": -1350.0, "loss": 0.6596, "rewards/accuracies": 0.78125, "rewards/chosen": 0.652587890625, "rewards/margins": 3.62890625, "rewards/rejected": -2.974609375, "step": 1138 }, { "epoch": 0.21522036940809675, "grad_norm": 1.818039001459691, "learning_rate": 9.642973795780084e-07, "logits/chosen": 1.15625, "logits/rejected": 0.96240234375, "logps/chosen": -673.0, "logps/rejected": -672.0, "loss": 0.718, "rewards/accuracies": 0.75, "rewards/chosen": 0.1702880859375, "rewards/margins": 2.015625, "rewards/rejected": -1.84375, "step": 1139 }, { "epoch": 0.21540932495630402, "grad_norm": 1.7853203754744733, "learning_rate": 9.641814248214091e-07, "logits/chosen": 2.0966796875, "logits/rejected": 2.9765625, "logps/chosen": -727.5, "logps/rejected": -1126.5, "loss": 0.7267, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4788818359375, "rewards/margins": 3.185546875, "rewards/rejected": -2.70703125, "step": 1140 }, { "epoch": 0.21559828050451132, "grad_norm": 2.154062834832621, "learning_rate": 9.640652898754325e-07, "logits/chosen": 1.4462890625, "logits/rejected": 1.890625, "logps/chosen": -745.5, "logps/rejected": -1359.0, "loss": 0.686, "rewards/accuracies": 0.71875, "rewards/chosen": -0.291259765625, "rewards/margins": 5.828125, "rewards/rejected": -6.130859375, "step": 1141 }, { "epoch": 0.2157872360527186, "grad_norm": 2.6226715868282677, "learning_rate": 9.639489747906032e-07, "logits/chosen": 1.069091796875, "logits/rejected": 0.783203125, "logps/chosen": -638.0, "logps/rejected": -830.5, "loss": 0.6542, "rewards/accuracies": 0.84375, "rewards/chosen": 0.396484375, "rewards/margins": 2.88671875, "rewards/rejected": -2.48828125, "step": 1142 }, { "epoch": 0.21597619160092588, "grad_norm": 1.9758149442523325, "learning_rate": 9.638324796175238e-07, "logits/chosen": 1.5859375, "logits/rejected": 1.5035400390625, "logps/chosen": -712.5, "logps/rejected": -1127.5, "loss": 0.7367, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2978515625, "rewards/margins": 2.810546875, "rewards/rejected": -2.51220703125, "step": 1143 }, { "epoch": 0.21616514714913318, "grad_norm": 1.8151971901439585, "learning_rate": 9.637158044068756e-07, "logits/chosen": 2.294921875, "logits/rejected": 2.037109375, "logps/chosen": -834.5, "logps/rejected": -662.5, "loss": 0.5968, "rewards/accuracies": 0.8125, "rewards/chosen": 0.80029296875, "rewards/margins": 3.24609375, "rewards/rejected": -2.4462890625, "step": 1144 }, { "epoch": 0.21635410269734046, "grad_norm": 2.0739299662134703, "learning_rate": 9.63598949209418e-07, "logits/chosen": 0.6826171875, "logits/rejected": 0.41015625, "logps/chosen": -663.0, "logps/rejected": -582.0, "loss": 0.6138, "rewards/accuracies": 0.875, "rewards/chosen": 0.2177734375, "rewards/margins": 3.26953125, "rewards/rejected": -3.04296875, "step": 1145 }, { "epoch": 0.21654305824554773, "grad_norm": 2.2203249583021796, "learning_rate": 9.634819140759886e-07, "logits/chosen": 0.3046875, "logits/rejected": 0.44921875, "logps/chosen": -505.5, "logps/rejected": -721.5, "loss": 0.6979, "rewards/accuracies": 0.84375, "rewards/chosen": 0.355224609375, "rewards/margins": 2.169921875, "rewards/rejected": -1.81640625, "step": 1146 }, { "epoch": 0.216732013793755, "grad_norm": 2.310779998769381, "learning_rate": 9.633646990575037e-07, "logits/chosen": 2.15625, "logits/rejected": 2.90625, "logps/chosen": -1651.0, "logps/rejected": -2262.0, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.1142578125, "rewards/margins": 5.3896484375, "rewards/rejected": -5.271484375, "step": 1147 }, { "epoch": 0.2169209693419623, "grad_norm": 2.221707504161548, "learning_rate": 9.632473042049576e-07, "logits/chosen": 1.21142578125, "logits/rejected": 1.212890625, "logps/chosen": -998.0, "logps/rejected": -1639.0, "loss": 0.6645, "rewards/accuracies": 0.78125, "rewards/chosen": 0.805908203125, "rewards/margins": 3.69140625, "rewards/rejected": -2.88037109375, "step": 1148 }, { "epoch": 0.2171099248901696, "grad_norm": 1.9986372154617589, "learning_rate": 9.631297295694227e-07, "logits/chosen": 1.56396484375, "logits/rejected": 1.5625, "logps/chosen": -1101.0, "logps/rejected": -1123.5, "loss": 0.5881, "rewards/accuracies": 0.90625, "rewards/chosen": 0.64404296875, "rewards/margins": 3.63671875, "rewards/rejected": -2.986328125, "step": 1149 }, { "epoch": 0.21729888043837686, "grad_norm": 1.8834358400320097, "learning_rate": 9.6301197520205e-07, "logits/chosen": 1.3544921875, "logits/rejected": 1.5244140625, "logps/chosen": -839.0, "logps/rejected": -605.5, "loss": 0.679, "rewards/accuracies": 0.90625, "rewards/chosen": 0.309326171875, "rewards/margins": 2.3759765625, "rewards/rejected": -2.072265625, "step": 1150 }, { "epoch": 0.21748783598658417, "grad_norm": 2.0546685234938087, "learning_rate": 9.628940411540685e-07, "logits/chosen": 1.634765625, "logits/rejected": 1.4091796875, "logps/chosen": -794.5, "logps/rejected": -867.0, "loss": 0.62, "rewards/accuracies": 0.8125, "rewards/chosen": 0.230712890625, "rewards/margins": 2.58984375, "rewards/rejected": -2.359375, "step": 1151 }, { "epoch": 0.21767679153479144, "grad_norm": 2.8098261059544423, "learning_rate": 9.62775927476785e-07, "logits/chosen": 1.7734375, "logits/rejected": 2.005859375, "logps/chosen": -767.5, "logps/rejected": -592.0, "loss": 0.7806, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18115234375, "rewards/margins": 1.603515625, "rewards/rejected": -1.421875, "step": 1152 }, { "epoch": 0.21786574708299872, "grad_norm": 1.8431623223214537, "learning_rate": 9.626576342215853e-07, "logits/chosen": 2.08203125, "logits/rejected": 2.23046875, "logps/chosen": -669.0, "logps/rejected": -616.0, "loss": 0.7114, "rewards/accuracies": 0.75, "rewards/chosen": 0.5162353515625, "rewards/margins": 2.1171875, "rewards/rejected": -1.59765625, "step": 1153 }, { "epoch": 0.21805470263120602, "grad_norm": 1.7007068550061777, "learning_rate": 9.625391614399321e-07, "logits/chosen": 1.8916015625, "logits/rejected": 2.06640625, "logps/chosen": -963.5, "logps/rejected": -1193.0, "loss": 0.5899, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1396484375, "rewards/margins": 3.7109375, "rewards/rejected": -2.57421875, "step": 1154 }, { "epoch": 0.2182436581794133, "grad_norm": 1.7992530975389813, "learning_rate": 9.624205091833674e-07, "logits/chosen": 2.2578125, "logits/rejected": 2.826171875, "logps/chosen": -572.0, "logps/rejected": -1708.0, "loss": 0.7363, "rewards/accuracies": 0.6875, "rewards/chosen": 0.478759765625, "rewards/margins": 3.109375, "rewards/rejected": -2.6298828125, "step": 1155 }, { "epoch": 0.21843261372762057, "grad_norm": 1.9661635193295688, "learning_rate": 9.623016775035106e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.1259765625, "logps/chosen": -559.0, "logps/rejected": -465.5, "loss": 0.7612, "rewards/accuracies": 0.75, "rewards/chosen": 0.14117431640625, "rewards/margins": 1.7197265625, "rewards/rejected": -1.578125, "step": 1156 }, { "epoch": 0.21862156927582785, "grad_norm": 2.0479326705104723, "learning_rate": 9.621826664520597e-07, "logits/chosen": 2.3818359375, "logits/rejected": 2.412109375, "logps/chosen": -782.5, "logps/rejected": -579.0, "loss": 0.6037, "rewards/accuracies": 0.84375, "rewards/chosen": 0.72900390625, "rewards/margins": 2.775390625, "rewards/rejected": -2.044921875, "step": 1157 }, { "epoch": 0.21881052482403515, "grad_norm": 1.9231242923972511, "learning_rate": 9.6206347608079e-07, "logits/chosen": 0.9189453125, "logits/rejected": 1.5810546875, "logps/chosen": -538.5, "logps/rejected": -839.0, "loss": 0.6521, "rewards/accuracies": 0.8125, "rewards/chosen": 0.300537109375, "rewards/margins": 2.572265625, "rewards/rejected": -2.26953125, "step": 1158 }, { "epoch": 0.21899948037224243, "grad_norm": 1.9153550460789817, "learning_rate": 9.619441064415555e-07, "logits/chosen": 2.095703125, "logits/rejected": 1.654296875, "logps/chosen": -767.5, "logps/rejected": -676.0, "loss": 0.7147, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2900390625, "rewards/margins": 2.2666015625, "rewards/rejected": -1.9765625, "step": 1159 }, { "epoch": 0.2191884359204497, "grad_norm": 1.773280258458008, "learning_rate": 9.618245575862875e-07, "logits/chosen": 2.5107421875, "logits/rejected": 3.1806640625, "logps/chosen": -776.0, "logps/rejected": -1009.0, "loss": 0.6174, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8291015625, "rewards/margins": 3.2578125, "rewards/rejected": -2.427734375, "step": 1160 }, { "epoch": 0.219377391468657, "grad_norm": 2.1369438704604926, "learning_rate": 9.61704829566996e-07, "logits/chosen": 2.58203125, "logits/rejected": 2.77734375, "logps/chosen": -575.0, "logps/rejected": -739.0, "loss": 0.8258, "rewards/accuracies": 0.625, "rewards/chosen": 0.12579345703125, "rewards/margins": 1.4647216796875, "rewards/rejected": -1.33837890625, "step": 1161 }, { "epoch": 0.21956634701686428, "grad_norm": 3.7443679465289654, "learning_rate": 9.615849224357684e-07, "logits/chosen": 1.3818359375, "logits/rejected": 0.967529296875, "logps/chosen": -738.5, "logps/rejected": -738.5, "loss": 0.694, "rewards/accuracies": 0.8125, "rewards/chosen": 0.090087890625, "rewards/margins": 2.7109375, "rewards/rejected": -2.615234375, "step": 1162 }, { "epoch": 0.21975530256507156, "grad_norm": 2.0932082180951577, "learning_rate": 9.614648362447704e-07, "logits/chosen": 1.35205078125, "logits/rejected": 2.0927734375, "logps/chosen": -624.5, "logps/rejected": -1777.5, "loss": 0.7113, "rewards/accuracies": 0.875, "rewards/chosen": -0.413909912109375, "rewards/margins": 3.966796875, "rewards/rejected": -4.37109375, "step": 1163 }, { "epoch": 0.21994425811327886, "grad_norm": 2.279660192351048, "learning_rate": 9.61344571046245e-07, "logits/chosen": 1.1181640625, "logits/rejected": 1.23046875, "logps/chosen": -713.0, "logps/rejected": -1431.0, "loss": 0.6091, "rewards/accuracies": 0.875, "rewards/chosen": 0.68359375, "rewards/margins": 3.515625, "rewards/rejected": -2.83984375, "step": 1164 }, { "epoch": 0.22013321366148614, "grad_norm": 1.8276031752936601, "learning_rate": 9.612241268925142e-07, "logits/chosen": 1.908203125, "logits/rejected": 1.4140625, "logps/chosen": -1150.0, "logps/rejected": -2001.0, "loss": 0.6445, "rewards/accuracies": 0.71875, "rewards/chosen": 0.638427734375, "rewards/margins": 3.71484375, "rewards/rejected": -3.076171875, "step": 1165 }, { "epoch": 0.2203221692096934, "grad_norm": 1.6066224527328696, "learning_rate": 9.611035038359767e-07, "logits/chosen": 2.365234375, "logits/rejected": 2.240234375, "logps/chosen": -839.0, "logps/rejected": -967.0, "loss": 0.6245, "rewards/accuracies": 0.8125, "rewards/chosen": 0.935546875, "rewards/margins": 3.173828125, "rewards/rejected": -2.2353515625, "step": 1166 }, { "epoch": 0.22051112475790072, "grad_norm": 2.1805998510937443, "learning_rate": 9.609827019291094e-07, "logits/chosen": 2.638671875, "logits/rejected": 2.62109375, "logps/chosen": -1171.5, "logps/rejected": -813.0, "loss": 0.5709, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0537109375, "rewards/margins": 2.697265625, "rewards/rejected": -2.751953125, "step": 1167 }, { "epoch": 0.220700080306108, "grad_norm": 1.65955821173374, "learning_rate": 9.608617212244675e-07, "logits/chosen": 2.005859375, "logits/rejected": 1.90380859375, "logps/chosen": -488.5, "logps/rejected": -635.5, "loss": 0.6995, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4462890625, "rewards/margins": 2.2109375, "rewards/rejected": -1.763671875, "step": 1168 }, { "epoch": 0.22088903585431527, "grad_norm": 1.95648746255659, "learning_rate": 9.607405617746832e-07, "logits/chosen": 2.1259765625, "logits/rejected": 2.32177734375, "logps/chosen": -1087.0, "logps/rejected": -2222.0, "loss": 0.6773, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2294921875, "rewards/margins": 2.6124267578125, "rewards/rejected": -2.3828125, "step": 1169 }, { "epoch": 0.22107799140252254, "grad_norm": 1.718234065803723, "learning_rate": 9.606192236324671e-07, "logits/chosen": 1.46875, "logits/rejected": 1.2236328125, "logps/chosen": -979.0, "logps/rejected": -685.0, "loss": 0.641, "rewards/accuracies": 0.8125, "rewards/chosen": 0.947265625, "rewards/margins": 2.87890625, "rewards/rejected": -1.935546875, "step": 1170 }, { "epoch": 0.22126694695072985, "grad_norm": 1.9573917307008821, "learning_rate": 9.60497706850607e-07, "logits/chosen": 1.72265625, "logits/rejected": 1.916015625, "logps/chosen": -1086.0, "logps/rejected": -1251.0, "loss": 0.6068, "rewards/accuracies": 0.8125, "rewards/chosen": 1.498046875, "rewards/margins": 3.8046875, "rewards/rejected": -2.302734375, "step": 1171 }, { "epoch": 0.22145590249893712, "grad_norm": 1.7951811158196378, "learning_rate": 9.60376011481969e-07, "logits/chosen": 2.39453125, "logits/rejected": 1.6923828125, "logps/chosen": -793.0, "logps/rejected": -745.5, "loss": 0.6599, "rewards/accuracies": 0.75, "rewards/chosen": 0.939453125, "rewards/margins": 2.61328125, "rewards/rejected": -1.669921875, "step": 1172 }, { "epoch": 0.2216448580471444, "grad_norm": 1.9134470907945857, "learning_rate": 9.602541375794966e-07, "logits/chosen": 2.154296875, "logits/rejected": 2.04296875, "logps/chosen": -516.0, "logps/rejected": -617.5, "loss": 0.7332, "rewards/accuracies": 0.65625, "rewards/chosen": 0.61962890625, "rewards/margins": 1.80859375, "rewards/rejected": -1.18505859375, "step": 1173 }, { "epoch": 0.2218338135953517, "grad_norm": 2.1017377809530906, "learning_rate": 9.601320851962107e-07, "logits/chosen": 2.357421875, "logits/rejected": 2.408203125, "logps/chosen": -924.5, "logps/rejected": -829.0, "loss": 0.5732, "rewards/accuracies": 0.875, "rewards/chosen": 1.1689453125, "rewards/margins": 3.453125, "rewards/rejected": -2.28515625, "step": 1174 }, { "epoch": 0.22202276914355898, "grad_norm": 2.320795736151106, "learning_rate": 9.600098543852101e-07, "logits/chosen": 1.666015625, "logits/rejected": 1.330078125, "logps/chosen": -606.5, "logps/rejected": -536.0, "loss": 0.5941, "rewards/accuracies": 0.875, "rewards/chosen": 0.7109375, "rewards/margins": 2.72265625, "rewards/rejected": -2.015625, "step": 1175 }, { "epoch": 0.22221172469176625, "grad_norm": 2.081690014566185, "learning_rate": 9.598874451996717e-07, "logits/chosen": 2.36328125, "logits/rejected": 2.49609375, "logps/chosen": -544.0, "logps/rejected": -600.0, "loss": 0.6753, "rewards/accuracies": 0.8125, "rewards/chosen": 0.592529296875, "rewards/margins": 2.626953125, "rewards/rejected": -2.041015625, "step": 1176 }, { "epoch": 0.22240068023997356, "grad_norm": 1.9791562587697256, "learning_rate": 9.59764857692849e-07, "logits/chosen": 1.7958984375, "logits/rejected": 2.34375, "logps/chosen": -674.0, "logps/rejected": -1435.0, "loss": 0.6546, "rewards/accuracies": 0.84375, "rewards/chosen": 0.75341796875, "rewards/margins": 3.48046875, "rewards/rejected": -2.728515625, "step": 1177 }, { "epoch": 0.22258963578818083, "grad_norm": 1.8458904047302096, "learning_rate": 9.59642091918074e-07, "logits/chosen": 1.5244140625, "logits/rejected": 2.74609375, "logps/chosen": -772.5, "logps/rejected": -1298.0, "loss": 0.5994, "rewards/accuracies": 0.84375, "rewards/chosen": 0.52880859375, "rewards/margins": 3.75, "rewards/rejected": -3.208984375, "step": 1178 }, { "epoch": 0.2227785913363881, "grad_norm": 2.543127507053502, "learning_rate": 9.59519147928756e-07, "logits/chosen": 1.192138671875, "logits/rejected": 1.4453125, "logps/chosen": -672.5, "logps/rejected": -1022.0, "loss": 0.6181, "rewards/accuracies": 0.8125, "rewards/chosen": 0.828125, "rewards/margins": 2.96875, "rewards/rejected": -2.134765625, "step": 1179 }, { "epoch": 0.22296754688459539, "grad_norm": 1.8662041133995613, "learning_rate": 9.59396025778381e-07, "logits/chosen": 0.92041015625, "logits/rejected": 0.512939453125, "logps/chosen": -408.25, "logps/rejected": -533.0, "loss": 0.6643, "rewards/accuracies": 0.84375, "rewards/chosen": 0.087890625, "rewards/margins": 2.50390625, "rewards/rejected": -2.4189453125, "step": 1180 }, { "epoch": 0.2231565024328027, "grad_norm": 2.9353829402968574, "learning_rate": 9.59272725520514e-07, "logits/chosen": 2.80078125, "logits/rejected": 2.744140625, "logps/chosen": -776.5, "logps/rejected": -817.5, "loss": 0.7452, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4822998046875, "rewards/margins": 2.20849609375, "rewards/rejected": -1.7314453125, "step": 1181 }, { "epoch": 0.22334545798100996, "grad_norm": 2.009601667845864, "learning_rate": 9.591492472087962e-07, "logits/chosen": 0.9931640625, "logits/rejected": 1.12548828125, "logps/chosen": -893.5, "logps/rejected": -971.0, "loss": 0.7069, "rewards/accuracies": 0.78125, "rewards/chosen": 0.218353271484375, "rewards/margins": 2.6953125, "rewards/rejected": -2.478515625, "step": 1182 }, { "epoch": 0.22353441352921724, "grad_norm": 1.6233233414332007, "learning_rate": 9.590255908969468e-07, "logits/chosen": 1.615234375, "logits/rejected": 1.984375, "logps/chosen": -701.5, "logps/rejected": -1427.0, "loss": 0.7229, "rewards/accuracies": 0.71875, "rewards/chosen": 0.472381591796875, "rewards/margins": 3.916015625, "rewards/rejected": -3.44921875, "step": 1183 }, { "epoch": 0.22372336907742454, "grad_norm": 2.4832132247818097, "learning_rate": 9.589017566387626e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.3671875, "logps/chosen": -1034.0, "logps/rejected": -1091.0, "loss": 0.6294, "rewards/accuracies": 0.90625, "rewards/chosen": 0.384033203125, "rewards/margins": 2.8515625, "rewards/rejected": -2.46484375, "step": 1184 }, { "epoch": 0.22391232462563182, "grad_norm": 2.194600610840096, "learning_rate": 9.587777444881178e-07, "logits/chosen": 0.687255859375, "logits/rejected": 0.811767578125, "logps/chosen": -689.5, "logps/rejected": -728.0, "loss": 0.7516, "rewards/accuracies": 0.8125, "rewards/chosen": 0.155517578125, "rewards/margins": 2.40234375, "rewards/rejected": -2.2421875, "step": 1185 }, { "epoch": 0.2241012801738391, "grad_norm": 2.3456406767953624, "learning_rate": 9.586535544989633e-07, "logits/chosen": 1.404296875, "logits/rejected": 1.1357421875, "logps/chosen": -540.5, "logps/rejected": -463.0, "loss": 0.605, "rewards/accuracies": 0.875, "rewards/chosen": 0.3916015625, "rewards/margins": 3.05859375, "rewards/rejected": -2.673828125, "step": 1186 }, { "epoch": 0.2242902357220464, "grad_norm": 1.8644584656551597, "learning_rate": 9.585291867253283e-07, "logits/chosen": 1.83935546875, "logits/rejected": 1.14892578125, "logps/chosen": -697.5, "logps/rejected": -672.5, "loss": 0.6968, "rewards/accuracies": 0.75, "rewards/chosen": 0.279052734375, "rewards/margins": 2.466796875, "rewards/rejected": -2.19140625, "step": 1187 }, { "epoch": 0.22447919127025368, "grad_norm": 1.7104806764200235, "learning_rate": 9.584046412213187e-07, "logits/chosen": 0.1719970703125, "logits/rejected": 0.408935546875, "logps/chosen": -726.5, "logps/rejected": -567.5, "loss": 0.7369, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30224609375, "rewards/margins": 2.51171875, "rewards/rejected": -2.822265625, "step": 1188 }, { "epoch": 0.22466814681846095, "grad_norm": 2.2231493344618936, "learning_rate": 9.58279918041118e-07, "logits/chosen": 0.9736328125, "logits/rejected": 0.41552734375, "logps/chosen": -899.0, "logps/rejected": -921.5, "loss": 0.6379, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2666015625, "rewards/margins": 3.2265625, "rewards/rejected": -2.95703125, "step": 1189 }, { "epoch": 0.22485710236666825, "grad_norm": 2.024588271721484, "learning_rate": 9.58155017238987e-07, "logits/chosen": 1.607421875, "logits/rejected": 1.84619140625, "logps/chosen": -780.0, "logps/rejected": -831.0, "loss": 0.6494, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0238037109375, "rewards/margins": 2.91015625, "rewards/rejected": -2.896484375, "step": 1190 }, { "epoch": 0.22504605791487553, "grad_norm": 2.449503873000374, "learning_rate": 9.580299388692635e-07, "logits/chosen": 1.8515625, "logits/rejected": 2.19140625, "logps/chosen": -767.0, "logps/rejected": -1148.0, "loss": 0.6664, "rewards/accuracies": 0.75, "rewards/chosen": 0.03662109375, "rewards/margins": 2.859375, "rewards/rejected": -2.8203125, "step": 1191 }, { "epoch": 0.2252350134630828, "grad_norm": 2.6088351387532804, "learning_rate": 9.57904682986363e-07, "logits/chosen": 1.984375, "logits/rejected": 1.65625, "logps/chosen": -566.5, "logps/rejected": -737.5, "loss": 0.651, "rewards/accuracies": 0.875, "rewards/chosen": 0.025146484375, "rewards/margins": 3.396484375, "rewards/rejected": -3.3671875, "step": 1192 }, { "epoch": 0.22542396901129008, "grad_norm": 1.8886123896354443, "learning_rate": 9.577792496447782e-07, "logits/chosen": 0.9619140625, "logits/rejected": 0.9521484375, "logps/chosen": -695.0, "logps/rejected": -636.5, "loss": 0.6943, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3060302734375, "rewards/margins": 2.427734375, "rewards/rejected": -2.73046875, "step": 1193 }, { "epoch": 0.22561292455949739, "grad_norm": 2.282709672285631, "learning_rate": 9.576536388990782e-07, "logits/chosen": 2.51171875, "logits/rejected": 2.58984375, "logps/chosen": -798.0, "logps/rejected": -737.5, "loss": 0.7132, "rewards/accuracies": 0.78125, "rewards/chosen": 0.18408203125, "rewards/margins": 2.5078125, "rewards/rejected": -2.328125, "step": 1194 }, { "epoch": 0.22580188010770466, "grad_norm": 1.5825877450590125, "learning_rate": 9.575278508039105e-07, "logits/chosen": 2.578125, "logits/rejected": 2.400390625, "logps/chosen": -1026.0, "logps/rejected": -995.5, "loss": 0.7393, "rewards/accuracies": 0.625, "rewards/chosen": 0.2119140625, "rewards/margins": 2.69677734375, "rewards/rejected": -2.4765625, "step": 1195 }, { "epoch": 0.22599083565591194, "grad_norm": 2.042983788638345, "learning_rate": 9.574018854139987e-07, "logits/chosen": 1.388671875, "logits/rejected": 1.27490234375, "logps/chosen": -706.0, "logps/rejected": -1331.0, "loss": 0.5736, "rewards/accuracies": 0.90625, "rewards/chosen": 0.01904296875, "rewards/margins": 4.548828125, "rewards/rejected": -4.5390625, "step": 1196 }, { "epoch": 0.22617979120411924, "grad_norm": 2.923378424232205, "learning_rate": 9.57275742784144e-07, "logits/chosen": 1.1337890625, "logits/rejected": 1.30126953125, "logps/chosen": -709.0, "logps/rejected": -1871.0, "loss": 0.6893, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19189453125, "rewards/margins": 3.076171875, "rewards/rejected": -2.87890625, "step": 1197 }, { "epoch": 0.22636874675232652, "grad_norm": 1.9117501094359686, "learning_rate": 9.571494229692251e-07, "logits/chosen": 1.84375, "logits/rejected": 1.5888671875, "logps/chosen": -710.0, "logps/rejected": -643.0, "loss": 0.6593, "rewards/accuracies": 0.8125, "rewards/chosen": 0.63623046875, "rewards/margins": 4.2578125, "rewards/rejected": -3.6298828125, "step": 1198 }, { "epoch": 0.2265577023005338, "grad_norm": 2.3910013322183907, "learning_rate": 9.570229260241968e-07, "logits/chosen": 1.8466796875, "logits/rejected": 1.3193359375, "logps/chosen": -1368.0, "logps/rejected": -1236.0, "loss": 0.488, "rewards/accuracies": 0.96875, "rewards/chosen": 1.65625, "rewards/margins": 4.51171875, "rewards/rejected": -2.857421875, "step": 1199 }, { "epoch": 0.2267466578487411, "grad_norm": 1.978065533098932, "learning_rate": 9.568962520040919e-07, "logits/chosen": 1.30950927734375, "logits/rejected": 1.412353515625, "logps/chosen": -747.0, "logps/rejected": -647.5, "loss": 0.7405, "rewards/accuracies": 0.6875, "rewards/chosen": 0.431640625, "rewards/margins": 1.8056640625, "rewards/rejected": -1.37109375, "step": 1200 }, { "epoch": 0.22693561339694837, "grad_norm": 2.174651767937809, "learning_rate": 9.567694009640197e-07, "logits/chosen": 1.4619140625, "logits/rejected": 1.5908203125, "logps/chosen": -694.5, "logps/rejected": -682.5, "loss": 0.662, "rewards/accuracies": 0.71875, "rewards/chosen": 0.60888671875, "rewards/margins": 2.544921875, "rewards/rejected": -1.939453125, "step": 1201 }, { "epoch": 0.22712456894515565, "grad_norm": 1.850762926029984, "learning_rate": 9.566423729591664e-07, "logits/chosen": 1.576171875, "logits/rejected": 1.2200927734375, "logps/chosen": -847.0, "logps/rejected": -671.5, "loss": 0.6953, "rewards/accuracies": 0.75, "rewards/chosen": 0.537353515625, "rewards/margins": 2.341796875, "rewards/rejected": -1.8046875, "step": 1202 }, { "epoch": 0.22731352449336292, "grad_norm": 2.3663379313559316, "learning_rate": 9.565151680447962e-07, "logits/chosen": 1.470703125, "logits/rejected": 1.87109375, "logps/chosen": -18901.0, "logps/rejected": -6372.0, "loss": 0.6553, "rewards/accuracies": 0.71875, "rewards/chosen": -56.3232421875, "rewards/margins": -55.39453125, "rewards/rejected": -1.04296875, "step": 1203 }, { "epoch": 0.22750248004157023, "grad_norm": 1.6754867176094532, "learning_rate": 9.56387786276249e-07, "logits/chosen": 1.083251953125, "logits/rejected": 1.059814453125, "logps/chosen": -1070.5, "logps/rejected": -893.0, "loss": 0.5892, "rewards/accuracies": 0.875, "rewards/chosen": 0.919921875, "rewards/margins": 3.158203125, "rewards/rejected": -2.24609375, "step": 1204 }, { "epoch": 0.2276914355897775, "grad_norm": 1.7507706970413695, "learning_rate": 9.56260227708942e-07, "logits/chosen": 1.2685546875, "logits/rejected": 1.5859375, "logps/chosen": -774.5, "logps/rejected": -623.0, "loss": 0.6241, "rewards/accuracies": 0.8125, "rewards/chosen": 0.767578125, "rewards/margins": 2.349609375, "rewards/rejected": -1.583984375, "step": 1205 }, { "epoch": 0.22788039113798478, "grad_norm": 2.037094022681669, "learning_rate": 9.561324923983697e-07, "logits/chosen": 1.955078125, "logits/rejected": 2.162109375, "logps/chosen": -704.0, "logps/rejected": -848.0, "loss": 0.7401, "rewards/accuracies": 0.75, "rewards/chosen": 0.3360595703125, "rewards/margins": 2.208984375, "rewards/rejected": -1.873046875, "step": 1206 }, { "epoch": 0.22806934668619208, "grad_norm": 3.05131005587392, "learning_rate": 9.560045804001036e-07, "logits/chosen": 2.037109375, "logits/rejected": 2.30859375, "logps/chosen": -421.25, "logps/rejected": -497.0, "loss": 0.7917, "rewards/accuracies": 0.84375, "rewards/chosen": 0.35662841796875, "rewards/margins": 2.029296875, "rewards/rejected": -1.67431640625, "step": 1207 }, { "epoch": 0.22825830223439936, "grad_norm": 1.9901923123724952, "learning_rate": 9.558764917697911e-07, "logits/chosen": 1.2646484375, "logits/rejected": 1.369140625, "logps/chosen": -749.5, "logps/rejected": -752.0, "loss": 0.7076, "rewards/accuracies": 0.84375, "rewards/chosen": 0.54541015625, "rewards/margins": 1.98828125, "rewards/rejected": -1.44677734375, "step": 1208 }, { "epoch": 0.22844725778260663, "grad_norm": 2.491264872448029, "learning_rate": 9.557482265631575e-07, "logits/chosen": 0.9580078125, "logits/rejected": 1.1103515625, "logps/chosen": -690.5, "logps/rejected": -498.0, "loss": 0.7166, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2802734375, "rewards/margins": 1.904296875, "rewards/rejected": -1.625, "step": 1209 }, { "epoch": 0.22863621333081394, "grad_norm": 2.007432917641877, "learning_rate": 9.556197848360046e-07, "logits/chosen": 1.112060546875, "logits/rejected": 1.2958984375, "logps/chosen": -374.5, "logps/rejected": -1300.0, "loss": 0.7249, "rewards/accuracies": 0.75, "rewards/chosen": 0.42138671875, "rewards/margins": 2.884765625, "rewards/rejected": -2.470703125, "step": 1210 }, { "epoch": 0.2288251688790212, "grad_norm": 1.850314501429629, "learning_rate": 9.554911666442104e-07, "logits/chosen": 2.037109375, "logits/rejected": 1.771484375, "logps/chosen": -1171.0, "logps/rejected": -892.0, "loss": 0.5851, "rewards/accuracies": 0.875, "rewards/chosen": 1.158203125, "rewards/margins": 3.7265625, "rewards/rejected": -2.56640625, "step": 1211 }, { "epoch": 0.2290141244272285, "grad_norm": 2.855655612809035, "learning_rate": 9.553623720437305e-07, "logits/chosen": 1.658203125, "logits/rejected": 1.798828125, "logps/chosen": -829.0, "logps/rejected": -704.0, "loss": 0.7702, "rewards/accuracies": 0.625, "rewards/chosen": 0.5966796875, "rewards/margins": 1.943603515625, "rewards/rejected": -1.345703125, "step": 1212 }, { "epoch": 0.2292030799754358, "grad_norm": 2.9262097225574335, "learning_rate": 9.55233401090597e-07, "logits/chosen": 2.216796875, "logits/rejected": 2.482421875, "logps/chosen": -921.5, "logps/rejected": -1189.0, "loss": 0.5207, "rewards/accuracies": 0.875, "rewards/chosen": 1.16015625, "rewards/margins": 4.05078125, "rewards/rejected": -2.890625, "step": 1213 }, { "epoch": 0.22939203552364307, "grad_norm": 2.041390036224482, "learning_rate": 9.551042538409185e-07, "logits/chosen": 1.1162109375, "logits/rejected": 1.767333984375, "logps/chosen": -534.0, "logps/rejected": -940.5, "loss": 0.5444, "rewards/accuracies": 0.875, "rewards/chosen": 0.681640625, "rewards/margins": 3.80078125, "rewards/rejected": -3.111328125, "step": 1214 }, { "epoch": 0.22958099107185034, "grad_norm": 1.9139460002297362, "learning_rate": 9.549749303508803e-07, "logits/chosen": 1.080078125, "logits/rejected": 0.915771484375, "logps/chosen": -675.5, "logps/rejected": -763.0, "loss": 0.6142, "rewards/accuracies": 0.90625, "rewards/chosen": 0.224609375, "rewards/margins": 3.16796875, "rewards/rejected": -2.9375, "step": 1215 }, { "epoch": 0.22976994662005762, "grad_norm": 2.275286387449833, "learning_rate": 9.548454306767446e-07, "logits/chosen": 0.6396484375, "logits/rejected": 0.8846435546875, "logps/chosen": -603.5, "logps/rejected": -776.5, "loss": 0.7494, "rewards/accuracies": 0.78125, "rewards/chosen": -0.239501953125, "rewards/margins": 1.99609375, "rewards/rejected": -2.236328125, "step": 1216 }, { "epoch": 0.22995890216826492, "grad_norm": 1.9657743084630988, "learning_rate": 9.547157548748503e-07, "logits/chosen": 1.495361328125, "logits/rejected": 2.031494140625, "logps/chosen": -580.5, "logps/rejected": -942.5, "loss": 0.6566, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3814697265625, "rewards/margins": 3.384765625, "rewards/rejected": -3.01171875, "step": 1217 }, { "epoch": 0.2301478577164722, "grad_norm": 1.8660253989206061, "learning_rate": 9.545859030016127e-07, "logits/chosen": 1.7744140625, "logits/rejected": 1.794921875, "logps/chosen": -668.0, "logps/rejected": -875.0, "loss": 0.6174, "rewards/accuracies": 0.8125, "rewards/chosen": 0.563232421875, "rewards/margins": 3.955078125, "rewards/rejected": -3.388671875, "step": 1218 }, { "epoch": 0.23033681326467947, "grad_norm": 1.7852802161005874, "learning_rate": 9.544558751135239e-07, "logits/chosen": 0.9541015625, "logits/rejected": 0.77734375, "logps/chosen": -641.0, "logps/rejected": -589.0, "loss": 0.581, "rewards/accuracies": 0.84375, "rewards/chosen": 0.496795654296875, "rewards/margins": 3.29296875, "rewards/rejected": -2.791015625, "step": 1219 }, { "epoch": 0.23052576881288678, "grad_norm": 2.1365643867613846, "learning_rate": 9.54325671267152e-07, "logits/chosen": 1.060546875, "logits/rejected": 1.2119140625, "logps/chosen": -784.0, "logps/rejected": -1439.5, "loss": 0.5379, "rewards/accuracies": 0.875, "rewards/chosen": 0.9912109375, "rewards/margins": 4.734375, "rewards/rejected": -3.7421875, "step": 1220 }, { "epoch": 0.23071472436109405, "grad_norm": 2.284235279486839, "learning_rate": 9.541952915191428e-07, "logits/chosen": 1.4189453125, "logits/rejected": 1.7060546875, "logps/chosen": -536.5, "logps/rejected": -831.0, "loss": 0.7059, "rewards/accuracies": 0.75, "rewards/chosen": -0.18994140625, "rewards/margins": 4.990234375, "rewards/rejected": -5.1796875, "step": 1221 }, { "epoch": 0.23090367990930133, "grad_norm": 2.6490496969532225, "learning_rate": 9.540647359262172e-07, "logits/chosen": 1.22509765625, "logits/rejected": 1.2763671875, "logps/chosen": -890.5, "logps/rejected": -944.0, "loss": 0.652, "rewards/accuracies": 0.78125, "rewards/chosen": 0.422607421875, "rewards/margins": 3.248046875, "rewards/rejected": -2.8203125, "step": 1222 }, { "epoch": 0.23109263545750863, "grad_norm": 2.2499899419562803, "learning_rate": 9.53934004545174e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.75, "logps/chosen": -901.0, "logps/rejected": -922.0, "loss": 0.7297, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08056640625, "rewards/margins": 2.2177734375, "rewards/rejected": -2.134765625, "step": 1223 }, { "epoch": 0.2312815910057159, "grad_norm": 2.3843935846057236, "learning_rate": 9.53803097432887e-07, "logits/chosen": 1.640625, "logits/rejected": 1.384765625, "logps/chosen": -623.5, "logps/rejected": -1053.5, "loss": 0.773, "rewards/accuracies": 0.75, "rewards/chosen": -0.3951416015625, "rewards/margins": 2.1480712890625, "rewards/rejected": -2.54296875, "step": 1224 }, { "epoch": 0.23147054655392318, "grad_norm": 2.1240895916137155, "learning_rate": 9.536720146463079e-07, "logits/chosen": 1.181640625, "logits/rejected": 1.58544921875, "logps/chosen": -750.5, "logps/rejected": -891.0, "loss": 0.6159, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5743408203125, "rewards/margins": 4.3046875, "rewards/rejected": -3.7265625, "step": 1225 }, { "epoch": 0.23165950210213046, "grad_norm": 1.884661699699202, "learning_rate": 9.535407562424642e-07, "logits/chosen": 1.451171875, "logits/rejected": 2.1396484375, "logps/chosen": -1008.5, "logps/rejected": -1408.0, "loss": 0.6126, "rewards/accuracies": 0.875, "rewards/chosen": 0.6741943359375, "rewards/margins": 4.5234375, "rewards/rejected": -3.84375, "step": 1226 }, { "epoch": 0.23184845765033776, "grad_norm": 2.2453199204078014, "learning_rate": 9.534093222784596e-07, "logits/chosen": 1.46875, "logits/rejected": 0.7786865234375, "logps/chosen": -923.0, "logps/rejected": -820.0, "loss": 0.6533, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3065185546875, "rewards/margins": 2.9921875, "rewards/rejected": -2.6796875, "step": 1227 }, { "epoch": 0.23203741319854504, "grad_norm": 3.510677874171902, "learning_rate": 9.532777128114743e-07, "logits/chosen": 2.1279296875, "logits/rejected": 2.04296875, "logps/chosen": -671.5, "logps/rejected": -749.5, "loss": 0.6711, "rewards/accuracies": 0.84375, "rewards/chosen": 0.200592041015625, "rewards/margins": 2.36328125, "rewards/rejected": -2.166015625, "step": 1228 }, { "epoch": 0.23222636874675232, "grad_norm": 2.2052091558408793, "learning_rate": 9.53145927898765e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.6181640625, "logps/chosen": -708.0, "logps/rejected": -708.5, "loss": 0.6815, "rewards/accuracies": 0.84375, "rewards/chosen": 0.343536376953125, "rewards/margins": 2.099609375, "rewards/rejected": -1.753387451171875, "step": 1229 }, { "epoch": 0.23241532429495962, "grad_norm": 1.8398365472265668, "learning_rate": 9.530139675976647e-07, "logits/chosen": 1.44775390625, "logits/rejected": 1.357421875, "logps/chosen": -1099.0, "logps/rejected": -1496.0, "loss": 0.6155, "rewards/accuracies": 0.875, "rewards/chosen": 0.0244140625, "rewards/margins": 3.232421875, "rewards/rejected": -3.203125, "step": 1230 }, { "epoch": 0.2326042798431669, "grad_norm": 1.8712353247015925, "learning_rate": 9.528818319655827e-07, "logits/chosen": 1.34375, "logits/rejected": 1.9541015625, "logps/chosen": -798.0, "logps/rejected": -1317.5, "loss": 0.685, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5185546875, "rewards/margins": 3.59375, "rewards/rejected": -3.08203125, "step": 1231 }, { "epoch": 0.23279323539137417, "grad_norm": 4.480879084642627, "learning_rate": 9.527495210600043e-07, "logits/chosen": 1.5126953125, "logits/rejected": 0.768798828125, "logps/chosen": -997.5, "logps/rejected": -999.0, "loss": 0.5318, "rewards/accuracies": 0.96875, "rewards/chosen": 1.1123046875, "rewards/margins": 5.0546875, "rewards/rejected": -3.9375, "step": 1232 }, { "epoch": 0.23298219093958147, "grad_norm": 2.1125062112525383, "learning_rate": 9.526170349384914e-07, "logits/chosen": 2.025390625, "logits/rejected": 2.00537109375, "logps/chosen": -857.0, "logps/rejected": -1013.0, "loss": 0.6677, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1744384765625, "rewards/margins": 3.392578125, "rewards/rejected": -3.22265625, "step": 1233 }, { "epoch": 0.23317114648778875, "grad_norm": 1.9204396154948438, "learning_rate": 9.524843736586821e-07, "logits/chosen": 2.189453125, "logits/rejected": 2.203125, "logps/chosen": -847.5, "logps/rejected": -804.0, "loss": 0.6403, "rewards/accuracies": 0.75, "rewards/chosen": 0.671142578125, "rewards/margins": 2.890625, "rewards/rejected": -2.21484375, "step": 1234 }, { "epoch": 0.23336010203599603, "grad_norm": 1.973735309343947, "learning_rate": 9.523515372782904e-07, "logits/chosen": 1.94140625, "logits/rejected": 1.8662109375, "logps/chosen": -672.0, "logps/rejected": -802.0, "loss": 0.6756, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2119140625, "rewards/margins": 2.58203125, "rewards/rejected": -2.3662109375, "step": 1235 }, { "epoch": 0.23354905758420333, "grad_norm": 2.8163043764410536, "learning_rate": 9.522185258551069e-07, "logits/chosen": 1.12078857421875, "logits/rejected": 1.1025390625, "logps/chosen": -989.0, "logps/rejected": -1993.0, "loss": 0.4848, "rewards/accuracies": 0.90625, "rewards/chosen": 1.04833984375, "rewards/margins": 5.6953125, "rewards/rejected": -4.6484375, "step": 1236 }, { "epoch": 0.2337380131324106, "grad_norm": 2.0297968077140665, "learning_rate": 9.520853394469978e-07, "logits/chosen": 0.4951171875, "logits/rejected": 0.055908203125, "logps/chosen": -570.0, "logps/rejected": -606.0, "loss": 0.5554, "rewards/accuracies": 0.90625, "rewards/chosen": 0.365386962890625, "rewards/margins": 3.263671875, "rewards/rejected": -2.8984375, "step": 1237 }, { "epoch": 0.23392696868061788, "grad_norm": 2.0961837726395105, "learning_rate": 9.519519781119063e-07, "logits/chosen": 1.703125, "logits/rejected": 1.230682373046875, "logps/chosen": -757.0, "logps/rejected": -768.5, "loss": 0.5389, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6025390625, "rewards/margins": 3.40625, "rewards/rejected": -2.8046875, "step": 1238 }, { "epoch": 0.23411592422882516, "grad_norm": 1.4906511095539594, "learning_rate": 9.518184419078508e-07, "logits/chosen": 2.43359375, "logits/rejected": 2.7666015625, "logps/chosen": -1148.0, "logps/rejected": -2130.0, "loss": 0.5282, "rewards/accuracies": 0.90625, "rewards/chosen": 1.091796875, "rewards/margins": 5.37109375, "rewards/rejected": -4.27734375, "step": 1239 }, { "epoch": 0.23430487977703246, "grad_norm": 2.4267250917059573, "learning_rate": 9.516847308929262e-07, "logits/chosen": 1.638671875, "logits/rejected": 1.40234375, "logps/chosen": -827.5, "logps/rejected": -589.0, "loss": 0.7759, "rewards/accuracies": 0.78125, "rewards/chosen": 0.39013671875, "rewards/margins": 1.865234375, "rewards/rejected": -1.478515625, "step": 1240 }, { "epoch": 0.23449383532523974, "grad_norm": 2.5921101475711428, "learning_rate": 9.515508451253035e-07, "logits/chosen": 2.505859375, "logits/rejected": 2.49609375, "logps/chosen": -1022.0, "logps/rejected": -1156.0, "loss": 0.6732, "rewards/accuracies": 0.875, "rewards/chosen": 2.296875, "rewards/margins": 4.4921875, "rewards/rejected": -2.1796875, "step": 1241 }, { "epoch": 0.234682790873447, "grad_norm": 2.2504342898165604, "learning_rate": 9.514167846632294e-07, "logits/chosen": 2.47265625, "logits/rejected": 2.595703125, "logps/chosen": -394.75, "logps/rejected": -445.0, "loss": 0.7753, "rewards/accuracies": 0.75, "rewards/chosen": 0.2021484375, "rewards/margins": 1.31689453125, "rewards/rejected": -1.11572265625, "step": 1242 }, { "epoch": 0.23487174642165431, "grad_norm": 2.20059327829779, "learning_rate": 9.512825495650271e-07, "logits/chosen": 1.5107421875, "logits/rejected": 1.97998046875, "logps/chosen": -794.0, "logps/rejected": -725.0, "loss": 0.641, "rewards/accuracies": 0.875, "rewards/chosen": 1.203125, "rewards/margins": 2.796875, "rewards/rejected": -1.596435546875, "step": 1243 }, { "epoch": 0.2350607019698616, "grad_norm": 1.8958385979806018, "learning_rate": 9.511481398890951e-07, "logits/chosen": 1.355712890625, "logits/rejected": 1.365234375, "logps/chosen": -755.5, "logps/rejected": -1248.0, "loss": 0.6281, "rewards/accuracies": 0.6875, "rewards/chosen": 0.90380859375, "rewards/margins": 3.56640625, "rewards/rejected": -2.657958984375, "step": 1244 }, { "epoch": 0.23524965751806887, "grad_norm": 2.9731198563504977, "learning_rate": 9.510135556939089e-07, "logits/chosen": 2.0244140625, "logits/rejected": 2.4580078125, "logps/chosen": -410.5, "logps/rejected": -513.5, "loss": 0.7929, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39697265625, "rewards/margins": 2.138671875, "rewards/rejected": -1.7392578125, "step": 1245 }, { "epoch": 0.23543861306627617, "grad_norm": 2.045159614187056, "learning_rate": 9.508787970380187e-07, "logits/chosen": 1.97900390625, "logits/rejected": 2.14453125, "logps/chosen": -636.5, "logps/rejected": -555.5, "loss": 0.7259, "rewards/accuracies": 0.71875, "rewards/chosen": 1.05029296875, "rewards/margins": 2.0185546875, "rewards/rejected": -0.974609375, "step": 1246 }, { "epoch": 0.23562756861448345, "grad_norm": 1.9580730436303715, "learning_rate": 9.507438639800514e-07, "logits/chosen": 1.146484375, "logits/rejected": 1.294921875, "logps/chosen": -782.5, "logps/rejected": -1616.0, "loss": 0.6535, "rewards/accuracies": 0.8125, "rewards/chosen": 0.527587890625, "rewards/margins": 2.77734375, "rewards/rejected": -2.248046875, "step": 1247 }, { "epoch": 0.23581652416269072, "grad_norm": 2.541534526615845, "learning_rate": 9.506087565787093e-07, "logits/chosen": 1.441650390625, "logits/rejected": 1.52294921875, "logps/chosen": -836.0, "logps/rejected": -786.0, "loss": 0.6132, "rewards/accuracies": 0.71875, "rewards/chosen": 0.609375, "rewards/margins": 2.783203125, "rewards/rejected": -2.173828125, "step": 1248 }, { "epoch": 0.236005479710898, "grad_norm": 1.7336482664884565, "learning_rate": 9.504734748927712e-07, "logits/chosen": 0.80078125, "logits/rejected": 1.3314208984375, "logps/chosen": -428.0, "logps/rejected": -383.0, "loss": 0.7213, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14697265625, "rewards/margins": 2.0078125, "rewards/rejected": -2.154296875, "step": 1249 }, { "epoch": 0.2361944352591053, "grad_norm": 2.0138033988659325, "learning_rate": 9.503380189810909e-07, "logits/chosen": 0.0029296875, "logits/rejected": -0.041015625, "logps/chosen": -576.5, "logps/rejected": -485.0, "loss": 0.6412, "rewards/accuracies": 0.84375, "rewards/chosen": 0.375732421875, "rewards/margins": 2.1875, "rewards/rejected": -1.810546875, "step": 1250 }, { "epoch": 0.23638339080731258, "grad_norm": 2.5929843362081164, "learning_rate": 9.502023889025987e-07, "logits/chosen": 2.162109375, "logits/rejected": 2.716796875, "logps/chosen": -654.0, "logps/rejected": -688.0, "loss": 0.6469, "rewards/accuracies": 0.875, "rewards/chosen": 0.203857421875, "rewards/margins": 2.57421875, "rewards/rejected": -2.365234375, "step": 1251 }, { "epoch": 0.23657234635551985, "grad_norm": 2.616253344892468, "learning_rate": 9.500665847163004e-07, "logits/chosen": 1.984375, "logits/rejected": 1.9794921875, "logps/chosen": -996.5, "logps/rejected": -705.0, "loss": 0.7342, "rewards/accuracies": 0.71875, "rewards/chosen": 0.28564453125, "rewards/margins": 1.91796875, "rewards/rejected": -1.6298828125, "step": 1252 }, { "epoch": 0.23676130190372716, "grad_norm": 2.715863051228292, "learning_rate": 9.499306064812774e-07, "logits/chosen": 0.80322265625, "logits/rejected": 0.65771484375, "logps/chosen": -572.5, "logps/rejected": -644.5, "loss": 0.7374, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07464599609375, "rewards/margins": 2.3046875, "rewards/rejected": -2.232421875, "step": 1253 }, { "epoch": 0.23695025745193443, "grad_norm": 1.8524261413165155, "learning_rate": 9.49794454256687e-07, "logits/chosen": 1.8564453125, "logits/rejected": 2.0185546875, "logps/chosen": -836.5, "logps/rejected": -802.5, "loss": 0.6967, "rewards/accuracies": 0.78125, "rewards/chosen": 0.009765625, "rewards/margins": 2.9609375, "rewards/rejected": -2.958984375, "step": 1254 }, { "epoch": 0.2371392130001417, "grad_norm": 1.991055804571171, "learning_rate": 9.49658128101762e-07, "logits/chosen": 1.2952880859375, "logits/rejected": 1.08203125, "logps/chosen": -834.0, "logps/rejected": -2641.0, "loss": 0.6238, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03961181640625, "rewards/margins": -0.87109375, "rewards/rejected": 0.91015625, "step": 1255 }, { "epoch": 0.237328168548349, "grad_norm": 2.303606897996147, "learning_rate": 9.495216280758113e-07, "logits/chosen": 1.625, "logits/rejected": 2.205078125, "logps/chosen": -1110.0, "logps/rejected": -1177.0, "loss": 0.5959, "rewards/accuracies": 0.9375, "rewards/chosen": 0.338134765625, "rewards/margins": 3.9453125, "rewards/rejected": -3.609375, "step": 1256 }, { "epoch": 0.2375171240965563, "grad_norm": 1.7672499237323995, "learning_rate": 9.493849542382189e-07, "logits/chosen": 0.611572265625, "logits/rejected": 0.78961181640625, "logps/chosen": -818.5, "logps/rejected": -708.5, "loss": 0.6604, "rewards/accuracies": 0.8125, "rewards/chosen": 0.309814453125, "rewards/margins": 2.78515625, "rewards/rejected": -2.474609375, "step": 1257 }, { "epoch": 0.23770607964476356, "grad_norm": 2.114733041892903, "learning_rate": 9.492481066484448e-07, "logits/chosen": 1.90234375, "logits/rejected": 2.466796875, "logps/chosen": -1005.0, "logps/rejected": -1012.0, "loss": 0.679, "rewards/accuracies": 0.78125, "rewards/chosen": 0.35528564453125, "rewards/margins": 3.25390625, "rewards/rejected": -2.900390625, "step": 1258 }, { "epoch": 0.23789503519297087, "grad_norm": 1.9423207260755253, "learning_rate": 9.491110853660243e-07, "logits/chosen": 1.422607421875, "logits/rejected": 1.26318359375, "logps/chosen": -988.0, "logps/rejected": -970.5, "loss": 0.6178, "rewards/accuracies": 0.84375, "rewards/chosen": 0.660888671875, "rewards/margins": 3.03125, "rewards/rejected": -2.369140625, "step": 1259 }, { "epoch": 0.23808399074117814, "grad_norm": 1.7627578746433135, "learning_rate": 9.489738904505687e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.408203125, "logps/chosen": -789.0, "logps/rejected": -884.0, "loss": 0.6002, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7060546875, "rewards/margins": 3.41015625, "rewards/rejected": -2.703125, "step": 1260 }, { "epoch": 0.23827294628938542, "grad_norm": 1.9823850400431584, "learning_rate": 9.488365219617644e-07, "logits/chosen": 1.868896484375, "logits/rejected": 2.017578125, "logps/chosen": -867.0, "logps/rejected": -948.0, "loss": 0.5942, "rewards/accuracies": 0.8125, "rewards/chosen": 0.64697265625, "rewards/margins": 3.0078125, "rewards/rejected": -2.359375, "step": 1261 }, { "epoch": 0.2384619018375927, "grad_norm": 3.3875368495906852, "learning_rate": 9.486989799593733e-07, "logits/chosen": 1.689453125, "logits/rejected": 1.935546875, "logps/chosen": -752.5, "logps/rejected": -758.0, "loss": 0.6603, "rewards/accuracies": 0.84375, "rewards/chosen": 0.248291015625, "rewards/margins": 2.78515625, "rewards/rejected": -2.53515625, "step": 1262 }, { "epoch": 0.2386508573858, "grad_norm": 1.5401792496295215, "learning_rate": 9.485612645032333e-07, "logits/chosen": 1.708984375, "logits/rejected": 2.029296875, "logps/chosen": -531.5, "logps/rejected": -1477.0, "loss": 0.6035, "rewards/accuracies": 0.84375, "rewards/chosen": 0.30364990234375, "rewards/margins": 5.19140625, "rewards/rejected": -4.90625, "step": 1263 }, { "epoch": 0.23883981293400727, "grad_norm": 1.7828048391065472, "learning_rate": 9.484233756532571e-07, "logits/chosen": 1.921875, "logits/rejected": 2.0634765625, "logps/chosen": -701.0, "logps/rejected": -855.0, "loss": 0.6493, "rewards/accuracies": 0.875, "rewards/chosen": 0.4443359375, "rewards/margins": 2.814453125, "rewards/rejected": -2.373046875, "step": 1264 }, { "epoch": 0.23902876848221455, "grad_norm": 2.927523253619086, "learning_rate": 9.482853134694336e-07, "logits/chosen": 1.6220703125, "logits/rejected": 2.0283203125, "logps/chosen": -641.5, "logps/rejected": -1760.0, "loss": 0.702, "rewards/accuracies": 0.6875, "rewards/chosen": 0.384521484375, "rewards/margins": 3.802734375, "rewards/rejected": -3.4140625, "step": 1265 }, { "epoch": 0.23921772403042185, "grad_norm": 2.193206923283996, "learning_rate": 9.481470780118261e-07, "logits/chosen": 1.240234375, "logits/rejected": 1.1181640625, "logps/chosen": -854.5, "logps/rejected": -862.5, "loss": 0.6393, "rewards/accuracies": 0.75, "rewards/chosen": 0.45556640625, "rewards/margins": 2.630859375, "rewards/rejected": -2.173828125, "step": 1266 }, { "epoch": 0.23940667957862913, "grad_norm": 1.8999856242311557, "learning_rate": 9.480086693405744e-07, "logits/chosen": 2.1953125, "logits/rejected": 1.92578125, "logps/chosen": -734.0, "logps/rejected": -671.0, "loss": 0.5914, "rewards/accuracies": 0.84375, "rewards/chosen": 0.57275390625, "rewards/margins": 2.6953125, "rewards/rejected": -2.1171875, "step": 1267 }, { "epoch": 0.2395956351268364, "grad_norm": 1.6649845813733473, "learning_rate": 9.478700875158928e-07, "logits/chosen": 2.0078125, "logits/rejected": 1.9365234375, "logps/chosen": -1304.0, "logps/rejected": -969.5, "loss": 0.6527, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06201171875, "rewards/margins": 2.392578125, "rewards/rejected": -2.45458984375, "step": 1268 }, { "epoch": 0.2397845906750437, "grad_norm": 1.8580202851291796, "learning_rate": 9.477313325980713e-07, "logits/chosen": 1.5693359375, "logits/rejected": 1.73583984375, "logps/chosen": -875.0, "logps/rejected": -1013.0, "loss": 0.6441, "rewards/accuracies": 0.71875, "rewards/chosen": 0.51025390625, "rewards/margins": 3.0546875, "rewards/rejected": -2.544921875, "step": 1269 }, { "epoch": 0.23997354622325098, "grad_norm": 2.02897979666718, "learning_rate": 9.475924046474751e-07, "logits/chosen": 1.25, "logits/rejected": 1.0849609375, "logps/chosen": -714.25, "logps/rejected": -939.0, "loss": 0.7463, "rewards/accuracies": 0.8125, "rewards/chosen": 0.010650634765625, "rewards/margins": 2.64453125, "rewards/rejected": -2.63671875, "step": 1270 }, { "epoch": 0.24016250177145826, "grad_norm": 2.5632568167126144, "learning_rate": 9.474533037245448e-07, "logits/chosen": 2.39453125, "logits/rejected": 2.44140625, "logps/chosen": -1244.0, "logps/rejected": -1130.0, "loss": 0.63, "rewards/accuracies": 0.78125, "rewards/chosen": 1.23388671875, "rewards/margins": 3.158203125, "rewards/rejected": -1.92578125, "step": 1271 }, { "epoch": 0.24035145731966556, "grad_norm": 1.8735752909116665, "learning_rate": 9.473140298897961e-07, "logits/chosen": 1.679931640625, "logits/rejected": 1.48291015625, "logps/chosen": -669.5, "logps/rejected": -768.0, "loss": 0.7141, "rewards/accuracies": 0.84375, "rewards/chosen": 0.447265625, "rewards/margins": 1.904296875, "rewards/rejected": -1.4560546875, "step": 1272 }, { "epoch": 0.24054041286787284, "grad_norm": 1.4712554502450323, "learning_rate": 9.471745832038202e-07, "logits/chosen": 1.90625, "logits/rejected": 1.763671875, "logps/chosen": -1254.0, "logps/rejected": -1423.0, "loss": 0.5296, "rewards/accuracies": 0.9375, "rewards/chosen": 1.857421875, "rewards/margins": 4.94921875, "rewards/rejected": -3.099609375, "step": 1273 }, { "epoch": 0.24072936841608011, "grad_norm": 2.0272501589789553, "learning_rate": 9.47034963727283e-07, "logits/chosen": 1.0751953125, "logits/rejected": 0.762664794921875, "logps/chosen": -564.0, "logps/rejected": -553.0, "loss": 0.6982, "rewards/accuracies": 0.78125, "rewards/chosen": 0.505615234375, "rewards/margins": 1.998046875, "rewards/rejected": -1.4892578125, "step": 1274 }, { "epoch": 0.2409183239642874, "grad_norm": 2.233662631937747, "learning_rate": 9.468951715209259e-07, "logits/chosen": 0.6240234375, "logits/rejected": 0.676025390625, "logps/chosen": -951.0, "logps/rejected": -792.0, "loss": 0.7671, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02178955078125, "rewards/margins": 2.0625, "rewards/rejected": -2.0390625, "step": 1275 }, { "epoch": 0.2411072795124947, "grad_norm": 2.4852800441281038, "learning_rate": 9.467552066455658e-07, "logits/chosen": 1.765625, "logits/rejected": 1.1357421875, "logps/chosen": -1043.0, "logps/rejected": -762.5, "loss": 0.5924, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8681640625, "rewards/margins": 3.31640625, "rewards/rejected": -2.44921875, "step": 1276 }, { "epoch": 0.24129623506070197, "grad_norm": 3.6123241063134577, "learning_rate": 9.466150691620937e-07, "logits/chosen": 2.0986328125, "logits/rejected": 2.208984375, "logps/chosen": -767.0, "logps/rejected": -1083.0, "loss": 0.6713, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4560546875, "rewards/margins": 2.7255859375, "rewards/rejected": -2.27099609375, "step": 1277 }, { "epoch": 0.24148519060890924, "grad_norm": 1.9711567821583305, "learning_rate": 9.464747591314769e-07, "logits/chosen": 1.87890625, "logits/rejected": 2.236328125, "logps/chosen": -770.0, "logps/rejected": -887.5, "loss": 0.6772, "rewards/accuracies": 0.75, "rewards/chosen": 0.75244140625, "rewards/margins": 2.8583984375, "rewards/rejected": -2.107421875, "step": 1278 }, { "epoch": 0.24167414615711655, "grad_norm": 1.9485486936287961, "learning_rate": 9.463342766147568e-07, "logits/chosen": 1.4256591796875, "logits/rejected": 1.6279296875, "logps/chosen": -855.5, "logps/rejected": -690.0, "loss": 0.6223, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3701171875, "rewards/margins": 2.951171875, "rewards/rejected": -2.578125, "step": 1279 }, { "epoch": 0.24186310170532382, "grad_norm": 1.5428286213544657, "learning_rate": 9.461936216730506e-07, "logits/chosen": 1.84765625, "logits/rejected": 1.07275390625, "logps/chosen": -525.0, "logps/rejected": -567.0, "loss": 0.6108, "rewards/accuracies": 0.84375, "rewards/chosen": 0.70458984375, "rewards/margins": 2.8125, "rewards/rejected": -2.103515625, "step": 1280 }, { "epoch": 0.2420520572535311, "grad_norm": 1.9778560859893601, "learning_rate": 9.460527943675499e-07, "logits/chosen": 1.82196044921875, "logits/rejected": 1.592041015625, "logps/chosen": -371.25, "logps/rejected": -1029.5, "loss": 0.6738, "rewards/accuracies": 0.84375, "rewards/chosen": 0.130859375, "rewards/margins": 3.142578125, "rewards/rejected": -3.009765625, "step": 1281 }, { "epoch": 0.2422410128017384, "grad_norm": 1.9291158960884638, "learning_rate": 9.459117947595216e-07, "logits/chosen": 1.76708984375, "logits/rejected": 1.9736328125, "logps/chosen": -454.25, "logps/rejected": -1477.5, "loss": 0.712, "rewards/accuracies": 0.84375, "rewards/chosen": 0.34326171875, "rewards/margins": 2.658203125, "rewards/rejected": -2.310546875, "step": 1282 }, { "epoch": 0.24242996834994568, "grad_norm": 1.985630480861377, "learning_rate": 9.457706229103073e-07, "logits/chosen": 1.57177734375, "logits/rejected": 2.072265625, "logps/chosen": -835.5, "logps/rejected": -1344.0, "loss": 0.6527, "rewards/accuracies": 0.875, "rewards/chosen": 0.2545166015625, "rewards/margins": 3.99609375, "rewards/rejected": -3.7421875, "step": 1283 }, { "epoch": 0.24261892389815296, "grad_norm": 1.754924846748309, "learning_rate": 9.456292788813241e-07, "logits/chosen": 2.20166015625, "logits/rejected": 1.653076171875, "logps/chosen": -779.0, "logps/rejected": -798.0, "loss": 0.6943, "rewards/accuracies": 0.78125, "rewards/chosen": 0.224365234375, "rewards/margins": 2.515625, "rewards/rejected": -2.29296875, "step": 1284 }, { "epoch": 0.24280787944636023, "grad_norm": 1.997474958573437, "learning_rate": 9.454877627340634e-07, "logits/chosen": 0.6353759765625, "logits/rejected": -0.0146484375, "logps/chosen": -810.0, "logps/rejected": -556.5, "loss": 0.5884, "rewards/accuracies": 0.84375, "rewards/chosen": 0.506591796875, "rewards/margins": 2.875, "rewards/rejected": -2.36328125, "step": 1285 }, { "epoch": 0.24299683499456753, "grad_norm": 2.1225435962739314, "learning_rate": 9.453460745300918e-07, "logits/chosen": 1.44140625, "logits/rejected": 1.87890625, "logps/chosen": -614.0, "logps/rejected": -700.5, "loss": 0.6562, "rewards/accuracies": 0.75, "rewards/chosen": 0.751953125, "rewards/margins": 2.83984375, "rewards/rejected": -2.087890625, "step": 1286 }, { "epoch": 0.2431857905427748, "grad_norm": 1.734223950778348, "learning_rate": 9.452042143310503e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.599609375, "logps/chosen": -687.5, "logps/rejected": -574.5, "loss": 0.6071, "rewards/accuracies": 0.875, "rewards/chosen": 0.5771484375, "rewards/margins": 3.11328125, "rewards/rejected": -2.5390625, "step": 1287 }, { "epoch": 0.24337474609098209, "grad_norm": 1.700492600742832, "learning_rate": 9.450621821986559e-07, "logits/chosen": 2.0556640625, "logits/rejected": 2.021484375, "logps/chosen": -695.75, "logps/rejected": -723.0, "loss": 0.7043, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47265625, "rewards/margins": 2.04296875, "rewards/rejected": -1.5751953125, "step": 1288 }, { "epoch": 0.2435637016391894, "grad_norm": 2.140034050957626, "learning_rate": 9.449199781946987e-07, "logits/chosen": 2.4375, "logits/rejected": 2.765625, "logps/chosen": -750.0, "logps/rejected": -1169.0, "loss": 0.7663, "rewards/accuracies": 0.59375, "rewards/chosen": 0.81787109375, "rewards/margins": 2.2373046875, "rewards/rejected": -1.4130859375, "step": 1289 }, { "epoch": 0.24375265718739667, "grad_norm": 1.5778188591022753, "learning_rate": 9.447776023810451e-07, "logits/chosen": 2.02734375, "logits/rejected": 1.8115234375, "logps/chosen": -711.5, "logps/rejected": -854.0, "loss": 0.5936, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1044921875, "rewards/margins": 3.15234375, "rewards/rejected": -2.044921875, "step": 1290 }, { "epoch": 0.24394161273560394, "grad_norm": 2.0833937075642837, "learning_rate": 9.446350548196353e-07, "logits/chosen": 2.02392578125, "logits/rejected": 1.6484375, "logps/chosen": -832.5, "logps/rejected": -808.0, "loss": 0.6432, "rewards/accuracies": 0.71875, "rewards/chosen": 0.16143798828125, "rewards/margins": 3.109375, "rewards/rejected": -2.947265625, "step": 1291 }, { "epoch": 0.24413056828381124, "grad_norm": 2.297429026980229, "learning_rate": 9.444923355724846e-07, "logits/chosen": 0.8394775390625, "logits/rejected": 1.154296875, "logps/chosen": -1189.0, "logps/rejected": -1140.0, "loss": 0.6166, "rewards/accuracies": 0.875, "rewards/chosen": 1.02587890625, "rewards/margins": 3.44677734375, "rewards/rejected": -2.42236328125, "step": 1292 }, { "epoch": 0.24431952383201852, "grad_norm": 1.486827042151255, "learning_rate": 9.443494447016829e-07, "logits/chosen": 2.40234375, "logits/rejected": 2.365234375, "logps/chosen": -558.0, "logps/rejected": -642.0, "loss": 0.7906, "rewards/accuracies": 0.59375, "rewards/chosen": 0.4638671875, "rewards/margins": 1.8935546875, "rewards/rejected": -1.42529296875, "step": 1293 }, { "epoch": 0.2445084793802258, "grad_norm": 2.5491504288004783, "learning_rate": 9.44206382269395e-07, "logits/chosen": 1.8369140625, "logits/rejected": 2.27734375, "logps/chosen": -833.25, "logps/rejected": -1012.25, "loss": 0.6745, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3670654296875, "rewards/margins": 2.82373046875, "rewards/rejected": -2.4521484375, "step": 1294 }, { "epoch": 0.2446974349284331, "grad_norm": 1.9328972249812681, "learning_rate": 9.440631483378599e-07, "logits/chosen": 1.02001953125, "logits/rejected": 1.105224609375, "logps/chosen": -1306.5, "logps/rejected": -835.5, "loss": 0.6881, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1533203125, "rewards/margins": 2.462890625, "rewards/rejected": -2.623046875, "step": 1295 }, { "epoch": 0.24488639047664038, "grad_norm": 1.6090451635706653, "learning_rate": 9.439197429693912e-07, "logits/chosen": 1.865234375, "logits/rejected": 1.7333984375, "logps/chosen": -590.0, "logps/rejected": -768.0, "loss": 0.6338, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6402587890625, "rewards/margins": 3.16015625, "rewards/rejected": -2.517578125, "step": 1296 }, { "epoch": 0.24507534602484765, "grad_norm": 3.1375703370845085, "learning_rate": 9.437761662263779e-07, "logits/chosen": 2.05859375, "logits/rejected": 2.271484375, "logps/chosen": -839.0, "logps/rejected": -986.0, "loss": 0.6041, "rewards/accuracies": 0.84375, "rewards/chosen": 0.36279296875, "rewards/margins": 3.685546875, "rewards/rejected": -3.310546875, "step": 1297 }, { "epoch": 0.24526430157305493, "grad_norm": 2.0508770154047986, "learning_rate": 9.436324181712826e-07, "logits/chosen": 1.490234375, "logits/rejected": 1.197265625, "logps/chosen": -621.5, "logps/rejected": -633.0, "loss": 0.6051, "rewards/accuracies": 0.875, "rewards/chosen": 0.8076171875, "rewards/margins": 3.01953125, "rewards/rejected": -2.2109375, "step": 1298 }, { "epoch": 0.24545325712126223, "grad_norm": 1.9942888733153556, "learning_rate": 9.434884988666429e-07, "logits/chosen": 1.986572265625, "logits/rejected": 2.02294921875, "logps/chosen": -842.0, "logps/rejected": -1114.0, "loss": 0.7311, "rewards/accuracies": 0.8125, "rewards/chosen": 0.326416015625, "rewards/margins": 2.5361328125, "rewards/rejected": -2.212890625, "step": 1299 }, { "epoch": 0.2456422126694695, "grad_norm": 2.0089893727473354, "learning_rate": 9.433444083750707e-07, "logits/chosen": 2.390625, "logits/rejected": 1.654296875, "logps/chosen": -805.0, "logps/rejected": -781.0, "loss": 0.6719, "rewards/accuracies": 0.75, "rewards/chosen": 0.6484375, "rewards/margins": 3.03515625, "rewards/rejected": -2.3857421875, "step": 1300 }, { "epoch": 0.24583116821767678, "grad_norm": 3.0365719905561037, "learning_rate": 9.432001467592525e-07, "logits/chosen": 1.3076171875, "logits/rejected": 1.7265625, "logps/chosen": -653.5, "logps/rejected": -830.0, "loss": 0.6671, "rewards/accuracies": 0.875, "rewards/chosen": 0.6103515625, "rewards/margins": 2.853515625, "rewards/rejected": -2.236328125, "step": 1301 }, { "epoch": 0.24602012376588409, "grad_norm": 1.8519940556057728, "learning_rate": 9.430557140819495e-07, "logits/chosen": 1.310546875, "logits/rejected": 1.271484375, "logps/chosen": -1007.0, "logps/rejected": -19213.5, "loss": 0.5886, "rewards/accuracies": 0.875, "rewards/chosen": 1.0573272705078125, "rewards/margins": 73.7890625, "rewards/rejected": -72.734375, "step": 1302 }, { "epoch": 0.24620907931409136, "grad_norm": 1.8061858193330687, "learning_rate": 9.429111104059967e-07, "logits/chosen": 2.25, "logits/rejected": 1.75390625, "logps/chosen": -766.5, "logps/rejected": -622.0, "loss": 0.6719, "rewards/accuracies": 0.78125, "rewards/chosen": 0.921875, "rewards/margins": 2.8984375, "rewards/rejected": -1.98193359375, "step": 1303 }, { "epoch": 0.24639803486229864, "grad_norm": 2.129183062397453, "learning_rate": 9.42766335794304e-07, "logits/chosen": 1.58203125, "logits/rejected": 1.521484375, "logps/chosen": -560.0, "logps/rejected": -675.0, "loss": 0.5292, "rewards/accuracies": 0.84375, "rewards/chosen": 0.935302734375, "rewards/margins": 3.41015625, "rewards/rejected": -2.47265625, "step": 1304 }, { "epoch": 0.24658699041050594, "grad_norm": 1.791644611461071, "learning_rate": 9.426213903098555e-07, "logits/chosen": 0.6129150390625, "logits/rejected": 0.8511962890625, "logps/chosen": -674.0, "logps/rejected": -588.0, "loss": 0.6062, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7939453125, "rewards/margins": 2.8671875, "rewards/rejected": -2.072265625, "step": 1305 }, { "epoch": 0.24677594595871322, "grad_norm": 1.5857687817682697, "learning_rate": 9.424762740157097e-07, "logits/chosen": 1.8466796875, "logits/rejected": 2.02294921875, "logps/chosen": -823.0, "logps/rejected": -616.0, "loss": 0.6427, "rewards/accuracies": 0.78125, "rewards/chosen": 0.54736328125, "rewards/margins": 3.08203125, "rewards/rejected": -2.5390625, "step": 1306 }, { "epoch": 0.2469649015069205, "grad_norm": 2.034687476667753, "learning_rate": 9.423309869749991e-07, "logits/chosen": 1.0540771484375, "logits/rejected": 0.855712890625, "logps/chosen": -736.0, "logps/rejected": -772.0, "loss": 0.5587, "rewards/accuracies": 0.90625, "rewards/chosen": 0.65234375, "rewards/margins": 3.5703125, "rewards/rejected": -2.91015625, "step": 1307 }, { "epoch": 0.24715385705512777, "grad_norm": 1.6485041945345196, "learning_rate": 9.421855292509312e-07, "logits/chosen": 2.7734375, "logits/rejected": 2.84375, "logps/chosen": -1267.0, "logps/rejected": -1048.0, "loss": 0.6397, "rewards/accuracies": 0.75, "rewards/chosen": 1.3524169921875, "rewards/margins": 3.34765625, "rewards/rejected": -2.001220703125, "step": 1308 }, { "epoch": 0.24734281260333507, "grad_norm": 1.9672625561460342, "learning_rate": 9.420399009067868e-07, "logits/chosen": 2.01416015625, "logits/rejected": 2.130859375, "logps/chosen": -876.5, "logps/rejected": -776.5, "loss": 0.5826, "rewards/accuracies": 0.84375, "rewards/chosen": 0.623046875, "rewards/margins": 3.5859375, "rewards/rejected": -2.9609375, "step": 1309 }, { "epoch": 0.24753176815154235, "grad_norm": 2.0964109914257096, "learning_rate": 9.418941020059216e-07, "logits/chosen": 2.09765625, "logits/rejected": 2.53515625, "logps/chosen": -729.0, "logps/rejected": -1365.0, "loss": 0.7315, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3720703125, "rewards/margins": 2.77734375, "rewards/rejected": -1.4150390625, "step": 1310 }, { "epoch": 0.24772072369974962, "grad_norm": 2.145739193658405, "learning_rate": 9.417481326117655e-07, "logits/chosen": 1.92578125, "logits/rejected": 2.26953125, "logps/chosen": -812.0, "logps/rejected": -836.5, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": 0.75830078125, "rewards/margins": 2.63671875, "rewards/rejected": -1.876953125, "step": 1311 }, { "epoch": 0.24790967924795693, "grad_norm": 2.324372693333751, "learning_rate": 9.416019927878224e-07, "logits/chosen": 1.96875, "logits/rejected": 2.376953125, "logps/chosen": -679.5, "logps/rejected": -733.0, "loss": 0.6085, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9697265625, "rewards/margins": 3.564453125, "rewards/rejected": -2.5908203125, "step": 1312 }, { "epoch": 0.2480986347961642, "grad_norm": 2.1735329384944166, "learning_rate": 9.4145568259767e-07, "logits/chosen": 1.3896484375, "logits/rejected": 1.1171875, "logps/chosen": -665.0, "logps/rejected": -890.0, "loss": 0.5424, "rewards/accuracies": 0.90625, "rewards/chosen": 0.73876953125, "rewards/margins": 3.22265625, "rewards/rejected": -2.4921875, "step": 1313 }, { "epoch": 0.24828759034437148, "grad_norm": 1.553403059104956, "learning_rate": 9.413092021049607e-07, "logits/chosen": 2.361328125, "logits/rejected": 2.71875, "logps/chosen": -620.5, "logps/rejected": -1069.5, "loss": 0.6225, "rewards/accuracies": 0.78125, "rewards/chosen": 0.888916015625, "rewards/margins": 3.326171875, "rewards/rejected": -2.4365234375, "step": 1314 }, { "epoch": 0.24847654589257878, "grad_norm": 1.6857244537777425, "learning_rate": 9.411625513734209e-07, "logits/chosen": 1.9951171875, "logits/rejected": 1.73565673828125, "logps/chosen": -1062.5, "logps/rejected": -1037.0, "loss": 0.5816, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3779296875, "rewards/margins": 3.72265625, "rewards/rejected": -2.349609375, "step": 1315 }, { "epoch": 0.24866550144078606, "grad_norm": 1.4669483085171753, "learning_rate": 9.410157304668506e-07, "logits/chosen": 2.515625, "logits/rejected": 2.21875, "logps/chosen": -518.0, "logps/rejected": -572.0, "loss": 0.7143, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7001953125, "rewards/margins": 2.421875, "rewards/rejected": -1.724609375, "step": 1316 }, { "epoch": 0.24885445698899333, "grad_norm": 1.8671793526839588, "learning_rate": 9.408687394491241e-07, "logits/chosen": 1.95703125, "logits/rejected": 2.05078125, "logps/chosen": -671.0, "logps/rejected": -779.0, "loss": 0.6323, "rewards/accuracies": 0.84375, "rewards/chosen": 0.68603515625, "rewards/margins": 3.453125, "rewards/rejected": -2.7646484375, "step": 1317 }, { "epoch": 0.24904341253720064, "grad_norm": 1.9461633389433917, "learning_rate": 9.407215783841902e-07, "logits/chosen": 1.880859375, "logits/rejected": 1.90625, "logps/chosen": -936.0, "logps/rejected": -1058.0, "loss": 0.6034, "rewards/accuracies": 0.78125, "rewards/chosen": 0.91845703125, "rewards/margins": 3.375, "rewards/rejected": -2.462890625, "step": 1318 }, { "epoch": 0.2492323680854079, "grad_norm": 1.6560884794286135, "learning_rate": 9.405742473360708e-07, "logits/chosen": 1.94140625, "logits/rejected": 1.7421875, "logps/chosen": -893.0, "logps/rejected": -879.0, "loss": 0.6479, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0244140625, "rewards/margins": 3.23828125, "rewards/rejected": -2.216796875, "step": 1319 }, { "epoch": 0.2494213236336152, "grad_norm": 1.978048009998364, "learning_rate": 9.404267463688625e-07, "logits/chosen": 2.130859375, "logits/rejected": 2.7421875, "logps/chosen": -650.5, "logps/rejected": -2171.0, "loss": 0.7046, "rewards/accuracies": 0.71875, "rewards/chosen": 0.478759765625, "rewards/margins": 4.6025390625, "rewards/rejected": -4.14453125, "step": 1320 }, { "epoch": 0.24961027918182246, "grad_norm": 2.467833881018456, "learning_rate": 9.40279075546735e-07, "logits/chosen": 2.2119140625, "logits/rejected": 2.64453125, "logps/chosen": -1040.5, "logps/rejected": -1107.0, "loss": 0.5494, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1103515625, "rewards/margins": 4.455078125, "rewards/rejected": -3.349609375, "step": 1321 }, { "epoch": 0.24979923473002977, "grad_norm": 1.879037868476772, "learning_rate": 9.401312349339331e-07, "logits/chosen": 1.8583984375, "logits/rejected": 1.54296875, "logps/chosen": -1149.0, "logps/rejected": -826.0, "loss": 0.5044, "rewards/accuracies": 0.9375, "rewards/chosen": 1.06689453125, "rewards/margins": 3.984375, "rewards/rejected": -2.91796875, "step": 1322 }, { "epoch": 0.24998819027823704, "grad_norm": 1.7843017355860873, "learning_rate": 9.399832245947743e-07, "logits/chosen": 2.70703125, "logits/rejected": 3.1328125, "logps/chosen": -915.0, "logps/rejected": -979.5, "loss": 0.7124, "rewards/accuracies": 0.65625, "rewards/chosen": 0.231689453125, "rewards/margins": 2.802734375, "rewards/rejected": -2.564453125, "step": 1323 }, { "epoch": 0.25017714582644435, "grad_norm": 1.9146049861362242, "learning_rate": 9.398350445936505e-07, "logits/chosen": 1.99609375, "logits/rejected": 1.69189453125, "logps/chosen": -828.5, "logps/rejected": -582.0, "loss": 0.6858, "rewards/accuracies": 0.71875, "rewards/chosen": 0.056396484375, "rewards/margins": 2.53515625, "rewards/rejected": -2.482421875, "step": 1324 }, { "epoch": 0.2503661013746516, "grad_norm": 1.9984795643143063, "learning_rate": 9.396866949950274e-07, "logits/chosen": 1.8515625, "logits/rejected": 2.17578125, "logps/chosen": -546.5, "logps/rejected": -561.5, "loss": 0.7034, "rewards/accuracies": 0.78125, "rewards/chosen": 0.037841796875, "rewards/margins": 2.40234375, "rewards/rejected": -2.3671875, "step": 1325 }, { "epoch": 0.2505550569228589, "grad_norm": 3.210933481590768, "learning_rate": 9.395381758634443e-07, "logits/chosen": 2.01953125, "logits/rejected": 1.93798828125, "logps/chosen": -1273.0, "logps/rejected": -1121.0, "loss": 0.6122, "rewards/accuracies": 0.78125, "rewards/chosen": 0.50390625, "rewards/margins": 3.76953125, "rewards/rejected": -3.27734375, "step": 1326 }, { "epoch": 0.2507440124710662, "grad_norm": 2.0391677945510507, "learning_rate": 9.393894872635145e-07, "logits/chosen": 1.87158203125, "logits/rejected": 1.613037109375, "logps/chosen": -612.0, "logps/rejected": -734.5, "loss": 0.6434, "rewards/accuracies": 0.75, "rewards/chosen": 0.2138671875, "rewards/margins": 4.390625, "rewards/rejected": -4.1875, "step": 1327 }, { "epoch": 0.25093296801927345, "grad_norm": 2.007359798347941, "learning_rate": 9.392406292599246e-07, "logits/chosen": 1.4921875, "logits/rejected": 1.017333984375, "logps/chosen": -935.0, "logps/rejected": -759.0, "loss": 0.6404, "rewards/accuracies": 0.84375, "rewards/chosen": 0.70703125, "rewards/margins": 2.654296875, "rewards/rejected": -1.94140625, "step": 1328 }, { "epoch": 0.25112192356748075, "grad_norm": 2.0775200982884963, "learning_rate": 9.390916019174355e-07, "logits/chosen": 0.884765625, "logits/rejected": 1.53125, "logps/chosen": -521.0, "logps/rejected": -467.5, "loss": 0.7581, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10107421875, "rewards/margins": 2.0546875, "rewards/rejected": -1.955078125, "step": 1329 }, { "epoch": 0.25131087911568806, "grad_norm": 1.935423114609131, "learning_rate": 9.389424053008814e-07, "logits/chosen": 2.181640625, "logits/rejected": 2.263671875, "logps/chosen": -724.0, "logps/rejected": -1057.5, "loss": 0.5796, "rewards/accuracies": 0.8125, "rewards/chosen": 0.944580078125, "rewards/margins": 3.615234375, "rewards/rejected": -2.673828125, "step": 1330 }, { "epoch": 0.2514998346638953, "grad_norm": 2.3566434294726326, "learning_rate": 9.3879303947517e-07, "logits/chosen": 1.60546875, "logits/rejected": 1.833984375, "logps/chosen": -833.0, "logps/rejected": -1024.0, "loss": 0.5791, "rewards/accuracies": 0.8125, "rewards/chosen": 0.95562744140625, "rewards/margins": 4.2109375, "rewards/rejected": -3.259765625, "step": 1331 }, { "epoch": 0.2516887902121026, "grad_norm": 1.8632379602228124, "learning_rate": 9.38643504505283e-07, "logits/chosen": 1.38037109375, "logits/rejected": 1.3271484375, "logps/chosen": -1171.0, "logps/rejected": -1096.0, "loss": 0.4898, "rewards/accuracies": 0.90625, "rewards/chosen": 1.09521484375, "rewards/margins": 4.50390625, "rewards/rejected": -3.40625, "step": 1332 }, { "epoch": 0.2518777457603099, "grad_norm": 1.8247423566209333, "learning_rate": 9.384938004562753e-07, "logits/chosen": 1.49609375, "logits/rejected": 1.51513671875, "logps/chosen": -539.0, "logps/rejected": -863.0, "loss": 0.744, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0859375, "rewards/margins": 2.728515625, "rewards/rejected": -2.8125, "step": 1333 }, { "epoch": 0.25206670130851716, "grad_norm": 2.1181515015933585, "learning_rate": 9.383439273932758e-07, "logits/chosen": 2.162109375, "logits/rejected": 2.2373046875, "logps/chosen": -491.0, "logps/rejected": -815.0, "loss": 0.6855, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4422607421875, "rewards/margins": 2.6484375, "rewards/rejected": -2.203125, "step": 1334 }, { "epoch": 0.25225565685672446, "grad_norm": 1.9421490942696884, "learning_rate": 9.381938853814867e-07, "logits/chosen": 1.713623046875, "logits/rejected": 1.7442626953125, "logps/chosen": -801.0, "logps/rejected": -861.5, "loss": 0.5824, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9453125, "rewards/margins": 3.64453125, "rewards/rejected": -2.701171875, "step": 1335 }, { "epoch": 0.25244461240493177, "grad_norm": 1.7805237845070088, "learning_rate": 9.380436744861835e-07, "logits/chosen": 1.760009765625, "logits/rejected": 1.435546875, "logps/chosen": -452.0, "logps/rejected": -882.5, "loss": 0.6696, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0982666015625, "rewards/margins": 2.75, "rewards/rejected": -2.84765625, "step": 1336 }, { "epoch": 0.252633567953139, "grad_norm": 2.082695366028042, "learning_rate": 9.378932947727154e-07, "logits/chosen": 1.732666015625, "logits/rejected": 1.348388671875, "logps/chosen": -1349.0, "logps/rejected": -1083.5, "loss": 0.6499, "rewards/accuracies": 0.84375, "rewards/chosen": 0.68408203125, "rewards/margins": 3.9453125, "rewards/rejected": -3.2578125, "step": 1337 }, { "epoch": 0.2528225235013463, "grad_norm": 2.6566893823456113, "learning_rate": 9.37742746306505e-07, "logits/chosen": 1.85888671875, "logits/rejected": 1.8115234375, "logps/chosen": -1064.0, "logps/rejected": -973.5, "loss": 0.6998, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3994140625, "rewards/margins": 6.37109375, "rewards/rejected": -5.96484375, "step": 1338 }, { "epoch": 0.25301147904955357, "grad_norm": 1.4314427119488315, "learning_rate": 9.375920291530484e-07, "logits/chosen": 1.83984375, "logits/rejected": 1.65234375, "logps/chosen": -865.5, "logps/rejected": -819.5, "loss": 0.6925, "rewards/accuracies": 0.71875, "rewards/chosen": 0.357177734375, "rewards/margins": 2.8203125, "rewards/rejected": -2.462890625, "step": 1339 }, { "epoch": 0.25320043459776087, "grad_norm": 2.186825132936162, "learning_rate": 9.374411433779148e-07, "logits/chosen": 1.720703125, "logits/rejected": 1.966796875, "logps/chosen": -878.0, "logps/rejected": -884.0, "loss": 0.6062, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3226318359375, "rewards/margins": 3.9375, "rewards/rejected": -3.61328125, "step": 1340 }, { "epoch": 0.2533893901459682, "grad_norm": 2.0664121322806905, "learning_rate": 9.372900890467473e-07, "logits/chosen": 1.50390625, "logits/rejected": 1.6044921875, "logps/chosen": -459.0, "logps/rejected": -553.25, "loss": 0.8425, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05615234375, "rewards/margins": 1.3349609375, "rewards/rejected": -1.39208984375, "step": 1341 }, { "epoch": 0.2535783456941754, "grad_norm": 2.1127116621672295, "learning_rate": 9.371388662252616e-07, "logits/chosen": 1.88671875, "logits/rejected": 1.654296875, "logps/chosen": -764.0, "logps/rejected": -826.0, "loss": 0.6006, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4404296875, "rewards/margins": 3.666015625, "rewards/rejected": -3.23046875, "step": 1342 }, { "epoch": 0.2537673012423827, "grad_norm": 1.6580354167380622, "learning_rate": 9.369874749792476e-07, "logits/chosen": 1.72216796875, "logits/rejected": 1.68115234375, "logps/chosen": -1044.0, "logps/rejected": -1226.0, "loss": 0.665, "rewards/accuracies": 0.78125, "rewards/chosen": 0.52880859375, "rewards/margins": 3.763671875, "rewards/rejected": -3.2265625, "step": 1343 }, { "epoch": 0.25395625679059003, "grad_norm": 1.709917691774445, "learning_rate": 9.368359153745674e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.45703125, "logps/chosen": -584.0, "logps/rejected": -639.0, "loss": 0.723, "rewards/accuracies": 0.8125, "rewards/chosen": 0.769287109375, "rewards/margins": 2.3876953125, "rewards/rejected": -1.62255859375, "step": 1344 }, { "epoch": 0.2541452123387973, "grad_norm": 2.1202886148877824, "learning_rate": 9.366841874771573e-07, "logits/chosen": 1.8701171875, "logits/rejected": 2.22119140625, "logps/chosen": -902.0, "logps/rejected": -916.0, "loss": 0.7302, "rewards/accuracies": 0.65625, "rewards/chosen": 0.39501953125, "rewards/margins": 2.05078125, "rewards/rejected": -1.65234375, "step": 1345 }, { "epoch": 0.2543341678870046, "grad_norm": 2.1970534899066614, "learning_rate": 9.365322913530264e-07, "logits/chosen": 1.576171875, "logits/rejected": 1.5062103271484375, "logps/chosen": -1262.0, "logps/rejected": -1037.5, "loss": 0.5917, "rewards/accuracies": 0.84375, "rewards/chosen": 0.29833984375, "rewards/margins": 4.03125, "rewards/rejected": -3.73046875, "step": 1346 }, { "epoch": 0.2545231234352119, "grad_norm": 1.6762052437814705, "learning_rate": 9.363802270682568e-07, "logits/chosen": 2.24755859375, "logits/rejected": 2.2265625, "logps/chosen": -778.0, "logps/rejected": -805.0, "loss": 0.7109, "rewards/accuracies": 0.78125, "rewards/chosen": 0.72607421875, "rewards/margins": 3.0048828125, "rewards/rejected": -2.27978515625, "step": 1347 }, { "epoch": 0.25471207898341913, "grad_norm": 5.443844280923797, "learning_rate": 9.362279946890042e-07, "logits/chosen": 2.255859375, "logits/rejected": 2.734375, "logps/chosen": -841.5, "logps/rejected": -1334.0, "loss": 0.6619, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4970703125, "rewards/margins": 3.46484375, "rewards/rejected": -2.96875, "step": 1348 }, { "epoch": 0.25490103453162644, "grad_norm": 1.7082933302319152, "learning_rate": 9.360755942814971e-07, "logits/chosen": 0.91943359375, "logits/rejected": 1.01123046875, "logps/chosen": -603.5, "logps/rejected": -547.5, "loss": 0.8287, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33984375, "rewards/margins": 1.068359375, "rewards/rejected": -1.40771484375, "step": 1349 }, { "epoch": 0.25508999007983374, "grad_norm": 1.5478307294668907, "learning_rate": 9.359230259120372e-07, "logits/chosen": 2.03515625, "logits/rejected": 1.47021484375, "logps/chosen": -688.0, "logps/rejected": -702.0, "loss": 0.7396, "rewards/accuracies": 0.6875, "rewards/chosen": 0.71923828125, "rewards/margins": 2.93505859375, "rewards/rejected": -2.208740234375, "step": 1350 }, { "epoch": 0.255278945628041, "grad_norm": 1.5170746195062064, "learning_rate": 9.357702896469992e-07, "logits/chosen": 1.6396484375, "logits/rejected": 2.080078125, "logps/chosen": -715.0, "logps/rejected": -558.5, "loss": 0.6201, "rewards/accuracies": 0.90625, "rewards/chosen": 0.849609375, "rewards/margins": 3.58984375, "rewards/rejected": -2.734375, "step": 1351 }, { "epoch": 0.2554679011762483, "grad_norm": 1.776492982588551, "learning_rate": 9.356173855528311e-07, "logits/chosen": 2.224609375, "logits/rejected": 2.4326171875, "logps/chosen": -629.625, "logps/rejected": -791.5, "loss": 0.606, "rewards/accuracies": 0.875, "rewards/chosen": 1.129638671875, "rewards/margins": 3.390625, "rewards/rejected": -2.2578125, "step": 1352 }, { "epoch": 0.2556568567244556, "grad_norm": 1.9962456309784804, "learning_rate": 9.354643136960537e-07, "logits/chosen": 1.50390625, "logits/rejected": 1.65234375, "logps/chosen": -408.5, "logps/rejected": -998.0, "loss": 0.7476, "rewards/accuracies": 0.78125, "rewards/chosen": 0.40966796875, "rewards/margins": 2.125, "rewards/rejected": -1.71875, "step": 1353 }, { "epoch": 0.25584581227266284, "grad_norm": 2.367981826970881, "learning_rate": 9.353110741432607e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.4326171875, "logps/chosen": -841.25, "logps/rejected": -972.0, "loss": 0.7141, "rewards/accuracies": 0.65625, "rewards/chosen": 0.9537353515625, "rewards/margins": 2.73828125, "rewards/rejected": -1.7890625, "step": 1354 }, { "epoch": 0.25603476782087015, "grad_norm": 2.075888074744316, "learning_rate": 9.35157666961119e-07, "logits/chosen": 1.4462890625, "logits/rejected": 1.57421875, "logps/chosen": -932.5, "logps/rejected": -831.5, "loss": 0.6393, "rewards/accuracies": 0.75, "rewards/chosen": 1.0283203125, "rewards/margins": 3.19140625, "rewards/rejected": -2.158203125, "step": 1355 }, { "epoch": 0.25622372336907745, "grad_norm": 1.7162790684944067, "learning_rate": 9.350040922163682e-07, "logits/chosen": 1.896484375, "logits/rejected": 2.75, "logps/chosen": -661.5, "logps/rejected": -1589.0, "loss": 0.6401, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5472412109375, "rewards/margins": 3.81640625, "rewards/rejected": -3.259765625, "step": 1356 }, { "epoch": 0.2564126789172847, "grad_norm": 2.246095625590802, "learning_rate": 9.348503499758211e-07, "logits/chosen": 1.435546875, "logits/rejected": 1.87109375, "logps/chosen": -678.0, "logps/rejected": -714.5, "loss": 0.7096, "rewards/accuracies": 0.75, "rewards/chosen": 0.413818359375, "rewards/margins": 2.1796875, "rewards/rejected": -1.767578125, "step": 1357 }, { "epoch": 0.256601634465492, "grad_norm": 2.1014378521405725, "learning_rate": 9.346964403063627e-07, "logits/chosen": 1.03857421875, "logits/rejected": 1.4775390625, "logps/chosen": -793.0, "logps/rejected": -870.5, "loss": 0.682, "rewards/accuracies": 0.8125, "rewards/chosen": 0.91796875, "rewards/margins": 3.837890625, "rewards/rejected": -2.9189453125, "step": 1358 }, { "epoch": 0.2567905900136993, "grad_norm": 1.7131726362498887, "learning_rate": 9.345423632749519e-07, "logits/chosen": 1.638671875, "logits/rejected": 1.5072021484375, "logps/chosen": -482.5, "logps/rejected": -563.5, "loss": 0.7238, "rewards/accuracies": 0.78125, "rewards/chosen": 0.374267578125, "rewards/margins": 2.533203125, "rewards/rejected": -2.158203125, "step": 1359 }, { "epoch": 0.25697954556190655, "grad_norm": 2.1709208208854687, "learning_rate": 9.343881189486195e-07, "logits/chosen": 1.14697265625, "logits/rejected": 1.04150390625, "logps/chosen": -1121.0, "logps/rejected": -1092.0, "loss": 0.621, "rewards/accuracies": 0.8125, "rewards/chosen": 1.064453125, "rewards/margins": 3.37890625, "rewards/rejected": -2.3125, "step": 1360 }, { "epoch": 0.25716850111011386, "grad_norm": 2.5494413441376444, "learning_rate": 9.342337073944692e-07, "logits/chosen": 2.443359375, "logits/rejected": 1.8380126953125, "logps/chosen": -741.0, "logps/rejected": -693.0, "loss": 0.6311, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3936767578125, "rewards/margins": 2.9609375, "rewards/rejected": -2.5703125, "step": 1361 }, { "epoch": 0.2573574566583211, "grad_norm": 2.120840635750742, "learning_rate": 9.340791286796781e-07, "logits/chosen": 1.486328125, "logits/rejected": 1.63671875, "logps/chosen": -655.0, "logps/rejected": -575.5, "loss": 0.5935, "rewards/accuracies": 0.875, "rewards/chosen": 0.6875, "rewards/margins": 3.9296875, "rewards/rejected": -3.2421875, "step": 1362 }, { "epoch": 0.2575464122065284, "grad_norm": 2.382935041277075, "learning_rate": 9.339243828714954e-07, "logits/chosen": 1.061767578125, "logits/rejected": 0.3968505859375, "logps/chosen": -1016.0, "logps/rejected": -767.0, "loss": 0.5993, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3935546875, "rewards/margins": 3.8203125, "rewards/rejected": -3.42578125, "step": 1363 }, { "epoch": 0.2577353677547357, "grad_norm": 1.8682570899428284, "learning_rate": 9.33769470037243e-07, "logits/chosen": 1.5816650390625, "logits/rejected": 1.51171875, "logps/chosen": -909.0, "logps/rejected": -1144.0, "loss": 0.6055, "rewards/accuracies": 0.8125, "rewards/chosen": 1.14892578125, "rewards/margins": 4.328125, "rewards/rejected": -3.17578125, "step": 1364 }, { "epoch": 0.25792432330294296, "grad_norm": 2.423507403061969, "learning_rate": 9.336143902443159e-07, "logits/chosen": 2.662109375, "logits/rejected": 2.58203125, "logps/chosen": -654.0, "logps/rejected": -716.5, "loss": 0.6109, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4228515625, "rewards/margins": 3.54296875, "rewards/rejected": -3.1171875, "step": 1365 }, { "epoch": 0.25811327885115026, "grad_norm": 1.6778326818174802, "learning_rate": 9.334591435601812e-07, "logits/chosen": 2.5, "logits/rejected": 2.828125, "logps/chosen": -764.5, "logps/rejected": -950.0, "loss": 0.7857, "rewards/accuracies": 0.625, "rewards/chosen": 0.765625, "rewards/margins": 2.451171875, "rewards/rejected": -1.6796875, "step": 1366 }, { "epoch": 0.25830223439935757, "grad_norm": 1.9440780708725631, "learning_rate": 9.33303730052379e-07, "logits/chosen": 2.8125, "logits/rejected": 2.720703125, "logps/chosen": -797.0, "logps/rejected": -733.5, "loss": 0.642, "rewards/accuracies": 0.8125, "rewards/chosen": 1.08544921875, "rewards/margins": 3.23046875, "rewards/rejected": -2.14208984375, "step": 1367 }, { "epoch": 0.2584911899475648, "grad_norm": 1.9639058713441333, "learning_rate": 9.331481497885221e-07, "logits/chosen": 2.216796875, "logits/rejected": 2.28515625, "logps/chosen": -607.5, "logps/rejected": -1035.0, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": 1.02734375, "rewards/margins": 3.14453125, "rewards/rejected": -2.1171875, "step": 1368 }, { "epoch": 0.2586801454957721, "grad_norm": 2.1444164346723382, "learning_rate": 9.329924028362952e-07, "logits/chosen": 1.54296875, "logits/rejected": 1.3974609375, "logps/chosen": -898.0, "logps/rejected": -1196.0, "loss": 0.548, "rewards/accuracies": 0.84375, "rewards/chosen": 0.48779296875, "rewards/margins": 4.33984375, "rewards/rejected": -3.85546875, "step": 1369 }, { "epoch": 0.2588691010439794, "grad_norm": 2.189659063171361, "learning_rate": 9.328364892634562e-07, "logits/chosen": 1.2197265625, "logits/rejected": 1.18603515625, "logps/chosen": -671.0, "logps/rejected": -815.0, "loss": 0.6607, "rewards/accuracies": 0.78125, "rewards/chosen": -0.10595703125, "rewards/margins": 3.3515625, "rewards/rejected": -3.4609375, "step": 1370 }, { "epoch": 0.25905805659218667, "grad_norm": 2.8825411639464282, "learning_rate": 9.32680409137835e-07, "logits/chosen": 1.73974609375, "logits/rejected": 1.2333984375, "logps/chosen": -765.0, "logps/rejected": -743.0, "loss": 0.6424, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4755859375, "rewards/margins": 3.494140625, "rewards/rejected": -3.025390625, "step": 1371 }, { "epoch": 0.259247012140394, "grad_norm": 1.5410064821025002, "learning_rate": 9.325241625273343e-07, "logits/chosen": 2.111328125, "logits/rejected": 2.107421875, "logps/chosen": -844.0, "logps/rejected": -870.5, "loss": 0.6492, "rewards/accuracies": 0.75, "rewards/chosen": 0.7515869140625, "rewards/margins": 3.8125, "rewards/rejected": -3.0625, "step": 1372 }, { "epoch": 0.2594359676886013, "grad_norm": 1.5493813879633898, "learning_rate": 9.323677494999289e-07, "logits/chosen": 1.53759765625, "logits/rejected": 1.056640625, "logps/chosen": -558.5, "logps/rejected": -18530.0, "loss": 0.6254, "rewards/accuracies": 0.875, "rewards/chosen": 0.55419921875, "rewards/margins": 67.23828125, "rewards/rejected": -66.3203125, "step": 1373 }, { "epoch": 0.2596249232368085, "grad_norm": 2.662853343713208, "learning_rate": 9.322111701236665e-07, "logits/chosen": 1.921875, "logits/rejected": 2.15625, "logps/chosen": -1162.0, "logps/rejected": -1223.5, "loss": 0.5392, "rewards/accuracies": 0.90625, "rewards/chosen": 2.05078125, "rewards/margins": 4.734375, "rewards/rejected": -2.685546875, "step": 1374 }, { "epoch": 0.25981387878501583, "grad_norm": 1.578353732909439, "learning_rate": 9.320544244666668e-07, "logits/chosen": 2.36328125, "logits/rejected": 2.625, "logps/chosen": -905.5, "logps/rejected": -10337.0, "loss": 0.6804, "rewards/accuracies": 0.75, "rewards/chosen": 1.0830078125, "rewards/margins": 16.1015625, "rewards/rejected": -15.01171875, "step": 1375 }, { "epoch": 0.26000283433322313, "grad_norm": 2.730200873606219, "learning_rate": 9.318975125971216e-07, "logits/chosen": 2.6025390625, "logits/rejected": 2.07470703125, "logps/chosen": -572.5, "logps/rejected": -575.0, "loss": 0.5356, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1644287109375, "rewards/margins": 3.96484375, "rewards/rejected": -2.798828125, "step": 1376 }, { "epoch": 0.2601917898814304, "grad_norm": 2.746017698265355, "learning_rate": 9.317404345832955e-07, "logits/chosen": 1.7939453125, "logits/rejected": 2.1806640625, "logps/chosen": -838.5, "logps/rejected": -795.0, "loss": 0.5701, "rewards/accuracies": 0.9375, "rewards/chosen": 0.97674560546875, "rewards/margins": 3.58984375, "rewards/rejected": -2.6083984375, "step": 1377 }, { "epoch": 0.2603807454296377, "grad_norm": 1.914496336500059, "learning_rate": 9.315831904935253e-07, "logits/chosen": 2.240234375, "logits/rejected": 2.919921875, "logps/chosen": -850.0, "logps/rejected": -1010.0, "loss": 0.6848, "rewards/accuracies": 0.75, "rewards/chosen": 1.318359375, "rewards/margins": 3.7734375, "rewards/rejected": -2.451171875, "step": 1378 }, { "epoch": 0.260569700977845, "grad_norm": 2.243551586264431, "learning_rate": 9.314257803962198e-07, "logits/chosen": 1.93017578125, "logits/rejected": 1.63134765625, "logps/chosen": -1028.0, "logps/rejected": -862.0, "loss": 0.583, "rewards/accuracies": 0.90625, "rewards/chosen": 1.279296875, "rewards/margins": 3.796875, "rewards/rejected": -2.5234375, "step": 1379 }, { "epoch": 0.26075865652605223, "grad_norm": 3.853164207081859, "learning_rate": 9.312682043598603e-07, "logits/chosen": 2.25, "logits/rejected": 1.990234375, "logps/chosen": -823.5, "logps/rejected": -882.5, "loss": 0.5663, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2705078125, "rewards/margins": 3.8984375, "rewards/rejected": -2.630859375, "step": 1380 }, { "epoch": 0.26094761207425954, "grad_norm": 1.8522754536827568, "learning_rate": 9.311104624530002e-07, "logits/chosen": 1.9541015625, "logits/rejected": 1.697265625, "logps/chosen": -1145.0, "logps/rejected": -1150.0, "loss": 0.5776, "rewards/accuracies": 0.8125, "rewards/chosen": 0.515380859375, "rewards/margins": 3.58203125, "rewards/rejected": -3.06640625, "step": 1381 }, { "epoch": 0.26113656762246684, "grad_norm": 2.160456616979671, "learning_rate": 9.309525547442648e-07, "logits/chosen": 1.6201171875, "logits/rejected": 1.64013671875, "logps/chosen": -867.0, "logps/rejected": -1067.0, "loss": 0.57, "rewards/accuracies": 0.84375, "rewards/chosen": 0.68328857421875, "rewards/margins": 4.22265625, "rewards/rejected": -3.54296875, "step": 1382 }, { "epoch": 0.2613255231706741, "grad_norm": 4.060227043525233, "learning_rate": 9.307944813023518e-07, "logits/chosen": 2.654296875, "logits/rejected": 2.76171875, "logps/chosen": -764.0, "logps/rejected": -805.5, "loss": 0.7686, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01953125, "rewards/margins": 2.0703125, "rewards/rejected": -2.0439453125, "step": 1383 }, { "epoch": 0.2615144787188814, "grad_norm": 2.1062025801648563, "learning_rate": 9.30636242196031e-07, "logits/chosen": 0.92578125, "logits/rejected": 0.8583984375, "logps/chosen": -474.5, "logps/rejected": -15712.0, "loss": 0.6991, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3856201171875, "rewards/margins": 63.564453125, "rewards/rejected": -63.947265625, "step": 1384 }, { "epoch": 0.26170343426708864, "grad_norm": 1.7464127464538926, "learning_rate": 9.304778374941442e-07, "logits/chosen": 1.751953125, "logits/rejected": 1.81640625, "logps/chosen": -878.0, "logps/rejected": -1110.0, "loss": 0.6634, "rewards/accuracies": 0.78125, "rewards/chosen": 0.648193359375, "rewards/margins": 3.6796875, "rewards/rejected": -3.037109375, "step": 1385 }, { "epoch": 0.26189238981529595, "grad_norm": 2.6516098757250335, "learning_rate": 9.303192672656055e-07, "logits/chosen": 2.0419921875, "logits/rejected": 1.85498046875, "logps/chosen": -742.0, "logps/rejected": -973.0, "loss": 0.6302, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08154296875, "rewards/margins": 3.798828125, "rewards/rejected": -3.703125, "step": 1386 }, { "epoch": 0.26208134536350325, "grad_norm": 2.291229964262031, "learning_rate": 9.301605315794005e-07, "logits/chosen": 1.1953125, "logits/rejected": 0.9912109375, "logps/chosen": -746.0, "logps/rejected": -800.0, "loss": 0.6577, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5308837890625, "rewards/margins": 3.33984375, "rewards/rejected": -2.81640625, "step": 1387 }, { "epoch": 0.2622703009117105, "grad_norm": 2.2181982322284135, "learning_rate": 9.300016305045871e-07, "logits/chosen": 1.69140625, "logits/rejected": 1.867156982421875, "logps/chosen": -929.5, "logps/rejected": -1195.0, "loss": 0.6228, "rewards/accuracies": 0.8125, "rewards/chosen": -0.317626953125, "rewards/margins": 3.615234375, "rewards/rejected": -3.9296875, "step": 1388 }, { "epoch": 0.2624592564599178, "grad_norm": 3.1472745817259047, "learning_rate": 9.298425641102952e-07, "logits/chosen": 1.6494140625, "logits/rejected": 1.49609375, "logps/chosen": -822.5, "logps/rejected": -1303.0, "loss": 0.6982, "rewards/accuracies": 0.75, "rewards/chosen": 0.04241943359375, "rewards/margins": 3.79296875, "rewards/rejected": -3.75, "step": 1389 }, { "epoch": 0.2626482120081251, "grad_norm": 2.669949052541854, "learning_rate": 9.296833324657266e-07, "logits/chosen": 1.8267822265625, "logits/rejected": 2.1376953125, "logps/chosen": -1151.0, "logps/rejected": -939.5, "loss": 0.6504, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4580078125, "rewards/margins": 3.423095703125, "rewards/rejected": -2.9609375, "step": 1390 }, { "epoch": 0.26283716755633235, "grad_norm": 3.1429251139081784, "learning_rate": 9.295239356401547e-07, "logits/chosen": 1.609375, "logits/rejected": 1.93212890625, "logps/chosen": -1079.5, "logps/rejected": -1037.5, "loss": 0.5687, "rewards/accuracies": 0.84375, "rewards/chosen": 0.48779296875, "rewards/margins": 4.69921875, "rewards/rejected": -4.20703125, "step": 1391 }, { "epoch": 0.26302612310453966, "grad_norm": 2.758437539847755, "learning_rate": 9.293643737029252e-07, "logits/chosen": 2.248046875, "logits/rejected": 1.7799072265625, "logps/chosen": -826.0, "logps/rejected": -1127.0, "loss": 0.7107, "rewards/accuracies": 0.8125, "rewards/chosen": 0.456298828125, "rewards/margins": 2.83203125, "rewards/rejected": -2.375, "step": 1392 }, { "epoch": 0.26321507865274696, "grad_norm": 1.9124401244169913, "learning_rate": 9.292046467234553e-07, "logits/chosen": 1.259033203125, "logits/rejected": 1.654296875, "logps/chosen": -987.5, "logps/rejected": -964.0, "loss": 0.6533, "rewards/accuracies": 0.8125, "rewards/chosen": 0.791015625, "rewards/margins": 3.396484375, "rewards/rejected": -2.607421875, "step": 1393 }, { "epoch": 0.2634040342009542, "grad_norm": 3.5069636069031596, "learning_rate": 9.290447547712344e-07, "logits/chosen": 2.5751953125, "logits/rejected": 2.15625, "logps/chosen": -952.0, "logps/rejected": -1063.0, "loss": 0.6637, "rewards/accuracies": 0.875, "rewards/chosen": 0.738037109375, "rewards/margins": 4.34765625, "rewards/rejected": -3.60546875, "step": 1394 }, { "epoch": 0.2635929897491615, "grad_norm": 2.010924804553323, "learning_rate": 9.28884697915823e-07, "logits/chosen": 1.5166015625, "logits/rejected": 1.50390625, "logps/chosen": -801.5, "logps/rejected": -967.0, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": 0.852294921875, "rewards/margins": 4.44921875, "rewards/rejected": -3.59765625, "step": 1395 }, { "epoch": 0.2637819452973688, "grad_norm": 2.0681162101335038, "learning_rate": 9.287244762268538e-07, "logits/chosen": 1.533203125, "logits/rejected": 1.099609375, "logps/chosen": -824.0, "logps/rejected": -1047.0, "loss": 0.6369, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3443603515625, "rewards/margins": 3.6015625, "rewards/rejected": -3.25390625, "step": 1396 }, { "epoch": 0.26397090084557606, "grad_norm": 1.4591298637701293, "learning_rate": 9.285640897740315e-07, "logits/chosen": 2.001953125, "logits/rejected": 1.72802734375, "logps/chosen": -605.0, "logps/rejected": -817.0, "loss": 0.6836, "rewards/accuracies": 0.78125, "rewards/chosen": 0.91845703125, "rewards/margins": 2.89453125, "rewards/rejected": -1.9765625, "step": 1397 }, { "epoch": 0.26415985639378337, "grad_norm": 2.7570917579112506, "learning_rate": 9.284035386271319e-07, "logits/chosen": 2.724609375, "logits/rejected": 2.580078125, "logps/chosen": -805.0, "logps/rejected": -874.0, "loss": 0.6845, "rewards/accuracies": 0.78125, "rewards/chosen": 0.869140625, "rewards/margins": 3.13671875, "rewards/rejected": -2.26953125, "step": 1398 }, { "epoch": 0.26434881194199067, "grad_norm": 2.507703389087237, "learning_rate": 9.282428228560025e-07, "logits/chosen": 1.8818359375, "logits/rejected": 1.65087890625, "logps/chosen": -766.0, "logps/rejected": -1478.0, "loss": 0.6125, "rewards/accuracies": 0.84375, "rewards/chosen": 0.72216796875, "rewards/margins": 5.578125, "rewards/rejected": -4.849609375, "step": 1399 }, { "epoch": 0.2645377674901979, "grad_norm": 2.29636626663999, "learning_rate": 9.280819425305628e-07, "logits/chosen": 2.0322265625, "logits/rejected": 2.19140625, "logps/chosen": -1059.0, "logps/rejected": -1138.0, "loss": 0.5304, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4443359375, "rewards/margins": 4.599609375, "rewards/rejected": -3.15283203125, "step": 1400 }, { "epoch": 0.2647267230384052, "grad_norm": 1.806042869934184, "learning_rate": 9.279208977208036e-07, "logits/chosen": 1.47998046875, "logits/rejected": 1.4512939453125, "logps/chosen": -668.5, "logps/rejected": -616.5, "loss": 0.6104, "rewards/accuracies": 0.90625, "rewards/chosen": 1.12890625, "rewards/margins": 2.931640625, "rewards/rejected": -1.7998046875, "step": 1401 }, { "epoch": 0.2649156785866125, "grad_norm": 1.576422087528761, "learning_rate": 9.277596884967875e-07, "logits/chosen": 1.72119140625, "logits/rejected": 1.893310546875, "logps/chosen": -840.5, "logps/rejected": -1058.0, "loss": 0.6812, "rewards/accuracies": 0.71875, "rewards/chosen": 0.82421875, "rewards/margins": 4.595703125, "rewards/rejected": -3.7734375, "step": 1402 }, { "epoch": 0.26510463413481977, "grad_norm": 2.5048870155882863, "learning_rate": 9.275983149286483e-07, "logits/chosen": 2.16552734375, "logits/rejected": 2.3896484375, "logps/chosen": -673.5, "logps/rejected": -916.0, "loss": 0.7373, "rewards/accuracies": 0.8125, "rewards/chosen": 0.73583984375, "rewards/margins": 2.642578125, "rewards/rejected": -1.8984375, "step": 1403 }, { "epoch": 0.2652935896830271, "grad_norm": 2.155343918198375, "learning_rate": 9.274367770865919e-07, "logits/chosen": 1.32421875, "logits/rejected": 1.25, "logps/chosen": -604.5, "logps/rejected": -565.0, "loss": 0.6141, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8759765625, "rewards/margins": 3.07421875, "rewards/rejected": -2.19921875, "step": 1404 }, { "epoch": 0.2654825452312344, "grad_norm": 2.090053487275063, "learning_rate": 9.272750750408945e-07, "logits/chosen": 1.3297119140625, "logits/rejected": 1.3291015625, "logps/chosen": -830.0, "logps/rejected": -767.5, "loss": 0.6813, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03515625, "rewards/margins": 2.02734375, "rewards/rejected": -1.9873046875, "step": 1405 }, { "epoch": 0.2656715007794416, "grad_norm": 1.8277719837632083, "learning_rate": 9.27113208861905e-07, "logits/chosen": 1.1923828125, "logits/rejected": 1.244140625, "logps/chosen": -966.5, "logps/rejected": -1059.0, "loss": 0.6156, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0400390625, "rewards/margins": 8.583984375, "rewards/rejected": -7.572265625, "step": 1406 }, { "epoch": 0.26586045632764893, "grad_norm": 2.974050001800443, "learning_rate": 9.269511786200431e-07, "logits/chosen": 1.8466796875, "logits/rejected": 2.0302734375, "logps/chosen": -731.0, "logps/rejected": -8707.0, "loss": 0.73, "rewards/accuracies": 0.6875, "rewards/chosen": 0.138671875, "rewards/margins": 140.234375, "rewards/rejected": -140.2421875, "step": 1407 }, { "epoch": 0.2660494118758562, "grad_norm": 2.598556321745529, "learning_rate": 9.267889843857998e-07, "logits/chosen": 1.8251953125, "logits/rejected": 0.986328125, "logps/chosen": -468.5, "logps/rejected": -471.0, "loss": 0.7039, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0445556640625, "rewards/margins": 2.75, "rewards/rejected": -2.7041015625, "step": 1408 }, { "epoch": 0.2662383674240635, "grad_norm": 2.2859219765679235, "learning_rate": 9.266266262297378e-07, "logits/chosen": 1.8563232421875, "logits/rejected": 1.2509765625, "logps/chosen": -914.0, "logps/rejected": -772.0, "loss": 0.4943, "rewards/accuracies": 0.875, "rewards/chosen": 0.861328125, "rewards/margins": 4.1328125, "rewards/rejected": -3.2734375, "step": 1409 }, { "epoch": 0.2664273229722708, "grad_norm": 2.203534572447248, "learning_rate": 9.264641042224908e-07, "logits/chosen": 1.07373046875, "logits/rejected": 1.241455078125, "logps/chosen": -686.5, "logps/rejected": -1216.0, "loss": 0.722, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51300048828125, "rewards/margins": 3.150390625, "rewards/rejected": -2.6328125, "step": 1410 }, { "epoch": 0.26661627852047803, "grad_norm": 1.4141491240941808, "learning_rate": 9.26301418434764e-07, "logits/chosen": 1.431640625, "logits/rejected": 2.1708984375, "logps/chosen": -422.0, "logps/rejected": -2279.5, "loss": 0.8234, "rewards/accuracies": 0.6875, "rewards/chosen": 0.28271484375, "rewards/margins": 3.20703125, "rewards/rejected": -2.9267578125, "step": 1411 }, { "epoch": 0.26680523406868534, "grad_norm": 1.7416432429376976, "learning_rate": 9.261385689373337e-07, "logits/chosen": 1.054443359375, "logits/rejected": 0.88238525390625, "logps/chosen": -512.0, "logps/rejected": -644.0, "loss": 0.7105, "rewards/accuracies": 0.78125, "rewards/chosen": 0.64599609375, "rewards/margins": 2.58203125, "rewards/rejected": -1.935546875, "step": 1412 }, { "epoch": 0.26699418961689264, "grad_norm": 2.123878862636371, "learning_rate": 9.259755558010473e-07, "logits/chosen": 2.048828125, "logits/rejected": 2.427734375, "logps/chosen": -923.0, "logps/rejected": -1003.0, "loss": 0.5996, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2255859375, "rewards/margins": 3.4375, "rewards/rejected": -2.2021484375, "step": 1413 }, { "epoch": 0.2671831451650999, "grad_norm": 2.5218310944384568, "learning_rate": 9.258123790968237e-07, "logits/chosen": 2.220703125, "logits/rejected": 2.3515625, "logps/chosen": -1558.0, "logps/rejected": -1362.0, "loss": 0.5275, "rewards/accuracies": 0.8125, "rewards/chosen": 1.828125, "rewards/margins": 4.4375, "rewards/rejected": -2.607421875, "step": 1414 }, { "epoch": 0.2673721007133072, "grad_norm": 1.639680721903814, "learning_rate": 9.25649038895653e-07, "logits/chosen": 2.369140625, "logits/rejected": 2.1953125, "logps/chosen": -863.0, "logps/rejected": -747.5, "loss": 0.5948, "rewards/accuracies": 0.8125, "rewards/chosen": 1.216796875, "rewards/margins": 3.90234375, "rewards/rejected": -2.6875, "step": 1415 }, { "epoch": 0.2675610562615145, "grad_norm": 2.1942608573332616, "learning_rate": 9.254855352685962e-07, "logits/chosen": 2.251953125, "logits/rejected": 2.076171875, "logps/chosen": -660.5, "logps/rejected": -716.0, "loss": 0.6835, "rewards/accuracies": 0.71875, "rewards/chosen": 0.763916015625, "rewards/margins": 2.78125, "rewards/rejected": -2.013671875, "step": 1416 }, { "epoch": 0.26775001180972174, "grad_norm": 2.0264707241191258, "learning_rate": 9.253218682867851e-07, "logits/chosen": 1.1171875, "logits/rejected": 0.845458984375, "logps/chosen": -927.5, "logps/rejected": -742.0, "loss": 0.6774, "rewards/accuracies": 0.84375, "rewards/chosen": 0.85546875, "rewards/margins": 2.736328125, "rewards/rejected": -1.8759765625, "step": 1417 }, { "epoch": 0.26793896735792905, "grad_norm": 1.6330838610379355, "learning_rate": 9.251580380214234e-07, "logits/chosen": 1.3876953125, "logits/rejected": 1.400390625, "logps/chosen": -985.0, "logps/rejected": -1326.0, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": 0.95703125, "rewards/margins": 4.69140625, "rewards/rejected": -3.744140625, "step": 1418 }, { "epoch": 0.26812792290613635, "grad_norm": 3.345096278368, "learning_rate": 9.24994044543785e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.28125, "logps/chosen": -943.0, "logps/rejected": -1211.0, "loss": 0.6016, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5380859375, "rewards/margins": 3.49609375, "rewards/rejected": -2.953125, "step": 1419 }, { "epoch": 0.2683168784543436, "grad_norm": 1.46150560321109, "learning_rate": 9.248298879252155e-07, "logits/chosen": 1.904296875, "logits/rejected": 2.322265625, "logps/chosen": -1038.0, "logps/rejected": -1916.0, "loss": 0.5118, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0986328125, "rewards/margins": 6.265625, "rewards/rejected": -5.16796875, "step": 1420 }, { "epoch": 0.2685058340025509, "grad_norm": 2.468035139433207, "learning_rate": 9.246655682371308e-07, "logits/chosen": 1.427978515625, "logits/rejected": 1.3203125, "logps/chosen": -1088.0, "logps/rejected": -860.5, "loss": 0.5283, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0263671875, "rewards/margins": 3.875, "rewards/rejected": -2.8515625, "step": 1421 }, { "epoch": 0.2686947895507582, "grad_norm": 1.8818634071817182, "learning_rate": 9.245010855510181e-07, "logits/chosen": 2.318359375, "logits/rejected": 2.8212890625, "logps/chosen": -618.0, "logps/rejected": -941.0, "loss": 0.6467, "rewards/accuracies": 0.75, "rewards/chosen": 0.77734375, "rewards/margins": 3.1083984375, "rewards/rejected": -2.3299560546875, "step": 1422 }, { "epoch": 0.26888374509896545, "grad_norm": 1.9585920521794102, "learning_rate": 9.243364399384358e-07, "logits/chosen": 2.177001953125, "logits/rejected": 2.33349609375, "logps/chosen": -639.0, "logps/rejected": -519.75, "loss": 0.7819, "rewards/accuracies": 0.78125, "rewards/chosen": 0.62353515625, "rewards/margins": 2.06689453125, "rewards/rejected": -1.4423828125, "step": 1423 }, { "epoch": 0.26907270064717276, "grad_norm": 2.058204589189185, "learning_rate": 9.241716314710126e-07, "logits/chosen": 1.923828125, "logits/rejected": 1.9609375, "logps/chosen": -422.75, "logps/rejected": -539.5, "loss": 0.782, "rewards/accuracies": 0.625, "rewards/chosen": 0.684326171875, "rewards/margins": 1.85546875, "rewards/rejected": -1.171875, "step": 1424 }, { "epoch": 0.26926165619538006, "grad_norm": 2.381978448787129, "learning_rate": 9.240066602204484e-07, "logits/chosen": 1.8291015625, "logits/rejected": 1.9140625, "logps/chosen": -729.0, "logps/rejected": -590.0, "loss": 0.6544, "rewards/accuracies": 0.75, "rewards/chosen": 0.71044921875, "rewards/margins": 2.7001953125, "rewards/rejected": -1.9931640625, "step": 1425 }, { "epoch": 0.2694506117435873, "grad_norm": 1.854655552905433, "learning_rate": 9.238415262585138e-07, "logits/chosen": 1.4423828125, "logits/rejected": 1.4814453125, "logps/chosen": -747.0, "logps/rejected": -739.5, "loss": 0.6049, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7752685546875, "rewards/margins": 3.90234375, "rewards/rejected": -3.12109375, "step": 1426 }, { "epoch": 0.2696395672917946, "grad_norm": 1.700172845357795, "learning_rate": 9.236762296570504e-07, "logits/chosen": 1.349609375, "logits/rejected": 1.39453125, "logps/chosen": -616.0, "logps/rejected": -349.25, "loss": 0.7783, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2080078125, "rewards/margins": 1.259765625, "rewards/rejected": -1.05078125, "step": 1427 }, { "epoch": 0.2698285228400019, "grad_norm": 1.7876657748408167, "learning_rate": 9.235107704879703e-07, "logits/chosen": 2.041015625, "logits/rejected": 1.826171875, "logps/chosen": -977.0, "logps/rejected": -849.0, "loss": 0.6371, "rewards/accuracies": 0.84375, "rewards/chosen": 1.09619140625, "rewards/margins": 3.1708984375, "rewards/rejected": -2.0693359375, "step": 1428 }, { "epoch": 0.27001747838820916, "grad_norm": 2.237168942055256, "learning_rate": 9.233451488232563e-07, "logits/chosen": 0.8486328125, "logits/rejected": 0.760986328125, "logps/chosen": -886.0, "logps/rejected": -879.0, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 0.434814453125, "rewards/margins": 2.814453125, "rewards/rejected": -2.376953125, "step": 1429 }, { "epoch": 0.27020643393641647, "grad_norm": 2.1069795167825798, "learning_rate": 9.231793647349621e-07, "logits/chosen": 1.736328125, "logits/rejected": 1.4638671875, "logps/chosen": -1107.0, "logps/rejected": -1046.0, "loss": 0.5651, "rewards/accuracies": 0.8125, "rewards/chosen": 1.244140625, "rewards/margins": 4.0078125, "rewards/rejected": -2.76171875, "step": 1430 }, { "epoch": 0.2703953894846237, "grad_norm": 3.1514274237466884, "learning_rate": 9.230134182952119e-07, "logits/chosen": 2.1953125, "logits/rejected": 1.78125, "logps/chosen": -549.5, "logps/rejected": -616.0, "loss": 0.5878, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8175048828125, "rewards/margins": 3.73046875, "rewards/rejected": -2.9140625, "step": 1431 }, { "epoch": 0.270584345032831, "grad_norm": 1.7468071729105188, "learning_rate": 9.228473095762008e-07, "logits/chosen": 2.24951171875, "logits/rejected": 1.64453125, "logps/chosen": -867.0, "logps/rejected": -782.0, "loss": 0.6143, "rewards/accuracies": 0.8125, "rewards/chosen": 1.044189453125, "rewards/margins": 3.677734375, "rewards/rejected": -2.63671875, "step": 1432 }, { "epoch": 0.2707733005810383, "grad_norm": 1.906584486661897, "learning_rate": 9.22681038650194e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.921875, "logps/chosen": -439.75, "logps/rejected": -1162.0, "loss": 0.739, "rewards/accuracies": 0.75, "rewards/chosen": 0.164794921875, "rewards/margins": 3.34375, "rewards/rejected": -3.177734375, "step": 1433 }, { "epoch": 0.27096225612924557, "grad_norm": 2.1024914097366776, "learning_rate": 9.225146055895277e-07, "logits/chosen": 1.828125, "logits/rejected": 1.9677734375, "logps/chosen": -1091.0, "logps/rejected": -1910.0, "loss": 0.5418, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6923828125, "rewards/margins": 5.84375, "rewards/rejected": -5.16015625, "step": 1434 }, { "epoch": 0.2711512116774529, "grad_norm": 1.7899087038225134, "learning_rate": 9.223480104666086e-07, "logits/chosen": 1.833984375, "logits/rejected": 1.583984375, "logps/chosen": -605.0, "logps/rejected": -898.5, "loss": 0.5269, "rewards/accuracies": 0.875, "rewards/chosen": 0.704833984375, "rewards/margins": 4.890625, "rewards/rejected": -4.181640625, "step": 1435 }, { "epoch": 0.2713401672256602, "grad_norm": 2.316942284485987, "learning_rate": 9.221812533539138e-07, "logits/chosen": 1.845703125, "logits/rejected": 1.849609375, "logps/chosen": -903.0, "logps/rejected": -1828.0, "loss": 0.5562, "rewards/accuracies": 0.875, "rewards/chosen": 0.7783203125, "rewards/margins": 5.14453125, "rewards/rejected": -4.35546875, "step": 1436 }, { "epoch": 0.2715291227738674, "grad_norm": 2.2428446515017404, "learning_rate": 9.220143343239906e-07, "logits/chosen": 2.267578125, "logits/rejected": 1.875, "logps/chosen": -1277.0, "logps/rejected": -1357.0, "loss": 0.5296, "rewards/accuracies": 0.8125, "rewards/chosen": 1.431640625, "rewards/margins": 5.140625, "rewards/rejected": -3.703125, "step": 1437 }, { "epoch": 0.27171807832207473, "grad_norm": 2.164657637620252, "learning_rate": 9.218472534494573e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.388671875, "logps/chosen": -644.5, "logps/rejected": -2377.5, "loss": 0.6669, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8193359375, "rewards/margins": 5.26953125, "rewards/rejected": -4.453125, "step": 1438 }, { "epoch": 0.27190703387028203, "grad_norm": 1.8779559601001323, "learning_rate": 9.216800108030023e-07, "logits/chosen": 1.4267578125, "logits/rejected": 1.62744140625, "logps/chosen": -821.5, "logps/rejected": -1017.0, "loss": 0.6192, "rewards/accuracies": 0.875, "rewards/chosen": 1.0252685546875, "rewards/margins": 3.50390625, "rewards/rejected": -2.478515625, "step": 1439 }, { "epoch": 0.2720959894184893, "grad_norm": 1.6459449428383859, "learning_rate": 9.215126064573843e-07, "logits/chosen": 1.001953125, "logits/rejected": 0.69921875, "logps/chosen": -714.0, "logps/rejected": -858.0, "loss": 0.5197, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9140625, "rewards/margins": 3.900390625, "rewards/rejected": -2.9814453125, "step": 1440 }, { "epoch": 0.2722849449666966, "grad_norm": 3.563235159935221, "learning_rate": 9.213450404854324e-07, "logits/chosen": 1.680908203125, "logits/rejected": 1.3125, "logps/chosen": -668.0, "logps/rejected": -924.0, "loss": 0.6371, "rewards/accuracies": 0.8125, "rewards/chosen": -1.32080078125, "rewards/margins": 1.5703125, "rewards/rejected": -2.88671875, "step": 1441 }, { "epoch": 0.2724739005149039, "grad_norm": 1.7273266308066877, "learning_rate": 9.211773129600466e-07, "logits/chosen": 2.140625, "logits/rejected": 2.732421875, "logps/chosen": -686.0, "logps/rejected": -2361.5, "loss": 0.6706, "rewards/accuracies": 0.78125, "rewards/chosen": 0.635498046875, "rewards/margins": 5.0625, "rewards/rejected": -4.42578125, "step": 1442 }, { "epoch": 0.27266285606311114, "grad_norm": 1.7115970477663864, "learning_rate": 9.21009423954196e-07, "logits/chosen": 1.384765625, "logits/rejected": 0.697265625, "logps/chosen": -1046.0, "logps/rejected": -638.0, "loss": 0.6216, "rewards/accuracies": 0.8125, "rewards/chosen": -0.314056396484375, "rewards/margins": 2.43359375, "rewards/rejected": -2.75, "step": 1443 }, { "epoch": 0.27285181161131844, "grad_norm": 2.2018064325096414, "learning_rate": 9.208413735409208e-07, "logits/chosen": 1.9052734375, "logits/rejected": 1.52978515625, "logps/chosen": -887.0, "logps/rejected": -943.0, "loss": 0.6366, "rewards/accuracies": 0.8125, "rewards/chosen": 0.546875, "rewards/margins": 2.8828125, "rewards/rejected": -2.330078125, "step": 1444 }, { "epoch": 0.27304076715952574, "grad_norm": 2.4570535970923015, "learning_rate": 9.206731617933315e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.701171875, "logps/chosen": -708.0, "logps/rejected": -968.5, "loss": 0.6093, "rewards/accuracies": 0.78125, "rewards/chosen": 1.04638671875, "rewards/margins": 4.33203125, "rewards/rejected": -3.28125, "step": 1445 }, { "epoch": 0.273229722707733, "grad_norm": 3.1745241573347327, "learning_rate": 9.205047887846081e-07, "logits/chosen": 2.34375, "logits/rejected": 2.3828125, "logps/chosen": -1229.0, "logps/rejected": -1110.0, "loss": 0.5348, "rewards/accuracies": 0.875, "rewards/chosen": 1.1982421875, "rewards/margins": 4.0546875, "rewards/rejected": -2.857421875, "step": 1446 }, { "epoch": 0.2734186782559403, "grad_norm": 2.25122818572645, "learning_rate": 9.203362545880016e-07, "logits/chosen": 1.4970703125, "logits/rejected": 1.552734375, "logps/chosen": -1005.0, "logps/rejected": -1011.0, "loss": 0.5659, "rewards/accuracies": 0.78125, "rewards/chosen": 0.999267578125, "rewards/margins": 4.7578125, "rewards/rejected": -3.765625, "step": 1447 }, { "epoch": 0.2736076338041476, "grad_norm": 2.6684946072550435, "learning_rate": 9.201675592768324e-07, "logits/chosen": 1.61279296875, "logits/rejected": 2.275390625, "logps/chosen": -887.0, "logps/rejected": -739.0, "loss": 0.763, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4423828125, "rewards/margins": 2.0625, "rewards/rejected": -1.62176513671875, "step": 1448 }, { "epoch": 0.27379658935235485, "grad_norm": 2.1181999093019646, "learning_rate": 9.199987029244912e-07, "logits/chosen": 2.2236328125, "logits/rejected": 1.845703125, "logps/chosen": -863.5, "logps/rejected": -895.5, "loss": 0.5907, "rewards/accuracies": 0.875, "rewards/chosen": 0.8583984375, "rewards/margins": 3.41015625, "rewards/rejected": -2.55078125, "step": 1449 }, { "epoch": 0.27398554490056215, "grad_norm": 1.9043475220809603, "learning_rate": 9.198296856044393e-07, "logits/chosen": 2.064208984375, "logits/rejected": 2.435546875, "logps/chosen": -544.5, "logps/rejected": -1216.0, "loss": 0.7023, "rewards/accuracies": 0.8125, "rewards/chosen": 0.580078125, "rewards/margins": 3.650390625, "rewards/rejected": -3.076171875, "step": 1450 }, { "epoch": 0.27417450044876945, "grad_norm": 2.6825128634854067, "learning_rate": 9.196605073902073e-07, "logits/chosen": 2.10546875, "logits/rejected": 2.140625, "logps/chosen": -705.5, "logps/rejected": -624.5, "loss": 0.7865, "rewards/accuracies": 0.6875, "rewards/chosen": 0.190185546875, "rewards/margins": 2.05859375, "rewards/rejected": -1.8720703125, "step": 1451 }, { "epoch": 0.2743634559969767, "grad_norm": 2.8712001734211996, "learning_rate": 9.19491168355396e-07, "logits/chosen": 2.416015625, "logits/rejected": 2.38671875, "logps/chosen": -744.0, "logps/rejected": -849.5, "loss": 0.6149, "rewards/accuracies": 0.8125, "rewards/chosen": 0.51123046875, "rewards/margins": 3.265625, "rewards/rejected": -2.748046875, "step": 1452 }, { "epoch": 0.274552411545184, "grad_norm": 1.8702782189652218, "learning_rate": 9.193216685736764e-07, "logits/chosen": 2.541015625, "logits/rejected": 2.4921875, "logps/chosen": -1859.0, "logps/rejected": -975.0, "loss": 0.6273, "rewards/accuracies": 0.84375, "rewards/chosen": -0.362548828125, "rewards/margins": 3.23046875, "rewards/rejected": -3.58984375, "step": 1453 }, { "epoch": 0.27474136709339125, "grad_norm": 1.5811507512303693, "learning_rate": 9.191520081187894e-07, "logits/chosen": 1.6826171875, "logits/rejected": 1.595703125, "logps/chosen": -475.0, "logps/rejected": -678.5, "loss": 0.7317, "rewards/accuracies": 0.8125, "rewards/chosen": 0.168243408203125, "rewards/margins": 2.826171875, "rewards/rejected": -2.66015625, "step": 1454 }, { "epoch": 0.27493032264159856, "grad_norm": 3.5424186670781284, "learning_rate": 9.189821870645453e-07, "logits/chosen": 2.2177734375, "logits/rejected": 1.5560302734375, "logps/chosen": -735.0, "logps/rejected": -670.0, "loss": 0.6985, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1036376953125, "rewards/margins": 3.109375, "rewards/rejected": -3.0078125, "step": 1455 }, { "epoch": 0.27511927818980586, "grad_norm": 1.8926721025096342, "learning_rate": 9.188122054848248e-07, "logits/chosen": 1.43017578125, "logits/rejected": 1.501953125, "logps/chosen": -729.0, "logps/rejected": -694.0, "loss": 0.5008, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7919921875, "rewards/margins": 3.890625, "rewards/rejected": -3.095703125, "step": 1456 }, { "epoch": 0.2753082337380131, "grad_norm": 2.266844100315318, "learning_rate": 9.186420634535783e-07, "logits/chosen": 3.078125, "logits/rejected": 3.255859375, "logps/chosen": -604.0, "logps/rejected": -657.0, "loss": 0.6899, "rewards/accuracies": 0.78125, "rewards/chosen": 0.756103515625, "rewards/margins": 3.16015625, "rewards/rejected": -2.40625, "step": 1457 }, { "epoch": 0.2754971892862204, "grad_norm": 3.267402125039186, "learning_rate": 9.184717610448262e-07, "logits/chosen": 2.55859375, "logits/rejected": 3.109375, "logps/chosen": -1316.0, "logps/rejected": -1795.0, "loss": 0.4857, "rewards/accuracies": 0.9375, "rewards/chosen": 2.3515625, "rewards/margins": 5.73046875, "rewards/rejected": -3.388671875, "step": 1458 }, { "epoch": 0.2756861448344277, "grad_norm": 2.088162180300396, "learning_rate": 9.183012983326581e-07, "logits/chosen": 2.0533447265625, "logits/rejected": 1.7822265625, "logps/chosen": -1525.0, "logps/rejected": -1790.0, "loss": 0.5293, "rewards/accuracies": 0.90625, "rewards/chosen": 2.009765625, "rewards/margins": 5.55859375, "rewards/rejected": -3.55859375, "step": 1459 }, { "epoch": 0.27587510038263496, "grad_norm": 2.857637700100958, "learning_rate": 9.18130675391234e-07, "logits/chosen": 1.268798828125, "logits/rejected": 1.884765625, "logps/chosen": -556.0, "logps/rejected": -820.0, "loss": 0.695, "rewards/accuracies": 0.8125, "rewards/chosen": 0.441162109375, "rewards/margins": 2.849609375, "rewards/rejected": -2.40234375, "step": 1460 }, { "epoch": 0.27606405593084227, "grad_norm": 2.340491927458465, "learning_rate": 9.179598922947831e-07, "logits/chosen": 1.6279296875, "logits/rejected": 2.056640625, "logps/chosen": -488.5, "logps/rejected": -758.0, "loss": 0.568, "rewards/accuracies": 0.875, "rewards/chosen": 0.17822265625, "rewards/margins": 4.44921875, "rewards/rejected": -4.265625, "step": 1461 }, { "epoch": 0.27625301147904957, "grad_norm": 1.7152803185927377, "learning_rate": 9.177889491176047e-07, "logits/chosen": 2.037109375, "logits/rejected": 1.7109375, "logps/chosen": -445.75, "logps/rejected": -513.5, "loss": 0.5876, "rewards/accuracies": 0.8125, "rewards/chosen": -0.142822265625, "rewards/margins": 3.3671875, "rewards/rejected": -3.515625, "step": 1462 }, { "epoch": 0.2764419670272568, "grad_norm": 2.3233814364738654, "learning_rate": 9.176178459340672e-07, "logits/chosen": 1.691650390625, "logits/rejected": 1.74609375, "logps/chosen": -678.5, "logps/rejected": -806.0, "loss": 0.6755, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3583984375, "rewards/margins": 3.044921875, "rewards/rejected": -2.69140625, "step": 1463 }, { "epoch": 0.2766309225754641, "grad_norm": 2.364145578461517, "learning_rate": 9.174465828186093e-07, "logits/chosen": 2.2109375, "logits/rejected": 1.673828125, "logps/chosen": -708.0, "logps/rejected": -435.0, "loss": 0.7402, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1378173828125, "rewards/margins": 2.255859375, "rewards/rejected": -2.119140625, "step": 1464 }, { "epoch": 0.2768198781236714, "grad_norm": 2.756236799477773, "learning_rate": 9.172751598457387e-07, "logits/chosen": 1.994140625, "logits/rejected": 2.041015625, "logps/chosen": -676.5, "logps/rejected": -429.0, "loss": 0.6764, "rewards/accuracies": 0.71875, "rewards/chosen": -0.33642578125, "rewards/margins": 2.720703125, "rewards/rejected": -3.0546875, "step": 1465 }, { "epoch": 0.2770088336718787, "grad_norm": 1.8582281397767555, "learning_rate": 9.171035770900329e-07, "logits/chosen": 1.966796875, "logits/rejected": 1.9228515625, "logps/chosen": -1085.0, "logps/rejected": -1091.0, "loss": 0.5953, "rewards/accuracies": 0.8125, "rewards/chosen": 0.919921875, "rewards/margins": 4.8125, "rewards/rejected": -3.90625, "step": 1466 }, { "epoch": 0.277197789220086, "grad_norm": 2.35436498006677, "learning_rate": 9.169318346261387e-07, "logits/chosen": 2.24609375, "logits/rejected": 1.369140625, "logps/chosen": -1720.5, "logps/rejected": -860.5, "loss": 0.7273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1805419921875, "rewards/margins": 3.0009765625, "rewards/rejected": -3.181640625, "step": 1467 }, { "epoch": 0.2773867447682933, "grad_norm": 1.8549926840739082, "learning_rate": 9.16759932528773e-07, "logits/chosen": 1.265625, "logits/rejected": 1.0732421875, "logps/chosen": -1031.0, "logps/rejected": -937.5, "loss": 0.5753, "rewards/accuracies": 0.84375, "rewards/chosen": 1.03759765625, "rewards/margins": 4.5, "rewards/rejected": -3.4609375, "step": 1468 }, { "epoch": 0.27757570031650053, "grad_norm": 2.1897489591260273, "learning_rate": 9.165878708727213e-07, "logits/chosen": 2.24169921875, "logits/rejected": 2.2412109375, "logps/chosen": -1145.0, "logps/rejected": -1431.0, "loss": 0.6118, "rewards/accuracies": 0.8125, "rewards/chosen": 0.70947265625, "rewards/margins": 3.92578125, "rewards/rejected": -3.21875, "step": 1469 }, { "epoch": 0.27776465586470783, "grad_norm": 2.4948094916977857, "learning_rate": 9.16415649732839e-07, "logits/chosen": 1.6220703125, "logits/rejected": 1.4599609375, "logps/chosen": -1090.0, "logps/rejected": -1250.0, "loss": 0.6094, "rewards/accuracies": 0.90625, "rewards/chosen": 0.45947265625, "rewards/margins": 4.5546875, "rewards/rejected": -4.0859375, "step": 1470 }, { "epoch": 0.27795361141291514, "grad_norm": 2.1224317563208666, "learning_rate": 9.162432691840509e-07, "logits/chosen": 1.6298828125, "logits/rejected": 2.201171875, "logps/chosen": -908.0, "logps/rejected": -1184.0, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": 1.076904296875, "rewards/margins": 4.212890625, "rewards/rejected": -3.150390625, "step": 1471 }, { "epoch": 0.2781425669611224, "grad_norm": 2.2188875515243534, "learning_rate": 9.160707293013512e-07, "logits/chosen": 1.880615234375, "logits/rejected": 1.6484375, "logps/chosen": -642.0, "logps/rejected": -556.5, "loss": 0.6536, "rewards/accuracies": 0.75, "rewards/chosen": 0.80511474609375, "rewards/margins": 2.412109375, "rewards/rejected": -1.603515625, "step": 1472 }, { "epoch": 0.2783315225093297, "grad_norm": 1.941050267382452, "learning_rate": 9.158980301598029e-07, "logits/chosen": 2.138671875, "logits/rejected": 2.291015625, "logps/chosen": -718.5, "logps/rejected": -551.5, "loss": 0.6543, "rewards/accuracies": 0.78125, "rewards/chosen": 0.70751953125, "rewards/margins": 3.171875, "rewards/rejected": -2.46484375, "step": 1473 }, { "epoch": 0.278520478057537, "grad_norm": 2.471136376122282, "learning_rate": 9.157251718345388e-07, "logits/chosen": 1.76953125, "logits/rejected": 1.597900390625, "logps/chosen": -434.5, "logps/rejected": -1338.5, "loss": 0.7657, "rewards/accuracies": 0.625, "rewards/chosen": 0.259765625, "rewards/margins": 2.650390625, "rewards/rejected": -2.39453125, "step": 1474 }, { "epoch": 0.27870943360574424, "grad_norm": 2.307301505699733, "learning_rate": 9.155521544007609e-07, "logits/chosen": 1.990234375, "logits/rejected": 1.744140625, "logps/chosen": -1069.0, "logps/rejected": -938.0, "loss": 0.5865, "rewards/accuracies": 0.84375, "rewards/chosen": 1.158203125, "rewards/margins": 3.69921875, "rewards/rejected": -2.546875, "step": 1475 }, { "epoch": 0.27889838915395154, "grad_norm": 1.8391573181320875, "learning_rate": 9.1537897793374e-07, "logits/chosen": 2.1015625, "logits/rejected": 2.056640625, "logps/chosen": -772.5, "logps/rejected": -944.0, "loss": 0.5317, "rewards/accuracies": 0.9375, "rewards/chosen": 0.683349609375, "rewards/margins": 5.05078125, "rewards/rejected": -4.37109375, "step": 1476 }, { "epoch": 0.2790873447021588, "grad_norm": 1.7684542332813629, "learning_rate": 9.152056425088171e-07, "logits/chosen": 2.208984375, "logits/rejected": 2.78125, "logps/chosen": -688.5, "logps/rejected": -1223.0, "loss": 0.7127, "rewards/accuracies": 0.75, "rewards/chosen": -0.145751953125, "rewards/margins": 3.6171875, "rewards/rejected": -3.7578125, "step": 1477 }, { "epoch": 0.2792763002503661, "grad_norm": 2.611771091221625, "learning_rate": 9.150321482014012e-07, "logits/chosen": 1.9775390625, "logits/rejected": 1.7001953125, "logps/chosen": -998.0, "logps/rejected": -8366.0, "loss": 0.6843, "rewards/accuracies": 0.71875, "rewards/chosen": 0.396484375, "rewards/margins": 83.923828125, "rewards/rejected": -83.7568359375, "step": 1478 }, { "epoch": 0.2794652557985734, "grad_norm": 2.101686207986844, "learning_rate": 9.148584950869709e-07, "logits/chosen": 2.3515625, "logits/rejected": 1.8583984375, "logps/chosen": -991.0, "logps/rejected": -962.0, "loss": 0.5346, "rewards/accuracies": 0.90625, "rewards/chosen": 1.111328125, "rewards/margins": 4.16015625, "rewards/rejected": -3.05078125, "step": 1479 }, { "epoch": 0.27965421134678065, "grad_norm": 2.1021150848613708, "learning_rate": 9.146846832410739e-07, "logits/chosen": 1.01025390625, "logits/rejected": 0.59765625, "logps/chosen": -764.5, "logps/rejected": -737.0, "loss": 0.5042, "rewards/accuracies": 0.9375, "rewards/chosen": 0.58642578125, "rewards/margins": 4.4609375, "rewards/rejected": -3.8828125, "step": 1480 }, { "epoch": 0.27984316689498795, "grad_norm": 1.380225931873027, "learning_rate": 9.145107127393269e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.720703125, "logps/chosen": -764.0, "logps/rejected": -1393.0, "loss": 0.6211, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6669921875, "rewards/margins": 5.47265625, "rewards/rejected": -4.791015625, "step": 1481 }, { "epoch": 0.28003212244319525, "grad_norm": 3.2427814362278697, "learning_rate": 9.143365836574158e-07, "logits/chosen": 2.349609375, "logits/rejected": 2.49609375, "logps/chosen": -1153.0, "logps/rejected": -1245.0, "loss": 0.5529, "rewards/accuracies": 0.9375, "rewards/chosen": 0.63909912109375, "rewards/margins": 3.67578125, "rewards/rejected": -3.0390625, "step": 1482 }, { "epoch": 0.2802210779914025, "grad_norm": 2.1393781091873914, "learning_rate": 9.141622960710953e-07, "logits/chosen": 2.09375, "logits/rejected": 2.3046875, "logps/chosen": -784.0, "logps/rejected": -1128.5, "loss": 0.6146, "rewards/accuracies": 0.75, "rewards/chosen": -0.04461669921875, "rewards/margins": 4.625, "rewards/rejected": -4.671875, "step": 1483 }, { "epoch": 0.2804100335396098, "grad_norm": 4.434973264306296, "learning_rate": 9.13987850056189e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.75390625, "logps/chosen": -738.0, "logps/rejected": -897.0, "loss": 0.5376, "rewards/accuracies": 0.875, "rewards/chosen": 1.359375, "rewards/margins": 3.76171875, "rewards/rejected": -2.39453125, "step": 1484 }, { "epoch": 0.2805989890878171, "grad_norm": 2.691789694258919, "learning_rate": 9.138132456885898e-07, "logits/chosen": 2.19921875, "logits/rejected": 1.5986328125, "logps/chosen": -338.75, "logps/rejected": -327.25, "loss": 0.7018, "rewards/accuracies": 0.78125, "rewards/chosen": 0.091552734375, "rewards/margins": 2.4228515625, "rewards/rejected": -2.333984375, "step": 1485 }, { "epoch": 0.28078794463602436, "grad_norm": 2.5859562632948263, "learning_rate": 9.13638483044259e-07, "logits/chosen": 2.76953125, "logits/rejected": 2.6171875, "logps/chosen": -880.0, "logps/rejected": -757.5, "loss": 0.6902, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3792724609375, "rewards/margins": 2.68359375, "rewards/rejected": -2.30859375, "step": 1486 }, { "epoch": 0.28097690018423166, "grad_norm": 2.4321514273452736, "learning_rate": 9.13463562199227e-07, "logits/chosen": 2.09619140625, "logits/rejected": 2.25, "logps/chosen": -976.0, "logps/rejected": -1454.0, "loss": 0.6975, "rewards/accuracies": 0.75, "rewards/chosen": 0.169921875, "rewards/margins": 3.5625, "rewards/rejected": -3.390625, "step": 1487 }, { "epoch": 0.28116585573243896, "grad_norm": 2.5391376593542696, "learning_rate": 9.132884832295932e-07, "logits/chosen": 1.982421875, "logits/rejected": 1.892578125, "logps/chosen": -594.0, "logps/rejected": -611.5, "loss": 0.6751, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2900390625, "rewards/margins": 2.498046875, "rewards/rejected": -2.203125, "step": 1488 }, { "epoch": 0.2813548112806462, "grad_norm": 2.9409087215220486, "learning_rate": 9.131132462115253e-07, "logits/chosen": 2.01171875, "logits/rejected": 2.08203125, "logps/chosen": -882.0, "logps/rejected": -867.0, "loss": 0.6769, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18505859375, "rewards/margins": 3.291015625, "rewards/rejected": -3.47265625, "step": 1489 }, { "epoch": 0.2815437668288535, "grad_norm": 2.3051241616051494, "learning_rate": 9.129378512212601e-07, "logits/chosen": 1.880859375, "logits/rejected": 1.880859375, "logps/chosen": -625.0, "logps/rejected": -844.0, "loss": 0.6499, "rewards/accuracies": 0.75, "rewards/chosen": -0.03826904296875, "rewards/margins": 3.34765625, "rewards/rejected": -3.39453125, "step": 1490 }, { "epoch": 0.2817327223770608, "grad_norm": 2.364939364105873, "learning_rate": 9.127622983351032e-07, "logits/chosen": 1.21484375, "logits/rejected": 0.834228515625, "logps/chosen": -929.0, "logps/rejected": -731.0, "loss": 0.6236, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07470703125, "rewards/margins": 3.55078125, "rewards/rejected": -3.625, "step": 1491 }, { "epoch": 0.28192167792526807, "grad_norm": 1.7398417368755255, "learning_rate": 9.12586587629429e-07, "logits/chosen": 1.5859375, "logits/rejected": 1.3544921875, "logps/chosen": -1389.0, "logps/rejected": -1158.0, "loss": 0.5299, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2734375, "rewards/margins": 4.7421875, "rewards/rejected": -4.46875, "step": 1492 }, { "epoch": 0.28211063347347537, "grad_norm": 5.005751023983683, "learning_rate": 9.124107191806799e-07, "logits/chosen": 1.162109375, "logits/rejected": 0.89501953125, "logps/chosen": -817.5, "logps/rejected": -816.5, "loss": 0.706, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4150390625, "rewards/margins": 3.38671875, "rewards/rejected": -3.80078125, "step": 1493 }, { "epoch": 0.2822995890216827, "grad_norm": 2.151181714865686, "learning_rate": 9.122346930653676e-07, "logits/chosen": 0.7626953125, "logits/rejected": 0.7490234375, "logps/chosen": -555.0, "logps/rejected": -536.0, "loss": 0.726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.821044921875, "rewards/margins": 2.9189453125, "rewards/rejected": -3.74609375, "step": 1494 }, { "epoch": 0.2824885445698899, "grad_norm": 1.9365165747129875, "learning_rate": 9.120585093600721e-07, "logits/chosen": 2.037109375, "logits/rejected": 1.95703125, "logps/chosen": -644.0, "logps/rejected": -1349.0, "loss": 0.6928, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4765625, "rewards/margins": 4.0, "rewards/rejected": -4.47265625, "step": 1495 }, { "epoch": 0.2826775001180972, "grad_norm": 2.993132661039417, "learning_rate": 9.11882168141442e-07, "logits/chosen": 1.94140625, "logits/rejected": 1.5859375, "logps/chosen": -538.5, "logps/rejected": -561.0, "loss": 0.675, "rewards/accuracies": 0.78125, "rewards/chosen": -0.344970703125, "rewards/margins": 3.0625, "rewards/rejected": -3.40625, "step": 1496 }, { "epoch": 0.28286645566630453, "grad_norm": 2.087512132638428, "learning_rate": 9.117056694861945e-07, "logits/chosen": 2.3125, "logits/rejected": 2.55078125, "logps/chosen": -1124.0, "logps/rejected": -1098.0, "loss": 0.5115, "rewards/accuracies": 0.9375, "rewards/chosen": 0.33721923828125, "rewards/margins": 4.93359375, "rewards/rejected": -4.59375, "step": 1497 }, { "epoch": 0.2830554112145118, "grad_norm": 2.6259696077012844, "learning_rate": 9.115290134711153e-07, "logits/chosen": 3.052734375, "logits/rejected": 3.16015625, "logps/chosen": -928.0, "logps/rejected": -1691.0, "loss": 0.5299, "rewards/accuracies": 0.875, "rewards/chosen": 0.3720703125, "rewards/margins": 4.98828125, "rewards/rejected": -4.6171875, "step": 1498 }, { "epoch": 0.2832443667627191, "grad_norm": 3.0962891532826853, "learning_rate": 9.113522001730583e-07, "logits/chosen": 2.865234375, "logits/rejected": 2.787109375, "logps/chosen": -687.0, "logps/rejected": -913.5, "loss": 0.7363, "rewards/accuracies": 0.71875, "rewards/chosen": -0.520751953125, "rewards/margins": 2.759765625, "rewards/rejected": -3.283203125, "step": 1499 }, { "epoch": 0.28343332231092633, "grad_norm": 2.2804185731333897, "learning_rate": 9.11175229668946e-07, "logits/chosen": 0.8583984375, "logits/rejected": 1.0888671875, "logps/chosen": -483.0, "logps/rejected": -1637.0, "loss": 0.6938, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1104736328125, "rewards/margins": 2.91015625, "rewards/rejected": -3.015625, "step": 1500 }, { "epoch": 0.28362227785913363, "grad_norm": 2.7582429031428513, "learning_rate": 9.109981020357695e-07, "logits/chosen": 2.189453125, "logits/rejected": 2.51171875, "logps/chosen": -854.0, "logps/rejected": -1173.0, "loss": 0.6413, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14208984375, "rewards/margins": 3.21875, "rewards/rejected": -3.08203125, "step": 1501 }, { "epoch": 0.28381123340734093, "grad_norm": 3.0810953419821416, "learning_rate": 9.10820817350588e-07, "logits/chosen": 1.9462890625, "logits/rejected": 1.90216064453125, "logps/chosen": -570.5, "logps/rejected": -1276.5, "loss": 0.7601, "rewards/accuracies": 0.75, "rewards/chosen": -0.49267578125, "rewards/margins": 2.8046875, "rewards/rejected": -3.294921875, "step": 1502 }, { "epoch": 0.2840001889555482, "grad_norm": 2.7196608794186155, "learning_rate": 9.106433756905292e-07, "logits/chosen": 3.49609375, "logits/rejected": 3.65234375, "logps/chosen": -907.0, "logps/rejected": -1571.5, "loss": 0.6933, "rewards/accuracies": 0.71875, "rewards/chosen": 0.698974609375, "rewards/margins": 5.008056640625, "rewards/rejected": -4.3189849853515625, "step": 1503 }, { "epoch": 0.2841891445037555, "grad_norm": 2.1768978970265733, "learning_rate": 9.104657771327887e-07, "logits/chosen": 1.802734375, "logits/rejected": 1.7275390625, "logps/chosen": -1050.0, "logps/rejected": -989.0, "loss": 0.6021, "rewards/accuracies": 0.78125, "rewards/chosen": 0.54736328125, "rewards/margins": 3.41015625, "rewards/rejected": -2.87109375, "step": 1504 }, { "epoch": 0.2843781000519628, "grad_norm": 2.159931796199466, "learning_rate": 9.102880217546308e-07, "logits/chosen": 0.875244140625, "logits/rejected": 0.9805908203125, "logps/chosen": -860.0, "logps/rejected": -772.5, "loss": 0.5551, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7225341796875, "rewards/margins": 3.6796875, "rewards/rejected": -2.951171875, "step": 1505 }, { "epoch": 0.28456705560017004, "grad_norm": 1.8796965815880404, "learning_rate": 9.101101096333881e-07, "logits/chosen": 2.5234375, "logits/rejected": 2.88671875, "logps/chosen": -608.0, "logps/rejected": -720.5, "loss": 0.5945, "rewards/accuracies": 0.8125, "rewards/chosen": 0.327301025390625, "rewards/margins": 3.40234375, "rewards/rejected": -3.07421875, "step": 1506 }, { "epoch": 0.28475601114837734, "grad_norm": 1.754432861648931, "learning_rate": 9.099320408464608e-07, "logits/chosen": 1.4443359375, "logits/rejected": 1.341796875, "logps/chosen": -524.5, "logps/rejected": -532.0, "loss": 0.6614, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1483154296875, "rewards/margins": 3.038818359375, "rewards/rejected": -2.888671875, "step": 1507 }, { "epoch": 0.28494496669658465, "grad_norm": 2.8566152821674016, "learning_rate": 9.097538154713176e-07, "logits/chosen": 2.5185546875, "logits/rejected": 2.138671875, "logps/chosen": -1281.0, "logps/rejected": -909.0, "loss": 0.5562, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2001953125, "rewards/margins": 3.7578125, "rewards/rejected": -2.564453125, "step": 1508 }, { "epoch": 0.2851339222447919, "grad_norm": 2.851741132953462, "learning_rate": 9.095754335854958e-07, "logits/chosen": 2.4609375, "logits/rejected": 2.9140625, "logps/chosen": -964.0, "logps/rejected": -1066.5, "loss": 0.659, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9765625, "rewards/margins": 3.671875, "rewards/rejected": -2.701171875, "step": 1509 }, { "epoch": 0.2853228777929992, "grad_norm": 2.0091244286929775, "learning_rate": 9.093968952666e-07, "logits/chosen": 2.455078125, "logits/rejected": 2.294921875, "logps/chosen": -754.0, "logps/rejected": -970.0, "loss": 0.6221, "rewards/accuracies": 0.75, "rewards/chosen": 1.0595703125, "rewards/margins": 3.45703125, "rewards/rejected": -2.400390625, "step": 1510 }, { "epoch": 0.2855118333412065, "grad_norm": 2.0776690903178463, "learning_rate": 9.092182005923031e-07, "logits/chosen": 2.921875, "logits/rejected": 2.873046875, "logps/chosen": -782.0, "logps/rejected": -790.0, "loss": 0.6253, "rewards/accuracies": 0.75, "rewards/chosen": -0.158203125, "rewards/margins": 2.9453125, "rewards/rejected": -3.1015625, "step": 1511 }, { "epoch": 0.28570078888941375, "grad_norm": 3.018701652273752, "learning_rate": 9.090393496403462e-07, "logits/chosen": 2.55859375, "logits/rejected": 2.91015625, "logps/chosen": -1024.0, "logps/rejected": -880.0, "loss": 0.6357, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0048828125, "rewards/margins": 4.05859375, "rewards/rejected": -3.044921875, "step": 1512 }, { "epoch": 0.28588974443762105, "grad_norm": 1.7309183311566427, "learning_rate": 9.088603424885385e-07, "logits/chosen": 1.462890625, "logits/rejected": 1.0166015625, "logps/chosen": -751.0, "logps/rejected": -553.5, "loss": 0.5415, "rewards/accuracies": 0.875, "rewards/chosen": 0.96026611328125, "rewards/margins": 4.125, "rewards/rejected": -3.16796875, "step": 1513 }, { "epoch": 0.28607869998582836, "grad_norm": 1.9291337688711336, "learning_rate": 9.086811792147566e-07, "logits/chosen": 2.25, "logits/rejected": 2.455078125, "logps/chosen": -580.5, "logps/rejected": -785.0, "loss": 0.578, "rewards/accuracies": 0.875, "rewards/chosen": 0.59228515625, "rewards/margins": 3.3125, "rewards/rejected": -2.71875, "step": 1514 }, { "epoch": 0.2862676555340356, "grad_norm": 1.8096439548809735, "learning_rate": 9.085018598969457e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.89453125, "logps/chosen": -1125.0, "logps/rejected": -1066.0, "loss": 0.64, "rewards/accuracies": 0.78125, "rewards/chosen": 1.513671875, "rewards/margins": 4.4375, "rewards/rejected": -2.935546875, "step": 1515 }, { "epoch": 0.2864566110822429, "grad_norm": 2.8552778454094008, "learning_rate": 9.083223846131183e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.2578125, "logps/chosen": -624.0, "logps/rejected": -1010.0, "loss": 0.5984, "rewards/accuracies": 0.71875, "rewards/chosen": 0.85009765625, "rewards/margins": 3.27734375, "rewards/rejected": -2.421875, "step": 1516 }, { "epoch": 0.2866455666304502, "grad_norm": 1.6903566739966258, "learning_rate": 9.081427534413552e-07, "logits/chosen": 2.9296875, "logits/rejected": 3.203125, "logps/chosen": -811.0, "logps/rejected": -641.5, "loss": 0.657, "rewards/accuracies": 0.78125, "rewards/chosen": 0.33203125, "rewards/margins": 2.880859375, "rewards/rejected": -2.546875, "step": 1517 }, { "epoch": 0.28683452217865746, "grad_norm": 2.6382929098224244, "learning_rate": 9.079629664598048e-07, "logits/chosen": 2.71484375, "logits/rejected": 3.078125, "logps/chosen": -975.0, "logps/rejected": -1189.5, "loss": 0.6582, "rewards/accuracies": 0.8125, "rewards/chosen": 0.794921875, "rewards/margins": 7.69140625, "rewards/rejected": -6.892578125, "step": 1518 }, { "epoch": 0.28702347772686476, "grad_norm": 3.0551669269081856, "learning_rate": 9.077830237466831e-07, "logits/chosen": 2.44140625, "logits/rejected": 2.865234375, "logps/chosen": -863.0, "logps/rejected": -1168.0, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": 0.89453125, "rewards/margins": 3.783203125, "rewards/rejected": -2.892578125, "step": 1519 }, { "epoch": 0.28721243327507207, "grad_norm": 2.5215203649901325, "learning_rate": 9.076029253802744e-07, "logits/chosen": 2.99609375, "logits/rejected": 3.37109375, "logps/chosen": -904.5, "logps/rejected": -1810.0, "loss": 0.7189, "rewards/accuracies": 0.6875, "rewards/chosen": 0.294921875, "rewards/margins": 4.033203125, "rewards/rejected": -3.72998046875, "step": 1520 }, { "epoch": 0.2874013888232793, "grad_norm": 3.003398831937226, "learning_rate": 9.074226714389302e-07, "logits/chosen": 3.18359375, "logits/rejected": 3.6484375, "logps/chosen": -532.0, "logps/rejected": -872.5, "loss": 0.7065, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0087890625, "rewards/margins": 3.07421875, "rewards/rejected": -3.060546875, "step": 1521 }, { "epoch": 0.2875903443714866, "grad_norm": 2.0194084360524602, "learning_rate": 9.072422620010698e-07, "logits/chosen": 2.3984375, "logits/rejected": 2.41796875, "logps/chosen": -943.0, "logps/rejected": -928.0, "loss": 0.6497, "rewards/accuracies": 0.75, "rewards/chosen": 0.5791015625, "rewards/margins": 3.330078125, "rewards/rejected": -2.75, "step": 1522 }, { "epoch": 0.28777929991969387, "grad_norm": 3.3304005735338933, "learning_rate": 9.070616971451804e-07, "logits/chosen": 1.9921875, "logits/rejected": 2.3203125, "logps/chosen": -1123.5, "logps/rejected": -1341.0, "loss": 0.6465, "rewards/accuracies": 0.75, "rewards/chosen": 1.3896484375, "rewards/margins": 3.974609375, "rewards/rejected": -2.580078125, "step": 1523 }, { "epoch": 0.28796825546790117, "grad_norm": 2.8759576534792943, "learning_rate": 9.068809769498167e-07, "logits/chosen": 2.0546875, "logits/rejected": 2.0546875, "logps/chosen": -605.5, "logps/rejected": -590.0, "loss": 0.7241, "rewards/accuracies": 0.875, "rewards/chosen": 0.3310546875, "rewards/margins": 2.328125, "rewards/rejected": -1.99609375, "step": 1524 }, { "epoch": 0.28815721101610847, "grad_norm": 1.7658411198266408, "learning_rate": 9.067001014936006e-07, "logits/chosen": 1.1826171875, "logits/rejected": 1.435546875, "logps/chosen": -709.5, "logps/rejected": -1017.0, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.53076171875, "rewards/margins": 3.041015625, "rewards/rejected": -2.51171875, "step": 1525 }, { "epoch": 0.2883461665643157, "grad_norm": 2.574949131604438, "learning_rate": 9.065190708552219e-07, "logits/chosen": 2.134765625, "logits/rejected": 1.630859375, "logps/chosen": -864.5, "logps/rejected": -1093.0, "loss": 0.5397, "rewards/accuracies": 0.875, "rewards/chosen": 0.8779296875, "rewards/margins": 4.08984375, "rewards/rejected": -3.21484375, "step": 1526 }, { "epoch": 0.288535122112523, "grad_norm": 1.9958600545977214, "learning_rate": 9.063378851134383e-07, "logits/chosen": 2.939453125, "logits/rejected": 3.052734375, "logps/chosen": -802.5, "logps/rejected": -932.0, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": 0.69921875, "rewards/margins": 2.5078125, "rewards/rejected": -1.8095703125, "step": 1527 }, { "epoch": 0.2887240776607303, "grad_norm": 1.788994164989651, "learning_rate": 9.061565443470739e-07, "logits/chosen": 2.009765625, "logits/rejected": 1.45068359375, "logps/chosen": -658.0, "logps/rejected": -556.0, "loss": 0.6214, "rewards/accuracies": 0.84375, "rewards/chosen": 0.622314453125, "rewards/margins": 3.3046875, "rewards/rejected": -2.6875, "step": 1528 }, { "epoch": 0.2889130332089376, "grad_norm": 1.8536952322501326, "learning_rate": 9.059750486350212e-07, "logits/chosen": 1.880859375, "logits/rejected": 2.26953125, "logps/chosen": -797.0, "logps/rejected": -1373.0, "loss": 0.6104, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1396484375, "rewards/margins": 3.69921875, "rewards/rejected": -2.560546875, "step": 1529 }, { "epoch": 0.2891019887571449, "grad_norm": 2.759144509651357, "learning_rate": 9.057933980562401e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.09765625, "logps/chosen": -1324.0, "logps/rejected": -1273.0, "loss": 0.4811, "rewards/accuracies": 0.9375, "rewards/chosen": 1.80859375, "rewards/margins": 5.453125, "rewards/rejected": -3.64453125, "step": 1530 }, { "epoch": 0.2892909443053522, "grad_norm": 2.49280672763201, "learning_rate": 9.056115926897572e-07, "logits/chosen": 2.3515625, "logits/rejected": 2.3671875, "logps/chosen": -876.5, "logps/rejected": -910.0, "loss": 0.5928, "rewards/accuracies": 0.8125, "rewards/chosen": 1.18505859375, "rewards/margins": 3.25, "rewards/rejected": -2.0703125, "step": 1531 }, { "epoch": 0.28947989985355943, "grad_norm": 2.9410846082191036, "learning_rate": 9.054296326146668e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.552734375, "logps/chosen": -824.0, "logps/rejected": -901.5, "loss": 0.6318, "rewards/accuracies": 0.78125, "rewards/chosen": 1.248046875, "rewards/margins": 3.15625, "rewards/rejected": -1.904296875, "step": 1532 }, { "epoch": 0.28966885540176673, "grad_norm": 2.2078383305059837, "learning_rate": 9.052475179101307e-07, "logits/chosen": 2.955078125, "logits/rejected": 3.01171875, "logps/chosen": -817.0, "logps/rejected": -830.0, "loss": 0.7949, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3505859375, "rewards/margins": 2.818359375, "rewards/rejected": -2.46875, "step": 1533 }, { "epoch": 0.28985781094997404, "grad_norm": 2.0772065871485874, "learning_rate": 9.050652486553778e-07, "logits/chosen": 2.126953125, "logits/rejected": 2.0732421875, "logps/chosen": -926.0, "logps/rejected": -688.0, "loss": 0.6353, "rewards/accuracies": 0.8125, "rewards/chosen": 0.57470703125, "rewards/margins": 2.71875, "rewards/rejected": -2.1484375, "step": 1534 }, { "epoch": 0.2900467664981813, "grad_norm": 1.736784310772043, "learning_rate": 9.04882824929704e-07, "logits/chosen": 2.556640625, "logits/rejected": 2.3515625, "logps/chosen": -772.0, "logps/rejected": -680.5, "loss": 0.6857, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5556640625, "rewards/margins": 2.775390625, "rewards/rejected": -2.2265625, "step": 1535 }, { "epoch": 0.2902357220463886, "grad_norm": 2.057791529720591, "learning_rate": 9.047002468124728e-07, "logits/chosen": 2.3203125, "logits/rejected": 2.66015625, "logps/chosen": -840.5, "logps/rejected": -1624.0, "loss": 0.5701, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0185546875, "rewards/margins": 4.6015625, "rewards/rejected": -3.5859375, "step": 1536 }, { "epoch": 0.2904246775945959, "grad_norm": 2.3184394465054075, "learning_rate": 9.045175143831149e-07, "logits/chosen": 2.390625, "logits/rejected": 2.3544921875, "logps/chosen": -602.5, "logps/rejected": -693.0, "loss": 0.7878, "rewards/accuracies": 0.75, "rewards/chosen": -0.12548828125, "rewards/margins": 2.033203125, "rewards/rejected": -2.158203125, "step": 1537 }, { "epoch": 0.29061363314280314, "grad_norm": 2.1777147718601793, "learning_rate": 9.043346277211274e-07, "logits/chosen": 2.2880859375, "logits/rejected": 2.162109375, "logps/chosen": -687.75, "logps/rejected": -672.0, "loss": 0.661, "rewards/accuracies": 0.8125, "rewards/chosen": 0.87353515625, "rewards/margins": 3.439453125, "rewards/rejected": -2.568359375, "step": 1538 }, { "epoch": 0.29080258869101044, "grad_norm": 2.4658926388490525, "learning_rate": 9.041515869060755e-07, "logits/chosen": 2.533203125, "logits/rejected": 2.1015625, "logps/chosen": -633.5, "logps/rejected": -742.0, "loss": 0.6267, "rewards/accuracies": 0.90625, "rewards/chosen": 0.56982421875, "rewards/margins": 3.109375, "rewards/rejected": -2.541015625, "step": 1539 }, { "epoch": 0.29099154423921775, "grad_norm": 2.9635537417025164, "learning_rate": 9.039683920175908e-07, "logits/chosen": 1.912109375, "logits/rejected": 2.060546875, "logps/chosen": -709.0, "logps/rejected": -621.0, "loss": 0.6881, "rewards/accuracies": 0.78125, "rewards/chosen": 0.813720703125, "rewards/margins": 2.765625, "rewards/rejected": -1.958984375, "step": 1540 }, { "epoch": 0.291180499787425, "grad_norm": 3.646502404645424, "learning_rate": 9.037850431353722e-07, "logits/chosen": 2.4775390625, "logits/rejected": 2.12939453125, "logps/chosen": -1146.5, "logps/rejected": -1329.0, "loss": 0.6741, "rewards/accuracies": 0.75, "rewards/chosen": -0.5771484375, "rewards/margins": 2.7520751953125, "rewards/rejected": -3.328125, "step": 1541 }, { "epoch": 0.2913694553356323, "grad_norm": 2.4615452979028065, "learning_rate": 9.036015403391855e-07, "logits/chosen": 2.28515625, "logits/rejected": 2.54296875, "logps/chosen": -611.5, "logps/rejected": -576.0, "loss": 0.8107, "rewards/accuracies": 0.65625, "rewards/chosen": -0.67529296875, "rewards/margins": 1.5087890625, "rewards/rejected": -2.1865234375, "step": 1542 }, { "epoch": 0.2915584108838396, "grad_norm": 2.4948806755280404, "learning_rate": 9.034178837088634e-07, "logits/chosen": 1.94921875, "logits/rejected": 1.6767578125, "logps/chosen": -936.0, "logps/rejected": -751.0, "loss": 0.6361, "rewards/accuracies": 0.75, "rewards/chosen": 0.312744140625, "rewards/margins": 2.8046875, "rewards/rejected": -2.4921875, "step": 1543 }, { "epoch": 0.29174736643204685, "grad_norm": 2.3706531783409015, "learning_rate": 9.032340733243058e-07, "logits/chosen": 1.3297119140625, "logits/rejected": 1.4326171875, "logps/chosen": -1008.0, "logps/rejected": -1286.0, "loss": 0.5617, "rewards/accuracies": 0.78125, "rewards/chosen": 0.52001953125, "rewards/margins": 4.77734375, "rewards/rejected": -4.25390625, "step": 1544 }, { "epoch": 0.29193632198025415, "grad_norm": 5.887262729136337, "learning_rate": 9.03050109265479e-07, "logits/chosen": 2.2861328125, "logits/rejected": 1.943359375, "logps/chosen": -728.5, "logps/rejected": -694.5, "loss": 0.588, "rewards/accuracies": 0.875, "rewards/chosen": 0.3646240234375, "rewards/margins": 3.646484375, "rewards/rejected": -3.28515625, "step": 1545 }, { "epoch": 0.2921252775284614, "grad_norm": 4.826493338067525, "learning_rate": 9.028659916124167e-07, "logits/chosen": 1.658203125, "logits/rejected": 1.501953125, "logps/chosen": -890.0, "logps/rejected": -799.0, "loss": 0.6134, "rewards/accuracies": 0.8125, "rewards/chosen": 0.526580810546875, "rewards/margins": 3.63671875, "rewards/rejected": -3.109375, "step": 1546 }, { "epoch": 0.2923142330766687, "grad_norm": 2.8608177486689064, "learning_rate": 9.02681720445219e-07, "logits/chosen": 1.92724609375, "logits/rejected": 2.01171875, "logps/chosen": -671.5, "logps/rejected": -1359.5, "loss": 0.685, "rewards/accuracies": 0.78125, "rewards/chosen": 0.123291015625, "rewards/margins": 3.189453125, "rewards/rejected": -3.0625, "step": 1547 }, { "epoch": 0.292503188624876, "grad_norm": 2.226627692818668, "learning_rate": 9.024972958440531e-07, "logits/chosen": 2.0205078125, "logits/rejected": 1.7861328125, "logps/chosen": -680.5, "logps/rejected": -722.0, "loss": 0.5989, "rewards/accuracies": 0.8125, "rewards/chosen": 0.580078125, "rewards/margins": 3.9609375, "rewards/rejected": -3.3828125, "step": 1548 }, { "epoch": 0.29269214417308326, "grad_norm": 1.9397969493640757, "learning_rate": 9.023127178891527e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.521484375, "logps/chosen": -934.0, "logps/rejected": -1439.0, "loss": 0.6483, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8056640625, "rewards/margins": 4.548828125, "rewards/rejected": -3.7421875, "step": 1549 }, { "epoch": 0.29288109972129056, "grad_norm": 2.914594545069145, "learning_rate": 9.021279866608185e-07, "logits/chosen": 1.6455078125, "logits/rejected": 1.220703125, "logps/chosen": -1112.0, "logps/rejected": -1440.0, "loss": 0.5355, "rewards/accuracies": 0.90625, "rewards/chosen": 1.603515625, "rewards/margins": 4.4375, "rewards/rejected": -2.837890625, "step": 1550 }, { "epoch": 0.29307005526949786, "grad_norm": 2.7436336098996335, "learning_rate": 9.019431022394176e-07, "logits/chosen": 1.662109375, "logits/rejected": 1.4970703125, "logps/chosen": -812.5, "logps/rejected": -831.0, "loss": 0.5578, "rewards/accuracies": 0.84375, "rewards/chosen": 1.296875, "rewards/margins": 4.453125, "rewards/rejected": -3.15234375, "step": 1551 }, { "epoch": 0.2932590108177051, "grad_norm": 2.875402033564161, "learning_rate": 9.017580647053837e-07, "logits/chosen": 3.2734375, "logits/rejected": 3.6796875, "logps/chosen": -689.0, "logps/rejected": -836.0, "loss": 0.5964, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8095703125, "rewards/margins": 3.8515625, "rewards/rejected": -3.0390625, "step": 1552 }, { "epoch": 0.2934479663659124, "grad_norm": 1.9893973568451233, "learning_rate": 9.015728741392176e-07, "logits/chosen": 2.0791015625, "logits/rejected": 2.5263671875, "logps/chosen": -954.5, "logps/rejected": -1035.0, "loss": 0.6374, "rewards/accuracies": 0.84375, "rewards/chosen": 1.13916015625, "rewards/margins": 2.736328125, "rewards/rejected": -1.59765625, "step": 1553 }, { "epoch": 0.2936369219141197, "grad_norm": 1.874532661902143, "learning_rate": 9.013875306214862e-07, "logits/chosen": 2.333984375, "logits/rejected": 1.888671875, "logps/chosen": -731.0, "logps/rejected": -838.0, "loss": 0.6489, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9517822265625, "rewards/margins": 3.294921875, "rewards/rejected": -2.333984375, "step": 1554 }, { "epoch": 0.29382587746232697, "grad_norm": 1.850761504777143, "learning_rate": 9.012020342328228e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.951171875, "logps/chosen": -858.0, "logps/rejected": -882.0, "loss": 0.5619, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2666015625, "rewards/margins": 3.537109375, "rewards/rejected": -2.267578125, "step": 1555 }, { "epoch": 0.29401483301053427, "grad_norm": 2.0114918608013044, "learning_rate": 9.01016385053928e-07, "logits/chosen": 1.62109375, "logits/rejected": 1.619140625, "logps/chosen": -704.0, "logps/rejected": -986.0, "loss": 0.6409, "rewards/accuracies": 0.875, "rewards/chosen": 0.472900390625, "rewards/margins": 3.3046875, "rewards/rejected": -2.828125, "step": 1556 }, { "epoch": 0.2942037885587416, "grad_norm": 1.9122210147114491, "learning_rate": 9.008305831655678e-07, "logits/chosen": 2.19140625, "logits/rejected": 2.123046875, "logps/chosen": -773.5, "logps/rejected": -909.0, "loss": 0.6483, "rewards/accuracies": 0.84375, "rewards/chosen": 0.98583984375, "rewards/margins": 2.89453125, "rewards/rejected": -1.90087890625, "step": 1557 }, { "epoch": 0.2943927441069488, "grad_norm": 2.9118179377005866, "learning_rate": 9.006446286485757e-07, "logits/chosen": 2.646484375, "logits/rejected": 3.275390625, "logps/chosen": -674.5, "logps/rejected": -1034.5, "loss": 0.6588, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6806640625, "rewards/margins": 2.3046875, "rewards/rejected": -1.625732421875, "step": 1558 }, { "epoch": 0.2945816996551561, "grad_norm": 1.646235188145619, "learning_rate": 9.004585215838508e-07, "logits/chosen": 1.2607421875, "logits/rejected": 1.5, "logps/chosen": -613.0, "logps/rejected": -646.5, "loss": 0.595, "rewards/accuracies": 0.8125, "rewards/chosen": 0.68017578125, "rewards/margins": 3.1640625, "rewards/rejected": -2.48046875, "step": 1559 }, { "epoch": 0.29477065520336343, "grad_norm": 1.6871555599245311, "learning_rate": 9.002722620523591e-07, "logits/chosen": 1.441162109375, "logits/rejected": 1.115478515625, "logps/chosen": -614.5, "logps/rejected": -735.0, "loss": 0.6312, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0400390625, "rewards/margins": 2.98046875, "rewards/rejected": -1.9404296875, "step": 1560 }, { "epoch": 0.2949596107515707, "grad_norm": 1.6293287905921652, "learning_rate": 9.000858501351323e-07, "logits/chosen": 2.123046875, "logits/rejected": 1.9384765625, "logps/chosen": -474.0, "logps/rejected": -440.5, "loss": 0.6732, "rewards/accuracies": 0.78125, "rewards/chosen": 0.83544921875, "rewards/margins": 2.74609375, "rewards/rejected": -1.908203125, "step": 1561 }, { "epoch": 0.295148566299778, "grad_norm": 2.261539842554719, "learning_rate": 8.998992859132693e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.91796875, "logps/chosen": -1305.0, "logps/rejected": -1139.5, "loss": 0.7202, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08447265625, "rewards/margins": 2.4609375, "rewards/rejected": -2.376953125, "step": 1562 }, { "epoch": 0.2953375218479853, "grad_norm": 1.8580737199219604, "learning_rate": 8.997125694679344e-07, "logits/chosen": 1.248779296875, "logits/rejected": 1.086029052734375, "logps/chosen": -524.25, "logps/rejected": -624.5, "loss": 0.7155, "rewards/accuracies": 0.75, "rewards/chosen": 0.712890625, "rewards/margins": 2.308349609375, "rewards/rejected": -1.5986328125, "step": 1563 }, { "epoch": 0.29552647739619253, "grad_norm": 1.6608722000314708, "learning_rate": 8.995257008803586e-07, "logits/chosen": 1.681640625, "logits/rejected": 2.0107421875, "logps/chosen": -812.0, "logps/rejected": -935.0, "loss": 0.5923, "rewards/accuracies": 0.84375, "rewards/chosen": 1.054443359375, "rewards/margins": 3.5234375, "rewards/rejected": -2.470703125, "step": 1564 }, { "epoch": 0.29571543294439984, "grad_norm": 2.43203001949811, "learning_rate": 8.99338680231839e-07, "logits/chosen": 2.466796875, "logits/rejected": 2.646484375, "logps/chosen": -521.5, "logps/rejected": -953.0, "loss": 0.7187, "rewards/accuracies": 0.625, "rewards/chosen": 0.385498046875, "rewards/margins": 2.73828125, "rewards/rejected": -2.3502197265625, "step": 1565 }, { "epoch": 0.29590438849260714, "grad_norm": 2.0330228389520073, "learning_rate": 8.991515076037387e-07, "logits/chosen": 2.201171875, "logits/rejected": 2.291015625, "logps/chosen": -889.0, "logps/rejected": -776.0, "loss": 0.5591, "rewards/accuracies": 0.84375, "rewards/chosen": 0.89013671875, "rewards/margins": 4.140625, "rewards/rejected": -3.25, "step": 1566 }, { "epoch": 0.2960933440408144, "grad_norm": 2.4047589679347885, "learning_rate": 8.989641830774871e-07, "logits/chosen": 2.703125, "logits/rejected": 1.958984375, "logps/chosen": -672.0, "logps/rejected": -1443.5, "loss": 0.68, "rewards/accuracies": 0.71875, "rewards/chosen": 0.55224609375, "rewards/margins": 3.42578125, "rewards/rejected": -2.87890625, "step": 1567 }, { "epoch": 0.2962822995890217, "grad_norm": 2.053800448138488, "learning_rate": 8.987767067345797e-07, "logits/chosen": 1.900390625, "logits/rejected": 2.11328125, "logps/chosen": -1441.0, "logps/rejected": -1037.5, "loss": 0.6069, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23681640625, "rewards/margins": 3.4140625, "rewards/rejected": -3.1796875, "step": 1568 }, { "epoch": 0.29647125513722894, "grad_norm": 2.2926716448340922, "learning_rate": 8.985890786565778e-07, "logits/chosen": 1.255859375, "logits/rejected": 1.39404296875, "logps/chosen": -699.5, "logps/rejected": -844.0, "loss": 0.5557, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4322509765625, "rewards/margins": 4.05078125, "rewards/rejected": -3.625, "step": 1569 }, { "epoch": 0.29666021068543624, "grad_norm": 2.2398852569017738, "learning_rate": 8.984012989251089e-07, "logits/chosen": 1.66015625, "logits/rejected": 1.916015625, "logps/chosen": -1085.0, "logps/rejected": -2255.5, "loss": 0.5097, "rewards/accuracies": 0.90625, "rewards/chosen": 1.33056640625, "rewards/margins": 6.41015625, "rewards/rejected": -5.05859375, "step": 1570 }, { "epoch": 0.29684916623364355, "grad_norm": 2.0854573208962455, "learning_rate": 8.982133676218667e-07, "logits/chosen": 1.3583984375, "logits/rejected": 1.3251953125, "logps/chosen": -832.0, "logps/rejected": -791.0, "loss": 0.6164, "rewards/accuracies": 0.875, "rewards/chosen": 0.09765625, "rewards/margins": 3.517578125, "rewards/rejected": -3.4296875, "step": 1571 }, { "epoch": 0.2970381217818508, "grad_norm": 1.8499360375043965, "learning_rate": 8.980252848286103e-07, "logits/chosen": 1.814453125, "logits/rejected": 0.891357421875, "logps/chosen": -919.5, "logps/rejected": -848.5, "loss": 0.5425, "rewards/accuracies": 0.9375, "rewards/chosen": 0.76611328125, "rewards/margins": 3.90234375, "rewards/rejected": -3.134765625, "step": 1572 }, { "epoch": 0.2972270773300581, "grad_norm": 2.529765653517667, "learning_rate": 8.978370506271652e-07, "logits/chosen": 1.732421875, "logits/rejected": 1.703125, "logps/chosen": -852.5, "logps/rejected": -1194.0, "loss": 0.6159, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1650390625, "rewards/margins": 4.140625, "rewards/rejected": -3.98046875, "step": 1573 }, { "epoch": 0.2974160328782654, "grad_norm": 2.722397136617404, "learning_rate": 8.976486650994226e-07, "logits/chosen": 1.677734375, "logits/rejected": 1.60546875, "logps/chosen": -1100.0, "logps/rejected": -1131.0, "loss": 0.6247, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3853759765625, "rewards/margins": 3.28125, "rewards/rejected": -3.6640625, "step": 1574 }, { "epoch": 0.29760498842647265, "grad_norm": 1.6437443996258816, "learning_rate": 8.974601283273394e-07, "logits/chosen": 2.859375, "logits/rejected": 2.82421875, "logps/chosen": -936.5, "logps/rejected": -780.0, "loss": 0.6399, "rewards/accuracies": 0.84375, "rewards/chosen": 0.92919921875, "rewards/margins": 3.46875, "rewards/rejected": -2.544921875, "step": 1575 }, { "epoch": 0.29779394397467995, "grad_norm": 3.1238454135294145, "learning_rate": 8.972714403929383e-07, "logits/chosen": 1.69921875, "logits/rejected": 2.20703125, "logps/chosen": -696.5, "logps/rejected": -677.0, "loss": 0.6937, "rewards/accuracies": 0.84375, "rewards/chosen": 0.387939453125, "rewards/margins": 2.416015625, "rewards/rejected": -2.025390625, "step": 1576 }, { "epoch": 0.29798289952288726, "grad_norm": 2.6766956673702937, "learning_rate": 8.970826013783079e-07, "logits/chosen": 2.607421875, "logits/rejected": 3.0078125, "logps/chosen": -920.5, "logps/rejected": -844.0, "loss": 0.5625, "rewards/accuracies": 0.875, "rewards/chosen": 0.060791015625, "rewards/margins": 3.80078125, "rewards/rejected": -3.734375, "step": 1577 }, { "epoch": 0.2981718550710945, "grad_norm": 2.20188043390124, "learning_rate": 8.96893611365603e-07, "logits/chosen": 2.2578125, "logits/rejected": 1.880615234375, "logps/chosen": -848.5, "logps/rejected": -718.0, "loss": 0.6079, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0439453125, "rewards/margins": 3.9609375, "rewards/rejected": -2.91015625, "step": 1578 }, { "epoch": 0.2983608106193018, "grad_norm": 3.8511957261115985, "learning_rate": 8.967044704370428e-07, "logits/chosen": 2.49609375, "logits/rejected": 2.875, "logps/chosen": -976.5, "logps/rejected": -923.0, "loss": 0.6038, "rewards/accuracies": 0.875, "rewards/chosen": 0.501708984375, "rewards/margins": 4.01953125, "rewards/rejected": -3.51953125, "step": 1579 }, { "epoch": 0.2985497661675091, "grad_norm": 2.21496265911003, "learning_rate": 8.965151786749134e-07, "logits/chosen": 1.3643798828125, "logits/rejected": 1.4052734375, "logps/chosen": -456.0, "logps/rejected": -614.0, "loss": 0.6808, "rewards/accuracies": 0.9375, "rewards/chosen": 0.33203125, "rewards/margins": 3.49609375, "rewards/rejected": -3.158203125, "step": 1580 }, { "epoch": 0.29873872171571636, "grad_norm": 2.441519044440465, "learning_rate": 8.963257361615661e-07, "logits/chosen": 1.63232421875, "logits/rejected": 2.0126953125, "logps/chosen": -907.5, "logps/rejected": -1876.0, "loss": 0.6526, "rewards/accuracies": 0.75, "rewards/chosen": -0.2568359375, "rewards/margins": 4.171875, "rewards/rejected": -4.42578125, "step": 1581 }, { "epoch": 0.29892767726392366, "grad_norm": 4.628442626617162, "learning_rate": 8.961361429794176e-07, "logits/chosen": 1.20849609375, "logits/rejected": 1.310546875, "logps/chosen": -786.0, "logps/rejected": -747.0, "loss": 0.5886, "rewards/accuracies": 0.96875, "rewards/chosen": 0.818359375, "rewards/margins": 4.1796875, "rewards/rejected": -3.359375, "step": 1582 }, { "epoch": 0.29911663281213097, "grad_norm": 2.9712747640451966, "learning_rate": 8.959463992109504e-07, "logits/chosen": 0.8828125, "logits/rejected": 1.03021240234375, "logps/chosen": -16049.0, "logps/rejected": -984.5, "loss": 0.629, "rewards/accuracies": 0.84375, "rewards/chosen": -28.327423095703125, "rewards/margins": -25.25, "rewards/rejected": -3.16015625, "step": 1583 }, { "epoch": 0.2993055883603382, "grad_norm": 1.990211162022633, "learning_rate": 8.957565049387123e-07, "logits/chosen": 1.841796875, "logits/rejected": 1.900390625, "logps/chosen": -786.0, "logps/rejected": -633.0, "loss": 0.74, "rewards/accuracies": 0.6875, "rewards/chosen": -0.76953125, "rewards/margins": 2.2294921875, "rewards/rejected": -3.0, "step": 1584 }, { "epoch": 0.2994945439085455, "grad_norm": 2.5878716061556633, "learning_rate": 8.955664602453168e-07, "logits/chosen": 1.2744140625, "logits/rejected": 2.0263671875, "logps/chosen": -930.0, "logps/rejected": -772.0, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": 0.5712890625, "rewards/margins": 3.87890625, "rewards/rejected": -3.3125, "step": 1585 }, { "epoch": 0.2996834994567528, "grad_norm": 2.5250965925883366, "learning_rate": 8.953762652134426e-07, "logits/chosen": 2.060546875, "logits/rejected": 1.41015625, "logps/chosen": -973.0, "logps/rejected": -1014.5, "loss": 0.5281, "rewards/accuracies": 0.875, "rewards/chosen": 0.52294921875, "rewards/margins": 4.65625, "rewards/rejected": -4.1171875, "step": 1586 }, { "epoch": 0.29987245500496007, "grad_norm": 2.3062560065873856, "learning_rate": 8.95185919925834e-07, "logits/chosen": 2.498046875, "logits/rejected": 2.73046875, "logps/chosen": -806.75, "logps/rejected": -1094.0, "loss": 0.627, "rewards/accuracies": 0.71875, "rewards/chosen": 0.856689453125, "rewards/margins": 4.658203125, "rewards/rejected": -3.80078125, "step": 1587 }, { "epoch": 0.3000614105531674, "grad_norm": 2.4466005350256856, "learning_rate": 8.949954244653009e-07, "logits/chosen": 2.19140625, "logits/rejected": 2.20947265625, "logps/chosen": -938.0, "logps/rejected": -660.0, "loss": 0.6298, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51611328125, "rewards/margins": 3.625, "rewards/rejected": -3.09765625, "step": 1588 }, { "epoch": 0.3002503661013747, "grad_norm": 1.7632725160323097, "learning_rate": 8.948047789147176e-07, "logits/chosen": 1.9093017578125, "logits/rejected": 1.5390625, "logps/chosen": -516.0, "logps/rejected": -619.5, "loss": 0.5823, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41741943359375, "rewards/margins": 3.283203125, "rewards/rejected": -2.861328125, "step": 1589 }, { "epoch": 0.3004393216495819, "grad_norm": 3.5597997521962883, "learning_rate": 8.946139833570248e-07, "logits/chosen": 0.830078125, "logits/rejected": 0.8203125, "logps/chosen": -1006.0, "logps/rejected": -734.0, "loss": 0.588, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9130859375, "rewards/margins": 4.34765625, "rewards/rejected": -3.43359375, "step": 1590 }, { "epoch": 0.30062827719778923, "grad_norm": 2.7838204360486882, "learning_rate": 8.944230378752279e-07, "logits/chosen": 2.0703125, "logits/rejected": 2.35546875, "logps/chosen": -744.0, "logps/rejected": -883.0, "loss": 0.5045, "rewards/accuracies": 0.8125, "rewards/chosen": 0.529296875, "rewards/margins": 4.32421875, "rewards/rejected": -3.7890625, "step": 1591 }, { "epoch": 0.3008172327459965, "grad_norm": 5.029587197333866, "learning_rate": 8.942319425523974e-07, "logits/chosen": 1.7666015625, "logits/rejected": 1.6767578125, "logps/chosen": -531.5, "logps/rejected": -658.0, "loss": 0.6854, "rewards/accuracies": 0.875, "rewards/chosen": -0.102783203125, "rewards/margins": 2.939453125, "rewards/rejected": -3.04296875, "step": 1592 }, { "epoch": 0.3010061882942038, "grad_norm": 2.4634905808978846, "learning_rate": 8.940406974716696e-07, "logits/chosen": 2.375, "logits/rejected": 2.143310546875, "logps/chosen": -810.0, "logps/rejected": -824.0, "loss": 0.7292, "rewards/accuracies": 0.75, "rewards/chosen": 0.1279296875, "rewards/margins": 2.1328125, "rewards/rejected": -2.001953125, "step": 1593 }, { "epoch": 0.3011951438424111, "grad_norm": 2.081962936016788, "learning_rate": 8.938493027162453e-07, "logits/chosen": 2.650390625, "logits/rejected": 2.912109375, "logps/chosen": -568.5, "logps/rejected": -890.5, "loss": 0.7986, "rewards/accuracies": 0.78125, "rewards/chosen": 0.189453125, "rewards/margins": 2.41796875, "rewards/rejected": -2.228515625, "step": 1594 }, { "epoch": 0.30138409939061833, "grad_norm": 2.699718511075272, "learning_rate": 8.936577583693907e-07, "logits/chosen": 1.8193359375, "logits/rejected": 1.74072265625, "logps/chosen": -778.0, "logps/rejected": -678.0, "loss": 0.5877, "rewards/accuracies": 0.875, "rewards/chosen": 0.501953125, "rewards/margins": 3.0546875, "rewards/rejected": -2.55859375, "step": 1595 }, { "epoch": 0.30157305493882564, "grad_norm": 2.360415534020596, "learning_rate": 8.93466064514437e-07, "logits/chosen": 1.853515625, "logits/rejected": 1.951171875, "logps/chosen": -818.0, "logps/rejected": -972.0, "loss": 0.6119, "rewards/accuracies": 0.875, "rewards/chosen": 0.6513671875, "rewards/margins": 4.05859375, "rewards/rejected": -3.41015625, "step": 1596 }, { "epoch": 0.30176201048703294, "grad_norm": 1.9957645547828191, "learning_rate": 8.932742212347807e-07, "logits/chosen": 2.6953125, "logits/rejected": 3.091796875, "logps/chosen": -494.0, "logps/rejected": -624.0, "loss": 0.6557, "rewards/accuracies": 0.78125, "rewards/chosen": 0.50439453125, "rewards/margins": 2.373046875, "rewards/rejected": -1.873046875, "step": 1597 }, { "epoch": 0.3019509660352402, "grad_norm": 1.7657282892780672, "learning_rate": 8.930822286138829e-07, "logits/chosen": 1.705078125, "logits/rejected": 1.6328125, "logps/chosen": -835.0, "logps/rejected": -947.0, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": 0.9677734375, "rewards/margins": 2.775390625, "rewards/rejected": -1.802734375, "step": 1598 }, { "epoch": 0.3021399215834475, "grad_norm": 2.6980727265274638, "learning_rate": 8.928900867352699e-07, "logits/chosen": 1.1259765625, "logits/rejected": 1.77734375, "logps/chosen": -659.0, "logps/rejected": -1796.0, "loss": 0.7243, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01934814453125, "rewards/margins": 2.98828125, "rewards/rejected": -2.9716796875, "step": 1599 }, { "epoch": 0.3023288771316548, "grad_norm": 2.104610339173585, "learning_rate": 8.92697795682533e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.12890625, "logps/chosen": -830.0, "logps/rejected": -853.5, "loss": 0.5448, "rewards/accuracies": 0.90625, "rewards/chosen": 0.88916015625, "rewards/margins": 3.82421875, "rewards/rejected": -2.931640625, "step": 1600 }, { "epoch": 0.30251783267986204, "grad_norm": 1.7164325208408668, "learning_rate": 8.925053555393282e-07, "logits/chosen": 2.056640625, "logits/rejected": 2.150390625, "logps/chosen": -689.5, "logps/rejected": -836.0, "loss": 0.6284, "rewards/accuracies": 0.71875, "rewards/chosen": 0.849609375, "rewards/margins": 2.890625, "rewards/rejected": -2.048828125, "step": 1601 }, { "epoch": 0.30270678822806935, "grad_norm": 3.1811398530612, "learning_rate": 8.923127663893765e-07, "logits/chosen": 1.728515625, "logits/rejected": 1.7236328125, "logps/chosen": -979.0, "logps/rejected": -1177.5, "loss": 0.6071, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2509765625, "rewards/margins": 3.5703125, "rewards/rejected": -2.32421875, "step": 1602 }, { "epoch": 0.30289574377627665, "grad_norm": 1.5152180505903592, "learning_rate": 8.921200283164638e-07, "logits/chosen": 1.6806640625, "logits/rejected": 1.876953125, "logps/chosen": -519.0, "logps/rejected": -639.0, "loss": 0.6179, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4930419921875, "rewards/margins": 2.90625, "rewards/rejected": -2.4140625, "step": 1603 }, { "epoch": 0.3030846993244839, "grad_norm": 3.12008006455596, "learning_rate": 8.919271414044406e-07, "logits/chosen": 2.4375, "logits/rejected": 2.61328125, "logps/chosen": -718.0, "logps/rejected": -1494.0, "loss": 0.6822, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8173828125, "rewards/margins": 3.501953125, "rewards/rejected": -2.6767578125, "step": 1604 }, { "epoch": 0.3032736548726912, "grad_norm": 1.6930400247613302, "learning_rate": 8.917341057372222e-07, "logits/chosen": 2.59375, "logits/rejected": 2.646484375, "logps/chosen": -508.75, "logps/rejected": -798.5, "loss": 0.6199, "rewards/accuracies": 0.78125, "rewards/chosen": 0.54296875, "rewards/margins": 3.24609375, "rewards/rejected": -2.703125, "step": 1605 }, { "epoch": 0.3034626104208985, "grad_norm": 2.5599764414793627, "learning_rate": 8.915409213987886e-07, "logits/chosen": 2.0048828125, "logits/rejected": 2.29296875, "logps/chosen": -948.5, "logps/rejected": -785.0, "loss": 0.6594, "rewards/accuracies": 0.75, "rewards/chosen": 0.601226806640625, "rewards/margins": 2.947265625, "rewards/rejected": -2.345703125, "step": 1606 }, { "epoch": 0.30365156596910575, "grad_norm": 2.2736435290229675, "learning_rate": 8.913475884731848e-07, "logits/chosen": 1.984375, "logits/rejected": 1.85595703125, "logps/chosen": -907.0, "logps/rejected": -1020.0, "loss": 0.6865, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08837890625, "rewards/margins": 3.54296875, "rewards/rejected": -3.453125, "step": 1607 }, { "epoch": 0.30384052151731306, "grad_norm": 2.910550987991992, "learning_rate": 8.911541070445197e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.451171875, "logps/chosen": -441.5, "logps/rejected": -451.0, "loss": 0.7417, "rewards/accuracies": 0.75, "rewards/chosen": 0.226318359375, "rewards/margins": 2.078125, "rewards/rejected": -1.8525390625, "step": 1608 }, { "epoch": 0.30402947706552036, "grad_norm": 1.6271566285437207, "learning_rate": 8.909604771969678e-07, "logits/chosen": 1.8388671875, "logits/rejected": 1.6494140625, "logps/chosen": -669.0, "logps/rejected": -879.0, "loss": 0.574, "rewards/accuracies": 0.78125, "rewards/chosen": 0.75, "rewards/margins": 3.8671875, "rewards/rejected": -3.12109375, "step": 1609 }, { "epoch": 0.3042184326137276, "grad_norm": 2.5170456710419926, "learning_rate": 8.907666990147672e-07, "logits/chosen": 1.884765625, "logits/rejected": 2.3251953125, "logps/chosen": -782.0, "logps/rejected": -1106.5, "loss": 0.7482, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6103515625, "rewards/margins": 2.0625, "rewards/rejected": -1.45660400390625, "step": 1610 }, { "epoch": 0.3044073881619349, "grad_norm": 1.7425273937005605, "learning_rate": 8.905727725822213e-07, "logits/chosen": 1.966796875, "logits/rejected": 1.3018798828125, "logps/chosen": -988.0, "logps/rejected": -845.0, "loss": 0.6389, "rewards/accuracies": 0.84375, "rewards/chosen": 0.61279296875, "rewards/margins": 3.27734375, "rewards/rejected": -2.658203125, "step": 1611 }, { "epoch": 0.3045963437101422, "grad_norm": 2.2423268576973343, "learning_rate": 8.903786979836976e-07, "logits/chosen": 2.671875, "logits/rejected": 3.111328125, "logps/chosen": -868.0, "logps/rejected": -714.0, "loss": 0.5896, "rewards/accuracies": 0.75, "rewards/chosen": 0.53076171875, "rewards/margins": 4.7578125, "rewards/rejected": -4.23046875, "step": 1612 }, { "epoch": 0.30478529925834946, "grad_norm": 2.8377812629750983, "learning_rate": 8.90184475303628e-07, "logits/chosen": 2.412109375, "logits/rejected": 2.439453125, "logps/chosen": -811.0, "logps/rejected": -663.25, "loss": 0.7412, "rewards/accuracies": 0.65625, "rewards/chosen": 1.21533203125, "rewards/margins": 3.25, "rewards/rejected": -2.0419921875, "step": 1613 }, { "epoch": 0.30497425480655677, "grad_norm": 2.2330421384870216, "learning_rate": 8.899901046265091e-07, "logits/chosen": 2.0576171875, "logits/rejected": 2.078125, "logps/chosen": -632.0, "logps/rejected": -815.0, "loss": 0.613, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6865234375, "rewards/margins": 2.94921875, "rewards/rejected": -2.2607421875, "step": 1614 }, { "epoch": 0.305163210354764, "grad_norm": 1.7676791502346156, "learning_rate": 8.897955860369017e-07, "logits/chosen": 1.39013671875, "logits/rejected": 1.42431640625, "logps/chosen": -720.5, "logps/rejected": -640.0, "loss": 0.6949, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7552490234375, "rewards/margins": 2.5859375, "rewards/rejected": -1.83056640625, "step": 1615 }, { "epoch": 0.3053521659029713, "grad_norm": 1.9647690291511917, "learning_rate": 8.89600919619431e-07, "logits/chosen": 1.298828125, "logits/rejected": 1.22216796875, "logps/chosen": -686.0, "logps/rejected": -918.0, "loss": 0.663, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5042724609375, "rewards/margins": 3.31640625, "rewards/rejected": -2.814453125, "step": 1616 }, { "epoch": 0.3055411214511786, "grad_norm": 2.090299274870445, "learning_rate": 8.894061054587865e-07, "logits/chosen": 1.421875, "logits/rejected": 1.521484375, "logps/chosen": -609.5, "logps/rejected": -791.5, "loss": 0.6348, "rewards/accuracies": 0.84375, "rewards/chosen": 0.763671875, "rewards/margins": 2.962890625, "rewards/rejected": -2.197265625, "step": 1617 }, { "epoch": 0.30573007699938587, "grad_norm": 1.7231461466137385, "learning_rate": 8.892111436397219e-07, "logits/chosen": 1.22265625, "logits/rejected": 0.8291015625, "logps/chosen": -635.5, "logps/rejected": -744.0, "loss": 0.7227, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03765869140625, "rewards/margins": 2.69140625, "rewards/rejected": -2.734375, "step": 1618 }, { "epoch": 0.3059190325475932, "grad_norm": 2.436985508959124, "learning_rate": 8.890160342470555e-07, "logits/chosen": 1.720703125, "logits/rejected": 1.5634765625, "logps/chosen": -647.5, "logps/rejected": -716.0, "loss": 0.7965, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34814453125, "rewards/margins": 2.6015625, "rewards/rejected": -2.25390625, "step": 1619 }, { "epoch": 0.3061079880958005, "grad_norm": 1.9130777640756698, "learning_rate": 8.888207773656693e-07, "logits/chosen": 1.7587890625, "logits/rejected": 1.4384765625, "logps/chosen": -743.0, "logps/rejected": -1455.5, "loss": 0.6415, "rewards/accuracies": 0.8125, "rewards/chosen": 0.224609375, "rewards/margins": 3.5390625, "rewards/rejected": -3.31640625, "step": 1620 }, { "epoch": 0.3062969436440077, "grad_norm": 2.294497526982332, "learning_rate": 8.886253730805096e-07, "logits/chosen": 1.705078125, "logits/rejected": 2.2041015625, "logps/chosen": -708.0, "logps/rejected": -1765.0, "loss": 0.6117, "rewards/accuracies": 0.875, "rewards/chosen": 1.0302734375, "rewards/margins": 5.00390625, "rewards/rejected": -3.96875, "step": 1621 }, { "epoch": 0.306485899192215, "grad_norm": 1.9445388110488044, "learning_rate": 8.88429821476587e-07, "logits/chosen": 1.90625, "logits/rejected": 1.37109375, "logps/chosen": -1123.0, "logps/rejected": -1028.0, "loss": 0.5826, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2412109375, "rewards/margins": 4.2734375, "rewards/rejected": -3.03515625, "step": 1622 }, { "epoch": 0.30667485474042233, "grad_norm": 2.455753211193686, "learning_rate": 8.882341226389762e-07, "logits/chosen": 1.48828125, "logits/rejected": 1.04217529296875, "logps/chosen": -795.0, "logps/rejected": -749.0, "loss": 0.6539, "rewards/accuracies": 0.75, "rewards/chosen": 0.8701171875, "rewards/margins": 3.65234375, "rewards/rejected": -2.78515625, "step": 1623 }, { "epoch": 0.3068638102886296, "grad_norm": 1.9159583440095842, "learning_rate": 8.880382766528159e-07, "logits/chosen": 2.248046875, "logits/rejected": 2.103515625, "logps/chosen": -644.0, "logps/rejected": -1863.0, "loss": 0.5798, "rewards/accuracies": 0.875, "rewards/chosen": 0.404296875, "rewards/margins": 5.35546875, "rewards/rejected": -4.953125, "step": 1624 }, { "epoch": 0.3070527658368369, "grad_norm": 1.7429130032673585, "learning_rate": 8.878422836033085e-07, "logits/chosen": 2.75, "logits/rejected": 3.01171875, "logps/chosen": -789.0, "logps/rejected": -1114.0, "loss": 0.5881, "rewards/accuracies": 0.78125, "rewards/chosen": 0.806396484375, "rewards/margins": 4.29296875, "rewards/rejected": -3.48828125, "step": 1625 }, { "epoch": 0.3072417213850442, "grad_norm": 1.889135322705121, "learning_rate": 8.876461435757209e-07, "logits/chosen": 2.240234375, "logits/rejected": 1.96484375, "logps/chosen": -919.5, "logps/rejected": -914.0, "loss": 0.5378, "rewards/accuracies": 0.8125, "rewards/chosen": 1.145751953125, "rewards/margins": 4.34765625, "rewards/rejected": -3.20703125, "step": 1626 }, { "epoch": 0.30743067693325143, "grad_norm": 2.8017507021097536, "learning_rate": 8.874498566553837e-07, "logits/chosen": 0.986328125, "logits/rejected": 1.061767578125, "logps/chosen": -628.5, "logps/rejected": -779.5, "loss": 0.7892, "rewards/accuracies": 0.71875, "rewards/chosen": -0.072021484375, "rewards/margins": 1.873046875, "rewards/rejected": -1.94921875, "step": 1627 }, { "epoch": 0.30761963248145874, "grad_norm": 2.48729729420673, "learning_rate": 8.872534229276912e-07, "logits/chosen": 2.2607421875, "logits/rejected": 2.900390625, "logps/chosen": -991.0, "logps/rejected": -1434.0, "loss": 0.5383, "rewards/accuracies": 0.875, "rewards/chosen": 1.01171875, "rewards/margins": 4.89453125, "rewards/rejected": -3.87890625, "step": 1628 }, { "epoch": 0.30780858802966604, "grad_norm": 5.5626740765063225, "learning_rate": 8.87056842478102e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.8671875, "logps/chosen": -1133.0, "logps/rejected": -848.0, "loss": 0.5305, "rewards/accuracies": 0.875, "rewards/chosen": 0.791015625, "rewards/margins": 3.90234375, "rewards/rejected": -3.109375, "step": 1629 }, { "epoch": 0.3079975435778733, "grad_norm": 2.1541098182880805, "learning_rate": 8.868601153921382e-07, "logits/chosen": 1.5888671875, "logits/rejected": 1.52349853515625, "logps/chosen": -780.0, "logps/rejected": -855.0, "loss": 0.5229, "rewards/accuracies": 0.875, "rewards/chosen": 0.76953125, "rewards/margins": 4.2421875, "rewards/rejected": -3.47265625, "step": 1630 }, { "epoch": 0.3081864991260806, "grad_norm": 2.0081778908722736, "learning_rate": 8.866632417553859e-07, "logits/chosen": 2.6953125, "logits/rejected": 2.294921875, "logps/chosen": -821.5, "logps/rejected": -841.5, "loss": 0.751, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9814453125, "rewards/margins": 2.40625, "rewards/rejected": -1.42578125, "step": 1631 }, { "epoch": 0.3083754546742879, "grad_norm": 1.6422124488484005, "learning_rate": 8.864662216534946e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.759765625, "logps/chosen": -558.5, "logps/rejected": -801.5, "loss": 0.5937, "rewards/accuracies": 0.84375, "rewards/chosen": 0.82763671875, "rewards/margins": 3.890625, "rewards/rejected": -3.064453125, "step": 1632 }, { "epoch": 0.30856441022249514, "grad_norm": 1.8313939288478562, "learning_rate": 8.86269055172178e-07, "logits/chosen": 2.876953125, "logits/rejected": 3.01171875, "logps/chosen": -772.0, "logps/rejected": -1126.0, "loss": 0.5535, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51953125, "rewards/margins": 4.734375, "rewards/rejected": -4.21875, "step": 1633 }, { "epoch": 0.30875336577070245, "grad_norm": 2.76365598944357, "learning_rate": 8.860717423972133e-07, "logits/chosen": 1.982421875, "logits/rejected": 2.029296875, "logps/chosen": -658.5, "logps/rejected": -1292.0, "loss": 0.6289, "rewards/accuracies": 0.8125, "rewards/chosen": 0.672119140625, "rewards/margins": 5.33203125, "rewards/rejected": -4.6669921875, "step": 1634 }, { "epoch": 0.30894232131890975, "grad_norm": 2.0097943841797856, "learning_rate": 8.858742834144412e-07, "logits/chosen": 1.31304931640625, "logits/rejected": 1.554443359375, "logps/chosen": -811.5, "logps/rejected": -623.5, "loss": 0.6401, "rewards/accuracies": 0.8125, "rewards/chosen": 0.796875, "rewards/margins": 2.875, "rewards/rejected": -2.0799560546875, "step": 1635 }, { "epoch": 0.309131276867117, "grad_norm": 1.9826092398573802, "learning_rate": 8.85676678309766e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.2734375, "logps/chosen": -668.0, "logps/rejected": -1163.0, "loss": 0.534, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2578125, "rewards/margins": 5.75390625, "rewards/rejected": -4.4794921875, "step": 1636 }, { "epoch": 0.3093202324153243, "grad_norm": 3.579004714487663, "learning_rate": 8.854789271691559e-07, "logits/chosen": 2.9013671875, "logits/rejected": 2.77734375, "logps/chosen": -353.0, "logps/rejected": -20191.0, "loss": 0.6783, "rewards/accuracies": 0.8125, "rewards/chosen": 0.232879638671875, "rewards/margins": 118.29296875, "rewards/rejected": -118.140625, "step": 1637 }, { "epoch": 0.3095091879635316, "grad_norm": 1.6048090887245974, "learning_rate": 8.852810300786424e-07, "logits/chosen": 2.03125, "logits/rejected": 2.02734375, "logps/chosen": -574.0, "logps/rejected": -672.0, "loss": 0.6249, "rewards/accuracies": 0.90625, "rewards/chosen": 0.62109375, "rewards/margins": 3.896484375, "rewards/rejected": -3.2734375, "step": 1638 }, { "epoch": 0.30969814351173885, "grad_norm": 1.494390872410788, "learning_rate": 8.850829871243202e-07, "logits/chosen": 2.373046875, "logits/rejected": 2.3369140625, "logps/chosen": -738.25, "logps/rejected": -956.0, "loss": 0.698, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0068359375, "rewards/margins": 3.07421875, "rewards/rejected": -2.071044921875, "step": 1639 }, { "epoch": 0.30988709905994616, "grad_norm": 1.881601809895818, "learning_rate": 8.848847983923482e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.26611328125, "logps/chosen": -987.0, "logps/rejected": -913.0, "loss": 0.5872, "rewards/accuracies": 0.8125, "rewards/chosen": 1.19921875, "rewards/margins": 3.451171875, "rewards/rejected": -2.25390625, "step": 1640 }, { "epoch": 0.3100760546081534, "grad_norm": 3.264689448875367, "learning_rate": 8.84686463968948e-07, "logits/chosen": 1.654296875, "logits/rejected": 1.583984375, "logps/chosen": -753.0, "logps/rejected": -715.0, "loss": 0.5574, "rewards/accuracies": 0.875, "rewards/chosen": 0.630859375, "rewards/margins": 4.1484375, "rewards/rejected": -3.5234375, "step": 1641 }, { "epoch": 0.3102650101563607, "grad_norm": 2.037741682574935, "learning_rate": 8.844879839404049e-07, "logits/chosen": 2.744140625, "logits/rejected": 2.5341796875, "logps/chosen": -1052.5, "logps/rejected": -1077.0, "loss": 0.5838, "rewards/accuracies": 0.875, "rewards/chosen": 1.14111328125, "rewards/margins": 4.51953125, "rewards/rejected": -3.37890625, "step": 1642 }, { "epoch": 0.310453965704568, "grad_norm": 2.486238933380635, "learning_rate": 8.842893583930675e-07, "logits/chosen": 2.404296875, "logits/rejected": 2.4140625, "logps/chosen": -1091.0, "logps/rejected": -1223.0, "loss": 0.6365, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5653076171875, "rewards/margins": 4.40234375, "rewards/rejected": -3.84375, "step": 1643 }, { "epoch": 0.31064292125277526, "grad_norm": 1.423989572216102, "learning_rate": 8.840905874133479e-07, "logits/chosen": 2.16015625, "logits/rejected": 2.498046875, "logps/chosen": -692.0, "logps/rejected": -788.0, "loss": 0.6754, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4775390625, "rewards/margins": 3.71875, "rewards/rejected": -3.234375, "step": 1644 }, { "epoch": 0.31083187680098257, "grad_norm": 1.5205982458524907, "learning_rate": 8.838916710877214e-07, "logits/chosen": 1.99609375, "logits/rejected": 1.95703125, "logps/chosen": -927.0, "logps/rejected": -1075.5, "loss": 0.6135, "rewards/accuracies": 0.8125, "rewards/chosen": 0.71630859375, "rewards/margins": 3.92578125, "rewards/rejected": -3.205078125, "step": 1645 }, { "epoch": 0.31102083234918987, "grad_norm": 2.7654374708251366, "learning_rate": 8.836926095027257e-07, "logits/chosen": 2.380859375, "logits/rejected": 1.9404296875, "logps/chosen": -1558.0, "logps/rejected": -1085.0, "loss": 0.4612, "rewards/accuracies": 0.90625, "rewards/chosen": 1.916015625, "rewards/margins": 5.734375, "rewards/rejected": -3.828125, "step": 1646 }, { "epoch": 0.3112097878973971, "grad_norm": 2.173794710310798, "learning_rate": 8.834934027449633e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.75, "logps/chosen": -680.0, "logps/rejected": -854.0, "loss": 0.6307, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16339111328125, "rewards/margins": 4.00390625, "rewards/rejected": -3.84375, "step": 1647 }, { "epoch": 0.3113987434456044, "grad_norm": 2.3522876633256273, "learning_rate": 8.832940509010985e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.5478515625, "logps/chosen": -887.0, "logps/rejected": -973.0, "loss": 0.5848, "rewards/accuracies": 0.875, "rewards/chosen": 0.8251953125, "rewards/margins": 3.404296875, "rewards/rejected": -2.58203125, "step": 1648 }, { "epoch": 0.3115876989938117, "grad_norm": 1.5370807931209878, "learning_rate": 8.830945540578592e-07, "logits/chosen": 2.3076171875, "logits/rejected": 2.32666015625, "logps/chosen": -687.5, "logps/rejected": -892.0, "loss": 0.492, "rewards/accuracies": 0.875, "rewards/chosen": 1.01220703125, "rewards/margins": 4.421875, "rewards/rejected": -3.40234375, "step": 1649 }, { "epoch": 0.31177665454201897, "grad_norm": 1.8591096364662032, "learning_rate": 8.828949123020367e-07, "logits/chosen": 2.375, "logits/rejected": 1.722412109375, "logps/chosen": -1036.5, "logps/rejected": -827.5, "loss": 0.5338, "rewards/accuracies": 0.875, "rewards/chosen": 0.9716796875, "rewards/margins": 3.66796875, "rewards/rejected": -2.69921875, "step": 1650 }, { "epoch": 0.3119656100902263, "grad_norm": 2.223520812448985, "learning_rate": 8.826951257204848e-07, "logits/chosen": 2.9814453125, "logits/rejected": 2.71240234375, "logps/chosen": -1153.5, "logps/rejected": -834.0, "loss": 0.584, "rewards/accuracies": 0.875, "rewards/chosen": 0.8963623046875, "rewards/margins": 4.4140625, "rewards/rejected": -3.52734375, "step": 1651 }, { "epoch": 0.3121545656384336, "grad_norm": 2.2229209957393725, "learning_rate": 8.824951944001203e-07, "logits/chosen": 1.923828125, "logits/rejected": 1.1650390625, "logps/chosen": -784.0, "logps/rejected": -696.0, "loss": 0.6575, "rewards/accuracies": 0.8125, "rewards/chosen": 0.320068359375, "rewards/margins": 2.765625, "rewards/rejected": -2.44140625, "step": 1652 }, { "epoch": 0.3123435211866408, "grad_norm": 2.0101490714910883, "learning_rate": 8.822951184279237e-07, "logits/chosen": 2.97265625, "logits/rejected": 3.267578125, "logps/chosen": -1342.0, "logps/rejected": -1745.0, "loss": 0.5245, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0634765625, "rewards/margins": 5.984375, "rewards/rejected": -4.92578125, "step": 1653 }, { "epoch": 0.31253247673484813, "grad_norm": 2.4783511842577437, "learning_rate": 8.820948978909375e-07, "logits/chosen": 1.693359375, "logits/rejected": 2.53125, "logps/chosen": -847.5, "logps/rejected": -1028.0, "loss": 0.5323, "rewards/accuracies": 0.875, "rewards/chosen": 0.736328125, "rewards/margins": 4.37109375, "rewards/rejected": -3.63671875, "step": 1654 }, { "epoch": 0.31272143228305543, "grad_norm": 1.7348304533284153, "learning_rate": 8.818945328762679e-07, "logits/chosen": 2.26171875, "logits/rejected": 2.298828125, "logps/chosen": -1126.0, "logps/rejected": -1809.0, "loss": 0.598, "rewards/accuracies": 0.75, "rewards/chosen": 1.5400390625, "rewards/margins": 5.0546875, "rewards/rejected": -3.505859375, "step": 1655 }, { "epoch": 0.3129103878312627, "grad_norm": 2.4818540856331976, "learning_rate": 8.816940234710831e-07, "logits/chosen": 2.494140625, "logits/rejected": 2.50390625, "logps/chosen": -1065.0, "logps/rejected": -995.0, "loss": 0.5981, "rewards/accuracies": 0.78125, "rewards/chosen": 0.96337890625, "rewards/margins": 5.0859375, "rewards/rejected": -4.1171875, "step": 1656 }, { "epoch": 0.31309934337947, "grad_norm": 2.1250748904368755, "learning_rate": 8.814933697626151e-07, "logits/chosen": 2.00390625, "logits/rejected": 1.7880859375, "logps/chosen": -893.0, "logps/rejected": -747.0, "loss": 0.6571, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2747802734375, "rewards/margins": 2.796875, "rewards/rejected": -3.06640625, "step": 1657 }, { "epoch": 0.3132882989276773, "grad_norm": 2.85250799307552, "learning_rate": 8.812925718381578e-07, "logits/chosen": 1.44140625, "logits/rejected": 1.2745361328125, "logps/chosen": -903.5, "logps/rejected": -1009.0, "loss": 0.5831, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3349609375, "rewards/margins": 4.015625, "rewards/rejected": -3.6796875, "step": 1658 }, { "epoch": 0.31347725447588454, "grad_norm": 4.609839927736256, "learning_rate": 8.810916297850685e-07, "logits/chosen": 1.8330078125, "logits/rejected": 1.366943359375, "logps/chosen": -754.0, "logps/rejected": -708.0, "loss": 0.5731, "rewards/accuracies": 0.90625, "rewards/chosen": 0.91015625, "rewards/margins": 4.03515625, "rewards/rejected": -3.125, "step": 1659 }, { "epoch": 0.31366621002409184, "grad_norm": 2.05390636564783, "learning_rate": 8.808905436907667e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.2373046875, "logps/chosen": -968.0, "logps/rejected": -1053.0, "loss": 0.4948, "rewards/accuracies": 0.84375, "rewards/chosen": 0.592529296875, "rewards/margins": 4.51953125, "rewards/rejected": -3.91796875, "step": 1660 }, { "epoch": 0.31385516557229914, "grad_norm": 2.2446725770010563, "learning_rate": 8.806893136427348e-07, "logits/chosen": 1.962890625, "logits/rejected": 1.3984375, "logps/chosen": -997.0, "logps/rejected": -964.0, "loss": 0.6177, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2138671875, "rewards/margins": 3.74609375, "rewards/rejected": -3.5390625, "step": 1661 }, { "epoch": 0.3140441211205064, "grad_norm": 1.9324422475892242, "learning_rate": 8.80487939728518e-07, "logits/chosen": 2.9375, "logits/rejected": 2.7109375, "logps/chosen": -595.0, "logps/rejected": -756.5, "loss": 0.8255, "rewards/accuracies": 0.625, "rewards/chosen": -0.1865234375, "rewards/margins": 2.3388671875, "rewards/rejected": -2.52734375, "step": 1662 }, { "epoch": 0.3142330766687137, "grad_norm": 2.0211334833533297, "learning_rate": 8.802864220357237e-07, "logits/chosen": 1.80078125, "logits/rejected": 1.4755859375, "logps/chosen": -660.5, "logps/rejected": -669.0, "loss": 0.5992, "rewards/accuracies": 0.84375, "rewards/chosen": 0.34619140625, "rewards/margins": 3.5078125, "rewards/rejected": -3.158203125, "step": 1663 }, { "epoch": 0.31442203221692094, "grad_norm": 2.620380161401736, "learning_rate": 8.80084760652022e-07, "logits/chosen": 2.515625, "logits/rejected": 2.484375, "logps/chosen": -1177.0, "logps/rejected": -968.5, "loss": 0.5639, "rewards/accuracies": 0.875, "rewards/chosen": 1.26507568359375, "rewards/margins": 5.56640625, "rewards/rejected": -4.30078125, "step": 1664 }, { "epoch": 0.31461098776512825, "grad_norm": 2.394487294997098, "learning_rate": 8.798829556651458e-07, "logits/chosen": 2.28515625, "logits/rejected": 2.158203125, "logps/chosen": -817.0, "logps/rejected": -692.5, "loss": 0.6498, "rewards/accuracies": 0.75, "rewards/chosen": 0.370361328125, "rewards/margins": 3.095703125, "rewards/rejected": -2.728515625, "step": 1665 }, { "epoch": 0.31479994331333555, "grad_norm": 3.1690903950537685, "learning_rate": 8.796810071628902e-07, "logits/chosen": 2.17578125, "logits/rejected": 2.572265625, "logps/chosen": -683.5, "logps/rejected": -894.0, "loss": 0.6852, "rewards/accuracies": 0.84375, "rewards/chosen": 0.525146484375, "rewards/margins": 3.15234375, "rewards/rejected": -2.623046875, "step": 1666 }, { "epoch": 0.3149888988615428, "grad_norm": 1.8958651895821996, "learning_rate": 8.794789152331124e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.47265625, "logps/chosen": -702.0, "logps/rejected": -828.0, "loss": 0.5985, "rewards/accuracies": 0.8125, "rewards/chosen": 0.410400390625, "rewards/margins": 3.298828125, "rewards/rejected": -2.892578125, "step": 1667 }, { "epoch": 0.3151778544097501, "grad_norm": 2.449103579452581, "learning_rate": 8.79276679963733e-07, "logits/chosen": 1.06005859375, "logits/rejected": 1.2275390625, "logps/chosen": -648.0, "logps/rejected": -969.5, "loss": 0.6594, "rewards/accuracies": 0.75, "rewards/chosen": 0.17333984375, "rewards/margins": 4.15625, "rewards/rejected": -3.984375, "step": 1668 }, { "epoch": 0.3153668099579574, "grad_norm": 3.0290226002486436, "learning_rate": 8.790743014427338e-07, "logits/chosen": 1.48046875, "logits/rejected": 1.50390625, "logps/chosen": -623.5, "logps/rejected": -833.5, "loss": 0.6691, "rewards/accuracies": 0.8125, "rewards/chosen": -0.54296875, "rewards/margins": 3.2314453125, "rewards/rejected": -3.7734375, "step": 1669 }, { "epoch": 0.31555576550616465, "grad_norm": 2.152553628924128, "learning_rate": 8.788717797581596e-07, "logits/chosen": 2.203125, "logits/rejected": 1.767578125, "logps/chosen": -750.5, "logps/rejected": -809.0, "loss": 0.5826, "rewards/accuracies": 0.8125, "rewards/chosen": 0.80859375, "rewards/margins": 4.19921875, "rewards/rejected": -3.38671875, "step": 1670 }, { "epoch": 0.31574472105437196, "grad_norm": 3.0389042959561565, "learning_rate": 8.786691149981175e-07, "logits/chosen": 1.4326171875, "logits/rejected": 1.740234375, "logps/chosen": -902.0, "logps/rejected": -2153.0, "loss": 0.5243, "rewards/accuracies": 0.96875, "rewards/chosen": 0.103179931640625, "rewards/margins": 7.0859375, "rewards/rejected": -6.98046875, "step": 1671 }, { "epoch": 0.31593367660257926, "grad_norm": 2.988165962602087, "learning_rate": 8.784663072507765e-07, "logits/chosen": 1.947265625, "logits/rejected": 2.2890625, "logps/chosen": -595.5, "logps/rejected": -648.5, "loss": 0.8195, "rewards/accuracies": 0.625, "rewards/chosen": 0.37890625, "rewards/margins": 1.943359375, "rewards/rejected": -1.5615234375, "step": 1672 }, { "epoch": 0.3161226321507865, "grad_norm": 1.7207857564539135, "learning_rate": 8.782633566043683e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.62890625, "logps/chosen": -588.0, "logps/rejected": -874.0, "loss": 0.6892, "rewards/accuracies": 0.75, "rewards/chosen": 0.612548828125, "rewards/margins": 4.0390625, "rewards/rejected": -3.43359375, "step": 1673 }, { "epoch": 0.3163115876989938, "grad_norm": 2.1534494495960166, "learning_rate": 8.780602631471859e-07, "logits/chosen": 2.384765625, "logits/rejected": 1.9326171875, "logps/chosen": -928.5, "logps/rejected": -822.0, "loss": 0.5039, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3359375, "rewards/margins": 4.4921875, "rewards/rejected": -3.1484375, "step": 1674 }, { "epoch": 0.3165005432472011, "grad_norm": 2.2453051642678252, "learning_rate": 8.778570269675854e-07, "logits/chosen": 1.410888671875, "logits/rejected": 1.6875, "logps/chosen": -402.0, "logps/rejected": -680.0, "loss": 0.6947, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6337890625, "rewards/margins": 4.1923828125, "rewards/rejected": -3.560791015625, "step": 1675 }, { "epoch": 0.31668949879540836, "grad_norm": 1.853495298628202, "learning_rate": 8.776536481539843e-07, "logits/chosen": 1.322265625, "logits/rejected": 1.31219482421875, "logps/chosen": -794.0, "logps/rejected": -988.0, "loss": 0.6482, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4111328125, "rewards/margins": 3.375, "rewards/rejected": -2.96484375, "step": 1676 }, { "epoch": 0.31687845434361567, "grad_norm": 3.2483514025642966, "learning_rate": 8.774501267948628e-07, "logits/chosen": 2.40234375, "logits/rejected": 2.787109375, "logps/chosen": -448.5, "logps/rejected": -1137.5, "loss": 0.5623, "rewards/accuracies": 0.84375, "rewards/chosen": 0.41357421875, "rewards/margins": 5.177734375, "rewards/rejected": -4.765625, "step": 1677 }, { "epoch": 0.31706740989182297, "grad_norm": 2.0104164535999174, "learning_rate": 8.772464629787625e-07, "logits/chosen": 2.18359375, "logits/rejected": 1.92578125, "logps/chosen": -789.0, "logps/rejected": -1139.5, "loss": 0.5519, "rewards/accuracies": 0.875, "rewards/chosen": 1.09228515625, "rewards/margins": 7.328125, "rewards/rejected": -6.2421875, "step": 1678 }, { "epoch": 0.3172563654400302, "grad_norm": 2.4407658780664514, "learning_rate": 8.770426567942872e-07, "logits/chosen": 1.19384765625, "logits/rejected": 1.03125, "logps/chosen": -721.0, "logps/rejected": -933.0, "loss": 0.5295, "rewards/accuracies": 0.875, "rewards/chosen": 1.00048828125, "rewards/margins": 3.98046875, "rewards/rejected": -2.98046875, "step": 1679 }, { "epoch": 0.3174453209882375, "grad_norm": 1.7099306884772374, "learning_rate": 8.768387083301029e-07, "logits/chosen": 2.427734375, "logits/rejected": 1.9296875, "logps/chosen": -618.5, "logps/rejected": -633.5, "loss": 0.6645, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9454345703125, "rewards/margins": 3.3515625, "rewards/rejected": -2.40234375, "step": 1680 }, { "epoch": 0.3176342765364448, "grad_norm": 2.271258946754105, "learning_rate": 8.766346176749368e-07, "logits/chosen": 1.8505859375, "logits/rejected": 2.201171875, "logps/chosen": -924.5, "logps/rejected": -1912.0, "loss": 0.7218, "rewards/accuracies": 0.71875, "rewards/chosen": 0.52685546875, "rewards/margins": 3.53564453125, "rewards/rejected": -3.00201416015625, "step": 1681 }, { "epoch": 0.3178232320846521, "grad_norm": 2.483482853587805, "learning_rate": 8.764303849175788e-07, "logits/chosen": 2.8125, "logits/rejected": 3.015625, "logps/chosen": -784.5, "logps/rejected": -989.5, "loss": 0.6001, "rewards/accuracies": 0.84375, "rewards/chosen": 1.127685546875, "rewards/margins": 4.41015625, "rewards/rejected": -3.2900390625, "step": 1682 }, { "epoch": 0.3180121876328594, "grad_norm": 2.414369783553875, "learning_rate": 8.762260101468802e-07, "logits/chosen": 1.99609375, "logits/rejected": 2.0966796875, "logps/chosen": -796.5, "logps/rejected": -2357.0, "loss": 0.6103, "rewards/accuracies": 0.875, "rewards/chosen": 0.626953125, "rewards/margins": 6.86328125, "rewards/rejected": -6.234375, "step": 1683 }, { "epoch": 0.3182011431810667, "grad_norm": 1.8612443907186567, "learning_rate": 8.760214934517539e-07, "logits/chosen": 3.283203125, "logits/rejected": 3.373046875, "logps/chosen": -1179.0, "logps/rejected": -1313.0, "loss": 0.6438, "rewards/accuracies": 0.78125, "rewards/chosen": 1.537109375, "rewards/margins": 3.984375, "rewards/rejected": -2.4453125, "step": 1684 }, { "epoch": 0.31839009872927393, "grad_norm": 1.3907691913574898, "learning_rate": 8.758168349211751e-07, "logits/chosen": 2.8671875, "logits/rejected": 2.88671875, "logps/chosen": -657.0, "logps/rejected": -1143.0, "loss": 0.5289, "rewards/accuracies": 0.84375, "rewards/chosen": 1.010009765625, "rewards/margins": 5.015625, "rewards/rejected": -4.0, "step": 1685 }, { "epoch": 0.31857905427748123, "grad_norm": 3.7074630585159327, "learning_rate": 8.7561203464418e-07, "logits/chosen": 2.056640625, "logits/rejected": 2.07421875, "logps/chosen": -876.5, "logps/rejected": -924.0, "loss": 0.6118, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4881591796875, "rewards/margins": 3.380859375, "rewards/rejected": -2.89453125, "step": 1686 }, { "epoch": 0.3187680098256885, "grad_norm": 1.7786018994770043, "learning_rate": 8.75407092709867e-07, "logits/chosen": 1.5234375, "logits/rejected": 1.818359375, "logps/chosen": -742.0, "logps/rejected": -1057.0, "loss": 0.4783, "rewards/accuracies": 0.90625, "rewards/chosen": 0.92626953125, "rewards/margins": 4.46484375, "rewards/rejected": -3.5390625, "step": 1687 }, { "epoch": 0.3189569653738958, "grad_norm": 2.0580287765247975, "learning_rate": 8.752020092073962e-07, "logits/chosen": 1.615234375, "logits/rejected": 1.3994140625, "logps/chosen": -691.5, "logps/rejected": -694.5, "loss": 0.6286, "rewards/accuracies": 0.8125, "rewards/chosen": 0.280029296875, "rewards/margins": 3.390625, "rewards/rejected": -3.1171875, "step": 1688 }, { "epoch": 0.3191459209221031, "grad_norm": 2.018244766791287, "learning_rate": 8.749967842259887e-07, "logits/chosen": 3.248046875, "logits/rejected": 3.028564453125, "logps/chosen": -902.0, "logps/rejected": -972.0, "loss": 0.604, "rewards/accuracies": 0.75, "rewards/chosen": 0.48681640625, "rewards/margins": 3.59765625, "rewards/rejected": -3.115234375, "step": 1689 }, { "epoch": 0.31933487647031034, "grad_norm": 2.8809008189698173, "learning_rate": 8.747914178549276e-07, "logits/chosen": 2.537109375, "logits/rejected": 2.111328125, "logps/chosen": -639.0, "logps/rejected": -526.0, "loss": 0.7576, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2939453125, "rewards/margins": 1.912109375, "rewards/rejected": -2.20703125, "step": 1690 }, { "epoch": 0.31952383201851764, "grad_norm": 2.1549030707271686, "learning_rate": 8.745859101835574e-07, "logits/chosen": 2.517578125, "logits/rejected": 1.8125, "logps/chosen": -728.0, "logps/rejected": -865.5, "loss": 0.645, "rewards/accuracies": 0.78125, "rewards/chosen": -0.249267578125, "rewards/margins": 3.435546875, "rewards/rejected": -3.67578125, "step": 1691 }, { "epoch": 0.31971278756672494, "grad_norm": 2.230426720564032, "learning_rate": 8.743802613012842e-07, "logits/chosen": 1.9169921875, "logits/rejected": 1.80029296875, "logps/chosen": -924.5, "logps/rejected": -928.5, "loss": 0.6835, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2509765625, "rewards/margins": 3.5400390625, "rewards/rejected": -3.29296875, "step": 1692 }, { "epoch": 0.3199017431149322, "grad_norm": 4.869212992802926, "learning_rate": 8.741744712975755e-07, "logits/chosen": 2.328125, "logits/rejected": 2.490234375, "logps/chosen": -1038.0, "logps/rejected": -849.0, "loss": 0.5295, "rewards/accuracies": 0.84375, "rewards/chosen": 0.953125, "rewards/margins": 4.84375, "rewards/rejected": -3.8984375, "step": 1693 }, { "epoch": 0.3200906986631395, "grad_norm": 2.838064164730113, "learning_rate": 8.739685402619598e-07, "logits/chosen": 2.056640625, "logits/rejected": 1.875, "logps/chosen": -1657.0, "logps/rejected": -1711.5, "loss": 0.5349, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21484375, "rewards/margins": 4.876708984375, "rewards/rejected": -5.09375, "step": 1694 }, { "epoch": 0.3202796542113468, "grad_norm": 3.511866151114884, "learning_rate": 8.737624682840274e-07, "logits/chosen": 2.138671875, "logits/rejected": 2.470703125, "logps/chosen": -566.0, "logps/rejected": -872.0, "loss": 0.643, "rewards/accuracies": 0.71875, "rewards/chosen": 0.630859375, "rewards/margins": 3.37109375, "rewards/rejected": -2.73828125, "step": 1695 }, { "epoch": 0.32046860975955405, "grad_norm": 2.354941825995327, "learning_rate": 8.735562554534299e-07, "logits/chosen": 1.875, "logits/rejected": 1.8671875, "logps/chosen": -1112.0, "logps/rejected": -972.0, "loss": 0.6461, "rewards/accuracies": 0.78125, "rewards/chosen": 0.49853515625, "rewards/margins": 3.7734375, "rewards/rejected": -3.27734375, "step": 1696 }, { "epoch": 0.32065756530776135, "grad_norm": 3.43683075982236, "learning_rate": 8.733499018598801e-07, "logits/chosen": 2.095703125, "logits/rejected": 2.03369140625, "logps/chosen": -733.0, "logps/rejected": -851.5, "loss": 0.4903, "rewards/accuracies": 0.90625, "rewards/chosen": 0.911285400390625, "rewards/margins": 4.87890625, "rewards/rejected": -3.97265625, "step": 1697 }, { "epoch": 0.32084652085596865, "grad_norm": 4.6423494342767615, "learning_rate": 8.731434075931518e-07, "logits/chosen": 2.11328125, "logits/rejected": 1.375732421875, "logps/chosen": -616.5, "logps/rejected": -17495.5, "loss": 0.6137, "rewards/accuracies": 0.78125, "rewards/chosen": 0.175048828125, "rewards/margins": 16.38671875, "rewards/rejected": -16.23046875, "step": 1698 }, { "epoch": 0.3210354764041759, "grad_norm": 2.8258546676391485, "learning_rate": 8.729367727430804e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.24609375, "logps/chosen": -798.0, "logps/rejected": -900.5, "loss": 0.5683, "rewards/accuracies": 0.84375, "rewards/chosen": 0.56201171875, "rewards/margins": 4.0703125, "rewards/rejected": -3.5078125, "step": 1699 }, { "epoch": 0.3212244319523832, "grad_norm": 2.1545418042975832, "learning_rate": 8.727299973995621e-07, "logits/chosen": 2.84375, "logits/rejected": 2.73828125, "logps/chosen": -816.0, "logps/rejected": -891.0, "loss": 0.5981, "rewards/accuracies": 0.78125, "rewards/chosen": 0.30859375, "rewards/margins": 4.4130859375, "rewards/rejected": -4.1015625, "step": 1700 }, { "epoch": 0.3214133875005905, "grad_norm": 2.292793904588404, "learning_rate": 8.725230816525545e-07, "logits/chosen": 1.40625, "logits/rejected": 2.037109375, "logps/chosen": -1002.5, "logps/rejected": -1737.0, "loss": 0.5372, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2568359375, "rewards/margins": 5.8515625, "rewards/rejected": -4.59765625, "step": 1701 }, { "epoch": 0.32160234304879776, "grad_norm": 2.164777129647887, "learning_rate": 8.72316025592076e-07, "logits/chosen": 2.2080078125, "logits/rejected": 1.6591796875, "logps/chosen": -1625.5, "logps/rejected": -1284.0, "loss": 0.5452, "rewards/accuracies": 0.78125, "rewards/chosen": 1.47314453125, "rewards/margins": 4.873046875, "rewards/rejected": -3.3974609375, "step": 1702 }, { "epoch": 0.32179129859700506, "grad_norm": 2.77892241057801, "learning_rate": 8.721088293082065e-07, "logits/chosen": 1.888671875, "logits/rejected": 2.1328125, "logps/chosen": -597.5, "logps/rejected": -1006.5, "loss": 0.6993, "rewards/accuracies": 0.875, "rewards/chosen": 0.7607421875, "rewards/margins": 2.390625, "rewards/rejected": -1.62548828125, "step": 1703 }, { "epoch": 0.32198025414521236, "grad_norm": 3.003877390146027, "learning_rate": 8.719014928910865e-07, "logits/chosen": 2.634765625, "logits/rejected": 2.578125, "logps/chosen": -749.0, "logps/rejected": -803.0, "loss": 0.702, "rewards/accuracies": 0.75, "rewards/chosen": 0.653076171875, "rewards/margins": 3.0859375, "rewards/rejected": -2.4296875, "step": 1704 }, { "epoch": 0.3221692096934196, "grad_norm": 2.8012256824671233, "learning_rate": 8.716940164309176e-07, "logits/chosen": 1.79296875, "logits/rejected": 1.765625, "logps/chosen": -687.5, "logps/rejected": -605.0, "loss": 0.6307, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18994140625, "rewards/margins": 3.59765625, "rewards/rejected": -3.4140625, "step": 1705 }, { "epoch": 0.3223581652416269, "grad_norm": 3.115590421886564, "learning_rate": 8.714864000179621e-07, "logits/chosen": 2.4140625, "logits/rejected": 2.373046875, "logps/chosen": -713.0, "logps/rejected": -884.5, "loss": 0.5197, "rewards/accuracies": 0.875, "rewards/chosen": 0.25177001953125, "rewards/margins": 4.357421875, "rewards/rejected": -4.107421875, "step": 1706 }, { "epoch": 0.3225471207898342, "grad_norm": 3.191637068922742, "learning_rate": 8.712786437425435e-07, "logits/chosen": 3.32421875, "logits/rejected": 3.2734375, "logps/chosen": -487.5, "logps/rejected": -1194.0, "loss": 0.7145, "rewards/accuracies": 0.75, "rewards/chosen": -0.05615234375, "rewards/margins": 3.775634765625, "rewards/rejected": -3.828125, "step": 1707 }, { "epoch": 0.32273607633804147, "grad_norm": 4.539902032026842, "learning_rate": 8.710707476950463e-07, "logits/chosen": 1.9296875, "logits/rejected": 1.765625, "logps/chosen": -716.5, "logps/rejected": -1050.0, "loss": 0.6661, "rewards/accuracies": 0.875, "rewards/chosen": 0.0322265625, "rewards/margins": 4.171875, "rewards/rejected": -4.140625, "step": 1708 }, { "epoch": 0.32292503188624877, "grad_norm": 1.5528368435353075, "learning_rate": 8.708627119659152e-07, "logits/chosen": 1.8798828125, "logits/rejected": 1.73193359375, "logps/chosen": -1155.0, "logps/rejected": -1278.0, "loss": 0.6639, "rewards/accuracies": 0.71875, "rewards/chosen": 0.76416015625, "rewards/margins": 3.984375, "rewards/rejected": -3.21484375, "step": 1709 }, { "epoch": 0.323113987434456, "grad_norm": 2.066992873233616, "learning_rate": 8.706545366456563e-07, "logits/chosen": 3.0, "logits/rejected": 3.09765625, "logps/chosen": -668.5, "logps/rejected": -678.5, "loss": 0.728, "rewards/accuracies": 0.71875, "rewards/chosen": -0.389434814453125, "rewards/margins": 2.292236328125, "rewards/rejected": -2.68359375, "step": 1710 }, { "epoch": 0.3233029429826633, "grad_norm": 2.132280616678731, "learning_rate": 8.704462218248358e-07, "logits/chosen": 2.333984375, "logits/rejected": 2.2265625, "logps/chosen": -733.5, "logps/rejected": -775.5, "loss": 0.7148, "rewards/accuracies": 0.6875, "rewards/chosen": 0.71435546875, "rewards/margins": 2.982421875, "rewards/rejected": -2.271484375, "step": 1711 }, { "epoch": 0.3234918985308706, "grad_norm": 1.7464484834807792, "learning_rate": 8.702377675940813e-07, "logits/chosen": 1.876953125, "logits/rejected": 1.68359375, "logps/chosen": -957.0, "logps/rejected": -734.0, "loss": 0.4943, "rewards/accuracies": 0.875, "rewards/chosen": 0.81298828125, "rewards/margins": 4.55078125, "rewards/rejected": -3.734375, "step": 1712 }, { "epoch": 0.3236808540790779, "grad_norm": 2.819228554812114, "learning_rate": 8.700291740440804e-07, "logits/chosen": 1.4951171875, "logits/rejected": 1.30810546875, "logps/chosen": -902.0, "logps/rejected": -13151.0, "loss": 0.4927, "rewards/accuracies": 0.90625, "rewards/chosen": 0.89697265625, "rewards/margins": 7.27734375, "rewards/rejected": -6.3671875, "step": 1713 }, { "epoch": 0.3238698096272852, "grad_norm": 2.1954596584698853, "learning_rate": 8.698204412655818e-07, "logits/chosen": 2.20703125, "logits/rejected": 2.1904296875, "logps/chosen": -693.0, "logps/rejected": -1284.0, "loss": 0.5734, "rewards/accuracies": 0.90625, "rewards/chosen": 0.74481201171875, "rewards/margins": 5.6640625, "rewards/rejected": -4.91015625, "step": 1714 }, { "epoch": 0.3240587651754925, "grad_norm": 2.5833317501586293, "learning_rate": 8.696115693493945e-07, "logits/chosen": 2.73046875, "logits/rejected": 2.4296875, "logps/chosen": -978.0, "logps/rejected": -767.5, "loss": 0.6675, "rewards/accuracies": 0.875, "rewards/chosen": 0.209228515625, "rewards/margins": 3.31640625, "rewards/rejected": -3.1015625, "step": 1715 }, { "epoch": 0.32424772072369973, "grad_norm": 2.569409512938175, "learning_rate": 8.69402558386388e-07, "logits/chosen": 1.75146484375, "logits/rejected": 1.6025390625, "logps/chosen": -868.0, "logps/rejected": -844.5, "loss": 0.6131, "rewards/accuracies": 0.78125, "rewards/chosen": 0.297607421875, "rewards/margins": 3.498046875, "rewards/rejected": -3.19921875, "step": 1716 }, { "epoch": 0.32443667627190703, "grad_norm": 3.52611127693879, "learning_rate": 8.691934084674927e-07, "logits/chosen": 2.349609375, "logits/rejected": 1.6611328125, "logps/chosen": -769.5, "logps/rejected": -838.0, "loss": 0.6098, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5152587890625, "rewards/margins": 4.78515625, "rewards/rejected": -4.28515625, "step": 1717 }, { "epoch": 0.32462563182011434, "grad_norm": 3.4455783223003102, "learning_rate": 8.689841196836987e-07, "logits/chosen": 3.296875, "logits/rejected": 2.9375, "logps/chosen": -811.5, "logps/rejected": -636.0, "loss": 0.7102, "rewards/accuracies": 0.65625, "rewards/chosen": -0.792236328125, "rewards/margins": 2.078125, "rewards/rejected": -2.86328125, "step": 1718 }, { "epoch": 0.3248145873683216, "grad_norm": 3.2961014262356287, "learning_rate": 8.687746921260574e-07, "logits/chosen": 2.173828125, "logits/rejected": 2.1337890625, "logps/chosen": -649.5, "logps/rejected": -1390.5, "loss": 0.5158, "rewards/accuracies": 0.9375, "rewards/chosen": 0.72344970703125, "rewards/margins": 6.08984375, "rewards/rejected": -5.373046875, "step": 1719 }, { "epoch": 0.3250035429165289, "grad_norm": 6.619891969113851, "learning_rate": 8.685651258856796e-07, "logits/chosen": 1.47998046875, "logits/rejected": 2.228515625, "logps/chosen": -568.5, "logps/rejected": -1559.0, "loss": 0.6706, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19110107421875, "rewards/margins": 2.771484375, "rewards/rejected": -2.58203125, "step": 1720 }, { "epoch": 0.3251924984647362, "grad_norm": 2.8696284233310223, "learning_rate": 8.683554210537377e-07, "logits/chosen": 2.328125, "logits/rejected": 2.462890625, "logps/chosen": -1103.0, "logps/rejected": -1083.0, "loss": 0.5501, "rewards/accuracies": 0.9375, "rewards/chosen": 0.502685546875, "rewards/margins": 4.69921875, "rewards/rejected": -4.1953125, "step": 1721 }, { "epoch": 0.32538145401294344, "grad_norm": 3.2594820014244954, "learning_rate": 8.681455777214629e-07, "logits/chosen": 2.828125, "logits/rejected": 2.5390625, "logps/chosen": -761.0, "logps/rejected": -503.0, "loss": 0.5749, "rewards/accuracies": 0.78125, "rewards/chosen": 10.962890625, "rewards/margins": 14.9375, "rewards/rejected": -3.953125, "step": 1722 }, { "epoch": 0.32557040956115074, "grad_norm": 2.5319265363857033, "learning_rate": 8.679355959801481e-07, "logits/chosen": 1.728515625, "logits/rejected": 1.8359375, "logps/chosen": -16407.0, "logps/rejected": -989.0, "loss": 0.5933, "rewards/accuracies": 0.75, "rewards/chosen": -75.88671875, "rewards/margins": -72.92578125, "rewards/rejected": -2.77734375, "step": 1723 }, { "epoch": 0.32575936510935805, "grad_norm": 3.1381169879129462, "learning_rate": 8.677254759211452e-07, "logits/chosen": 2.31640625, "logits/rejected": 1.435546875, "logps/chosen": -601.0, "logps/rejected": -747.0, "loss": 0.6095, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2398681640625, "rewards/margins": 3.34765625, "rewards/rejected": -3.58984375, "step": 1724 }, { "epoch": 0.3259483206575653, "grad_norm": 1.6650638140320801, "learning_rate": 8.67515217635867e-07, "logits/chosen": 2.48828125, "logits/rejected": 2.0869140625, "logps/chosen": -853.0, "logps/rejected": -780.0, "loss": 0.6864, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3271484375, "rewards/margins": 3.025390625, "rewards/rejected": -2.705078125, "step": 1725 }, { "epoch": 0.3261372762057726, "grad_norm": 2.0807609700641474, "learning_rate": 8.673048212157862e-07, "logits/chosen": 2.44140625, "logits/rejected": 1.966796875, "logps/chosen": -1083.75, "logps/rejected": -1623.0, "loss": 0.6827, "rewards/accuracies": 0.71875, "rewards/chosen": 0.34228515625, "rewards/margins": 2.892578125, "rewards/rejected": -2.552734375, "step": 1726 }, { "epoch": 0.3263262317539799, "grad_norm": 2.205967182263259, "learning_rate": 8.670942867524357e-07, "logits/chosen": 2.47265625, "logits/rejected": 2.68359375, "logps/chosen": -941.0, "logps/rejected": -1356.0, "loss": 0.5369, "rewards/accuracies": 0.875, "rewards/chosen": 0.9991455078125, "rewards/margins": 5.4296875, "rewards/rejected": -4.44140625, "step": 1727 }, { "epoch": 0.32651518730218715, "grad_norm": 2.025961034160533, "learning_rate": 8.668836143374083e-07, "logits/chosen": 2.1181640625, "logits/rejected": 1.802978515625, "logps/chosen": -858.0, "logps/rejected": -621.5, "loss": 0.7113, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4794921875, "rewards/margins": 2.3785400390625, "rewards/rejected": -1.89404296875, "step": 1728 }, { "epoch": 0.32670414285039445, "grad_norm": 2.1658889344741588, "learning_rate": 8.666728040623571e-07, "logits/chosen": 1.685546875, "logits/rejected": 1.7001953125, "logps/chosen": -650.0, "logps/rejected": -653.0, "loss": 0.6593, "rewards/accuracies": 0.875, "rewards/chosen": 0.407958984375, "rewards/margins": 2.53125, "rewards/rejected": -2.123046875, "step": 1729 }, { "epoch": 0.32689309839860176, "grad_norm": 2.7906004893793916, "learning_rate": 8.664618560189948e-07, "logits/chosen": 3.1875, "logits/rejected": 2.822265625, "logps/chosen": -1195.5, "logps/rejected": -546.5, "loss": 0.6977, "rewards/accuracies": 0.75, "rewards/chosen": 1.73388671875, "rewards/margins": 3.32763671875, "rewards/rejected": -1.5927734375, "step": 1730 }, { "epoch": 0.327082053946809, "grad_norm": 2.2197397062134865, "learning_rate": 8.662507702990942e-07, "logits/chosen": 2.697265625, "logits/rejected": 3.00390625, "logps/chosen": -862.0, "logps/rejected": -1076.0, "loss": 0.6249, "rewards/accuracies": 0.90625, "rewards/chosen": 1.361328125, "rewards/margins": 3.5, "rewards/rejected": -2.138671875, "step": 1731 }, { "epoch": 0.3272710094950163, "grad_norm": 2.149718127562451, "learning_rate": 8.660395469944881e-07, "logits/chosen": 3.078125, "logits/rejected": 3.16796875, "logps/chosen": -691.0, "logps/rejected": -866.0, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": 0.85888671875, "rewards/margins": 3.06640625, "rewards/rejected": -2.21484375, "step": 1732 }, { "epoch": 0.32745996504322356, "grad_norm": 2.8250262937550117, "learning_rate": 8.658281861970692e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.390625, "logps/chosen": -671.5, "logps/rejected": -705.5, "loss": 0.6353, "rewards/accuracies": 0.78125, "rewards/chosen": 0.38427734375, "rewards/margins": 3.134765625, "rewards/rejected": -2.75390625, "step": 1733 }, { "epoch": 0.32764892059143086, "grad_norm": 3.5379177489089453, "learning_rate": 8.656166879987896e-07, "logits/chosen": 3.5, "logits/rejected": 3.796875, "logps/chosen": -1046.0, "logps/rejected": -1159.0, "loss": 0.7583, "rewards/accuracies": 0.6875, "rewards/chosen": 1.12890625, "rewards/margins": 3.099609375, "rewards/rejected": -1.969482421875, "step": 1734 }, { "epoch": 0.32783787613963816, "grad_norm": 3.040455314869386, "learning_rate": 8.654050524916617e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.8828125, "logps/chosen": -862.0, "logps/rejected": -968.0, "loss": 0.622, "rewards/accuracies": 0.78125, "rewards/chosen": 1.244140625, "rewards/margins": 4.00390625, "rewards/rejected": -2.751953125, "step": 1735 }, { "epoch": 0.3280268316878454, "grad_norm": 1.767912625743553, "learning_rate": 8.65193279767757e-07, "logits/chosen": 1.642578125, "logits/rejected": 1.982421875, "logps/chosen": -750.0, "logps/rejected": -1180.0, "loss": 0.4942, "rewards/accuracies": 0.875, "rewards/chosen": 1.25244140625, "rewards/margins": 5.16796875, "rewards/rejected": -3.921875, "step": 1736 }, { "epoch": 0.3282157872360527, "grad_norm": 2.2834262596646653, "learning_rate": 8.649813699192078e-07, "logits/chosen": 3.28125, "logits/rejected": 3.31640625, "logps/chosen": -680.0, "logps/rejected": -1214.0, "loss": 0.6058, "rewards/accuracies": 0.875, "rewards/chosen": 0.868408203125, "rewards/margins": 4.7734375, "rewards/rejected": -3.88671875, "step": 1737 }, { "epoch": 0.32840474278426, "grad_norm": 2.3370779271642754, "learning_rate": 8.647693230382047e-07, "logits/chosen": 2.849609375, "logits/rejected": 2.712890625, "logps/chosen": -606.5, "logps/rejected": -677.0, "loss": 0.6132, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7236328125, "rewards/margins": 3.328125, "rewards/rejected": -2.60546875, "step": 1738 }, { "epoch": 0.32859369833246727, "grad_norm": 1.6505288664637157, "learning_rate": 8.645571392169989e-07, "logits/chosen": 1.666259765625, "logits/rejected": 1.122802734375, "logps/chosen": -611.0, "logps/rejected": -596.0, "loss": 0.5943, "rewards/accuracies": 0.84375, "rewards/chosen": 0.740478515625, "rewards/margins": 3.69921875, "rewards/rejected": -2.9609375, "step": 1739 }, { "epoch": 0.32878265388067457, "grad_norm": 2.4019417400283745, "learning_rate": 8.643448185479007e-07, "logits/chosen": 1.7041015625, "logits/rejected": 1.708984375, "logps/chosen": -537.5, "logps/rejected": -558.5, "loss": 0.584, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7783203125, "rewards/margins": 4.1796875, "rewards/rejected": -3.404296875, "step": 1740 }, { "epoch": 0.3289716094288819, "grad_norm": 2.5425121291792454, "learning_rate": 8.6413236112328e-07, "logits/chosen": 1.21142578125, "logits/rejected": 1.5576171875, "logps/chosen": -1032.0, "logps/rejected": -1018.0, "loss": 0.616, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7060546875, "rewards/margins": 3.794921875, "rewards/rejected": -3.08203125, "step": 1741 }, { "epoch": 0.3291605649770891, "grad_norm": 2.3376239200213855, "learning_rate": 8.639197670355666e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.228515625, "logps/chosen": -1056.0, "logps/rejected": -1340.0, "loss": 0.4726, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7548828125, "rewards/margins": 5.9453125, "rewards/rejected": -5.203125, "step": 1742 }, { "epoch": 0.3293495205252964, "grad_norm": 2.265439391313395, "learning_rate": 8.637070363772493e-07, "logits/chosen": 1.181640625, "logits/rejected": 1.2392578125, "logps/chosen": -960.0, "logps/rejected": -1243.0, "loss": 0.6132, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7421875, "rewards/margins": 4.734375, "rewards/rejected": -4.0, "step": 1743 }, { "epoch": 0.3295384760735037, "grad_norm": 1.5776539056569705, "learning_rate": 8.634941692408763e-07, "logits/chosen": 1.6181640625, "logits/rejected": 1.4622802734375, "logps/chosen": -917.0, "logps/rejected": -923.5, "loss": 0.5982, "rewards/accuracies": 0.875, "rewards/chosen": 0.560546875, "rewards/margins": 3.921875, "rewards/rejected": -3.359375, "step": 1744 }, { "epoch": 0.329727431621711, "grad_norm": 2.098196390828313, "learning_rate": 8.632811657190553e-07, "logits/chosen": 1.6259765625, "logits/rejected": 1.5302734375, "logps/chosen": -1000.0, "logps/rejected": -750.5, "loss": 0.6457, "rewards/accuracies": 0.875, "rewards/chosen": 0.7333984375, "rewards/margins": 3.74609375, "rewards/rejected": -3.009765625, "step": 1745 }, { "epoch": 0.3299163871699183, "grad_norm": 2.244416312120642, "learning_rate": 8.630680259044536e-07, "logits/chosen": 2.04296875, "logits/rejected": 1.7109375, "logps/chosen": -883.0, "logps/rejected": -1383.0, "loss": 0.569, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4029541015625, "rewards/margins": 6.5703125, "rewards/rejected": -6.1796875, "step": 1746 }, { "epoch": 0.3301053427181256, "grad_norm": 3.267512191612378, "learning_rate": 8.628547498897972e-07, "logits/chosen": 1.376953125, "logits/rejected": 1.059326171875, "logps/chosen": -1127.0, "logps/rejected": -856.0, "loss": 0.6319, "rewards/accuracies": 0.75, "rewards/chosen": 0.21875, "rewards/margins": 4.1015625, "rewards/rejected": -3.87890625, "step": 1747 }, { "epoch": 0.33029429826633283, "grad_norm": 2.0506487186823144, "learning_rate": 8.62641337767872e-07, "logits/chosen": 1.8056640625, "logits/rejected": 1.916015625, "logps/chosen": -832.5, "logps/rejected": -1055.0, "loss": 0.566, "rewards/accuracies": 0.875, "rewards/chosen": 0.3499755859375, "rewards/margins": 6.12109375, "rewards/rejected": -5.7734375, "step": 1748 }, { "epoch": 0.33048325381454013, "grad_norm": 1.942886127604698, "learning_rate": 8.624277896315228e-07, "logits/chosen": 0.99072265625, "logits/rejected": 1.0108642578125, "logps/chosen": -1021.0, "logps/rejected": -1880.0, "loss": 0.5473, "rewards/accuracies": 0.875, "rewards/chosen": 0.5, "rewards/margins": 6.5625, "rewards/rejected": -6.06640625, "step": 1749 }, { "epoch": 0.33067220936274744, "grad_norm": 2.636229763546103, "learning_rate": 8.622141055736534e-07, "logits/chosen": 1.4580078125, "logits/rejected": 1.8583984375, "logps/chosen": -488.0, "logps/rejected": -1553.0, "loss": 0.7044, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21875, "rewards/margins": 5.703125, "rewards/rejected": -5.470703125, "step": 1750 }, { "epoch": 0.3308611649109547, "grad_norm": 1.8131941590274452, "learning_rate": 8.620002856872271e-07, "logits/chosen": 1.58984375, "logits/rejected": 1.3876953125, "logps/chosen": -551.5, "logps/rejected": -689.0, "loss": 0.6185, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51513671875, "rewards/margins": 4.13671875, "rewards/rejected": -3.62109375, "step": 1751 }, { "epoch": 0.331050120459162, "grad_norm": 1.9950509078760963, "learning_rate": 8.617863300652659e-07, "logits/chosen": 1.96484375, "logits/rejected": 1.8974609375, "logps/chosen": -1174.0, "logps/rejected": -1134.0, "loss": 0.5424, "rewards/accuracies": 0.875, "rewards/chosen": 1.640625, "rewards/margins": 10.01171875, "rewards/rejected": -8.3671875, "step": 1752 }, { "epoch": 0.3312390760073693, "grad_norm": 2.3311888710995077, "learning_rate": 8.615722388008513e-07, "logits/chosen": 1.6082763671875, "logits/rejected": 1.486328125, "logps/chosen": -676.0, "logps/rejected": -832.0, "loss": 0.6234, "rewards/accuracies": 0.875, "rewards/chosen": 1.1435546875, "rewards/margins": 4.48828125, "rewards/rejected": -3.33984375, "step": 1753 }, { "epoch": 0.33142803155557654, "grad_norm": 2.4662799037076, "learning_rate": 8.613580119871234e-07, "logits/chosen": 2.0625, "logits/rejected": 2.53515625, "logps/chosen": -990.0, "logps/rejected": -1862.0, "loss": 0.5676, "rewards/accuracies": 0.84375, "rewards/chosen": 1.581787109375, "rewards/margins": 7.48828125, "rewards/rejected": -5.9072265625, "step": 1754 }, { "epoch": 0.33161698710378384, "grad_norm": 2.016720757131718, "learning_rate": 8.611436497172815e-07, "logits/chosen": 2.234375, "logits/rejected": 2.337890625, "logps/chosen": -869.0, "logps/rejected": -1060.5, "loss": 0.6155, "rewards/accuracies": 0.84375, "rewards/chosen": 1.50244140625, "rewards/margins": 4.236328125, "rewards/rejected": -2.73828125, "step": 1755 }, { "epoch": 0.3318059426519911, "grad_norm": 1.7157771720444306, "learning_rate": 8.609291520845837e-07, "logits/chosen": 1.6142578125, "logits/rejected": 1.7119140625, "logps/chosen": -936.0, "logps/rejected": -923.5, "loss": 0.5957, "rewards/accuracies": 0.78125, "rewards/chosen": 1.15966796875, "rewards/margins": 3.98828125, "rewards/rejected": -2.828125, "step": 1756 }, { "epoch": 0.3319948982001984, "grad_norm": 1.8000071086112583, "learning_rate": 8.607145191823474e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.46484375, "logps/chosen": -775.0, "logps/rejected": -749.0, "loss": 0.6464, "rewards/accuracies": 0.71875, "rewards/chosen": 1.4189453125, "rewards/margins": 4.19140625, "rewards/rejected": -2.7578125, "step": 1757 }, { "epoch": 0.3321838537484057, "grad_norm": 2.3324052000781528, "learning_rate": 8.604997511039481e-07, "logits/chosen": 2.365234375, "logits/rejected": 1.7802734375, "logps/chosen": -533.5, "logps/rejected": -455.0, "loss": 0.6543, "rewards/accuracies": 0.90625, "rewards/chosen": 0.71728515625, "rewards/margins": 2.826171875, "rewards/rejected": -2.11328125, "step": 1758 }, { "epoch": 0.33237280929661295, "grad_norm": 2.0283131360540554, "learning_rate": 8.602848479428208e-07, "logits/chosen": 2.939453125, "logits/rejected": 1.912109375, "logps/chosen": -862.0, "logps/rejected": -871.5, "loss": 0.5807, "rewards/accuracies": 0.90625, "rewards/chosen": 0.95166015625, "rewards/margins": 3.7578125, "rewards/rejected": -2.8046875, "step": 1759 }, { "epoch": 0.33256176484482025, "grad_norm": 2.289340020541758, "learning_rate": 8.600698097924588e-07, "logits/chosen": 1.88671875, "logits/rejected": 1.705078125, "logps/chosen": -1008.5, "logps/rejected": -1098.0, "loss": 0.4669, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4443359375, "rewards/margins": 4.76171875, "rewards/rejected": -3.31640625, "step": 1760 }, { "epoch": 0.33275072039302755, "grad_norm": 2.2490817833764365, "learning_rate": 8.598546367464143e-07, "logits/chosen": 2.234375, "logits/rejected": 2.12890625, "logps/chosen": -1083.0, "logps/rejected": -1010.0, "loss": 0.5622, "rewards/accuracies": 0.90625, "rewards/chosen": 1.791015625, "rewards/margins": 4.4453125, "rewards/rejected": -2.662109375, "step": 1761 }, { "epoch": 0.3329396759412348, "grad_norm": 1.9616596399704782, "learning_rate": 8.596393288982984e-07, "logits/chosen": 2.515625, "logits/rejected": 2.142578125, "logps/chosen": -675.0, "logps/rejected": -585.0, "loss": 0.6281, "rewards/accuracies": 0.9375, "rewards/chosen": 1.08203125, "rewards/margins": 2.986328125, "rewards/rejected": -1.904296875, "step": 1762 }, { "epoch": 0.3331286314894421, "grad_norm": 1.9664552155696653, "learning_rate": 8.594238863417806e-07, "logits/chosen": 1.5322265625, "logits/rejected": 1.456085205078125, "logps/chosen": -803.0, "logps/rejected": -1633.0, "loss": 0.6273, "rewards/accuracies": 0.75, "rewards/chosen": 0.875, "rewards/margins": 4.265625, "rewards/rejected": -3.388671875, "step": 1763 }, { "epoch": 0.3333175870376494, "grad_norm": 1.908896820561257, "learning_rate": 8.592083091705889e-07, "logits/chosen": 3.091796875, "logits/rejected": 2.689453125, "logps/chosen": -761.5, "logps/rejected": -837.5, "loss": 0.5718, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3291015625, "rewards/margins": 3.73828125, "rewards/rejected": -2.4140625, "step": 1764 }, { "epoch": 0.33350654258585666, "grad_norm": 1.639077224013808, "learning_rate": 8.589925974785102e-07, "logits/chosen": 2.34765625, "logits/rejected": 1.962890625, "logps/chosen": -2014.0, "logps/rejected": -754.0, "loss": 0.6301, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1337890625, "rewards/margins": 2.2265625, "rewards/rejected": -2.365234375, "step": 1765 }, { "epoch": 0.33369549813406396, "grad_norm": 2.1351018402485593, "learning_rate": 8.587767513593895e-07, "logits/chosen": 1.4140625, "logits/rejected": 1.2744140625, "logps/chosen": -876.5, "logps/rejected": -805.0, "loss": 0.4503, "rewards/accuracies": 0.9375, "rewards/chosen": 1.01416015625, "rewards/margins": 5.73828125, "rewards/rejected": -4.7421875, "step": 1766 }, { "epoch": 0.33388445368227126, "grad_norm": 2.067810756580934, "learning_rate": 8.58560770907131e-07, "logits/chosen": 1.7783203125, "logits/rejected": 1.568359375, "logps/chosen": -1034.0, "logps/rejected": -967.0, "loss": 0.6274, "rewards/accuracies": 0.78125, "rewards/chosen": 1.365234375, "rewards/margins": 3.9296875, "rewards/rejected": -2.556640625, "step": 1767 }, { "epoch": 0.3340734092304785, "grad_norm": 1.604935482589405, "learning_rate": 8.583446562156962e-07, "logits/chosen": 1.203369140625, "logits/rejected": 1.62744140625, "logps/chosen": -501.5, "logps/rejected": -637.5, "loss": 0.8253, "rewards/accuracies": 0.78125, "rewards/chosen": -0.81201171875, "rewards/margins": 1.708984375, "rewards/rejected": -2.51953125, "step": 1768 }, { "epoch": 0.3342623647786858, "grad_norm": 2.184064778681121, "learning_rate": 8.581284073791061e-07, "logits/chosen": 2.24609375, "logits/rejected": 1.90625, "logps/chosen": -654.0, "logps/rejected": -684.5, "loss": 0.6246, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7984619140625, "rewards/margins": 3.5703125, "rewards/rejected": -2.76953125, "step": 1769 }, { "epoch": 0.3344513203268931, "grad_norm": 2.601918680118778, "learning_rate": 8.579120244914396e-07, "logits/chosen": 2.1142578125, "logits/rejected": 2.529296875, "logps/chosen": -778.5, "logps/rejected": -850.0, "loss": 0.6102, "rewards/accuracies": 0.78125, "rewards/chosen": 0.172607421875, "rewards/margins": 3.92578125, "rewards/rejected": -3.75390625, "step": 1770 }, { "epoch": 0.33464027587510037, "grad_norm": 1.7978371636460069, "learning_rate": 8.576955076468339e-07, "logits/chosen": 2.5, "logits/rejected": 2.12890625, "logps/chosen": -793.5, "logps/rejected": -803.0, "loss": 0.68, "rewards/accuracies": 0.8125, "rewards/chosen": 0.978515625, "rewards/margins": 4.74609375, "rewards/rejected": -3.775390625, "step": 1771 }, { "epoch": 0.33482923142330767, "grad_norm": 2.1900772761437293, "learning_rate": 8.574788569394845e-07, "logits/chosen": 2.7783203125, "logits/rejected": 2.5927734375, "logps/chosen": -1100.0, "logps/rejected": -1371.0, "loss": 0.6885, "rewards/accuracies": 0.78125, "rewards/chosen": 0.202880859375, "rewards/margins": 4.140625, "rewards/rejected": -3.94140625, "step": 1772 }, { "epoch": 0.335018186971515, "grad_norm": 1.9939635311341417, "learning_rate": 8.572620724636452e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.2421875, "logps/chosen": -552.0, "logps/rejected": -561.0, "loss": 0.6664, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17919921875, "rewards/margins": 3.322265625, "rewards/rejected": -3.5078125, "step": 1773 }, { "epoch": 0.3352071425197222, "grad_norm": 2.281687957351462, "learning_rate": 8.570451543136278e-07, "logits/chosen": 1.05224609375, "logits/rejected": 1.0166015625, "logps/chosen": -600.0, "logps/rejected": -592.5, "loss": 0.6749, "rewards/accuracies": 0.875, "rewards/chosen": -0.304443359375, "rewards/margins": 3.078125, "rewards/rejected": -3.3828125, "step": 1774 }, { "epoch": 0.3353960980679295, "grad_norm": 3.0146855366929524, "learning_rate": 8.568281025838026e-07, "logits/chosen": 2.0068359375, "logits/rejected": 2.08203125, "logps/chosen": -864.5, "logps/rejected": -796.0, "loss": 0.6072, "rewards/accuracies": 0.84375, "rewards/chosen": 0.714599609375, "rewards/margins": 4.61328125, "rewards/rejected": -3.89453125, "step": 1775 }, { "epoch": 0.33558505361613683, "grad_norm": 4.122655956332637, "learning_rate": 8.566109173685975e-07, "logits/chosen": 1.66326904296875, "logits/rejected": 1.447265625, "logps/chosen": -931.5, "logps/rejected": -1572.0, "loss": 0.6693, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0615234375, "rewards/margins": 5.576171875, "rewards/rejected": -5.51171875, "step": 1776 }, { "epoch": 0.3357740091643441, "grad_norm": 1.5639536136959673, "learning_rate": 8.563935987624994e-07, "logits/chosen": 1.7333984375, "logits/rejected": 1.599609375, "logps/chosen": -806.0, "logps/rejected": -950.0, "loss": 0.5723, "rewards/accuracies": 0.8125, "rewards/chosen": -0.245361328125, "rewards/margins": 4.671875, "rewards/rejected": -4.921875, "step": 1777 }, { "epoch": 0.3359629647125514, "grad_norm": 3.2455120524760734, "learning_rate": 8.561761468600521e-07, "logits/chosen": 2.064453125, "logits/rejected": 2.267578125, "logps/chosen": -846.0, "logps/rejected": -987.0, "loss": 0.6843, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3616790771484375, "rewards/margins": 4.033203125, "rewards/rejected": -4.39453125, "step": 1778 }, { "epoch": 0.33615192026075863, "grad_norm": 1.6912706399864863, "learning_rate": 8.559585617558581e-07, "logits/chosen": 2.2421875, "logits/rejected": 1.63671875, "logps/chosen": -777.0, "logps/rejected": -719.0, "loss": 0.5844, "rewards/accuracies": 0.8125, "rewards/chosen": 0.46826171875, "rewards/margins": 3.72265625, "rewards/rejected": -3.25390625, "step": 1779 }, { "epoch": 0.33634087580896593, "grad_norm": 1.8433109442263262, "learning_rate": 8.557408435445778e-07, "logits/chosen": 0.9580078125, "logits/rejected": 1.099609375, "logps/chosen": -442.0, "logps/rejected": -708.5, "loss": 0.7965, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6373291015625, "rewards/margins": 2.802734375, "rewards/rejected": -3.4375, "step": 1780 }, { "epoch": 0.33652983135717324, "grad_norm": 2.4048222707359512, "learning_rate": 8.555229923209289e-07, "logits/chosen": 1.935546875, "logits/rejected": 1.673095703125, "logps/chosen": -800.0, "logps/rejected": -895.0, "loss": 0.6118, "rewards/accuracies": 0.78125, "rewards/chosen": 0.108642578125, "rewards/margins": 4.01171875, "rewards/rejected": -3.89453125, "step": 1781 }, { "epoch": 0.3367187869053805, "grad_norm": 2.4148918017620233, "learning_rate": 8.55305008179688e-07, "logits/chosen": 2.48046875, "logits/rejected": 2.5, "logps/chosen": -610.0, "logps/rejected": -693.0, "loss": 0.6077, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7177734375, "rewards/margins": 3.53125, "rewards/rejected": -2.8125, "step": 1782 }, { "epoch": 0.3369077424535878, "grad_norm": 2.8900993355470446, "learning_rate": 8.550868912156886e-07, "logits/chosen": 2.14111328125, "logits/rejected": 2.0205078125, "logps/chosen": -928.0, "logps/rejected": -818.0, "loss": 0.6142, "rewards/accuracies": 0.84375, "rewards/chosen": 0.602783203125, "rewards/margins": 3.6640625, "rewards/rejected": -3.05859375, "step": 1783 }, { "epoch": 0.3370966980017951, "grad_norm": 2.2217265141821176, "learning_rate": 8.548686415238227e-07, "logits/chosen": 2.1279296875, "logits/rejected": 1.88525390625, "logps/chosen": -662.0, "logps/rejected": -685.0, "loss": 0.7008, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2734375, "rewards/margins": 2.6185302734375, "rewards/rejected": -2.3436279296875, "step": 1784 }, { "epoch": 0.33728565355000234, "grad_norm": 1.7474703925183819, "learning_rate": 8.546502591990394e-07, "logits/chosen": 2.529296875, "logits/rejected": 2.396484375, "logps/chosen": -982.0, "logps/rejected": -1294.0, "loss": 0.4856, "rewards/accuracies": 0.90625, "rewards/chosen": 1.19921875, "rewards/margins": 5.185546875, "rewards/rejected": -3.982421875, "step": 1785 }, { "epoch": 0.33747460909820964, "grad_norm": 2.005315919196681, "learning_rate": 8.544317443363459e-07, "logits/chosen": 2.39453125, "logits/rejected": 2.009765625, "logps/chosen": -685.0, "logps/rejected": -698.5, "loss": 0.663, "rewards/accuracies": 0.8125, "rewards/chosen": 0.53515625, "rewards/margins": 3.9453125, "rewards/rejected": -3.41015625, "step": 1786 }, { "epoch": 0.33766356464641695, "grad_norm": 2.541029321607929, "learning_rate": 8.542130970308069e-07, "logits/chosen": 2.640625, "logits/rejected": 2.296875, "logps/chosen": -661.0, "logps/rejected": -843.0, "loss": 0.6377, "rewards/accuracies": 0.8125, "rewards/chosen": 0.835205078125, "rewards/margins": 5.0546875, "rewards/rejected": -4.23046875, "step": 1787 }, { "epoch": 0.3378525201946242, "grad_norm": 3.2927250027340675, "learning_rate": 8.53994317377545e-07, "logits/chosen": 2.107421875, "logits/rejected": 2.28125, "logps/chosen": -870.5, "logps/rejected": -895.0, "loss": 0.6896, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6451416015625, "rewards/margins": 2.36474609375, "rewards/rejected": -1.7177734375, "step": 1788 }, { "epoch": 0.3380414757428315, "grad_norm": 2.1636946741322904, "learning_rate": 8.537754054717398e-07, "logits/chosen": 2.26171875, "logits/rejected": 1.76953125, "logps/chosen": -730.0, "logps/rejected": -784.5, "loss": 0.6335, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0576171875, "rewards/margins": 4.11328125, "rewards/rejected": -3.05859375, "step": 1789 }, { "epoch": 0.3382304312910388, "grad_norm": 2.3851454900347226, "learning_rate": 8.535563614086289e-07, "logits/chosen": 2.59375, "logits/rejected": 2.58203125, "logps/chosen": -445.0, "logps/rejected": -614.0, "loss": 0.6465, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4814453125, "rewards/margins": 3.431640625, "rewards/rejected": -2.951171875, "step": 1790 }, { "epoch": 0.33841938683924605, "grad_norm": 1.5775022103148164, "learning_rate": 8.533371852835075e-07, "logits/chosen": 2.53125, "logits/rejected": 2.658203125, "logps/chosen": -734.0, "logps/rejected": -18814.0, "loss": 0.599, "rewards/accuracies": 0.75, "rewards/chosen": 0.7540283203125, "rewards/margins": 18.03125, "rewards/rejected": -17.3046875, "step": 1791 }, { "epoch": 0.33860834238745335, "grad_norm": 1.7328809469399504, "learning_rate": 8.531178771917278e-07, "logits/chosen": 2.515625, "logits/rejected": 2.4375, "logps/chosen": -975.0, "logps/rejected": -802.0, "loss": 0.5665, "rewards/accuracies": 0.84375, "rewards/chosen": 0.98046875, "rewards/margins": 3.83154296875, "rewards/rejected": -2.849609375, "step": 1792 }, { "epoch": 0.33879729793566066, "grad_norm": 2.2864425826925845, "learning_rate": 8.528984372286998e-07, "logits/chosen": 1.7021484375, "logits/rejected": 1.16259765625, "logps/chosen": -790.0, "logps/rejected": -567.5, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": 0.6103515625, "rewards/margins": 3.515625, "rewards/rejected": -2.8984375, "step": 1793 }, { "epoch": 0.3389862534838679, "grad_norm": 2.5958282772078687, "learning_rate": 8.526788654898906e-07, "logits/chosen": 2.16748046875, "logits/rejected": 2.1318359375, "logps/chosen": -899.0, "logps/rejected": -1073.0, "loss": 0.6818, "rewards/accuracies": 0.71875, "rewards/chosen": 0.677734375, "rewards/margins": 2.75390625, "rewards/rejected": -2.0732421875, "step": 1794 }, { "epoch": 0.3391752090320752, "grad_norm": 2.141717089285907, "learning_rate": 8.524591620708251e-07, "logits/chosen": 1.634521484375, "logits/rejected": 1.49755859375, "logps/chosen": -1025.0, "logps/rejected": -919.5, "loss": 0.7212, "rewards/accuracies": 0.78125, "rewards/chosen": 0.76513671875, "rewards/margins": 2.67578125, "rewards/rejected": -1.9140625, "step": 1795 }, { "epoch": 0.3393641645802825, "grad_norm": 2.072902816792752, "learning_rate": 8.522393270670846e-07, "logits/chosen": 2.8193359375, "logits/rejected": 2.9384765625, "logps/chosen": -661.0, "logps/rejected": -2769.0, "loss": 0.6039, "rewards/accuracies": 0.8125, "rewards/chosen": 1.08203125, "rewards/margins": 4.703125, "rewards/rejected": -3.6171875, "step": 1796 }, { "epoch": 0.33955312012848976, "grad_norm": 1.7413180901280272, "learning_rate": 8.520193605743085e-07, "logits/chosen": 2.369140625, "logits/rejected": 2.26171875, "logps/chosen": -614.5, "logps/rejected": -734.0, "loss": 0.6447, "rewards/accuracies": 0.71875, "rewards/chosen": 0.533203125, "rewards/margins": 2.6015625, "rewards/rejected": -2.0673828125, "step": 1797 }, { "epoch": 0.33974207567669706, "grad_norm": 1.8573506532491573, "learning_rate": 8.51799262688193e-07, "logits/chosen": 1.923828125, "logits/rejected": 1.73095703125, "logps/chosen": -584.0, "logps/rejected": -479.0, "loss": 0.6546, "rewards/accuracies": 0.71875, "rewards/chosen": 0.63153076171875, "rewards/margins": 2.91015625, "rewards/rejected": -2.27734375, "step": 1798 }, { "epoch": 0.33993103122490437, "grad_norm": 2.03532387910954, "learning_rate": 8.515790335044917e-07, "logits/chosen": 3.41015625, "logits/rejected": 2.353515625, "logps/chosen": -881.5, "logps/rejected": -606.5, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.621826171875, "rewards/margins": 3.34375, "rewards/rejected": -2.7197265625, "step": 1799 }, { "epoch": 0.3401199867731116, "grad_norm": 1.7721823929257223, "learning_rate": 8.513586731190149e-07, "logits/chosen": 1.8984375, "logits/rejected": 1.74609375, "logps/chosen": -876.5, "logps/rejected": -803.0, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 1.09375, "rewards/margins": 3.388671875, "rewards/rejected": -2.296875, "step": 1800 }, { "epoch": 0.3403089423213189, "grad_norm": 2.0247815932900806, "learning_rate": 8.511381816276306e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.1103515625, "logps/chosen": -865.0, "logps/rejected": -713.5, "loss": 0.5847, "rewards/accuracies": 0.875, "rewards/chosen": 0.91015625, "rewards/margins": 3.615234375, "rewards/rejected": -2.703125, "step": 1801 }, { "epoch": 0.34049789786952617, "grad_norm": 1.7907830604602577, "learning_rate": 8.509175591262632e-07, "logits/chosen": 2.736328125, "logits/rejected": 2.623046875, "logps/chosen": -691.0, "logps/rejected": -1653.0, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 0.609375, "rewards/margins": 4.16796875, "rewards/rejected": -3.552734375, "step": 1802 }, { "epoch": 0.34068685341773347, "grad_norm": 2.544412307872412, "learning_rate": 8.506968057108944e-07, "logits/chosen": 2.1767578125, "logits/rejected": 2.099365234375, "logps/chosen": -712.0, "logps/rejected": -1202.0, "loss": 0.5454, "rewards/accuracies": 0.8125, "rewards/chosen": 0.788330078125, "rewards/margins": 5.25, "rewards/rejected": -4.4765625, "step": 1803 }, { "epoch": 0.3408758089659408, "grad_norm": 1.8093569324506302, "learning_rate": 8.504759214775631e-07, "logits/chosen": 3.20703125, "logits/rejected": 3.17578125, "logps/chosen": -639.5, "logps/rejected": -703.0, "loss": 0.5003, "rewards/accuracies": 0.90625, "rewards/chosen": 0.987060546875, "rewards/margins": 4.9453125, "rewards/rejected": -3.95703125, "step": 1804 }, { "epoch": 0.341064764514148, "grad_norm": 3.238359382560332, "learning_rate": 8.502549065223645e-07, "logits/chosen": 2.544921875, "logits/rejected": 2.32958984375, "logps/chosen": -1057.0, "logps/rejected": -954.0, "loss": 0.5036, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3017578125, "rewards/margins": 4.921875, "rewards/rejected": -3.6171875, "step": 1805 }, { "epoch": 0.3412537200623553, "grad_norm": 2.3881447764901225, "learning_rate": 8.500337609414514e-07, "logits/chosen": 1.870849609375, "logits/rejected": 1.7197265625, "logps/chosen": -561.25, "logps/rejected": -1050.5, "loss": 0.6833, "rewards/accuracies": 0.875, "rewards/chosen": 0.224609375, "rewards/margins": 3.2802734375, "rewards/rejected": -3.056640625, "step": 1806 }, { "epoch": 0.34144267561056263, "grad_norm": 1.4514148963715126, "learning_rate": 8.498124848310328e-07, "logits/chosen": 3.1328125, "logits/rejected": 3.15625, "logps/chosen": -845.5, "logps/rejected": -1049.0, "loss": 0.5027, "rewards/accuracies": 0.96875, "rewards/chosen": 0.955078125, "rewards/margins": 5.28125, "rewards/rejected": -4.3203125, "step": 1807 }, { "epoch": 0.3416316311587699, "grad_norm": 3.057419159986763, "learning_rate": 8.495910782873747e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.689453125, "logps/chosen": -617.0, "logps/rejected": -1005.5, "loss": 0.5905, "rewards/accuracies": 0.90625, "rewards/chosen": 0.561767578125, "rewards/margins": 4.40625, "rewards/rejected": -3.8359375, "step": 1808 }, { "epoch": 0.3418205867069772, "grad_norm": 1.9206481329751086, "learning_rate": 8.493695414068001e-07, "logits/chosen": 1.694091796875, "logits/rejected": 1.828125, "logps/chosen": -946.5, "logps/rejected": -1946.0, "loss": 0.5251, "rewards/accuracies": 0.96875, "rewards/chosen": 0.98828125, "rewards/margins": 4.9453125, "rewards/rejected": -3.95703125, "step": 1809 }, { "epoch": 0.3420095422551845, "grad_norm": 1.6759788993563423, "learning_rate": 8.491478742856881e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.708984375, "logps/chosen": -622.5, "logps/rejected": -887.0, "loss": 0.6458, "rewards/accuracies": 0.78125, "rewards/chosen": 1.154296875, "rewards/margins": 3.944061279296875, "rewards/rejected": -2.7890625, "step": 1810 }, { "epoch": 0.34219849780339173, "grad_norm": 2.125950828632117, "learning_rate": 8.489260770204754e-07, "logits/chosen": 1.2548828125, "logits/rejected": 1.00390625, "logps/chosen": -800.0, "logps/rejected": -654.0, "loss": 0.6087, "rewards/accuracies": 0.84375, "rewards/chosen": 0.708892822265625, "rewards/margins": 3.51171875, "rewards/rejected": -2.8046875, "step": 1811 }, { "epoch": 0.34238745335159904, "grad_norm": 3.5836160308975544, "learning_rate": 8.487041497076542e-07, "logits/chosen": 1.90234375, "logits/rejected": 1.7783203125, "logps/chosen": -948.0, "logps/rejected": -1815.0, "loss": 0.5513, "rewards/accuracies": 0.78125, "rewards/chosen": 1.505859375, "rewards/margins": 6.1328125, "rewards/rejected": -4.619140625, "step": 1812 }, { "epoch": 0.34257640889980634, "grad_norm": 2.7498880886415944, "learning_rate": 8.484820924437744e-07, "logits/chosen": 2.30859375, "logits/rejected": 1.7841796875, "logps/chosen": -1048.0, "logps/rejected": -805.0, "loss": 0.5862, "rewards/accuracies": 0.84375, "rewards/chosen": 0.779296875, "rewards/margins": 3.84765625, "rewards/rejected": -3.0703125, "step": 1813 }, { "epoch": 0.3427653644480136, "grad_norm": 1.9976970446298312, "learning_rate": 8.482599053254413e-07, "logits/chosen": 1.6259765625, "logits/rejected": 1.77294921875, "logps/chosen": -341.5, "logps/rejected": -451.0, "loss": 0.64, "rewards/accuracies": 0.84375, "rewards/chosen": 0.70166015625, "rewards/margins": 2.84765625, "rewards/rejected": -2.1435546875, "step": 1814 }, { "epoch": 0.3429543199962209, "grad_norm": 1.8703530886566588, "learning_rate": 8.480375884493178e-07, "logits/chosen": 1.916015625, "logits/rejected": 1.583984375, "logps/chosen": -1136.0, "logps/rejected": -959.0, "loss": 0.5632, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5, "rewards/margins": 4.390625, "rewards/rejected": -2.888671875, "step": 1815 }, { "epoch": 0.3431432755444282, "grad_norm": 2.462577657734217, "learning_rate": 8.478151419121225e-07, "logits/chosen": 2.451171875, "logits/rejected": 2.255859375, "logps/chosen": -618.0, "logps/rejected": -695.0, "loss": 0.6606, "rewards/accuracies": 0.8125, "rewards/chosen": 0.732421875, "rewards/margins": 3.2734375, "rewards/rejected": -2.5478515625, "step": 1816 }, { "epoch": 0.34333223109263544, "grad_norm": 2.1029200897624443, "learning_rate": 8.475925658106303e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.447265625, "logps/chosen": -489.75, "logps/rejected": -613.0, "loss": 0.7587, "rewards/accuracies": 0.75, "rewards/chosen": 0.304443359375, "rewards/margins": 2.5390625, "rewards/rejected": -2.232421875, "step": 1817 }, { "epoch": 0.34352118664084275, "grad_norm": 2.3940001451147026, "learning_rate": 8.473698602416733e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.109375, "logps/chosen": -709.0, "logps/rejected": -690.75, "loss": 0.5643, "rewards/accuracies": 0.84375, "rewards/chosen": 1.11572265625, "rewards/margins": 3.6142578125, "rewards/rejected": -2.50390625, "step": 1818 }, { "epoch": 0.34371014218905005, "grad_norm": 2.2610046595339877, "learning_rate": 8.471470253021392e-07, "logits/chosen": 2.4453125, "logits/rejected": 2.380859375, "logps/chosen": -899.0, "logps/rejected": -1121.0, "loss": 0.5929, "rewards/accuracies": 0.78125, "rewards/chosen": 1.298583984375, "rewards/margins": 4.67578125, "rewards/rejected": -3.375, "step": 1819 }, { "epoch": 0.3438990977372573, "grad_norm": 2.2652250886795176, "learning_rate": 8.469240610889724e-07, "logits/chosen": 2.5576171875, "logits/rejected": 3.08984375, "logps/chosen": -602.0, "logps/rejected": -789.0, "loss": 0.659, "rewards/accuracies": 0.75, "rewards/chosen": 0.5693359375, "rewards/margins": 3.3203125, "rewards/rejected": -2.74609375, "step": 1820 }, { "epoch": 0.3440880532854646, "grad_norm": 1.9082258002112487, "learning_rate": 8.467009676991731e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.73046875, "logps/chosen": -899.0, "logps/rejected": -1061.0, "loss": 0.6236, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9207763671875, "rewards/margins": 4.171875, "rewards/rejected": -3.25390625, "step": 1821 }, { "epoch": 0.3442770088336719, "grad_norm": 1.8167600814775011, "learning_rate": 8.464777452297978e-07, "logits/chosen": 1.990234375, "logits/rejected": 1.82421875, "logps/chosen": -873.0, "logps/rejected": -836.0, "loss": 0.5974, "rewards/accuracies": 0.875, "rewards/chosen": 0.875732421875, "rewards/margins": 3.76953125, "rewards/rejected": -2.8857421875, "step": 1822 }, { "epoch": 0.34446596438187915, "grad_norm": 2.3582092459110693, "learning_rate": 8.462543937779598e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.4296875, "logps/chosen": -913.0, "logps/rejected": -884.0, "loss": 0.7464, "rewards/accuracies": 0.75, "rewards/chosen": 0.328857421875, "rewards/margins": 2.751953125, "rewards/rejected": -2.42926025390625, "step": 1823 }, { "epoch": 0.34465491993008646, "grad_norm": 2.1313691623336353, "learning_rate": 8.460309134408276e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.4921875, "logps/chosen": -625.0, "logps/rejected": -900.5, "loss": 0.641, "rewards/accuracies": 0.84375, "rewards/chosen": 0.56689453125, "rewards/margins": 3.18359375, "rewards/rejected": -2.615234375, "step": 1824 }, { "epoch": 0.3448438754782937, "grad_norm": 2.1530686022992804, "learning_rate": 8.458073043156264e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.48828125, "logps/chosen": -510.5, "logps/rejected": -678.0, "loss": 0.6283, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1318359375, "rewards/margins": 4.4609375, "rewards/rejected": -4.33984375, "step": 1825 }, { "epoch": 0.345032831026501, "grad_norm": 2.31781365093992, "learning_rate": 8.455835664996371e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.751953125, "logps/chosen": -876.25, "logps/rejected": -1506.0, "loss": 0.7252, "rewards/accuracies": 0.78125, "rewards/chosen": 0.998046875, "rewards/margins": 3.3193359375, "rewards/rejected": -2.3291015625, "step": 1826 }, { "epoch": 0.3452217865747083, "grad_norm": 2.3539858772894786, "learning_rate": 8.453597000901967e-07, "logits/chosen": 2.3515625, "logits/rejected": 2.42578125, "logps/chosen": -645.0, "logps/rejected": -892.0, "loss": 0.606, "rewards/accuracies": 0.75, "rewards/chosen": 1.19921875, "rewards/margins": 4.005859375, "rewards/rejected": -2.80712890625, "step": 1827 }, { "epoch": 0.34541074212291556, "grad_norm": 1.7651301842509544, "learning_rate": 8.451357051846983e-07, "logits/chosen": 2.70703125, "logits/rejected": 2.06494140625, "logps/chosen": -731.25, "logps/rejected": -605.0, "loss": 0.5391, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7838134765625, "rewards/margins": 4.4296875, "rewards/rejected": -3.65234375, "step": 1828 }, { "epoch": 0.34559969767112286, "grad_norm": 1.5702282970747239, "learning_rate": 8.449115818805906e-07, "logits/chosen": 1.810546875, "logits/rejected": 2.1484375, "logps/chosen": -579.5, "logps/rejected": -786.5, "loss": 0.7318, "rewards/accuracies": 0.78125, "rewards/chosen": 0.26611328125, "rewards/margins": 2.2734375, "rewards/rejected": -2.0078125, "step": 1829 }, { "epoch": 0.34578865321933017, "grad_norm": 1.5214049804315424, "learning_rate": 8.446873302753783e-07, "logits/chosen": 2.830078125, "logits/rejected": 2.7890625, "logps/chosen": -631.5, "logps/rejected": -570.5, "loss": 0.5864, "rewards/accuracies": 0.84375, "rewards/chosen": 0.660400390625, "rewards/margins": 3.85546875, "rewards/rejected": -3.1953125, "step": 1830 }, { "epoch": 0.3459776087675374, "grad_norm": 2.3194199171901446, "learning_rate": 8.444629504666219e-07, "logits/chosen": 2.146484375, "logits/rejected": 1.841796875, "logps/chosen": -743.0, "logps/rejected": -563.0, "loss": 0.5627, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3291015625, "rewards/margins": 3.61328125, "rewards/rejected": -2.287109375, "step": 1831 }, { "epoch": 0.3461665643157447, "grad_norm": 2.281426758015515, "learning_rate": 8.442384425519378e-07, "logits/chosen": 2.2099609375, "logits/rejected": 2.28076171875, "logps/chosen": -919.5, "logps/rejected": -680.5, "loss": 0.6956, "rewards/accuracies": 0.6875, "rewards/chosen": 0.984375, "rewards/margins": 3.451171875, "rewards/rejected": -2.46484375, "step": 1832 }, { "epoch": 0.346355519863952, "grad_norm": 2.0974616347315016, "learning_rate": 8.44013806628998e-07, "logits/chosen": 1.6337890625, "logits/rejected": 1.65234375, "logps/chosen": -657.75, "logps/rejected": -658.5, "loss": 0.6622, "rewards/accuracies": 0.875, "rewards/chosen": 0.6695556640625, "rewards/margins": 2.912109375, "rewards/rejected": -2.240234375, "step": 1833 }, { "epoch": 0.34654447541215927, "grad_norm": 1.823194384017629, "learning_rate": 8.437890427955303e-07, "logits/chosen": 2.357421875, "logits/rejected": 2.267578125, "logps/chosen": -481.0, "logps/rejected": -560.0, "loss": 0.7484, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5146484375, "rewards/margins": 2.376953125, "rewards/rejected": -1.859375, "step": 1834 }, { "epoch": 0.3467334309603666, "grad_norm": 2.691052387841538, "learning_rate": 8.43564151149318e-07, "logits/chosen": 3.14453125, "logits/rejected": 1.919921875, "logps/chosen": -1117.5, "logps/rejected": -978.5, "loss": 0.5274, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6729736328125, "rewards/margins": 5.802734375, "rewards/rejected": -5.12890625, "step": 1835 }, { "epoch": 0.3469223865085739, "grad_norm": 2.9017253788335573, "learning_rate": 8.433391317882e-07, "logits/chosen": 1.71484375, "logits/rejected": 1.35791015625, "logps/chosen": -577.5, "logps/rejected": -694.5, "loss": 0.7073, "rewards/accuracies": 0.78125, "rewards/chosen": 0.078125, "rewards/margins": 3.09375, "rewards/rejected": -3.013671875, "step": 1836 }, { "epoch": 0.3471113420567811, "grad_norm": 2.591343248632196, "learning_rate": 8.431139848100708e-07, "logits/chosen": 2.248046875, "logits/rejected": 2.294921875, "logps/chosen": -1151.5, "logps/rejected": -1910.0, "loss": 0.6032, "rewards/accuracies": 0.8125, "rewards/chosen": 1.42919921875, "rewards/margins": 5.765625, "rewards/rejected": -4.341796875, "step": 1837 }, { "epoch": 0.34730029760498843, "grad_norm": 2.1316592799538983, "learning_rate": 8.428887103128807e-07, "logits/chosen": 2.0546875, "logits/rejected": 1.7900390625, "logps/chosen": -1128.0, "logps/rejected": -1272.0, "loss": 0.4406, "rewards/accuracies": 0.875, "rewards/chosen": 0.823486328125, "rewards/margins": 7.765625, "rewards/rejected": -6.95703125, "step": 1838 }, { "epoch": 0.34748925315319573, "grad_norm": 2.1774511234911214, "learning_rate": 8.426633083946351e-07, "logits/chosen": 1.744140625, "logits/rejected": 1.1435546875, "logps/chosen": -1159.0, "logps/rejected": -936.0, "loss": 0.5405, "rewards/accuracies": 0.8125, "rewards/chosen": 1.341796875, "rewards/margins": 5.484375, "rewards/rejected": -4.15234375, "step": 1839 }, { "epoch": 0.347678208701403, "grad_norm": 2.3368920289853476, "learning_rate": 8.424377791533947e-07, "logits/chosen": 2.375, "logits/rejected": 2.322265625, "logps/chosen": -698.0, "logps/rejected": -863.0, "loss": 0.5765, "rewards/accuracies": 0.8125, "rewards/chosen": -0.101287841796875, "rewards/margins": 5.15234375, "rewards/rejected": -5.25, "step": 1840 }, { "epoch": 0.3478671642496103, "grad_norm": 2.6786536830219836, "learning_rate": 8.422121226872764e-07, "logits/chosen": 1.814453125, "logits/rejected": 1.8046875, "logps/chosen": -799.0, "logps/rejected": -725.0, "loss": 0.6648, "rewards/accuracies": 0.75, "rewards/chosen": 0.333740234375, "rewards/margins": 3.5703125, "rewards/rejected": -3.2265625, "step": 1841 }, { "epoch": 0.3480561197978176, "grad_norm": 2.134121519694596, "learning_rate": 8.419863390944515e-07, "logits/chosen": 2.294921875, "logits/rejected": 2.0390625, "logps/chosen": -1320.0, "logps/rejected": -1069.0, "loss": 0.486, "rewards/accuracies": 0.875, "rewards/chosen": 1.560546875, "rewards/margins": 4.8984375, "rewards/rejected": -3.33984375, "step": 1842 }, { "epoch": 0.34824507534602483, "grad_norm": 2.052001407283108, "learning_rate": 8.417604284731473e-07, "logits/chosen": 1.779296875, "logits/rejected": 1.7607421875, "logps/chosen": -444.0, "logps/rejected": -658.5, "loss": 0.6937, "rewards/accuracies": 0.625, "rewards/chosen": 0.10791015625, "rewards/margins": 3.806640625, "rewards/rejected": -3.69921875, "step": 1843 }, { "epoch": 0.34843403089423214, "grad_norm": 12.352172194780643, "learning_rate": 8.415343909216457e-07, "logits/chosen": 2.19140625, "logits/rejected": 2.466796875, "logps/chosen": -1064.0, "logps/rejected": -1101.0, "loss": 0.691, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08978271484375, "rewards/margins": 3.671875, "rewards/rejected": -3.580078125, "step": 1844 }, { "epoch": 0.34862298644243944, "grad_norm": 3.4030070666822057, "learning_rate": 8.413082265382844e-07, "logits/chosen": 2.048828125, "logits/rejected": 1.287109375, "logps/chosen": -722.5, "logps/rejected": -759.0, "loss": 0.6139, "rewards/accuracies": 0.875, "rewards/chosen": 0.355712890625, "rewards/margins": 6.19140625, "rewards/rejected": -5.82421875, "step": 1845 }, { "epoch": 0.3488119419906467, "grad_norm": 3.627900044424052, "learning_rate": 8.410819354214563e-07, "logits/chosen": 2.1162109375, "logits/rejected": 1.6298828125, "logps/chosen": -751.5, "logps/rejected": -859.0, "loss": 0.7369, "rewards/accuracies": 0.75, "rewards/chosen": -0.167724609375, "rewards/margins": 4.4140625, "rewards/rejected": -4.583984375, "step": 1846 }, { "epoch": 0.349000897538854, "grad_norm": 2.0791138960908615, "learning_rate": 8.408555176696088e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.412109375, "logps/chosen": -1074.0, "logps/rejected": -1978.0, "loss": 0.4712, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5576171875, "rewards/margins": 7.5703125, "rewards/rejected": -6.01171875, "step": 1847 }, { "epoch": 0.34918985308706124, "grad_norm": 2.8592040007406485, "learning_rate": 8.406289733812451e-07, "logits/chosen": 3.1953125, "logits/rejected": 3.43359375, "logps/chosen": -1132.0, "logps/rejected": -1415.0, "loss": 0.5998, "rewards/accuracies": 0.75, "rewards/chosen": 0.842529296875, "rewards/margins": 4.60546875, "rewards/rejected": -3.76171875, "step": 1848 }, { "epoch": 0.34937880863526855, "grad_norm": 2.013033265020537, "learning_rate": 8.40402302654923e-07, "logits/chosen": 3.11328125, "logits/rejected": 3.33203125, "logps/chosen": -855.0, "logps/rejected": -1299.0, "loss": 0.5192, "rewards/accuracies": 0.875, "rewards/chosen": 0.7783203125, "rewards/margins": 6.1328125, "rewards/rejected": -5.3515625, "step": 1849 }, { "epoch": 0.34956776418347585, "grad_norm": 3.497725310835945, "learning_rate": 8.401755055892557e-07, "logits/chosen": 1.9296875, "logits/rejected": 1.95703125, "logps/chosen": -682.0, "logps/rejected": -1329.0, "loss": 0.6263, "rewards/accuracies": 0.90625, "rewards/chosen": -0.300048828125, "rewards/margins": 6.15625, "rewards/rejected": -6.47265625, "step": 1850 }, { "epoch": 0.3497567197316831, "grad_norm": 3.03680735028017, "learning_rate": 8.399485822829108e-07, "logits/chosen": 2.1796875, "logits/rejected": 2.1484375, "logps/chosen": -17770.0, "logps/rejected": -18138.0, "loss": 0.7792, "rewards/accuracies": 0.75, "rewards/chosen": -14.041015625, "rewards/margins": 3.75, "rewards/rejected": -17.83203125, "step": 1851 }, { "epoch": 0.3499456752798904, "grad_norm": 1.7095777605048905, "learning_rate": 8.397215328346114e-07, "logits/chosen": 2.3671875, "logits/rejected": 2.5927734375, "logps/chosen": -1057.0, "logps/rejected": -2015.0, "loss": 0.5687, "rewards/accuracies": 0.8125, "rewards/chosen": 0.162109375, "rewards/margins": 8.453125, "rewards/rejected": -8.28515625, "step": 1852 }, { "epoch": 0.3501346308280977, "grad_norm": 3.305327954746026, "learning_rate": 8.394943573431351e-07, "logits/chosen": 2.1796875, "logits/rejected": 2.3359375, "logps/chosen": -801.5, "logps/rejected": -899.5, "loss": 0.6767, "rewards/accuracies": 0.84375, "rewards/chosen": -0.23388671875, "rewards/margins": 3.8125, "rewards/rejected": -4.044921875, "step": 1853 }, { "epoch": 0.35032358637630495, "grad_norm": 1.415982880076099, "learning_rate": 8.392670559073144e-07, "logits/chosen": 3.271484375, "logits/rejected": 3.171875, "logps/chosen": -610.0, "logps/rejected": -763.0, "loss": 0.641, "rewards/accuracies": 0.78125, "rewards/chosen": 0.787109375, "rewards/margins": 3.66015625, "rewards/rejected": -2.8671875, "step": 1854 }, { "epoch": 0.35051254192451226, "grad_norm": 1.5859703031846948, "learning_rate": 8.390396286260369e-07, "logits/chosen": 2.32763671875, "logits/rejected": 2.64208984375, "logps/chosen": -861.0, "logps/rejected": -1327.0, "loss": 0.6519, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2197265625, "rewards/margins": 6.77734375, "rewards/rejected": -7.0, "step": 1855 }, { "epoch": 0.35070149747271956, "grad_norm": 3.4561382215110044, "learning_rate": 8.388120755982444e-07, "logits/chosen": 2.681640625, "logits/rejected": 2.9296875, "logps/chosen": -776.0, "logps/rejected": -1982.0, "loss": 0.542, "rewards/accuracies": 0.875, "rewards/chosen": 0.876953125, "rewards/margins": 5.9453125, "rewards/rejected": -5.0703125, "step": 1856 }, { "epoch": 0.3508904530209268, "grad_norm": 2.0007204240501912, "learning_rate": 8.385843969229338e-07, "logits/chosen": 2.07421875, "logits/rejected": 1.81640625, "logps/chosen": -999.0, "logps/rejected": -974.0, "loss": 0.5097, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65869140625, "rewards/margins": 4.265625, "rewards/rejected": -3.6015625, "step": 1857 }, { "epoch": 0.3510794085691341, "grad_norm": 2.1293487090211993, "learning_rate": 8.383565926991567e-07, "logits/chosen": 3.4609375, "logits/rejected": 2.880859375, "logps/chosen": -1283.0, "logps/rejected": -1197.5, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": 0.9306640625, "rewards/margins": 3.3125, "rewards/rejected": -2.380859375, "step": 1858 }, { "epoch": 0.3512683641173414, "grad_norm": 2.462000783822831, "learning_rate": 8.38128663026019e-07, "logits/chosen": 2.45703125, "logits/rejected": 2.53125, "logps/chosen": -598.0, "logps/rejected": -747.0, "loss": 0.7252, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4697265625, "rewards/margins": 2.5673828125, "rewards/rejected": -2.10302734375, "step": 1859 }, { "epoch": 0.35145731966554866, "grad_norm": 2.3830743187827084, "learning_rate": 8.379006080026815e-07, "logits/chosen": 2.744140625, "logits/rejected": 2.423828125, "logps/chosen": -877.5, "logps/rejected": -1011.5, "loss": 0.5762, "rewards/accuracies": 0.8125, "rewards/chosen": 1.587890625, "rewards/margins": 3.70703125, "rewards/rejected": -2.1181640625, "step": 1860 }, { "epoch": 0.35164627521375597, "grad_norm": 2.8348093845485907, "learning_rate": 8.376724277283595e-07, "logits/chosen": 2.140625, "logits/rejected": 2.6640625, "logps/chosen": -687.5, "logps/rejected": -1117.0, "loss": 0.6996, "rewards/accuracies": 0.75, "rewards/chosen": 0.166015625, "rewards/margins": 3.84375, "rewards/rejected": -3.671875, "step": 1861 }, { "epoch": 0.35183523076196327, "grad_norm": 1.7734423657223537, "learning_rate": 8.374441223023226e-07, "logits/chosen": 2.341796875, "logits/rejected": 2.302734375, "logps/chosen": -1190.0, "logps/rejected": -1056.5, "loss": 0.5119, "rewards/accuracies": 0.875, "rewards/chosen": 1.39453125, "rewards/margins": 4.6875, "rewards/rejected": -3.296875, "step": 1862 }, { "epoch": 0.3520241863101705, "grad_norm": 2.206678017088816, "learning_rate": 8.372156918238946e-07, "logits/chosen": 2.166015625, "logits/rejected": 2.484375, "logps/chosen": -708.0, "logps/rejected": -1603.0, "loss": 0.5806, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7921142578125, "rewards/margins": 4.76171875, "rewards/rejected": -3.9765625, "step": 1863 }, { "epoch": 0.3522131418583778, "grad_norm": 3.2586659646840976, "learning_rate": 8.369871363924545e-07, "logits/chosen": 2.802734375, "logits/rejected": 2.79296875, "logps/chosen": -554.0, "logps/rejected": -558.0, "loss": 0.5808, "rewards/accuracies": 0.8125, "rewards/chosen": 1.341796875, "rewards/margins": 3.23046875, "rewards/rejected": -1.888671875, "step": 1864 }, { "epoch": 0.3524020974065851, "grad_norm": 2.3155773471286634, "learning_rate": 8.367584561074352e-07, "logits/chosen": 3.3046875, "logits/rejected": 3.45703125, "logps/chosen": -903.5, "logps/rejected": -901.5, "loss": 0.6373, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5751953125, "rewards/margins": 4.9375, "rewards/rejected": -4.35546875, "step": 1865 }, { "epoch": 0.35259105295479237, "grad_norm": 2.4033652439792177, "learning_rate": 8.365296510683234e-07, "logits/chosen": 2.96484375, "logits/rejected": 2.9140625, "logps/chosen": -1022.0, "logps/rejected": -1130.0, "loss": 0.5761, "rewards/accuracies": 0.90625, "rewards/chosen": 0.56658935546875, "rewards/margins": 3.94921875, "rewards/rejected": -3.3671875, "step": 1866 }, { "epoch": 0.3527800085029997, "grad_norm": 2.740384758451312, "learning_rate": 8.363007213746612e-07, "logits/chosen": 3.2578125, "logits/rejected": 3.224609375, "logps/chosen": -797.0, "logps/rejected": -824.0, "loss": 0.6719, "rewards/accuracies": 0.71875, "rewards/chosen": 0.22216796875, "rewards/margins": 2.294921875, "rewards/rejected": -2.076171875, "step": 1867 }, { "epoch": 0.352968964051207, "grad_norm": 1.7465033854140517, "learning_rate": 8.360716671260441e-07, "logits/chosen": 1.6097412109375, "logits/rejected": 1.684326171875, "logps/chosen": -980.0, "logps/rejected": -1226.0, "loss": 0.502, "rewards/accuracies": 0.84375, "rewards/chosen": 1.21630859375, "rewards/margins": 5.078125, "rewards/rejected": -3.859375, "step": 1868 }, { "epoch": 0.3531579195994142, "grad_norm": 2.8250142561057467, "learning_rate": 8.358424884221219e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.60546875, "logps/chosen": -868.0, "logps/rejected": -1724.0, "loss": 0.5731, "rewards/accuracies": 0.84375, "rewards/chosen": 1.19140625, "rewards/margins": 5.564453125, "rewards/rejected": -4.380859375, "step": 1869 }, { "epoch": 0.35334687514762153, "grad_norm": 2.0987997013220783, "learning_rate": 8.356131853625988e-07, "logits/chosen": 2.462890625, "logits/rejected": 2.5400390625, "logps/chosen": -850.0, "logps/rejected": -723.5, "loss": 0.6126, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2568359375, "rewards/margins": 3.54296875, "rewards/rejected": -2.296875, "step": 1870 }, { "epoch": 0.3535358306958288, "grad_norm": 3.984194440256632, "learning_rate": 8.353837580472328e-07, "logits/chosen": 1.6123046875, "logits/rejected": 1.07421875, "logps/chosen": -967.25, "logps/rejected": -988.5, "loss": 0.4272, "rewards/accuracies": 0.96875, "rewards/chosen": 1.49560546875, "rewards/margins": 5.359375, "rewards/rejected": -3.8671875, "step": 1871 }, { "epoch": 0.3537247862440361, "grad_norm": 2.0879593315925087, "learning_rate": 8.351542065758363e-07, "logits/chosen": 2.361328125, "logits/rejected": 2.169921875, "logps/chosen": -670.0, "logps/rejected": -767.0, "loss": 0.5887, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1552734375, "rewards/margins": 4.2421875, "rewards/rejected": -3.0859375, "step": 1872 }, { "epoch": 0.3539137417922434, "grad_norm": 2.123156599670955, "learning_rate": 8.349245310482754e-07, "logits/chosen": 3.76953125, "logits/rejected": 3.54296875, "logps/chosen": -532.25, "logps/rejected": -511.0, "loss": 0.6494, "rewards/accuracies": 0.8125, "rewards/chosen": 0.062774658203125, "rewards/margins": 2.78515625, "rewards/rejected": -2.720703125, "step": 1873 }, { "epoch": 0.35410269734045063, "grad_norm": 3.4439172928853785, "learning_rate": 8.346947315644702e-07, "logits/chosen": 2.376953125, "logits/rejected": 2.021484375, "logps/chosen": -986.0, "logps/rejected": -1290.5, "loss": 0.4281, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7587890625, "rewards/margins": 6.21484375, "rewards/rejected": -4.44921875, "step": 1874 }, { "epoch": 0.35429165288865794, "grad_norm": 2.81538313566261, "learning_rate": 8.344648082243952e-07, "logits/chosen": 2.6748046875, "logits/rejected": 2.51953125, "logps/chosen": -780.5, "logps/rejected": -1148.5, "loss": 0.531, "rewards/accuracies": 0.875, "rewards/chosen": 1.404296875, "rewards/margins": 5.72265625, "rewards/rejected": -4.3115234375, "step": 1875 }, { "epoch": 0.35448060843686524, "grad_norm": 3.729612657904329, "learning_rate": 8.34234761128078e-07, "logits/chosen": 2.03515625, "logits/rejected": 1.9765625, "logps/chosen": -675.0, "logps/rejected": -1506.0, "loss": 0.67, "rewards/accuracies": 0.75, "rewards/chosen": -0.174285888671875, "rewards/margins": 4.14453125, "rewards/rejected": -4.33203125, "step": 1876 }, { "epoch": 0.3546695639850725, "grad_norm": 3.020807054317691, "learning_rate": 8.340045903756007e-07, "logits/chosen": 2.853515625, "logits/rejected": 2.974609375, "logps/chosen": -787.5, "logps/rejected": -872.0, "loss": 0.6244, "rewards/accuracies": 0.78125, "rewards/chosen": 0.70440673828125, "rewards/margins": 4.72265625, "rewards/rejected": -4.03125, "step": 1877 }, { "epoch": 0.3548585195332798, "grad_norm": 1.881183725482175, "learning_rate": 8.337742960670988e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.9765625, "logps/chosen": -1525.0, "logps/rejected": -588.0, "loss": 0.7375, "rewards/accuracies": 0.75, "rewards/chosen": -0.39910888671875, "rewards/margins": 2.5234375, "rewards/rejected": -2.91796875, "step": 1878 }, { "epoch": 0.3550474750814871, "grad_norm": 1.7932896969693537, "learning_rate": 8.335438783027619e-07, "logits/chosen": 2.7421875, "logits/rejected": 2.80078125, "logps/chosen": -1064.0, "logps/rejected": -1119.0, "loss": 0.5494, "rewards/accuracies": 0.875, "rewards/chosen": 0.3876953125, "rewards/margins": 4.71875, "rewards/rejected": -4.32421875, "step": 1879 }, { "epoch": 0.35523643062969434, "grad_norm": 2.4494167641091504, "learning_rate": 8.333133371828328e-07, "logits/chosen": 2.283203125, "logits/rejected": 2.281494140625, "logps/chosen": -1082.0, "logps/rejected": -1036.0, "loss": 0.7368, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1551513671875, "rewards/margins": 3.2467041015625, "rewards/rejected": -3.09814453125, "step": 1880 }, { "epoch": 0.35542538617790165, "grad_norm": 2.3724854501907826, "learning_rate": 8.330826728076086e-07, "logits/chosen": 2.000732421875, "logits/rejected": 2.01708984375, "logps/chosen": -638.5, "logps/rejected": -733.5, "loss": 0.7378, "rewards/accuracies": 0.59375, "rewards/chosen": 0.117919921875, "rewards/margins": 2.8046875, "rewards/rejected": -2.6875, "step": 1881 }, { "epoch": 0.35561434172610895, "grad_norm": 2.052252882209756, "learning_rate": 8.328518852774396e-07, "logits/chosen": 2.55859375, "logits/rejected": 2.56640625, "logps/chosen": -750.0, "logps/rejected": -690.5, "loss": 0.6427, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0029296875, "rewards/margins": 3.546875, "rewards/rejected": -2.55078125, "step": 1882 }, { "epoch": 0.3558032972743162, "grad_norm": 2.4839884307145823, "learning_rate": 8.326209746927295e-07, "logits/chosen": 3.474609375, "logits/rejected": 3.63671875, "logps/chosen": -717.5, "logps/rejected": -743.0, "loss": 0.7622, "rewards/accuracies": 0.5625, "rewards/chosen": 0.331787109375, "rewards/margins": 2.88671875, "rewards/rejected": -2.55078125, "step": 1883 }, { "epoch": 0.3559922528225235, "grad_norm": 3.072132978608476, "learning_rate": 8.323899411539363e-07, "logits/chosen": 2.294921875, "logits/rejected": 1.7275390625, "logps/chosen": -851.5, "logps/rejected": -901.0, "loss": 0.557, "rewards/accuracies": 0.875, "rewards/chosen": 1.17236328125, "rewards/margins": 4.46484375, "rewards/rejected": -3.29296875, "step": 1884 }, { "epoch": 0.3561812083707308, "grad_norm": 1.8446856636338422, "learning_rate": 8.321587847615704e-07, "logits/chosen": 1.87890625, "logits/rejected": 2.6171875, "logps/chosen": -1084.0, "logps/rejected": -1590.0, "loss": 0.5118, "rewards/accuracies": 0.96875, "rewards/chosen": 1.63525390625, "rewards/margins": 6.71875, "rewards/rejected": -5.0859375, "step": 1885 }, { "epoch": 0.35637016391893805, "grad_norm": 2.6681256258173685, "learning_rate": 8.31927505616197e-07, "logits/chosen": 2.8701171875, "logits/rejected": 2.78125, "logps/chosen": -497.0, "logps/rejected": -659.5, "loss": 0.7012, "rewards/accuracies": 0.6875, "rewards/chosen": 0.61376953125, "rewards/margins": 2.748046875, "rewards/rejected": -2.1328125, "step": 1886 }, { "epoch": 0.35655911946714536, "grad_norm": 2.359093053597335, "learning_rate": 8.316961038184332e-07, "logits/chosen": 2.115234375, "logits/rejected": 2.166015625, "logps/chosen": -749.0, "logps/rejected": -727.0, "loss": 0.6444, "rewards/accuracies": 0.6875, "rewards/chosen": 0.637115478515625, "rewards/margins": 3.91796875, "rewards/rejected": -3.26953125, "step": 1887 }, { "epoch": 0.35674807501535266, "grad_norm": 1.7588395343565104, "learning_rate": 8.314645794689505e-07, "logits/chosen": 3.09765625, "logits/rejected": 2.796875, "logps/chosen": -866.5, "logps/rejected": -830.0, "loss": 0.557, "rewards/accuracies": 0.84375, "rewards/chosen": 1.32080078125, "rewards/margins": 4.05078125, "rewards/rejected": -2.73046875, "step": 1888 }, { "epoch": 0.3569370305635599, "grad_norm": 2.4127354188306533, "learning_rate": 8.312329326684738e-07, "logits/chosen": 3.19921875, "logits/rejected": 3.51171875, "logps/chosen": -650.0, "logps/rejected": -927.0, "loss": 0.5775, "rewards/accuracies": 0.78125, "rewards/chosen": 0.99853515625, "rewards/margins": 3.6171875, "rewards/rejected": -2.6171875, "step": 1889 }, { "epoch": 0.3571259861117672, "grad_norm": 2.094529595074821, "learning_rate": 8.310011635177803e-07, "logits/chosen": 2.51171875, "logits/rejected": 2.279296875, "logps/chosen": -712.5, "logps/rejected": -758.0, "loss": 0.7347, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1474609375, "rewards/margins": 3.25390625, "rewards/rejected": -3.4140625, "step": 1890 }, { "epoch": 0.3573149416599745, "grad_norm": 2.3768207376917547, "learning_rate": 8.307692721177014e-07, "logits/chosen": 2.90234375, "logits/rejected": 3.279296875, "logps/chosen": -1122.0, "logps/rejected": -1602.0, "loss": 0.5211, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7314453125, "rewards/margins": 5.7734375, "rewards/rejected": -4.04296875, "step": 1891 }, { "epoch": 0.35750389720818176, "grad_norm": 2.9654122107556877, "learning_rate": 8.305372585691212e-07, "logits/chosen": 1.6895751953125, "logits/rejected": 1.80859375, "logps/chosen": -395.5, "logps/rejected": -432.0, "loss": 0.7572, "rewards/accuracies": 0.65625, "rewards/chosen": 0.041015625, "rewards/margins": 2.818359375, "rewards/rejected": -2.78125, "step": 1892 }, { "epoch": 0.35769285275638907, "grad_norm": 4.0661766192650415, "learning_rate": 8.30305122972977e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.837890625, "logps/chosen": -542.5, "logps/rejected": -1015.0, "loss": 0.6728, "rewards/accuracies": 0.75, "rewards/chosen": -0.01708984375, "rewards/margins": 4.25, "rewards/rejected": -4.26171875, "step": 1893 }, { "epoch": 0.3578818083045963, "grad_norm": 1.8717662478265988, "learning_rate": 8.300728654302593e-07, "logits/chosen": 3.25390625, "logits/rejected": 2.7109375, "logps/chosen": -553.5, "logps/rejected": -733.5, "loss": 0.5631, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5992431640625, "rewards/margins": 4.44140625, "rewards/rejected": -3.83984375, "step": 1894 }, { "epoch": 0.3580707638528036, "grad_norm": 1.8104444155614257, "learning_rate": 8.298404860420118e-07, "logits/chosen": 2.109375, "logits/rejected": 1.70751953125, "logps/chosen": -729.0, "logps/rejected": -588.0, "loss": 0.625, "rewards/accuracies": 0.90625, "rewards/chosen": 0.52703857421875, "rewards/margins": 3.71484375, "rewards/rejected": -3.185546875, "step": 1895 }, { "epoch": 0.3582597194010109, "grad_norm": 2.0785529965366245, "learning_rate": 8.296079849093307e-07, "logits/chosen": 1.994140625, "logits/rejected": 2.251953125, "logps/chosen": -810.5, "logps/rejected": -803.5, "loss": 0.5384, "rewards/accuracies": 0.78125, "rewards/chosen": 0.87109375, "rewards/margins": 5.46875, "rewards/rejected": -4.58984375, "step": 1896 }, { "epoch": 0.35844867494921817, "grad_norm": 2.3751426716954165, "learning_rate": 8.293753621333656e-07, "logits/chosen": 1.76171875, "logits/rejected": 1.71875, "logps/chosen": -770.0, "logps/rejected": -752.0, "loss": 0.5266, "rewards/accuracies": 0.84375, "rewards/chosen": 0.52197265625, "rewards/margins": 3.96875, "rewards/rejected": -3.44140625, "step": 1897 }, { "epoch": 0.3586376304974255, "grad_norm": 1.9117595306882764, "learning_rate": 8.291426178153188e-07, "logits/chosen": 1.6513671875, "logits/rejected": 1.3193359375, "logps/chosen": -960.5, "logps/rejected": -854.0, "loss": 0.5822, "rewards/accuracies": 0.8125, "rewards/chosen": 0.67681884765625, "rewards/margins": 4.57421875, "rewards/rejected": -3.8984375, "step": 1898 }, { "epoch": 0.3588265860456328, "grad_norm": 2.169170315438563, "learning_rate": 8.28909752056446e-07, "logits/chosen": 2.6640625, "logits/rejected": 2.701171875, "logps/chosen": -786.5, "logps/rejected": -1570.5, "loss": 0.6355, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5400390625, "rewards/margins": 5.0625, "rewards/rejected": -4.51953125, "step": 1899 }, { "epoch": 0.35901554159384, "grad_norm": 1.9891849508156847, "learning_rate": 8.286767649580548e-07, "logits/chosen": 2.798828125, "logits/rejected": 3.07421875, "logps/chosen": -658.5, "logps/rejected": -1782.0, "loss": 0.7004, "rewards/accuracies": 0.71875, "rewards/chosen": 0.086669921875, "rewards/margins": 4.025390625, "rewards/rejected": -3.943359375, "step": 1900 }, { "epoch": 0.35920449714204733, "grad_norm": 2.3211014567858848, "learning_rate": 8.284436566215061e-07, "logits/chosen": 2.3642578125, "logits/rejected": 2.197265625, "logps/chosen": -1249.0, "logps/rejected": -1068.5, "loss": 0.5359, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7578125, "rewards/margins": 5.3203125, "rewards/rejected": -4.5703125, "step": 1901 }, { "epoch": 0.35939345269025463, "grad_norm": 1.723799210443928, "learning_rate": 8.28210427148214e-07, "logits/chosen": 2.462890625, "logits/rejected": 1.9443359375, "logps/chosen": -525.75, "logps/rejected": -548.5, "loss": 0.6684, "rewards/accuracies": 0.75, "rewards/chosen": 0.371337890625, "rewards/margins": 2.55859375, "rewards/rejected": -2.1875, "step": 1902 }, { "epoch": 0.3595824082384619, "grad_norm": 2.6453720459845926, "learning_rate": 8.279770766396445e-07, "logits/chosen": 2.283203125, "logits/rejected": 1.880859375, "logps/chosen": -902.0, "logps/rejected": -729.0, "loss": 0.5396, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7789306640625, "rewards/margins": 4.46484375, "rewards/rejected": -3.68359375, "step": 1903 }, { "epoch": 0.3597713637866692, "grad_norm": 2.215978826856319, "learning_rate": 8.277436051973166e-07, "logits/chosen": 1.701171875, "logits/rejected": 1.9521484375, "logps/chosen": -570.5, "logps/rejected": -929.0, "loss": 0.5727, "rewards/accuracies": 0.90625, "rewards/chosen": 0.431640625, "rewards/margins": 4.1796875, "rewards/rejected": -3.74609375, "step": 1904 }, { "epoch": 0.3599603193348765, "grad_norm": 2.282828961087828, "learning_rate": 8.275100129228019e-07, "logits/chosen": 2.142578125, "logits/rejected": 2.1953125, "logps/chosen": -1304.0, "logps/rejected": -3076.0, "loss": 0.4325, "rewards/accuracies": 0.875, "rewards/chosen": 1.048828125, "rewards/margins": 6.0078125, "rewards/rejected": -4.96484375, "step": 1905 }, { "epoch": 0.36014927488308374, "grad_norm": 3.088670956538432, "learning_rate": 8.272762999177247e-07, "logits/chosen": 2.2861328125, "logits/rejected": 1.9892578125, "logps/chosen": -582.0, "logps/rejected": -538.5, "loss": 0.5645, "rewards/accuracies": 0.875, "rewards/chosen": 0.55810546875, "rewards/margins": 4.50390625, "rewards/rejected": -3.94140625, "step": 1906 }, { "epoch": 0.36033823043129104, "grad_norm": 2.2536561790667746, "learning_rate": 8.270424662837617e-07, "logits/chosen": 2.572265625, "logits/rejected": 2.10546875, "logps/chosen": -765.0, "logps/rejected": -1011.0, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": 0.46600341796875, "rewards/margins": 3.423828125, "rewards/rejected": -2.955078125, "step": 1907 }, { "epoch": 0.36052718597949834, "grad_norm": 1.3633366924691133, "learning_rate": 8.268085121226419e-07, "logits/chosen": 2.8828125, "logits/rejected": 2.947265625, "logps/chosen": -1249.0, "logps/rejected": -1395.5, "loss": 0.5759, "rewards/accuracies": 0.84375, "rewards/chosen": 1.04144287109375, "rewards/margins": 5.16796875, "rewards/rejected": -4.12890625, "step": 1908 }, { "epoch": 0.3607161415277056, "grad_norm": 2.3620888208795385, "learning_rate": 8.265744375361472e-07, "logits/chosen": 2.14453125, "logits/rejected": 2.2587890625, "logps/chosen": -824.0, "logps/rejected": -973.5, "loss": 0.6342, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1978759765625, "rewards/margins": 6.453125, "rewards/rejected": -6.2578125, "step": 1909 }, { "epoch": 0.3609050970759129, "grad_norm": 2.3606312930312083, "learning_rate": 8.263402426261113e-07, "logits/chosen": 1.9345703125, "logits/rejected": 1.7294921875, "logps/chosen": -819.0, "logps/rejected": -678.0, "loss": 0.6235, "rewards/accuracies": 0.8125, "rewards/chosen": 0.409820556640625, "rewards/margins": 3.6015625, "rewards/rejected": -3.19140625, "step": 1910 }, { "epoch": 0.3610940526241202, "grad_norm": 2.424860242076653, "learning_rate": 8.261059274944209e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0859375, "logps/chosen": -620.5, "logps/rejected": -698.5, "loss": 0.5652, "rewards/accuracies": 0.84375, "rewards/chosen": 0.557861328125, "rewards/margins": 5.2421875, "rewards/rejected": -4.67578125, "step": 1911 }, { "epoch": 0.36128300817232745, "grad_norm": 1.9554297956591438, "learning_rate": 8.258714922430144e-07, "logits/chosen": 1.1201171875, "logits/rejected": 1.240234375, "logps/chosen": -1127.0, "logps/rejected": -1558.0, "loss": 0.5629, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7822265625, "rewards/margins": 5.45703125, "rewards/rejected": -4.6875, "step": 1912 }, { "epoch": 0.36147196372053475, "grad_norm": 3.1492752189967437, "learning_rate": 8.256369369738827e-07, "logits/chosen": 3.091796875, "logits/rejected": 2.5966796875, "logps/chosen": -702.0, "logps/rejected": -661.0, "loss": 0.5237, "rewards/accuracies": 0.875, "rewards/chosen": 0.2109375, "rewards/margins": 4.6875, "rewards/rejected": -4.46875, "step": 1913 }, { "epoch": 0.36166091926874205, "grad_norm": 2.7368006900707806, "learning_rate": 8.25402261789069e-07, "logits/chosen": 1.818359375, "logits/rejected": 1.603515625, "logps/chosen": -852.0, "logps/rejected": -1058.0, "loss": 0.5723, "rewards/accuracies": 0.90625, "rewards/chosen": 0.037109375, "rewards/margins": 5.59375, "rewards/rejected": -5.5546875, "step": 1914 }, { "epoch": 0.3618498748169493, "grad_norm": 2.0348298079899405, "learning_rate": 8.251674667906686e-07, "logits/chosen": 2.1484375, "logits/rejected": 2.3486328125, "logps/chosen": -702.0, "logps/rejected": -2023.5, "loss": 0.564, "rewards/accuracies": 0.875, "rewards/chosen": 0.460205078125, "rewards/margins": 9.4921875, "rewards/rejected": -9.015625, "step": 1915 }, { "epoch": 0.3620388303651566, "grad_norm": 3.231563969353842, "learning_rate": 8.24932552080829e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.3046875, "logps/chosen": -486.75, "logps/rejected": -603.0, "loss": 0.6969, "rewards/accuracies": 0.75, "rewards/chosen": 0.09375, "rewards/margins": 2.96875, "rewards/rejected": -2.8671875, "step": 1916 }, { "epoch": 0.3622277859133639, "grad_norm": 1.4451568227719396, "learning_rate": 8.246975177617494e-07, "logits/chosen": 1.560546875, "logits/rejected": 1.7265625, "logps/chosen": -781.0, "logps/rejected": -1028.0, "loss": 0.6826, "rewards/accuracies": 0.78125, "rewards/chosen": 0.50390625, "rewards/margins": 4.015625, "rewards/rejected": -3.51171875, "step": 1917 }, { "epoch": 0.36241674146157116, "grad_norm": 2.379104000037416, "learning_rate": 8.244623639356818e-07, "logits/chosen": 2.365234375, "logits/rejected": 1.822265625, "logps/chosen": -1498.0, "logps/rejected": -897.0, "loss": 0.6053, "rewards/accuracies": 0.84375, "rewards/chosen": -0.00146484375, "rewards/margins": 3.28125, "rewards/rejected": -3.2890625, "step": 1918 }, { "epoch": 0.36260569700977846, "grad_norm": 3.6489121507717344, "learning_rate": 8.242270907049295e-07, "logits/chosen": 1.8935546875, "logits/rejected": 2.21484375, "logps/chosen": -1028.0, "logps/rejected": -1047.5, "loss": 0.7426, "rewards/accuracies": 0.59375, "rewards/chosen": 0.635498046875, "rewards/margins": 3.87890625, "rewards/rejected": -3.24169921875, "step": 1919 }, { "epoch": 0.3627946525579857, "grad_norm": 2.315231963244568, "learning_rate": 8.239916981718477e-07, "logits/chosen": 2.396484375, "logits/rejected": 2.62890625, "logps/chosen": -651.5, "logps/rejected": -916.0, "loss": 0.6123, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9375, "rewards/margins": 3.833984375, "rewards/rejected": -2.9013671875, "step": 1920 }, { "epoch": 0.362983608106193, "grad_norm": 3.2124665887026684, "learning_rate": 8.237561864388442e-07, "logits/chosen": 2.115234375, "logits/rejected": 2.55078125, "logps/chosen": -782.0, "logps/rejected": -839.0, "loss": 0.6419, "rewards/accuracies": 0.84375, "rewards/chosen": 0.784912109375, "rewards/margins": 2.828125, "rewards/rejected": -2.044921875, "step": 1921 }, { "epoch": 0.3631725636544003, "grad_norm": 1.9151447500082865, "learning_rate": 8.235205556083781e-07, "logits/chosen": 2.06494140625, "logits/rejected": 1.9345703125, "logps/chosen": -878.0, "logps/rejected": -722.5, "loss": 0.565, "rewards/accuracies": 0.875, "rewards/chosen": 0.8916015625, "rewards/margins": 4.1171875, "rewards/rejected": -3.2265625, "step": 1922 }, { "epoch": 0.36336151920260756, "grad_norm": 1.679153560788573, "learning_rate": 8.232848057829604e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.7421875, "logps/chosen": -745.5, "logps/rejected": -891.0, "loss": 0.599, "rewards/accuracies": 0.8125, "rewards/chosen": 0.81298828125, "rewards/margins": 3.859375, "rewards/rejected": -3.04296875, "step": 1923 }, { "epoch": 0.36355047475081487, "grad_norm": 2.0549603008807384, "learning_rate": 8.230489370651539e-07, "logits/chosen": 2.0859375, "logits/rejected": 2.29296875, "logps/chosen": -446.75, "logps/rejected": -864.0, "loss": 0.706, "rewards/accuracies": 0.71875, "rewards/chosen": 0.60595703125, "rewards/margins": 2.25390625, "rewards/rejected": -1.64453125, "step": 1924 }, { "epoch": 0.36373943029902217, "grad_norm": 1.9818538254258233, "learning_rate": 8.228129495575731e-07, "logits/chosen": 2.197265625, "logits/rejected": 1.890625, "logps/chosen": -784.0, "logps/rejected": -744.0, "loss": 0.6147, "rewards/accuracies": 0.8125, "rewards/chosen": 0.978515625, "rewards/margins": 3.67578125, "rewards/rejected": -2.69140625, "step": 1925 }, { "epoch": 0.3639283858472294, "grad_norm": 1.8450909499525567, "learning_rate": 8.225768433628846e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.0703125, "logps/chosen": -1060.5, "logps/rejected": -921.5, "loss": 0.5109, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9423828125, "rewards/margins": 5.875, "rewards/rejected": -3.927734375, "step": 1926 }, { "epoch": 0.3641173413954367, "grad_norm": 2.3608650013407453, "learning_rate": 8.223406185838057e-07, "logits/chosen": 2.671875, "logits/rejected": 2.947265625, "logps/chosen": -894.5, "logps/rejected": -750.25, "loss": 0.7247, "rewards/accuracies": 0.71875, "rewards/chosen": 1.30615234375, "rewards/margins": 2.935546875, "rewards/rejected": -1.634521484375, "step": 1927 }, { "epoch": 0.364306296943644, "grad_norm": 1.7725181883869765, "learning_rate": 8.221042753231063e-07, "logits/chosen": 2.52734375, "logits/rejected": 2.5361328125, "logps/chosen": -947.0, "logps/rejected": -993.0, "loss": 0.5919, "rewards/accuracies": 0.8125, "rewards/chosen": 1.57421875, "rewards/margins": 4.19140625, "rewards/rejected": -2.619140625, "step": 1928 }, { "epoch": 0.3644952524918513, "grad_norm": 2.2380843326530293, "learning_rate": 8.218678136836071e-07, "logits/chosen": 2.078125, "logits/rejected": 2.353515625, "logps/chosen": -665.5, "logps/rejected": -1714.0, "loss": 0.6092, "rewards/accuracies": 0.84375, "rewards/chosen": 1.093994140625, "rewards/margins": 5.91015625, "rewards/rejected": -4.81640625, "step": 1929 }, { "epoch": 0.3646842080400586, "grad_norm": 1.841468719576069, "learning_rate": 8.216312337681806e-07, "logits/chosen": 2.06201171875, "logits/rejected": 1.6251220703125, "logps/chosen": -511.5, "logps/rejected": -585.5, "loss": 0.629, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7529296875, "rewards/margins": 3.484375, "rewards/rejected": -2.736328125, "step": 1930 }, { "epoch": 0.3648731635882659, "grad_norm": 3.0437308506474543, "learning_rate": 8.213945356797511e-07, "logits/chosen": 2.279296875, "logits/rejected": 2.005859375, "logps/chosen": -968.0, "logps/rejected": -759.0, "loss": 0.5919, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1748046875, "rewards/margins": 3.64453125, "rewards/rejected": -2.470703125, "step": 1931 }, { "epoch": 0.36506211913647313, "grad_norm": 1.9158847954445486, "learning_rate": 8.211577195212935e-07, "logits/chosen": 2.001953125, "logits/rejected": 2.111328125, "logps/chosen": -707.5, "logps/rejected": -624.5, "loss": 0.6587, "rewards/accuracies": 0.84375, "rewards/chosen": 1.181640625, "rewards/margins": 3.3720703125, "rewards/rejected": -2.1923828125, "step": 1932 }, { "epoch": 0.36525107468468043, "grad_norm": 2.224336552859959, "learning_rate": 8.209207853958349e-07, "logits/chosen": 3.12109375, "logits/rejected": 2.6796875, "logps/chosen": -560.5, "logps/rejected": -831.0, "loss": 0.7533, "rewards/accuracies": 0.5625, "rewards/chosen": 0.18310546875, "rewards/margins": 2.97265625, "rewards/rejected": -2.7933349609375, "step": 1933 }, { "epoch": 0.36544003023288774, "grad_norm": 2.4723391349341544, "learning_rate": 8.20683733406453e-07, "logits/chosen": 1.526611328125, "logits/rejected": 1.39697265625, "logps/chosen": -892.0, "logps/rejected": -1345.5, "loss": 0.6598, "rewards/accuracies": 0.78125, "rewards/chosen": 0.67822265625, "rewards/margins": 3.1640625, "rewards/rejected": -2.484375, "step": 1934 }, { "epoch": 0.365628985781095, "grad_norm": 1.7982330993026057, "learning_rate": 8.204465636562774e-07, "logits/chosen": 1.974609375, "logits/rejected": 1.783203125, "logps/chosen": -770.0, "logps/rejected": -767.5, "loss": 0.553, "rewards/accuracies": 0.8125, "rewards/chosen": 1.369140625, "rewards/margins": 4.140625, "rewards/rejected": -2.771484375, "step": 1935 }, { "epoch": 0.3658179413293023, "grad_norm": 1.6833281764316042, "learning_rate": 8.202092762484884e-07, "logits/chosen": 2.625, "logits/rejected": 3.1171875, "logps/chosen": -688.5, "logps/rejected": -771.5, "loss": 0.7365, "rewards/accuracies": 0.71875, "rewards/chosen": 0.839111328125, "rewards/margins": 2.6767578125, "rewards/rejected": -1.8408203125, "step": 1936 }, { "epoch": 0.3660068968775096, "grad_norm": 2.037884354424453, "learning_rate": 8.199718712863181e-07, "logits/chosen": 2.115478515625, "logits/rejected": 2.165771484375, "logps/chosen": -994.0, "logps/rejected": -1026.0, "loss": 0.6563, "rewards/accuracies": 0.78125, "rewards/chosen": 1.927734375, "rewards/margins": 4.357421875, "rewards/rejected": -2.43798828125, "step": 1937 }, { "epoch": 0.36619585242571684, "grad_norm": 3.0393558723675222, "learning_rate": 8.197343488730492e-07, "logits/chosen": 1.3720703125, "logits/rejected": 1.58154296875, "logps/chosen": -590.5, "logps/rejected": -545.0, "loss": 0.7207, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5294189453125, "rewards/margins": 2.11328125, "rewards/rejected": -1.593505859375, "step": 1938 }, { "epoch": 0.36638480797392414, "grad_norm": 1.815956359735672, "learning_rate": 8.194967091120156e-07, "logits/chosen": 2.6328125, "logits/rejected": 3.5859375, "logps/chosen": -665.0, "logps/rejected": -2015.0, "loss": 0.6022, "rewards/accuracies": 0.71875, "rewards/chosen": 0.66552734375, "rewards/margins": 8.34375, "rewards/rejected": -7.6796875, "step": 1939 }, { "epoch": 0.36657376352213145, "grad_norm": 2.2704726552286334, "learning_rate": 8.192589521066024e-07, "logits/chosen": 2.287109375, "logits/rejected": 1.5146484375, "logps/chosen": -1230.5, "logps/rejected": -1316.0, "loss": 0.4913, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8486328125, "rewards/margins": 5.09765625, "rewards/rejected": -4.2421875, "step": 1940 }, { "epoch": 0.3667627190703387, "grad_norm": 1.811000251180605, "learning_rate": 8.190210779602457e-07, "logits/chosen": 1.6240234375, "logits/rejected": 1.91015625, "logps/chosen": -595.0, "logps/rejected": -814.0, "loss": 0.6912, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2108154296875, "rewards/margins": 3.98046875, "rewards/rejected": -3.76953125, "step": 1941 }, { "epoch": 0.366951674618546, "grad_norm": 2.5795019445296483, "learning_rate": 8.187830867764323e-07, "logits/chosen": 2.109375, "logits/rejected": 1.5654296875, "logps/chosen": -671.5, "logps/rejected": -799.5, "loss": 0.6948, "rewards/accuracies": 0.75, "rewards/chosen": 0.8896484375, "rewards/margins": 2.962890625, "rewards/rejected": -2.069580078125, "step": 1942 }, { "epoch": 0.36714063016675325, "grad_norm": 1.980182119780197, "learning_rate": 8.185449786587002e-07, "logits/chosen": 2.115234375, "logits/rejected": 2.431640625, "logps/chosen": -671.0, "logps/rejected": -1813.0, "loss": 0.6287, "rewards/accuracies": 0.8125, "rewards/chosen": 0.630615234375, "rewards/margins": 4.68359375, "rewards/rejected": -4.05078125, "step": 1943 }, { "epoch": 0.36732958571496055, "grad_norm": 2.053481959590063, "learning_rate": 8.183067537106384e-07, "logits/chosen": 2.328125, "logits/rejected": 2.4453125, "logps/chosen": -533.75, "logps/rejected": -1570.0, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": 0.66796875, "rewards/margins": 8.50390625, "rewards/rejected": -7.8046875, "step": 1944 }, { "epoch": 0.36751854126316785, "grad_norm": 3.258520270703759, "learning_rate": 8.180684120358864e-07, "logits/chosen": 1.6962890625, "logits/rejected": 1.5009765625, "logps/chosen": -768.5, "logps/rejected": -641.5, "loss": 0.5751, "rewards/accuracies": 0.875, "rewards/chosen": 1.41015625, "rewards/margins": 4.66015625, "rewards/rejected": -3.25, "step": 1945 }, { "epoch": 0.3677074968113751, "grad_norm": 2.600834771815516, "learning_rate": 8.178299537381345e-07, "logits/chosen": 2.04296875, "logits/rejected": 2.208984375, "logps/chosen": -839.0, "logps/rejected": -947.0, "loss": 0.5346, "rewards/accuracies": 0.84375, "rewards/chosen": 0.62353515625, "rewards/margins": 4.6796875, "rewards/rejected": -4.0546875, "step": 1946 }, { "epoch": 0.3678964523595824, "grad_norm": 1.5408023168759502, "learning_rate": 8.17591378921124e-07, "logits/chosen": 1.701171875, "logits/rejected": 1.39892578125, "logps/chosen": -570.0, "logps/rejected": -829.5, "loss": 0.7192, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2249755859375, "rewards/margins": 5.4140625, "rewards/rejected": -5.1953125, "step": 1947 }, { "epoch": 0.3680854079077897, "grad_norm": 1.965879536413443, "learning_rate": 8.173526876886466e-07, "logits/chosen": 1.970703125, "logits/rejected": 1.54736328125, "logps/chosen": -759.0, "logps/rejected": -838.0, "loss": 0.6565, "rewards/accuracies": 0.71875, "rewards/chosen": 0.531005859375, "rewards/margins": 3.3203125, "rewards/rejected": -2.7890625, "step": 1948 }, { "epoch": 0.36827436345599696, "grad_norm": 1.8629992035821308, "learning_rate": 8.171138801445447e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.35693359375, "logps/chosen": -694.0, "logps/rejected": -795.5, "loss": 0.4517, "rewards/accuracies": 1.0, "rewards/chosen": 0.76220703125, "rewards/margins": 6.26171875, "rewards/rejected": -5.50390625, "step": 1949 }, { "epoch": 0.36846331900420426, "grad_norm": 2.4751180747507573, "learning_rate": 8.168749563927118e-07, "logits/chosen": 0.947265625, "logits/rejected": 0.945068359375, "logps/chosen": -509.75, "logps/rejected": -571.5, "loss": 0.6516, "rewards/accuracies": 0.71875, "rewards/chosen": 0.67431640625, "rewards/margins": 3.046875, "rewards/rejected": -2.373046875, "step": 1950 }, { "epoch": 0.36865227455241156, "grad_norm": 2.251732315001278, "learning_rate": 8.16635916537091e-07, "logits/chosen": 1.9853515625, "logits/rejected": 2.419921875, "logps/chosen": -753.0, "logps/rejected": -928.0, "loss": 0.668, "rewards/accuracies": 0.75, "rewards/chosen": 0.46435546875, "rewards/margins": 4.6171875, "rewards/rejected": -4.1484375, "step": 1951 }, { "epoch": 0.3688412301006188, "grad_norm": 1.9532155938383124, "learning_rate": 8.163967606816766e-07, "logits/chosen": 1.634765625, "logits/rejected": 1.109375, "logps/chosen": -729.5, "logps/rejected": -734.0, "loss": 0.6344, "rewards/accuracies": 0.75, "rewards/chosen": 0.56494140625, "rewards/margins": 5.08984375, "rewards/rejected": -4.53515625, "step": 1952 }, { "epoch": 0.3690301856488261, "grad_norm": 2.0487313338359456, "learning_rate": 8.161574889305134e-07, "logits/chosen": 1.68359375, "logits/rejected": 1.1025390625, "logps/chosen": -980.0, "logps/rejected": -1373.0, "loss": 0.5012, "rewards/accuracies": 0.90625, "rewards/chosen": 0.673828125, "rewards/margins": 6.671875, "rewards/rejected": -6.00390625, "step": 1953 }, { "epoch": 0.3692191411970334, "grad_norm": 2.795470392001835, "learning_rate": 8.159181013876962e-07, "logits/chosen": 1.43017578125, "logits/rejected": 0.93017578125, "logps/chosen": -479.5, "logps/rejected": -508.5, "loss": 0.6409, "rewards/accuracies": 0.875, "rewards/chosen": 0.687255859375, "rewards/margins": 3.390625, "rewards/rejected": -2.70703125, "step": 1954 }, { "epoch": 0.36940809674524067, "grad_norm": 1.9930489793035258, "learning_rate": 8.156785981573707e-07, "logits/chosen": 1.685546875, "logits/rejected": 1.5859375, "logps/chosen": -805.0, "logps/rejected": -921.0, "loss": 0.6968, "rewards/accuracies": 0.6875, "rewards/chosen": 0.59765625, "rewards/margins": 3.4609375, "rewards/rejected": -2.865234375, "step": 1955 }, { "epoch": 0.36959705229344797, "grad_norm": 1.5828575104616989, "learning_rate": 8.154389793437324e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.4453125, "logps/chosen": -790.0, "logps/rejected": -769.0, "loss": 0.5358, "rewards/accuracies": 0.875, "rewards/chosen": 1.18408203125, "rewards/margins": 4.26953125, "rewards/rejected": -3.078125, "step": 1956 }, { "epoch": 0.3697860078416553, "grad_norm": 1.7397550092240834, "learning_rate": 8.151992450510274e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.75, "logps/chosen": -486.5, "logps/rejected": -633.0, "loss": 0.6556, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0732421875, "rewards/margins": 4.26171875, "rewards/rejected": -4.1875, "step": 1957 }, { "epoch": 0.3699749633898625, "grad_norm": 2.306859587035619, "learning_rate": 8.149593953835519e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.28515625, "logps/chosen": -720.0, "logps/rejected": -1188.0, "loss": 0.5671, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2265625, "rewards/margins": 5.08984375, "rewards/rejected": -3.853515625, "step": 1958 }, { "epoch": 0.3701639189380698, "grad_norm": 2.35619453206304, "learning_rate": 8.147194304456523e-07, "logits/chosen": 2.0078125, "logits/rejected": 2.45703125, "logps/chosen": -865.0, "logps/rejected": -1614.0, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": 0.57568359375, "rewards/margins": 6.82421875, "rewards/rejected": -6.24609375, "step": 1959 }, { "epoch": 0.37035287448627713, "grad_norm": 2.3025642658327405, "learning_rate": 8.144793503417252e-07, "logits/chosen": 3.037109375, "logits/rejected": 2.359375, "logps/chosen": -335.0, "logps/rejected": -438.0, "loss": 0.7502, "rewards/accuracies": 0.875, "rewards/chosen": -0.7188720703125, "rewards/margins": 3.3720703125, "rewards/rejected": -4.08984375, "step": 1960 }, { "epoch": 0.3705418300344844, "grad_norm": 1.9334548931891058, "learning_rate": 8.142391551762177e-07, "logits/chosen": 1.8203125, "logits/rejected": 1.70849609375, "logps/chosen": -1160.0, "logps/rejected": -1204.0, "loss": 0.5675, "rewards/accuracies": 0.8125, "rewards/chosen": 0.969970703125, "rewards/margins": 4.91015625, "rewards/rejected": -3.93359375, "step": 1961 }, { "epoch": 0.3707307855826917, "grad_norm": 1.9633366149606237, "learning_rate": 8.13998845053626e-07, "logits/chosen": 1.9150390625, "logits/rejected": 2.025390625, "logps/chosen": -666.0, "logps/rejected": -829.0, "loss": 0.567, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9267578125, "rewards/margins": 4.42578125, "rewards/rejected": -3.4921875, "step": 1962 }, { "epoch": 0.370919741130899, "grad_norm": 1.7002884781531802, "learning_rate": 8.137584200784971e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.376953125, "logps/chosen": -920.0, "logps/rejected": -757.0, "loss": 0.6123, "rewards/accuracies": 0.875, "rewards/chosen": 1.35089111328125, "rewards/margins": 3.318359375, "rewards/rejected": -1.96875, "step": 1963 }, { "epoch": 0.37110869667910623, "grad_norm": 2.069353087684281, "learning_rate": 8.135178803554279e-07, "logits/chosen": 2.6953125, "logits/rejected": 2.6953125, "logps/chosen": -925.0, "logps/rejected": -979.5, "loss": 0.6623, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1630859375, "rewards/margins": 3.763671875, "rewards/rejected": -2.5958251953125, "step": 1964 }, { "epoch": 0.37129765222731353, "grad_norm": 3.3685280313092236, "learning_rate": 8.132772259890648e-07, "logits/chosen": 2.412109375, "logits/rejected": 2.275390625, "logps/chosen": -851.0, "logps/rejected": -926.0, "loss": 0.5733, "rewards/accuracies": 0.8125, "rewards/chosen": 1.39453125, "rewards/margins": 3.140625, "rewards/rejected": -1.748046875, "step": 1965 }, { "epoch": 0.3714866077755208, "grad_norm": 2.218697527546462, "learning_rate": 8.130364570841046e-07, "logits/chosen": 1.9521484375, "logits/rejected": 1.677734375, "logps/chosen": -737.0, "logps/rejected": -896.0, "loss": 0.506, "rewards/accuracies": 0.875, "rewards/chosen": 1.501953125, "rewards/margins": 5.2265625, "rewards/rejected": -3.73046875, "step": 1966 }, { "epoch": 0.3716755633237281, "grad_norm": 1.9306690351528446, "learning_rate": 8.127955737452934e-07, "logits/chosen": 1.77734375, "logits/rejected": 2.0859375, "logps/chosen": -850.5, "logps/rejected": -864.5, "loss": 0.5977, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3076171875, "rewards/margins": 3.939453125, "rewards/rejected": -2.6376953125, "step": 1967 }, { "epoch": 0.3718645188719354, "grad_norm": 1.9105268690963444, "learning_rate": 8.125545760774275e-07, "logits/chosen": 2.927734375, "logits/rejected": 2.814453125, "logps/chosen": -1225.0, "logps/rejected": -1044.0, "loss": 0.7155, "rewards/accuracies": 0.75, "rewards/chosen": 0.65673828125, "rewards/margins": 3.595703125, "rewards/rejected": -2.92578125, "step": 1968 }, { "epoch": 0.37205347442014264, "grad_norm": 2.364196062681139, "learning_rate": 8.123134641853527e-07, "logits/chosen": 1.44677734375, "logits/rejected": 1.37744140625, "logps/chosen": -754.0, "logps/rejected": -1004.5, "loss": 0.7138, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0263671875, "rewards/margins": 3.50390625, "rewards/rejected": -3.484375, "step": 1969 }, { "epoch": 0.37224242996834994, "grad_norm": 2.043410877928981, "learning_rate": 8.120722381739648e-07, "logits/chosen": 2.755859375, "logits/rejected": 2.58203125, "logps/chosen": -553.0, "logps/rejected": -599.0, "loss": 0.6449, "rewards/accuracies": 0.71875, "rewards/chosen": 0.533203125, "rewards/margins": 2.728515625, "rewards/rejected": -2.1953125, "step": 1970 }, { "epoch": 0.37243138551655725, "grad_norm": 2.149939368793082, "learning_rate": 8.118308981482087e-07, "logits/chosen": 2.3134765625, "logits/rejected": 3.072265625, "logps/chosen": -922.0, "logps/rejected": -711.5, "loss": 0.5865, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9736328125, "rewards/margins": 4.58984375, "rewards/rejected": -3.62109375, "step": 1971 }, { "epoch": 0.3726203410647645, "grad_norm": 2.244198914800986, "learning_rate": 8.115894442130796e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.181640625, "logps/chosen": -1350.0, "logps/rejected": -1130.0, "loss": 0.4369, "rewards/accuracies": 0.875, "rewards/chosen": 1.31884765625, "rewards/margins": 5.62890625, "rewards/rejected": -4.3046875, "step": 1972 }, { "epoch": 0.3728092966129718, "grad_norm": 2.4049534966078046, "learning_rate": 8.113478764736215e-07, "logits/chosen": 3.3515625, "logits/rejected": 3.34765625, "logps/chosen": -757.5, "logps/rejected": -809.5, "loss": 0.6382, "rewards/accuracies": 0.78125, "rewards/chosen": 0.782958984375, "rewards/margins": 3.06640625, "rewards/rejected": -2.28515625, "step": 1973 }, { "epoch": 0.3729982521611791, "grad_norm": 2.4462635710228766, "learning_rate": 8.111061950349288e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.251953125, "logps/chosen": -463.0, "logps/rejected": -613.0, "loss": 0.6826, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1220703125, "rewards/margins": 2.728515625, "rewards/rejected": -2.603515625, "step": 1974 }, { "epoch": 0.37318720770938635, "grad_norm": 1.5105178126533234, "learning_rate": 8.10864400002144e-07, "logits/chosen": 2.4609375, "logits/rejected": 1.81884765625, "logps/chosen": -635.5, "logps/rejected": -592.5, "loss": 0.5565, "rewards/accuracies": 0.8125, "rewards/chosen": 1.203125, "rewards/margins": 3.95703125, "rewards/rejected": -2.755859375, "step": 1975 }, { "epoch": 0.37337616325759365, "grad_norm": 1.7172160965377616, "learning_rate": 8.106224914804608e-07, "logits/chosen": 1.86328125, "logits/rejected": 1.80078125, "logps/chosen": -776.0, "logps/rejected": -866.0, "loss": 0.5483, "rewards/accuracies": 0.875, "rewards/chosen": 0.67578125, "rewards/margins": 4.25390625, "rewards/rejected": -3.58203125, "step": 1976 }, { "epoch": 0.37356511880580096, "grad_norm": 1.9380539616620784, "learning_rate": 8.10380469575121e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.90234375, "logps/chosen": -636.5, "logps/rejected": -1468.0, "loss": 0.599, "rewards/accuracies": 0.75, "rewards/chosen": -0.26953125, "rewards/margins": 4.59375, "rewards/rejected": -4.8671875, "step": 1977 }, { "epoch": 0.3737540743540082, "grad_norm": 2.1102882494266573, "learning_rate": 8.10138334391416e-07, "logits/chosen": 2.2294921875, "logits/rejected": 2.07177734375, "logps/chosen": -907.0, "logps/rejected": -739.0, "loss": 0.4981, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9833984375, "rewards/margins": 5.2421875, "rewards/rejected": -4.2578125, "step": 1978 }, { "epoch": 0.3739430299022155, "grad_norm": 1.730486328650198, "learning_rate": 8.098960860346867e-07, "logits/chosen": 2.3984375, "logits/rejected": 2.1171875, "logps/chosen": -995.0, "logps/rejected": -830.5, "loss": 0.6447, "rewards/accuracies": 0.84375, "rewards/chosen": 0.73388671875, "rewards/margins": 4.3671875, "rewards/rejected": -3.6328125, "step": 1979 }, { "epoch": 0.3741319854504228, "grad_norm": 3.185361748027941, "learning_rate": 8.09653724610323e-07, "logits/chosen": 2.484375, "logits/rejected": 2.068359375, "logps/chosen": -797.5, "logps/rejected": -654.5, "loss": 0.5831, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3955078125, "rewards/margins": 4.671875, "rewards/rejected": -4.26953125, "step": 1980 }, { "epoch": 0.37432094099863006, "grad_norm": 1.8227366638541735, "learning_rate": 8.094112502237641e-07, "logits/chosen": 1.5908203125, "logits/rejected": 1.6201171875, "logps/chosen": -776.0, "logps/rejected": -1250.0, "loss": 0.4896, "rewards/accuracies": 0.96875, "rewards/chosen": 1.076171875, "rewards/margins": 5.82421875, "rewards/rejected": -4.736328125, "step": 1981 }, { "epoch": 0.37450989654683736, "grad_norm": 2.38529651816135, "learning_rate": 8.091686629804985e-07, "logits/chosen": 2.259765625, "logits/rejected": 2.798828125, "logps/chosen": -727.5, "logps/rejected": -881.0, "loss": 0.5914, "rewards/accuracies": 0.78125, "rewards/chosen": -0.083984375, "rewards/margins": 4.2109375, "rewards/rejected": -4.296875, "step": 1982 }, { "epoch": 0.37469885209504467, "grad_norm": 2.873393627655334, "learning_rate": 8.089259629860633e-07, "logits/chosen": 1.0733642578125, "logits/rejected": 0.820556640625, "logps/chosen": -669.5, "logps/rejected": -1488.0, "loss": 0.5647, "rewards/accuracies": 0.90625, "rewards/chosen": 0.359130859375, "rewards/margins": 4.84375, "rewards/rejected": -4.48828125, "step": 1983 }, { "epoch": 0.3748878076432519, "grad_norm": 2.1189660816104943, "learning_rate": 8.086831503460452e-07, "logits/chosen": 2.59375, "logits/rejected": 2.4130859375, "logps/chosen": -721.0, "logps/rejected": -776.0, "loss": 0.5477, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7353515625, "rewards/margins": 3.90234375, "rewards/rejected": -3.162109375, "step": 1984 }, { "epoch": 0.3750767631914592, "grad_norm": 1.6644577695192264, "learning_rate": 8.084402251660798e-07, "logits/chosen": 2.880859375, "logits/rejected": 2.68359375, "logps/chosen": -727.0, "logps/rejected": -1211.0, "loss": 0.4675, "rewards/accuracies": 0.875, "rewards/chosen": 1.3759765625, "rewards/margins": 6.25, "rewards/rejected": -4.87109375, "step": 1985 }, { "epoch": 0.3752657187396665, "grad_norm": 1.6873340895107547, "learning_rate": 8.081971875518514e-07, "logits/chosen": 1.9296875, "logits/rejected": 1.87890625, "logps/chosen": -1079.0, "logps/rejected": -1644.0, "loss": 0.4965, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8359375, "rewards/margins": 5.90625, "rewards/rejected": -4.072265625, "step": 1986 }, { "epoch": 0.37545467428787377, "grad_norm": 2.0737908785674812, "learning_rate": 8.079540376090933e-07, "logits/chosen": 2.2265625, "logits/rejected": 1.837890625, "logps/chosen": -695.0, "logps/rejected": -864.5, "loss": 0.5774, "rewards/accuracies": 0.875, "rewards/chosen": 0.8916015625, "rewards/margins": 3.82421875, "rewards/rejected": -2.935546875, "step": 1987 }, { "epoch": 0.37564362983608107, "grad_norm": 2.1066677003150787, "learning_rate": 8.07710775443588e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.671875, "logps/chosen": -1191.0, "logps/rejected": -1099.0, "loss": 0.6105, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5078125, "rewards/margins": 3.67578125, "rewards/rejected": -3.16796875, "step": 1988 }, { "epoch": 0.3758325853842883, "grad_norm": 1.72561530498165, "learning_rate": 8.07467401161166e-07, "logits/chosen": 2.736328125, "logits/rejected": 3.099609375, "logps/chosen": -686.5, "logps/rejected": -792.0, "loss": 0.5908, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5732421875, "rewards/margins": 3.82421875, "rewards/rejected": -3.25, "step": 1989 }, { "epoch": 0.3760215409324956, "grad_norm": 1.7903103850162219, "learning_rate": 8.07223914867708e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.177734375, "logps/chosen": -1031.5, "logps/rejected": -1177.0, "loss": 0.5794, "rewards/accuracies": 0.75, "rewards/chosen": 1.07421875, "rewards/margins": 4.2734375, "rewards/rejected": -3.19921875, "step": 1990 }, { "epoch": 0.3762104964807029, "grad_norm": 1.648074883502981, "learning_rate": 8.069803166691418e-07, "logits/chosen": 1.9853515625, "logits/rejected": 1.12860107421875, "logps/chosen": -625.5, "logps/rejected": -584.0, "loss": 0.5714, "rewards/accuracies": 0.78125, "rewards/chosen": 0.547607421875, "rewards/margins": 4.15625, "rewards/rejected": -3.609375, "step": 1991 }, { "epoch": 0.3763994520289102, "grad_norm": 2.1850141215929337, "learning_rate": 8.06736606671445e-07, "logits/chosen": 1.5087890625, "logits/rejected": 1.77734375, "logps/chosen": -868.0, "logps/rejected": -717.0, "loss": 0.7745, "rewards/accuracies": 0.65625, "rewards/chosen": 0.162109375, "rewards/margins": 2.505859375, "rewards/rejected": -2.34814453125, "step": 1992 }, { "epoch": 0.3765884075771175, "grad_norm": 1.8638946096117501, "learning_rate": 8.064927849806433e-07, "logits/chosen": 2.275390625, "logits/rejected": 1.8583984375, "logps/chosen": -946.5, "logps/rejected": -841.5, "loss": 0.5758, "rewards/accuracies": 0.78125, "rewards/chosen": 0.84820556640625, "rewards/margins": 4.4609375, "rewards/rejected": -3.60546875, "step": 1993 }, { "epoch": 0.3767773631253248, "grad_norm": 1.7706236480223116, "learning_rate": 8.062488517028116e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.763671875, "logps/chosen": -878.0, "logps/rejected": -1515.5, "loss": 0.4803, "rewards/accuracies": 0.9375, "rewards/chosen": 1.099609375, "rewards/margins": 6.9765625, "rewards/rejected": -5.8828125, "step": 1994 }, { "epoch": 0.37696631867353203, "grad_norm": 4.125220320975153, "learning_rate": 8.060048069440723e-07, "logits/chosen": 2.0166015625, "logits/rejected": 1.4166259765625, "logps/chosen": -920.5, "logps/rejected": -857.0, "loss": 0.6275, "rewards/accuracies": 0.8125, "rewards/chosen": 0.829833984375, "rewards/margins": 4.099609375, "rewards/rejected": -3.2734375, "step": 1995 }, { "epoch": 0.37715527422173933, "grad_norm": 2.463748408935365, "learning_rate": 8.057606508105974e-07, "logits/chosen": 2.14453125, "logits/rejected": 2.36328125, "logps/chosen": -1872.0, "logps/rejected": -1430.0, "loss": 0.626, "rewards/accuracies": 0.75, "rewards/chosen": 0.31396484375, "rewards/margins": 4.265625, "rewards/rejected": -3.96875, "step": 1996 }, { "epoch": 0.37734422976994664, "grad_norm": 2.061334800547598, "learning_rate": 8.055163834086068e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.67578125, "logps/chosen": -566.0, "logps/rejected": -602.5, "loss": 0.6293, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5003662109375, "rewards/margins": 3.14453125, "rewards/rejected": -2.638671875, "step": 1997 }, { "epoch": 0.3775331853181539, "grad_norm": 3.2064595182255715, "learning_rate": 8.052720048443685e-07, "logits/chosen": 2.501953125, "logits/rejected": 2.72265625, "logps/chosen": -865.0, "logps/rejected": -938.0, "loss": 0.5905, "rewards/accuracies": 0.8125, "rewards/chosen": 0.80712890625, "rewards/margins": 5.3359375, "rewards/rejected": -4.52734375, "step": 1998 }, { "epoch": 0.3777221408663612, "grad_norm": 1.952641560284647, "learning_rate": 8.050275152241998e-07, "logits/chosen": 2.26953125, "logits/rejected": 2.3828125, "logps/chosen": -971.5, "logps/rejected": -1205.0, "loss": 0.5932, "rewards/accuracies": 0.84375, "rewards/chosen": 0.832366943359375, "rewards/margins": 5.1875, "rewards/rejected": -4.3515625, "step": 1999 }, { "epoch": 0.3779110964145685, "grad_norm": 2.2051895690469565, "learning_rate": 8.047829146544654e-07, "logits/chosen": 1.8505859375, "logits/rejected": 2.07763671875, "logps/chosen": -1156.0, "logps/rejected": -1001.0, "loss": 0.6435, "rewards/accuracies": 0.78125, "rewards/chosen": 1.37060546875, "rewards/margins": 4.36328125, "rewards/rejected": -2.9921875, "step": 2000 }, { "epoch": 0.37810005196277574, "grad_norm": 2.294609630863023, "learning_rate": 8.045382032415788e-07, "logits/chosen": 2.65625, "logits/rejected": 1.90234375, "logps/chosen": -532.5, "logps/rejected": -656.0, "loss": 0.5916, "rewards/accuracies": 0.84375, "rewards/chosen": 0.66168212890625, "rewards/margins": 4.10546875, "rewards/rejected": -3.44921875, "step": 2001 }, { "epoch": 0.37828900751098304, "grad_norm": 2.317066869039492, "learning_rate": 8.042933810920016e-07, "logits/chosen": 1.4765625, "logits/rejected": 1.77734375, "logps/chosen": -897.5, "logps/rejected": -988.5, "loss": 0.6301, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8201904296875, "rewards/margins": 4.1796875, "rewards/rejected": -3.35546875, "step": 2002 }, { "epoch": 0.37847796305919035, "grad_norm": 1.9352963589468675, "learning_rate": 8.040484483122434e-07, "logits/chosen": 1.62744140625, "logits/rejected": 1.98046875, "logps/chosen": -618.5, "logps/rejected": -697.0, "loss": 0.6174, "rewards/accuracies": 0.84375, "rewards/chosen": -0.087890625, "rewards/margins": 3.5078125, "rewards/rejected": -3.59765625, "step": 2003 }, { "epoch": 0.3786669186073976, "grad_norm": 2.0790373237096436, "learning_rate": 8.038034050088619e-07, "logits/chosen": 1.6552734375, "logits/rejected": 1.7568359375, "logps/chosen": -833.0, "logps/rejected": -1732.0, "loss": 0.6402, "rewards/accuracies": 0.75, "rewards/chosen": 0.279052734375, "rewards/margins": 6.21484375, "rewards/rejected": -5.947265625, "step": 2004 }, { "epoch": 0.3788558741556049, "grad_norm": 2.6747931808805263, "learning_rate": 8.035582512884632e-07, "logits/chosen": 1.2353515625, "logits/rejected": 1.7216796875, "logps/chosen": -774.0, "logps/rejected": -1394.0, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": 0.028564453125, "rewards/margins": 4.51171875, "rewards/rejected": -4.484375, "step": 2005 }, { "epoch": 0.3790448297038122, "grad_norm": 1.9213377391422297, "learning_rate": 8.033129872577014e-07, "logits/chosen": 2.0703125, "logits/rejected": 2.330078125, "logps/chosen": -561.5, "logps/rejected": -476.0, "loss": 0.7652, "rewards/accuracies": 0.625, "rewards/chosen": -1.400634765625, "rewards/margins": 1.400390625, "rewards/rejected": -2.80078125, "step": 2006 }, { "epoch": 0.37923378525201945, "grad_norm": 1.8097947332527178, "learning_rate": 8.030676130232783e-07, "logits/chosen": 1.2529296875, "logits/rejected": 0.86328125, "logps/chosen": -595.75, "logps/rejected": -538.0, "loss": 0.5584, "rewards/accuracies": 0.875, "rewards/chosen": 0.38525390625, "rewards/margins": 3.8125, "rewards/rejected": -3.4296875, "step": 2007 }, { "epoch": 0.37942274080022675, "grad_norm": 1.7952518795824925, "learning_rate": 8.02822128691944e-07, "logits/chosen": 1.458984375, "logits/rejected": 1.1630859375, "logps/chosen": -548.5, "logps/rejected": -676.5, "loss": 0.6467, "rewards/accuracies": 0.75, "rewards/chosen": -0.2305908203125, "rewards/margins": 4.16015625, "rewards/rejected": -4.38671875, "step": 2008 }, { "epoch": 0.37961169634843406, "grad_norm": 2.392595428538875, "learning_rate": 8.025765343704962e-07, "logits/chosen": 2.189453125, "logits/rejected": 2.1787109375, "logps/chosen": -898.0, "logps/rejected": -1276.0, "loss": 0.5734, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7080078125, "rewards/margins": 4.921875, "rewards/rejected": -4.21484375, "step": 2009 }, { "epoch": 0.3798006518966413, "grad_norm": 1.5359983814012144, "learning_rate": 8.023308301657805e-07, "logits/chosen": 2.165283203125, "logits/rejected": 2.3193359375, "logps/chosen": -519.5, "logps/rejected": -952.0, "loss": 0.6724, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2579345703125, "rewards/margins": 3.2578125, "rewards/rejected": -3.51953125, "step": 2010 }, { "epoch": 0.3799896074448486, "grad_norm": 2.006124532908212, "learning_rate": 8.020850161846904e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.2265625, "logps/chosen": -668.0, "logps/rejected": -2133.75, "loss": 0.5891, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4814453125, "rewards/margins": 6.0625, "rewards/rejected": -5.58984375, "step": 2011 }, { "epoch": 0.38017856299305586, "grad_norm": 1.9484236287126886, "learning_rate": 8.01839092534167e-07, "logits/chosen": 2.435546875, "logits/rejected": 2.9609375, "logps/chosen": -692.0, "logps/rejected": -2450.5, "loss": 0.6904, "rewards/accuracies": 0.75, "rewards/chosen": 0.210693359375, "rewards/margins": 6.232666015625, "rewards/rejected": -6.025390625, "step": 2012 }, { "epoch": 0.38036751854126316, "grad_norm": 1.8857301281202394, "learning_rate": 8.015930593211994e-07, "logits/chosen": 2.70703125, "logits/rejected": 2.35546875, "logps/chosen": -809.0, "logps/rejected": -2023.0, "loss": 0.5385, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3330078125, "rewards/margins": 5.703125, "rewards/rejected": -5.375, "step": 2013 }, { "epoch": 0.38055647408947046, "grad_norm": 2.4446545058985922, "learning_rate": 8.013469166528244e-07, "logits/chosen": 3.0029296875, "logits/rejected": 3.00244140625, "logps/chosen": -940.0, "logps/rejected": -979.5, "loss": 0.6139, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3828125, "rewards/margins": 4.15625, "rewards/rejected": -2.771484375, "step": 2014 }, { "epoch": 0.3807454296376777, "grad_norm": 1.8295165535251594, "learning_rate": 8.011006646361256e-07, "logits/chosen": 2.0166015625, "logits/rejected": 2.111328125, "logps/chosen": -17960.0, "logps/rejected": -886.0, "loss": 0.6524, "rewards/accuracies": 0.75, "rewards/chosen": -12.68310546875, "rewards/margins": -9.51171875, "rewards/rejected": -3.1953125, "step": 2015 }, { "epoch": 0.380934385185885, "grad_norm": 2.0281610556181513, "learning_rate": 8.008543033782354e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.859375, "logps/chosen": -593.0, "logps/rejected": -523.5, "loss": 0.8068, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3671875, "rewards/margins": 2.078125, "rewards/rejected": -2.44140625, "step": 2016 }, { "epoch": 0.3811233407340923, "grad_norm": 2.180672284673268, "learning_rate": 8.00607832986333e-07, "logits/chosen": 1.576171875, "logits/rejected": 1.640625, "logps/chosen": -525.5, "logps/rejected": -514.0, "loss": 0.7283, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4521484375, "rewards/margins": 2.5498046875, "rewards/rejected": -2.103515625, "step": 2017 }, { "epoch": 0.38131229628229957, "grad_norm": 1.7571260709849712, "learning_rate": 8.003612535676449e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.900390625, "logps/chosen": -724.5, "logps/rejected": -1200.0, "loss": 0.6194, "rewards/accuracies": 0.875, "rewards/chosen": 0.98114013671875, "rewards/margins": 4.80078125, "rewards/rejected": -3.82421875, "step": 2018 }, { "epoch": 0.38150125183050687, "grad_norm": 2.6778505837800526, "learning_rate": 8.001145652294456e-07, "logits/chosen": 3.28125, "logits/rejected": 3.140625, "logps/chosen": -907.0, "logps/rejected": -1076.0, "loss": 0.5017, "rewards/accuracies": 0.875, "rewards/chosen": 0.89404296875, "rewards/margins": 5.443359375, "rewards/rejected": -4.55078125, "step": 2019 }, { "epoch": 0.3816902073787142, "grad_norm": 1.8027889117138258, "learning_rate": 7.998677680790566e-07, "logits/chosen": 3.17578125, "logits/rejected": 2.919921875, "logps/chosen": -513.5, "logps/rejected": -496.0, "loss": 0.7317, "rewards/accuracies": 0.78125, "rewards/chosen": 0.212890625, "rewards/margins": 2.642578125, "rewards/rejected": -2.431640625, "step": 2020 }, { "epoch": 0.3818791629269214, "grad_norm": 2.048033882545628, "learning_rate": 7.99620862223847e-07, "logits/chosen": 3.169921875, "logits/rejected": 2.7734375, "logps/chosen": -961.0, "logps/rejected": -707.0, "loss": 0.6704, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1513671875, "rewards/margins": 2.8212890625, "rewards/rejected": -1.67333984375, "step": 2021 }, { "epoch": 0.3820681184751287, "grad_norm": 1.6168232790347161, "learning_rate": 7.993738477712327e-07, "logits/chosen": 2.74609375, "logits/rejected": 2.763671875, "logps/chosen": -796.0, "logps/rejected": -803.5, "loss": 0.6691, "rewards/accuracies": 0.78125, "rewards/chosen": 1.109375, "rewards/margins": 3.42578125, "rewards/rejected": -2.31640625, "step": 2022 }, { "epoch": 0.38225707402333603, "grad_norm": 1.5062491598508196, "learning_rate": 7.991267248286776e-07, "logits/chosen": 3.015625, "logits/rejected": 3.15625, "logps/chosen": -854.0, "logps/rejected": -2594.0, "loss": 0.5976, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9462890625, "rewards/margins": 6.0546875, "rewards/rejected": -5.1015625, "step": 2023 }, { "epoch": 0.3824460295715433, "grad_norm": 2.234971047273963, "learning_rate": 7.98879493503692e-07, "logits/chosen": 2.25, "logits/rejected": 2.33203125, "logps/chosen": -807.5, "logps/rejected": -1355.0, "loss": 0.5625, "rewards/accuracies": 0.84375, "rewards/chosen": 0.11871337890625, "rewards/margins": 5.0859375, "rewards/rejected": -4.96875, "step": 2024 }, { "epoch": 0.3826349851197506, "grad_norm": 3.7007547703950006, "learning_rate": 7.98632153903834e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.30078125, "logps/chosen": -1046.0, "logps/rejected": -865.5, "loss": 0.6346, "rewards/accuracies": 0.75, "rewards/chosen": 1.235595703125, "rewards/margins": 4.126953125, "rewards/rejected": -2.892578125, "step": 2025 }, { "epoch": 0.3828239406679579, "grad_norm": 2.781702908192, "learning_rate": 7.983847061367085e-07, "logits/chosen": 2.533203125, "logits/rejected": 2.41015625, "logps/chosen": -651.0, "logps/rejected": -726.0, "loss": 0.687, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5400390625, "rewards/margins": 3.3359375, "rewards/rejected": -2.794921875, "step": 2026 }, { "epoch": 0.38301289621616513, "grad_norm": 1.9904795138499498, "learning_rate": 7.981371503099674e-07, "logits/chosen": 3.0546875, "logits/rejected": 3.33203125, "logps/chosen": -1500.0, "logps/rejected": -1539.0, "loss": 0.4768, "rewards/accuracies": 0.90625, "rewards/chosen": 1.947265625, "rewards/margins": 6.26953125, "rewards/rejected": -4.3203125, "step": 2027 }, { "epoch": 0.38320185176437244, "grad_norm": 3.5878331204367977, "learning_rate": 7.978894865313099e-07, "logits/chosen": 2.138671875, "logits/rejected": 1.8271484375, "logps/chosen": -1049.0, "logps/rejected": -800.0, "loss": 0.5116, "rewards/accuracies": 0.875, "rewards/chosen": 0.8134765625, "rewards/margins": 4.421875, "rewards/rejected": -3.6015625, "step": 2028 }, { "epoch": 0.38339080731257974, "grad_norm": 4.683242484868848, "learning_rate": 7.976417149084819e-07, "logits/chosen": 2.8046875, "logits/rejected": 2.568359375, "logps/chosen": -683.5, "logps/rejected": -841.5, "loss": 0.6094, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40478515625, "rewards/margins": 4.203125, "rewards/rejected": -3.794921875, "step": 2029 }, { "epoch": 0.383579762860787, "grad_norm": 2.317667577323626, "learning_rate": 7.97393835549276e-07, "logits/chosen": 2.9287109375, "logits/rejected": 2.857421875, "logps/chosen": -888.0, "logps/rejected": -864.0, "loss": 0.6752, "rewards/accuracies": 0.71875, "rewards/chosen": 0.681640625, "rewards/margins": 3.287109375, "rewards/rejected": -2.61328125, "step": 2030 }, { "epoch": 0.3837687184089943, "grad_norm": 2.6773685207654054, "learning_rate": 7.971458485615325e-07, "logits/chosen": 2.48046875, "logits/rejected": 1.9013671875, "logps/chosen": -1058.5, "logps/rejected": -925.0, "loss": 0.5128, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4619140625, "rewards/margins": 5.5078125, "rewards/rejected": -4.03515625, "step": 2031 }, { "epoch": 0.3839576739572016, "grad_norm": 2.1131806619885327, "learning_rate": 7.968977540531374e-07, "logits/chosen": 2.25, "logits/rejected": 2.45947265625, "logps/chosen": -583.0, "logps/rejected": -843.0, "loss": 0.6282, "rewards/accuracies": 0.78125, "rewards/chosen": 0.44830322265625, "rewards/margins": 3.189453125, "rewards/rejected": -2.735107421875, "step": 2032 }, { "epoch": 0.38414662950540884, "grad_norm": 1.9430061540023218, "learning_rate": 7.966495521320246e-07, "logits/chosen": 3.4609375, "logits/rejected": 3.43359375, "logps/chosen": -869.0, "logps/rejected": -1143.0, "loss": 0.6594, "rewards/accuracies": 0.75, "rewards/chosen": 0.3671875, "rewards/margins": 4.8515625, "rewards/rejected": -4.484375, "step": 2033 }, { "epoch": 0.38433558505361615, "grad_norm": 2.369888421741702, "learning_rate": 7.964012429061739e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.783203125, "logps/chosen": -536.5, "logps/rejected": -627.0, "loss": 0.7336, "rewards/accuracies": 0.65625, "rewards/chosen": -0.530517578125, "rewards/margins": 2.88671875, "rewards/rejected": -3.4140625, "step": 2034 }, { "epoch": 0.3845245406018234, "grad_norm": 2.0808369579918917, "learning_rate": 7.961528264836122e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.984375, "logps/chosen": -517.5, "logps/rejected": -1425.5, "loss": 0.6972, "rewards/accuracies": 0.75, "rewards/chosen": -0.184814453125, "rewards/margins": 3.69140625, "rewards/rejected": -3.875, "step": 2035 }, { "epoch": 0.3847134961500307, "grad_norm": 2.6442678471505316, "learning_rate": 7.959043029724127e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.462890625, "logps/chosen": -1305.0, "logps/rejected": -1414.0, "loss": 0.5504, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3310546875, "rewards/margins": 5.5703125, "rewards/rejected": -4.2421875, "step": 2036 }, { "epoch": 0.384902451698238, "grad_norm": 3.4996018756328136, "learning_rate": 7.956556724806957e-07, "logits/chosen": 3.234375, "logits/rejected": 3.17578125, "logps/chosen": -485.5, "logps/rejected": -577.0, "loss": 0.6415, "rewards/accuracies": 0.75, "rewards/chosen": 0.37890625, "rewards/margins": 3.923828125, "rewards/rejected": -3.5390625, "step": 2037 }, { "epoch": 0.38509140724644525, "grad_norm": 1.6062258659756, "learning_rate": 7.954069351166278e-07, "logits/chosen": 3.267578125, "logits/rejected": 2.7841796875, "logps/chosen": -675.5, "logps/rejected": -717.0, "loss": 0.5362, "rewards/accuracies": 0.875, "rewards/chosen": 1.138671875, "rewards/margins": 4.34375, "rewards/rejected": -3.20703125, "step": 2038 }, { "epoch": 0.38528036279465255, "grad_norm": 1.8577543945169823, "learning_rate": 7.951580909884215e-07, "logits/chosen": 2.390625, "logits/rejected": 2.54296875, "logps/chosen": -898.0, "logps/rejected": -773.5, "loss": 0.5693, "rewards/accuracies": 0.875, "rewards/chosen": 1.328125, "rewards/margins": 4.18798828125, "rewards/rejected": -2.85791015625, "step": 2039 }, { "epoch": 0.38546931834285986, "grad_norm": 1.953990842634981, "learning_rate": 7.949091402043368e-07, "logits/chosen": 3.02734375, "logits/rejected": 3.388671875, "logps/chosen": -866.25, "logps/rejected": -1089.0, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": 0.7109375, "rewards/margins": 5.234375, "rewards/rejected": -4.5234375, "step": 2040 }, { "epoch": 0.3856582738910671, "grad_norm": 3.0290143549170256, "learning_rate": 7.946600828726793e-07, "logits/chosen": 3.13671875, "logits/rejected": 3.0859375, "logps/chosen": -597.5, "logps/rejected": -734.0, "loss": 0.6153, "rewards/accuracies": 0.78125, "rewards/chosen": 0.65966796875, "rewards/margins": 3.77734375, "rewards/rejected": -3.12109375, "step": 2041 }, { "epoch": 0.3858472294392744, "grad_norm": 2.606164888256876, "learning_rate": 7.944109191018015e-07, "logits/chosen": 2.380859375, "logits/rejected": 2.36328125, "logps/chosen": -628.0, "logps/rejected": -558.5, "loss": 0.6219, "rewards/accuracies": 0.90625, "rewards/chosen": 0.810546875, "rewards/margins": 2.81640625, "rewards/rejected": -2.005859375, "step": 2042 }, { "epoch": 0.3860361849874817, "grad_norm": 2.3136416672058404, "learning_rate": 7.941616490001017e-07, "logits/chosen": 2.79296875, "logits/rejected": 3.09375, "logps/chosen": -1313.5, "logps/rejected": -1198.0, "loss": 0.5679, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5302734375, "rewards/margins": 4.7578125, "rewards/rejected": -3.21875, "step": 2043 }, { "epoch": 0.38622514053568896, "grad_norm": 1.9821620455229423, "learning_rate": 7.939122726760249e-07, "logits/chosen": 2.287109375, "logits/rejected": 1.80712890625, "logps/chosen": -1064.0, "logps/rejected": -1061.0, "loss": 0.5083, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9658203125, "rewards/margins": 5.26953125, "rewards/rejected": -3.30859375, "step": 2044 }, { "epoch": 0.38641409608389626, "grad_norm": 2.213423388465419, "learning_rate": 7.93662790238062e-07, "logits/chosen": 2.296875, "logits/rejected": 1.8662109375, "logps/chosen": -1004.0, "logps/rejected": -821.0, "loss": 0.5901, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2177734375, "rewards/margins": 4.25, "rewards/rejected": -3.01953125, "step": 2045 }, { "epoch": 0.38660305163210357, "grad_norm": 1.6914375652265297, "learning_rate": 7.934132017947503e-07, "logits/chosen": 2.177734375, "logits/rejected": 2.435546875, "logps/chosen": -883.0, "logps/rejected": -835.0, "loss": 0.6836, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1044921875, "rewards/margins": 2.6796875, "rewards/rejected": -1.5751953125, "step": 2046 }, { "epoch": 0.3867920071803108, "grad_norm": 1.42125389508816, "learning_rate": 7.931635074546727e-07, "logits/chosen": 3.4921875, "logits/rejected": 3.044921875, "logps/chosen": -780.25, "logps/rejected": -1532.5, "loss": 0.6238, "rewards/accuracies": 0.75, "rewards/chosen": 1.368408203125, "rewards/margins": 4.8212890625, "rewards/rejected": -3.4521484375, "step": 2047 }, { "epoch": 0.3869809627285181, "grad_norm": 2.050455194910622, "learning_rate": 7.929137073264592e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.306640625, "logps/chosen": -571.5, "logps/rejected": -537.0, "loss": 0.6432, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9912109375, "rewards/margins": 2.76953125, "rewards/rejected": -1.7763671875, "step": 2048 }, { "epoch": 0.3871699182767254, "grad_norm": 1.9334108644794958, "learning_rate": 7.926638015187847e-07, "logits/chosen": 2.33203125, "logits/rejected": 1.80859375, "logps/chosen": -812.0, "logps/rejected": -694.5, "loss": 0.6226, "rewards/accuracies": 0.84375, "rewards/chosen": 1.027740478515625, "rewards/margins": 3.248291015625, "rewards/rejected": -2.22625732421875, "step": 2049 }, { "epoch": 0.38735887382493267, "grad_norm": 2.7794659138336364, "learning_rate": 7.924137901403709e-07, "logits/chosen": 2.810546875, "logits/rejected": 2.6357421875, "logps/chosen": -759.0, "logps/rejected": -673.5, "loss": 0.6161, "rewards/accuracies": 0.84375, "rewards/chosen": 1.14453125, "rewards/margins": 3.28125, "rewards/rejected": -2.1259765625, "step": 2050 }, { "epoch": 0.38754782937314, "grad_norm": 3.391776746736688, "learning_rate": 7.921636732999849e-07, "logits/chosen": 1.5146484375, "logits/rejected": 1.900390625, "logps/chosen": -1038.0, "logps/rejected": -1100.0, "loss": 0.5175, "rewards/accuracies": 0.84375, "rewards/chosen": 1.517578125, "rewards/margins": 4.59375, "rewards/rejected": -3.076171875, "step": 2051 }, { "epoch": 0.3877367849213473, "grad_norm": 2.031564167318893, "learning_rate": 7.919134511064398e-07, "logits/chosen": 3.865234375, "logits/rejected": 3.72265625, "logps/chosen": -684.0, "logps/rejected": -747.5, "loss": 0.7371, "rewards/accuracies": 0.625, "rewards/chosen": 1.17626953125, "rewards/margins": 2.90234375, "rewards/rejected": -1.72265625, "step": 2052 }, { "epoch": 0.3879257404695545, "grad_norm": 1.635508025974354, "learning_rate": 7.916631236685949e-07, "logits/chosen": 1.527099609375, "logits/rejected": 1.63525390625, "logps/chosen": -845.0, "logps/rejected": -770.0, "loss": 0.5795, "rewards/accuracies": 0.78125, "rewards/chosen": 0.851806640625, "rewards/margins": 4.3984375, "rewards/rejected": -3.55078125, "step": 2053 }, { "epoch": 0.38811469601776183, "grad_norm": 1.8521107984712186, "learning_rate": 7.914126910953547e-07, "logits/chosen": 2.61328125, "logits/rejected": 1.951171875, "logps/chosen": -576.5, "logps/rejected": -1156.5, "loss": 0.5294, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0888671875, "rewards/margins": 5.83984375, "rewards/rejected": -4.751953125, "step": 2054 }, { "epoch": 0.38830365156596913, "grad_norm": 1.6398554829423229, "learning_rate": 7.9116215349567e-07, "logits/chosen": 2.0595703125, "logits/rejected": 1.85028076171875, "logps/chosen": -792.5, "logps/rejected": -1059.5, "loss": 0.6016, "rewards/accuracies": 0.84375, "rewards/chosen": 1.461669921875, "rewards/margins": 4.0625, "rewards/rejected": -2.6005859375, "step": 2055 }, { "epoch": 0.3884926071141764, "grad_norm": 2.016110135033127, "learning_rate": 7.909115109785369e-07, "logits/chosen": 2.318359375, "logits/rejected": 2.16748046875, "logps/chosen": -727.0, "logps/rejected": -695.5, "loss": 0.7416, "rewards/accuracies": 0.71875, "rewards/chosen": 0.065185546875, "rewards/margins": 2.39453125, "rewards/rejected": -2.328125, "step": 2056 }, { "epoch": 0.3886815626623837, "grad_norm": 2.2424959718377995, "learning_rate": 7.906607636529973e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.625, "logps/chosen": -722.5, "logps/rejected": -952.5, "loss": 0.542, "rewards/accuracies": 0.875, "rewards/chosen": 1.53125, "rewards/margins": 6.357421875, "rewards/rejected": -4.830078125, "step": 2057 }, { "epoch": 0.38887051821059093, "grad_norm": 2.7506133423083274, "learning_rate": 7.904099116281385e-07, "logits/chosen": 2.71484375, "logits/rejected": 3.12109375, "logps/chosen": -494.5, "logps/rejected": -1270.0, "loss": 0.6746, "rewards/accuracies": 0.84375, "rewards/chosen": 0.76904296875, "rewards/margins": 7.546875, "rewards/rejected": -6.8076171875, "step": 2058 }, { "epoch": 0.38905947375879824, "grad_norm": 1.7186137163323285, "learning_rate": 7.901589550130938e-07, "logits/chosen": 2.990234375, "logits/rejected": 2.837890625, "logps/chosen": -748.5, "logps/rejected": -965.0, "loss": 0.6653, "rewards/accuracies": 0.75, "rewards/chosen": 0.72216796875, "rewards/margins": -4.41796875, "rewards/rejected": 5.11328125, "step": 2059 }, { "epoch": 0.38924842930700554, "grad_norm": 1.8285684299135283, "learning_rate": 7.899078939170417e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.0078125, "logps/chosen": -17619.0, "logps/rejected": -18314.5, "loss": 0.6891, "rewards/accuracies": 0.78125, "rewards/chosen": -18.4814453125, "rewards/margins": 3.55859375, "rewards/rejected": -22.12109375, "step": 2060 }, { "epoch": 0.3894373848552128, "grad_norm": 2.17873801378497, "learning_rate": 7.896567284492059e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.794921875, "logps/chosen": -1181.5, "logps/rejected": -1432.0, "loss": 0.5843, "rewards/accuracies": 0.75, "rewards/chosen": 1.037109375, "rewards/margins": 5.453125, "rewards/rejected": -4.4296875, "step": 2061 }, { "epoch": 0.3896263404034201, "grad_norm": 1.969733161559498, "learning_rate": 7.894054587188561e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.759765625, "logps/chosen": -748.0, "logps/rejected": -870.0, "loss": 0.5765, "rewards/accuracies": 0.875, "rewards/chosen": 0.908203125, "rewards/margins": 3.6875, "rewards/rejected": -2.775390625, "step": 2062 }, { "epoch": 0.3898152959516274, "grad_norm": 2.1828673613014224, "learning_rate": 7.891540848353067e-07, "logits/chosen": 2.615234375, "logits/rejected": 2.62890625, "logps/chosen": -616.5, "logps/rejected": -698.0, "loss": 0.6651, "rewards/accuracies": 0.65625, "rewards/chosen": 0.64794921875, "rewards/margins": 3.0546875, "rewards/rejected": -2.40771484375, "step": 2063 }, { "epoch": 0.39000425149983464, "grad_norm": 2.7064288976094986, "learning_rate": 7.889026069079182e-07, "logits/chosen": 3.359375, "logits/rejected": 3.51953125, "logps/chosen": -636.5, "logps/rejected": -961.0, "loss": 0.6571, "rewards/accuracies": 0.75, "rewards/chosen": 0.612060546875, "rewards/margins": 3.37646484375, "rewards/rejected": -2.76123046875, "step": 2064 }, { "epoch": 0.39019320704804195, "grad_norm": 1.8577050518771137, "learning_rate": 7.886510250460957e-07, "logits/chosen": 2.734375, "logits/rejected": 2.51171875, "logps/chosen": -605.5, "logps/rejected": -660.5, "loss": 0.7564, "rewards/accuracies": 0.6875, "rewards/chosen": 0.298828125, "rewards/margins": 2.6328125, "rewards/rejected": -2.3359375, "step": 2065 }, { "epoch": 0.39038216259624925, "grad_norm": 2.3011024654721988, "learning_rate": 7.883993393592894e-07, "logits/chosen": 2.015625, "logits/rejected": 1.986328125, "logps/chosen": -714.0, "logps/rejected": -751.0, "loss": 0.6966, "rewards/accuracies": 0.75, "rewards/chosen": 0.4091796875, "rewards/margins": 3.501953125, "rewards/rejected": -3.0859375, "step": 2066 }, { "epoch": 0.3905711181444565, "grad_norm": 2.545037197037691, "learning_rate": 7.881475499569955e-07, "logits/chosen": 1.984375, "logits/rejected": 1.7197265625, "logps/chosen": -1163.0, "logps/rejected": -928.0, "loss": 0.5573, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74951171875, "rewards/margins": 4.787109375, "rewards/rejected": -4.0390625, "step": 2067 }, { "epoch": 0.3907600736926638, "grad_norm": 2.8378714120908644, "learning_rate": 7.878956569487547e-07, "logits/chosen": 3.35546875, "logits/rejected": 3.59375, "logps/chosen": -601.25, "logps/rejected": -743.5, "loss": 0.5896, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4501953125, "rewards/margins": 4.46875, "rewards/rejected": -3.01953125, "step": 2068 }, { "epoch": 0.3909490292408711, "grad_norm": 2.5380269376984916, "learning_rate": 7.876436604441529e-07, "logits/chosen": 2.212890625, "logits/rejected": 1.98046875, "logps/chosen": -1060.0, "logps/rejected": -1945.0, "loss": 0.6139, "rewards/accuracies": 0.8125, "rewards/chosen": 0.381103515625, "rewards/margins": 6.83203125, "rewards/rejected": -6.4609375, "step": 2069 }, { "epoch": 0.39113798478907835, "grad_norm": 2.2175374688494713, "learning_rate": 7.873915605528209e-07, "logits/chosen": 2.130859375, "logits/rejected": 2.123046875, "logps/chosen": -1054.0, "logps/rejected": -1229.0, "loss": 0.5986, "rewards/accuracies": 0.84375, "rewards/chosen": 1.143310546875, "rewards/margins": 4.72265625, "rewards/rejected": -3.58203125, "step": 2070 }, { "epoch": 0.39132694033728566, "grad_norm": 1.7903852296984504, "learning_rate": 7.871393573844346e-07, "logits/chosen": 1.596435546875, "logits/rejected": 1.27587890625, "logps/chosen": -1556.5, "logps/rejected": -1243.0, "loss": 0.5397, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3787841796875, "rewards/margins": 6.3486328125, "rewards/rejected": -5.9609375, "step": 2071 }, { "epoch": 0.39151589588549296, "grad_norm": 2.6473108414578364, "learning_rate": 7.86887051048715e-07, "logits/chosen": 1.8349609375, "logits/rejected": 1.7197265625, "logps/chosen": -918.0, "logps/rejected": -671.5, "loss": 0.6171, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5078125, "rewards/margins": 3.4296875, "rewards/rejected": -2.9140625, "step": 2072 }, { "epoch": 0.3917048514337002, "grad_norm": 1.9362973227695675, "learning_rate": 7.866346416554277e-07, "logits/chosen": 1.56494140625, "logits/rejected": 1.306640625, "logps/chosen": -495.0, "logps/rejected": -671.0, "loss": 0.6297, "rewards/accuracies": 0.875, "rewards/chosen": 0.473388671875, "rewards/margins": 3.3046875, "rewards/rejected": -2.828125, "step": 2073 }, { "epoch": 0.3918938069819075, "grad_norm": 2.0728504829291814, "learning_rate": 7.863821293143833e-07, "logits/chosen": 1.625, "logits/rejected": 0.7626953125, "logps/chosen": -1030.5, "logps/rejected": -877.0, "loss": 0.6386, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4814453125, "rewards/margins": 3.609375, "rewards/rejected": -3.12109375, "step": 2074 }, { "epoch": 0.3920827625301148, "grad_norm": 1.62226828446749, "learning_rate": 7.861295141354372e-07, "logits/chosen": 2.068359375, "logits/rejected": 1.876708984375, "logps/chosen": -661.0, "logps/rejected": -767.0, "loss": 0.6246, "rewards/accuracies": 0.84375, "rewards/chosen": 0.54388427734375, "rewards/margins": 4.033203125, "rewards/rejected": -3.484375, "step": 2075 }, { "epoch": 0.39227171807832206, "grad_norm": 2.8597052725421976, "learning_rate": 7.858767962284892e-07, "logits/chosen": 2.083984375, "logits/rejected": 1.90625, "logps/chosen": -725.5, "logps/rejected": -586.0, "loss": 0.6411, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0078125, "rewards/margins": 3.5234375, "rewards/rejected": -2.515625, "step": 2076 }, { "epoch": 0.39246067362652937, "grad_norm": 1.9601761760576588, "learning_rate": 7.856239757034844e-07, "logits/chosen": 2.1611328125, "logits/rejected": 2.166015625, "logps/chosen": -545.5, "logps/rejected": -777.0, "loss": 0.7029, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15478515625, "rewards/margins": 5.48828125, "rewards/rejected": -5.33203125, "step": 2077 }, { "epoch": 0.39264962917473667, "grad_norm": 2.654143254090152, "learning_rate": 7.853710526704121e-07, "logits/chosen": 2.30224609375, "logits/rejected": 2.4073486328125, "logps/chosen": -427.5, "logps/rejected": -705.5, "loss": 0.5625, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5693359375, "rewards/margins": 4.7578125, "rewards/rejected": -4.18359375, "step": 2078 }, { "epoch": 0.3928385847229439, "grad_norm": 2.414766847813382, "learning_rate": 7.851180272393066e-07, "logits/chosen": 1.9365234375, "logits/rejected": 1.31884765625, "logps/chosen": -1147.0, "logps/rejected": -1014.0, "loss": 0.6089, "rewards/accuracies": 0.75, "rewards/chosen": 1.2109375, "rewards/margins": 3.33203125, "rewards/rejected": -2.12109375, "step": 2079 }, { "epoch": 0.3930275402711512, "grad_norm": 2.080323765664717, "learning_rate": 7.848648995202457e-07, "logits/chosen": 2.62109375, "logits/rejected": 2.234375, "logps/chosen": -878.5, "logps/rejected": -702.0, "loss": 0.7791, "rewards/accuracies": 0.65625, "rewards/chosen": 0.282958984375, "rewards/margins": 2.455078125, "rewards/rejected": -2.173828125, "step": 2080 }, { "epoch": 0.39321649581935847, "grad_norm": 2.731722470838107, "learning_rate": 7.84611669623353e-07, "logits/chosen": 1.0888671875, "logits/rejected": 1.390625, "logps/chosen": -542.0, "logps/rejected": -1608.5, "loss": 0.6271, "rewards/accuracies": 0.78125, "rewards/chosen": 0.50048828125, "rewards/margins": 4.83203125, "rewards/rejected": -4.33984375, "step": 2081 }, { "epoch": 0.3934054513675658, "grad_norm": 3.0224780708014958, "learning_rate": 7.84358337658796e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.65234375, "logps/chosen": -669.0, "logps/rejected": -868.5, "loss": 0.5368, "rewards/accuracies": 0.90625, "rewards/chosen": 1.541015625, "rewards/margins": 6.40625, "rewards/rejected": -4.85546875, "step": 2082 }, { "epoch": 0.3935944069157731, "grad_norm": 2.2943437820220214, "learning_rate": 7.841049037367868e-07, "logits/chosen": 2.111328125, "logits/rejected": 1.65625, "logps/chosen": -1117.0, "logps/rejected": -1353.0, "loss": 0.5224, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9970703125, "rewards/margins": 5.51953125, "rewards/rejected": -4.51953125, "step": 2083 }, { "epoch": 0.3937833624639803, "grad_norm": 1.9315331971610854, "learning_rate": 7.838513679675812e-07, "logits/chosen": 1.958984375, "logits/rejected": 1.5361328125, "logps/chosen": -536.0, "logps/rejected": -855.0, "loss": 0.5455, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4691162109375, "rewards/margins": 4.90234375, "rewards/rejected": -4.4375, "step": 2084 }, { "epoch": 0.39397231801218763, "grad_norm": 2.8309051651828816, "learning_rate": 7.8359773046148e-07, "logits/chosen": 2.19140625, "logits/rejected": 1.927734375, "logps/chosen": -699.5, "logps/rejected": -963.0, "loss": 0.6487, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3515625, "rewards/margins": 4.19140625, "rewards/rejected": -3.83984375, "step": 2085 }, { "epoch": 0.39416127356039493, "grad_norm": 2.374913600482032, "learning_rate": 7.833439913288281e-07, "logits/chosen": 2.40625, "logits/rejected": 2.3525390625, "logps/chosen": -805.5, "logps/rejected": -1142.0, "loss": 0.6106, "rewards/accuracies": 0.8125, "rewards/chosen": 0.919921875, "rewards/margins": 5.427734375, "rewards/rejected": -4.51171875, "step": 2086 }, { "epoch": 0.3943502291086022, "grad_norm": 2.5590025968503114, "learning_rate": 7.830901506800146e-07, "logits/chosen": 3.10546875, "logits/rejected": 2.84765625, "logps/chosen": -541.5, "logps/rejected": -561.0, "loss": 0.782, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5810546875, "rewards/margins": 2.27734375, "rewards/rejected": -1.69921875, "step": 2087 }, { "epoch": 0.3945391846568095, "grad_norm": 2.287525420295982, "learning_rate": 7.828362086254724e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.837890625, "logps/chosen": -1233.0, "logps/rejected": -1221.5, "loss": 0.5727, "rewards/accuracies": 0.8125, "rewards/chosen": 1.658203125, "rewards/margins": 4.9921875, "rewards/rejected": -3.3359375, "step": 2088 }, { "epoch": 0.3947281402050168, "grad_norm": 3.0527093606148776, "learning_rate": 7.825821652756792e-07, "logits/chosen": 3.3984375, "logits/rejected": 2.623046875, "logps/chosen": -1174.0, "logps/rejected": -1099.0, "loss": 0.512, "rewards/accuracies": 0.875, "rewards/chosen": 1.9482421875, "rewards/margins": 6.0078125, "rewards/rejected": -4.0703125, "step": 2089 }, { "epoch": 0.39491709575322403, "grad_norm": 1.87502739291785, "learning_rate": 7.823280207411565e-07, "logits/chosen": 2.251953125, "logits/rejected": 2.04296875, "logps/chosen": -793.5, "logps/rejected": -1019.0, "loss": 0.5368, "rewards/accuracies": 0.84375, "rewards/chosen": 1.380859375, "rewards/margins": 5.05859375, "rewards/rejected": -3.6875, "step": 2090 }, { "epoch": 0.39510605130143134, "grad_norm": 2.0557163469011486, "learning_rate": 7.820737751324692e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.26171875, "logps/chosen": -797.0, "logps/rejected": -1139.0, "loss": 0.6381, "rewards/accuracies": 0.71875, "rewards/chosen": 1.11865234375, "rewards/margins": 5.09375, "rewards/rejected": -3.97265625, "step": 2091 }, { "epoch": 0.39529500684963864, "grad_norm": 2.6161264118616225, "learning_rate": 7.818194285602271e-07, "logits/chosen": 1.86279296875, "logits/rejected": 1.72509765625, "logps/chosen": -872.0, "logps/rejected": -790.0, "loss": 0.593, "rewards/accuracies": 0.875, "rewards/chosen": 0.8427734375, "rewards/margins": 3.82421875, "rewards/rejected": -2.982421875, "step": 2092 }, { "epoch": 0.3954839623978459, "grad_norm": 1.9783135081745715, "learning_rate": 7.815649811350834e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.861328125, "logps/chosen": -1224.0, "logps/rejected": -1264.5, "loss": 0.6833, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1953125, "rewards/margins": 3.658203125, "rewards/rejected": -3.455078125, "step": 2093 }, { "epoch": 0.3956729179460532, "grad_norm": 1.8559047903377646, "learning_rate": 7.813104329677352e-07, "logits/chosen": 2.314453125, "logits/rejected": 2.220703125, "logps/chosen": -869.0, "logps/rejected": -1044.5, "loss": 0.6525, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9677734375, "rewards/margins": 4.07421875, "rewards/rejected": -3.09375, "step": 2094 }, { "epoch": 0.3958618734942605, "grad_norm": 1.6580852837271012, "learning_rate": 7.810557841689238e-07, "logits/chosen": 1.3916015625, "logits/rejected": 1.67578125, "logps/chosen": -850.0, "logps/rejected": -1127.0, "loss": 0.6157, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0048828125, "rewards/margins": 7.23828125, "rewards/rejected": -7.2421875, "step": 2095 }, { "epoch": 0.39605082904246774, "grad_norm": 2.22146645973158, "learning_rate": 7.808010348494338e-07, "logits/chosen": 2.0859375, "logits/rejected": 2.154296875, "logps/chosen": -827.5, "logps/rejected": -1165.0, "loss": 0.6728, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0419921875, "rewards/margins": 4.6171875, "rewards/rejected": -4.57421875, "step": 2096 }, { "epoch": 0.39623978459067505, "grad_norm": 1.8583171300276298, "learning_rate": 7.805461851200938e-07, "logits/chosen": 2.7734375, "logits/rejected": 3.08203125, "logps/chosen": -837.0, "logps/rejected": -1107.0, "loss": 0.5852, "rewards/accuracies": 0.875, "rewards/chosen": 0.806640625, "rewards/margins": 5.58203125, "rewards/rejected": -4.78125, "step": 2097 }, { "epoch": 0.39642874013888235, "grad_norm": 1.8527182332533483, "learning_rate": 7.80291235091776e-07, "logits/chosen": 2.5, "logits/rejected": 2.384765625, "logps/chosen": -768.0, "logps/rejected": -853.0, "loss": 0.5341, "rewards/accuracies": 0.84375, "rewards/chosen": 1.35791015625, "rewards/margins": 5.3671875, "rewards/rejected": -4.01953125, "step": 2098 }, { "epoch": 0.3966176956870896, "grad_norm": 2.383254379344845, "learning_rate": 7.800361848753963e-07, "logits/chosen": 2.291015625, "logits/rejected": 1.96484375, "logps/chosen": -855.0, "logps/rejected": -1531.0, "loss": 0.5395, "rewards/accuracies": 0.84375, "rewards/chosen": 1.014404296875, "rewards/margins": 5.34375, "rewards/rejected": -4.328125, "step": 2099 }, { "epoch": 0.3968066512352969, "grad_norm": 2.2226053497681573, "learning_rate": 7.79781034581914e-07, "logits/chosen": 1.912109375, "logits/rejected": 1.4287109375, "logps/chosen": -702.0, "logps/rejected": -533.5, "loss": 0.5449, "rewards/accuracies": 0.84375, "rewards/chosen": 0.50439453125, "rewards/margins": 4.15625, "rewards/rejected": -3.658203125, "step": 2100 }, { "epoch": 0.3969956067835042, "grad_norm": 1.6033244765490107, "learning_rate": 7.795257843223323e-07, "logits/chosen": 3.15625, "logits/rejected": 2.701171875, "logps/chosen": -569.0, "logps/rejected": -1590.0, "loss": 0.6283, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6787109375, "rewards/margins": 9.3359375, "rewards/rejected": -8.640625, "step": 2101 }, { "epoch": 0.39718456233171145, "grad_norm": 2.15940949097373, "learning_rate": 7.792704342076975e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.4375, "logps/chosen": -802.5, "logps/rejected": -846.0, "loss": 0.6748, "rewards/accuracies": 0.6875, "rewards/chosen": 1.036376953125, "rewards/margins": 5.03515625, "rewards/rejected": -4.009765625, "step": 2102 }, { "epoch": 0.39737351787991876, "grad_norm": 2.0192365913400723, "learning_rate": 7.790149843490995e-07, "logits/chosen": 2.4375, "logits/rejected": 1.919921875, "logps/chosen": -1068.0, "logps/rejected": -1053.0, "loss": 0.5648, "rewards/accuracies": 0.75, "rewards/chosen": 1.375, "rewards/margins": 5.828125, "rewards/rejected": -4.45703125, "step": 2103 }, { "epoch": 0.397562473428126, "grad_norm": 1.7316831305872098, "learning_rate": 7.787594348576717e-07, "logits/chosen": 2.455078125, "logits/rejected": 2.267578125, "logps/chosen": -801.5, "logps/rejected": -1878.0, "loss": 0.5049, "rewards/accuracies": 0.84375, "rewards/chosen": 1.333984375, "rewards/margins": 10.2734375, "rewards/rejected": -8.9375, "step": 2104 }, { "epoch": 0.3977514289763333, "grad_norm": 2.1970365016336744, "learning_rate": 7.785037858445907e-07, "logits/chosen": 2.37890625, "logits/rejected": 2.0048828125, "logps/chosen": -508.5, "logps/rejected": -504.5, "loss": 0.7587, "rewards/accuracies": 0.71875, "rewards/chosen": -0.057861328125, "rewards/margins": 2.068359375, "rewards/rejected": -2.1279296875, "step": 2105 }, { "epoch": 0.3979403845245406, "grad_norm": 2.170646861000275, "learning_rate": 7.782480374210766e-07, "logits/chosen": 1.787109375, "logits/rejected": 1.1875, "logps/chosen": -845.0, "logps/rejected": -892.0, "loss": 0.5133, "rewards/accuracies": 0.875, "rewards/chosen": 0.990234375, "rewards/margins": 4.16796875, "rewards/rejected": -3.1796875, "step": 2106 }, { "epoch": 0.39812934007274786, "grad_norm": 3.7515947693448273, "learning_rate": 7.779921896983923e-07, "logits/chosen": 2.53515625, "logits/rejected": 3.14453125, "logps/chosen": -751.0, "logps/rejected": -1262.0, "loss": 0.7139, "rewards/accuracies": 0.6875, "rewards/chosen": 0.369140625, "rewards/margins": 3.033203125, "rewards/rejected": -2.6650390625, "step": 2107 }, { "epoch": 0.39831829562095517, "grad_norm": 1.5055933298633328, "learning_rate": 7.777362427878445e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.56640625, "logps/chosen": -655.0, "logps/rejected": -1010.0, "loss": 0.6534, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3515625, "rewards/margins": 4.677734375, "rewards/rejected": -4.32421875, "step": 2108 }, { "epoch": 0.39850725116916247, "grad_norm": 1.9578881089363749, "learning_rate": 7.774801968007823e-07, "logits/chosen": 1.79296875, "logits/rejected": 2.10546875, "logps/chosen": -657.0, "logps/rejected": -634.0, "loss": 0.6862, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4146728515625, "rewards/margins": 3.13671875, "rewards/rejected": -2.71484375, "step": 2109 }, { "epoch": 0.3986962067173697, "grad_norm": 1.9473782488529816, "learning_rate": 7.772240518485991e-07, "logits/chosen": 2.494140625, "logits/rejected": 2.470703125, "logps/chosen": -549.5, "logps/rejected": -924.5, "loss": 0.5662, "rewards/accuracies": 0.875, "rewards/chosen": 0.0262451171875, "rewards/margins": 5.984375, "rewards/rejected": -5.96484375, "step": 2110 }, { "epoch": 0.398885162265577, "grad_norm": 2.1890486307480024, "learning_rate": 7.769678080427298e-07, "logits/chosen": 2.36328125, "logits/rejected": 2.212890625, "logps/chosen": -574.0, "logps/rejected": -535.5, "loss": 0.7308, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22052001953125, "rewards/margins": 2.70703125, "rewards/rejected": -2.92578125, "step": 2111 }, { "epoch": 0.3990741178137843, "grad_norm": 2.347098044333424, "learning_rate": 7.767114654946537e-07, "logits/chosen": 3.01953125, "logits/rejected": 3.013671875, "logps/chosen": -988.5, "logps/rejected": -1013.0, "loss": 0.6757, "rewards/accuracies": 0.75, "rewards/chosen": 0.695556640625, "rewards/margins": 4.01953125, "rewards/rejected": -3.32421875, "step": 2112 }, { "epoch": 0.39926307336199157, "grad_norm": 1.6699305228017645, "learning_rate": 7.764550243158922e-07, "logits/chosen": 2.296875, "logits/rejected": 2.146484375, "logps/chosen": -752.0, "logps/rejected": -799.0, "loss": 0.6163, "rewards/accuracies": 0.78125, "rewards/chosen": 0.498779296875, "rewards/margins": 5.4375, "rewards/rejected": -4.9453125, "step": 2113 }, { "epoch": 0.3994520289101989, "grad_norm": 2.4175974381745147, "learning_rate": 7.761984846180099e-07, "logits/chosen": 2.373046875, "logits/rejected": 2.0400390625, "logps/chosen": -786.0, "logps/rejected": -660.0, "loss": 0.6773, "rewards/accuracies": 0.84375, "rewards/chosen": 0.79736328125, "rewards/margins": 3.28515625, "rewards/rejected": -2.49609375, "step": 2114 }, { "epoch": 0.3996409844584062, "grad_norm": 1.6133043446681739, "learning_rate": 7.759418465126143e-07, "logits/chosen": 1.4501953125, "logits/rejected": 1.2685546875, "logps/chosen": -570.0, "logps/rejected": -603.0, "loss": 0.5386, "rewards/accuracies": 0.9375, "rewards/chosen": 0.799896240234375, "rewards/margins": 3.59375, "rewards/rejected": -2.7890625, "step": 2115 }, { "epoch": 0.3998299400066134, "grad_norm": 1.773683788630619, "learning_rate": 7.756851101113555e-07, "logits/chosen": 2.291015625, "logits/rejected": 2.357421875, "logps/chosen": -1037.0, "logps/rejected": -1071.0, "loss": 0.4657, "rewards/accuracies": 0.875, "rewards/chosen": 1.7060546875, "rewards/margins": 5.68359375, "rewards/rejected": -3.96875, "step": 2116 }, { "epoch": 0.40001889555482073, "grad_norm": 2.631194663978628, "learning_rate": 7.754282755259267e-07, "logits/chosen": 1.85546875, "logits/rejected": 1.96484375, "logps/chosen": -584.0, "logps/rejected": -651.0, "loss": 0.7082, "rewards/accuracies": 0.75, "rewards/chosen": -0.10107421875, "rewards/margins": 3.02734375, "rewards/rejected": -3.12109375, "step": 2117 }, { "epoch": 0.40020785110302803, "grad_norm": 1.86314035135901, "learning_rate": 7.751713428680636e-07, "logits/chosen": 1.52734375, "logits/rejected": 1.10693359375, "logps/chosen": -696.0, "logps/rejected": -1431.0, "loss": 0.5975, "rewards/accuracies": 0.75, "rewards/chosen": 0.12548828125, "rewards/margins": 7.21875, "rewards/rejected": -7.109375, "step": 2118 }, { "epoch": 0.4003968066512353, "grad_norm": 1.8714812400908567, "learning_rate": 7.749143122495446e-07, "logits/chosen": 1.92578125, "logits/rejected": 1.828125, "logps/chosen": -1258.0, "logps/rejected": -1170.0, "loss": 0.5178, "rewards/accuracies": 0.8125, "rewards/chosen": 1.22900390625, "rewards/margins": 4.67578125, "rewards/rejected": -3.4453125, "step": 2119 }, { "epoch": 0.4005857621994426, "grad_norm": 3.08527391407342, "learning_rate": 7.746571837821905e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.22265625, "logps/chosen": -1110.0, "logps/rejected": -921.0, "loss": 0.6117, "rewards/accuracies": 0.875, "rewards/chosen": 0.460601806640625, "rewards/margins": 3.783203125, "rewards/rejected": -3.310546875, "step": 2120 }, { "epoch": 0.4007747177476499, "grad_norm": 2.1371721409270146, "learning_rate": 7.743999575778654e-07, "logits/chosen": 2.544921875, "logits/rejected": 2.43359375, "logps/chosen": -690.5, "logps/rejected": -805.5, "loss": 0.62, "rewards/accuracies": 0.75, "rewards/chosen": 0.256072998046875, "rewards/margins": 4.88671875, "rewards/rejected": -4.6328125, "step": 2121 }, { "epoch": 0.40096367329585714, "grad_norm": 2.009419807349137, "learning_rate": 7.741426337484748e-07, "logits/chosen": 2.40625, "logits/rejected": 2.28125, "logps/chosen": -1082.0, "logps/rejected": -1294.0, "loss": 0.4973, "rewards/accuracies": 0.875, "rewards/chosen": 0.697998046875, "rewards/margins": 5.171875, "rewards/rejected": -4.48046875, "step": 2122 }, { "epoch": 0.40115262884406444, "grad_norm": 1.9185005630681875, "learning_rate": 7.738852124059676e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.955078125, "logps/chosen": -571.0, "logps/rejected": -1023.5, "loss": 0.5931, "rewards/accuracies": 0.8125, "rewards/chosen": 0.73046875, "rewards/margins": 7.23046875, "rewards/rejected": -6.5, "step": 2123 }, { "epoch": 0.40134158439227174, "grad_norm": 2.4309723379878787, "learning_rate": 7.736276936623347e-07, "logits/chosen": 3.32421875, "logits/rejected": 3.68359375, "logps/chosen": -1170.0, "logps/rejected": -1643.0, "loss": 0.69, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0283203125, "rewards/margins": 5.51171875, "rewards/rejected": -4.48828125, "step": 2124 }, { "epoch": 0.401530539940479, "grad_norm": 2.987222202472205, "learning_rate": 7.733700776296094e-07, "logits/chosen": 2.3359375, "logits/rejected": 2.31201171875, "logps/chosen": -678.0, "logps/rejected": -844.5, "loss": 0.5265, "rewards/accuracies": 0.8125, "rewards/chosen": 0.44140625, "rewards/margins": 5.546875, "rewards/rejected": -5.1171875, "step": 2125 }, { "epoch": 0.4017194954886863, "grad_norm": 1.662387734776214, "learning_rate": 7.731123644198677e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.478515625, "logps/chosen": -920.0, "logps/rejected": -1131.0, "loss": 0.5553, "rewards/accuracies": 0.8125, "rewards/chosen": 1.548828125, "rewards/margins": 4.8828125, "rewards/rejected": -3.3359375, "step": 2126 }, { "epoch": 0.40190845103689354, "grad_norm": 1.7723006758402613, "learning_rate": 7.728545541452273e-07, "logits/chosen": 2.59375, "logits/rejected": 2.548828125, "logps/chosen": -922.0, "logps/rejected": -726.5, "loss": 0.6375, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4814453125, "rewards/margins": 1.08203125, "rewards/rejected": -2.58447265625, "step": 2127 }, { "epoch": 0.40209740658510085, "grad_norm": 2.1079395317884226, "learning_rate": 7.725966469178485e-07, "logits/chosen": 1.8134765625, "logits/rejected": 1.58203125, "logps/chosen": -758.0, "logps/rejected": -564.0, "loss": 0.5421, "rewards/accuracies": 0.875, "rewards/chosen": 0.98046875, "rewards/margins": 4.21484375, "rewards/rejected": -3.24609375, "step": 2128 }, { "epoch": 0.40228636213330815, "grad_norm": 2.1286563365919022, "learning_rate": 7.723386428499337e-07, "logits/chosen": 2.654296875, "logits/rejected": 2.88671875, "logps/chosen": -480.0, "logps/rejected": -804.5, "loss": 0.6382, "rewards/accuracies": 0.84375, "rewards/chosen": 0.893798828125, "rewards/margins": 3.70703125, "rewards/rejected": -2.806640625, "step": 2129 }, { "epoch": 0.4024753176815154, "grad_norm": 1.8113837282759684, "learning_rate": 7.720805420537273e-07, "logits/chosen": 2.359375, "logits/rejected": 1.876220703125, "logps/chosen": -1169.5, "logps/rejected": -1045.0, "loss": 0.5248, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2578125, "rewards/margins": 4.609375, "rewards/rejected": -3.35546875, "step": 2130 }, { "epoch": 0.4026642732297227, "grad_norm": 2.140542382110958, "learning_rate": 7.718223446415161e-07, "logits/chosen": 2.447265625, "logits/rejected": 2.484375, "logps/chosen": -898.0, "logps/rejected": -789.0, "loss": 0.6998, "rewards/accuracies": 0.6875, "rewards/chosen": 1.29443359375, "rewards/margins": 4.29296875, "rewards/rejected": -2.990234375, "step": 2131 }, { "epoch": 0.40285322877793, "grad_norm": 2.773781526472497, "learning_rate": 7.715640507256285e-07, "logits/chosen": 2.0390625, "logits/rejected": 2.005859375, "logps/chosen": -694.5, "logps/rejected": -770.0, "loss": 0.6623, "rewards/accuracies": 0.75, "rewards/chosen": 0.062652587890625, "rewards/margins": 2.8359375, "rewards/rejected": -2.76953125, "step": 2132 }, { "epoch": 0.40304218432613725, "grad_norm": 1.769406395446013, "learning_rate": 7.713056604184353e-07, "logits/chosen": 2.9609375, "logits/rejected": 3.10546875, "logps/chosen": -639.5, "logps/rejected": -1550.0, "loss": 0.6349, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7353515625, "rewards/margins": 6.4375, "rewards/rejected": -5.693359375, "step": 2133 }, { "epoch": 0.40323113987434456, "grad_norm": 2.526783666028629, "learning_rate": 7.710471738323489e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.9765625, "logps/chosen": -636.0, "logps/rejected": -1322.0, "loss": 0.6562, "rewards/accuracies": 0.875, "rewards/chosen": -0.093017578125, "rewards/margins": 6.59375, "rewards/rejected": -6.67578125, "step": 2134 }, { "epoch": 0.40342009542255186, "grad_norm": 2.5255758496082703, "learning_rate": 7.707885910798239e-07, "logits/chosen": 1.357421875, "logits/rejected": 0.9150390625, "logps/chosen": -1089.0, "logps/rejected": -963.0, "loss": 0.5529, "rewards/accuracies": 0.84375, "rewards/chosen": 0.446929931640625, "rewards/margins": 4.15234375, "rewards/rejected": -3.703125, "step": 2135 }, { "epoch": 0.4036090509707591, "grad_norm": 1.7529124506484366, "learning_rate": 7.705299122733563e-07, "logits/chosen": 3.56640625, "logits/rejected": 3.2333984375, "logps/chosen": -663.0, "logps/rejected": -666.0, "loss": 0.5664, "rewards/accuracies": 0.8125, "rewards/chosen": 1.330078125, "rewards/margins": 4.46484375, "rewards/rejected": -3.13671875, "step": 2136 }, { "epoch": 0.4037980065189664, "grad_norm": 3.160787528027896, "learning_rate": 7.702711375254844e-07, "logits/chosen": 1.92578125, "logits/rejected": 1.89453125, "logps/chosen": -592.0, "logps/rejected": -625.0, "loss": 0.6969, "rewards/accuracies": 0.75, "rewards/chosen": 0.439453125, "rewards/margins": 2.859375, "rewards/rejected": -2.41650390625, "step": 2137 }, { "epoch": 0.4039869620671737, "grad_norm": 1.688581928035365, "learning_rate": 7.700122669487878e-07, "logits/chosen": 2.41015625, "logits/rejected": 2.31103515625, "logps/chosen": -652.0, "logps/rejected": -801.0, "loss": 0.4477, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1201171875, "rewards/margins": 5.1796875, "rewards/rejected": -4.05859375, "step": 2138 }, { "epoch": 0.40417591761538096, "grad_norm": 1.7911554729709733, "learning_rate": 7.697533006558883e-07, "logits/chosen": 2.5234375, "logits/rejected": 2.037109375, "logps/chosen": -979.0, "logps/rejected": -877.0, "loss": 0.6076, "rewards/accuracies": 0.8125, "rewards/chosen": 0.419189453125, "rewards/margins": 3.6875, "rewards/rejected": -3.26953125, "step": 2139 }, { "epoch": 0.40436487316358827, "grad_norm": 1.5610046582424193, "learning_rate": 7.694942387594487e-07, "logits/chosen": 1.884765625, "logits/rejected": 1.40576171875, "logps/chosen": -595.5, "logps/rejected": -643.5, "loss": 0.5792, "rewards/accuracies": 0.78125, "rewards/chosen": 0.21630859375, "rewards/margins": 4.03515625, "rewards/rejected": -3.8125, "step": 2140 }, { "epoch": 0.40455382871179557, "grad_norm": 1.6670230982811984, "learning_rate": 7.69235081372174e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.9921875, "logps/chosen": -1235.0, "logps/rejected": -2328.0, "loss": 0.412, "rewards/accuracies": 0.875, "rewards/chosen": 2.46875, "rewards/margins": 8.671875, "rewards/rejected": -6.203125, "step": 2141 }, { "epoch": 0.4047427842600028, "grad_norm": 2.5004589517239184, "learning_rate": 7.689758286068102e-07, "logits/chosen": 1.4453125, "logits/rejected": 1.3270263671875, "logps/chosen": -720.0, "logps/rejected": -745.5, "loss": 0.5708, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33740234375, "rewards/margins": 4.03515625, "rewards/rejected": -3.6953125, "step": 2142 }, { "epoch": 0.4049317398082101, "grad_norm": 1.6914361337666126, "learning_rate": 7.687164805761451e-07, "logits/chosen": 2.828125, "logits/rejected": 2.517578125, "logps/chosen": -997.0, "logps/rejected": -759.0, "loss": 0.5912, "rewards/accuracies": 0.8125, "rewards/chosen": 0.441162109375, "rewards/margins": 4.98046875, "rewards/rejected": -4.53515625, "step": 2143 }, { "epoch": 0.4051206953564174, "grad_norm": 2.423189234841138, "learning_rate": 7.684570373930082e-07, "logits/chosen": 2.751953125, "logits/rejected": 2.362060546875, "logps/chosen": -1082.0, "logps/rejected": -1048.0, "loss": 0.6536, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3563232421875, "rewards/margins": 5.46875, "rewards/rejected": -5.11328125, "step": 2144 }, { "epoch": 0.4053096509046247, "grad_norm": 2.6312481774456433, "learning_rate": 7.681974991702697e-07, "logits/chosen": 1.951171875, "logits/rejected": 1.671875, "logps/chosen": -653.0, "logps/rejected": -630.0, "loss": 0.6449, "rewards/accuracies": 0.875, "rewards/chosen": -0.10498046875, "rewards/margins": 3.845703125, "rewards/rejected": -3.95703125, "step": 2145 }, { "epoch": 0.405498606452832, "grad_norm": 2.377482799759234, "learning_rate": 7.679378660208418e-07, "logits/chosen": 3.046875, "logits/rejected": 2.8154296875, "logps/chosen": -999.0, "logps/rejected": -1069.5, "loss": 0.6019, "rewards/accuracies": 0.875, "rewards/chosen": 0.42919921875, "rewards/margins": 5.12109375, "rewards/rejected": -4.69921875, "step": 2146 }, { "epoch": 0.4056875620010393, "grad_norm": 2.1061068362860125, "learning_rate": 7.676781380576776e-07, "logits/chosen": 2.22265625, "logits/rejected": 2.494140625, "logps/chosen": -825.0, "logps/rejected": -574.5, "loss": 0.5687, "rewards/accuracies": 0.90625, "rewards/chosen": 0.501708984375, "rewards/margins": 4.11328125, "rewards/rejected": -3.61328125, "step": 2147 }, { "epoch": 0.40587651754924653, "grad_norm": 1.8180233328991506, "learning_rate": 7.674183153937717e-07, "logits/chosen": 2.78759765625, "logits/rejected": 2.50732421875, "logps/chosen": -657.25, "logps/rejected": -636.0, "loss": 0.5327, "rewards/accuracies": 0.84375, "rewards/chosen": 0.99853515625, "rewards/margins": 4.3515625, "rewards/rejected": -3.3515625, "step": 2148 }, { "epoch": 0.40606547309745383, "grad_norm": 2.305381784831941, "learning_rate": 7.671583981421599e-07, "logits/chosen": 1.9521484375, "logits/rejected": 2.130859375, "logps/chosen": -815.0, "logps/rejected": -907.0, "loss": 0.6334, "rewards/accuracies": 0.71875, "rewards/chosen": 0.509765625, "rewards/margins": 3.6640625, "rewards/rejected": -3.16015625, "step": 2149 }, { "epoch": 0.4062544286456611, "grad_norm": 1.6259691463357675, "learning_rate": 7.668983864159188e-07, "logits/chosen": 2.875, "logits/rejected": 3.060546875, "logps/chosen": -456.0, "logps/rejected": -664.0, "loss": 0.5567, "rewards/accuracies": 0.90625, "rewards/chosen": 0.53857421875, "rewards/margins": 4.71484375, "rewards/rejected": -4.18359375, "step": 2150 }, { "epoch": 0.4064433841938684, "grad_norm": 2.0490546762176933, "learning_rate": 7.666382803281662e-07, "logits/chosen": 2.1484375, "logits/rejected": 1.798828125, "logps/chosen": -513.0, "logps/rejected": -842.0, "loss": 0.5991, "rewards/accuracies": 0.84375, "rewards/chosen": -0.25634765625, "rewards/margins": 3.4453125, "rewards/rejected": -3.703125, "step": 2151 }, { "epoch": 0.4066323397420757, "grad_norm": 2.209784643187897, "learning_rate": 7.663780799920616e-07, "logits/chosen": 2.521484375, "logits/rejected": 2.7890625, "logps/chosen": -944.0, "logps/rejected": -1286.0, "loss": 0.556, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6728515625, "rewards/margins": 4.97265625, "rewards/rejected": -4.3046875, "step": 2152 }, { "epoch": 0.40682129529028294, "grad_norm": 1.8536730081186175, "learning_rate": 7.661177855208048e-07, "logits/chosen": 3.421875, "logits/rejected": 3.6328125, "logps/chosen": -534.5, "logps/rejected": -922.0, "loss": 0.7591, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2291259765625, "rewards/margins": 4.1318359375, "rewards/rejected": -3.9111328125, "step": 2153 }, { "epoch": 0.40701025083849024, "grad_norm": 1.9117473044741673, "learning_rate": 7.658573970276366e-07, "logits/chosen": 2.84375, "logits/rejected": 2.994140625, "logps/chosen": -644.5, "logps/rejected": -621.0, "loss": 0.5592, "rewards/accuracies": 0.8125, "rewards/chosen": 0.47119140625, "rewards/margins": 3.9296875, "rewards/rejected": -3.453125, "step": 2154 }, { "epoch": 0.40719920638669754, "grad_norm": 1.4047617647106243, "learning_rate": 7.655969146258387e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.1865234375, "logps/chosen": -630.0, "logps/rejected": -644.5, "loss": 0.6215, "rewards/accuracies": 0.875, "rewards/chosen": 0.26611328125, "rewards/margins": 3.88671875, "rewards/rejected": -3.62890625, "step": 2155 }, { "epoch": 0.4073881619349048, "grad_norm": 2.9411514809962247, "learning_rate": 7.653363384287343e-07, "logits/chosen": 3.34765625, "logits/rejected": 3.03515625, "logps/chosen": -764.0, "logps/rejected": -727.0, "loss": 0.6336, "rewards/accuracies": 0.84375, "rewards/chosen": 0.440185546875, "rewards/margins": 3.44140625, "rewards/rejected": -2.99609375, "step": 2156 }, { "epoch": 0.4075771174831121, "grad_norm": 2.3707757623230425, "learning_rate": 7.650756685496866e-07, "logits/chosen": 2.126953125, "logits/rejected": 2.123046875, "logps/chosen": -750.0, "logps/rejected": -985.0, "loss": 0.6785, "rewards/accuracies": 0.71875, "rewards/chosen": 0.25634765625, "rewards/margins": 3.103515625, "rewards/rejected": -2.849609375, "step": 2157 }, { "epoch": 0.4077660730313194, "grad_norm": 1.6169114914887752, "learning_rate": 7.648149051020999e-07, "logits/chosen": 3.197265625, "logits/rejected": 3.3828125, "logps/chosen": -4288.5, "logps/rejected": -734.0, "loss": 0.5884, "rewards/accuracies": 0.84375, "rewards/chosen": 2.595458984375, "rewards/margins": 5.2578125, "rewards/rejected": -2.6640625, "step": 2158 }, { "epoch": 0.40795502857952665, "grad_norm": 1.4414259619619367, "learning_rate": 7.645540481994194e-07, "logits/chosen": 3.51953125, "logits/rejected": 3.3984375, "logps/chosen": -704.5, "logps/rejected": -834.5, "loss": 0.6132, "rewards/accuracies": 0.71875, "rewards/chosen": 0.668701171875, "rewards/margins": 4.0546875, "rewards/rejected": -3.3828125, "step": 2159 }, { "epoch": 0.40814398412773395, "grad_norm": 1.6891727221773463, "learning_rate": 7.642930979551304e-07, "logits/chosen": 2.953125, "logits/rejected": 2.91796875, "logps/chosen": -889.5, "logps/rejected": -604.5, "loss": 0.7943, "rewards/accuracies": 0.625, "rewards/chosen": 0.2392578125, "rewards/margins": 2.171875, "rewards/rejected": -1.930908203125, "step": 2160 }, { "epoch": 0.40833293967594125, "grad_norm": 2.252313936880799, "learning_rate": 7.640320544827594e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.3427734375, "logps/chosen": -876.0, "logps/rejected": -648.5, "loss": 0.6355, "rewards/accuracies": 0.84375, "rewards/chosen": 0.644287109375, "rewards/margins": 3.1640625, "rewards/rejected": -2.5234375, "step": 2161 }, { "epoch": 0.4085218952241485, "grad_norm": 2.3020026607835775, "learning_rate": 7.637709178958729e-07, "logits/chosen": 1.6181640625, "logits/rejected": 1.6240234375, "logps/chosen": -865.5, "logps/rejected": -897.0, "loss": 0.5843, "rewards/accuracies": 0.8125, "rewards/chosen": 1.14990234375, "rewards/margins": 4.62890625, "rewards/rejected": -3.47265625, "step": 2162 }, { "epoch": 0.4087108507723558, "grad_norm": 2.705021905310181, "learning_rate": 7.635096883080786e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.38671875, "logps/chosen": -828.0, "logps/rejected": -925.0, "loss": 0.5108, "rewards/accuracies": 0.9375, "rewards/chosen": 0.693359375, "rewards/margins": 5.7890625, "rewards/rejected": -5.09375, "step": 2163 }, { "epoch": 0.4088998063205631, "grad_norm": 2.2976159621464762, "learning_rate": 7.63248365833024e-07, "logits/chosen": 2.53125, "logits/rejected": 2.52734375, "logps/chosen": -581.0, "logps/rejected": -766.5, "loss": 0.4972, "rewards/accuracies": 0.90625, "rewards/chosen": 0.75927734375, "rewards/margins": 4.87109375, "rewards/rejected": -4.1171875, "step": 2164 }, { "epoch": 0.40908876186877036, "grad_norm": 1.423045763797423, "learning_rate": 7.629869505843974e-07, "logits/chosen": 3.9453125, "logits/rejected": 4.04296875, "logps/chosen": -855.0, "logps/rejected": -1404.0, "loss": 0.4326, "rewards/accuracies": 0.90625, "rewards/chosen": 1.369140625, "rewards/margins": 6.765625, "rewards/rejected": -5.3984375, "step": 2165 }, { "epoch": 0.40927771741697766, "grad_norm": 1.8688826810513508, "learning_rate": 7.627254426759274e-07, "logits/chosen": 3.068359375, "logits/rejected": 3.013671875, "logps/chosen": -675.5, "logps/rejected": -755.0, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": 0.619140625, "rewards/margins": 3.083984375, "rewards/rejected": -2.46484375, "step": 2166 }, { "epoch": 0.40946667296518496, "grad_norm": 2.2082301433547227, "learning_rate": 7.624638422213828e-07, "logits/chosen": 2.2578125, "logits/rejected": 2.25390625, "logps/chosen": -554.0, "logps/rejected": -611.0, "loss": 0.651, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4652099609375, "rewards/margins": 3.4765625, "rewards/rejected": -3.015625, "step": 2167 }, { "epoch": 0.4096556285133922, "grad_norm": 4.944807601767761, "learning_rate": 7.622021493345726e-07, "logits/chosen": 2.85546875, "logits/rejected": 2.88671875, "logps/chosen": -941.0, "logps/rejected": -933.0, "loss": 0.6388, "rewards/accuracies": 0.875, "rewards/chosen": 0.575927734375, "rewards/margins": 4.326171875, "rewards/rejected": -3.7431640625, "step": 2168 }, { "epoch": 0.4098445840615995, "grad_norm": 1.9930671018211068, "learning_rate": 7.619403641293462e-07, "logits/chosen": 3.52734375, "logits/rejected": 2.921875, "logps/chosen": -746.0, "logps/rejected": -751.0, "loss": 0.6512, "rewards/accuracies": 0.78125, "rewards/chosen": 0.841796875, "rewards/margins": 3.5625, "rewards/rejected": -2.71484375, "step": 2169 }, { "epoch": 0.4100335396098068, "grad_norm": 1.7228477801394027, "learning_rate": 7.616784867195932e-07, "logits/chosen": 3.63671875, "logits/rejected": 4.0703125, "logps/chosen": -746.5, "logps/rejected": -1142.0, "loss": 0.617, "rewards/accuracies": 0.78125, "rewards/chosen": 0.827880859375, "rewards/margins": 4.671875, "rewards/rejected": -3.8515625, "step": 2170 }, { "epoch": 0.41022249515801407, "grad_norm": 1.5855676678507424, "learning_rate": 7.614165172192432e-07, "logits/chosen": 2.734375, "logits/rejected": 3.05859375, "logps/chosen": -684.0, "logps/rejected": -651.0, "loss": 0.5933, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9732666015625, "rewards/margins": 3.591796875, "rewards/rejected": -2.62109375, "step": 2171 }, { "epoch": 0.41041145070622137, "grad_norm": 2.261689643860033, "learning_rate": 7.611544557422656e-07, "logits/chosen": 3.0546875, "logits/rejected": 3.015625, "logps/chosen": -753.0, "logps/rejected": -1079.0, "loss": 0.6378, "rewards/accuracies": 0.78125, "rewards/chosen": 0.227294921875, "rewards/margins": 3.6796875, "rewards/rejected": -3.453125, "step": 2172 }, { "epoch": 0.4106004062544286, "grad_norm": 2.0614961214124516, "learning_rate": 7.608923024026704e-07, "logits/chosen": 2.5146484375, "logits/rejected": 2.853515625, "logps/chosen": -888.0, "logps/rejected": -887.0, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": 1.2109375, "rewards/margins": 3.0234375, "rewards/rejected": -1.81640625, "step": 2173 }, { "epoch": 0.4107893618026359, "grad_norm": 1.7576082211181125, "learning_rate": 7.606300573145071e-07, "logits/chosen": 3.640625, "logits/rejected": 3.47265625, "logps/chosen": -752.5, "logps/rejected": -1510.5, "loss": 0.6474, "rewards/accuracies": 0.71875, "rewards/chosen": 0.65966796875, "rewards/margins": 4.111328125, "rewards/rejected": -3.455078125, "step": 2174 }, { "epoch": 0.4109783173508432, "grad_norm": 2.1194900461832202, "learning_rate": 7.603677205918654e-07, "logits/chosen": 2.5625, "logits/rejected": 2.318359375, "logps/chosen": -879.0, "logps/rejected": -626.0, "loss": 0.5862, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8583984375, "rewards/margins": 3.28515625, "rewards/rejected": -2.42578125, "step": 2175 }, { "epoch": 0.4111672728990505, "grad_norm": 1.5035826849066036, "learning_rate": 7.601052923488746e-07, "logits/chosen": 2.509765625, "logits/rejected": 2.501953125, "logps/chosen": -797.5, "logps/rejected": -733.0, "loss": 0.5705, "rewards/accuracies": 0.96875, "rewards/chosen": 0.78271484375, "rewards/margins": 4.09765625, "rewards/rejected": -3.3203125, "step": 2176 }, { "epoch": 0.4113562284472578, "grad_norm": 2.3865159194489167, "learning_rate": 7.59842772699704e-07, "logits/chosen": 2.548828125, "logits/rejected": 3.17578125, "logps/chosen": -494.5, "logps/rejected": -822.0, "loss": 0.7201, "rewards/accuracies": 0.65625, "rewards/chosen": 0.332763671875, "rewards/margins": 3.890625, "rewards/rejected": -3.55859375, "step": 2177 }, { "epoch": 0.4115451839954651, "grad_norm": 1.447685503867611, "learning_rate": 7.595801617585625e-07, "logits/chosen": 3.30859375, "logits/rejected": 3.41796875, "logps/chosen": -820.0, "logps/rejected": -1851.0, "loss": 0.5576, "rewards/accuracies": 0.8125, "rewards/chosen": 1.184814453125, "rewards/margins": 10.32421875, "rewards/rejected": -9.12890625, "step": 2178 }, { "epoch": 0.41173413954367233, "grad_norm": 2.5056617065780733, "learning_rate": 7.593174596396991e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.681640625, "logps/chosen": -1592.0, "logps/rejected": -811.0, "loss": 0.5683, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5504150390625, "rewards/margins": 4.02734375, "rewards/rejected": -3.47265625, "step": 2179 }, { "epoch": 0.41192309509187963, "grad_norm": 2.591012749711324, "learning_rate": 7.59054666457402e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.92578125, "logps/chosen": -1003.0, "logps/rejected": -1060.0, "loss": 0.6102, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9892578125, "rewards/margins": 5.140625, "rewards/rejected": -4.1484375, "step": 2180 }, { "epoch": 0.41211205064008694, "grad_norm": 1.989841739904271, "learning_rate": 7.587917823259991e-07, "logits/chosen": 2.875, "logits/rejected": 2.6943359375, "logps/chosen": -653.0, "logps/rejected": -682.5, "loss": 0.6409, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9658203125, "rewards/margins": 3.3125, "rewards/rejected": -2.3466796875, "step": 2181 }, { "epoch": 0.4123010061882942, "grad_norm": 2.001883613394163, "learning_rate": 7.585288073598583e-07, "logits/chosen": 2.15869140625, "logits/rejected": 2.1611328125, "logps/chosen": -776.5, "logps/rejected": -824.5, "loss": 0.5915, "rewards/accuracies": 0.875, "rewards/chosen": 0.79345703125, "rewards/margins": 3.90234375, "rewards/rejected": -3.103515625, "step": 2182 }, { "epoch": 0.4124899617365015, "grad_norm": 2.1372246758187003, "learning_rate": 7.582657416733864e-07, "logits/chosen": 2.185546875, "logits/rejected": 2.470703125, "logps/chosen": -748.5, "logps/rejected": -800.0, "loss": 0.6447, "rewards/accuracies": 0.8125, "rewards/chosen": 0.583984375, "rewards/margins": 4.751953125, "rewards/rejected": -4.1796875, "step": 2183 }, { "epoch": 0.4126789172847088, "grad_norm": 1.5923113593109548, "learning_rate": 7.580025853810301e-07, "logits/chosen": 2.30859375, "logits/rejected": 2.62890625, "logps/chosen": -1029.0, "logps/rejected": -14235.0, "loss": 0.6342, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4228515625, "rewards/margins": 55.935546875, "rewards/rejected": -55.564453125, "step": 2184 }, { "epoch": 0.41286787283291604, "grad_norm": 1.9306665827623082, "learning_rate": 7.577393385972755e-07, "logits/chosen": 2.8310546875, "logits/rejected": 3.6044921875, "logps/chosen": -833.0, "logps/rejected": -834.0, "loss": 0.7463, "rewards/accuracies": 0.6875, "rewards/chosen": 0.60498046875, "rewards/margins": 3.43408203125, "rewards/rejected": -2.82354736328125, "step": 2185 }, { "epoch": 0.41305682838112334, "grad_norm": 1.7418036371388437, "learning_rate": 7.574760014366476e-07, "logits/chosen": 2.3115234375, "logits/rejected": 2.388671875, "logps/chosen": -669.0, "logps/rejected": -885.5, "loss": 0.6107, "rewards/accuracies": 0.78125, "rewards/chosen": 0.669921875, "rewards/margins": 3.6484375, "rewards/rejected": -2.9765625, "step": 2186 }, { "epoch": 0.41324578392933065, "grad_norm": 1.8895477026959788, "learning_rate": 7.572125740137115e-07, "logits/chosen": 3.6875, "logits/rejected": 3.38671875, "logps/chosen": -911.0, "logps/rejected": -1098.5, "loss": 0.5681, "rewards/accuracies": 0.96875, "rewards/chosen": 1.05712890625, "rewards/margins": 5.0625, "rewards/rejected": -4.0078125, "step": 2187 }, { "epoch": 0.4134347394775379, "grad_norm": 1.8552529862900045, "learning_rate": 7.569490564430708e-07, "logits/chosen": 2.681640625, "logits/rejected": 3.03125, "logps/chosen": -668.5, "logps/rejected": -970.5, "loss": 0.593, "rewards/accuracies": 0.71875, "rewards/chosen": 1.04248046875, "rewards/margins": 4.044921875, "rewards/rejected": -3.00390625, "step": 2188 }, { "epoch": 0.4136236950257452, "grad_norm": 1.647339393024524, "learning_rate": 7.566854488393689e-07, "logits/chosen": 3.67578125, "logits/rejected": 3.76953125, "logps/chosen": -550.5, "logps/rejected": -781.0, "loss": 0.7596, "rewards/accuracies": 0.6875, "rewards/chosen": 0.114501953125, "rewards/margins": 2.26171875, "rewards/rejected": -2.146484375, "step": 2189 }, { "epoch": 0.4138126505739525, "grad_norm": 1.7788069752585143, "learning_rate": 7.564217513172879e-07, "logits/chosen": 2.619140625, "logits/rejected": 2.07421875, "logps/chosen": -490.5, "logps/rejected": -421.0, "loss": 0.6361, "rewards/accuracies": 0.8125, "rewards/chosen": 0.725830078125, "rewards/margins": 2.630859375, "rewards/rejected": -1.904296875, "step": 2190 }, { "epoch": 0.41400160612215975, "grad_norm": 1.5839217896429971, "learning_rate": 7.561579639915496e-07, "logits/chosen": 2.2772216796875, "logits/rejected": 1.8076171875, "logps/chosen": -728.5, "logps/rejected": -726.0, "loss": 0.6809, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3916015625, "rewards/margins": 2.845703125, "rewards/rejected": -1.462890625, "step": 2191 }, { "epoch": 0.41419056167036705, "grad_norm": 2.1729027772437624, "learning_rate": 7.55894086976914e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.720703125, "logps/chosen": -438.0, "logps/rejected": -552.0, "loss": 0.6302, "rewards/accuracies": 0.875, "rewards/chosen": 0.4453125, "rewards/margins": 2.99609375, "rewards/rejected": -2.5546875, "step": 2192 }, { "epoch": 0.41437951721857436, "grad_norm": 1.8354347093763272, "learning_rate": 7.556301203881812e-07, "logits/chosen": 2.837890625, "logits/rejected": 2.96484375, "logps/chosen": -967.0, "logps/rejected": -936.0, "loss": 0.6372, "rewards/accuracies": 0.75, "rewards/chosen": 1.3505859375, "rewards/margins": 4.125, "rewards/rejected": -2.775390625, "step": 2193 }, { "epoch": 0.4145684727667816, "grad_norm": 1.7393051568166817, "learning_rate": 7.553660643401894e-07, "logits/chosen": 2.89453125, "logits/rejected": 2.53515625, "logps/chosen": -905.0, "logps/rejected": -861.0, "loss": 0.5299, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8388671875, "rewards/margins": 4.54296875, "rewards/rejected": -2.697265625, "step": 2194 }, { "epoch": 0.4147574283149889, "grad_norm": 1.6495205781372961, "learning_rate": 7.551019189478159e-07, "logits/chosen": 3.11328125, "logits/rejected": 3.09765625, "logps/chosen": -802.0, "logps/rejected": -1576.0, "loss": 0.5448, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0115966796875, "rewards/margins": 9.0703125, "rewards/rejected": -8.046875, "step": 2195 }, { "epoch": 0.41494638386319616, "grad_norm": 1.6360326565358025, "learning_rate": 7.548376843259773e-07, "logits/chosen": 2.126953125, "logits/rejected": 1.8564453125, "logps/chosen": -664.0, "logps/rejected": -532.5, "loss": 0.7631, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8994140625, "rewards/margins": 2.03125, "rewards/rejected": -1.130615234375, "step": 2196 }, { "epoch": 0.41513533941140346, "grad_norm": 2.20706281341963, "learning_rate": 7.545733605896285e-07, "logits/chosen": 1.265625, "logits/rejected": 1.35546875, "logps/chosen": -931.0, "logps/rejected": -1041.0, "loss": 0.6477, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1181640625, "rewards/margins": 4.08984375, "rewards/rejected": -2.966796875, "step": 2197 }, { "epoch": 0.41532429495961076, "grad_norm": 2.0426987899504176, "learning_rate": 7.543089478537634e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.181640625, "logps/chosen": -637.0, "logps/rejected": -663.0, "loss": 0.581, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1669921875, "rewards/margins": 4.265625, "rewards/rejected": -3.091796875, "step": 2198 }, { "epoch": 0.415513250507818, "grad_norm": 1.9561732609806635, "learning_rate": 7.540444462334145e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.1796875, "logps/chosen": -1559.5, "logps/rejected": -916.0, "loss": 0.6049, "rewards/accuracies": 0.84375, "rewards/chosen": -0.32421875, "rewards/margins": 2.44140625, "rewards/rejected": -2.771484375, "step": 2199 }, { "epoch": 0.4157022060560253, "grad_norm": 1.8669174239199144, "learning_rate": 7.537798558436535e-07, "logits/chosen": 2.4970703125, "logits/rejected": 2.072265625, "logps/chosen": -650.0, "logps/rejected": -741.0, "loss": 0.6342, "rewards/accuracies": 0.78125, "rewards/chosen": 0.85693359375, "rewards/margins": 3.82421875, "rewards/rejected": -2.9609375, "step": 2200 }, { "epoch": 0.4158911616042326, "grad_norm": 1.7464162344104273, "learning_rate": 7.535151767995898e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.68359375, "logps/chosen": -733.0, "logps/rejected": -1276.0, "loss": 0.5702, "rewards/accuracies": 0.875, "rewards/chosen": 0.861083984375, "rewards/margins": 6.31640625, "rewards/rejected": -5.45703125, "step": 2201 }, { "epoch": 0.41608011715243987, "grad_norm": 1.8328776129394448, "learning_rate": 7.532504092163722e-07, "logits/chosen": 2.232421875, "logits/rejected": 1.927734375, "logps/chosen": -642.5, "logps/rejected": -708.0, "loss": 0.623, "rewards/accuracies": 0.8125, "rewards/chosen": 0.55126953125, "rewards/margins": 4.154296875, "rewards/rejected": -3.6044921875, "step": 2202 }, { "epoch": 0.41626907270064717, "grad_norm": 2.730924091934463, "learning_rate": 7.529855532091873e-07, "logits/chosen": 1.939453125, "logits/rejected": 2.23828125, "logps/chosen": -657.0, "logps/rejected": -891.0, "loss": 0.6853, "rewards/accuracies": 0.84375, "rewards/chosen": 0.32275390625, "rewards/margins": 4.2890625, "rewards/rejected": -3.96875, "step": 2203 }, { "epoch": 0.4164580282488545, "grad_norm": 1.9728620497007165, "learning_rate": 7.527206088932611e-07, "logits/chosen": 2.96875, "logits/rejected": 3.07421875, "logps/chosen": -842.0, "logps/rejected": -761.5, "loss": 0.6569, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3668212890625, "rewards/margins": 4.59375, "rewards/rejected": -4.22265625, "step": 2204 }, { "epoch": 0.4166469837970617, "grad_norm": 2.2341775284188397, "learning_rate": 7.52455576383857e-07, "logits/chosen": 3.08203125, "logits/rejected": 2.951171875, "logps/chosen": -376.5, "logps/rejected": -459.5, "loss": 0.6121, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40234375, "rewards/margins": 4.046875, "rewards/rejected": -3.6484375, "step": 2205 }, { "epoch": 0.416835939345269, "grad_norm": 2.110952251921937, "learning_rate": 7.521904557962772e-07, "logits/chosen": 1.6279296875, "logits/rejected": 1.587890625, "logps/chosen": -781.0, "logps/rejected": -1006.0, "loss": 0.5775, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2509765625, "rewards/margins": 5.85546875, "rewards/rejected": -5.59765625, "step": 2206 }, { "epoch": 0.41702489489347633, "grad_norm": 1.965869206089042, "learning_rate": 7.519252472458627e-07, "logits/chosen": 2.87890625, "logits/rejected": 2.875, "logps/chosen": -716.75, "logps/rejected": -612.5, "loss": 0.6659, "rewards/accuracies": 0.65625, "rewards/chosen": 0.47314453125, "rewards/margins": 2.863037109375, "rewards/rejected": -2.384765625, "step": 2207 }, { "epoch": 0.4172138504416836, "grad_norm": 2.9518142192858132, "learning_rate": 7.516599508479917e-07, "logits/chosen": 2.3857421875, "logits/rejected": 2.101806640625, "logps/chosen": -887.0, "logps/rejected": -1036.0, "loss": 0.6025, "rewards/accuracies": 0.84375, "rewards/chosen": 0.50885009765625, "rewards/margins": 4.14453125, "rewards/rejected": -3.640625, "step": 2208 }, { "epoch": 0.4174028059898909, "grad_norm": 1.9387099854048058, "learning_rate": 7.513945667180819e-07, "logits/chosen": 1.642578125, "logits/rejected": 1.96142578125, "logps/chosen": -1067.5, "logps/rejected": -1163.0, "loss": 0.6795, "rewards/accuracies": 0.78125, "rewards/chosen": 0.968505859375, "rewards/margins": 5.076171875, "rewards/rejected": -4.10546875, "step": 2209 }, { "epoch": 0.4175917615380982, "grad_norm": 2.630664949076663, "learning_rate": 7.511290949715881e-07, "logits/chosen": 2.255859375, "logits/rejected": 1.924560546875, "logps/chosen": -722.0, "logps/rejected": -773.5, "loss": 0.5297, "rewards/accuracies": 0.875, "rewards/chosen": 0.83349609375, "rewards/margins": 4.79296875, "rewards/rejected": -3.95703125, "step": 2210 }, { "epoch": 0.41778071708630543, "grad_norm": 2.468911424416529, "learning_rate": 7.508635357240036e-07, "logits/chosen": 2.748046875, "logits/rejected": 2.4306640625, "logps/chosen": -532.0, "logps/rejected": -483.5, "loss": 0.5366, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7137451171875, "rewards/margins": 4.01171875, "rewards/rejected": -3.296875, "step": 2211 }, { "epoch": 0.41796967263451273, "grad_norm": 1.5797674192475628, "learning_rate": 7.505978890908599e-07, "logits/chosen": 2.1123046875, "logits/rejected": 2.369140625, "logps/chosen": -762.0, "logps/rejected": -996.0, "loss": 0.6314, "rewards/accuracies": 0.78125, "rewards/chosen": 0.689453125, "rewards/margins": 5.21875, "rewards/rejected": -4.5234375, "step": 2212 }, { "epoch": 0.41815862818272004, "grad_norm": 2.2016118867450385, "learning_rate": 7.503321551877264e-07, "logits/chosen": 2.673828125, "logits/rejected": 2.615234375, "logps/chosen": -569.5, "logps/rejected": -746.0, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": 0.88775634765625, "rewards/margins": 4.234375, "rewards/rejected": -3.34375, "step": 2213 }, { "epoch": 0.4183475837309273, "grad_norm": 3.282564872306217, "learning_rate": 7.500663341302105e-07, "logits/chosen": 2.890625, "logits/rejected": 2.271484375, "logps/chosen": -740.0, "logps/rejected": -1152.0, "loss": 0.6202, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4393310546875, "rewards/margins": 6.81640625, "rewards/rejected": -6.3828125, "step": 2214 }, { "epoch": 0.4185365392791346, "grad_norm": 1.8457093018431991, "learning_rate": 7.498004260339575e-07, "logits/chosen": 3.078125, "logits/rejected": 2.830078125, "logps/chosen": -806.0, "logps/rejected": -830.5, "loss": 0.6139, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46923828125, "rewards/margins": 3.63037109375, "rewards/rejected": -4.09375, "step": 2215 }, { "epoch": 0.4187254948273419, "grad_norm": 1.518387176160356, "learning_rate": 7.495344310146505e-07, "logits/chosen": 2.421875, "logits/rejected": 2.0234375, "logps/chosen": -634.5, "logps/rejected": -554.5, "loss": 0.7143, "rewards/accuracies": 0.78125, "rewards/chosen": 0.771240234375, "rewards/margins": 2.865234375, "rewards/rejected": -2.0966796875, "step": 2216 }, { "epoch": 0.41891445037554914, "grad_norm": 2.0985505708364833, "learning_rate": 7.492683491880105e-07, "logits/chosen": 3.65625, "logits/rejected": 3.234375, "logps/chosen": -749.5, "logps/rejected": -1582.0, "loss": 0.7125, "rewards/accuracies": 0.625, "rewards/chosen": 0.5087890625, "rewards/margins": 5.2421875, "rewards/rejected": -4.7421875, "step": 2217 }, { "epoch": 0.41910340592375644, "grad_norm": 1.91605913884024, "learning_rate": 7.490021806697964e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.615234375, "logps/chosen": -732.0, "logps/rejected": -867.5, "loss": 0.5654, "rewards/accuracies": 0.75, "rewards/chosen": 0.8525390625, "rewards/margins": 5.42578125, "rewards/rejected": -4.5703125, "step": 2218 }, { "epoch": 0.41929236147196375, "grad_norm": 2.5067595554073274, "learning_rate": 7.487359255758042e-07, "logits/chosen": 2.095703125, "logits/rejected": 1.55419921875, "logps/chosen": -761.0, "logps/rejected": -687.5, "loss": 0.6341, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1214599609375, "rewards/margins": 5.2255859375, "rewards/rejected": -5.1044921875, "step": 2219 }, { "epoch": 0.419481317020171, "grad_norm": 2.0202536360780154, "learning_rate": 7.484695840218683e-07, "logits/chosen": 2.376953125, "logits/rejected": 2.017578125, "logps/chosen": -747.0, "logps/rejected": -962.0, "loss": 0.6041, "rewards/accuracies": 0.8125, "rewards/chosen": 0.857421875, "rewards/margins": 5.75, "rewards/rejected": -4.89453125, "step": 2220 }, { "epoch": 0.4196702725683783, "grad_norm": 1.862477670320236, "learning_rate": 7.482031561238606e-07, "logits/chosen": 3.921875, "logits/rejected": 3.41015625, "logps/chosen": -1010.0, "logps/rejected": -1172.0, "loss": 0.6655, "rewards/accuracies": 0.6875, "rewards/chosen": 0.77734375, "rewards/margins": 4.6875, "rewards/rejected": -3.91015625, "step": 2221 }, { "epoch": 0.41985922811658555, "grad_norm": 2.0173014691279105, "learning_rate": 7.479366419976901e-07, "logits/chosen": 2.287109375, "logits/rejected": 2.1298828125, "logps/chosen": -583.5, "logps/rejected": -875.5, "loss": 0.6236, "rewards/accuracies": 0.875, "rewards/chosen": 0.635009765625, "rewards/margins": 3.755859375, "rewards/rejected": -3.126953125, "step": 2222 }, { "epoch": 0.42004818366479285, "grad_norm": 1.8557494231347367, "learning_rate": 7.476700417593039e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.7734375, "logps/chosen": -889.0, "logps/rejected": -846.5, "loss": 0.6536, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8524169921875, "rewards/margins": 4.099609375, "rewards/rejected": -3.2412109375, "step": 2223 }, { "epoch": 0.42023713921300015, "grad_norm": 2.8397042098685077, "learning_rate": 7.474033555246859e-07, "logits/chosen": 2.451171875, "logits/rejected": 2.115234375, "logps/chosen": -1016.0, "logps/rejected": -915.0, "loss": 0.5119, "rewards/accuracies": 0.875, "rewards/chosen": 1.4052734375, "rewards/margins": 4.73828125, "rewards/rejected": -3.33984375, "step": 2224 }, { "epoch": 0.4204260947612074, "grad_norm": 2.66179692311936, "learning_rate": 7.47136583409858e-07, "logits/chosen": 2.1044921875, "logits/rejected": 2.8037109375, "logps/chosen": -648.5, "logps/rejected": -1792.5, "loss": 0.717, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1171875, "rewards/margins": 6.751953125, "rewards/rejected": -6.849609375, "step": 2225 }, { "epoch": 0.4206150503094147, "grad_norm": 3.080386522451618, "learning_rate": 7.468697255308794e-07, "logits/chosen": 2.06640625, "logits/rejected": 1.8984375, "logps/chosen": -718.0, "logps/rejected": -911.0, "loss": 0.5889, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10009765625, "rewards/margins": 4.54296875, "rewards/rejected": -4.640625, "step": 2226 }, { "epoch": 0.420804005857622, "grad_norm": 3.2564869095728146, "learning_rate": 7.466027820038461e-07, "logits/chosen": 2.595703125, "logits/rejected": 2.7421875, "logps/chosen": -717.0, "logps/rejected": -1674.5, "loss": 0.6998, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0458984375, "rewards/margins": 6.55859375, "rewards/rejected": -6.60546875, "step": 2227 }, { "epoch": 0.42099296140582926, "grad_norm": 1.8050993732364744, "learning_rate": 7.463357529448921e-07, "logits/chosen": 2.3203125, "logits/rejected": 1.798828125, "logps/chosen": -729.0, "logps/rejected": -1048.5, "loss": 0.6299, "rewards/accuracies": 0.75, "rewards/chosen": -0.29150390625, "rewards/margins": 6.21484375, "rewards/rejected": -6.5078125, "step": 2228 }, { "epoch": 0.42118191695403656, "grad_norm": 1.8978561917895456, "learning_rate": 7.460686384701879e-07, "logits/chosen": 2.701171875, "logits/rejected": 2.533203125, "logps/chosen": -735.0, "logps/rejected": -1313.0, "loss": 0.59, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6220703125, "rewards/margins": 5.134765625, "rewards/rejected": -5.75390625, "step": 2229 }, { "epoch": 0.42137087250224387, "grad_norm": 3.6703443729079446, "learning_rate": 7.458014386959416e-07, "logits/chosen": 2.216796875, "logits/rejected": 2.009765625, "logps/chosen": -689.0, "logps/rejected": -1189.5, "loss": 0.6315, "rewards/accuracies": 0.875, "rewards/chosen": -0.146240234375, "rewards/margins": 7.275390625, "rewards/rejected": -7.44140625, "step": 2230 }, { "epoch": 0.4215598280504511, "grad_norm": 2.024165312429699, "learning_rate": 7.455341537383983e-07, "logits/chosen": 2.32177734375, "logits/rejected": 2.03125, "logps/chosen": -480.75, "logps/rejected": -682.0, "loss": 0.7217, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39794921875, "rewards/margins": 4.27392578125, "rewards/rejected": -4.671875, "step": 2231 }, { "epoch": 0.4217487835986584, "grad_norm": 1.6473055775012548, "learning_rate": 7.452667837138405e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.462890625, "logps/chosen": -1093.0, "logps/rejected": -907.0, "loss": 0.4199, "rewards/accuracies": 0.90625, "rewards/chosen": 1.08154296875, "rewards/margins": 6.0625, "rewards/rejected": -4.984375, "step": 2232 }, { "epoch": 0.4219377391468657, "grad_norm": 2.769291381912104, "learning_rate": 7.449993287385869e-07, "logits/chosen": 2.8828125, "logits/rejected": 3.111328125, "logps/chosen": -519.0, "logps/rejected": -1613.0, "loss": 0.6563, "rewards/accuracies": 0.84375, "rewards/chosen": -0.44873046875, "rewards/margins": 5.69921875, "rewards/rejected": -6.12890625, "step": 2233 }, { "epoch": 0.42212669469507297, "grad_norm": 4.011973501182632, "learning_rate": 7.447317889289937e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.1904296875, "logps/chosen": -1165.0, "logps/rejected": -1062.0, "loss": 0.5892, "rewards/accuracies": 0.84375, "rewards/chosen": 1.213134765625, "rewards/margins": 5.8203125, "rewards/rejected": -4.61328125, "step": 2234 }, { "epoch": 0.42231565024328027, "grad_norm": 2.3508871460758165, "learning_rate": 7.444641644014539e-07, "logits/chosen": 3.15234375, "logits/rejected": 2.53125, "logps/chosen": -761.0, "logps/rejected": -734.0, "loss": 0.5599, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6630859375, "rewards/margins": 4.0546875, "rewards/rejected": -3.39453125, "step": 2235 }, { "epoch": 0.4225046057914876, "grad_norm": 1.941940143842757, "learning_rate": 7.441964552723977e-07, "logits/chosen": 2.54296875, "logits/rejected": 2.48828125, "logps/chosen": -813.5, "logps/rejected": -722.5, "loss": 0.5956, "rewards/accuracies": 0.8125, "rewards/chosen": 0.787841796875, "rewards/margins": 3.8203125, "rewards/rejected": -3.03515625, "step": 2236 }, { "epoch": 0.4226935613396948, "grad_norm": 2.360317304914376, "learning_rate": 7.439286616582915e-07, "logits/chosen": 1.935546875, "logits/rejected": 2.083984375, "logps/chosen": -622.0, "logps/rejected": -759.0, "loss": 0.6565, "rewards/accuracies": 0.75, "rewards/chosen": 0.119873046875, "rewards/margins": 4.5078125, "rewards/rejected": -4.3828125, "step": 2237 }, { "epoch": 0.4228825168879021, "grad_norm": 2.484646280555253, "learning_rate": 7.436607836756389e-07, "logits/chosen": 2.3671875, "logits/rejected": 2.3203125, "logps/chosen": -910.0, "logps/rejected": -724.5, "loss": 0.4537, "rewards/accuracies": 0.875, "rewards/chosen": 1.0400390625, "rewards/margins": 6.859375, "rewards/rejected": -5.82421875, "step": 2238 }, { "epoch": 0.42307147243610943, "grad_norm": 1.7459392489834227, "learning_rate": 7.433928214409799e-07, "logits/chosen": 2.978515625, "logits/rejected": 2.474609375, "logps/chosen": -811.0, "logps/rejected": -797.0, "loss": 0.5185, "rewards/accuracies": 0.90625, "rewards/chosen": 0.931640625, "rewards/margins": 4.671875, "rewards/rejected": -3.73046875, "step": 2239 }, { "epoch": 0.4232604279843167, "grad_norm": 1.8096645328446408, "learning_rate": 7.431247750708915e-07, "logits/chosen": 2.6796875, "logits/rejected": 2.46484375, "logps/chosen": -753.0, "logps/rejected": -803.0, "loss": 0.5133, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2236328125, "rewards/margins": 5.041015625, "rewards/rejected": -3.814453125, "step": 2240 }, { "epoch": 0.423449383532524, "grad_norm": 2.0807820292129104, "learning_rate": 7.428566446819871e-07, "logits/chosen": 2.8046875, "logits/rejected": 3.08203125, "logps/chosen": -727.75, "logps/rejected": -917.5, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": 0.7470703125, "rewards/margins": 3.625, "rewards/rejected": -2.875, "step": 2241 }, { "epoch": 0.4236383390807313, "grad_norm": 2.3247239833395477, "learning_rate": 7.425884303909163e-07, "logits/chosen": 2.265625, "logits/rejected": 1.634521484375, "logps/chosen": -631.5, "logps/rejected": -752.5, "loss": 0.6232, "rewards/accuracies": 0.875, "rewards/chosen": 0.32220458984375, "rewards/margins": 4.37890625, "rewards/rejected": -4.0546875, "step": 2242 }, { "epoch": 0.42382729462893853, "grad_norm": 1.8520567885514843, "learning_rate": 7.42320132314366e-07, "logits/chosen": 3.04296875, "logits/rejected": 2.9375, "logps/chosen": -622.0, "logps/rejected": -1071.0, "loss": 0.6466, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5791015625, "rewards/margins": 3.859375, "rewards/rejected": -3.28125, "step": 2243 }, { "epoch": 0.42401625017714584, "grad_norm": 1.3856755706987183, "learning_rate": 7.42051750569059e-07, "logits/chosen": 2.892578125, "logits/rejected": 3.2109375, "logps/chosen": -609.0, "logps/rejected": -1854.0, "loss": 0.643, "rewards/accuracies": 0.6875, "rewards/chosen": 0.42333984375, "rewards/margins": 6.5, "rewards/rejected": -6.080078125, "step": 2244 }, { "epoch": 0.4242052057253531, "grad_norm": 1.6623428547167358, "learning_rate": 7.417832852717546e-07, "logits/chosen": 3.21484375, "logits/rejected": 2.9921875, "logps/chosen": -1129.0, "logps/rejected": -1125.0, "loss": 0.5305, "rewards/accuracies": 0.875, "rewards/chosen": 1.6259765625, "rewards/margins": 4.58203125, "rewards/rejected": -2.95703125, "step": 2245 }, { "epoch": 0.4243941612735604, "grad_norm": 1.9880717801768721, "learning_rate": 7.415147365392485e-07, "logits/chosen": 1.476318359375, "logits/rejected": 1.478759765625, "logps/chosen": -1076.0, "logps/rejected": -1058.0, "loss": 0.5764, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2001953125, "rewards/margins": 4.1484375, "rewards/rejected": -2.953125, "step": 2246 }, { "epoch": 0.4245831168217677, "grad_norm": 2.0198836633782795, "learning_rate": 7.412461044883724e-07, "logits/chosen": 1.998046875, "logits/rejected": 2.42578125, "logps/chosen": -900.0, "logps/rejected": -1084.0, "loss": 0.5696, "rewards/accuracies": 0.875, "rewards/chosen": 1.27294921875, "rewards/margins": 5.171875, "rewards/rejected": -3.90234375, "step": 2247 }, { "epoch": 0.42477207236997494, "grad_norm": 1.957156230362725, "learning_rate": 7.409773892359946e-07, "logits/chosen": 1.830078125, "logits/rejected": 2.126953125, "logps/chosen": -514.5, "logps/rejected": -999.5, "loss": 0.695, "rewards/accuracies": 0.875, "rewards/chosen": 0.857421875, "rewards/margins": 3.447265625, "rewards/rejected": -2.59375, "step": 2248 }, { "epoch": 0.42496102791818224, "grad_norm": 2.285237002192333, "learning_rate": 7.407085908990198e-07, "logits/chosen": 2.54638671875, "logits/rejected": 2.130615234375, "logps/chosen": -845.0, "logps/rejected": -937.0, "loss": 0.7062, "rewards/accuracies": 0.78125, "rewards/chosen": 0.85546875, "rewards/margins": 2.79833984375, "rewards/rejected": -1.943359375, "step": 2249 }, { "epoch": 0.42514998346638955, "grad_norm": 1.7126870352728691, "learning_rate": 7.404397095943884e-07, "logits/chosen": 2.0703125, "logits/rejected": 2.080078125, "logps/chosen": -607.0, "logps/rejected": -1219.0, "loss": 0.686, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5498046875, "rewards/margins": 4.615234375, "rewards/rejected": -4.060546875, "step": 2250 }, { "epoch": 0.4253389390145968, "grad_norm": 1.4022691106206358, "learning_rate": 7.401707454390771e-07, "logits/chosen": 2.80078125, "logits/rejected": 2.25390625, "logps/chosen": -969.5, "logps/rejected": -1006.0, "loss": 0.4758, "rewards/accuracies": 0.9375, "rewards/chosen": 1.587890625, "rewards/margins": 6.08203125, "rewards/rejected": -4.490234375, "step": 2251 }, { "epoch": 0.4255278945628041, "grad_norm": 6.576964175619271, "learning_rate": 7.399016985500984e-07, "logits/chosen": 4.046875, "logits/rejected": 3.78515625, "logps/chosen": -1030.5, "logps/rejected": -547.5, "loss": 0.7626, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02294921875, "rewards/margins": 1.5048828125, "rewards/rejected": -1.529296875, "step": 2252 }, { "epoch": 0.4257168501110114, "grad_norm": 1.506942608846709, "learning_rate": 7.396325690445011e-07, "logits/chosen": 2.50390625, "logits/rejected": 2.28125, "logps/chosen": -902.5, "logps/rejected": -969.0, "loss": 0.6026, "rewards/accuracies": 0.75, "rewards/chosen": 1.0283203125, "rewards/margins": 4.71484375, "rewards/rejected": -3.677734375, "step": 2253 }, { "epoch": 0.42590580565921865, "grad_norm": 1.9513634191097045, "learning_rate": 7.393633570393698e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.318359375, "logps/chosen": -951.0, "logps/rejected": -1034.0, "loss": 0.6059, "rewards/accuracies": 0.78125, "rewards/chosen": 0.84521484375, "rewards/margins": 6.62109375, "rewards/rejected": -5.7734375, "step": 2254 }, { "epoch": 0.42609476120742595, "grad_norm": 2.0592245180810385, "learning_rate": 7.390940626518251e-07, "logits/chosen": 2.6640625, "logits/rejected": 2.8125, "logps/chosen": -507.75, "logps/rejected": -730.0, "loss": 0.6366, "rewards/accuracies": 0.75, "rewards/chosen": 0.513671875, "rewards/margins": 4.2861328125, "rewards/rejected": -3.7763671875, "step": 2255 }, { "epoch": 0.42628371675563326, "grad_norm": 2.16725918624535, "learning_rate": 7.388246859990233e-07, "logits/chosen": 2.8046875, "logits/rejected": 2.8828125, "logps/chosen": -610.0, "logps/rejected": -788.0, "loss": 0.5318, "rewards/accuracies": 0.875, "rewards/chosen": 1.1689453125, "rewards/margins": 4.9375, "rewards/rejected": -3.775390625, "step": 2256 }, { "epoch": 0.4264726723038405, "grad_norm": 2.024110381816919, "learning_rate": 7.385552271981565e-07, "logits/chosen": 2.78125, "logits/rejected": 2.53125, "logps/chosen": -1981.0, "logps/rejected": -1686.0, "loss": 0.6719, "rewards/accuracies": 0.75, "rewards/chosen": -1.09033203125, "rewards/margins": 7.044921875, "rewards/rejected": -8.138671875, "step": 2257 }, { "epoch": 0.4266616278520478, "grad_norm": 1.6686317130624362, "learning_rate": 7.382856863664527e-07, "logits/chosen": 2.5546875, "logits/rejected": 2.375, "logps/chosen": -603.5, "logps/rejected": -1031.5, "loss": 0.5179, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4195556640625, "rewards/margins": 6.89453125, "rewards/rejected": -6.48828125, "step": 2258 }, { "epoch": 0.4268505834002551, "grad_norm": 1.8949042114801693, "learning_rate": 7.380160636211753e-07, "logits/chosen": 2.40234375, "logits/rejected": 1.919921875, "logps/chosen": -660.0, "logps/rejected": -478.5, "loss": 0.6534, "rewards/accuracies": 0.875, "rewards/chosen": 0.271728515625, "rewards/margins": 3.703125, "rewards/rejected": -3.421875, "step": 2259 }, { "epoch": 0.42703953894846236, "grad_norm": 1.9871047392772305, "learning_rate": 7.377463590796238e-07, "logits/chosen": 1.3555908203125, "logits/rejected": 1.2265625, "logps/chosen": -608.0, "logps/rejected": -562.0, "loss": 0.5797, "rewards/accuracies": 0.84375, "rewards/chosen": 0.188232421875, "rewards/margins": 4.4921875, "rewards/rejected": -4.3046875, "step": 2260 }, { "epoch": 0.42722849449666966, "grad_norm": 2.032038851279672, "learning_rate": 7.374765728591326e-07, "logits/chosen": 2.361328125, "logits/rejected": 2.0107421875, "logps/chosen": -590.5, "logps/rejected": -891.5, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": 0.667236328125, "rewards/margins": 5.359375, "rewards/rejected": -4.6875, "step": 2261 }, { "epoch": 0.42741745004487697, "grad_norm": 2.864302684641959, "learning_rate": 7.372067050770724e-07, "logits/chosen": 2.5078125, "logits/rejected": 2.599609375, "logps/chosen": -838.0, "logps/rejected": -835.0, "loss": 0.5393, "rewards/accuracies": 0.875, "rewards/chosen": 0.89404296875, "rewards/margins": 5.140625, "rewards/rejected": -4.25390625, "step": 2262 }, { "epoch": 0.4276064055930842, "grad_norm": 1.780284097543586, "learning_rate": 7.36936755850849e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.193359375, "logps/chosen": -1005.0, "logps/rejected": -1126.0, "loss": 0.5446, "rewards/accuracies": 0.84375, "rewards/chosen": 0.60888671875, "rewards/margins": 5.04296875, "rewards/rejected": -4.431640625, "step": 2263 }, { "epoch": 0.4277953611412915, "grad_norm": 5.007625881640257, "learning_rate": 7.366667252979033e-07, "logits/chosen": 2.248046875, "logits/rejected": 2.033203125, "logps/chosen": -1164.5, "logps/rejected": -1238.0, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": 0.7978515625, "rewards/margins": 4.96484375, "rewards/rejected": -4.16015625, "step": 2264 }, { "epoch": 0.4279843166894988, "grad_norm": 2.3098213737242363, "learning_rate": 7.363966135357122e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.453125, "logps/chosen": -678.0, "logps/rejected": -712.0, "loss": 0.7384, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41229248046875, "rewards/margins": 2.6416015625, "rewards/rejected": -2.23046875, "step": 2265 }, { "epoch": 0.42817327223770607, "grad_norm": 2.890032654713999, "learning_rate": 7.361264206817876e-07, "logits/chosen": 2.333984375, "logits/rejected": 2.19140625, "logps/chosen": -563.0, "logps/rejected": -463.0, "loss": 0.6872, "rewards/accuracies": 0.96875, "rewards/chosen": 0.2978515625, "rewards/margins": 2.619140625, "rewards/rejected": -2.31640625, "step": 2266 }, { "epoch": 0.4283622277859134, "grad_norm": 1.76963394273193, "learning_rate": 7.358561468536768e-07, "logits/chosen": 2.576171875, "logits/rejected": 2.53125, "logps/chosen": -849.0, "logps/rejected": -830.0, "loss": 0.566, "rewards/accuracies": 0.78125, "rewards/chosen": 0.98974609375, "rewards/margins": 4.90625, "rewards/rejected": -3.9140625, "step": 2267 }, { "epoch": 0.4285511833341206, "grad_norm": 2.983035930622047, "learning_rate": 7.355857921689622e-07, "logits/chosen": 2.59375, "logits/rejected": 2.2548828125, "logps/chosen": -769.5, "logps/rejected": -651.0, "loss": 0.6356, "rewards/accuracies": 0.8125, "rewards/chosen": 0.570556640625, "rewards/margins": 3.48046875, "rewards/rejected": -2.904296875, "step": 2268 }, { "epoch": 0.4287401388823279, "grad_norm": 3.109965790890841, "learning_rate": 7.353153567452614e-07, "logits/chosen": 1.982421875, "logits/rejected": 2.04296875, "logps/chosen": -963.5, "logps/rejected": -1514.0, "loss": 0.5823, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3595199584960938, "rewards/margins": 5.53515625, "rewards/rejected": -4.18359375, "step": 2269 }, { "epoch": 0.42892909443053523, "grad_norm": 1.8582952585154835, "learning_rate": 7.350448407002271e-07, "logits/chosen": 2.546875, "logits/rejected": 2.791015625, "logps/chosen": -1270.0, "logps/rejected": -1007.0, "loss": 0.5774, "rewards/accuracies": 0.84375, "rewards/chosen": 0.912353515625, "rewards/margins": 5.6484375, "rewards/rejected": -4.724609375, "step": 2270 }, { "epoch": 0.4291180499787425, "grad_norm": 2.3907744096527286, "learning_rate": 7.347742441515474e-07, "logits/chosen": 2.73046875, "logits/rejected": 3.078125, "logps/chosen": -638.0, "logps/rejected": -774.0, "loss": 0.6509, "rewards/accuracies": 0.75, "rewards/chosen": 0.8896484375, "rewards/margins": 3.5078125, "rewards/rejected": -2.61328125, "step": 2271 }, { "epoch": 0.4293070055269498, "grad_norm": 2.7533259744402074, "learning_rate": 7.345035672169451e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.376953125, "logps/chosen": -694.0, "logps/rejected": -829.0, "loss": 0.7314, "rewards/accuracies": 0.75, "rewards/chosen": 0.5234375, "rewards/margins": 3.25390625, "rewards/rejected": -2.7353515625, "step": 2272 }, { "epoch": 0.4294959610751571, "grad_norm": 2.1205030870618735, "learning_rate": 7.34232810014178e-07, "logits/chosen": 2.693359375, "logits/rejected": 2.4931640625, "logps/chosen": -661.0, "logps/rejected": -697.5, "loss": 0.5651, "rewards/accuracies": 0.875, "rewards/chosen": 0.90380859375, "rewards/margins": 3.5859375, "rewards/rejected": -2.677734375, "step": 2273 }, { "epoch": 0.42968491662336433, "grad_norm": 3.625167457661446, "learning_rate": 7.339619726610386e-07, "logits/chosen": 2.619140625, "logits/rejected": 2.796875, "logps/chosen": -535.5, "logps/rejected": -683.5, "loss": 0.6943, "rewards/accuracies": 0.75, "rewards/chosen": -0.12353515625, "rewards/margins": 2.828125, "rewards/rejected": -2.951171875, "step": 2274 }, { "epoch": 0.42987387217157164, "grad_norm": 2.193059928633564, "learning_rate": 7.336910552753545e-07, "logits/chosen": 2.546875, "logits/rejected": 2.279296875, "logps/chosen": -1318.0, "logps/rejected": -1193.0, "loss": 0.606, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7142333984375, "rewards/margins": 5.0234375, "rewards/rejected": -4.3046875, "step": 2275 }, { "epoch": 0.43006282771977894, "grad_norm": 2.4620999418368954, "learning_rate": 7.334200579749889e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.09765625, "logps/chosen": -725.0, "logps/rejected": -596.5, "loss": 0.658, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39300537109375, "rewards/margins": 3.765625, "rewards/rejected": -3.36328125, "step": 2276 }, { "epoch": 0.4302517832679862, "grad_norm": 2.9502333614527307, "learning_rate": 7.331489808778383e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.076171875, "logps/chosen": -1170.0, "logps/rejected": -1122.0, "loss": 0.5502, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6533203125, "rewards/margins": 4.8359375, "rewards/rejected": -4.171875, "step": 2277 }, { "epoch": 0.4304407388161935, "grad_norm": 2.0694448665155867, "learning_rate": 7.328778241018349e-07, "logits/chosen": 2.662109375, "logits/rejected": 2.72265625, "logps/chosen": -1034.0, "logps/rejected": -1559.0, "loss": 0.5687, "rewards/accuracies": 0.78125, "rewards/chosen": 0.70654296875, "rewards/margins": 4.98828125, "rewards/rejected": -4.29296875, "step": 2278 }, { "epoch": 0.4306296943644008, "grad_norm": 1.9048088835478774, "learning_rate": 7.326065877649449e-07, "logits/chosen": 2.48046875, "logits/rejected": 1.927734375, "logps/chosen": -852.5, "logps/rejected": -583.5, "loss": 0.6503, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3212890625, "rewards/margins": 3.203125, "rewards/rejected": -3.5234375, "step": 2279 }, { "epoch": 0.43081864991260804, "grad_norm": 1.7326590159345021, "learning_rate": 7.3233527198517e-07, "logits/chosen": 2.7734375, "logits/rejected": 2.45703125, "logps/chosen": -833.5, "logps/rejected": -945.5, "loss": 0.6347, "rewards/accuracies": 0.75, "rewards/chosen": 0.6806640625, "rewards/margins": 3.69921875, "rewards/rejected": -3.01953125, "step": 2280 }, { "epoch": 0.43100760546081535, "grad_norm": 2.1226687311910153, "learning_rate": 7.32063876880546e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.673828125, "logps/chosen": -704.0, "logps/rejected": -851.0, "loss": 0.5952, "rewards/accuracies": 0.84375, "rewards/chosen": 0.134307861328125, "rewards/margins": 6.76171875, "rewards/rejected": -6.62890625, "step": 2281 }, { "epoch": 0.43119656100902265, "grad_norm": 1.5294592323380267, "learning_rate": 7.317924025691426e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.666015625, "logps/chosen": -935.0, "logps/rejected": -1008.5, "loss": 0.5973, "rewards/accuracies": 0.8125, "rewards/chosen": 0.640625, "rewards/margins": 4.79296875, "rewards/rejected": -4.1484375, "step": 2282 }, { "epoch": 0.4313855165572299, "grad_norm": 2.203293939955203, "learning_rate": 7.315208491690651e-07, "logits/chosen": 2.94921875, "logits/rejected": 3.125, "logps/chosen": -573.5, "logps/rejected": -1522.0, "loss": 0.6864, "rewards/accuracies": 0.78125, "rewards/chosen": 0.169677734375, "rewards/margins": 3.52880859375, "rewards/rejected": -3.3583984375, "step": 2283 }, { "epoch": 0.4315744721054372, "grad_norm": 1.8355638567554535, "learning_rate": 7.312492167984523e-07, "logits/chosen": 1.46484375, "logits/rejected": 1.22235107421875, "logps/chosen": -1169.0, "logps/rejected": -1193.0, "loss": 0.5792, "rewards/accuracies": 0.8125, "rewards/chosen": 1.278076171875, "rewards/margins": 5.05078125, "rewards/rejected": -3.78125, "step": 2284 }, { "epoch": 0.4317634276536445, "grad_norm": 3.156528274036765, "learning_rate": 7.30977505575478e-07, "logits/chosen": 2.109375, "logits/rejected": 2.33203125, "logps/chosen": -502.0, "logps/rejected": -673.0, "loss": 0.6926, "rewards/accuracies": 0.75, "rewards/chosen": 0.0775146484375, "rewards/margins": 3.265625, "rewards/rejected": -3.1875, "step": 2285 }, { "epoch": 0.43195238320185175, "grad_norm": 1.8964629303787643, "learning_rate": 7.307057156183498e-07, "logits/chosen": 2.28125, "logits/rejected": 2.095703125, "logps/chosen": -749.5, "logps/rejected": -682.0, "loss": 0.6258, "rewards/accuracies": 0.75, "rewards/chosen": 0.117431640625, "rewards/margins": 3.21875, "rewards/rejected": -3.1015625, "step": 2286 }, { "epoch": 0.43214133875005906, "grad_norm": 1.610130569093241, "learning_rate": 7.304338470453097e-07, "logits/chosen": 3.07421875, "logits/rejected": 3.171875, "logps/chosen": -608.5, "logps/rejected": -2062.5, "loss": 0.4735, "rewards/accuracies": 0.84375, "rewards/chosen": 0.92138671875, "rewards/margins": 5.9296875, "rewards/rejected": -5.0, "step": 2287 }, { "epoch": 0.43233029429826636, "grad_norm": 1.7684066872431783, "learning_rate": 7.30161899974634e-07, "logits/chosen": 3.05078125, "logits/rejected": 3.072265625, "logps/chosen": -660.0, "logps/rejected": -765.0, "loss": 0.824, "rewards/accuracies": 0.625, "rewards/chosen": 0.233154296875, "rewards/margins": 2.3603515625, "rewards/rejected": -2.130859375, "step": 2288 }, { "epoch": 0.4325192498464736, "grad_norm": 2.50818288454062, "learning_rate": 7.298898745246331e-07, "logits/chosen": 2.3046875, "logits/rejected": 2.1787109375, "logps/chosen": -869.0, "logps/rejected": -967.0, "loss": 0.4686, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1787109375, "rewards/margins": 4.6796875, "rewards/rejected": -3.505859375, "step": 2289 }, { "epoch": 0.4327082053946809, "grad_norm": 1.4509101437003984, "learning_rate": 7.296177708136518e-07, "logits/chosen": 2.171875, "logits/rejected": 1.9541015625, "logps/chosen": -809.0, "logps/rejected": -800.5, "loss": 0.6032, "rewards/accuracies": 0.875, "rewards/chosen": 1.26416015625, "rewards/margins": 4.123046875, "rewards/rejected": -2.869140625, "step": 2290 }, { "epoch": 0.43289716094288816, "grad_norm": 1.9183639176520866, "learning_rate": 7.293455889600683e-07, "logits/chosen": 3.279296875, "logits/rejected": 3.01953125, "logps/chosen": -2153.5, "logps/rejected": -519.0, "loss": 0.7599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.85986328125, "rewards/margins": -0.337890625, "rewards/rejected": -1.53515625, "step": 2291 }, { "epoch": 0.43308611649109546, "grad_norm": 3.084679012033935, "learning_rate": 7.290733290822951e-07, "logits/chosen": 1.903564453125, "logits/rejected": 2.0390625, "logps/chosen": -808.0, "logps/rejected": -1005.0, "loss": 0.6846, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6273193359375, "rewards/margins": 3.4765625, "rewards/rejected": -2.8515625, "step": 2292 }, { "epoch": 0.43327507203930277, "grad_norm": 2.2023968228635535, "learning_rate": 7.288009912987791e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.951171875, "logps/chosen": -646.0, "logps/rejected": -932.0, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": 0.5537109375, "rewards/margins": 3.046875, "rewards/rejected": -2.48828125, "step": 2293 }, { "epoch": 0.43346402758751, "grad_norm": 2.3067173450255343, "learning_rate": 7.285285757280004e-07, "logits/chosen": 2.640625, "logits/rejected": 2.89453125, "logps/chosen": -677.0, "logps/rejected": -1043.0, "loss": 0.6202, "rewards/accuracies": 0.8125, "rewards/chosen": 0.068359375, "rewards/margins": 3.2109375, "rewards/rejected": -3.146484375, "step": 2294 }, { "epoch": 0.4336529831357173, "grad_norm": 1.7977769495275877, "learning_rate": 7.282560824884731e-07, "logits/chosen": 2.736328125, "logits/rejected": 2.7734375, "logps/chosen": -992.5, "logps/rejected": -1822.0, "loss": 0.5159, "rewards/accuracies": 0.90625, "rewards/chosen": 1.267578125, "rewards/margins": 6.234375, "rewards/rejected": -4.97265625, "step": 2295 }, { "epoch": 0.4338419386839246, "grad_norm": 2.1956567356904433, "learning_rate": 7.279835116987455e-07, "logits/chosen": 2.22265625, "logits/rejected": 2.4609375, "logps/chosen": -617.5, "logps/rejected": -16709.0, "loss": 0.6936, "rewards/accuracies": 0.71875, "rewards/chosen": 0.748046875, "rewards/margins": -26.3671875, "rewards/rejected": 27.0986328125, "step": 2296 }, { "epoch": 0.43403089423213187, "grad_norm": 2.052624666042839, "learning_rate": 7.277108634773992e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.361328125, "logps/chosen": -522.5, "logps/rejected": -484.5, "loss": 0.8039, "rewards/accuracies": 0.71875, "rewards/chosen": -0.271484375, "rewards/margins": 1.6357421875, "rewards/rejected": -1.9072265625, "step": 2297 }, { "epoch": 0.4342198497803392, "grad_norm": 1.4893968186923445, "learning_rate": 7.274381379430497e-07, "logits/chosen": 2.6640625, "logits/rejected": 2.96875, "logps/chosen": -663.5, "logps/rejected": -1637.5, "loss": 0.6499, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2705078125, "rewards/margins": 7.048828125, "rewards/rejected": -5.790283203125, "step": 2298 }, { "epoch": 0.4344088053285465, "grad_norm": 2.1412699437134823, "learning_rate": 7.271653352143462e-07, "logits/chosen": 2.42578125, "logits/rejected": 1.76953125, "logps/chosen": -740.5, "logps/rejected": -575.0, "loss": 0.5082, "rewards/accuracies": 0.875, "rewards/chosen": 0.8701171875, "rewards/margins": 3.83203125, "rewards/rejected": -2.962890625, "step": 2299 }, { "epoch": 0.4345977608767537, "grad_norm": 1.9225884444952777, "learning_rate": 7.268924554099712e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.7578125, "logps/chosen": -687.5, "logps/rejected": -675.0, "loss": 0.6587, "rewards/accuracies": 0.75, "rewards/chosen": 0.8681640625, "rewards/margins": 3.103515625, "rewards/rejected": -2.236328125, "step": 2300 }, { "epoch": 0.43478671642496103, "grad_norm": 2.3697656624859658, "learning_rate": 7.266194986486408e-07, "logits/chosen": 2.3935546875, "logits/rejected": 2.470703125, "logps/chosen": -482.5, "logps/rejected": -602.0, "loss": 0.656, "rewards/accuracies": 0.78125, "rewards/chosen": 0.95751953125, "rewards/margins": 2.61328125, "rewards/rejected": -1.650390625, "step": 2301 }, { "epoch": 0.43497567197316833, "grad_norm": 1.7418453636682814, "learning_rate": 7.263464650491049e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.775390625, "logps/chosen": -596.0, "logps/rejected": -656.5, "loss": 0.7278, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21649169921875, "rewards/margins": 2.92578125, "rewards/rejected": -2.7080078125, "step": 2302 }, { "epoch": 0.4351646275213756, "grad_norm": 2.427212858163742, "learning_rate": 7.260733547301465e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.6640625, "logps/chosen": -632.5, "logps/rejected": -1165.0, "loss": 0.6163, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8818359375, "rewards/margins": 5.203125, "rewards/rejected": -4.314453125, "step": 2303 }, { "epoch": 0.4353535830695829, "grad_norm": 1.854061712806987, "learning_rate": 7.25800167810582e-07, "logits/chosen": 2.44384765625, "logits/rejected": 1.9296875, "logps/chosen": -892.0, "logps/rejected": -841.0, "loss": 0.5745, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3718719482421875, "rewards/margins": 4.234375, "rewards/rejected": -3.87109375, "step": 2304 }, { "epoch": 0.4355425386177902, "grad_norm": 2.0355425142579366, "learning_rate": 7.255269044092612e-07, "logits/chosen": 1.77734375, "logits/rejected": 1.7119140625, "logps/chosen": -770.0, "logps/rejected": -813.0, "loss": 0.6299, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3289794921875, "rewards/margins": 4.0, "rewards/rejected": -3.66796875, "step": 2305 }, { "epoch": 0.43573149416599743, "grad_norm": 2.4456440598551645, "learning_rate": 7.252535646450673e-07, "logits/chosen": 2.080078125, "logits/rejected": 1.74169921875, "logps/chosen": -922.5, "logps/rejected": -734.5, "loss": 0.5872, "rewards/accuracies": 0.90625, "rewards/chosen": 0.94482421875, "rewards/margins": 4.072265625, "rewards/rejected": -3.130859375, "step": 2306 }, { "epoch": 0.43592044971420474, "grad_norm": 1.9863039925572417, "learning_rate": 7.249801486369168e-07, "logits/chosen": 2.3857421875, "logits/rejected": 2.23388671875, "logps/chosen": -876.5, "logps/rejected": -751.5, "loss": 0.4844, "rewards/accuracies": 0.96875, "rewards/chosen": 1.29296875, "rewards/margins": 5.328125, "rewards/rejected": -4.03515625, "step": 2307 }, { "epoch": 0.43610940526241204, "grad_norm": 1.8506700494665513, "learning_rate": 7.247066565037587e-07, "logits/chosen": 2.41796875, "logits/rejected": 2.42578125, "logps/chosen": -885.0, "logps/rejected": -1105.0, "loss": 0.6133, "rewards/accuracies": 0.75, "rewards/chosen": 1.1279296875, "rewards/margins": 4.27734375, "rewards/rejected": -3.14453125, "step": 2308 }, { "epoch": 0.4362983608106193, "grad_norm": 2.0613823758037384, "learning_rate": 7.244330883645759e-07, "logits/chosen": 2.1484375, "logits/rejected": 1.80078125, "logps/chosen": -1212.0, "logps/rejected": -1210.0, "loss": 0.5417, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1005859375, "rewards/margins": 5.552734375, "rewards/rejected": -4.4453125, "step": 2309 }, { "epoch": 0.4364873163588266, "grad_norm": 1.9936832483706872, "learning_rate": 7.241594443383839e-07, "logits/chosen": 2.00390625, "logits/rejected": 1.07958984375, "logps/chosen": -533.5, "logps/rejected": -473.75, "loss": 0.6835, "rewards/accuracies": 0.84375, "rewards/chosen": 0.030517578125, "rewards/margins": 2.69140625, "rewards/rejected": -2.66015625, "step": 2310 }, { "epoch": 0.4366762719070339, "grad_norm": 2.154131350361271, "learning_rate": 7.238857245442314e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.4482421875, "logps/chosen": -887.0, "logps/rejected": -1276.0, "loss": 0.5966, "rewards/accuracies": 0.84375, "rewards/chosen": 0.04248046875, "rewards/margins": 5.0703125, "rewards/rejected": -5.03515625, "step": 2311 }, { "epoch": 0.43686522745524115, "grad_norm": 1.9046940975402442, "learning_rate": 7.236119291012001e-07, "logits/chosen": 1.65625, "logits/rejected": 1.998046875, "logps/chosen": -922.0, "logps/rejected": -731.5, "loss": 0.6952, "rewards/accuracies": 0.71875, "rewards/chosen": 0.021484375, "rewards/margins": 3.08203125, "rewards/rejected": -3.0546875, "step": 2312 }, { "epoch": 0.43705418300344845, "grad_norm": 3.566372592768639, "learning_rate": 7.233380581284046e-07, "logits/chosen": 1.85986328125, "logits/rejected": 1.402099609375, "logps/chosen": -925.0, "logps/rejected": -1588.0, "loss": 0.5805, "rewards/accuracies": 0.84375, "rewards/chosen": -0.09033203125, "rewards/margins": 6.23828125, "rewards/rejected": -6.3359375, "step": 2313 }, { "epoch": 0.4372431385516557, "grad_norm": 1.9327790824863673, "learning_rate": 7.230641117449921e-07, "logits/chosen": 1.43475341796875, "logits/rejected": 0.7529296875, "logps/chosen": -668.0, "logps/rejected": -639.0, "loss": 0.6085, "rewards/accuracies": 0.78125, "rewards/chosen": 0.192626953125, "rewards/margins": 4.22265625, "rewards/rejected": -4.03125, "step": 2314 }, { "epoch": 0.437432094099863, "grad_norm": 1.9753537793885407, "learning_rate": 7.227900900701429e-07, "logits/chosen": 1.9921875, "logits/rejected": 1.986328125, "logps/chosen": -843.5, "logps/rejected": -857.5, "loss": 0.573, "rewards/accuracies": 0.84375, "rewards/chosen": 0.35443115234375, "rewards/margins": 4.810546875, "rewards/rejected": -4.453125, "step": 2315 }, { "epoch": 0.4376210496480703, "grad_norm": 2.042860553915748, "learning_rate": 7.2251599322307e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.9189453125, "logps/chosen": -923.0, "logps/rejected": -1043.0, "loss": 0.5288, "rewards/accuracies": 0.84375, "rewards/chosen": 0.791015625, "rewards/margins": 5.2890625, "rewards/rejected": -4.49609375, "step": 2316 }, { "epoch": 0.43781000519627755, "grad_norm": 2.725419230561292, "learning_rate": 7.22241821323019e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.212890625, "logps/chosen": -1870.0, "logps/rejected": -746.0, "loss": 0.518, "rewards/accuracies": 0.75, "rewards/chosen": -0.40185546875, "rewards/margins": 3.3671875, "rewards/rejected": -3.7734375, "step": 2317 }, { "epoch": 0.43799896074448486, "grad_norm": 1.6311084392263668, "learning_rate": 7.219675744892683e-07, "logits/chosen": 2.369140625, "logits/rejected": 2.94921875, "logps/chosen": -668.0, "logps/rejected": -858.0, "loss": 0.4943, "rewards/accuracies": 0.90625, "rewards/chosen": 0.83251953125, "rewards/margins": 4.47265625, "rewards/rejected": -3.640625, "step": 2318 }, { "epoch": 0.43818791629269216, "grad_norm": 1.5176595197935567, "learning_rate": 7.216932528411287e-07, "logits/chosen": 2.185791015625, "logits/rejected": 1.9091796875, "logps/chosen": -546.5, "logps/rejected": -451.5, "loss": 0.7569, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19952392578125, "rewards/margins": 2.06341552734375, "rewards/rejected": -2.254730224609375, "step": 2319 }, { "epoch": 0.4383768718408994, "grad_norm": 2.3129570543335896, "learning_rate": 7.214188564979439e-07, "logits/chosen": 2.0048828125, "logits/rejected": 2.19873046875, "logps/chosen": -777.0, "logps/rejected": -906.0, "loss": 0.5485, "rewards/accuracies": 0.875, "rewards/chosen": 1.1871337890625, "rewards/margins": 4.73046875, "rewards/rejected": -3.541015625, "step": 2320 }, { "epoch": 0.4385658273891067, "grad_norm": 2.2264141925976038, "learning_rate": 7.211443855790897e-07, "logits/chosen": 1.47412109375, "logits/rejected": 1.75439453125, "logps/chosen": -680.5, "logps/rejected": -694.0, "loss": 0.7355, "rewards/accuracies": 0.78125, "rewards/chosen": 0.14404296875, "rewards/margins": 2.689453125, "rewards/rejected": -2.541015625, "step": 2321 }, { "epoch": 0.438754782937314, "grad_norm": 1.5789246923551405, "learning_rate": 7.208698402039742e-07, "logits/chosen": 2.708984375, "logits/rejected": 2.94140625, "logps/chosen": -427.0, "logps/rejected": -736.0, "loss": 0.6172, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22625732421875, "rewards/margins": 3.208984375, "rewards/rejected": -2.98046875, "step": 2322 }, { "epoch": 0.43894373848552126, "grad_norm": 1.9691913408446806, "learning_rate": 7.20595220492039e-07, "logits/chosen": 2.4765625, "logits/rejected": 1.5859375, "logps/chosen": -699.0, "logps/rejected": -750.0, "loss": 0.5482, "rewards/accuracies": 0.84375, "rewards/chosen": 0.300537109375, "rewards/margins": 3.9921875, "rewards/rejected": -3.6953125, "step": 2323 }, { "epoch": 0.43913269403372857, "grad_norm": 1.9324924525089209, "learning_rate": 7.203205265627562e-07, "logits/chosen": 2.216796875, "logits/rejected": 1.69873046875, "logps/chosen": -660.0, "logps/rejected": -718.0, "loss": 0.5134, "rewards/accuracies": 0.875, "rewards/chosen": 0.93115234375, "rewards/margins": 3.8828125, "rewards/rejected": -2.943359375, "step": 2324 }, { "epoch": 0.43932164958193587, "grad_norm": 2.0785438127906586, "learning_rate": 7.200457585356323e-07, "logits/chosen": 1.646484375, "logits/rejected": 1.775390625, "logps/chosen": -930.5, "logps/rejected": -698.0, "loss": 0.5964, "rewards/accuracies": 0.78125, "rewards/chosen": 1.224609375, "rewards/margins": 3.56640625, "rewards/rejected": -2.33984375, "step": 2325 }, { "epoch": 0.4395106051301431, "grad_norm": 1.7846491637997781, "learning_rate": 7.19770916530204e-07, "logits/chosen": 2.1015625, "logits/rejected": 2.0, "logps/chosen": -1058.0, "logps/rejected": -1100.0, "loss": 0.5229, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0654296875, "rewards/margins": 5.3828125, "rewards/rejected": -4.31640625, "step": 2326 }, { "epoch": 0.4396995606783504, "grad_norm": 1.8839899215725056, "learning_rate": 7.194960006660419e-07, "logits/chosen": 2.263671875, "logits/rejected": 1.5263671875, "logps/chosen": -953.0, "logps/rejected": -843.0, "loss": 0.5546, "rewards/accuracies": 0.78125, "rewards/chosen": 0.79547119140625, "rewards/margins": 4.41015625, "rewards/rejected": -3.619140625, "step": 2327 }, { "epoch": 0.4398885162265577, "grad_norm": 2.0748140294776336, "learning_rate": 7.192210110627476e-07, "logits/chosen": 0.45703125, "logits/rejected": 0.12060546875, "logps/chosen": -622.0, "logps/rejected": -576.5, "loss": 0.6868, "rewards/accuracies": 0.71875, "rewards/chosen": 0.13330078125, "rewards/margins": 2.65625, "rewards/rejected": -2.521484375, "step": 2328 }, { "epoch": 0.440077471774765, "grad_norm": 3.4315798352772284, "learning_rate": 7.189459478399552e-07, "logits/chosen": 3.0703125, "logits/rejected": 2.875, "logps/chosen": -607.0, "logps/rejected": -634.0, "loss": 0.5759, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8271484375, "rewards/margins": 3.91015625, "rewards/rejected": -3.087890625, "step": 2329 }, { "epoch": 0.4402664273229723, "grad_norm": 1.7424231661458884, "learning_rate": 7.186708111173308e-07, "logits/chosen": 3.71484375, "logits/rejected": 3.62109375, "logps/chosen": -610.0, "logps/rejected": -607.0, "loss": 0.6201, "rewards/accuracies": 0.8125, "rewards/chosen": 0.65234375, "rewards/margins": 3.76953125, "rewards/rejected": -3.11328125, "step": 2330 }, { "epoch": 0.4404553828711796, "grad_norm": 1.826205447362249, "learning_rate": 7.183956010145723e-07, "logits/chosen": 4.0, "logits/rejected": 3.390625, "logps/chosen": -5612.0, "logps/rejected": -1121.0, "loss": 0.6399, "rewards/accuracies": 0.75, "rewards/chosen": -0.83203125, "rewards/margins": 4.069580078125, "rewards/rejected": -4.890625, "step": 2331 }, { "epoch": 0.4406443384193868, "grad_norm": 1.7666508126773381, "learning_rate": 7.1812031765141e-07, "logits/chosen": 2.375, "logits/rejected": 1.9697265625, "logps/chosen": -717.0, "logps/rejected": -598.0, "loss": 0.622, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12890625, "rewards/margins": 3.4453125, "rewards/rejected": -3.3125, "step": 2332 }, { "epoch": 0.44083329396759413, "grad_norm": 2.162434080648587, "learning_rate": 7.178449611476057e-07, "logits/chosen": 1.84765625, "logits/rejected": 2.7265625, "logps/chosen": -523.0, "logps/rejected": -1066.5, "loss": 0.7199, "rewards/accuracies": 0.71875, "rewards/chosen": -0.484710693359375, "rewards/margins": 3.421875, "rewards/rejected": -3.90234375, "step": 2333 }, { "epoch": 0.44102224951580143, "grad_norm": 2.4634266967087046, "learning_rate": 7.175695316229531e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.689453125, "logps/chosen": -837.0, "logps/rejected": -785.0, "loss": 0.6366, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7373046875, "rewards/margins": 3.580078125, "rewards/rejected": -2.841796875, "step": 2334 }, { "epoch": 0.4412112050640087, "grad_norm": 1.598775878157306, "learning_rate": 7.172940291972774e-07, "logits/chosen": 2.3779296875, "logits/rejected": 2.423828125, "logps/chosen": -10265.5, "logps/rejected": -889.5, "loss": 0.5447, "rewards/accuracies": 0.84375, "rewards/chosen": -11.67138671875, "rewards/margins": -6.703125, "rewards/rejected": -4.96484375, "step": 2335 }, { "epoch": 0.441400160612216, "grad_norm": 2.2040961383104403, "learning_rate": 7.170184539904358e-07, "logits/chosen": 2.56640625, "logits/rejected": 3.224609375, "logps/chosen": -1112.0, "logps/rejected": -1144.0, "loss": 0.5404, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5341796875, "rewards/margins": 5.4609375, "rewards/rejected": -4.921875, "step": 2336 }, { "epoch": 0.44158911616042323, "grad_norm": 2.6426518609662772, "learning_rate": 7.167428061223175e-07, "logits/chosen": 2.05859375, "logits/rejected": 2.234375, "logps/chosen": -551.5, "logps/rejected": -1208.0, "loss": 0.5829, "rewards/accuracies": 0.875, "rewards/chosen": -0.15087890625, "rewards/margins": 5.86328125, "rewards/rejected": -6.0078125, "step": 2337 }, { "epoch": 0.44177807170863054, "grad_norm": 2.5567570573411564, "learning_rate": 7.164670857128426e-07, "logits/chosen": 2.197265625, "logits/rejected": 1.5146484375, "logps/chosen": -601.75, "logps/rejected": -585.0, "loss": 0.6195, "rewards/accuracies": 0.75, "rewards/chosen": 0.0809326171875, "rewards/margins": 4.31640625, "rewards/rejected": -4.234375, "step": 2338 }, { "epoch": 0.44196702725683784, "grad_norm": 2.091762681055418, "learning_rate": 7.161912928819631e-07, "logits/chosen": 1.990234375, "logits/rejected": 1.89453125, "logps/chosen": -782.0, "logps/rejected": -1943.0, "loss": 0.6382, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0504150390625, "rewards/margins": 7.05859375, "rewards/rejected": -7.10546875, "step": 2339 }, { "epoch": 0.4421559828050451, "grad_norm": 3.0955407222452203, "learning_rate": 7.159154277496625e-07, "logits/chosen": 1.46484375, "logits/rejected": 1.205078125, "logps/chosen": -795.5, "logps/rejected": -987.0, "loss": 0.5813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.326171875, "rewards/margins": 5.4140625, "rewards/rejected": -5.7421875, "step": 2340 }, { "epoch": 0.4423449383532524, "grad_norm": 1.9167932095681492, "learning_rate": 7.156394904359561e-07, "logits/chosen": 1.8349609375, "logits/rejected": 2.583984375, "logps/chosen": -457.5, "logps/rejected": -887.0, "loss": 0.6757, "rewards/accuracies": 0.84375, "rewards/chosen": -1.05908203125, "rewards/margins": 3.83203125, "rewards/rejected": -4.8828125, "step": 2341 }, { "epoch": 0.4425338939014597, "grad_norm": 2.3595543033979496, "learning_rate": 7.153634810608898e-07, "logits/chosen": 1.95068359375, "logits/rejected": 2.150390625, "logps/chosen": -866.5, "logps/rejected": -1046.0, "loss": 0.6871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2353515625, "rewards/margins": 4.06640625, "rewards/rejected": -4.30078125, "step": 2342 }, { "epoch": 0.44272284944966694, "grad_norm": 1.9706158756488161, "learning_rate": 7.150873997445416e-07, "logits/chosen": 1.82421875, "logits/rejected": 1.392578125, "logps/chosen": -891.0, "logps/rejected": -909.0, "loss": 0.6267, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6846923828125, "rewards/margins": 4.76953125, "rewards/rejected": -4.078125, "step": 2343 }, { "epoch": 0.44291180499787425, "grad_norm": 1.6553085499957556, "learning_rate": 7.148112466070203e-07, "logits/chosen": 2.1875, "logits/rejected": 1.998046875, "logps/chosen": -925.0, "logps/rejected": -1000.0, "loss": 0.6338, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1845703125, "rewards/margins": 4.052734375, "rewards/rejected": -2.873046875, "step": 2344 }, { "epoch": 0.44310076054608155, "grad_norm": 3.5548778184107066, "learning_rate": 7.145350217684664e-07, "logits/chosen": 2.259765625, "logits/rejected": 2.326171875, "logps/chosen": -793.5, "logps/rejected": -610.5, "loss": 0.6215, "rewards/accuracies": 0.8125, "rewards/chosen": 0.84033203125, "rewards/margins": 3.8671875, "rewards/rejected": -3.02734375, "step": 2345 }, { "epoch": 0.4432897160942888, "grad_norm": 3.457649827085493, "learning_rate": 7.142587253490511e-07, "logits/chosen": 2.087890625, "logits/rejected": 1.953125, "logps/chosen": -754.0, "logps/rejected": -846.0, "loss": 0.5208, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9736328125, "rewards/margins": 4.67578125, "rewards/rejected": -3.703125, "step": 2346 }, { "epoch": 0.4434786716424961, "grad_norm": 2.6048554672184023, "learning_rate": 7.139823574689773e-07, "logits/chosen": 3.068359375, "logits/rejected": 2.6796875, "logps/chosen": -978.0, "logps/rejected": -955.0, "loss": 0.7196, "rewards/accuracies": 0.71875, "rewards/chosen": 0.77978515625, "rewards/margins": 3.6953125, "rewards/rejected": -2.9140625, "step": 2347 }, { "epoch": 0.4436676271907034, "grad_norm": 4.474212934082819, "learning_rate": 7.137059182484785e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.65234375, "logps/chosen": -799.0, "logps/rejected": -923.0, "loss": 0.6295, "rewards/accuracies": 0.78125, "rewards/chosen": 0.55859375, "rewards/margins": 3.99609375, "rewards/rejected": -3.44140625, "step": 2348 }, { "epoch": 0.44385658273891065, "grad_norm": 2.6775107335122303, "learning_rate": 7.134294078078192e-07, "logits/chosen": 2.1083984375, "logits/rejected": 1.83447265625, "logps/chosen": -781.0, "logps/rejected": -718.5, "loss": 0.602, "rewards/accuracies": 0.875, "rewards/chosen": 0.18408203125, "rewards/margins": 3.5390625, "rewards/rejected": -3.34765625, "step": 2349 }, { "epoch": 0.44404553828711796, "grad_norm": 1.7078411219667455, "learning_rate": 7.131528262672954e-07, "logits/chosen": 2.365234375, "logits/rejected": 2.0654296875, "logps/chosen": -8047.0, "logps/rejected": -830.0, "loss": 0.6702, "rewards/accuracies": 0.78125, "rewards/chosen": -68.0562744140625, "rewards/margins": -65.08203125, "rewards/rejected": -3.25, "step": 2350 }, { "epoch": 0.44423449383532526, "grad_norm": 1.9736586764133153, "learning_rate": 7.128761737472336e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.7890625, "logps/chosen": -520.0, "logps/rejected": -711.5, "loss": 0.8015, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14697265625, "rewards/margins": 1.97802734375, "rewards/rejected": -1.828125, "step": 2351 }, { "epoch": 0.4444234493835325, "grad_norm": 1.7612529847963478, "learning_rate": 7.125994503679913e-07, "logits/chosen": 2.78515625, "logits/rejected": 3.21875, "logps/chosen": -921.0, "logps/rejected": -1053.0, "loss": 0.6368, "rewards/accuracies": 0.6875, "rewards/chosen": 0.955810546875, "rewards/margins": 3.7021484375, "rewards/rejected": -2.75, "step": 2352 }, { "epoch": 0.4446124049317398, "grad_norm": 1.923660176854739, "learning_rate": 7.123226562499572e-07, "logits/chosen": 2.2646484375, "logits/rejected": 2.630859375, "logps/chosen": -531.5, "logps/rejected": -800.0, "loss": 0.7211, "rewards/accuracies": 0.75, "rewards/chosen": 0.389404296875, "rewards/margins": 2.923828125, "rewards/rejected": -2.532958984375, "step": 2353 }, { "epoch": 0.4448013604799471, "grad_norm": 1.4635718262104802, "learning_rate": 7.120457915135497e-07, "logits/chosen": 2.853515625, "logits/rejected": 2.421875, "logps/chosen": -614.5, "logps/rejected": -594.5, "loss": 0.6004, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1318359375, "rewards/margins": 3.69921875, "rewards/rejected": -2.5654296875, "step": 2354 }, { "epoch": 0.44499031602815436, "grad_norm": 2.4286838054319384, "learning_rate": 7.117688562792192e-07, "logits/chosen": 2.5673828125, "logits/rejected": 2.478515625, "logps/chosen": -873.0, "logps/rejected": -1584.0, "loss": 0.5436, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2041015625, "rewards/margins": 7.06640625, "rewards/rejected": -5.86328125, "step": 2355 }, { "epoch": 0.44517927157636167, "grad_norm": 1.8858329432198906, "learning_rate": 7.114918506674461e-07, "logits/chosen": 2.357421875, "logits/rejected": 2.0, "logps/chosen": -860.5, "logps/rejected": -634.0, "loss": 0.6546, "rewards/accuracies": 0.8125, "rewards/chosen": 0.187255859375, "rewards/margins": 2.310546875, "rewards/rejected": -2.12109375, "step": 2356 }, { "epoch": 0.44536822712456897, "grad_norm": 6.341292509217006, "learning_rate": 7.112147747987414e-07, "logits/chosen": 2.765625, "logits/rejected": 2.8359375, "logps/chosen": -640.0, "logps/rejected": -2652.0, "loss": 0.6646, "rewards/accuracies": 0.8125, "rewards/chosen": 1.19140625, "rewards/margins": 2.765625, "rewards/rejected": -1.572265625, "step": 2357 }, { "epoch": 0.4455571826727762, "grad_norm": 1.4427066361321441, "learning_rate": 7.109376287936467e-07, "logits/chosen": 2.732421875, "logits/rejected": 2.4609375, "logps/chosen": -644.0, "logps/rejected": -1327.0, "loss": 0.5377, "rewards/accuracies": 0.875, "rewards/chosen": 0.90380859375, "rewards/margins": 4.30859375, "rewards/rejected": -3.40234375, "step": 2358 }, { "epoch": 0.4457461382209835, "grad_norm": 1.4192186241093716, "learning_rate": 7.106604127727345e-07, "logits/chosen": 2.9296875, "logits/rejected": 2.421875, "logps/chosen": -825.5, "logps/rejected": -847.0, "loss": 0.707, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9140625, "rewards/margins": 3.6962890625, "rewards/rejected": -2.78125, "step": 2359 }, { "epoch": 0.44593509376919077, "grad_norm": 2.216548729452762, "learning_rate": 7.103831268566073e-07, "logits/chosen": 2.52734375, "logits/rejected": 2.208984375, "logps/chosen": -712.0, "logps/rejected": -1871.0, "loss": 0.5453, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6640625, "rewards/margins": 3.3046875, "rewards/rejected": -1.64453125, "step": 2360 }, { "epoch": 0.4461240493173981, "grad_norm": 1.883164534510739, "learning_rate": 7.101057711658983e-07, "logits/chosen": 3.03564453125, "logits/rejected": 2.519775390625, "logps/chosen": -796.5, "logps/rejected": -884.5, "loss": 0.5167, "rewards/accuracies": 0.90625, "rewards/chosen": 0.99072265625, "rewards/margins": 4.4140625, "rewards/rejected": -3.421875, "step": 2361 }, { "epoch": 0.4463130048656054, "grad_norm": 2.0103646216691455, "learning_rate": 7.098283458212707e-07, "logits/chosen": 3.67578125, "logits/rejected": 2.9140625, "logps/chosen": -801.5, "logps/rejected": -696.5, "loss": 0.6109, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7421875, "rewards/margins": 4.60546875, "rewards/rejected": -2.8671875, "step": 2362 }, { "epoch": 0.4465019604138126, "grad_norm": 2.200928633783249, "learning_rate": 7.095508509434183e-07, "logits/chosen": 2.458984375, "logits/rejected": 2.0654296875, "logps/chosen": -956.0, "logps/rejected": -2225.0, "loss": 0.5617, "rewards/accuracies": 0.84375, "rewards/chosen": 1.13909912109375, "rewards/margins": 7.64453125, "rewards/rejected": -6.5, "step": 2363 }, { "epoch": 0.44669091596201993, "grad_norm": 3.0249708883603375, "learning_rate": 7.092732866530652e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.48828125, "logps/chosen": -1058.0, "logps/rejected": -1223.0, "loss": 0.453, "rewards/accuracies": 0.9375, "rewards/chosen": 2.072265625, "rewards/margins": 7.2734375, "rewards/rejected": -5.201171875, "step": 2364 }, { "epoch": 0.44687987151022723, "grad_norm": 1.8944156200763276, "learning_rate": 7.089956530709653e-07, "logits/chosen": 2.37109375, "logits/rejected": 2.29296875, "logps/chosen": -878.0, "logps/rejected": -843.5, "loss": 0.7165, "rewards/accuracies": 0.75, "rewards/chosen": 0.505126953125, "rewards/margins": 2.810546875, "rewards/rejected": -2.30078125, "step": 2365 }, { "epoch": 0.4470688270584345, "grad_norm": 2.349700767671423, "learning_rate": 7.087179503179034e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.109375, "logps/chosen": -601.0, "logps/rejected": -612.5, "loss": 0.67, "rewards/accuracies": 0.71875, "rewards/chosen": 1.03173828125, "rewards/margins": 4.150390625, "rewards/rejected": -3.1171875, "step": 2366 }, { "epoch": 0.4472577826066418, "grad_norm": 2.0430962870808473, "learning_rate": 7.084401785146934e-07, "logits/chosen": 2.1728515625, "logits/rejected": 1.8546142578125, "logps/chosen": -764.0, "logps/rejected": -762.0, "loss": 0.6156, "rewards/accuracies": 0.75, "rewards/chosen": 0.646484375, "rewards/margins": 3.8203125, "rewards/rejected": -3.171875, "step": 2367 }, { "epoch": 0.4474467381548491, "grad_norm": 3.339000890685006, "learning_rate": 7.081623377821799e-07, "logits/chosen": 2.1884765625, "logits/rejected": 1.5771484375, "logps/chosen": -1099.0, "logps/rejected": -957.5, "loss": 0.6295, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8770751953125, "rewards/margins": 3.09765625, "rewards/rejected": -2.2158203125, "step": 2368 }, { "epoch": 0.44763569370305634, "grad_norm": 2.3476822925182406, "learning_rate": 7.078844282412377e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.3771486282348633, "logps/chosen": -770.0, "logps/rejected": -951.5, "loss": 0.6532, "rewards/accuracies": 0.75, "rewards/chosen": 0.3212890625, "rewards/margins": 5.02734375, "rewards/rejected": -4.70703125, "step": 2369 }, { "epoch": 0.44782464925126364, "grad_norm": 2.2189496061096037, "learning_rate": 7.076064500127708e-07, "logits/chosen": 2.6640625, "logits/rejected": 1.978515625, "logps/chosen": -890.0, "logps/rejected": -771.0, "loss": 0.5862, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4012451171875, "rewards/margins": 5.390625, "rewards/rejected": -4.98828125, "step": 2370 }, { "epoch": 0.44801360479947094, "grad_norm": 3.579976518314084, "learning_rate": 7.073284032177135e-07, "logits/chosen": 3.0654296875, "logits/rejected": 2.782958984375, "logps/chosen": -883.5, "logps/rejected": -830.0, "loss": 0.6663, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6728515625, "rewards/margins": 2.35546875, "rewards/rejected": -1.685546875, "step": 2371 }, { "epoch": 0.4482025603476782, "grad_norm": 2.6751828455817273, "learning_rate": 7.070502879770299e-07, "logits/chosen": 2.12451171875, "logits/rejected": 2.49951171875, "logps/chosen": -367.5, "logps/rejected": -1359.0, "loss": 0.5954, "rewards/accuracies": 0.84375, "rewards/chosen": 0.275146484375, "rewards/margins": 10.0390625, "rewards/rejected": -9.78125, "step": 2372 }, { "epoch": 0.4483915158958855, "grad_norm": 2.6505097801388846, "learning_rate": 7.06772104411714e-07, "logits/chosen": 1.51513671875, "logits/rejected": 1.693359375, "logps/chosen": -702.5, "logps/rejected": -1372.5, "loss": 0.5678, "rewards/accuracies": 0.84375, "rewards/chosen": 0.706298828125, "rewards/margins": 5.0625, "rewards/rejected": -4.359375, "step": 2373 }, { "epoch": 0.4485804714440928, "grad_norm": 2.0887089980706293, "learning_rate": 7.064938526427892e-07, "logits/chosen": 2.126953125, "logits/rejected": 1.755859375, "logps/chosen": -607.0, "logps/rejected": -699.0, "loss": 0.681, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3994140625, "rewards/margins": 3.484375, "rewards/rejected": -3.87890625, "step": 2374 }, { "epoch": 0.44876942699230005, "grad_norm": 1.9101399592906436, "learning_rate": 7.06215532791309e-07, "logits/chosen": 0.329345703125, "logits/rejected": 0.923828125, "logps/chosen": -943.0, "logps/rejected": -837.0, "loss": 0.6202, "rewards/accuracies": 0.875, "rewards/chosen": -0.5242919921875, "rewards/margins": 5.35546875, "rewards/rejected": -5.892578125, "step": 2375 }, { "epoch": 0.44895838254050735, "grad_norm": 1.947032149244773, "learning_rate": 7.059371449783561e-07, "logits/chosen": 2.04541015625, "logits/rejected": 2.041015625, "logps/chosen": -512.5, "logps/rejected": -622.5, "loss": 0.6912, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5546875, "rewards/margins": 3.873046875, "rewards/rejected": -4.4296875, "step": 2376 }, { "epoch": 0.44914733808871465, "grad_norm": 3.372718397301922, "learning_rate": 7.05658689325043e-07, "logits/chosen": 1.6884765625, "logits/rejected": 2.025390625, "logps/chosen": -591.5, "logps/rejected": -785.0, "loss": 0.6644, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15374755859375, "rewards/margins": 3.669921875, "rewards/rejected": -3.515625, "step": 2377 }, { "epoch": 0.4493362936369219, "grad_norm": 5.09907439712902, "learning_rate": 7.053801659525117e-07, "logits/chosen": 2.2138671875, "logits/rejected": 2.29296875, "logps/chosen": -1036.0, "logps/rejected": -1149.0, "loss": 0.4227, "rewards/accuracies": 0.84375, "rewards/chosen": 1.216796875, "rewards/margins": 7.53125, "rewards/rejected": -6.30078125, "step": 2378 }, { "epoch": 0.4495252491851292, "grad_norm": 2.481021634432277, "learning_rate": 7.051015749819336e-07, "logits/chosen": 2.77734375, "logits/rejected": 3.01171875, "logps/chosen": -671.0, "logps/rejected": -1367.0, "loss": 0.5946, "rewards/accuracies": 0.8125, "rewards/chosen": 0.075927734375, "rewards/margins": 6.974609375, "rewards/rejected": -6.90234375, "step": 2379 }, { "epoch": 0.4497142047333365, "grad_norm": 2.793192401655995, "learning_rate": 7.048229165345094e-07, "logits/chosen": 1.896484375, "logits/rejected": 1.4814453125, "logps/chosen": -1108.5, "logps/rejected": -997.0, "loss": 0.4771, "rewards/accuracies": 0.90625, "rewards/chosen": 0.701171875, "rewards/margins": 4.98828125, "rewards/rejected": -4.28515625, "step": 2380 }, { "epoch": 0.44990316028154376, "grad_norm": 2.8391827204276905, "learning_rate": 7.045441907314693e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.17626953125, "logps/chosen": -858.0, "logps/rejected": -1039.0, "loss": 0.528, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8916015625, "rewards/margins": 5.4296875, "rewards/rejected": -4.53125, "step": 2381 }, { "epoch": 0.45009211582975106, "grad_norm": 2.1810243863389283, "learning_rate": 7.042653976940731e-07, "logits/chosen": 1.80322265625, "logits/rejected": 1.88232421875, "logps/chosen": -717.5, "logps/rejected": -812.0, "loss": 0.5615, "rewards/accuracies": 0.8125, "rewards/chosen": 0.89404296875, "rewards/margins": 4.421875, "rewards/rejected": -3.53515625, "step": 2382 }, { "epoch": 0.4502810713779583, "grad_norm": 3.455689421542649, "learning_rate": 7.039865375436094e-07, "logits/chosen": 2.365234375, "logits/rejected": 1.9541015625, "logps/chosen": -677.0, "logps/rejected": -678.5, "loss": 0.6104, "rewards/accuracies": 0.75, "rewards/chosen": 0.468994140625, "rewards/margins": 5.36328125, "rewards/rejected": -4.890625, "step": 2383 }, { "epoch": 0.4504700269261656, "grad_norm": 2.566912385098469, "learning_rate": 7.037076104013958e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.8984375, "logps/chosen": -707.5, "logps/rejected": -703.0, "loss": 0.6474, "rewards/accuracies": 0.84375, "rewards/chosen": 0.03271484375, "rewards/margins": 4.025390625, "rewards/rejected": -3.99609375, "step": 2384 }, { "epoch": 0.4506589824743729, "grad_norm": 2.514823504611369, "learning_rate": 7.034286163887799e-07, "logits/chosen": 2.271484375, "logits/rejected": 1.85546875, "logps/chosen": -695.0, "logps/rejected": -765.5, "loss": 0.7484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.615966796875, "rewards/margins": 2.68505859375, "rewards/rejected": -3.3046875, "step": 2385 }, { "epoch": 0.45084793802258016, "grad_norm": 4.933018427888554, "learning_rate": 7.031495556271377e-07, "logits/chosen": 2.275390625, "logits/rejected": 2.27734375, "logps/chosen": -895.0, "logps/rejected": -1619.0, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": -0.1318359375, "rewards/margins": 12.265625, "rewards/rejected": -12.40625, "step": 2386 }, { "epoch": 0.45103689357078747, "grad_norm": 2.101902177454151, "learning_rate": 7.028704282378741e-07, "logits/chosen": 2.189453125, "logits/rejected": 1.94921875, "logps/chosen": -766.0, "logps/rejected": -732.5, "loss": 0.5572, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6845703125, "rewards/margins": 4.55859375, "rewards/rejected": -3.87890625, "step": 2387 }, { "epoch": 0.45122584911899477, "grad_norm": 1.9605570917080937, "learning_rate": 7.025912343424238e-07, "logits/chosen": 2.072265625, "logits/rejected": 2.40234375, "logps/chosen": -538.5, "logps/rejected": -602.5, "loss": 0.7427, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3228759765625, "rewards/margins": 4.806640625, "rewards/rejected": -4.48583984375, "step": 2388 }, { "epoch": 0.451414804667202, "grad_norm": 2.0324888218125134, "learning_rate": 7.023119740622494e-07, "logits/chosen": 1.6318359375, "logits/rejected": 1.576171875, "logps/chosen": -1013.0, "logps/rejected": -802.0, "loss": 0.6402, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8251953125, "rewards/margins": 1.7421875, "rewards/rejected": -2.5625, "step": 2389 }, { "epoch": 0.4516037602154093, "grad_norm": 3.06653483885332, "learning_rate": 7.020326475188434e-07, "logits/chosen": 2.1640625, "logits/rejected": 2.21484375, "logps/chosen": -771.0, "logps/rejected": -881.0, "loss": 0.6442, "rewards/accuracies": 0.78125, "rewards/chosen": 0.64794921875, "rewards/margins": 3.80859375, "rewards/rejected": -3.15625, "step": 2390 }, { "epoch": 0.4517927157636166, "grad_norm": 2.3515311024518057, "learning_rate": 7.017532548337265e-07, "logits/chosen": 2.46875, "logits/rejected": 2.103515625, "logps/chosen": -1002.0, "logps/rejected": -1089.0, "loss": 0.5242, "rewards/accuracies": 0.875, "rewards/chosen": 1.9794921875, "rewards/margins": 5.8828125, "rewards/rejected": -3.8984375, "step": 2391 }, { "epoch": 0.4519816713118239, "grad_norm": 2.6363118199502664, "learning_rate": 7.014737961284481e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.455078125, "logps/chosen": -747.0, "logps/rejected": -919.0, "loss": 0.5484, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6904296875, "rewards/margins": 4.23046875, "rewards/rejected": -3.53515625, "step": 2392 }, { "epoch": 0.4521706268600312, "grad_norm": 1.4737158678361835, "learning_rate": 7.011942715245867e-07, "logits/chosen": 2.4140625, "logits/rejected": 2.060546875, "logps/chosen": -547.5, "logps/rejected": -755.5, "loss": 0.5988, "rewards/accuracies": 0.84375, "rewards/chosen": 0.23486328125, "rewards/margins": 4.54296875, "rewards/rejected": -4.32421875, "step": 2393 }, { "epoch": 0.4523595824082385, "grad_norm": 1.9177181452753296, "learning_rate": 7.009146811437494e-07, "logits/chosen": 1.7998046875, "logits/rejected": 1.283203125, "logps/chosen": -642.0, "logps/rejected": -636.0, "loss": 0.6224, "rewards/accuracies": 0.875, "rewards/chosen": 0.76129150390625, "rewards/margins": 3.283203125, "rewards/rejected": -2.5234375, "step": 2394 }, { "epoch": 0.45254853795644573, "grad_norm": 2.201352827781256, "learning_rate": 7.006350251075717e-07, "logits/chosen": 2.9375, "logits/rejected": 3.072265625, "logps/chosen": -1217.5, "logps/rejected": -1780.0, "loss": 0.6789, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2412109375, "rewards/margins": 3.33984375, "rewards/rejected": -6.57421875, "step": 2395 }, { "epoch": 0.45273749350465303, "grad_norm": 2.5384312878738493, "learning_rate": 7.003553035377179e-07, "logits/chosen": 2.1875, "logits/rejected": 1.845703125, "logps/chosen": -771.0, "logps/rejected": -872.0, "loss": 0.5809, "rewards/accuracies": 0.84375, "rewards/chosen": 0.208770751953125, "rewards/margins": 4.47265625, "rewards/rejected": -4.265625, "step": 2396 }, { "epoch": 0.45292644905286034, "grad_norm": 1.9629603590896327, "learning_rate": 7.000755165558806e-07, "logits/chosen": 2.01953125, "logits/rejected": 2.552734375, "logps/chosen": -615.5, "logps/rejected": -676.0, "loss": 0.6154, "rewards/accuracies": 0.84375, "rewards/chosen": 0.26953125, "rewards/margins": 4.06640625, "rewards/rejected": -3.80078125, "step": 2397 }, { "epoch": 0.4531154046010676, "grad_norm": 1.738996794036827, "learning_rate": 6.99795664283781e-07, "logits/chosen": 2.1015625, "logits/rejected": 2.2265625, "logps/chosen": -669.0, "logps/rejected": -808.0, "loss": 0.5606, "rewards/accuracies": 0.9375, "rewards/chosen": 0.390625, "rewards/margins": 4.62890625, "rewards/rejected": -4.23828125, "step": 2398 }, { "epoch": 0.4533043601492749, "grad_norm": 3.4205961213516165, "learning_rate": 6.995157468431687e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.37890625, "logps/chosen": -717.5, "logps/rejected": -734.0, "loss": 0.6705, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2655029296875, "rewards/margins": 4.03515625, "rewards/rejected": -3.76171875, "step": 2399 }, { "epoch": 0.4534933156974822, "grad_norm": 2.1775931692904766, "learning_rate": 6.992357643558212e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.44140625, "logps/chosen": -786.5, "logps/rejected": -764.5, "loss": 0.4554, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3173828125, "rewards/margins": 5.4375, "rewards/rejected": -4.125, "step": 2400 }, { "epoch": 0.45368227124568944, "grad_norm": 2.176595909980019, "learning_rate": 6.989557169435453e-07, "logits/chosen": 2.1025390625, "logits/rejected": 1.9921875, "logps/chosen": -1747.5, "logps/rejected": -1772.0, "loss": 0.5835, "rewards/accuracies": 0.84375, "rewards/chosen": -1.948974609375, "rewards/margins": 3.9375, "rewards/rejected": -5.8828125, "step": 2401 }, { "epoch": 0.45387122679389674, "grad_norm": 1.916038984761016, "learning_rate": 6.986756047281754e-07, "logits/chosen": 1.64453125, "logits/rejected": 1.919921875, "logps/chosen": -752.0, "logps/rejected": -879.0, "loss": 0.6275, "rewards/accuracies": 0.78125, "rewards/chosen": 0.85528564453125, "rewards/margins": 3.75390625, "rewards/rejected": -2.904296875, "step": 2402 }, { "epoch": 0.45406018234210405, "grad_norm": 2.058336118096333, "learning_rate": 6.983954278315738e-07, "logits/chosen": 2.6875, "logits/rejected": 2.31640625, "logps/chosen": -899.0, "logps/rejected": -1065.0, "loss": 0.4832, "rewards/accuracies": 0.96875, "rewards/chosen": 1.130859375, "rewards/margins": 5.5625, "rewards/rejected": -4.4296875, "step": 2403 }, { "epoch": 0.4542491378903113, "grad_norm": 2.408310706469672, "learning_rate": 6.981151863756315e-07, "logits/chosen": 2.80078125, "logits/rejected": 2.97265625, "logps/chosen": -1656.0, "logps/rejected": -1608.0, "loss": 0.6353, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3564453125, "rewards/margins": 5.19140625, "rewards/rejected": -3.828125, "step": 2404 }, { "epoch": 0.4544380934385186, "grad_norm": 2.34148256543018, "learning_rate": 6.978348804822673e-07, "logits/chosen": 2.2548828125, "logits/rejected": 2.41015625, "logps/chosen": -708.0, "logps/rejected": -681.0, "loss": 0.6927, "rewards/accuracies": 0.78125, "rewards/chosen": 0.398529052734375, "rewards/margins": 3.14208984375, "rewards/rejected": -2.7431640625, "step": 2405 }, { "epoch": 0.45462704898672585, "grad_norm": 1.7328517581483178, "learning_rate": 6.975545102734281e-07, "logits/chosen": 1.724609375, "logits/rejected": 1.6826171875, "logps/chosen": -778.0, "logps/rejected": -1304.0, "loss": 0.6323, "rewards/accuracies": 0.84375, "rewards/chosen": 0.419921875, "rewards/margins": 6.3203125, "rewards/rejected": -5.91015625, "step": 2406 }, { "epoch": 0.45481600453493315, "grad_norm": 4.423704523332563, "learning_rate": 6.972740758710888e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.162109375, "logps/chosen": -728.0, "logps/rejected": -866.0, "loss": 0.6923, "rewards/accuracies": 0.78125, "rewards/chosen": -0.538330078125, "rewards/margins": 3.662109375, "rewards/rejected": -4.19921875, "step": 2407 }, { "epoch": 0.45500496008314045, "grad_norm": 2.1489131880409205, "learning_rate": 6.969935773972524e-07, "logits/chosen": 1.7431640625, "logits/rejected": 1.58984375, "logps/chosen": -591.5, "logps/rejected": -683.0, "loss": 0.7386, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08544921875, "rewards/margins": 2.939453125, "rewards/rejected": -3.025390625, "step": 2408 }, { "epoch": 0.4551939156313477, "grad_norm": 1.7388035567115316, "learning_rate": 6.967130149739494e-07, "logits/chosen": 1.31005859375, "logits/rejected": 1.3154296875, "logps/chosen": -825.0, "logps/rejected": -693.0, "loss": 0.4793, "rewards/accuracies": 0.90625, "rewards/chosen": 0.97076416015625, "rewards/margins": 5.359375, "rewards/rejected": -4.39453125, "step": 2409 }, { "epoch": 0.455382871179555, "grad_norm": 1.8345008925626138, "learning_rate": 6.964323887232384e-07, "logits/chosen": 2.0244140625, "logits/rejected": 1.564453125, "logps/chosen": -963.5, "logps/rejected": -985.5, "loss": 0.571, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1357421875, "rewards/margins": 4.87109375, "rewards/rejected": -3.72265625, "step": 2410 }, { "epoch": 0.4555718267277623, "grad_norm": 2.5250868801990243, "learning_rate": 6.961516987672055e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.748046875, "logps/chosen": -746.0, "logps/rejected": -1296.5, "loss": 0.6552, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5303955078125, "rewards/margins": 4.375, "rewards/rejected": -3.84375, "step": 2411 }, { "epoch": 0.45576078227596956, "grad_norm": 2.6110355113638932, "learning_rate": 6.95870945227965e-07, "logits/chosen": 2.251953125, "logits/rejected": 2.134765625, "logps/chosen": -933.5, "logps/rejected": -1267.0, "loss": 0.6411, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6708984375, "rewards/margins": 4.19921875, "rewards/rejected": -2.52734375, "step": 2412 }, { "epoch": 0.45594973782417686, "grad_norm": 2.255108493194047, "learning_rate": 6.955901282276582e-07, "logits/chosen": 2.921875, "logits/rejected": 2.576171875, "logps/chosen": -756.5, "logps/rejected": -2669.0, "loss": 0.595, "rewards/accuracies": 0.875, "rewards/chosen": 0.4521484375, "rewards/margins": 8.578125, "rewards/rejected": -8.1328125, "step": 2413 }, { "epoch": 0.45613869337238416, "grad_norm": 2.576329607472384, "learning_rate": 6.953092478884547e-07, "logits/chosen": 1.646484375, "logits/rejected": 1.5146484375, "logps/chosen": -1131.5, "logps/rejected": -1199.5, "loss": 0.6095, "rewards/accuracies": 0.875, "rewards/chosen": 1.0927734375, "rewards/margins": 4.443359375, "rewards/rejected": -3.357421875, "step": 2414 }, { "epoch": 0.4563276489205914, "grad_norm": 2.4828051339943724, "learning_rate": 6.950283043325509e-07, "logits/chosen": 1.87890625, "logits/rejected": 1.9765625, "logps/chosen": -835.5, "logps/rejected": -971.0, "loss": 0.5258, "rewards/accuracies": 0.8125, "rewards/chosen": 0.577056884765625, "rewards/margins": 7.671875, "rewards/rejected": -7.09375, "step": 2415 }, { "epoch": 0.4565166044687987, "grad_norm": 2.481920351066632, "learning_rate": 6.947472976821717e-07, "logits/chosen": 2.2451171875, "logits/rejected": 2.22021484375, "logps/chosen": -767.0, "logps/rejected": -909.0, "loss": 0.4964, "rewards/accuracies": 0.875, "rewards/chosen": 0.877685546875, "rewards/margins": 4.71875, "rewards/rejected": -3.8359375, "step": 2416 }, { "epoch": 0.456705560017006, "grad_norm": 2.8335712733508287, "learning_rate": 6.944662280595685e-07, "logits/chosen": 1.3907470703125, "logits/rejected": 1.05859375, "logps/chosen": -798.0, "logps/rejected": -1608.0, "loss": 0.4677, "rewards/accuracies": 0.9375, "rewards/chosen": 0.70751953125, "rewards/margins": 7.54296875, "rewards/rejected": -6.83203125, "step": 2417 }, { "epoch": 0.45689451556521327, "grad_norm": 2.080169922259827, "learning_rate": 6.941850955870206e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.3193359375, "logps/chosen": -657.5, "logps/rejected": -566.0, "loss": 0.6437, "rewards/accuracies": 0.8125, "rewards/chosen": 0.285888671875, "rewards/margins": 3.53515625, "rewards/rejected": -3.25, "step": 2418 }, { "epoch": 0.45708347111342057, "grad_norm": 2.9839578117302747, "learning_rate": 6.939039003868342e-07, "logits/chosen": 1.6845703125, "logits/rejected": 1.4970703125, "logps/chosen": -862.0, "logps/rejected": -1259.0, "loss": 0.5077, "rewards/accuracies": 0.875, "rewards/chosen": 1.01904296875, "rewards/margins": 8.109375, "rewards/rejected": -7.10546875, "step": 2419 }, { "epoch": 0.4572724266616279, "grad_norm": 2.1723382975014283, "learning_rate": 6.936226425813435e-07, "logits/chosen": 2.2578125, "logits/rejected": 2.60205078125, "logps/chosen": -925.0, "logps/rejected": -889.0, "loss": 0.511, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2939453125, "rewards/margins": 5.2890625, "rewards/rejected": -4.0078125, "step": 2420 }, { "epoch": 0.4574613822098351, "grad_norm": 1.9181643077943027, "learning_rate": 6.933413222929095e-07, "logits/chosen": 1.4365234375, "logits/rejected": 0.7947998046875, "logps/chosen": -762.0, "logps/rejected": -697.0, "loss": 0.7215, "rewards/accuracies": 0.75, "rewards/chosen": 0.09765625, "rewards/margins": 3.166015625, "rewards/rejected": -3.0703125, "step": 2421 }, { "epoch": 0.4576503377580424, "grad_norm": 1.5014875454672247, "learning_rate": 6.930599396439202e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.8828125, "logps/chosen": -859.5, "logps/rejected": -2065.5, "loss": 0.5122, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7900390625, "rewards/margins": 6.3671875, "rewards/rejected": -5.5703125, "step": 2422 }, { "epoch": 0.45783929330624973, "grad_norm": 1.9552553407759679, "learning_rate": 6.927784947567911e-07, "logits/chosen": 2.923828125, "logits/rejected": 2.73828125, "logps/chosen": -902.0, "logps/rejected": -968.5, "loss": 0.5679, "rewards/accuracies": 0.84375, "rewards/chosen": 1.26904296875, "rewards/margins": 5.26953125, "rewards/rejected": -3.99609375, "step": 2423 }, { "epoch": 0.458028248854457, "grad_norm": 3.383873910377764, "learning_rate": 6.924969877539645e-07, "logits/chosen": 2.0546875, "logits/rejected": 2.060546875, "logps/chosen": -838.5, "logps/rejected": -1107.0, "loss": 0.6948, "rewards/accuracies": 0.78125, "rewards/chosen": 0.439208984375, "rewards/margins": 3.33984375, "rewards/rejected": -2.90625, "step": 2424 }, { "epoch": 0.4582172044026643, "grad_norm": 2.201899702471157, "learning_rate": 6.922154187579099e-07, "logits/chosen": 2.09991455078125, "logits/rejected": 1.794921875, "logps/chosen": -717.0, "logps/rejected": -648.5, "loss": 0.5205, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8172607421875, "rewards/margins": 4.0078125, "rewards/rejected": -3.19921875, "step": 2425 }, { "epoch": 0.4584061599508716, "grad_norm": 2.2858898414753526, "learning_rate": 6.919337878911239e-07, "logits/chosen": 2.212890625, "logits/rejected": 1.927734375, "logps/chosen": -1004.0, "logps/rejected": -1355.0, "loss": 0.5374, "rewards/accuracies": 0.84375, "rewards/chosen": 1.119140625, "rewards/margins": 4.6513671875, "rewards/rejected": -3.5400390625, "step": 2426 }, { "epoch": 0.45859511549907883, "grad_norm": 1.8126133334045362, "learning_rate": 6.916520952761295e-07, "logits/chosen": 3.62109375, "logits/rejected": 4.03515625, "logps/chosen": -764.0, "logps/rejected": -870.0, "loss": 0.5565, "rewards/accuracies": 0.84375, "rewards/chosen": 1.66796875, "rewards/margins": 4.5546875, "rewards/rejected": -2.892578125, "step": 2427 }, { "epoch": 0.45878407104728613, "grad_norm": 1.7424792270927167, "learning_rate": 6.913703410354772e-07, "logits/chosen": 3.30078125, "logits/rejected": 2.73828125, "logps/chosen": -551.75, "logps/rejected": -684.5, "loss": 0.6284, "rewards/accuracies": 0.90625, "rewards/chosen": 0.382080078125, "rewards/margins": 3.33203125, "rewards/rejected": -2.94921875, "step": 2428 }, { "epoch": 0.4589730265954934, "grad_norm": 1.4257257202464968, "learning_rate": 6.910885252917439e-07, "logits/chosen": 2.232421875, "logits/rejected": 2.6015625, "logps/chosen": -902.5, "logps/rejected": -1544.0, "loss": 0.5643, "rewards/accuracies": 0.875, "rewards/chosen": 0.82568359375, "rewards/margins": 7.19140625, "rewards/rejected": -6.36328125, "step": 2429 }, { "epoch": 0.4591619821437007, "grad_norm": 2.651501344812911, "learning_rate": 6.908066481675335e-07, "logits/chosen": 1.64453125, "logits/rejected": 1.056640625, "logps/chosen": -570.25, "logps/rejected": -515.75, "loss": 0.6342, "rewards/accuracies": 0.90625, "rewards/chosen": 0.693359375, "rewards/margins": 3.01953125, "rewards/rejected": -2.325439453125, "step": 2430 }, { "epoch": 0.459350937691908, "grad_norm": 1.9722982800901914, "learning_rate": 6.90524709785476e-07, "logits/chosen": 2.7421875, "logits/rejected": 2.85546875, "logps/chosen": -781.0, "logps/rejected": -1750.0, "loss": 0.641, "rewards/accuracies": 0.75, "rewards/chosen": 1.01708984375, "rewards/margins": 7.125, "rewards/rejected": -6.10546875, "step": 2431 }, { "epoch": 0.45953989324011524, "grad_norm": 2.0179825476226103, "learning_rate": 6.902427102682293e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.51953125, "logps/chosen": -771.0, "logps/rejected": -692.5, "loss": 0.6877, "rewards/accuracies": 0.71875, "rewards/chosen": 0.642578125, "rewards/margins": 3.03125, "rewards/rejected": -2.386962890625, "step": 2432 }, { "epoch": 0.45972884878832254, "grad_norm": 1.7084702685973567, "learning_rate": 6.899606497384764e-07, "logits/chosen": 3.4609375, "logits/rejected": 3.703125, "logps/chosen": -594.0, "logps/rejected": -689.5, "loss": 0.5367, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0390625, "rewards/margins": 4.65625, "rewards/rejected": -3.609375, "step": 2433 }, { "epoch": 0.45991780433652985, "grad_norm": 1.3789459527271013, "learning_rate": 6.896785283189279e-07, "logits/chosen": 2.671875, "logits/rejected": 2.66015625, "logps/chosen": -845.0, "logps/rejected": -1664.0, "loss": 0.5001, "rewards/accuracies": 0.8125, "rewards/chosen": 1.19287109375, "rewards/margins": 7.30078125, "rewards/rejected": -6.1015625, "step": 2434 }, { "epoch": 0.4601067598847371, "grad_norm": 2.230047095383452, "learning_rate": 6.893963461323205e-07, "logits/chosen": 1.90234375, "logits/rejected": 2.255859375, "logps/chosen": -823.0, "logps/rejected": -869.0, "loss": 0.604, "rewards/accuracies": 0.875, "rewards/chosen": 0.7498779296875, "rewards/margins": 4.0625, "rewards/rejected": -3.30859375, "step": 2435 }, { "epoch": 0.4602957154329444, "grad_norm": 3.5648744511233965, "learning_rate": 6.891141033014175e-07, "logits/chosen": 2.767578125, "logits/rejected": 3.359375, "logps/chosen": -714.0, "logps/rejected": -1645.0, "loss": 0.6554, "rewards/accuracies": 0.8125, "rewards/chosen": 1.27587890625, "rewards/margins": 4.505859375, "rewards/rejected": -3.22265625, "step": 2436 }, { "epoch": 0.4604846709811517, "grad_norm": 1.7836957385995587, "learning_rate": 6.888317999490081e-07, "logits/chosen": 2.88671875, "logits/rejected": 3.33984375, "logps/chosen": -1042.0, "logps/rejected": -1167.0, "loss": 0.6201, "rewards/accuracies": 0.65625, "rewards/chosen": 1.70849609375, "rewards/margins": 4.900390625, "rewards/rejected": -3.1875, "step": 2437 }, { "epoch": 0.46067362652935895, "grad_norm": 2.709109751597986, "learning_rate": 6.885494361979087e-07, "logits/chosen": 3.177734375, "logits/rejected": 3.5625, "logps/chosen": -749.5, "logps/rejected": -878.5, "loss": 0.6569, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7802734375, "rewards/margins": 4.234375, "rewards/rejected": -3.453125, "step": 2438 }, { "epoch": 0.46086258207756625, "grad_norm": 1.9605056344713, "learning_rate": 6.882670121709611e-07, "logits/chosen": 2.432373046875, "logits/rejected": 1.77557373046875, "logps/chosen": -923.0, "logps/rejected": -739.0, "loss": 0.637, "rewards/accuracies": 0.84375, "rewards/chosen": 0.69940185546875, "rewards/margins": 3.81640625, "rewards/rejected": -3.11328125, "step": 2439 }, { "epoch": 0.46105153762577356, "grad_norm": 2.390360132361526, "learning_rate": 6.879845279910341e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.68359375, "logps/chosen": -809.0, "logps/rejected": -746.0, "loss": 0.6717, "rewards/accuracies": 0.8125, "rewards/chosen": 0.73779296875, "rewards/margins": 3.796875, "rewards/rejected": -3.06640625, "step": 2440 }, { "epoch": 0.4612404931739808, "grad_norm": 1.807017046654036, "learning_rate": 6.877019837810218e-07, "logits/chosen": 2.0859375, "logits/rejected": 1.908203125, "logps/chosen": -1017.5, "logps/rejected": -768.0, "loss": 0.7085, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3740234375, "rewards/margins": 4.33984375, "rewards/rejected": -2.9755859375, "step": 2441 }, { "epoch": 0.4614294487221881, "grad_norm": 1.882839836011989, "learning_rate": 6.87419379663845e-07, "logits/chosen": 3.55859375, "logits/rejected": 3.46484375, "logps/chosen": -799.5, "logps/rejected": -1621.0, "loss": 0.6853, "rewards/accuracies": 0.75, "rewards/chosen": 0.9404296875, "rewards/margins": 7.083984375, "rewards/rejected": -6.1328125, "step": 2442 }, { "epoch": 0.4616184042703954, "grad_norm": 2.3821188152461588, "learning_rate": 6.871367157624507e-07, "logits/chosen": 2.25390625, "logits/rejected": 2.396484375, "logps/chosen": -824.0, "logps/rejected": -923.0, "loss": 0.5345, "rewards/accuracies": 0.875, "rewards/chosen": 1.07666015625, "rewards/margins": 5.3515625, "rewards/rejected": -4.27734375, "step": 2443 }, { "epoch": 0.46180735981860266, "grad_norm": 2.404813805545363, "learning_rate": 6.868539921998116e-07, "logits/chosen": 2.546875, "logits/rejected": 2.39453125, "logps/chosen": -767.0, "logps/rejected": -748.0, "loss": 0.691, "rewards/accuracies": 0.71875, "rewards/chosen": 0.31298828125, "rewards/margins": 3.48828125, "rewards/rejected": -3.17578125, "step": 2444 }, { "epoch": 0.46199631536680996, "grad_norm": 1.825132898166221, "learning_rate": 6.865712090989262e-07, "logits/chosen": 2.71875, "logits/rejected": 2.73046875, "logps/chosen": -489.0, "logps/rejected": -483.0, "loss": 0.7144, "rewards/accuracies": 0.71875, "rewards/chosen": 0.30419921875, "rewards/margins": 1.94189453125, "rewards/rejected": -1.6318359375, "step": 2445 }, { "epoch": 0.46218527091501727, "grad_norm": 2.125599210619295, "learning_rate": 6.862883665828191e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.91015625, "logps/chosen": -380.5, "logps/rejected": -612.25, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": 0.3173828125, "rewards/margins": 3.25, "rewards/rejected": -2.931640625, "step": 2446 }, { "epoch": 0.4623742264632245, "grad_norm": 3.530488287310483, "learning_rate": 6.860054647745411e-07, "logits/chosen": 3.1953125, "logits/rejected": 3.2421875, "logps/chosen": -908.5, "logps/rejected": -1249.0, "loss": 0.573, "rewards/accuracies": 0.75, "rewards/chosen": 1.357421875, "rewards/margins": 4.6953125, "rewards/rejected": -3.33984375, "step": 2447 }, { "epoch": 0.4625631820114318, "grad_norm": 1.811284349516983, "learning_rate": 6.857225037971683e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.365234375, "logps/chosen": -523.0, "logps/rejected": -851.0, "loss": 0.6623, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40380859375, "rewards/margins": 4.6015625, "rewards/rejected": -4.203125, "step": 2448 }, { "epoch": 0.4627521375596391, "grad_norm": 2.215223969634442, "learning_rate": 6.854394837738024e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.068359375, "logps/chosen": -580.5, "logps/rejected": -720.0, "loss": 0.5284, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40234375, "rewards/margins": 5.453125, "rewards/rejected": -5.046875, "step": 2449 }, { "epoch": 0.46294109310784637, "grad_norm": 1.7205945572103647, "learning_rate": 6.851564048275716e-07, "logits/chosen": 3.0703125, "logits/rejected": 2.765625, "logps/chosen": -781.0, "logps/rejected": -1545.0, "loss": 0.4838, "rewards/accuracies": 0.875, "rewards/chosen": 0.9169921875, "rewards/margins": 7.65234375, "rewards/rejected": -6.734375, "step": 2450 }, { "epoch": 0.46313004865605367, "grad_norm": 1.4655136157247048, "learning_rate": 6.848732670816289e-07, "logits/chosen": 2.84375, "logits/rejected": 2.95703125, "logps/chosen": -657.0, "logps/rejected": -1309.0, "loss": 0.8115, "rewards/accuracies": 0.75, "rewards/chosen": -0.304443359375, "rewards/margins": 4.556640625, "rewards/rejected": -4.84814453125, "step": 2451 }, { "epoch": 0.4633190042042609, "grad_norm": 1.9691284347363747, "learning_rate": 6.845900706591533e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.412109375, "logps/chosen": -739.5, "logps/rejected": -627.0, "loss": 0.6476, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6845703125, "rewards/margins": 4.3046875, "rewards/rejected": -3.6171875, "step": 2452 }, { "epoch": 0.4635079597524682, "grad_norm": 2.4469110982488593, "learning_rate": 6.843068156833492e-07, "logits/chosen": 2.4873046875, "logits/rejected": 2.517578125, "logps/chosen": -1039.0, "logps/rejected": -1046.5, "loss": 0.5911, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3603515625, "rewards/margins": 4.427734375, "rewards/rejected": -3.064453125, "step": 2453 }, { "epoch": 0.4636969153006755, "grad_norm": 1.984524780889156, "learning_rate": 6.840235022774465e-07, "logits/chosen": 1.900390625, "logits/rejected": 1.599609375, "logps/chosen": -823.0, "logps/rejected": -795.0, "loss": 0.563, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8291015625, "rewards/margins": 4.02734375, "rewards/rejected": -3.201171875, "step": 2454 }, { "epoch": 0.4638858708488828, "grad_norm": 2.4272432095134158, "learning_rate": 6.837401305647005e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.46484375, "logps/chosen": -1016.5, "logps/rejected": -1038.0, "loss": 0.5782, "rewards/accuracies": 0.84375, "rewards/chosen": 1.8310546875, "rewards/margins": 5.03125, "rewards/rejected": -3.19140625, "step": 2455 }, { "epoch": 0.4640748263970901, "grad_norm": 2.4671700630551396, "learning_rate": 6.834567006683922e-07, "logits/chosen": 1.22412109375, "logits/rejected": 1.1820068359375, "logps/chosen": -880.0, "logps/rejected": -977.0, "loss": 0.6566, "rewards/accuracies": 0.6875, "rewards/chosen": 0.713623046875, "rewards/margins": 4.453125, "rewards/rejected": -3.73828125, "step": 2456 }, { "epoch": 0.4642637819452974, "grad_norm": 2.2517771701216795, "learning_rate": 6.83173212711827e-07, "logits/chosen": 1.974609375, "logits/rejected": 1.8427734375, "logps/chosen": -879.0, "logps/rejected": -821.5, "loss": 0.556, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9521484375, "rewards/margins": 4.71875, "rewards/rejected": -3.76171875, "step": 2457 }, { "epoch": 0.46445273749350463, "grad_norm": 1.999516186141773, "learning_rate": 6.828896668183367e-07, "logits/chosen": 2.625, "logits/rejected": 2.69140625, "logps/chosen": -626.5, "logps/rejected": -827.0, "loss": 0.643, "rewards/accuracies": 0.75, "rewards/chosen": 0.05902099609375, "rewards/margins": 3.373046875, "rewards/rejected": -3.310546875, "step": 2458 }, { "epoch": 0.46464169304171193, "grad_norm": 2.0667865630677125, "learning_rate": 6.826060631112775e-07, "logits/chosen": 1.4482421875, "logits/rejected": 1.408203125, "logps/chosen": -1141.0, "logps/rejected": -1303.0, "loss": 0.6252, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1158447265625, "rewards/margins": 5.83984375, "rewards/rejected": -4.71484375, "step": 2459 }, { "epoch": 0.46483064858991924, "grad_norm": 1.5362188415634228, "learning_rate": 6.82322401714031e-07, "logits/chosen": 3.06640625, "logits/rejected": 2.9765625, "logps/chosen": -1027.5, "logps/rejected": -1306.0, "loss": 0.5605, "rewards/accuracies": 0.75, "rewards/chosen": 1.4716796875, "rewards/margins": 5.3671875, "rewards/rejected": -3.89453125, "step": 2460 }, { "epoch": 0.4650196041381265, "grad_norm": 1.9908686665772117, "learning_rate": 6.820386827500042e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.189453125, "logps/chosen": -677.5, "logps/rejected": -770.5, "loss": 0.6467, "rewards/accuracies": 0.71875, "rewards/chosen": 0.509765625, "rewards/margins": 3.69140625, "rewards/rejected": -3.17578125, "step": 2461 }, { "epoch": 0.4652085596863338, "grad_norm": 1.82243489717412, "learning_rate": 6.817549063426285e-07, "logits/chosen": 2.80859375, "logits/rejected": 2.99609375, "logps/chosen": -589.5, "logps/rejected": -1027.0, "loss": 0.5717, "rewards/accuracies": 0.90625, "rewards/chosen": 0.438720703125, "rewards/margins": 4.8984375, "rewards/rejected": -4.4609375, "step": 2462 }, { "epoch": 0.4653975152345411, "grad_norm": 3.178597827710879, "learning_rate": 6.81471072615361e-07, "logits/chosen": 1.978515625, "logits/rejected": 2.298828125, "logps/chosen": -698.0, "logps/rejected": -930.0, "loss": 0.6185, "rewards/accuracies": 0.8125, "rewards/chosen": 1.17578125, "rewards/margins": 4.6875, "rewards/rejected": -3.5185546875, "step": 2463 }, { "epoch": 0.46558647078274834, "grad_norm": 1.6336056311073053, "learning_rate": 6.811871816916831e-07, "logits/chosen": 2.541015625, "logits/rejected": 2.33984375, "logps/chosen": -842.25, "logps/rejected": -1154.0, "loss": 0.6004, "rewards/accuracies": 0.78125, "rewards/chosen": 1.00732421875, "rewards/margins": 5.15234375, "rewards/rejected": -4.15234375, "step": 2464 }, { "epoch": 0.46577542633095564, "grad_norm": 2.301926899884313, "learning_rate": 6.809032336951015e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.40234375, "logps/chosen": -478.0, "logps/rejected": -662.5, "loss": 0.7337, "rewards/accuracies": 0.75, "rewards/chosen": 0.31201171875, "rewards/margins": 3.318359375, "rewards/rejected": -3.007568359375, "step": 2465 }, { "epoch": 0.46596438187916295, "grad_norm": 2.031645328315205, "learning_rate": 6.806192287491477e-07, "logits/chosen": 2.849609375, "logits/rejected": 2.7109375, "logps/chosen": -721.5, "logps/rejected": -793.5, "loss": 0.6297, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9990234375, "rewards/margins": 3.39453125, "rewards/rejected": -2.388671875, "step": 2466 }, { "epoch": 0.4661533374273702, "grad_norm": 1.9980694499883405, "learning_rate": 6.803351669773777e-07, "logits/chosen": 2.33203125, "logits/rejected": 2.2138671875, "logps/chosen": -687.0, "logps/rejected": -690.0, "loss": 0.6645, "rewards/accuracies": 0.84375, "rewards/chosen": 0.686767578125, "rewards/margins": 2.585693359375, "rewards/rejected": -1.8909912109375, "step": 2467 }, { "epoch": 0.4663422929755775, "grad_norm": 1.964877353939007, "learning_rate": 6.800510485033724e-07, "logits/chosen": 2.81640625, "logits/rejected": 2.22265625, "logps/chosen": -1101.0, "logps/rejected": -1123.0, "loss": 0.4473, "rewards/accuracies": 0.9375, "rewards/chosen": 1.87841796875, "rewards/margins": 6.08203125, "rewards/rejected": -4.1953125, "step": 2468 }, { "epoch": 0.4665312485237848, "grad_norm": 1.5908142181102858, "learning_rate": 6.797668734507375e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.6629638671875, "logps/chosen": -1040.5, "logps/rejected": -1141.75, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": 1.947265625, "rewards/margins": 4.9296875, "rewards/rejected": -2.97998046875, "step": 2469 }, { "epoch": 0.46672020407199205, "grad_norm": 1.5289037199881632, "learning_rate": 6.794826419431033e-07, "logits/chosen": 2.26953125, "logits/rejected": 2.025390625, "logps/chosen": -1055.0, "logps/rejected": -1019.0, "loss": 0.5736, "rewards/accuracies": 0.84375, "rewards/chosen": 0.60595703125, "rewards/margins": 5.23046875, "rewards/rejected": -4.63671875, "step": 2470 }, { "epoch": 0.46690915962019935, "grad_norm": 1.9031354081248897, "learning_rate": 6.791983541041241e-07, "logits/chosen": 2.4970703125, "logits/rejected": 3.134765625, "logps/chosen": -761.5, "logps/rejected": -1505.5, "loss": 0.6495, "rewards/accuracies": 0.875, "rewards/chosen": 1.0501708984375, "rewards/margins": 7.037109375, "rewards/rejected": -5.982421875, "step": 2471 }, { "epoch": 0.46709811516840666, "grad_norm": 2.1344846254899084, "learning_rate": 6.789140100574794e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.5546875, "logps/chosen": -812.0, "logps/rejected": -960.0, "loss": 0.6013, "rewards/accuracies": 0.75, "rewards/chosen": 0.9267578125, "rewards/margins": 5.1875, "rewards/rejected": -4.2734375, "step": 2472 }, { "epoch": 0.4672870707166139, "grad_norm": 1.4404030317617045, "learning_rate": 6.786296099268731e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.208984375, "logps/chosen": -968.0, "logps/rejected": -993.0, "loss": 0.6098, "rewards/accuracies": 0.78125, "rewards/chosen": 1.28515625, "rewards/margins": 4.740234375, "rewards/rejected": -3.453125, "step": 2473 }, { "epoch": 0.4674760262648212, "grad_norm": 2.9299010751827277, "learning_rate": 6.783451538360332e-07, "logits/chosen": 2.52734375, "logits/rejected": 2.259765625, "logps/chosen": -726.0, "logps/rejected": -713.0, "loss": 0.5522, "rewards/accuracies": 0.8125, "rewards/chosen": 0.95166015625, "rewards/margins": 4.140625, "rewards/rejected": -3.1796875, "step": 2474 }, { "epoch": 0.46766498181302846, "grad_norm": 2.07470002824476, "learning_rate": 6.78060641908712e-07, "logits/chosen": 2.12109375, "logits/rejected": 2.0078125, "logps/chosen": -757.5, "logps/rejected": -1961.0, "loss": 0.5822, "rewards/accuracies": 0.75, "rewards/chosen": 1.2490234375, "rewards/margins": 2.0078125, "rewards/rejected": -0.76953125, "step": 2475 }, { "epoch": 0.46785393736123576, "grad_norm": 1.8423376235428177, "learning_rate": 6.777760742686863e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.96484375, "logps/chosen": -676.0, "logps/rejected": -587.0, "loss": 0.6771, "rewards/accuracies": 0.75, "rewards/chosen": 0.22900390625, "rewards/margins": 3.15625, "rewards/rejected": -2.923828125, "step": 2476 }, { "epoch": 0.46804289290944306, "grad_norm": 1.9424487697770982, "learning_rate": 6.774914510397571e-07, "logits/chosen": 1.904296875, "logits/rejected": 2.37890625, "logps/chosen": -1021.0, "logps/rejected": -990.0, "loss": 0.5781, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5029296875, "rewards/margins": 5.27734375, "rewards/rejected": -3.779296875, "step": 2477 }, { "epoch": 0.4682318484576503, "grad_norm": 1.615257277826278, "learning_rate": 6.772067723457495e-07, "logits/chosen": 3.13671875, "logits/rejected": 2.12890625, "logps/chosen": -797.0, "logps/rejected": -678.5, "loss": 0.5214, "rewards/accuracies": 0.84375, "rewards/chosen": 1.201171875, "rewards/margins": 4.4296875, "rewards/rejected": -3.23046875, "step": 2478 }, { "epoch": 0.4684208040058576, "grad_norm": 1.8837630728466634, "learning_rate": 6.769220383105131e-07, "logits/chosen": 1.556640625, "logits/rejected": 1.1171875, "logps/chosen": -932.0, "logps/rejected": -1213.0, "loss": 0.535, "rewards/accuracies": 0.90625, "rewards/chosen": 0.76220703125, "rewards/margins": 4.86328125, "rewards/rejected": -4.1015625, "step": 2479 }, { "epoch": 0.4686097595540649, "grad_norm": 2.2313190134129073, "learning_rate": 6.766372490579209e-07, "logits/chosen": 3.12109375, "logits/rejected": 1.998046875, "logps/chosen": -596.0, "logps/rejected": -591.5, "loss": 0.5836, "rewards/accuracies": 0.8125, "rewards/chosen": 0.74609375, "rewards/margins": 3.875, "rewards/rejected": -3.126953125, "step": 2480 }, { "epoch": 0.46879871510227217, "grad_norm": 1.9410945602574505, "learning_rate": 6.763524047118702e-07, "logits/chosen": 3.046875, "logits/rejected": 2.25390625, "logps/chosen": -1636.0, "logps/rejected": -1015.0, "loss": 0.567, "rewards/accuracies": 0.75, "rewards/chosen": 2.0263671875, "rewards/margins": 4.53125, "rewards/rejected": -2.50244140625, "step": 2481 }, { "epoch": 0.46898767065047947, "grad_norm": 1.7168625689730936, "learning_rate": 6.76067505396283e-07, "logits/chosen": 2.240234375, "logits/rejected": 2.14453125, "logps/chosen": -940.0, "logps/rejected": -1024.5, "loss": 0.7052, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8107452392578125, "rewards/margins": 3.9765625, "rewards/rejected": -3.1640625, "step": 2482 }, { "epoch": 0.4691766261986868, "grad_norm": 1.792991340694895, "learning_rate": 6.757825512351038e-07, "logits/chosen": 2.666015625, "logits/rejected": 1.947265625, "logps/chosen": -1588.5, "logps/rejected": -1433.0, "loss": 0.4355, "rewards/accuracies": 0.9375, "rewards/chosen": 1.728515625, "rewards/margins": 7.3046875, "rewards/rejected": -5.578125, "step": 2483 }, { "epoch": 0.469365581746894, "grad_norm": 1.8684801254351857, "learning_rate": 6.754975423523023e-07, "logits/chosen": 1.6689453125, "logits/rejected": 1.927734375, "logps/chosen": -658.5, "logps/rejected": -865.0, "loss": 0.6109, "rewards/accuracies": 0.8125, "rewards/chosen": 0.40673828125, "rewards/margins": 4.68359375, "rewards/rejected": -4.28125, "step": 2484 }, { "epoch": 0.4695545372951013, "grad_norm": 2.4219325734495696, "learning_rate": 6.752124788718711e-07, "logits/chosen": 2.18359375, "logits/rejected": 1.65625, "logps/chosen": -1136.0, "logps/rejected": -1010.0, "loss": 0.5505, "rewards/accuracies": 0.84375, "rewards/chosen": 0.796875, "rewards/margins": 5.1171875, "rewards/rejected": -4.3203125, "step": 2485 }, { "epoch": 0.46974349284330863, "grad_norm": 1.90318827946158, "learning_rate": 6.749273609178269e-07, "logits/chosen": 1.931640625, "logits/rejected": 1.955078125, "logps/chosen": -966.0, "logps/rejected": -1234.0, "loss": 0.4629, "rewards/accuracies": 0.90625, "rewards/chosen": 1.677734375, "rewards/margins": 5.640625, "rewards/rejected": -3.953125, "step": 2486 }, { "epoch": 0.4699324483915159, "grad_norm": 2.4730181026504905, "learning_rate": 6.746421886142105e-07, "logits/chosen": 3.041015625, "logits/rejected": 2.2880859375, "logps/chosen": -616.0, "logps/rejected": -942.0, "loss": 0.6412, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1661376953125, "rewards/margins": 5.1328125, "rewards/rejected": -4.9765625, "step": 2487 }, { "epoch": 0.4701214039397232, "grad_norm": 1.8034227130643756, "learning_rate": 6.743569620850856e-07, "logits/chosen": 3.154296875, "logits/rejected": 3.078125, "logps/chosen": -534.0, "logps/rejected": -540.5, "loss": 0.6659, "rewards/accuracies": 0.71875, "rewards/chosen": 0.53369140625, "rewards/margins": 3.00390625, "rewards/rejected": -2.46728515625, "step": 2488 }, { "epoch": 0.4703103594879305, "grad_norm": 1.7938634271672276, "learning_rate": 6.740716814545398e-07, "logits/chosen": 2.43359375, "logits/rejected": 2.798828125, "logps/chosen": -729.0, "logps/rejected": -984.0, "loss": 0.6206, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41845703125, "rewards/margins": 4.31640625, "rewards/rejected": -4.734375, "step": 2489 }, { "epoch": 0.47049931503613773, "grad_norm": 1.900101095093472, "learning_rate": 6.737863468466842e-07, "logits/chosen": 2.4384765625, "logits/rejected": 2.5390625, "logps/chosen": -884.0, "logps/rejected": -1031.0, "loss": 0.5287, "rewards/accuracies": 0.84375, "rewards/chosen": 1.137939453125, "rewards/margins": 4.3359375, "rewards/rejected": -3.19140625, "step": 2490 }, { "epoch": 0.47068827058434504, "grad_norm": 2.7488302671263307, "learning_rate": 6.735009583856539e-07, "logits/chosen": 2.0947265625, "logits/rejected": 1.76171875, "logps/chosen": -706.0, "logps/rejected": -590.0, "loss": 0.5417, "rewards/accuracies": 0.875, "rewards/chosen": 1.16455078125, "rewards/margins": 4.69921875, "rewards/rejected": -3.53515625, "step": 2491 }, { "epoch": 0.47087722613255234, "grad_norm": 2.9837754982413682, "learning_rate": 6.732155161956066e-07, "logits/chosen": 2.2265625, "logits/rejected": 2.1640625, "logps/chosen": -812.5, "logps/rejected": -592.5, "loss": 0.6982, "rewards/accuracies": 0.625, "rewards/chosen": -0.3043212890625, "rewards/margins": 2.8984375, "rewards/rejected": -3.19921875, "step": 2492 }, { "epoch": 0.4710661816807596, "grad_norm": 1.885438773842971, "learning_rate": 6.729300204007238e-07, "logits/chosen": 2.669921875, "logits/rejected": 2.685546875, "logps/chosen": -1140.0, "logps/rejected": -1872.0, "loss": 0.5107, "rewards/accuracies": 0.875, "rewards/chosen": 0.93853759765625, "rewards/margins": 9.015625, "rewards/rejected": -8.08984375, "step": 2493 }, { "epoch": 0.4712551372289669, "grad_norm": 1.5877307435588897, "learning_rate": 6.726444711252102e-07, "logits/chosen": 2.953125, "logits/rejected": 2.30859375, "logps/chosen": -696.5, "logps/rejected": -610.5, "loss": 0.6757, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1883544921875, "rewards/margins": 3.64453125, "rewards/rejected": -3.45703125, "step": 2494 }, { "epoch": 0.4714440927771742, "grad_norm": 2.6111797494816793, "learning_rate": 6.72358868493294e-07, "logits/chosen": 1.57568359375, "logits/rejected": 1.29931640625, "logps/chosen": -788.0, "logps/rejected": -567.5, "loss": 0.5722, "rewards/accuracies": 0.9375, "rewards/chosen": 1.201171875, "rewards/margins": 3.640625, "rewards/rejected": -2.427734375, "step": 2495 }, { "epoch": 0.47163304832538144, "grad_norm": 1.9524872693524955, "learning_rate": 6.720732126292264e-07, "logits/chosen": 1.962890625, "logits/rejected": 1.6806640625, "logps/chosen": -765.75, "logps/rejected": -750.0, "loss": 0.5775, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1279296875, "rewards/margins": 4.37109375, "rewards/rejected": -3.2421875, "step": 2496 }, { "epoch": 0.47182200387358875, "grad_norm": 3.333537155181325, "learning_rate": 6.717875036572817e-07, "logits/chosen": 3.61328125, "logits/rejected": 3.201171875, "logps/chosen": -583.0, "logps/rejected": -665.0, "loss": 0.6584, "rewards/accuracies": 0.78125, "rewards/chosen": 0.342437744140625, "rewards/margins": 4.17236328125, "rewards/rejected": -3.83642578125, "step": 2497 }, { "epoch": 0.472010959421796, "grad_norm": 2.0437804560710613, "learning_rate": 6.715017417017576e-07, "logits/chosen": 2.515625, "logits/rejected": 2.55078125, "logps/chosen": -918.0, "logps/rejected": -1984.0, "loss": 0.5657, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1160888671875, "rewards/margins": 5.93359375, "rewards/rejected": -4.8125, "step": 2498 }, { "epoch": 0.4721999149700033, "grad_norm": 2.4361804708627903, "learning_rate": 6.712159268869745e-07, "logits/chosen": 2.78125, "logits/rejected": 2.70703125, "logps/chosen": -522.0, "logps/rejected": -818.0, "loss": 0.676, "rewards/accuracies": 0.78125, "rewards/chosen": 0.423095703125, "rewards/margins": 3.810546875, "rewards/rejected": -3.390625, "step": 2499 }, { "epoch": 0.4723888705182106, "grad_norm": 2.191457417899378, "learning_rate": 6.709300593372761e-07, "logits/chosen": 2.927734375, "logits/rejected": 3.0390625, "logps/chosen": -825.5, "logps/rejected": -1583.0, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.8193359375, "rewards/margins": 6.498046875, "rewards/rejected": -5.69921875, "step": 2500 }, { "epoch": 0.47257782606641785, "grad_norm": 1.9576455546495874, "learning_rate": 6.70644139177029e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.83984375, "logps/chosen": -753.0, "logps/rejected": -747.5, "loss": 0.7055, "rewards/accuracies": 0.6875, "rewards/chosen": 0.363037109375, "rewards/margins": 2.779296875, "rewards/rejected": -2.4140625, "step": 2501 }, { "epoch": 0.47276678161462515, "grad_norm": 2.0981335555289813, "learning_rate": 6.703581665306222e-07, "logits/chosen": 2.484375, "logits/rejected": 2.83984375, "logps/chosen": -577.5, "logps/rejected": -1204.0, "loss": 0.5579, "rewards/accuracies": 0.875, "rewards/chosen": 0.95947265625, "rewards/margins": 5.890625, "rewards/rejected": -4.94140625, "step": 2502 }, { "epoch": 0.47295573716283246, "grad_norm": 2.5496107350770982, "learning_rate": 6.700721415224681e-07, "logits/chosen": 2.673828125, "logits/rejected": 2.3125, "logps/chosen": -665.0, "logps/rejected": -726.0, "loss": 0.5982, "rewards/accuracies": 0.78125, "rewards/chosen": 0.330078125, "rewards/margins": 3.82421875, "rewards/rejected": -3.5, "step": 2503 }, { "epoch": 0.4731446927110397, "grad_norm": 2.0604653173717047, "learning_rate": 6.697860642770019e-07, "logits/chosen": 3.703125, "logits/rejected": 3.33984375, "logps/chosen": -877.0, "logps/rejected": -1568.0, "loss": 0.6929, "rewards/accuracies": 0.6875, "rewards/chosen": -0.138671875, "rewards/margins": 3.1181640625, "rewards/rejected": -3.255859375, "step": 2504 }, { "epoch": 0.473333648259247, "grad_norm": 1.695001538862601, "learning_rate": 6.694999349186813e-07, "logits/chosen": 3.25390625, "logits/rejected": 2.734375, "logps/chosen": -1029.5, "logps/rejected": -767.0, "loss": 0.4493, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7998046875, "rewards/margins": 5.515625, "rewards/rejected": -3.70703125, "step": 2505 }, { "epoch": 0.4735226038074543, "grad_norm": 1.813275078400938, "learning_rate": 6.692137535719865e-07, "logits/chosen": 3.28125, "logits/rejected": 2.9453125, "logps/chosen": -579.0, "logps/rejected": -726.0, "loss": 0.6837, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6875, "rewards/margins": 2.8544921875, "rewards/rejected": -2.162109375, "step": 2506 }, { "epoch": 0.47371155935566156, "grad_norm": 2.0905380191281004, "learning_rate": 6.689275203614206e-07, "logits/chosen": 2.41015625, "logits/rejected": 2.25390625, "logps/chosen": -1123.0, "logps/rejected": -1095.0, "loss": 0.6274, "rewards/accuracies": 0.71875, "rewards/chosen": 1.391357421875, "rewards/margins": 5.4892578125, "rewards/rejected": -4.095703125, "step": 2507 }, { "epoch": 0.47390051490386886, "grad_norm": 2.1948607340678925, "learning_rate": 6.686412354115092e-07, "logits/chosen": 1.7021484375, "logits/rejected": 1.693359375, "logps/chosen": -780.0, "logps/rejected": -1388.0, "loss": 0.5559, "rewards/accuracies": 0.84375, "rewards/chosen": 0.927490234375, "rewards/margins": 10.86328125, "rewards/rejected": -9.9453125, "step": 2508 }, { "epoch": 0.47408947045207617, "grad_norm": 1.9955573725032267, "learning_rate": 6.683548988468005e-07, "logits/chosen": 2.416015625, "logits/rejected": 2.98046875, "logps/chosen": -551.0, "logps/rejected": -557.0, "loss": 0.7668, "rewards/accuracies": 0.71875, "rewards/chosen": 0.154541015625, "rewards/margins": 2.17578125, "rewards/rejected": -2.02587890625, "step": 2509 }, { "epoch": 0.4742784260002834, "grad_norm": 2.642778010912865, "learning_rate": 6.680685107918649e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.435546875, "logps/chosen": -773.0, "logps/rejected": -814.5, "loss": 0.5698, "rewards/accuracies": 0.875, "rewards/chosen": 1.166015625, "rewards/margins": 4.0703125, "rewards/rejected": -2.90625, "step": 2510 }, { "epoch": 0.4744673815484907, "grad_norm": 2.6646484940295907, "learning_rate": 6.677820713712953e-07, "logits/chosen": 2.67578125, "logits/rejected": 2.05078125, "logps/chosen": -874.5, "logps/rejected": -824.5, "loss": 0.5947, "rewards/accuracies": 0.875, "rewards/chosen": 0.79345703125, "rewards/margins": 3.39453125, "rewards/rejected": -2.6015625, "step": 2511 }, { "epoch": 0.474656337096698, "grad_norm": 1.8257957532081937, "learning_rate": 6.674955807097071e-07, "logits/chosen": 2.919921875, "logits/rejected": 2.73828125, "logps/chosen": -1077.5, "logps/rejected": -1092.0, "loss": 0.6192, "rewards/accuracies": 0.78125, "rewards/chosen": 0.62158203125, "rewards/margins": 4.07421875, "rewards/rejected": -3.4609375, "step": 2512 }, { "epoch": 0.47484529264490527, "grad_norm": 1.822199558429811, "learning_rate": 6.67209038931738e-07, "logits/chosen": 3.001708984375, "logits/rejected": 2.7265625, "logps/chosen": -709.5, "logps/rejected": -1196.0, "loss": 0.6444, "rewards/accuracies": 0.8125, "rewards/chosen": 0.673828125, "rewards/margins": 4.421875, "rewards/rejected": -3.75, "step": 2513 }, { "epoch": 0.4750342481931126, "grad_norm": 2.0491389364584096, "learning_rate": 6.669224461620476e-07, "logits/chosen": 2.431640625, "logits/rejected": 2.201171875, "logps/chosen": -868.0, "logps/rejected": -2064.0, "loss": 0.4225, "rewards/accuracies": 0.875, "rewards/chosen": 1.33251953125, "rewards/margins": 6.09375, "rewards/rejected": -4.765625, "step": 2514 }, { "epoch": 0.4752232037413199, "grad_norm": 1.7024731779766658, "learning_rate": 6.666358025253183e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.2841796875, "logps/chosen": -986.0, "logps/rejected": -1056.0, "loss": 0.5333, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4189453125, "rewards/margins": 4.71875, "rewards/rejected": -3.29296875, "step": 2515 }, { "epoch": 0.4754121592895271, "grad_norm": 1.593918211291237, "learning_rate": 6.663491081462536e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.0, "logps/chosen": -595.0, "logps/rejected": -782.0, "loss": 0.5447, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3369140625, "rewards/margins": 7.1875, "rewards/rejected": -5.84765625, "step": 2516 }, { "epoch": 0.47560111483773443, "grad_norm": 2.1227177223401905, "learning_rate": 6.660623631495802e-07, "logits/chosen": 1.951171875, "logits/rejected": 2.072265625, "logps/chosen": -704.5, "logps/rejected": -636.5, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": 0.707275390625, "rewards/margins": 3.34765625, "rewards/rejected": -2.63671875, "step": 2517 }, { "epoch": 0.47579007038594173, "grad_norm": 2.1434702811575654, "learning_rate": 6.657755676600461e-07, "logits/chosen": 2.02734375, "logits/rejected": 1.595703125, "logps/chosen": -780.5, "logps/rejected": -740.0, "loss": 0.5907, "rewards/accuracies": 0.84375, "rewards/chosen": 0.727294921875, "rewards/margins": 3.265625, "rewards/rejected": -2.533203125, "step": 2518 }, { "epoch": 0.475979025934149, "grad_norm": 1.6009774653221491, "learning_rate": 6.654887218024219e-07, "logits/chosen": 1.94921875, "logits/rejected": 1.3623046875, "logps/chosen": -554.5, "logps/rejected": -489.75, "loss": 0.7508, "rewards/accuracies": 0.8125, "rewards/chosen": 0.098388671875, "rewards/margins": 2.3623046875, "rewards/rejected": -2.265625, "step": 2519 }, { "epoch": 0.4761679814823563, "grad_norm": 1.8817307254532483, "learning_rate": 6.652018257014993e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.322265625, "logps/chosen": -881.5, "logps/rejected": -885.0, "loss": 0.6008, "rewards/accuracies": 0.8125, "rewards/chosen": 1.16796875, "rewards/margins": 7.3671875, "rewards/rejected": -6.1875, "step": 2520 }, { "epoch": 0.4763569370305636, "grad_norm": 1.8686889188875848, "learning_rate": 6.649148794820922e-07, "logits/chosen": 2.830078125, "logits/rejected": 2.64453125, "logps/chosen": -1116.0, "logps/rejected": -1321.0, "loss": 0.4873, "rewards/accuracies": 0.90625, "rewards/chosen": 1.716796875, "rewards/margins": 7.5, "rewards/rejected": -5.79296875, "step": 2521 }, { "epoch": 0.47654589257877084, "grad_norm": 1.4700455512532111, "learning_rate": 6.646278832690368e-07, "logits/chosen": 1.521484375, "logits/rejected": 1.26171875, "logps/chosen": -471.0, "logps/rejected": -593.0, "loss": 0.5274, "rewards/accuracies": 0.875, "rewards/chosen": 0.9951171875, "rewards/margins": 4.4453125, "rewards/rejected": -3.44921875, "step": 2522 }, { "epoch": 0.47673484812697814, "grad_norm": 3.136201450296688, "learning_rate": 6.643408371871905e-07, "logits/chosen": 2.11181640625, "logits/rejected": 1.9091796875, "logps/chosen": -675.0, "logps/rejected": -683.5, "loss": 0.5166, "rewards/accuracies": 0.875, "rewards/chosen": 0.94775390625, "rewards/margins": 5.19921875, "rewards/rejected": -4.265625, "step": 2523 }, { "epoch": 0.4769238036751854, "grad_norm": 2.5753885857931853, "learning_rate": 6.640537413614323e-07, "logits/chosen": 2.1748046875, "logits/rejected": 1.7939453125, "logps/chosen": -899.0, "logps/rejected": -1056.0, "loss": 0.5829, "rewards/accuracies": 0.84375, "rewards/chosen": 1.29296875, "rewards/margins": 4.16796875, "rewards/rejected": -2.87109375, "step": 2524 }, { "epoch": 0.4771127592233927, "grad_norm": 2.5512364532922707, "learning_rate": 6.637665959166631e-07, "logits/chosen": 2.0810546875, "logits/rejected": 1.859375, "logps/chosen": -731.5, "logps/rejected": -651.0, "loss": 0.5116, "rewards/accuracies": 0.875, "rewards/chosen": 0.705078125, "rewards/margins": 3.57421875, "rewards/rejected": -2.8671875, "step": 2525 }, { "epoch": 0.4773017147716, "grad_norm": 2.544589568228627, "learning_rate": 6.634794009778055e-07, "logits/chosen": 3.1640625, "logits/rejected": 2.8984375, "logps/chosen": -565.5, "logps/rejected": -1299.0, "loss": 0.544, "rewards/accuracies": 0.90625, "rewards/chosen": 1.05078125, "rewards/margins": 6.046875, "rewards/rejected": -4.994140625, "step": 2526 }, { "epoch": 0.47749067031980724, "grad_norm": 2.654673162923047, "learning_rate": 6.631921566698036e-07, "logits/chosen": 2.451171875, "logits/rejected": 2.63671875, "logps/chosen": -766.0, "logps/rejected": -1068.0, "loss": 0.5159, "rewards/accuracies": 0.875, "rewards/chosen": 1.671875, "rewards/margins": 4.9921875, "rewards/rejected": -3.32421875, "step": 2527 }, { "epoch": 0.47767962586801455, "grad_norm": 2.0458483206649056, "learning_rate": 6.629048631176226e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.16796875, "logps/chosen": -1215.0, "logps/rejected": -965.75, "loss": 0.5717, "rewards/accuracies": 0.875, "rewards/chosen": 1.32421875, "rewards/margins": 4.515625, "rewards/rejected": -3.185546875, "step": 2528 }, { "epoch": 0.47786858141622185, "grad_norm": 1.7265243170475586, "learning_rate": 6.626175204462494e-07, "logits/chosen": 2.63671875, "logits/rejected": 2.35546875, "logps/chosen": -620.5, "logps/rejected": -598.0, "loss": 0.7084, "rewards/accuracies": 0.75, "rewards/chosen": 0.0767822265625, "rewards/margins": 3.0703125, "rewards/rejected": -2.9873046875, "step": 2529 }, { "epoch": 0.4780575369644291, "grad_norm": 2.1831820847792964, "learning_rate": 6.623301287806925e-07, "logits/chosen": 4.1796875, "logits/rejected": 2.630859375, "logps/chosen": -651.75, "logps/rejected": -659.5, "loss": 0.6017, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2686767578125, "rewards/margins": 3.92578125, "rewards/rejected": -3.66015625, "step": 2530 }, { "epoch": 0.4782464925126364, "grad_norm": 1.9105172552163519, "learning_rate": 6.620426882459811e-07, "logits/chosen": 2.24609375, "logits/rejected": 2.2578125, "logps/chosen": -600.0, "logps/rejected": -1000.0, "loss": 0.7094, "rewards/accuracies": 0.75, "rewards/chosen": -0.168701171875, "rewards/margins": 4.0703125, "rewards/rejected": -4.24609375, "step": 2531 }, { "epoch": 0.4784354480608437, "grad_norm": 1.876893502494885, "learning_rate": 6.617551989671663e-07, "logits/chosen": 3.6015625, "logits/rejected": 3.23046875, "logps/chosen": -711.0, "logps/rejected": -1637.0, "loss": 0.5574, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5245361328125, "rewards/margins": 6.59765625, "rewards/rejected": -6.0859375, "step": 2532 }, { "epoch": 0.47862440360905095, "grad_norm": 2.668137134798757, "learning_rate": 6.614676610693202e-07, "logits/chosen": 1.68359375, "logits/rejected": 1.6748046875, "logps/chosen": -972.0, "logps/rejected": -1457.0, "loss": 0.5542, "rewards/accuracies": 0.96875, "rewards/chosen": -0.359375, "rewards/margins": 7.5859375, "rewards/rejected": -7.9453125, "step": 2533 }, { "epoch": 0.47881335915725826, "grad_norm": 1.788489335154895, "learning_rate": 6.611800746775358e-07, "logits/chosen": 2.43359375, "logits/rejected": 2.1640625, "logps/chosen": -817.5, "logps/rejected": -724.0, "loss": 0.6402, "rewards/accuracies": 0.78125, "rewards/chosen": 0.22900390625, "rewards/margins": 4.75390625, "rewards/rejected": -4.53125, "step": 2534 }, { "epoch": 0.47900231470546556, "grad_norm": 2.8861049731272876, "learning_rate": 6.608924399169274e-07, "logits/chosen": 2.548828125, "logits/rejected": 2.697265625, "logps/chosen": -1037.0, "logps/rejected": -1174.0, "loss": 0.5223, "rewards/accuracies": 0.84375, "rewards/chosen": 1.05859375, "rewards/margins": 5.859375, "rewards/rejected": -4.796875, "step": 2535 }, { "epoch": 0.4791912702536728, "grad_norm": 1.9050509050852196, "learning_rate": 6.606047569126304e-07, "logits/chosen": 1.62890625, "logits/rejected": 1.974609375, "logps/chosen": -1021.0, "logps/rejected": -2079.0, "loss": 0.6049, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7060546875, "rewards/margins": 6.359375, "rewards/rejected": -5.6484375, "step": 2536 }, { "epoch": 0.4793802258018801, "grad_norm": 2.350572560020751, "learning_rate": 6.603170257898012e-07, "logits/chosen": 2.0576171875, "logits/rejected": 1.569091796875, "logps/chosen": -469.0, "logps/rejected": -471.5, "loss": 0.5697, "rewards/accuracies": 0.875, "rewards/chosen": -0.03515625, "rewards/margins": 4.08203125, "rewards/rejected": -4.11328125, "step": 2537 }, { "epoch": 0.4795691813500874, "grad_norm": 2.0568031483487226, "learning_rate": 6.600292466736167e-07, "logits/chosen": 2.78125, "logits/rejected": 2.19921875, "logps/chosen": -1048.0, "logps/rejected": -1008.0, "loss": 0.487, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0438232421875, "rewards/margins": 4.6484375, "rewards/rejected": -3.6015625, "step": 2538 }, { "epoch": 0.47975813689829466, "grad_norm": 1.8568265885530015, "learning_rate": 6.597414196892753e-07, "logits/chosen": 1.28662109375, "logits/rejected": 1.2880859375, "logps/chosen": -616.0, "logps/rejected": -677.0, "loss": 0.6569, "rewards/accuracies": 0.875, "rewards/chosen": -0.2457275390625, "rewards/margins": 3.15234375, "rewards/rejected": -3.390625, "step": 2539 }, { "epoch": 0.47994709244650197, "grad_norm": 2.1328827768389433, "learning_rate": 6.594535449619961e-07, "logits/chosen": 2.0869140625, "logits/rejected": 1.87353515625, "logps/chosen": -738.0, "logps/rejected": -767.0, "loss": 0.6196, "rewards/accuracies": 0.8125, "rewards/chosen": 0.327880859375, "rewards/margins": 3.98828125, "rewards/rejected": -3.65625, "step": 2540 }, { "epoch": 0.48013604799470927, "grad_norm": 1.8135873394353084, "learning_rate": 6.591656226170187e-07, "logits/chosen": 1.4931640625, "logits/rejected": 1.3955078125, "logps/chosen": -817.0, "logps/rejected": -911.0, "loss": 0.5701, "rewards/accuracies": 0.8125, "rewards/chosen": 1.18701171875, "rewards/margins": 4.640625, "rewards/rejected": -3.453125, "step": 2541 }, { "epoch": 0.4803250035429165, "grad_norm": 4.016444313955143, "learning_rate": 6.588776527796033e-07, "logits/chosen": 2.1572265625, "logits/rejected": 2.30859375, "logps/chosen": -818.0, "logps/rejected": -1732.0, "loss": 0.6027, "rewards/accuracies": 0.75, "rewards/chosen": 0.70849609375, "rewards/margins": 6.4296875, "rewards/rejected": -5.73828125, "step": 2542 }, { "epoch": 0.4805139590911238, "grad_norm": 1.6588704205617848, "learning_rate": 6.585896355750312e-07, "logits/chosen": 2.828125, "logits/rejected": 3.09375, "logps/chosen": -578.5, "logps/rejected": -731.5, "loss": 0.58, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2626953125, "rewards/margins": 3.98828125, "rewards/rejected": -3.7265625, "step": 2543 }, { "epoch": 0.4807029146393311, "grad_norm": 1.9906351556978397, "learning_rate": 6.583015711286042e-07, "logits/chosen": 2.25390625, "logits/rejected": 2.1953125, "logps/chosen": -935.0, "logps/rejected": -967.0, "loss": 0.55, "rewards/accuracies": 0.875, "rewards/chosen": 0.5867919921875, "rewards/margins": 4.24609375, "rewards/rejected": -3.6640625, "step": 2544 }, { "epoch": 0.4808918701875384, "grad_norm": 1.578710689991355, "learning_rate": 6.580134595656445e-07, "logits/chosen": 3.435546875, "logits/rejected": 3.08203125, "logps/chosen": -752.5, "logps/rejected": -765.0, "loss": 0.5967, "rewards/accuracies": 0.8125, "rewards/chosen": 0.451904296875, "rewards/margins": 4.52734375, "rewards/rejected": -4.0703125, "step": 2545 }, { "epoch": 0.4810808257357457, "grad_norm": 2.2397794799376776, "learning_rate": 6.577253010114946e-07, "logits/chosen": 2.037109375, "logits/rejected": 2.14453125, "logps/chosen": -524.5, "logps/rejected": -704.5, "loss": 0.718, "rewards/accuracies": 0.6875, "rewards/chosen": 0.157958984375, "rewards/margins": 3.509765625, "rewards/rejected": -3.3447265625, "step": 2546 }, { "epoch": 0.4812697812839529, "grad_norm": 2.4804263289968853, "learning_rate": 6.574370955915181e-07, "logits/chosen": 2.958984375, "logits/rejected": 2.27734375, "logps/chosen": -888.0, "logps/rejected": -811.0, "loss": 0.4971, "rewards/accuracies": 0.90625, "rewards/chosen": 1.12890625, "rewards/margins": 4.650390625, "rewards/rejected": -3.515625, "step": 2547 }, { "epoch": 0.48145873683216023, "grad_norm": 1.6078096349296704, "learning_rate": 6.571488434310984e-07, "logits/chosen": 2.5419921875, "logits/rejected": 2.30078125, "logps/chosen": -998.0, "logps/rejected": -2112.0, "loss": 0.5038, "rewards/accuracies": 0.84375, "rewards/chosen": 1.28955078125, "rewards/margins": 6.6171875, "rewards/rejected": -5.3359375, "step": 2548 }, { "epoch": 0.48164769238036753, "grad_norm": 2.7468750957009473, "learning_rate": 6.568605446556394e-07, "logits/chosen": 2.27734375, "logits/rejected": 2.048828125, "logps/chosen": -783.0, "logps/rejected": -737.5, "loss": 0.6357, "rewards/accuracies": 0.78125, "rewards/chosen": 0.669921875, "rewards/margins": 3.142578125, "rewards/rejected": -2.474609375, "step": 2549 }, { "epoch": 0.4818366479285748, "grad_norm": 3.5500825865859107, "learning_rate": 6.565721993905656e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.69140625, "logps/chosen": -741.5, "logps/rejected": -925.0, "loss": 0.5359, "rewards/accuracies": 0.84375, "rewards/chosen": 1.28436279296875, "rewards/margins": 4.14453125, "rewards/rejected": -2.85546875, "step": 2550 }, { "epoch": 0.4820256034767821, "grad_norm": 2.597504350018731, "learning_rate": 6.562838077613208e-07, "logits/chosen": 2.0654296875, "logits/rejected": 1.49609375, "logps/chosen": -1030.0, "logps/rejected": -928.0, "loss": 0.55, "rewards/accuracies": 0.875, "rewards/chosen": 0.791015625, "rewards/margins": 4.44140625, "rewards/rejected": -3.6484375, "step": 2551 }, { "epoch": 0.4822145590249894, "grad_norm": 1.9873135622233318, "learning_rate": 6.559953698933701e-07, "logits/chosen": 2.1796875, "logits/rejected": 1.7197265625, "logps/chosen": -698.0, "logps/rejected": -619.0, "loss": 0.6207, "rewards/accuracies": 0.78125, "rewards/chosen": 0.48974609375, "rewards/margins": 3.8515625, "rewards/rejected": -3.3671875, "step": 2552 }, { "epoch": 0.48240351457319663, "grad_norm": 1.8480945618913578, "learning_rate": 6.557068859121984e-07, "logits/chosen": 2.798828125, "logits/rejected": 2.623046875, "logps/chosen": -1340.0, "logps/rejected": -1066.0, "loss": 0.608, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7797622680664062, "rewards/margins": 4.77294921875, "rewards/rejected": -4.00390625, "step": 2553 }, { "epoch": 0.48259247012140394, "grad_norm": 2.919105496671924, "learning_rate": 6.5541835594331e-07, "logits/chosen": 3.70703125, "logits/rejected": 4.11328125, "logps/chosen": -801.0, "logps/rejected": -1800.0, "loss": 0.614, "rewards/accuracies": 0.78125, "rewards/chosen": 0.765625, "rewards/margins": 4.796875, "rewards/rejected": -4.02734375, "step": 2554 }, { "epoch": 0.48278142566961124, "grad_norm": 1.6358488565256841, "learning_rate": 6.551297801122299e-07, "logits/chosen": 3.5625, "logits/rejected": 3.03125, "logps/chosen": -772.5, "logps/rejected": -729.5, "loss": 0.596, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9248046875, "rewards/margins": 5.17578125, "rewards/rejected": -4.25390625, "step": 2555 }, { "epoch": 0.4829703812178185, "grad_norm": 2.39202694296642, "learning_rate": 6.548411585445027e-07, "logits/chosen": 2.3203125, "logits/rejected": 1.88671875, "logps/chosen": -748.0, "logps/rejected": -875.0, "loss": 0.621, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33251953125, "rewards/margins": 3.99609375, "rewards/rejected": -3.6640625, "step": 2556 }, { "epoch": 0.4831593367660258, "grad_norm": 2.2539757630365065, "learning_rate": 6.545524913656932e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.544921875, "logps/chosen": -853.0, "logps/rejected": -1004.0, "loss": 0.6095, "rewards/accuracies": 0.875, "rewards/chosen": 0.6954345703125, "rewards/margins": 4.03515625, "rewards/rejected": -3.34765625, "step": 2557 }, { "epoch": 0.4833482923142331, "grad_norm": 1.864543567756037, "learning_rate": 6.542637787013861e-07, "logits/chosen": 3.28125, "logits/rejected": 2.89453125, "logps/chosen": -938.0, "logps/rejected": -880.0, "loss": 0.4648, "rewards/accuracies": 0.8125, "rewards/chosen": 1.6083984375, "rewards/margins": 6.4296875, "rewards/rejected": -4.828125, "step": 2558 }, { "epoch": 0.48353724786244034, "grad_norm": 1.9920072626585903, "learning_rate": 6.53975020677185e-07, "logits/chosen": 2.1953125, "logits/rejected": 2.076171875, "logps/chosen": -742.5, "logps/rejected": -1065.0, "loss": 0.7087, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0535888671875, "rewards/margins": 3.975830078125, "rewards/rejected": -4.044921875, "step": 2559 }, { "epoch": 0.48372620341064765, "grad_norm": 1.9973813640117377, "learning_rate": 6.536862174187147e-07, "logits/chosen": 3.1171875, "logits/rejected": 2.560546875, "logps/chosen": -2507.5, "logps/rejected": -448.5, "loss": 0.632, "rewards/accuracies": 0.6875, "rewards/chosen": -9.5390625, "rewards/margins": -6.7734375, "rewards/rejected": -2.751953125, "step": 2560 }, { "epoch": 0.48391515895885495, "grad_norm": 2.753393096411817, "learning_rate": 6.533973690516181e-07, "logits/chosen": 1.111328125, "logits/rejected": 1.22265625, "logps/chosen": -532.0, "logps/rejected": -533.0, "loss": 0.7326, "rewards/accuracies": 0.78125, "rewards/chosen": 0.119140625, "rewards/margins": 2.67578125, "rewards/rejected": -2.5546875, "step": 2561 }, { "epoch": 0.4841041145070622, "grad_norm": 3.9766825998121744, "learning_rate": 6.531084757015593e-07, "logits/chosen": 3.080078125, "logits/rejected": 3.06640625, "logps/chosen": -1004.5, "logps/rejected": -897.0, "loss": 0.6014, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4716796875, "rewards/margins": 3.736328125, "rewards/rejected": -3.26953125, "step": 2562 }, { "epoch": 0.4842930700552695, "grad_norm": 2.4280785141167116, "learning_rate": 6.528195374942205e-07, "logits/chosen": 2.48828125, "logits/rejected": 2.474609375, "logps/chosen": -910.0, "logps/rejected": -972.0, "loss": 0.6605, "rewards/accuracies": 0.78125, "rewards/chosen": 0.53515625, "rewards/margins": 4.4375, "rewards/rejected": -3.90234375, "step": 2563 }, { "epoch": 0.4844820256034768, "grad_norm": 2.4279632925018277, "learning_rate": 6.525305545553046e-07, "logits/chosen": 3.1484375, "logits/rejected": 3.33984375, "logps/chosen": -940.0, "logps/rejected": -1267.0, "loss": 0.7091, "rewards/accuracies": 0.75, "rewards/chosen": 0.19677734375, "rewards/margins": 4.3671875, "rewards/rejected": -4.1640625, "step": 2564 }, { "epoch": 0.48467098115168405, "grad_norm": 2.0901844358121493, "learning_rate": 6.522415270105329e-07, "logits/chosen": 2.951171875, "logits/rejected": 2.68359375, "logps/chosen": -771.5, "logps/rejected": -995.0, "loss": 0.622, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5390625, "rewards/margins": 4.12109375, "rewards/rejected": -3.5859375, "step": 2565 }, { "epoch": 0.48485993669989136, "grad_norm": 1.8824900417500217, "learning_rate": 6.519524549856472e-07, "logits/chosen": 1.59765625, "logits/rejected": 2.080078125, "logps/chosen": -505.5, "logps/rejected": -709.5, "loss": 0.5856, "rewards/accuracies": 0.90625, "rewards/chosen": 0.32763671875, "rewards/margins": 4.13671875, "rewards/rejected": -3.810546875, "step": 2566 }, { "epoch": 0.48504889224809866, "grad_norm": 1.6853287525482399, "learning_rate": 6.516633386064079e-07, "logits/chosen": 2.2421875, "logits/rejected": 2.080078125, "logps/chosen": -1139.0, "logps/rejected": -1253.0, "loss": 0.5534, "rewards/accuracies": 0.8125, "rewards/chosen": 1.091796875, "rewards/margins": 6.18359375, "rewards/rejected": -5.09375, "step": 2567 }, { "epoch": 0.4852378477963059, "grad_norm": 2.1146897570730268, "learning_rate": 6.513741779985947e-07, "logits/chosen": 2.541015625, "logits/rejected": 2.3984375, "logps/chosen": -836.5, "logps/rejected": -1557.0, "loss": 0.6428, "rewards/accuracies": 0.65625, "rewards/chosen": -0.511474609375, "rewards/margins": 6.83984375, "rewards/rejected": -7.37109375, "step": 2568 }, { "epoch": 0.4854268033445132, "grad_norm": 2.5896235317094924, "learning_rate": 6.510849732880072e-07, "logits/chosen": 3.55078125, "logits/rejected": 3.390625, "logps/chosen": -700.0, "logps/rejected": -655.0, "loss": 0.6932, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02783203125, "rewards/margins": 3.8125, "rewards/rejected": -3.78515625, "step": 2569 }, { "epoch": 0.48561575889272046, "grad_norm": 2.176167968351557, "learning_rate": 6.507957246004633e-07, "logits/chosen": 3.37890625, "logits/rejected": 2.90234375, "logps/chosen": -843.5, "logps/rejected": -766.0, "loss": 0.588, "rewards/accuracies": 0.78125, "rewards/chosen": 0.619140625, "rewards/margins": 4.37890625, "rewards/rejected": -3.7578125, "step": 2570 }, { "epoch": 0.48580471444092777, "grad_norm": 2.4268569905399198, "learning_rate": 6.505064320618007e-07, "logits/chosen": 2.09375, "logits/rejected": 1.69140625, "logps/chosen": -678.0, "logps/rejected": -698.0, "loss": 0.6299, "rewards/accuracies": 0.84375, "rewards/chosen": -0.39453125, "rewards/margins": 4.96484375, "rewards/rejected": -5.359375, "step": 2571 }, { "epoch": 0.48599366998913507, "grad_norm": 2.0537317107701716, "learning_rate": 6.502170957978758e-07, "logits/chosen": 2.275390625, "logits/rejected": 1.931640625, "logps/chosen": -675.5, "logps/rejected": -779.0, "loss": 0.5895, "rewards/accuracies": 0.875, "rewards/chosen": 0.08160400390625, "rewards/margins": 5.0, "rewards/rejected": -4.9140625, "step": 2572 }, { "epoch": 0.4861826255373423, "grad_norm": 1.795122541000166, "learning_rate": 6.499277159345645e-07, "logits/chosen": 2.0947265625, "logits/rejected": 2.12890625, "logps/chosen": -756.0, "logps/rejected": -736.0, "loss": 0.6085, "rewards/accuracies": 0.65625, "rewards/chosen": 0.154052734375, "rewards/margins": 4.224609375, "rewards/rejected": -4.06640625, "step": 2573 }, { "epoch": 0.4863715810855496, "grad_norm": 2.026049406919501, "learning_rate": 6.496382925977607e-07, "logits/chosen": 2.00390625, "logits/rejected": 2.28515625, "logps/chosen": -902.0, "logps/rejected": -1257.0, "loss": 0.6392, "rewards/accuracies": 0.78125, "rewards/chosen": 0.354736328125, "rewards/margins": 5.51953125, "rewards/rejected": -5.15625, "step": 2574 }, { "epoch": 0.4865605366337569, "grad_norm": 1.883973502997004, "learning_rate": 6.493488259133784e-07, "logits/chosen": 1.931640625, "logits/rejected": 1.9296875, "logps/chosen": -859.0, "logps/rejected": -1057.0, "loss": 0.6434, "rewards/accuracies": 0.78125, "rewards/chosen": 0.19189453125, "rewards/margins": 5.42578125, "rewards/rejected": -5.24609375, "step": 2575 }, { "epoch": 0.48674949218196417, "grad_norm": 1.6663940648925124, "learning_rate": 6.490593160073499e-07, "logits/chosen": 1.921875, "logits/rejected": 1.388671875, "logps/chosen": -998.0, "logps/rejected": -1197.0, "loss": 0.5481, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2298583984375, "rewards/margins": 5.28125, "rewards/rejected": -5.0625, "step": 2576 }, { "epoch": 0.4869384477301715, "grad_norm": 2.057945904210972, "learning_rate": 6.48769763005626e-07, "logits/chosen": 2.09375, "logits/rejected": 2.25, "logps/chosen": -782.5, "logps/rejected": -1130.5, "loss": 0.6306, "rewards/accuracies": 0.71875, "rewards/chosen": -0.236328125, "rewards/margins": 5.640625, "rewards/rejected": -5.8671875, "step": 2577 }, { "epoch": 0.4871274032783788, "grad_norm": 1.8848342769194908, "learning_rate": 6.484801670341766e-07, "logits/chosen": 1.072265625, "logits/rejected": 1.3359375, "logps/chosen": -711.5, "logps/rejected": -680.0, "loss": 0.6103, "rewards/accuracies": 0.84375, "rewards/chosen": 0.26416015625, "rewards/margins": 3.67578125, "rewards/rejected": -3.4140625, "step": 2578 }, { "epoch": 0.487316358826586, "grad_norm": 1.949117198999886, "learning_rate": 6.481905282189905e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.537109375, "logps/chosen": -845.5, "logps/rejected": -1528.5, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": 0.3603515625, "rewards/margins": 6.203125, "rewards/rejected": -5.84765625, "step": 2579 }, { "epoch": 0.48750531437479333, "grad_norm": 2.8999854583980498, "learning_rate": 6.479008466860746e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.962890625, "logps/chosen": -747.0, "logps/rejected": -1013.5, "loss": 0.5662, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6376953125, "rewards/margins": 5.98046875, "rewards/rejected": -5.3359375, "step": 2580 }, { "epoch": 0.48769426992300063, "grad_norm": 2.4541726478355383, "learning_rate": 6.476111225614549e-07, "logits/chosen": 2.646484375, "logits/rejected": 2.78515625, "logps/chosen": -578.25, "logps/rejected": -1737.0, "loss": 0.6403, "rewards/accuracies": 0.8125, "rewards/chosen": -0.76025390625, "rewards/margins": 3.708984375, "rewards/rejected": -4.4765625, "step": 2581 }, { "epoch": 0.4878832254712079, "grad_norm": 1.6048470165608886, "learning_rate": 6.473213559711758e-07, "logits/chosen": 2.3072509765625, "logits/rejected": 2.35546875, "logps/chosen": -896.0, "logps/rejected": -893.5, "loss": 0.4969, "rewards/accuracies": 0.90625, "rewards/chosen": 1.142333984375, "rewards/margins": 4.80078125, "rewards/rejected": -3.6640625, "step": 2582 }, { "epoch": 0.4880721810194152, "grad_norm": 1.920724048760405, "learning_rate": 6.470315470412996e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.87890625, "logps/chosen": -428.25, "logps/rejected": -414.25, "loss": 0.6353, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34326171875, "rewards/margins": 3.56640625, "rewards/rejected": -3.2265625, "step": 2583 }, { "epoch": 0.4882611365676225, "grad_norm": 1.226372190246746, "learning_rate": 6.467416958979079e-07, "logits/chosen": 3.09375, "logits/rejected": 3.1328125, "logps/chosen": -917.5, "logps/rejected": -1044.25, "loss": 0.5633, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1708984375, "rewards/margins": 5.544921875, "rewards/rejected": -4.369140625, "step": 2584 }, { "epoch": 0.48845009211582974, "grad_norm": 2.497442330537524, "learning_rate": 6.464518026671003e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.6015625, "logps/chosen": -335.25, "logps/rejected": -460.0, "loss": 0.7048, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1934814453125, "rewards/margins": 2.568359375, "rewards/rejected": -2.755859375, "step": 2585 }, { "epoch": 0.48863904766403704, "grad_norm": 2.7355756974403374, "learning_rate": 6.461618674749945e-07, "logits/chosen": 2.302734375, "logits/rejected": 3.0234375, "logps/chosen": -752.0, "logps/rejected": -1109.0, "loss": 0.7143, "rewards/accuracies": 0.625, "rewards/chosen": 0.28076171875, "rewards/margins": 3.09765625, "rewards/rejected": -2.80859375, "step": 2586 }, { "epoch": 0.48882800321224434, "grad_norm": 3.5631766782926464, "learning_rate": 6.458718904477264e-07, "logits/chosen": 3.1640625, "logits/rejected": 2.755859375, "logps/chosen": -571.5, "logps/rejected": -1204.5, "loss": 0.6217, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3153076171875, "rewards/margins": 7.15625, "rewards/rejected": -6.83984375, "step": 2587 }, { "epoch": 0.4890169587604516, "grad_norm": 2.4229312782772863, "learning_rate": 6.455818717114508e-07, "logits/chosen": 2.626953125, "logits/rejected": 2.4140625, "logps/chosen": -471.0, "logps/rejected": -884.0, "loss": 0.5763, "rewards/accuracies": 0.84375, "rewards/chosen": -0.16375732421875, "rewards/margins": 4.171875, "rewards/rejected": -4.328125, "step": 2588 }, { "epoch": 0.4892059143086589, "grad_norm": 1.6691087027016984, "learning_rate": 6.4529181139234e-07, "logits/chosen": 2.00927734375, "logits/rejected": 2.00732421875, "logps/chosen": -584.0, "logps/rejected": -1711.0, "loss": 0.5896, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5068359375, "rewards/margins": 3.990234375, "rewards/rejected": -3.484375, "step": 2589 }, { "epoch": 0.4893948698568662, "grad_norm": 1.7883830460054777, "learning_rate": 6.450017096165845e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.662109375, "logps/chosen": -701.5, "logps/rejected": -846.0, "loss": 0.5187, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9892578125, "rewards/margins": 4.287109375, "rewards/rejected": -3.298828125, "step": 2590 }, { "epoch": 0.48958382540507345, "grad_norm": 1.5378991175579, "learning_rate": 6.447115665103929e-07, "logits/chosen": 2.31640625, "logits/rejected": 2.19140625, "logps/chosen": -591.0, "logps/rejected": -1454.0, "loss": 0.6141, "rewards/accuracies": 0.78125, "rewards/chosen": 0.82861328125, "rewards/margins": 9.3125, "rewards/rejected": -8.4921875, "step": 2591 }, { "epoch": 0.48977278095328075, "grad_norm": 2.0011561862481515, "learning_rate": 6.444213821999919e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.470703125, "logps/chosen": -613.5, "logps/rejected": -655.0, "loss": 0.7534, "rewards/accuracies": 0.625, "rewards/chosen": 0.47509765625, "rewards/margins": 2.5390625, "rewards/rejected": -2.0615234375, "step": 2592 }, { "epoch": 0.489961736501488, "grad_norm": 2.5717111932559837, "learning_rate": 6.441311568116259e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.72265625, "logps/chosen": -716.0, "logps/rejected": -781.0, "loss": 0.6434, "rewards/accuracies": 0.84375, "rewards/chosen": 1.30194091796875, "rewards/margins": 3.93359375, "rewards/rejected": -2.625, "step": 2593 }, { "epoch": 0.4901506920496953, "grad_norm": 1.3333037306775437, "learning_rate": 6.438408904715573e-07, "logits/chosen": 2.599609375, "logits/rejected": 1.9453125, "logps/chosen": -925.5, "logps/rejected": -864.5, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": 0.408447265625, "rewards/margins": 3.82421875, "rewards/rejected": -3.421875, "step": 2594 }, { "epoch": 0.4903396475979026, "grad_norm": 1.8082279146222422, "learning_rate": 6.435505833060663e-07, "logits/chosen": 2.869140625, "logits/rejected": 3.35546875, "logps/chosen": -807.0, "logps/rejected": -822.0, "loss": 0.5765, "rewards/accuracies": 0.75, "rewards/chosen": 1.041015625, "rewards/margins": 4.359375, "rewards/rejected": -3.3203125, "step": 2595 }, { "epoch": 0.49052860314610985, "grad_norm": 1.344248958500967, "learning_rate": 6.432602354414508e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.6953125, "logps/chosen": -645.5, "logps/rejected": -687.5, "loss": 0.6112, "rewards/accuracies": 0.84375, "rewards/chosen": 0.848876953125, "rewards/margins": 3.44140625, "rewards/rejected": -2.6015625, "step": 2596 }, { "epoch": 0.49071755869431716, "grad_norm": 2.0830869560755194, "learning_rate": 6.429698470040267e-07, "logits/chosen": 2.251953125, "logits/rejected": 2.88330078125, "logps/chosen": -983.0, "logps/rejected": -2077.0, "loss": 0.5361, "rewards/accuracies": 0.875, "rewards/chosen": 1.6669921875, "rewards/margins": 8.60546875, "rewards/rejected": -6.9375, "step": 2597 }, { "epoch": 0.49090651424252446, "grad_norm": 1.8288036621724093, "learning_rate": 6.426794181201269e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.974609375, "logps/chosen": -670.0, "logps/rejected": -977.0, "loss": 0.5578, "rewards/accuracies": 0.90625, "rewards/chosen": 1.27294921875, "rewards/margins": 3.880859375, "rewards/rejected": -2.6080322265625, "step": 2598 }, { "epoch": 0.4910954697907317, "grad_norm": 2.338832562673644, "learning_rate": 6.423889489161028e-07, "logits/chosen": 3.078125, "logits/rejected": 3.30859375, "logps/chosen": -1011.0, "logps/rejected": -1097.5, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -0.651611328125, "rewards/margins": 3.73828125, "rewards/rejected": -4.39453125, "step": 2599 }, { "epoch": 0.491284425338939, "grad_norm": 2.2068485789843604, "learning_rate": 6.420984395183223e-07, "logits/chosen": 2.78125, "logits/rejected": 2.8984375, "logps/chosen": -921.0, "logps/rejected": -1168.0, "loss": 0.5204, "rewards/accuracies": 0.875, "rewards/chosen": 1.275390625, "rewards/margins": 5.09375, "rewards/rejected": -3.8203125, "step": 2600 }, { "epoch": 0.4914733808871463, "grad_norm": 2.1126039217477537, "learning_rate": 6.418078900531717e-07, "logits/chosen": 3.341796875, "logits/rejected": 3.38671875, "logps/chosen": -817.0, "logps/rejected": -1054.0, "loss": 0.6255, "rewards/accuracies": 0.75, "rewards/chosen": 1.4853515625, "rewards/margins": 4.1865234375, "rewards/rejected": -2.70458984375, "step": 2601 }, { "epoch": 0.49166233643535356, "grad_norm": 1.9167681456645524, "learning_rate": 6.415173006470545e-07, "logits/chosen": 3.7265625, "logits/rejected": 3.76171875, "logps/chosen": -792.5, "logps/rejected": -779.5, "loss": 0.7269, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1107177734375, "rewards/margins": 3.1748046875, "rewards/rejected": -2.0625, "step": 2602 }, { "epoch": 0.49185129198356087, "grad_norm": 1.857345040479366, "learning_rate": 6.412266714263911e-07, "logits/chosen": 2.572265625, "logits/rejected": 2.447265625, "logps/chosen": -965.5, "logps/rejected": -846.5, "loss": 0.7287, "rewards/accuracies": 0.625, "rewards/chosen": 1.08544921875, "rewards/margins": 2.6513671875, "rewards/rejected": -1.568359375, "step": 2603 }, { "epoch": 0.49204024753176817, "grad_norm": 2.210946263811558, "learning_rate": 6.409360025176197e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.5625, "logps/chosen": -644.5, "logps/rejected": -676.0, "loss": 0.5762, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4921875, "rewards/margins": 3.591796875, "rewards/rejected": -2.1005859375, "step": 2604 }, { "epoch": 0.4922292030799754, "grad_norm": 2.864644850404739, "learning_rate": 6.406452940471957e-07, "logits/chosen": 2.79296875, "logits/rejected": 2.55859375, "logps/chosen": -11487.5, "logps/rejected": -1188.5, "loss": 0.7113, "rewards/accuracies": 0.8125, "rewards/chosen": -56.779296875, "rewards/margins": -54.3046875, "rewards/rejected": -2.4765625, "step": 2605 }, { "epoch": 0.4924181586281827, "grad_norm": 2.1043427206239183, "learning_rate": 6.403545461415915e-07, "logits/chosen": 3.15234375, "logits/rejected": 2.703125, "logps/chosen": -629.5, "logps/rejected": -661.0, "loss": 0.5486, "rewards/accuracies": 0.90625, "rewards/chosen": 0.845703125, "rewards/margins": 4.6328125, "rewards/rejected": -3.791015625, "step": 2606 }, { "epoch": 0.49260711417639, "grad_norm": 1.598341646929114, "learning_rate": 6.400637589272969e-07, "logits/chosen": 3.22265625, "logits/rejected": 3.03515625, "logps/chosen": -610.0, "logps/rejected": -651.5, "loss": 0.6613, "rewards/accuracies": 0.71875, "rewards/chosen": 1.138671875, "rewards/margins": 3.1259765625, "rewards/rejected": -1.98504638671875, "step": 2607 }, { "epoch": 0.4927960697245973, "grad_norm": 1.6970906071040128, "learning_rate": 6.397729325308188e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.43359375, "logps/chosen": -834.0, "logps/rejected": -1205.0, "loss": 0.6259, "rewards/accuracies": 0.84375, "rewards/chosen": 0.46978759765625, "rewards/margins": 4.5390625, "rewards/rejected": -4.0703125, "step": 2608 }, { "epoch": 0.4929850252728046, "grad_norm": 2.2146203073456645, "learning_rate": 6.394820670786808e-07, "logits/chosen": 2.21484375, "logits/rejected": 1.6171875, "logps/chosen": -812.0, "logps/rejected": -1072.0, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 0.998046875, "rewards/margins": 5.0703125, "rewards/rejected": -4.0625, "step": 2609 }, { "epoch": 0.4931739808210119, "grad_norm": 3.3425672259860435, "learning_rate": 6.391911626974239e-07, "logits/chosen": 2.14453125, "logits/rejected": 2.345703125, "logps/chosen": -634.0, "logps/rejected": -1107.0, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": 0.224609375, "rewards/margins": 3.70703125, "rewards/rejected": -3.484375, "step": 2610 }, { "epoch": 0.49336293636921913, "grad_norm": 2.0083749924421177, "learning_rate": 6.389002195136059e-07, "logits/chosen": 2.4296875, "logits/rejected": 1.9638671875, "logps/chosen": -418.0, "logps/rejected": -420.5, "loss": 0.674, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0732421875, "rewards/margins": 3.265625, "rewards/rejected": -3.193359375, "step": 2611 }, { "epoch": 0.49355189191742643, "grad_norm": 2.254363763518361, "learning_rate": 6.386092376538014e-07, "logits/chosen": 2.6484375, "logits/rejected": 2.623046875, "logps/chosen": -602.0, "logps/rejected": -791.0, "loss": 0.6301, "rewards/accuracies": 0.78125, "rewards/chosen": 1.13037109375, "rewards/margins": 4.15234375, "rewards/rejected": -3.01953125, "step": 2612 }, { "epoch": 0.49374084746563374, "grad_norm": 1.8325135218909077, "learning_rate": 6.383182172446017e-07, "logits/chosen": 1.9853515625, "logits/rejected": 1.5205078125, "logps/chosen": -714.0, "logps/rejected": -1578.0, "loss": 0.5139, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65380859375, "rewards/margins": 8.0546875, "rewards/rejected": -7.40625, "step": 2613 }, { "epoch": 0.493929803013841, "grad_norm": 2.240760839954718, "learning_rate": 6.380271584126156e-07, "logits/chosen": 1.62109375, "logits/rejected": 1.369140625, "logps/chosen": -649.0, "logps/rejected": -610.5, "loss": 0.5282, "rewards/accuracies": 0.9375, "rewards/chosen": 0.84869384765625, "rewards/margins": 3.5234375, "rewards/rejected": -2.671875, "step": 2614 }, { "epoch": 0.4941187585620483, "grad_norm": 2.361705330418251, "learning_rate": 6.377360612844673e-07, "logits/chosen": 2.697265625, "logits/rejected": 1.8984375, "logps/chosen": -823.0, "logps/rejected": -524.0, "loss": 0.6146, "rewards/accuracies": 0.8125, "rewards/chosen": 1.36474609375, "rewards/margins": 4.859375, "rewards/rejected": -3.48828125, "step": 2615 }, { "epoch": 0.49430771411025554, "grad_norm": 1.4833166400424405, "learning_rate": 6.374449259867991e-07, "logits/chosen": 2.498046875, "logits/rejected": 2.23828125, "logps/chosen": -578.0, "logps/rejected": -530.0, "loss": 0.6575, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5748291015625, "rewards/margins": 3.23828125, "rewards/rejected": -2.662109375, "step": 2616 }, { "epoch": 0.49449666965846284, "grad_norm": 1.8315288417648699, "learning_rate": 6.37153752646269e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.318359375, "logps/chosen": -656.0, "logps/rejected": -697.0, "loss": 0.5442, "rewards/accuracies": 0.875, "rewards/chosen": 0.754638671875, "rewards/margins": 5.03515625, "rewards/rejected": -4.28515625, "step": 2617 }, { "epoch": 0.49468562520667014, "grad_norm": 1.976106840134263, "learning_rate": 6.368625413895515e-07, "logits/chosen": 3.002349853515625, "logits/rejected": 2.78857421875, "logps/chosen": -838.0, "logps/rejected": -1815.0, "loss": 0.5713, "rewards/accuracies": 0.8125, "rewards/chosen": 0.686767578125, "rewards/margins": 8.640625, "rewards/rejected": -7.92578125, "step": 2618 }, { "epoch": 0.4948745807548774, "grad_norm": 1.7900557258513838, "learning_rate": 6.365712923433383e-07, "logits/chosen": 2.0029296875, "logits/rejected": 1.993408203125, "logps/chosen": -772.0, "logps/rejected": -785.0, "loss": 0.6425, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23602294921875, "rewards/margins": 3.56640625, "rewards/rejected": -3.8046875, "step": 2619 }, { "epoch": 0.4950635363030847, "grad_norm": 1.442631813476098, "learning_rate": 6.362800056343369e-07, "logits/chosen": 3.27734375, "logits/rejected": 3.01953125, "logps/chosen": -585.0, "logps/rejected": -573.0, "loss": 0.7907, "rewards/accuracies": 0.6875, "rewards/chosen": 0.420654296875, "rewards/margins": 1.97021484375, "rewards/rejected": -1.5439453125, "step": 2620 }, { "epoch": 0.495252491851292, "grad_norm": 2.7776946964608413, "learning_rate": 6.359886813892715e-07, "logits/chosen": 3.171875, "logits/rejected": 3.001953125, "logps/chosen": -684.0, "logps/rejected": -911.0, "loss": 0.5796, "rewards/accuracies": 0.90625, "rewards/chosen": 0.54296875, "rewards/margins": 5.20703125, "rewards/rejected": -4.66796875, "step": 2621 }, { "epoch": 0.49544144739949925, "grad_norm": 2.5331602797910895, "learning_rate": 6.356973197348825e-07, "logits/chosen": 2.693359375, "logits/rejected": 2.677734375, "logps/chosen": -822.0, "logps/rejected": -810.0, "loss": 0.697, "rewards/accuracies": 0.71875, "rewards/chosen": 0.80419921875, "rewards/margins": 3.466796875, "rewards/rejected": -2.666015625, "step": 2622 }, { "epoch": 0.49563040294770655, "grad_norm": 1.9508672641508784, "learning_rate": 6.354059207979265e-07, "logits/chosen": 2.81640625, "logits/rejected": 2.4375, "logps/chosen": -530.5, "logps/rejected": -573.5, "loss": 0.5299, "rewards/accuracies": 0.875, "rewards/chosen": 0.8935546875, "rewards/margins": 4.37109375, "rewards/rejected": -3.4765625, "step": 2623 }, { "epoch": 0.49581935849591385, "grad_norm": 2.255100918958072, "learning_rate": 6.351144847051766e-07, "logits/chosen": 2.87890625, "logits/rejected": 3.087890625, "logps/chosen": -666.5, "logps/rejected": -2269.0, "loss": 0.5996, "rewards/accuracies": 0.875, "rewards/chosen": 0.5506591796875, "rewards/margins": 8.3046875, "rewards/rejected": -7.7578125, "step": 2624 }, { "epoch": 0.4960083140441211, "grad_norm": 1.7246531456290062, "learning_rate": 6.348230115834219e-07, "logits/chosen": 2.431640625, "logits/rejected": 2.470703125, "logps/chosen": -706.0, "logps/rejected": -663.0, "loss": 0.5502, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9453125, "rewards/margins": 4.390625, "rewards/rejected": -3.4453125, "step": 2625 }, { "epoch": 0.4961972695923284, "grad_norm": 1.4794504451388661, "learning_rate": 6.345315015594674e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.513671875, "logps/chosen": -899.0, "logps/rejected": -862.5, "loss": 0.6252, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34326171875, "rewards/margins": 3.95703125, "rewards/rejected": -3.61328125, "step": 2626 }, { "epoch": 0.4963862251405357, "grad_norm": 1.9412862340859642, "learning_rate": 6.342399547601346e-07, "logits/chosen": 2.0478515625, "logits/rejected": 1.987548828125, "logps/chosen": -1020.0, "logps/rejected": -1011.0, "loss": 0.5531, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3251953125, "rewards/margins": 4.93359375, "rewards/rejected": -3.6015625, "step": 2627 }, { "epoch": 0.49657518068874296, "grad_norm": 1.6573618491169118, "learning_rate": 6.339483713122607e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.708984375, "logps/chosen": -625.0, "logps/rejected": -1392.0, "loss": 0.6692, "rewards/accuracies": 0.84375, "rewards/chosen": 0.60205078125, "rewards/margins": 7.171875, "rewards/rejected": -6.59375, "step": 2628 }, { "epoch": 0.49676413623695026, "grad_norm": 1.6137327844061375, "learning_rate": 6.336567513426987e-07, "logits/chosen": 2.75, "logits/rejected": 3.20703125, "logps/chosen": -831.0, "logps/rejected": -1058.5, "loss": 0.5369, "rewards/accuracies": 0.875, "rewards/chosen": 1.9580078125, "rewards/margins": 6.64453125, "rewards/rejected": -4.671875, "step": 2629 }, { "epoch": 0.49695309178515756, "grad_norm": 1.6583276869064483, "learning_rate": 6.333650949783179e-07, "logits/chosen": 3.201171875, "logits/rejected": 3.296875, "logps/chosen": -711.5, "logps/rejected": -680.0, "loss": 0.5714, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0174560546875, "rewards/margins": 4.03125, "rewards/rejected": -3.0078125, "step": 2630 }, { "epoch": 0.4971420473333648, "grad_norm": 1.5422094944405493, "learning_rate": 6.330734023460031e-07, "logits/chosen": 3.30078125, "logits/rejected": 3.181640625, "logps/chosen": -795.5, "logps/rejected": -1369.5, "loss": 0.7825, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5029296875, "rewards/margins": 4.74072265625, "rewards/rejected": -4.240234375, "step": 2631 }, { "epoch": 0.4973310028815721, "grad_norm": 1.664367867805608, "learning_rate": 6.327816735726552e-07, "logits/chosen": 2.3857421875, "logits/rejected": 2.03125, "logps/chosen": -849.0, "logps/rejected": -1735.5, "loss": 0.5438, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2822265625, "rewards/margins": 5.09375, "rewards/rejected": -3.8203125, "step": 2632 }, { "epoch": 0.4975199584297794, "grad_norm": 2.7617191383666535, "learning_rate": 6.324899087851904e-07, "logits/chosen": 2.416015625, "logits/rejected": 2.23046875, "logps/chosen": -531.0, "logps/rejected": -1444.0, "loss": 0.5813, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7998046875, "rewards/margins": 4.5703125, "rewards/rejected": -3.76953125, "step": 2633 }, { "epoch": 0.49770891397798667, "grad_norm": 1.979289964715728, "learning_rate": 6.321981081105411e-07, "logits/chosen": 1.2962646484375, "logits/rejected": 1.42333984375, "logps/chosen": -607.5, "logps/rejected": -987.5, "loss": 0.567, "rewards/accuracies": 0.875, "rewards/chosen": 1.087890625, "rewards/margins": 5.11328125, "rewards/rejected": -4.03125, "step": 2634 }, { "epoch": 0.49789786952619397, "grad_norm": 2.0027157130717117, "learning_rate": 6.319062716756546e-07, "logits/chosen": 1.3896484375, "logits/rejected": 1.3525390625, "logps/chosen": -1015.5, "logps/rejected": -1087.0, "loss": 0.5981, "rewards/accuracies": 0.71875, "rewards/chosen": 1.27001953125, "rewards/margins": 5.34765625, "rewards/rejected": -4.072265625, "step": 2635 }, { "epoch": 0.4980868250744013, "grad_norm": 2.543086197535582, "learning_rate": 6.316143996074944e-07, "logits/chosen": 2.1894989013671875, "logits/rejected": 1.80517578125, "logps/chosen": -725.0, "logps/rejected": -804.0, "loss": 0.5006, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3193359375, "rewards/margins": 5.17578125, "rewards/rejected": -3.84375, "step": 2636 }, { "epoch": 0.4982757806226085, "grad_norm": 2.0216947884418577, "learning_rate": 6.313224920330391e-07, "logits/chosen": 1.927001953125, "logits/rejected": 1.388671875, "logps/chosen": -943.0, "logps/rejected": -884.0, "loss": 0.5346, "rewards/accuracies": 0.8125, "rewards/chosen": 1.53515625, "rewards/margins": 4.8046875, "rewards/rejected": -3.26171875, "step": 2637 }, { "epoch": 0.4984647361708158, "grad_norm": 1.518982233080679, "learning_rate": 6.310305490792833e-07, "logits/chosen": 2.744140625, "logits/rejected": 2.78125, "logps/chosen": -991.0, "logps/rejected": -1022.0, "loss": 0.6047, "rewards/accuracies": 0.84375, "rewards/chosen": 0.814453125, "rewards/margins": 4.7421875, "rewards/rejected": -3.92578125, "step": 2638 }, { "epoch": 0.4986536917190231, "grad_norm": 1.673296894994204, "learning_rate": 6.307385708732362e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.197265625, "logps/chosen": -718.0, "logps/rejected": -693.5, "loss": 0.5252, "rewards/accuracies": 0.90625, "rewards/chosen": 1.701171875, "rewards/margins": 5.32421875, "rewards/rejected": -3.6171875, "step": 2639 }, { "epoch": 0.4988426472672304, "grad_norm": 2.009844338994573, "learning_rate": 6.304465575419229e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.470703125, "logps/chosen": -1016.0, "logps/rejected": -968.5, "loss": 0.5995, "rewards/accuracies": 0.71875, "rewards/chosen": 0.574951171875, "rewards/margins": 4.41015625, "rewards/rejected": -3.83984375, "step": 2640 }, { "epoch": 0.4990316028154377, "grad_norm": 1.8226273139092362, "learning_rate": 6.301545092123835e-07, "logits/chosen": 2.734375, "logits/rejected": 2.53125, "logps/chosen": -365.0, "logps/rejected": -438.5, "loss": 0.7049, "rewards/accuracies": 0.75, "rewards/chosen": 0.3623046875, "rewards/margins": 2.4638671875, "rewards/rejected": -2.1025390625, "step": 2641 }, { "epoch": 0.49922055836364493, "grad_norm": 3.131001043659458, "learning_rate": 6.298624260116738e-07, "logits/chosen": 2.1064453125, "logits/rejected": 1.80322265625, "logps/chosen": -794.5, "logps/rejected": -804.0, "loss": 0.6344, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4755859375, "rewards/margins": 3.921875, "rewards/rejected": -3.453125, "step": 2642 }, { "epoch": 0.49940951391185223, "grad_norm": 1.7896777537995747, "learning_rate": 6.295703080668639e-07, "logits/chosen": 2.10546875, "logits/rejected": 2.740234375, "logps/chosen": -738.0, "logps/rejected": -1884.0, "loss": 0.5322, "rewards/accuracies": 0.90625, "rewards/chosen": 0.951171875, "rewards/margins": 6.109375, "rewards/rejected": -5.16015625, "step": 2643 }, { "epoch": 0.49959846946005954, "grad_norm": 1.9752720311454646, "learning_rate": 6.292781555050399e-07, "logits/chosen": 2.345703125, "logits/rejected": 2.314453125, "logps/chosen": -694.5, "logps/rejected": -17117.0, "loss": 0.6329, "rewards/accuracies": 0.75, "rewards/chosen": -0.05316162109375, "rewards/margins": -46.40625, "rewards/rejected": 46.5078125, "step": 2644 }, { "epoch": 0.4997874250082668, "grad_norm": 2.31591792355621, "learning_rate": 6.289859684533026e-07, "logits/chosen": 2.248046875, "logits/rejected": 1.9765625, "logps/chosen": -785.0, "logps/rejected": -975.0, "loss": 0.5699, "rewards/accuracies": 0.78125, "rewards/chosen": 0.62890625, "rewards/margins": 5.203125, "rewards/rejected": -4.5703125, "step": 2645 }, { "epoch": 0.4999763805564741, "grad_norm": 1.8315985657264011, "learning_rate": 6.286937470387675e-07, "logits/chosen": 2.009765625, "logits/rejected": 1.577392578125, "logps/chosen": -597.0, "logps/rejected": -587.0, "loss": 0.5035, "rewards/accuracies": 0.875, "rewards/chosen": 0.98779296875, "rewards/margins": 4.765625, "rewards/rejected": -3.7578125, "step": 2646 }, { "epoch": 0.5001653361046814, "grad_norm": 1.969492742783069, "learning_rate": 6.284014913885658e-07, "logits/chosen": 1.728515625, "logits/rejected": 1.74609375, "logps/chosen": -611.0, "logps/rejected": -774.0, "loss": 0.5369, "rewards/accuracies": 0.96875, "rewards/chosen": 0.8433837890625, "rewards/margins": 3.796875, "rewards/rejected": -2.9453125, "step": 2647 }, { "epoch": 0.5003542916528887, "grad_norm": 1.8470613449729278, "learning_rate": 6.281092016298427e-07, "logits/chosen": 2.708984375, "logits/rejected": 2.658203125, "logps/chosen": -849.0, "logps/rejected": -2625.0, "loss": 0.6386, "rewards/accuracies": 0.75, "rewards/chosen": 0.4254150390625, "rewards/margins": 1.9736328125, "rewards/rejected": -1.546875, "step": 2648 }, { "epoch": 0.500543247201096, "grad_norm": 1.8980124921387516, "learning_rate": 6.278168778897591e-07, "logits/chosen": 2.033203125, "logits/rejected": 1.724609375, "logps/chosen": -506.0, "logps/rejected": -733.0, "loss": 0.7024, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06103515625, "rewards/margins": 3.61328125, "rewards/rejected": -3.6796875, "step": 2649 }, { "epoch": 0.5007322027493032, "grad_norm": 2.4709117629683264, "learning_rate": 6.275245202954897e-07, "logits/chosen": 1.75732421875, "logits/rejected": 1.75341796875, "logps/chosen": -618.0, "logps/rejected": -706.5, "loss": 0.7, "rewards/accuracies": 0.6875, "rewards/chosen": 0.552734375, "rewards/margins": 3.50390625, "rewards/rejected": -2.958984375, "step": 2650 }, { "epoch": 0.5009211582975105, "grad_norm": 1.986561741037959, "learning_rate": 6.272321289742252e-07, "logits/chosen": 2.35986328125, "logits/rejected": 2.259765625, "logps/chosen": -878.0, "logps/rejected": -1537.0, "loss": 0.5022, "rewards/accuracies": 0.96875, "rewards/chosen": 1.125, "rewards/margins": 5.921875, "rewards/rejected": -4.80078125, "step": 2651 }, { "epoch": 0.5011101138457178, "grad_norm": 1.8465715606893598, "learning_rate": 6.269397040531698e-07, "logits/chosen": 2.37109375, "logits/rejected": 1.931640625, "logps/chosen": -907.5, "logps/rejected": -847.0, "loss": 0.5602, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7314453125, "rewards/margins": 4.234375, "rewards/rejected": -3.5078125, "step": 2652 }, { "epoch": 0.5012990693939251, "grad_norm": 1.6180555609566452, "learning_rate": 6.266472456595428e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.29296875, "logps/chosen": -850.5, "logps/rejected": -1016.5, "loss": 0.542, "rewards/accuracies": 0.84375, "rewards/chosen": 1.10986328125, "rewards/margins": 5.673828125, "rewards/rejected": -4.56640625, "step": 2653 }, { "epoch": 0.5014880249421324, "grad_norm": 2.433787165039763, "learning_rate": 6.263547539205781e-07, "logits/chosen": 2.189453125, "logits/rejected": 2.22265625, "logps/chosen": -960.0, "logps/rejected": -1238.0, "loss": 0.5079, "rewards/accuracies": 0.90625, "rewards/chosen": 1.744140625, "rewards/margins": 5.97265625, "rewards/rejected": -4.23046875, "step": 2654 }, { "epoch": 0.5016769804903396, "grad_norm": 1.8811278575131551, "learning_rate": 6.260622289635242e-07, "logits/chosen": 2.53125, "logits/rejected": 2.33984375, "logps/chosen": -891.0, "logps/rejected": -865.0, "loss": 0.5916, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0537109375, "rewards/margins": 4.4765625, "rewards/rejected": -4.421875, "step": 2655 }, { "epoch": 0.5018659360385469, "grad_norm": 1.9178451418455096, "learning_rate": 6.257696709156437e-07, "logits/chosen": 3.58203125, "logits/rejected": 2.875, "logps/chosen": -748.0, "logps/rejected": -860.5, "loss": 0.4746, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2587890625, "rewards/margins": 5.0546875, "rewards/rejected": -3.79296875, "step": 2656 }, { "epoch": 0.5020548915867542, "grad_norm": 1.6487109567571951, "learning_rate": 6.254770799042139e-07, "logits/chosen": 3.1171875, "logits/rejected": 2.9296875, "logps/chosen": -675.0, "logps/rejected": -782.0, "loss": 0.6278, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2421875, "rewards/margins": 3.828125, "rewards/rejected": -2.5830078125, "step": 2657 }, { "epoch": 0.5022438471349615, "grad_norm": 2.1685058308605387, "learning_rate": 6.251844560565263e-07, "logits/chosen": 1.40234375, "logits/rejected": 1.5859375, "logps/chosen": -873.0, "logps/rejected": -1559.0, "loss": 0.4538, "rewards/accuracies": 0.96875, "rewards/chosen": 1.353515625, "rewards/margins": 7.546875, "rewards/rejected": -6.1796875, "step": 2658 }, { "epoch": 0.5024328026831688, "grad_norm": 2.0147731120254346, "learning_rate": 6.248917994998869e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.9765625, "logps/chosen": -653.0, "logps/rejected": -881.0, "loss": 0.5959, "rewards/accuracies": 0.75, "rewards/chosen": 0.4560546875, "rewards/margins": 4.234375, "rewards/rejected": -3.783203125, "step": 2659 }, { "epoch": 0.5026217582313761, "grad_norm": 2.7545773385253405, "learning_rate": 6.245991103616155e-07, "logits/chosen": 3.7158203125, "logits/rejected": 3.484375, "logps/chosen": -437.75, "logps/rejected": -1178.0, "loss": 0.5912, "rewards/accuracies": 0.875, "rewards/chosen": 0.353515625, "rewards/margins": 4.765625, "rewards/rejected": -4.41015625, "step": 2660 }, { "epoch": 0.5028107137795833, "grad_norm": 3.2090183892072783, "learning_rate": 6.243063887690466e-07, "logits/chosen": 2.1171875, "logits/rejected": 1.91796875, "logps/chosen": -701.0, "logps/rejected": -998.5, "loss": 0.5253, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4869384765625, "rewards/margins": 5.2265625, "rewards/rejected": -4.734375, "step": 2661 }, { "epoch": 0.5029996693277906, "grad_norm": 1.9397478335757763, "learning_rate": 6.240136348495282e-07, "logits/chosen": 2.19921875, "logits/rejected": 2.62890625, "logps/chosen": -471.0, "logps/rejected": -1406.5, "loss": 0.5762, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1298828125, "rewards/margins": 5.5390625, "rewards/rejected": -5.40625, "step": 2662 }, { "epoch": 0.5031886248759979, "grad_norm": 2.506743887838635, "learning_rate": 6.23720848730423e-07, "logits/chosen": 2.884765625, "logits/rejected": 2.681640625, "logps/chosen": -691.0, "logps/rejected": -704.0, "loss": 0.6318, "rewards/accuracies": 0.8125, "rewards/chosen": 0.77490234375, "rewards/margins": 3.7578125, "rewards/rejected": -2.98046875, "step": 2663 }, { "epoch": 0.5033775804242052, "grad_norm": 2.0196028626164453, "learning_rate": 6.234280305391073e-07, "logits/chosen": 1.94921875, "logits/rejected": 2.201171875, "logps/chosen": -593.0, "logps/rejected": -1342.5, "loss": 0.4956, "rewards/accuracies": 0.90625, "rewards/chosen": 1.259765625, "rewards/margins": 6.3125, "rewards/rejected": -5.05078125, "step": 2664 }, { "epoch": 0.5035665359724125, "grad_norm": 1.9717992759997143, "learning_rate": 6.231351804029716e-07, "logits/chosen": 2.2890625, "logits/rejected": 2.431640625, "logps/chosen": -928.0, "logps/rejected": -735.75, "loss": 0.6127, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6904296875, "rewards/margins": 3.9296875, "rewards/rejected": -2.244140625, "step": 2665 }, { "epoch": 0.5037554915206198, "grad_norm": 1.6874594411302342, "learning_rate": 6.2284229844942e-07, "logits/chosen": 2.591796875, "logits/rejected": 1.7568359375, "logps/chosen": -4227.0, "logps/rejected": -971.0, "loss": 0.5074, "rewards/accuracies": 0.8125, "rewards/chosen": 47.50830078125, "rewards/margins": 51.77734375, "rewards/rejected": -4.17578125, "step": 2666 }, { "epoch": 0.503944447068827, "grad_norm": 1.7841497149812888, "learning_rate": 6.225493848058709e-07, "logits/chosen": 2.041015625, "logits/rejected": 1.908203125, "logps/chosen": -617.0, "logps/rejected": -1205.0, "loss": 0.6142, "rewards/accuracies": 0.71875, "rewards/chosen": 0.724609375, "rewards/margins": 5.02734375, "rewards/rejected": -4.30078125, "step": 2667 }, { "epoch": 0.5041334026170343, "grad_norm": 1.6224473578945537, "learning_rate": 6.22256439599756e-07, "logits/chosen": 2.29296875, "logits/rejected": 2.5859375, "logps/chosen": -543.5, "logps/rejected": -572.5, "loss": 0.6697, "rewards/accuracies": 0.78125, "rewards/chosen": 0.861572265625, "rewards/margins": 2.875, "rewards/rejected": -2.013671875, "step": 2668 }, { "epoch": 0.5043223581652416, "grad_norm": 2.9089155169993854, "learning_rate": 6.219634629585208e-07, "logits/chosen": 1.90625, "logits/rejected": 1.79296875, "logps/chosen": -686.0, "logps/rejected": -1249.0, "loss": 0.611, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2236328125, "rewards/margins": 4.228515625, "rewards/rejected": -3.0107421875, "step": 2669 }, { "epoch": 0.5045113137134489, "grad_norm": 1.5524783264754387, "learning_rate": 6.216704550096247e-07, "logits/chosen": 1.22119140625, "logits/rejected": 1.2598876953125, "logps/chosen": -956.5, "logps/rejected": -1133.0, "loss": 0.5126, "rewards/accuracies": 0.875, "rewards/chosen": 1.5068359375, "rewards/margins": 5.08203125, "rewards/rejected": -3.58203125, "step": 2670 }, { "epoch": 0.5047002692616562, "grad_norm": 1.8307606249761892, "learning_rate": 6.213774158805406e-07, "logits/chosen": 1.6376953125, "logits/rejected": 1.1044921875, "logps/chosen": -882.0, "logps/rejected": -16386.0, "loss": 0.6369, "rewards/accuracies": 0.84375, "rewards/chosen": 0.63671875, "rewards/margins": -48.29833984375, "rewards/rejected": 49.1171875, "step": 2671 }, { "epoch": 0.5048892248098635, "grad_norm": 2.230354752128511, "learning_rate": 6.210843456987552e-07, "logits/chosen": 2.560546875, "logits/rejected": 2.8046875, "logps/chosen": -657.25, "logps/rejected": -649.0, "loss": 0.6602, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4453125, "rewards/margins": 3.578125, "rewards/rejected": -2.12890625, "step": 2672 }, { "epoch": 0.5050781803580707, "grad_norm": 1.8557803301120996, "learning_rate": 6.207912445917686e-07, "logits/chosen": 2.859375, "logits/rejected": 3.12890625, "logps/chosen": -596.75, "logps/rejected": -898.0, "loss": 0.6245, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1044921875, "rewards/margins": 4.3125, "rewards/rejected": -3.20703125, "step": 2673 }, { "epoch": 0.505267135906278, "grad_norm": 2.00756834217471, "learning_rate": 6.204981126870938e-07, "logits/chosen": 2.123046875, "logits/rejected": 2.4951171875, "logps/chosen": -650.5, "logps/rejected": -1772.0, "loss": 0.5175, "rewards/accuracies": 0.875, "rewards/chosen": 0.986328125, "rewards/margins": 8.703125, "rewards/rejected": -7.7265625, "step": 2674 }, { "epoch": 0.5054560914544853, "grad_norm": 1.3832471011618097, "learning_rate": 6.20204950112258e-07, "logits/chosen": 2.44140625, "logits/rejected": 2.244140625, "logps/chosen": -477.5, "logps/rejected": -516.0, "loss": 0.665, "rewards/accuracies": 0.8125, "rewards/chosen": 0.355712890625, "rewards/margins": 2.94921875, "rewards/rejected": -2.59765625, "step": 2675 }, { "epoch": 0.5056450470026926, "grad_norm": 1.9817200624265, "learning_rate": 6.199117569948011e-07, "logits/chosen": 2.013671875, "logits/rejected": 1.955078125, "logps/chosen": -1094.0, "logps/rejected": -1837.0, "loss": 0.5355, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4404296875, "rewards/margins": 6.11328125, "rewards/rejected": -4.673828125, "step": 2676 }, { "epoch": 0.5058340025508999, "grad_norm": 2.12410726044086, "learning_rate": 6.19618533462277e-07, "logits/chosen": 2.9921875, "logits/rejected": 3.0625, "logps/chosen": -866.5, "logps/rejected": -1195.0, "loss": 0.651, "rewards/accuracies": 0.65625, "rewards/chosen": 1.2705078125, "rewards/margins": 4.43359375, "rewards/rejected": -3.169921875, "step": 2677 }, { "epoch": 0.5060229580991071, "grad_norm": 2.865550355896608, "learning_rate": 6.193252796422521e-07, "logits/chosen": 2.70703125, "logits/rejected": 2.328125, "logps/chosen": -772.0, "logps/rejected": -651.5, "loss": 0.6842, "rewards/accuracies": 0.75, "rewards/chosen": 0.858642578125, "rewards/margins": 2.953125, "rewards/rejected": -2.093505859375, "step": 2678 }, { "epoch": 0.5062119136473144, "grad_norm": 2.261629288763089, "learning_rate": 6.190319956623065e-07, "logits/chosen": 2.0888671875, "logits/rejected": 1.6650390625, "logps/chosen": -751.0, "logps/rejected": -1446.0, "loss": 0.5279, "rewards/accuracies": 0.90625, "rewards/chosen": 0.56640625, "rewards/margins": 4.95703125, "rewards/rejected": -4.388671875, "step": 2679 }, { "epoch": 0.5064008691955217, "grad_norm": 2.0689820172990583, "learning_rate": 6.18738681650033e-07, "logits/chosen": 3.03125, "logits/rejected": 2.7734375, "logps/chosen": -1060.0, "logps/rejected": -1053.0, "loss": 0.5796, "rewards/accuracies": 0.8125, "rewards/chosen": 1.28271484375, "rewards/margins": 5.1875, "rewards/rejected": -3.91015625, "step": 2680 }, { "epoch": 0.506589824743729, "grad_norm": 2.0204612695510558, "learning_rate": 6.18445337733038e-07, "logits/chosen": 1.611328125, "logits/rejected": 2.07659912109375, "logps/chosen": -846.0, "logps/rejected": -978.0, "loss": 0.5154, "rewards/accuracies": 0.875, "rewards/chosen": 0.3929443359375, "rewards/margins": 4.765625, "rewards/rejected": -4.375, "step": 2681 }, { "epoch": 0.5067787802919363, "grad_norm": 2.508948824392915, "learning_rate": 6.181519640389403e-07, "logits/chosen": 2.912109375, "logits/rejected": 2.4482421875, "logps/chosen": -816.0, "logps/rejected": -701.0, "loss": 0.6869, "rewards/accuracies": 0.71875, "rewards/chosen": 1.16748046875, "rewards/margins": 3.65234375, "rewards/rejected": -2.48828125, "step": 2682 }, { "epoch": 0.5069677358401437, "grad_norm": 2.7219310542631794, "learning_rate": 6.17858560695372e-07, "logits/chosen": 3.15625, "logits/rejected": 3.3828125, "logps/chosen": -578.5, "logps/rejected": -721.0, "loss": 0.6003, "rewards/accuracies": 0.78125, "rewards/chosen": 0.83056640625, "rewards/margins": 3.72265625, "rewards/rejected": -2.890625, "step": 2683 }, { "epoch": 0.5071566913883508, "grad_norm": 1.8639761761064877, "learning_rate": 6.175651278299778e-07, "logits/chosen": 2.8125, "logits/rejected": 2.8671875, "logps/chosen": -592.5, "logps/rejected": -2024.5, "loss": 0.5856, "rewards/accuracies": 0.84375, "rewards/chosen": 0.50341796875, "rewards/margins": 5.458984375, "rewards/rejected": -4.953125, "step": 2684 }, { "epoch": 0.5073456469365581, "grad_norm": 2.1865199419096792, "learning_rate": 6.172716655704163e-07, "logits/chosen": 2.623046875, "logits/rejected": 2.3828125, "logps/chosen": -738.0, "logps/rejected": -556.5, "loss": 0.762, "rewards/accuracies": 0.65625, "rewards/chosen": 0.267578125, "rewards/margins": 2.380859375, "rewards/rejected": -2.11328125, "step": 2685 }, { "epoch": 0.5075346024847655, "grad_norm": 2.012989995459834, "learning_rate": 6.169781740443572e-07, "logits/chosen": 1.525390625, "logits/rejected": 1.23046875, "logps/chosen": -834.0, "logps/rejected": -834.0, "loss": 0.4659, "rewards/accuracies": 0.96875, "rewards/chosen": 0.587646484375, "rewards/margins": 5.7109375, "rewards/rejected": -5.1328125, "step": 2686 }, { "epoch": 0.5077235580329728, "grad_norm": 1.7761885720019583, "learning_rate": 6.166846533794844e-07, "logits/chosen": 1.3896484375, "logits/rejected": 1.711181640625, "logps/chosen": -800.5, "logps/rejected": -1002.0, "loss": 0.661, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17431640625, "rewards/margins": 3.6923828125, "rewards/rejected": -3.51953125, "step": 2687 }, { "epoch": 0.5079125135811801, "grad_norm": 1.8138165013372423, "learning_rate": 6.163911037034934e-07, "logits/chosen": 1.8212890625, "logits/rejected": 1.6689453125, "logps/chosen": -694.5, "logps/rejected": -624.0, "loss": 0.5044, "rewards/accuracies": 0.96875, "rewards/chosen": 1.111328125, "rewards/margins": 4.1953125, "rewards/rejected": -3.07421875, "step": 2688 }, { "epoch": 0.5081014691293874, "grad_norm": 2.4844942820657008, "learning_rate": 6.160975251440932e-07, "logits/chosen": 1.87890625, "logits/rejected": 2.08984375, "logps/chosen": -1227.0, "logps/rejected": -1509.0, "loss": 0.5308, "rewards/accuracies": 0.84375, "rewards/chosen": 1.69580078125, "rewards/margins": 5.57421875, "rewards/rejected": -3.875, "step": 2689 }, { "epoch": 0.5082904246775946, "grad_norm": 1.9248641002627271, "learning_rate": 6.15803917829005e-07, "logits/chosen": 1.2978515625, "logits/rejected": 0.9833984375, "logps/chosen": -1074.0, "logps/rejected": -972.0, "loss": 0.5551, "rewards/accuracies": 0.875, "rewards/chosen": 1.701171875, "rewards/margins": 5.828125, "rewards/rejected": -4.12890625, "step": 2690 }, { "epoch": 0.5084793802258019, "grad_norm": 1.9124933680205027, "learning_rate": 6.15510281885962e-07, "logits/chosen": 2.453125, "logits/rejected": 2.3408203125, "logps/chosen": -469.0, "logps/rejected": -486.5, "loss": 0.7123, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17626953125, "rewards/margins": 2.98828125, "rewards/rejected": -3.158203125, "step": 2691 }, { "epoch": 0.5086683357740092, "grad_norm": 1.9601422597752567, "learning_rate": 6.15216617442711e-07, "logits/chosen": 3.546875, "logits/rejected": 3.51953125, "logps/chosen": -781.0, "logps/rejected": -1347.0, "loss": 0.7089, "rewards/accuracies": 0.71875, "rewards/chosen": 0.30810546875, "rewards/margins": 5.3779296875, "rewards/rejected": -5.071533203125, "step": 2692 }, { "epoch": 0.5088572913222165, "grad_norm": 2.582435749562145, "learning_rate": 6.149229246270099e-07, "logits/chosen": 2.167236328125, "logits/rejected": 1.76904296875, "logps/chosen": -691.5, "logps/rejected": -780.0, "loss": 0.5546, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8753662109375, "rewards/margins": 4.8125, "rewards/rejected": -3.9453125, "step": 2693 }, { "epoch": 0.5090462468704238, "grad_norm": 1.4764706530129692, "learning_rate": 6.1462920356663e-07, "logits/chosen": 3.197265625, "logits/rejected": 2.98046875, "logps/chosen": -773.5, "logps/rejected": -1268.0, "loss": 0.5017, "rewards/accuracies": 0.84375, "rewards/chosen": 0.722412109375, "rewards/margins": 5.5, "rewards/rejected": -4.78125, "step": 2694 }, { "epoch": 0.5092352024186311, "grad_norm": 2.1909616786391775, "learning_rate": 6.143354543893543e-07, "logits/chosen": 2.4453125, "logits/rejected": 2.279296875, "logps/chosen": -469.0, "logps/rejected": -760.0, "loss": 0.5143, "rewards/accuracies": 0.90625, "rewards/chosen": 0.61376953125, "rewards/margins": 4.9140625, "rewards/rejected": -4.3046875, "step": 2695 }, { "epoch": 0.5094241579668383, "grad_norm": 1.73888595440854, "learning_rate": 6.140416772229784e-07, "logits/chosen": 2.17724609375, "logits/rejected": 1.670654296875, "logps/chosen": -881.0, "logps/rejected": -1126.0, "loss": 0.6033, "rewards/accuracies": 0.78125, "rewards/chosen": 1.166015625, "rewards/margins": 5.046875, "rewards/rejected": -3.875, "step": 2696 }, { "epoch": 0.5096131135150456, "grad_norm": 2.0788940760692864, "learning_rate": 6.137478721953096e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.708984375, "logps/chosen": -1230.0, "logps/rejected": -1395.0, "loss": 0.5235, "rewards/accuracies": 0.875, "rewards/chosen": 1.5107421875, "rewards/margins": 5.19921875, "rewards/rejected": -3.6875, "step": 2697 }, { "epoch": 0.5098020690632529, "grad_norm": 1.9597362099884776, "learning_rate": 6.134540394341676e-07, "logits/chosen": 2.4326171875, "logits/rejected": 2.5546875, "logps/chosen": -770.875, "logps/rejected": -856.0, "loss": 0.6379, "rewards/accuracies": 0.875, "rewards/chosen": 1.50732421875, "rewards/margins": 3.703125, "rewards/rejected": -2.19189453125, "step": 2698 }, { "epoch": 0.5099910246114602, "grad_norm": 1.705727643530876, "learning_rate": 6.131601790673844e-07, "logits/chosen": 3.68359375, "logits/rejected": 3.92578125, "logps/chosen": -975.0, "logps/rejected": -1116.0, "loss": 0.5334, "rewards/accuracies": 0.875, "rewards/chosen": 1.390625, "rewards/margins": 5.375, "rewards/rejected": -3.98828125, "step": 2699 }, { "epoch": 0.5101799801596675, "grad_norm": 2.0974961804219303, "learning_rate": 6.128662912228038e-07, "logits/chosen": 2.439453125, "logits/rejected": 2.380859375, "logps/chosen": -497.0, "logps/rejected": -686.5, "loss": 0.7233, "rewards/accuracies": 0.65625, "rewards/chosen": -1.134033203125, "rewards/margins": 0.689453125, "rewards/rejected": -1.8251953125, "step": 2700 }, { "epoch": 0.5103689357078747, "grad_norm": 1.5609870317562937, "learning_rate": 6.125723760282813e-07, "logits/chosen": 1.634765625, "logits/rejected": 1.4443359375, "logps/chosen": -16170.5, "logps/rejected": -656.5, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": 58.3388671875, "rewards/margins": 61.32421875, "rewards/rejected": -3.029296875, "step": 2701 }, { "epoch": 0.510557891256082, "grad_norm": 2.4112847697991597, "learning_rate": 6.122784336116844e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.671875, "logps/chosen": -663.5, "logps/rejected": -1585.0, "loss": 0.5607, "rewards/accuracies": 0.8125, "rewards/chosen": 0.91845703125, "rewards/margins": 6.4765625, "rewards/rejected": -5.55859375, "step": 2702 }, { "epoch": 0.5107468468042893, "grad_norm": 1.8562820868107497, "learning_rate": 6.119844641008931e-07, "logits/chosen": 2.21484375, "logits/rejected": 2.037109375, "logps/chosen": -666.0, "logps/rejected": -657.0, "loss": 0.643, "rewards/accuracies": 0.71875, "rewards/chosen": 0.77001953125, "rewards/margins": 4.0322265625, "rewards/rejected": -3.2607421875, "step": 2703 }, { "epoch": 0.5109358023524966, "grad_norm": 2.7746328780956753, "learning_rate": 6.116904676237982e-07, "logits/chosen": 3.20703125, "logits/rejected": 3.3125, "logps/chosen": -992.0, "logps/rejected": -1362.0, "loss": 0.6322, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3837890625, "rewards/margins": 3.83984375, "rewards/rejected": -2.462890625, "step": 2704 }, { "epoch": 0.5111247579007039, "grad_norm": 2.6368048227728815, "learning_rate": 6.113964443083028e-07, "logits/chosen": 2.859375, "logits/rejected": 2.6259765625, "logps/chosen": -698.0, "logps/rejected": -1116.0, "loss": 0.6341, "rewards/accuracies": 0.6875, "rewards/chosen": 1.248046875, "rewards/margins": 2.75390625, "rewards/rejected": -1.505859375, "step": 2705 }, { "epoch": 0.5113137134489112, "grad_norm": 1.5346058831049878, "learning_rate": 6.111023942823215e-07, "logits/chosen": 2.66015625, "logits/rejected": 3.234375, "logps/chosen": -940.5, "logps/rejected": -1345.0, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": -1.29345703125, "rewards/margins": 7.87109375, "rewards/rejected": -9.171875, "step": 2706 }, { "epoch": 0.5115026689971184, "grad_norm": 3.300465612721547, "learning_rate": 6.10808317673781e-07, "logits/chosen": 2.392578125, "logits/rejected": 2.12109375, "logps/chosen": -884.0, "logps/rejected": -839.0, "loss": 0.5404, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26171875, "rewards/margins": 3.912109375, "rewards/rejected": -3.65234375, "step": 2707 }, { "epoch": 0.5116916245453257, "grad_norm": 1.8461623612522506, "learning_rate": 6.105142146106187e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.287109375, "logps/chosen": -658.0, "logps/rejected": -748.5, "loss": 0.638, "rewards/accuracies": 0.84375, "rewards/chosen": 0.18505859375, "rewards/margins": 3.47265625, "rewards/rejected": -3.29296875, "step": 2708 }, { "epoch": 0.511880580093533, "grad_norm": 3.404710019944166, "learning_rate": 6.10220085220784e-07, "logits/chosen": 2.111328125, "logits/rejected": 1.818359375, "logps/chosen": -820.5, "logps/rejected": -1750.0, "loss": 0.493, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1123046875, "rewards/margins": 6.2265625, "rewards/rejected": -5.1171875, "step": 2709 }, { "epoch": 0.5120695356417403, "grad_norm": 2.5762026777381157, "learning_rate": 6.099259296322379e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.748046875, "logps/chosen": -909.0, "logps/rejected": -952.0, "loss": 0.5209, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7354736328125, "rewards/margins": 5.375, "rewards/rejected": -4.6328125, "step": 2710 }, { "epoch": 0.5122584911899476, "grad_norm": 2.593092968950804, "learning_rate": 6.096317479729525e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.2216796875, "logps/chosen": -762.0, "logps/rejected": -485.5, "loss": 0.63, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1044921875, "rewards/margins": 3.90625, "rewards/rejected": -4.015625, "step": 2711 }, { "epoch": 0.5124474467381549, "grad_norm": 2.0186780115125895, "learning_rate": 6.093375403709112e-07, "logits/chosen": 2.765625, "logits/rejected": 2.7421875, "logps/chosen": -714.5, "logps/rejected": -799.0, "loss": 0.6465, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4853515625, "rewards/margins": 3.81640625, "rewards/rejected": -4.296875, "step": 2712 }, { "epoch": 0.5126364022863621, "grad_norm": 1.809448529869209, "learning_rate": 6.090433069541092e-07, "logits/chosen": 2.958984375, "logits/rejected": 2.525390625, "logps/chosen": -803.0, "logps/rejected": -727.0, "loss": 0.6664, "rewards/accuracies": 0.6875, "rewards/chosen": 0.255615234375, "rewards/margins": 3.73828125, "rewards/rejected": -3.48046875, "step": 2713 }, { "epoch": 0.5128253578345694, "grad_norm": 2.098756851159424, "learning_rate": 6.087490478505524e-07, "logits/chosen": 2.546875, "logits/rejected": 2.89453125, "logps/chosen": -807.5, "logps/rejected": -2075.0, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": -0.5888671875, "rewards/margins": 4.763671875, "rewards/rejected": -5.35546875, "step": 2714 }, { "epoch": 0.5130143133827767, "grad_norm": 2.03807042091429, "learning_rate": 6.084547631882578e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.337890625, "logps/chosen": -886.5, "logps/rejected": -1319.0, "loss": 0.6215, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03125, "rewards/margins": 6.515625, "rewards/rejected": -6.53125, "step": 2715 }, { "epoch": 0.513203268930984, "grad_norm": 2.2035732456147517, "learning_rate": 6.081604530952542e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.677734375, "logps/chosen": -761.5, "logps/rejected": -1391.0, "loss": 0.57, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6171875, "rewards/margins": 7.32421875, "rewards/rejected": -6.7109375, "step": 2716 }, { "epoch": 0.5133922244791913, "grad_norm": 3.0826282237386082, "learning_rate": 6.078661176995808e-07, "logits/chosen": 3.1640625, "logits/rejected": 2.97265625, "logps/chosen": -1149.0, "logps/rejected": -892.0, "loss": 0.6104, "rewards/accuracies": 0.78125, "rewards/chosen": 0.523681640625, "rewards/margins": 3.98828125, "rewards/rejected": -3.4609375, "step": 2717 }, { "epoch": 0.5135811800273986, "grad_norm": 2.44304772017521, "learning_rate": 6.07571757129288e-07, "logits/chosen": 2.322265625, "logits/rejected": 1.76171875, "logps/chosen": -827.0, "logps/rejected": -942.0, "loss": 0.6987, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33203125, "rewards/margins": 3.6796875, "rewards/rejected": -4.0, "step": 2718 }, { "epoch": 0.5137701355756058, "grad_norm": 3.0360572240905994, "learning_rate": 6.072773715124371e-07, "logits/chosen": 2.78125, "logits/rejected": 2.623046875, "logps/chosen": -570.0, "logps/rejected": -620.5, "loss": 0.6077, "rewards/accuracies": 0.8125, "rewards/chosen": 0.033203125, "rewards/margins": 3.8876953125, "rewards/rejected": -3.8515625, "step": 2719 }, { "epoch": 0.5139590911238131, "grad_norm": 2.8617936208764396, "learning_rate": 6.069829609771006e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.51953125, "logps/chosen": -894.5, "logps/rejected": -728.5, "loss": 0.6383, "rewards/accuracies": 0.78125, "rewards/chosen": 0.81591796875, "rewards/margins": 3.9375, "rewards/rejected": -3.125, "step": 2720 }, { "epoch": 0.5141480466720204, "grad_norm": 2.4432239119217147, "learning_rate": 6.066885256513615e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.537109375, "logps/chosen": -848.0, "logps/rejected": -841.0, "loss": 0.6358, "rewards/accuracies": 0.8125, "rewards/chosen": 0.58447265625, "rewards/margins": 4.71484375, "rewards/rejected": -4.13671875, "step": 2721 }, { "epoch": 0.5143370022202277, "grad_norm": 2.275746950482248, "learning_rate": 6.063940656633138e-07, "logits/chosen": 3.1875, "logits/rejected": 2.798828125, "logps/chosen": -808.0, "logps/rejected": -927.0, "loss": 0.5924, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4072265625, "rewards/margins": 4.7822265625, "rewards/rejected": -4.3671875, "step": 2722 }, { "epoch": 0.514525957768435, "grad_norm": 3.834304483201369, "learning_rate": 6.060995811410622e-07, "logits/chosen": 1.70086669921875, "logits/rejected": 1.614990234375, "logps/chosen": -808.0, "logps/rejected": -1008.0, "loss": 0.7025, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1572265625, "rewards/margins": 3.8271484375, "rewards/rejected": -3.66796875, "step": 2723 }, { "epoch": 0.5147149133166422, "grad_norm": 2.3281243933518456, "learning_rate": 6.058050722127217e-07, "logits/chosen": 2.560546875, "logits/rejected": 1.91796875, "logps/chosen": -748.5, "logps/rejected": -899.0, "loss": 0.5319, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7080078125, "rewards/margins": 4.91015625, "rewards/rejected": -4.2109375, "step": 2724 }, { "epoch": 0.5149038688648495, "grad_norm": 2.171899392186939, "learning_rate": 6.055105390064186e-07, "logits/chosen": 3.2109375, "logits/rejected": 2.34765625, "logps/chosen": -740.5, "logps/rejected": -801.0, "loss": 0.4501, "rewards/accuracies": 1.0, "rewards/chosen": 1.115234375, "rewards/margins": 6.78125, "rewards/rejected": -5.6640625, "step": 2725 }, { "epoch": 0.5150928244130568, "grad_norm": 3.213106690601998, "learning_rate": 6.052159816502891e-07, "logits/chosen": 1.966796875, "logits/rejected": 2.1572265625, "logps/chosen": -480.0, "logps/rejected": -888.0, "loss": 0.6082, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4190673828125, "rewards/margins": 4.77734375, "rewards/rejected": -4.3671875, "step": 2726 }, { "epoch": 0.5152817799612641, "grad_norm": 1.9356097831002335, "learning_rate": 6.049214002724804e-07, "logits/chosen": 2.875, "logits/rejected": 2.736328125, "logps/chosen": -812.0, "logps/rejected": -709.0, "loss": 0.7788, "rewards/accuracies": 0.625, "rewards/chosen": -1.001220703125, "rewards/margins": 2.400390625, "rewards/rejected": -3.3984375, "step": 2727 }, { "epoch": 0.5154707355094714, "grad_norm": 3.5310348280318204, "learning_rate": 6.046267950011498e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.955078125, "logps/chosen": -747.5, "logps/rejected": -1242.0, "loss": 0.5833, "rewards/accuracies": 0.8125, "rewards/chosen": 0.32476806640625, "rewards/margins": 5.5078125, "rewards/rejected": -5.18359375, "step": 2728 }, { "epoch": 0.5156596910576787, "grad_norm": 2.2324225346923856, "learning_rate": 6.043321659644654e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.203125, "logps/chosen": -516.5, "logps/rejected": -606.0, "loss": 0.6397, "rewards/accuracies": 0.84375, "rewards/chosen": 0.272216796875, "rewards/margins": 3.328125, "rewards/rejected": -3.05859375, "step": 2729 }, { "epoch": 0.5158486466058859, "grad_norm": 1.8638228434889053, "learning_rate": 6.04037513290605e-07, "logits/chosen": 1.80859375, "logits/rejected": 2.27734375, "logps/chosen": -724.5, "logps/rejected": -842.0, "loss": 0.5834, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1640625, "rewards/margins": 4.56640625, "rewards/rejected": -4.40625, "step": 2730 }, { "epoch": 0.5160376021540932, "grad_norm": 2.167185694371444, "learning_rate": 6.037428371077574e-07, "logits/chosen": 1.998046875, "logits/rejected": 1.78662109375, "logps/chosen": -740.0, "logps/rejected": -817.5, "loss": 0.5656, "rewards/accuracies": 0.8125, "rewards/chosen": 0.70849609375, "rewards/margins": 4.322265625, "rewards/rejected": -3.609375, "step": 2731 }, { "epoch": 0.5162265577023005, "grad_norm": 2.806013115034353, "learning_rate": 6.03448137544121e-07, "logits/chosen": 2.123046875, "logits/rejected": 2.119140625, "logps/chosen": -834.0, "logps/rejected": -1252.0, "loss": 0.6213, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1552734375, "rewards/margins": 4.59375, "rewards/rejected": -3.4296875, "step": 2732 }, { "epoch": 0.5164155132505078, "grad_norm": 1.9114976723998685, "learning_rate": 6.03153414727905e-07, "logits/chosen": 1.2841796875, "logits/rejected": 1.4765625, "logps/chosen": -787.5, "logps/rejected": -1120.0, "loss": 0.532, "rewards/accuracies": 0.9375, "rewards/chosen": 1.548828125, "rewards/margins": 5.51953125, "rewards/rejected": -3.96875, "step": 2733 }, { "epoch": 0.5166044687987151, "grad_norm": 2.5310536045444487, "learning_rate": 6.02858668787328e-07, "logits/chosen": 2.8505859375, "logits/rejected": 2.384765625, "logps/chosen": -896.0, "logps/rejected": -1181.5, "loss": 0.481, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7947998046875, "rewards/margins": 4.85546875, "rewards/rejected": -4.05078125, "step": 2734 }, { "epoch": 0.5167934243469224, "grad_norm": 2.127455236899954, "learning_rate": 6.025638998506194e-07, "logits/chosen": 3.26171875, "logits/rejected": 3.09765625, "logps/chosen": -1026.5, "logps/rejected": -873.0, "loss": 0.6051, "rewards/accuracies": 0.8125, "rewards/chosen": 1.33544921875, "rewards/margins": 4.26171875, "rewards/rejected": -2.92578125, "step": 2735 }, { "epoch": 0.5169823798951296, "grad_norm": 2.3774753800914423, "learning_rate": 6.022691080460181e-07, "logits/chosen": 2.71875, "logits/rejected": 2.34375, "logps/chosen": -531.25, "logps/rejected": -514.0, "loss": 0.6342, "rewards/accuracies": 0.875, "rewards/chosen": 0.2890625, "rewards/margins": 3.65625, "rewards/rejected": -3.359375, "step": 2736 }, { "epoch": 0.5171713354433369, "grad_norm": 1.820135333573835, "learning_rate": 6.01974293501773e-07, "logits/chosen": 3.4609375, "logits/rejected": 2.91796875, "logps/chosen": -1090.0, "logps/rejected": -1388.0, "loss": 0.4916, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8818359375, "rewards/margins": 7.0546875, "rewards/rejected": -5.171875, "step": 2737 }, { "epoch": 0.5173602909915442, "grad_norm": 1.9200705891026522, "learning_rate": 6.016794563461433e-07, "logits/chosen": 2.296875, "logits/rejected": 1.978515625, "logps/chosen": -487.25, "logps/rejected": -941.5, "loss": 0.6097, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12939453125, "rewards/margins": 4.69921875, "rewards/rejected": -4.5625, "step": 2738 }, { "epoch": 0.5175492465397515, "grad_norm": 2.828644287340454, "learning_rate": 6.013845967073973e-07, "logits/chosen": 2.1552734375, "logits/rejected": 2.37890625, "logps/chosen": -435.25, "logps/rejected": -530.5, "loss": 0.7345, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0966796875, "rewards/margins": 2.322265625, "rewards/rejected": -2.2265625, "step": 2739 }, { "epoch": 0.5177382020879588, "grad_norm": 2.2158212498741405, "learning_rate": 6.010897147138137e-07, "logits/chosen": 1.8271484375, "logits/rejected": 1.125244140625, "logps/chosen": -598.0, "logps/rejected": -642.5, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.544189453125, "rewards/margins": 3.84765625, "rewards/rejected": -3.314453125, "step": 2740 }, { "epoch": 0.5179271576361661, "grad_norm": 1.1848725024458495, "learning_rate": 6.007948104936808e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.66796875, "logps/chosen": -729.0, "logps/rejected": -769.0, "loss": 0.6045, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7646484375, "rewards/margins": 4.77734375, "rewards/rejected": -3.009765625, "step": 2741 }, { "epoch": 0.5181161131843733, "grad_norm": 2.2504225869731296, "learning_rate": 6.004998841752966e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.533203125, "logps/chosen": -1069.0, "logps/rejected": -953.0, "loss": 0.4792, "rewards/accuracies": 0.875, "rewards/chosen": 1.5087890625, "rewards/margins": 5.8046875, "rewards/rejected": -4.30078125, "step": 2742 }, { "epoch": 0.5183050687325806, "grad_norm": 2.22707660209811, "learning_rate": 6.002049358869683e-07, "logits/chosen": 2.685546875, "logits/rejected": 2.646484375, "logps/chosen": -608.5, "logps/rejected": -510.5, "loss": 0.5899, "rewards/accuracies": 0.71875, "rewards/chosen": 1.041015625, "rewards/margins": 3.79296875, "rewards/rejected": -2.75439453125, "step": 2743 }, { "epoch": 0.518494024280788, "grad_norm": 1.8821309859875923, "learning_rate": 5.999099657570133e-07, "logits/chosen": 1.50390625, "logits/rejected": 0.9794921875, "logps/chosen": -758.0, "logps/rejected": -820.0, "loss": 0.5144, "rewards/accuracies": 0.96875, "rewards/chosen": 1.19384765625, "rewards/margins": 4.58984375, "rewards/rejected": -3.39453125, "step": 2744 }, { "epoch": 0.5186829798289952, "grad_norm": 2.4118116541071664, "learning_rate": 5.996149739137578e-07, "logits/chosen": 2.2841796875, "logits/rejected": 1.868896484375, "logps/chosen": -539.5, "logps/rejected": -1127.5, "loss": 0.5437, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05859375, "rewards/margins": 6.078125, "rewards/rejected": -6.013671875, "step": 2745 }, { "epoch": 0.5188719353772026, "grad_norm": 2.1488148622689507, "learning_rate": 5.993199604855381e-07, "logits/chosen": 1.96875, "logits/rejected": 1.8046875, "logps/chosen": -1115.5, "logps/rejected": -1591.0, "loss": 0.5288, "rewards/accuracies": 0.90625, "rewards/chosen": 1.142578125, "rewards/margins": 8.5546875, "rewards/rejected": -7.41015625, "step": 2746 }, { "epoch": 0.5190608909254097, "grad_norm": 2.3330465151343156, "learning_rate": 5.990249256006996e-07, "logits/chosen": 1.50146484375, "logits/rejected": 1.51513671875, "logps/chosen": -16638.0, "logps/rejected": -2012.0, "loss": 0.5471, "rewards/accuracies": 0.8125, "rewards/chosen": 58.2373046875, "rewards/margins": 62.40625, "rewards/rejected": -4.16015625, "step": 2747 }, { "epoch": 0.519249846473617, "grad_norm": 1.6107876767495803, "learning_rate": 5.98729869387597e-07, "logits/chosen": 1.6240234375, "logits/rejected": 1.5625, "logps/chosen": -642.5, "logps/rejected": -556.0, "loss": 0.7122, "rewards/accuracies": 0.75, "rewards/chosen": -0.2646484375, "rewards/margins": 3.30859375, "rewards/rejected": -3.57421875, "step": 2748 }, { "epoch": 0.5194388020218244, "grad_norm": 2.55960262812896, "learning_rate": 5.984347919745944e-07, "logits/chosen": 1.02392578125, "logits/rejected": 0.812744140625, "logps/chosen": -693.0, "logps/rejected": -913.0, "loss": 0.5084, "rewards/accuracies": 0.875, "rewards/chosen": 0.36962890625, "rewards/margins": 5.421875, "rewards/rejected": -5.046875, "step": 2749 }, { "epoch": 0.5196277575700317, "grad_norm": 2.535328159109146, "learning_rate": 5.981396934900647e-07, "logits/chosen": 2.837890625, "logits/rejected": 2.291015625, "logps/chosen": -940.0, "logps/rejected": -1033.0, "loss": 0.7171, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7156982421875, "rewards/margins": 4.515625, "rewards/rejected": -3.79296875, "step": 2750 }, { "epoch": 0.519816713118239, "grad_norm": 2.040081947775141, "learning_rate": 5.978445740623908e-07, "logits/chosen": 2.19140625, "logits/rejected": 1.60546875, "logps/chosen": -770.5, "logps/rejected": -758.0, "loss": 0.5551, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6650390625, "rewards/margins": 5.21484375, "rewards/rejected": -4.55078125, "step": 2751 }, { "epoch": 0.5200056686664463, "grad_norm": 1.9150850925233442, "learning_rate": 5.975494338199641e-07, "logits/chosen": 2.38671875, "logits/rejected": 2.072265625, "logps/chosen": -593.0, "logps/rejected": -1970.0, "loss": 0.6586, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4873046875, "rewards/margins": 17.19140625, "rewards/rejected": -17.640625, "step": 2752 }, { "epoch": 0.5201946242146535, "grad_norm": 5.458771339307406, "learning_rate": 5.972542728911849e-07, "logits/chosen": 1.734375, "logits/rejected": 1.09423828125, "logps/chosen": -799.0, "logps/rejected": -998.0, "loss": 0.6307, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4189453125, "rewards/margins": 4.8359375, "rewards/rejected": -5.2578125, "step": 2753 }, { "epoch": 0.5203835797628608, "grad_norm": 2.8816337433746884, "learning_rate": 5.96959091404463e-07, "logits/chosen": 1.544921875, "logits/rejected": 1.263427734375, "logps/chosen": -946.0, "logps/rejected": -915.0, "loss": 0.5786, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1474609375, "rewards/margins": 5.5625, "rewards/rejected": -5.6953125, "step": 2754 }, { "epoch": 0.5205725353110681, "grad_norm": 3.3990222283678966, "learning_rate": 5.966638894882169e-07, "logits/chosen": 1.5625, "logits/rejected": 1.1279296875, "logps/chosen": -699.0, "logps/rejected": -704.5, "loss": 0.645, "rewards/accuracies": 0.75, "rewards/chosen": 0.17626953125, "rewards/margins": 4.44140625, "rewards/rejected": -4.2578125, "step": 2755 }, { "epoch": 0.5207614908592754, "grad_norm": 2.3772157270963903, "learning_rate": 5.963686672708742e-07, "logits/chosen": 1.67236328125, "logits/rejected": 1.1807861328125, "logps/chosen": -720.0, "logps/rejected": -1042.0, "loss": 0.6071, "rewards/accuracies": 0.875, "rewards/chosen": -0.421630859375, "rewards/margins": 7.36328125, "rewards/rejected": -7.7890625, "step": 2756 }, { "epoch": 0.5209504464074827, "grad_norm": 3.6776268556794425, "learning_rate": 5.960734248808709e-07, "logits/chosen": 2.611328125, "logits/rejected": 1.98828125, "logps/chosen": -496.5, "logps/rejected": -678.75, "loss": 0.6806, "rewards/accuracies": 0.75, "rewards/chosen": -0.2255859375, "rewards/margins": 5.0859375, "rewards/rejected": -5.3125, "step": 2757 }, { "epoch": 0.52113940195569, "grad_norm": 1.988454637574645, "learning_rate": 5.957781624466519e-07, "logits/chosen": 2.4161376953125, "logits/rejected": 2.099609375, "logps/chosen": -769.5, "logps/rejected": -742.0, "loss": 0.707, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3427734375, "rewards/margins": 4.7578125, "rewards/rejected": -4.41796875, "step": 2758 }, { "epoch": 0.5213283575038972, "grad_norm": 3.4800339886817735, "learning_rate": 5.954828800966713e-07, "logits/chosen": 1.84765625, "logits/rejected": 2.142578125, "logps/chosen": -892.5, "logps/rejected": -1026.0, "loss": 0.5297, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9794921875, "rewards/margins": 5.80859375, "rewards/rejected": -4.84375, "step": 2759 }, { "epoch": 0.5215173130521045, "grad_norm": 2.3025660913469688, "learning_rate": 5.951875779593913e-07, "logits/chosen": 1.8447265625, "logits/rejected": 1.12109375, "logps/chosen": -735.5, "logps/rejected": -815.0, "loss": 0.5799, "rewards/accuracies": 0.875, "rewards/chosen": 0.452392578125, "rewards/margins": 6.1484375, "rewards/rejected": -5.69140625, "step": 2760 }, { "epoch": 0.5217062686003118, "grad_norm": 2.0473324692573707, "learning_rate": 5.948922561632829e-07, "logits/chosen": 1.241943359375, "logits/rejected": 1.2003173828125, "logps/chosen": -658.0, "logps/rejected": -2360.0, "loss": 0.6259, "rewards/accuracies": 0.90625, "rewards/chosen": 0.090087890625, "rewards/margins": 10.34375, "rewards/rejected": -10.234375, "step": 2761 }, { "epoch": 0.5218952241485191, "grad_norm": 2.5775039690901065, "learning_rate": 5.945969148368259e-07, "logits/chosen": 2.16796875, "logits/rejected": 2.1923828125, "logps/chosen": -904.0, "logps/rejected": -1471.0, "loss": 0.7068, "rewards/accuracies": 0.75, "rewards/chosen": -0.439453125, "rewards/margins": 8.66845703125, "rewards/rejected": -9.11572265625, "step": 2762 }, { "epoch": 0.5220841796967264, "grad_norm": 1.9940010073759304, "learning_rate": 5.94301554108508e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.537109375, "logps/chosen": -1160.5, "logps/rejected": -983.0, "loss": 0.6224, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2083740234375, "rewards/margins": 5.052734375, "rewards/rejected": -3.84765625, "step": 2763 }, { "epoch": 0.5222731352449337, "grad_norm": 1.8959177081735035, "learning_rate": 5.940061741068261e-07, "logits/chosen": 2.103515625, "logits/rejected": 1.7412109375, "logps/chosen": -1091.0, "logps/rejected": -1126.0, "loss": 0.4436, "rewards/accuracies": 0.90625, "rewards/chosen": 1.380859375, "rewards/margins": 5.59375, "rewards/rejected": -4.2109375, "step": 2764 }, { "epoch": 0.5224620907931409, "grad_norm": 4.01200187190484, "learning_rate": 5.937107749602848e-07, "logits/chosen": 1.9609375, "logits/rejected": 1.5537109375, "logps/chosen": -810.5, "logps/rejected": -754.5, "loss": 0.5366, "rewards/accuracies": 0.84375, "rewards/chosen": -0.03662109375, "rewards/margins": 4.91015625, "rewards/rejected": -4.9375, "step": 2765 }, { "epoch": 0.5226510463413482, "grad_norm": 1.4004742131380374, "learning_rate": 5.934153567973974e-07, "logits/chosen": 3.48046875, "logits/rejected": 3.06640625, "logps/chosen": -640.5, "logps/rejected": -732.0, "loss": 0.7397, "rewards/accuracies": 0.625, "rewards/chosen": -0.091796875, "rewards/margins": 2.66455078125, "rewards/rejected": -2.7578125, "step": 2766 }, { "epoch": 0.5228400018895555, "grad_norm": 2.5581370578039917, "learning_rate": 5.931199197466853e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.837890625, "logps/chosen": -896.0, "logps/rejected": -1012.0, "loss": 0.5223, "rewards/accuracies": 0.875, "rewards/chosen": 0.5855712890625, "rewards/margins": 6.03125, "rewards/rejected": -5.4375, "step": 2767 }, { "epoch": 0.5230289574377628, "grad_norm": 2.1136290955863806, "learning_rate": 5.928244639366783e-07, "logits/chosen": 2.630859375, "logits/rejected": 3.2109375, "logps/chosen": -1087.5, "logps/rejected": -1288.0, "loss": 0.6404, "rewards/accuracies": 0.75, "rewards/chosen": 0.365234375, "rewards/margins": 6.15625, "rewards/rejected": -5.80078125, "step": 2768 }, { "epoch": 0.5232179129859701, "grad_norm": 2.6955034931931534, "learning_rate": 5.925289894959145e-07, "logits/chosen": 2.79296875, "logits/rejected": 2.2353515625, "logps/chosen": -811.0, "logps/rejected": -739.0, "loss": 0.6141, "rewards/accuracies": 0.71875, "rewards/chosen": 1.07275390625, "rewards/margins": 3.546875, "rewards/rejected": -2.462890625, "step": 2769 }, { "epoch": 0.5234068685341773, "grad_norm": 1.7958129062682535, "learning_rate": 5.922334965529394e-07, "logits/chosen": 3.33984375, "logits/rejected": 2.658203125, "logps/chosen": -681.5, "logps/rejected": -725.5, "loss": 0.6794, "rewards/accuracies": 0.71875, "rewards/chosen": 0.687744140625, "rewards/margins": 3.72607421875, "rewards/rejected": -3.0458984375, "step": 2770 }, { "epoch": 0.5235958240823846, "grad_norm": 2.3053150139395915, "learning_rate": 5.919379852363074e-07, "logits/chosen": 2.607421875, "logits/rejected": 1.994140625, "logps/chosen": -848.0, "logps/rejected": -731.0, "loss": 0.5809, "rewards/accuracies": 0.90625, "rewards/chosen": 1.11181640625, "rewards/margins": 3.66796875, "rewards/rejected": -2.5517578125, "step": 2771 }, { "epoch": 0.5237847796305919, "grad_norm": 2.149250595373384, "learning_rate": 5.916424556745805e-07, "logits/chosen": 1.8525390625, "logits/rejected": 1.34423828125, "logps/chosen": -974.0, "logps/rejected": -1184.0, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": 0.90966796875, "rewards/margins": 4.58984375, "rewards/rejected": -3.67578125, "step": 2772 }, { "epoch": 0.5239737351787992, "grad_norm": 2.0093507943445075, "learning_rate": 5.913469079963284e-07, "logits/chosen": 2.0078125, "logits/rejected": 1.408203125, "logps/chosen": -628.0, "logps/rejected": -519.5, "loss": 0.6752, "rewards/accuracies": 0.78125, "rewards/chosen": 0.765625, "rewards/margins": 2.93359375, "rewards/rejected": -2.1640625, "step": 2773 }, { "epoch": 0.5241626907270065, "grad_norm": 1.677791962100507, "learning_rate": 5.910513423301291e-07, "logits/chosen": 1.78125, "logits/rejected": 1.2646484375, "logps/chosen": -839.0, "logps/rejected": -1087.0, "loss": 0.4672, "rewards/accuracies": 0.875, "rewards/chosen": 1.6123046875, "rewards/margins": 6.78125, "rewards/rejected": -5.16015625, "step": 2774 }, { "epoch": 0.5243516462752138, "grad_norm": 2.376778106839017, "learning_rate": 5.907557588045683e-07, "logits/chosen": 2.06640625, "logits/rejected": 1.7685546875, "logps/chosen": -823.0, "logps/rejected": -800.0, "loss": 0.5481, "rewards/accuracies": 0.875, "rewards/chosen": 1.021484375, "rewards/margins": 4.55859375, "rewards/rejected": -3.5390625, "step": 2775 }, { "epoch": 0.524540601823421, "grad_norm": 1.8511231943251523, "learning_rate": 5.904601575482394e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.736328125, "logps/chosen": -925.5, "logps/rejected": -790.5, "loss": 0.6137, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2265625, "rewards/margins": 4.91015625, "rewards/rejected": -3.6796875, "step": 2776 }, { "epoch": 0.5247295573716283, "grad_norm": 1.8477622815345651, "learning_rate": 5.901645386897435e-07, "logits/chosen": 2.65625, "logits/rejected": 2.302734375, "logps/chosen": -833.0, "logps/rejected": -695.5, "loss": 0.5265, "rewards/accuracies": 0.875, "rewards/chosen": 0.89453125, "rewards/margins": 4.71875, "rewards/rejected": -3.82421875, "step": 2777 }, { "epoch": 0.5249185129198356, "grad_norm": 1.7590714345217324, "learning_rate": 5.898689023576895e-07, "logits/chosen": 2.9921875, "logits/rejected": 3.2109375, "logps/chosen": -602.0, "logps/rejected": -760.0, "loss": 0.7662, "rewards/accuracies": 0.65625, "rewards/chosen": 0.662109375, "rewards/margins": 2.93359375, "rewards/rejected": -2.267578125, "step": 2778 }, { "epoch": 0.5251074684680429, "grad_norm": 2.0984112855272183, "learning_rate": 5.895732486806937e-07, "logits/chosen": 2.3125, "logits/rejected": 2.0703125, "logps/chosen": -951.0, "logps/rejected": -822.0, "loss": 0.569, "rewards/accuracies": 0.8125, "rewards/chosen": 1.584228515625, "rewards/margins": 4.09765625, "rewards/rejected": -2.51171875, "step": 2779 }, { "epoch": 0.5252964240162502, "grad_norm": 1.6263403284170885, "learning_rate": 5.892775777873801e-07, "logits/chosen": 2.8095703125, "logits/rejected": 2.166015625, "logps/chosen": -483.0, "logps/rejected": -562.5, "loss": 0.6495, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0029296875, "rewards/margins": 3.72265625, "rewards/rejected": -2.71484375, "step": 2780 }, { "epoch": 0.5254853795644575, "grad_norm": 6.775470424194362, "learning_rate": 5.8898188980638e-07, "logits/chosen": 3.14453125, "logits/rejected": 3.298828125, "logps/chosen": -998.5, "logps/rejected": -1803.0, "loss": 0.5913, "rewards/accuracies": 0.8125, "rewards/chosen": 1.100341796875, "rewards/margins": 4.015625, "rewards/rejected": -2.912109375, "step": 2781 }, { "epoch": 0.5256743351126647, "grad_norm": 1.448054207188766, "learning_rate": 5.886861848663326e-07, "logits/chosen": 2.65625, "logits/rejected": 2.3359375, "logps/chosen": -920.0, "logps/rejected": -941.0, "loss": 0.5503, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0732421875, "rewards/margins": 4.71875, "rewards/rejected": -3.6337890625, "step": 2782 }, { "epoch": 0.525863290660872, "grad_norm": 1.4183378153233108, "learning_rate": 5.883904630958839e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.18359375, "logps/chosen": -798.0, "logps/rejected": -893.0, "loss": 0.505, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4443359375, "rewards/margins": 4.6953125, "rewards/rejected": -3.25, "step": 2783 }, { "epoch": 0.5260522462090793, "grad_norm": 3.6079111226512373, "learning_rate": 5.880947246236877e-07, "logits/chosen": 3.08984375, "logits/rejected": 3.15625, "logps/chosen": -1063.0, "logps/rejected": -1474.0, "loss": 0.5736, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2979736328125, "rewards/margins": 4.708984375, "rewards/rejected": -3.404296875, "step": 2784 }, { "epoch": 0.5262412017572866, "grad_norm": 2.2134496309425287, "learning_rate": 5.877989695784045e-07, "logits/chosen": 2.826171875, "logits/rejected": 2.427734375, "logps/chosen": -1167.0, "logps/rejected": -1016.0, "loss": 0.5065, "rewards/accuracies": 0.90625, "rewards/chosen": 1.59375, "rewards/margins": 4.56640625, "rewards/rejected": -2.9765625, "step": 2785 }, { "epoch": 0.5264301573054939, "grad_norm": 1.8258539711876045, "learning_rate": 5.875031980887026e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.99609375, "logps/chosen": -533.5, "logps/rejected": -898.5, "loss": 0.6969, "rewards/accuracies": 0.8125, "rewards/chosen": 0.349609375, "rewards/margins": 3.84375, "rewards/rejected": -3.4931640625, "step": 2786 }, { "epoch": 0.5266191128537012, "grad_norm": 1.8670082580921465, "learning_rate": 5.872074102832572e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.650390625, "logps/chosen": -577.0, "logps/rejected": -846.0, "loss": 0.7051, "rewards/accuracies": 0.75, "rewards/chosen": -0.006103515625, "rewards/margins": 4.66015625, "rewards/rejected": -4.65087890625, "step": 2787 }, { "epoch": 0.5268080684019084, "grad_norm": 2.0993736598941277, "learning_rate": 5.869116062907505e-07, "logits/chosen": 3.662109375, "logits/rejected": 3.466796875, "logps/chosen": -723.5, "logps/rejected": -511.0, "loss": 0.67, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5634765625, "rewards/margins": 3.681640625, "rewards/rejected": -3.11328125, "step": 2788 }, { "epoch": 0.5269970239501157, "grad_norm": 1.6194061787196903, "learning_rate": 5.86615786239872e-07, "logits/chosen": 3.140625, "logits/rejected": 2.818359375, "logps/chosen": -1406.0, "logps/rejected": -1437.0, "loss": 0.5223, "rewards/accuracies": 0.875, "rewards/chosen": 1.96319580078125, "rewards/margins": 7.375, "rewards/rejected": -5.40625, "step": 2789 }, { "epoch": 0.527185979498323, "grad_norm": 1.8915171964955273, "learning_rate": 5.863199502593178e-07, "logits/chosen": 2.8203125, "logits/rejected": 2.20703125, "logps/chosen": -729.5, "logps/rejected": -770.0, "loss": 0.5568, "rewards/accuracies": 0.8125, "rewards/chosen": 0.189605712890625, "rewards/margins": 4.4765625, "rewards/rejected": -4.28125, "step": 2790 }, { "epoch": 0.5273749350465303, "grad_norm": 3.053955524268058, "learning_rate": 5.860240984777914e-07, "logits/chosen": 2.046875, "logits/rejected": 1.46484375, "logps/chosen": -824.0, "logps/rejected": -786.0, "loss": 0.5629, "rewards/accuracies": 0.84375, "rewards/chosen": 0.869140625, "rewards/margins": 3.99609375, "rewards/rejected": -3.125, "step": 2791 }, { "epoch": 0.5275638905947376, "grad_norm": 1.95890432035473, "learning_rate": 5.857282310240028e-07, "logits/chosen": 2.421875, "logits/rejected": 2.6796875, "logps/chosen": -1015.0, "logps/rejected": -1109.0, "loss": 0.6305, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1932373046875, "rewards/margins": 4.90234375, "rewards/rejected": -3.716796875, "step": 2792 }, { "epoch": 0.5277528461429448, "grad_norm": 2.2849535639836183, "learning_rate": 5.854323480266691e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.328125, "logps/chosen": -890.0, "logps/rejected": -838.5, "loss": 0.5722, "rewards/accuracies": 0.8125, "rewards/chosen": 0.62060546875, "rewards/margins": 3.763671875, "rewards/rejected": -3.14111328125, "step": 2793 }, { "epoch": 0.5279418016911521, "grad_norm": 2.9937125716393638, "learning_rate": 5.851364496145137e-07, "logits/chosen": 1.7998046875, "logits/rejected": 1.58740234375, "logps/chosen": -911.0, "logps/rejected": -1088.0, "loss": 0.6612, "rewards/accuracies": 0.75, "rewards/chosen": 0.490234375, "rewards/margins": 4.11328125, "rewards/rejected": -3.62109375, "step": 2794 }, { "epoch": 0.5281307572393594, "grad_norm": 1.5635035603934735, "learning_rate": 5.848405359162673e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.44921875, "logps/chosen": -1321.0, "logps/rejected": -2213.0, "loss": 0.4443, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1669921875, "rewards/margins": 8.81640625, "rewards/rejected": -7.671875, "step": 2795 }, { "epoch": 0.5283197127875667, "grad_norm": 1.9521901930936836, "learning_rate": 5.84544607060667e-07, "logits/chosen": 1.60205078125, "logits/rejected": 1.45849609375, "logps/chosen": -487.5, "logps/rejected": -661.0, "loss": 0.5729, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2802734375, "rewards/margins": 4.484375, "rewards/rejected": -4.1953125, "step": 2796 }, { "epoch": 0.528508668335774, "grad_norm": 3.1619516166089356, "learning_rate": 5.842486631764564e-07, "logits/chosen": 2.373046875, "logits/rejected": 2.1484375, "logps/chosen": -954.25, "logps/rejected": -929.0, "loss": 0.6968, "rewards/accuracies": 0.8125, "rewards/chosen": 0.56378173828125, "rewards/margins": 4.19921875, "rewards/rejected": -3.62890625, "step": 2797 }, { "epoch": 0.5286976238839813, "grad_norm": 5.907655885814681, "learning_rate": 5.839527043923856e-07, "logits/chosen": 3.35546875, "logits/rejected": 3.12890625, "logps/chosen": -441.5, "logps/rejected": -650.5, "loss": 0.6779, "rewards/accuracies": 0.75, "rewards/chosen": 0.53125, "rewards/margins": 3.5927734375, "rewards/rejected": -3.0546875, "step": 2798 }, { "epoch": 0.5288865794321885, "grad_norm": 1.8328676603450031, "learning_rate": 5.836567308372115e-07, "logits/chosen": 3.44140625, "logits/rejected": 3.52734375, "logps/chosen": -916.0, "logps/rejected": -921.0, "loss": 0.6301, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1923828125, "rewards/margins": 4.5625, "rewards/rejected": -3.3798828125, "step": 2799 }, { "epoch": 0.5290755349803958, "grad_norm": 1.7251990032531386, "learning_rate": 5.83360742639697e-07, "logits/chosen": 2.49609375, "logits/rejected": 2.6484375, "logps/chosen": -772.0, "logps/rejected": -622.0, "loss": 0.5367, "rewards/accuracies": 0.8125, "rewards/chosen": 0.60595703125, "rewards/margins": 5.21484375, "rewards/rejected": -4.61328125, "step": 2800 }, { "epoch": 0.5292644905286031, "grad_norm": 2.8209942362279294, "learning_rate": 5.830647399286116e-07, "logits/chosen": 2.9873046875, "logits/rejected": 2.4921875, "logps/chosen": -917.5, "logps/rejected": -966.0, "loss": 0.5799, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9990234375, "rewards/margins": 4.705078125, "rewards/rejected": -3.703125, "step": 2801 }, { "epoch": 0.5294534460768104, "grad_norm": 3.5267618090921182, "learning_rate": 5.827687228327312e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.302734375, "logps/chosen": -1175.0, "logps/rejected": -1120.0, "loss": 0.5707, "rewards/accuracies": 0.8125, "rewards/chosen": 1.63623046875, "rewards/margins": 7.609375, "rewards/rejected": -5.96875, "step": 2802 }, { "epoch": 0.5296424016250177, "grad_norm": 3.2978507958431518, "learning_rate": 5.824726914808379e-07, "logits/chosen": 1.771484375, "logits/rejected": 1.685546875, "logps/chosen": -756.0, "logps/rejected": -712.5, "loss": 0.5783, "rewards/accuracies": 0.875, "rewards/chosen": -0.444091796875, "rewards/margins": 5.2421875, "rewards/rejected": -5.6875, "step": 2803 }, { "epoch": 0.529831357173225, "grad_norm": 2.335839257170664, "learning_rate": 5.8217664600172e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.3173828125, "logps/chosen": -653.0, "logps/rejected": -1089.0, "loss": 0.6095, "rewards/accuracies": 0.71875, "rewards/chosen": 0.39947509765625, "rewards/margins": 4.525390625, "rewards/rejected": -4.12109375, "step": 2804 }, { "epoch": 0.5300203127214322, "grad_norm": 3.0640993261822267, "learning_rate": 5.818805865241716e-07, "logits/chosen": 2.5263671875, "logits/rejected": 2.3255615234375, "logps/chosen": -458.0, "logps/rejected": -585.0, "loss": 0.5695, "rewards/accuracies": 0.875, "rewards/chosen": -0.06396484375, "rewards/margins": 4.2578125, "rewards/rejected": -4.328125, "step": 2805 }, { "epoch": 0.5302092682696395, "grad_norm": 1.7864858958782703, "learning_rate": 5.815845131769936e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.3623046875, "logps/chosen": -990.0, "logps/rejected": -941.0, "loss": 0.6979, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3984375, "rewards/margins": 3.95703125, "rewards/rejected": -3.5546875, "step": 2806 }, { "epoch": 0.5303982238178468, "grad_norm": 2.468305128336835, "learning_rate": 5.812884260889922e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.0859375, "logps/chosen": -769.5, "logps/rejected": -880.0, "loss": 0.488, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4833984375, "rewards/margins": 5.90234375, "rewards/rejected": -5.4296875, "step": 2807 }, { "epoch": 0.5305871793660542, "grad_norm": 1.7677185190775977, "learning_rate": 5.8099232538898e-07, "logits/chosen": 2.4609375, "logits/rejected": 2.482421875, "logps/chosen": -580.0, "logps/rejected": -784.0, "loss": 0.6397, "rewards/accuracies": 0.78125, "rewards/chosen": 0.69287109375, "rewards/margins": 4.39453125, "rewards/rejected": -3.70703125, "step": 2808 }, { "epoch": 0.5307761349142615, "grad_norm": 2.239185278863328, "learning_rate": 5.806962112057755e-07, "logits/chosen": 3.4873046875, "logits/rejected": 3.423828125, "logps/chosen": -536.25, "logps/rejected": -764.0, "loss": 0.5881, "rewards/accuracies": 0.875, "rewards/chosen": -0.4140625, "rewards/margins": 4.6640625, "rewards/rejected": -5.078125, "step": 2809 }, { "epoch": 0.5309650904624688, "grad_norm": 4.195460570974121, "learning_rate": 5.804000836682031e-07, "logits/chosen": 1.191162109375, "logits/rejected": 2.20703125, "logps/chosen": -885.0, "logps/rejected": -1007.5, "loss": 0.6906, "rewards/accuracies": 0.71875, "rewards/chosen": 0.59765625, "rewards/margins": 3.4375, "rewards/rejected": -2.837890625, "step": 2810 }, { "epoch": 0.531154046010676, "grad_norm": 2.8685739792488625, "learning_rate": 5.801039429050927e-07, "logits/chosen": 2.1796875, "logits/rejected": 2.328125, "logps/chosen": -912.0, "logps/rejected": -964.0, "loss": 0.5325, "rewards/accuracies": 0.875, "rewards/chosen": 0.28662109375, "rewards/margins": 5.14453125, "rewards/rejected": -4.85546875, "step": 2811 }, { "epoch": 0.5313430015588833, "grad_norm": 4.100198503364445, "learning_rate": 5.798077890452801e-07, "logits/chosen": 2.751953125, "logits/rejected": 3.33984375, "logps/chosen": -706.5, "logps/rejected": -1318.0, "loss": 0.6192, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8477783203125, "rewards/margins": 7.34375, "rewards/rejected": -6.5, "step": 2812 }, { "epoch": 0.5315319571070906, "grad_norm": 3.6711003308520636, "learning_rate": 5.795116222176069e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.138671875, "logps/chosen": -586.0, "logps/rejected": -626.5, "loss": 0.5239, "rewards/accuracies": 0.875, "rewards/chosen": 1.07275390625, "rewards/margins": 4.890625, "rewards/rejected": -3.81640625, "step": 2813 }, { "epoch": 0.5317209126552979, "grad_norm": 1.750216465281343, "learning_rate": 5.792154425509207e-07, "logits/chosen": 2.58203125, "logits/rejected": 2.34375, "logps/chosen": -996.5, "logps/rejected": -1572.5, "loss": 0.5301, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9873046875, "rewards/margins": 6.1015625, "rewards/rejected": -5.1171875, "step": 2814 }, { "epoch": 0.5319098682035052, "grad_norm": 2.504675463737584, "learning_rate": 5.789192501740739e-07, "logits/chosen": 2.3857421875, "logits/rejected": 2.611328125, "logps/chosen": -1162.0, "logps/rejected": -985.0, "loss": 0.5678, "rewards/accuracies": 0.84375, "rewards/chosen": 1.484375, "rewards/margins": 5.19921875, "rewards/rejected": -3.720703125, "step": 2815 }, { "epoch": 0.5320988237517124, "grad_norm": 2.0388498092780707, "learning_rate": 5.786230452159245e-07, "logits/chosen": 2.962890625, "logits/rejected": 2.474609375, "logps/chosen": -671.0, "logps/rejected": -597.0, "loss": 0.6678, "rewards/accuracies": 0.78125, "rewards/chosen": -0.314208984375, "rewards/margins": 3.5146484375, "rewards/rejected": -3.828125, "step": 2816 }, { "epoch": 0.5322877792999197, "grad_norm": 2.087544269848114, "learning_rate": 5.783268278053371e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.94140625, "logps/chosen": -1035.5, "logps/rejected": -1189.0, "loss": 0.5183, "rewards/accuracies": 0.8125, "rewards/chosen": 1.31591796875, "rewards/margins": 5.6796875, "rewards/rejected": -4.3515625, "step": 2817 }, { "epoch": 0.532476734848127, "grad_norm": 2.0405281998700593, "learning_rate": 5.780305980711804e-07, "logits/chosen": 3.099609375, "logits/rejected": 3.044921875, "logps/chosen": -836.75, "logps/rejected": -780.5, "loss": 0.703, "rewards/accuracies": 0.75, "rewards/chosen": 0.8310546875, "rewards/margins": 3.9755859375, "rewards/rejected": -3.146484375, "step": 2818 }, { "epoch": 0.5326656903963343, "grad_norm": 4.62888354151693, "learning_rate": 5.77734356142329e-07, "logits/chosen": 2.953125, "logits/rejected": 2.85546875, "logps/chosen": -1082.0, "logps/rejected": -1495.0, "loss": 0.584, "rewards/accuracies": 0.75, "rewards/chosen": 1.3447265625, "rewards/margins": 6.15234375, "rewards/rejected": -4.818359375, "step": 2819 }, { "epoch": 0.5328546459445416, "grad_norm": 1.6873448056911633, "learning_rate": 5.774381021476626e-07, "logits/chosen": 2.94140625, "logits/rejected": 3.19140625, "logps/chosen": -16045.0, "logps/rejected": -903.5, "loss": 0.6227, "rewards/accuracies": 0.8125, "rewards/chosen": 71.00390625, "rewards/margins": 74.056640625, "rewards/rejected": -3.046875, "step": 2820 }, { "epoch": 0.5330436014927489, "grad_norm": 4.180011914129447, "learning_rate": 5.771418362160668e-07, "logits/chosen": 2.646484375, "logits/rejected": 1.9052734375, "logps/chosen": -869.0, "logps/rejected": -607.0, "loss": 0.618, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6025390625, "rewards/margins": 3.099609375, "rewards/rejected": -2.50390625, "step": 2821 }, { "epoch": 0.5332325570409561, "grad_norm": 2.631829227876642, "learning_rate": 5.768455584764315e-07, "logits/chosen": 2.37109375, "logits/rejected": 1.71630859375, "logps/chosen": -974.0, "logps/rejected": -1118.0, "loss": 0.543, "rewards/accuracies": 0.875, "rewards/chosen": 0.1064453125, "rewards/margins": 6.169921875, "rewards/rejected": -6.07421875, "step": 2822 }, { "epoch": 0.5334215125891634, "grad_norm": 2.914032168171479, "learning_rate": 5.765492690576522e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.65234375, "logps/chosen": -867.0, "logps/rejected": -1014.0, "loss": 0.7586, "rewards/accuracies": 0.625, "rewards/chosen": 0.19091796875, "rewards/margins": 3.017578125, "rewards/rejected": -2.830078125, "step": 2823 }, { "epoch": 0.5336104681373707, "grad_norm": 1.7454334058409775, "learning_rate": 5.762529680886298e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.171875, "logps/chosen": -1237.0, "logps/rejected": -2033.0, "loss": 0.4666, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0029296875, "rewards/margins": 7.625, "rewards/rejected": -6.625, "step": 2824 }, { "epoch": 0.533799423685578, "grad_norm": 3.1545632771361705, "learning_rate": 5.759566556982693e-07, "logits/chosen": 1.720703125, "logits/rejected": 2.0118408203125, "logps/chosen": -791.0, "logps/rejected": -1516.0, "loss": 0.5622, "rewards/accuracies": 0.875, "rewards/chosen": 1.19921875, "rewards/margins": 5.58984375, "rewards/rejected": -4.38671875, "step": 2825 }, { "epoch": 0.5339883792337853, "grad_norm": 4.3870322926907885, "learning_rate": 5.756603320154816e-07, "logits/chosen": 2.478515625, "logits/rejected": 2.2939453125, "logps/chosen": -856.5, "logps/rejected": -15998.0, "loss": 0.7014, "rewards/accuracies": 0.71875, "rewards/chosen": 0.34375, "rewards/margins": -72.9375, "rewards/rejected": 73.08984375, "step": 2826 }, { "epoch": 0.5341773347819926, "grad_norm": 1.425932960602487, "learning_rate": 5.753639971691822e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.54296875, "logps/chosen": -578.0, "logps/rejected": -869.0, "loss": 0.6482, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8369140625, "rewards/margins": 3.888671875, "rewards/rejected": -3.05859375, "step": 2827 }, { "epoch": 0.5343662903301998, "grad_norm": 1.9592473607494603, "learning_rate": 5.750676512882911e-07, "logits/chosen": 2.71875, "logits/rejected": 1.923583984375, "logps/chosen": -803.5, "logps/rejected": -545.0, "loss": 0.5635, "rewards/accuracies": 0.875, "rewards/chosen": 1.140625, "rewards/margins": 3.8359375, "rewards/rejected": -2.6953125, "step": 2828 }, { "epoch": 0.5345552458784071, "grad_norm": 1.590321927008722, "learning_rate": 5.747712945017334e-07, "logits/chosen": 2.46484375, "logits/rejected": 2.2265625, "logps/chosen": -1113.0, "logps/rejected": -1804.0, "loss": 0.4699, "rewards/accuracies": 0.875, "rewards/chosen": 0.66259765625, "rewards/margins": 6.6796875, "rewards/rejected": -6.0078125, "step": 2829 }, { "epoch": 0.5347442014266144, "grad_norm": 2.502976854307483, "learning_rate": 5.744749269384392e-07, "logits/chosen": 3.76171875, "logits/rejected": 3.62890625, "logps/chosen": -856.5, "logps/rejected": -1691.0, "loss": 0.5157, "rewards/accuracies": 0.96875, "rewards/chosen": 0.701171875, "rewards/margins": 6.9609375, "rewards/rejected": -6.265625, "step": 2830 }, { "epoch": 0.5349331569748217, "grad_norm": 3.28717178125548, "learning_rate": 5.741785487273429e-07, "logits/chosen": 2.7421875, "logits/rejected": 2.78515625, "logps/chosen": -542.5, "logps/rejected": -933.0, "loss": 0.6817, "rewards/accuracies": 0.75, "rewards/chosen": 0.672119140625, "rewards/margins": 5.3203125, "rewards/rejected": -4.65234375, "step": 2831 }, { "epoch": 0.535122112523029, "grad_norm": 2.273081197598247, "learning_rate": 5.738821599973837e-07, "logits/chosen": 2.23046875, "logits/rejected": 1.904296875, "logps/chosen": -778.5, "logps/rejected": -2209.0, "loss": 0.5263, "rewards/accuracies": 0.9375, "rewards/chosen": 0.794921875, "rewards/margins": 8.953125, "rewards/rejected": -8.16015625, "step": 2832 }, { "epoch": 0.5353110680712363, "grad_norm": 2.2269071293569125, "learning_rate": 5.735857608775054e-07, "logits/chosen": 3.181640625, "logits/rejected": 2.59375, "logps/chosen": -837.0, "logps/rejected": -813.0, "loss": 0.5401, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1884765625, "rewards/margins": 5.2578125, "rewards/rejected": -5.078125, "step": 2833 }, { "epoch": 0.5355000236194435, "grad_norm": 2.88704317650529, "learning_rate": 5.73289351496656e-07, "logits/chosen": 2.369140625, "logits/rejected": 2.41796875, "logps/chosen": -808.5, "logps/rejected": -908.0, "loss": 0.5911, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6962890625, "rewards/margins": 3.646484375, "rewards/rejected": -2.951171875, "step": 2834 }, { "epoch": 0.5356889791676508, "grad_norm": 2.1175045458076474, "learning_rate": 5.729929319837885e-07, "logits/chosen": 2.630859375, "logits/rejected": 2.5, "logps/chosen": -1014.0, "logps/rejected": -734.0, "loss": 0.5766, "rewards/accuracies": 0.8125, "rewards/chosen": 0.42333984375, "rewards/margins": 4.58203125, "rewards/rejected": -4.1640625, "step": 2835 }, { "epoch": 0.5358779347158581, "grad_norm": 1.8146321132429657, "learning_rate": 5.726965024678599e-07, "logits/chosen": 3.70703125, "logits/rejected": 3.5, "logps/chosen": -1277.0, "logps/rejected": -998.0, "loss": 0.5046, "rewards/accuracies": 0.9375, "rewards/chosen": 2.123046875, "rewards/margins": 5.81640625, "rewards/rejected": -3.685546875, "step": 2836 }, { "epoch": 0.5360668902640654, "grad_norm": 1.6364180379203688, "learning_rate": 5.724000630778315e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.734375, "logps/chosen": -719.0, "logps/rejected": -835.5, "loss": 0.6064, "rewards/accuracies": 0.875, "rewards/chosen": 1.315673828125, "rewards/margins": 4.55859375, "rewards/rejected": -3.23828125, "step": 2837 }, { "epoch": 0.5362558458122727, "grad_norm": 1.7515533224027315, "learning_rate": 5.721036139426693e-07, "logits/chosen": 2.458984375, "logits/rejected": 2.01953125, "logps/chosen": -654.0, "logps/rejected": -700.5, "loss": 0.611, "rewards/accuracies": 0.8125, "rewards/chosen": 0.28125, "rewards/margins": 4.30078125, "rewards/rejected": -4.01171875, "step": 2838 }, { "epoch": 0.5364448013604799, "grad_norm": 2.833959316136032, "learning_rate": 5.718071551913434e-07, "logits/chosen": 2.765625, "logits/rejected": 2.5, "logps/chosen": -775.0, "logps/rejected": -1217.0, "loss": 0.7021, "rewards/accuracies": 0.78125, "rewards/chosen": 0.127197265625, "rewards/margins": 5.041015625, "rewards/rejected": -4.91015625, "step": 2839 }, { "epoch": 0.5366337569086872, "grad_norm": 1.2836169853928738, "learning_rate": 5.715106869528279e-07, "logits/chosen": 2.5703125, "logits/rejected": 2.376953125, "logps/chosen": -794.5, "logps/rejected": -941.0, "loss": 0.5182, "rewards/accuracies": 0.875, "rewards/chosen": -0.451171875, "rewards/margins": 6.19140625, "rewards/rejected": -6.66015625, "step": 2840 }, { "epoch": 0.5368227124568945, "grad_norm": 2.1692578706515575, "learning_rate": 5.71214209356101e-07, "logits/chosen": 3.34375, "logits/rejected": 2.65625, "logps/chosen": -1417.0, "logps/rejected": -1201.0, "loss": 0.5306, "rewards/accuracies": 0.78125, "rewards/chosen": 1.09033203125, "rewards/margins": 5.0703125, "rewards/rejected": -3.9765625, "step": 2841 }, { "epoch": 0.5370116680051018, "grad_norm": 1.708032875146135, "learning_rate": 5.709177225301452e-07, "logits/chosen": 1.7607421875, "logits/rejected": 1.4521484375, "logps/chosen": -698.5, "logps/rejected": -811.5, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": 0.790771484375, "rewards/margins": 4.53125, "rewards/rejected": -3.73046875, "step": 2842 }, { "epoch": 0.5372006235533091, "grad_norm": 5.18002878308053, "learning_rate": 5.70621226603947e-07, "logits/chosen": 3.166015625, "logits/rejected": 3.27734375, "logps/chosen": -576.0, "logps/rejected": -1709.0, "loss": 0.6007, "rewards/accuracies": 0.84375, "rewards/chosen": 0.29345703125, "rewards/margins": 7.328125, "rewards/rejected": -7.02734375, "step": 2843 }, { "epoch": 0.5373895791015164, "grad_norm": 1.5910821372488568, "learning_rate": 5.703247217064966e-07, "logits/chosen": 2.634765625, "logits/rejected": 2.583984375, "logps/chosen": -830.0, "logps/rejected": -1140.0, "loss": 0.4949, "rewards/accuracies": 0.875, "rewards/chosen": 1.2890625, "rewards/margins": 5.421875, "rewards/rejected": -4.12890625, "step": 2844 }, { "epoch": 0.5375785346497236, "grad_norm": 2.048692616859907, "learning_rate": 5.700282079667886e-07, "logits/chosen": 1.89453125, "logits/rejected": 2.0478515625, "logps/chosen": -905.0, "logps/rejected": -943.0, "loss": 0.5355, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2724609375, "rewards/margins": 4.546875, "rewards/rejected": -3.271484375, "step": 2845 }, { "epoch": 0.5377674901979309, "grad_norm": 2.1169266405597273, "learning_rate": 5.697316855138206e-07, "logits/chosen": 2.349609375, "logits/rejected": 2.546875, "logps/chosen": -736.0, "logps/rejected": -805.0, "loss": 0.5927, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15972900390625, "rewards/margins": 4.84375, "rewards/rejected": -4.6875, "step": 2846 }, { "epoch": 0.5379564457461382, "grad_norm": 2.528015629013745, "learning_rate": 5.69435154476595e-07, "logits/chosen": 3.09375, "logits/rejected": 2.978515625, "logps/chosen": -696.0, "logps/rejected": -687.5, "loss": 0.6139, "rewards/accuracies": 0.875, "rewards/chosen": 0.178955078125, "rewards/margins": 3.89453125, "rewards/rejected": -3.71875, "step": 2847 }, { "epoch": 0.5381454012943455, "grad_norm": 2.4197077958656235, "learning_rate": 5.69138614984117e-07, "logits/chosen": 3.24609375, "logits/rejected": 2.98046875, "logps/chosen": -848.0, "logps/rejected": -837.0, "loss": 0.5264, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4677734375, "rewards/margins": 4.6328125, "rewards/rejected": -4.1640625, "step": 2848 }, { "epoch": 0.5383343568425528, "grad_norm": 1.8714761381607008, "learning_rate": 5.688420671653965e-07, "logits/chosen": 1.771484375, "logits/rejected": 1.482421875, "logps/chosen": -906.0, "logps/rejected": -1101.0, "loss": 0.6878, "rewards/accuracies": 0.75, "rewards/chosen": 0.701171875, "rewards/margins": 3.431640625, "rewards/rejected": -2.724609375, "step": 2849 }, { "epoch": 0.5385233123907601, "grad_norm": 1.7791811452501354, "learning_rate": 5.685455111494459e-07, "logits/chosen": 2.888671875, "logits/rejected": 2.81640625, "logps/chosen": -748.0, "logps/rejected": -1728.5, "loss": 0.5954, "rewards/accuracies": 0.8125, "rewards/chosen": 0.96875, "rewards/margins": 4.96875, "rewards/rejected": -4.00390625, "step": 2850 }, { "epoch": 0.5387122679389673, "grad_norm": 2.290906344891834, "learning_rate": 5.68248947065282e-07, "logits/chosen": 2.765625, "logits/rejected": 2.51171875, "logps/chosen": -1258.0, "logps/rejected": -1006.0, "loss": 0.5167, "rewards/accuracies": 0.8125, "rewards/chosen": 1.70703125, "rewards/margins": 5.796875, "rewards/rejected": -4.09765625, "step": 2851 }, { "epoch": 0.5389012234871746, "grad_norm": 1.7926223123996228, "learning_rate": 5.679523750419247e-07, "logits/chosen": 2.64453125, "logits/rejected": 1.984375, "logps/chosen": -575.5, "logps/rejected": -543.0, "loss": 0.5985, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0877685546875, "rewards/margins": 3.642578125, "rewards/rejected": -3.7265625, "step": 2852 }, { "epoch": 0.5390901790353819, "grad_norm": 2.1412124373106782, "learning_rate": 5.676557952083977e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.845703125, "logps/chosen": -621.0, "logps/rejected": -1554.75, "loss": 0.5256, "rewards/accuracies": 0.875, "rewards/chosen": 1.1318359375, "rewards/margins": 5.5703125, "rewards/rejected": -4.42578125, "step": 2853 }, { "epoch": 0.5392791345835892, "grad_norm": 1.942561108891116, "learning_rate": 5.673592076937276e-07, "logits/chosen": 2.71875, "logits/rejected": 2.765625, "logps/chosen": -601.5, "logps/rejected": -874.0, "loss": 0.6437, "rewards/accuracies": 0.8125, "rewards/chosen": 0.58544921875, "rewards/margins": 3.7734375, "rewards/rejected": -3.181640625, "step": 2854 }, { "epoch": 0.5394680901317965, "grad_norm": 1.5447724757189245, "learning_rate": 5.670626126269449e-07, "logits/chosen": 2.708984375, "logits/rejected": 1.7100830078125, "logps/chosen": -1251.0, "logps/rejected": -884.0, "loss": 0.5184, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2890625, "rewards/margins": 4.734375, "rewards/rejected": -5.015625, "step": 2855 }, { "epoch": 0.5396570456800038, "grad_norm": 2.291497156549203, "learning_rate": 5.667660101370827e-07, "logits/chosen": 3.015625, "logits/rejected": 2.48046875, "logps/chosen": -868.0, "logps/rejected": -837.5, "loss": 0.4851, "rewards/accuracies": 0.90625, "rewards/chosen": 1.35546875, "rewards/margins": 5.375, "rewards/rejected": -4.01953125, "step": 2856 }, { "epoch": 0.539846001228211, "grad_norm": 1.8752322365306187, "learning_rate": 5.664694003531782e-07, "logits/chosen": 2.775390625, "logits/rejected": 2.638671875, "logps/chosen": -612.5, "logps/rejected": -846.0, "loss": 0.6252, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4921875, "rewards/margins": 4.77734375, "rewards/rejected": -4.27734375, "step": 2857 }, { "epoch": 0.5400349567764183, "grad_norm": 2.338486131490618, "learning_rate": 5.661727834042709e-07, "logits/chosen": 1.970703125, "logits/rejected": 1.98046875, "logps/chosen": -722.5, "logps/rejected": -818.0, "loss": 0.5749, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1396484375, "rewards/margins": 5.0234375, "rewards/rejected": -4.8828125, "step": 2858 }, { "epoch": 0.5402239123246256, "grad_norm": 2.3303828257792945, "learning_rate": 5.65876159419404e-07, "logits/chosen": 2.45703125, "logits/rejected": 2.244140625, "logps/chosen": -1309.0, "logps/rejected": -2526.0, "loss": 0.6096, "rewards/accuracies": 0.875, "rewards/chosen": -0.329345703125, "rewards/margins": 7.197265625, "rewards/rejected": -7.5234375, "step": 2859 }, { "epoch": 0.5404128678728329, "grad_norm": 4.207155024384298, "learning_rate": 5.655795285276237e-07, "logits/chosen": 2.71875, "logits/rejected": 2.48828125, "logps/chosen": -1708.0, "logps/rejected": -1570.0, "loss": 0.6283, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4453125, "rewards/margins": 4.2421875, "rewards/rejected": -4.6875, "step": 2860 }, { "epoch": 0.5406018234210402, "grad_norm": 4.225732934843683, "learning_rate": 5.652828908579789e-07, "logits/chosen": 2.17578125, "logits/rejected": 2.03515625, "logps/chosen": -955.0, "logps/rejected": -1906.0, "loss": 0.5635, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6201171875, "rewards/margins": 6.7421875, "rewards/rejected": -6.1015625, "step": 2861 }, { "epoch": 0.5407907789692474, "grad_norm": 2.0659770813745775, "learning_rate": 5.649862465395218e-07, "logits/chosen": 2.16015625, "logits/rejected": 1.859375, "logps/chosen": -840.5, "logps/rejected": -781.5, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": -0.505859375, "rewards/margins": 2.53515625, "rewards/rejected": -3.046875, "step": 2862 }, { "epoch": 0.5409797345174547, "grad_norm": 2.410207009752726, "learning_rate": 5.646895957013073e-07, "logits/chosen": 2.0068359375, "logits/rejected": 1.706787109375, "logps/chosen": -871.0, "logps/rejected": -1054.0, "loss": 0.458, "rewards/accuracies": 0.875, "rewards/chosen": 0.98681640625, "rewards/margins": 6.37109375, "rewards/rejected": -5.375, "step": 2863 }, { "epoch": 0.541168690065662, "grad_norm": 1.7133623125200517, "learning_rate": 5.64392938472393e-07, "logits/chosen": 2.74609375, "logits/rejected": 2.35546875, "logps/chosen": -737.0, "logps/rejected": -745.0, "loss": 0.6583, "rewards/accuracies": 0.75, "rewards/chosen": 0.39892578125, "rewards/margins": 4.0078125, "rewards/rejected": -3.609375, "step": 2864 }, { "epoch": 0.5413576456138693, "grad_norm": 1.8804550627567997, "learning_rate": 5.640962749818395e-07, "logits/chosen": 2.498046875, "logits/rejected": 1.892578125, "logps/chosen": -890.0, "logps/rejected": -872.0, "loss": 0.4962, "rewards/accuracies": 0.875, "rewards/chosen": 0.8701171875, "rewards/margins": 5.48828125, "rewards/rejected": -4.62109375, "step": 2865 }, { "epoch": 0.5415466011620766, "grad_norm": 2.45028641601544, "learning_rate": 5.637996053587101e-07, "logits/chosen": 3.177734375, "logits/rejected": 3.291015625, "logps/chosen": -697.0, "logps/rejected": -16768.0, "loss": 0.7507, "rewards/accuracies": 0.6875, "rewards/chosen": -0.382080078125, "rewards/margins": 101.41796875, "rewards/rejected": -101.84765625, "step": 2866 }, { "epoch": 0.541735556710284, "grad_norm": 1.8589590099494235, "learning_rate": 5.635029297320709e-07, "logits/chosen": 2.796875, "logits/rejected": 2.341796875, "logps/chosen": -942.5, "logps/rejected": -1121.0, "loss": 0.5152, "rewards/accuracies": 0.875, "rewards/chosen": 0.470703125, "rewards/margins": 6.296875, "rewards/rejected": -5.8203125, "step": 2867 }, { "epoch": 0.5419245122584911, "grad_norm": 2.8082134143687987, "learning_rate": 5.632062482309904e-07, "logits/chosen": 2.78125, "logits/rejected": 2.755859375, "logps/chosen": -884.5, "logps/rejected": -891.0, "loss": 0.5819, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2705078125, "rewards/margins": 5.08203125, "rewards/rejected": -5.3515625, "step": 2868 }, { "epoch": 0.5421134678066984, "grad_norm": 3.8361524523297748, "learning_rate": 5.629095609845394e-07, "logits/chosen": 2.099609375, "logits/rejected": 1.873046875, "logps/chosen": -6819.0, "logps/rejected": -1708.0, "loss": 0.5187, "rewards/accuracies": 0.90625, "rewards/chosen": -4.08203125, "rewards/margins": 3.40625, "rewards/rejected": -7.484375, "step": 2869 }, { "epoch": 0.5423024233549057, "grad_norm": 2.1316234490730404, "learning_rate": 5.62612868121792e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.50927734375, "logps/chosen": -924.5, "logps/rejected": -908.0, "loss": 0.5212, "rewards/accuracies": 0.84375, "rewards/chosen": 0.528076171875, "rewards/margins": 5.31640625, "rewards/rejected": -4.79296875, "step": 2870 }, { "epoch": 0.542491378903113, "grad_norm": 1.9475240382456995, "learning_rate": 5.623161697718237e-07, "logits/chosen": 2.78125, "logits/rejected": 3.1796875, "logps/chosen": -1148.0, "logps/rejected": -1246.0, "loss": 0.5584, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6328125, "rewards/margins": 6.3046875, "rewards/rejected": -5.671875, "step": 2871 }, { "epoch": 0.5426803344513204, "grad_norm": 3.6959922933174525, "learning_rate": 5.620194660637136e-07, "logits/chosen": 2.86328125, "logits/rejected": 3.16796875, "logps/chosen": -459.5, "logps/rejected": -1170.0, "loss": 0.5542, "rewards/accuracies": 0.90625, "rewards/chosen": -0.34814453125, "rewards/margins": 5.94140625, "rewards/rejected": -6.29296875, "step": 2872 }, { "epoch": 0.5428692899995277, "grad_norm": 1.8900587764283094, "learning_rate": 5.617227571265418e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.50390625, "logps/chosen": -749.5, "logps/rejected": -763.0, "loss": 0.6476, "rewards/accuracies": 0.71875, "rewards/chosen": 0.283203125, "rewards/margins": 4.46875, "rewards/rejected": -4.18359375, "step": 2873 }, { "epoch": 0.5430582455477349, "grad_norm": 3.5439949587217803, "learning_rate": 5.614260430893916e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.498046875, "logps/chosen": -753.0, "logps/rejected": -792.0, "loss": 0.6475, "rewards/accuracies": 0.875, "rewards/chosen": 0.30859375, "rewards/margins": 4.671173095703125, "rewards/rejected": -4.36328125, "step": 2874 }, { "epoch": 0.5432472010959422, "grad_norm": 2.2514003362213475, "learning_rate": 5.611293240813484e-07, "logits/chosen": 2.25390625, "logits/rejected": 2.18115234375, "logps/chosen": -841.0, "logps/rejected": -803.0, "loss": 0.5791, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0623779296875, "rewards/margins": 4.62841796875, "rewards/rejected": -4.6875, "step": 2875 }, { "epoch": 0.5434361566441495, "grad_norm": 1.9238512476545986, "learning_rate": 5.608326002314996e-07, "logits/chosen": 2.505859375, "logits/rejected": 2.134765625, "logps/chosen": -1046.0, "logps/rejected": -946.5, "loss": 0.5069, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7890625, "rewards/margins": 5.06640625, "rewards/rejected": -5.859375, "step": 2876 }, { "epoch": 0.5436251121923568, "grad_norm": 2.3006600972687594, "learning_rate": 5.605358716689345e-07, "logits/chosen": 2.876953125, "logits/rejected": 2.806640625, "logps/chosen": -861.0, "logps/rejected": -844.0, "loss": 0.4674, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6134033203125, "rewards/margins": 6.3203125, "rewards/rejected": -5.71875, "step": 2877 }, { "epoch": 0.5438140677405641, "grad_norm": 1.9181236467863414, "learning_rate": 5.602391385227448e-07, "logits/chosen": 2.41796875, "logits/rejected": 2.279541015625, "logps/chosen": -944.0, "logps/rejected": -1817.0, "loss": 0.5113, "rewards/accuracies": 0.90625, "rewards/chosen": 0.777130126953125, "rewards/margins": 7.8828125, "rewards/rejected": -7.0859375, "step": 2878 }, { "epoch": 0.5440030232887714, "grad_norm": 2.8152729570160644, "learning_rate": 5.599424009220243e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.953125, "logps/chosen": -673.0, "logps/rejected": -754.0, "loss": 0.5507, "rewards/accuracies": 0.875, "rewards/chosen": 0.9453125, "rewards/margins": 5.26171875, "rewards/rejected": -4.30859375, "step": 2879 }, { "epoch": 0.5441919788369786, "grad_norm": 1.6381063415013126, "learning_rate": 5.596456589958682e-07, "logits/chosen": 1.99609375, "logits/rejected": 1.51171875, "logps/chosen": -961.0, "logps/rejected": -18320.0, "loss": 0.4827, "rewards/accuracies": 0.84375, "rewards/chosen": 0.995849609375, "rewards/margins": 36.8203125, "rewards/rejected": -36.05859375, "step": 2880 }, { "epoch": 0.5443809343851859, "grad_norm": 2.118413589153439, "learning_rate": 5.593489128733739e-07, "logits/chosen": 2.544921875, "logits/rejected": 3.0234375, "logps/chosen": -1048.0, "logps/rejected": -1207.0, "loss": 0.5944, "rewards/accuracies": 0.75, "rewards/chosen": 1.18017578125, "rewards/margins": 4.578125, "rewards/rejected": -3.40625, "step": 2881 }, { "epoch": 0.5445698899333932, "grad_norm": 2.047349479608891, "learning_rate": 5.590521626836409e-07, "logits/chosen": 2.01953125, "logits/rejected": 2.1884765625, "logps/chosen": -1154.0, "logps/rejected": -1321.0, "loss": 0.6401, "rewards/accuracies": 0.78125, "rewards/chosen": 0.73583984375, "rewards/margins": 4.48046875, "rewards/rejected": -3.744140625, "step": 2882 }, { "epoch": 0.5447588454816005, "grad_norm": 2.0573329606722592, "learning_rate": 5.5875540855577e-07, "logits/chosen": 2.89453125, "logits/rejected": 2.1923828125, "logps/chosen": -740.0, "logps/rejected": -738.0, "loss": 0.4821, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6533203125, "rewards/margins": 4.63671875, "rewards/rejected": -3.9765625, "step": 2883 }, { "epoch": 0.5449478010298078, "grad_norm": 2.625729358349814, "learning_rate": 5.584586506188638e-07, "logits/chosen": 2.64453125, "logits/rejected": 1.9033203125, "logps/chosen": -660.0, "logps/rejected": -735.0, "loss": 0.5795, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4324951171875, "rewards/margins": 3.76953125, "rewards/rejected": -3.33203125, "step": 2884 }, { "epoch": 0.545136756578015, "grad_norm": 2.147265659506374, "learning_rate": 5.58161889002027e-07, "logits/chosen": 2.369140625, "logits/rejected": 1.74609375, "logps/chosen": -949.0, "logps/rejected": -850.0, "loss": 0.6394, "rewards/accuracies": 0.75, "rewards/chosen": 1.462890625, "rewards/margins": 3.818359375, "rewards/rejected": -2.3564453125, "step": 2885 }, { "epoch": 0.5453257121262223, "grad_norm": 2.518085713005955, "learning_rate": 5.578651238343652e-07, "logits/chosen": 3.19140625, "logits/rejected": 3.251953125, "logps/chosen": -1202.0, "logps/rejected": -1204.0, "loss": 0.4952, "rewards/accuracies": 0.90625, "rewards/chosen": 1.939453125, "rewards/margins": 6.515625, "rewards/rejected": -4.58203125, "step": 2886 }, { "epoch": 0.5455146676744296, "grad_norm": 1.919390014346239, "learning_rate": 5.57568355244986e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.734375, "logps/chosen": -753.75, "logps/rejected": -1691.0, "loss": 0.4842, "rewards/accuracies": 0.875, "rewards/chosen": 1.31732177734375, "rewards/margins": 6.76171875, "rewards/rejected": -5.4375, "step": 2887 }, { "epoch": 0.5457036232226369, "grad_norm": 1.551700862818045, "learning_rate": 5.572715833629986e-07, "logits/chosen": 3.09375, "logits/rejected": 2.078125, "logps/chosen": -615.0, "logps/rejected": -590.25, "loss": 0.5438, "rewards/accuracies": 0.84375, "rewards/chosen": 1.00390625, "rewards/margins": 4.2421875, "rewards/rejected": -3.240234375, "step": 2888 }, { "epoch": 0.5458925787708442, "grad_norm": 2.047434766723561, "learning_rate": 5.569748083175133e-07, "logits/chosen": 2.38671875, "logits/rejected": 2.068115234375, "logps/chosen": -889.0, "logps/rejected": -956.5, "loss": 0.587, "rewards/accuracies": 0.78125, "rewards/chosen": 1.261474609375, "rewards/margins": 4.171875, "rewards/rejected": -2.91015625, "step": 2889 }, { "epoch": 0.5460815343190515, "grad_norm": 1.269807740896515, "learning_rate": 5.566780302376418e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.6796875, "logps/chosen": -529.5, "logps/rejected": -777.5, "loss": 0.6534, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9560546875, "rewards/margins": 4.0234375, "rewards/rejected": -3.0703125, "step": 2890 }, { "epoch": 0.5462704898672587, "grad_norm": 2.0629852256578496, "learning_rate": 5.563812492524973e-07, "logits/chosen": 2.767578125, "logits/rejected": 2.359375, "logps/chosen": -521.5, "logps/rejected": -1414.0, "loss": 0.5446, "rewards/accuracies": 0.90625, "rewards/chosen": 0.77545166015625, "rewards/margins": 4.5703125, "rewards/rejected": -3.796875, "step": 2891 }, { "epoch": 0.546459445415466, "grad_norm": 1.9154600894047056, "learning_rate": 5.56084465491194e-07, "logits/chosen": 3.009765625, "logits/rejected": 2.61328125, "logps/chosen": -773.5, "logps/rejected": -682.0, "loss": 0.5656, "rewards/accuracies": 0.65625, "rewards/chosen": 0.94384765625, "rewards/margins": 4.115234375, "rewards/rejected": -3.171875, "step": 2892 }, { "epoch": 0.5466484009636733, "grad_norm": 2.6465836572430934, "learning_rate": 5.55787679082848e-07, "logits/chosen": 2.74853515625, "logits/rejected": 2.69921875, "logps/chosen": -686.0, "logps/rejected": -1430.0, "loss": 0.6649, "rewards/accuracies": 0.75, "rewards/chosen": 0.841796875, "rewards/margins": 4.33203125, "rewards/rejected": -3.4765625, "step": 2893 }, { "epoch": 0.5468373565118806, "grad_norm": 1.9280725683634181, "learning_rate": 5.554908901565756e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.046875, "logps/chosen": -725.0, "logps/rejected": -559.5, "loss": 0.5403, "rewards/accuracies": 0.84375, "rewards/chosen": 0.92333984375, "rewards/margins": 4.19921875, "rewards/rejected": -3.27734375, "step": 2894 }, { "epoch": 0.5470263120600879, "grad_norm": 1.6963025045817581, "learning_rate": 5.551940988414947e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.353515625, "logps/chosen": -1210.0, "logps/rejected": -1073.0, "loss": 0.4797, "rewards/accuracies": 0.875, "rewards/chosen": 1.9423828125, "rewards/margins": 6.140625, "rewards/rejected": -4.1875, "step": 2895 }, { "epoch": 0.5472152676082952, "grad_norm": 3.657877853124192, "learning_rate": 5.548973052667244e-07, "logits/chosen": 3.484375, "logits/rejected": 2.76953125, "logps/chosen": -973.5, "logps/rejected": -884.5, "loss": 0.6611, "rewards/accuracies": 0.75, "rewards/chosen": 0.880859375, "rewards/margins": 4.576171875, "rewards/rejected": -3.6953125, "step": 2896 }, { "epoch": 0.5474042231565024, "grad_norm": 1.8435244599586578, "learning_rate": 5.546005095613844e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.46923828125, "logps/chosen": -451.0, "logps/rejected": -609.0, "loss": 0.5599, "rewards/accuracies": 0.78125, "rewards/chosen": 1.03106689453125, "rewards/margins": 4.3193359375, "rewards/rejected": -3.28515625, "step": 2897 }, { "epoch": 0.5475931787047097, "grad_norm": 1.3802062720285828, "learning_rate": 5.543037118545954e-07, "logits/chosen": 3.0546875, "logits/rejected": 2.67578125, "logps/chosen": -953.0, "logps/rejected": -1131.0, "loss": 0.5259, "rewards/accuracies": 0.84375, "rewards/chosen": 1.74609375, "rewards/margins": 6.5546875, "rewards/rejected": -4.814453125, "step": 2898 }, { "epoch": 0.547782134252917, "grad_norm": 3.04614985865949, "learning_rate": 5.540069122754792e-07, "logits/chosen": 3.0859375, "logits/rejected": 2.7734375, "logps/chosen": -832.0, "logps/rejected": -841.5, "loss": 0.537, "rewards/accuracies": 0.875, "rewards/chosen": 0.8076171875, "rewards/margins": 5.0703125, "rewards/rejected": -4.26171875, "step": 2899 }, { "epoch": 0.5479710898011243, "grad_norm": 2.256584980549599, "learning_rate": 5.537101109531583e-07, "logits/chosen": 2.3173828125, "logits/rejected": 2.19775390625, "logps/chosen": -935.5, "logps/rejected": -1148.5, "loss": 0.5118, "rewards/accuracies": 0.875, "rewards/chosen": 1.19024658203125, "rewards/margins": 6.0859375, "rewards/rejected": -4.890625, "step": 2900 }, { "epoch": 0.5481600453493316, "grad_norm": 1.8641928426910161, "learning_rate": 5.534133080167558e-07, "logits/chosen": 3.16015625, "logits/rejected": 2.79296875, "logps/chosen": -564.0, "logps/rejected": -681.5, "loss": 0.594, "rewards/accuracies": 0.78125, "rewards/chosen": 0.93310546875, "rewards/margins": 4.0625, "rewards/rejected": -3.126953125, "step": 2901 }, { "epoch": 0.5483490008975389, "grad_norm": 1.5745080174941442, "learning_rate": 5.531165035953959e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.103515625, "logps/chosen": -725.0, "logps/rejected": -786.0, "loss": 0.6224, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74951171875, "rewards/margins": 3.76171875, "rewards/rejected": -3.01953125, "step": 2902 }, { "epoch": 0.5485379564457461, "grad_norm": 2.0605243653220726, "learning_rate": 5.528196978182028e-07, "logits/chosen": 1.4755859375, "logits/rejected": 1.044921875, "logps/chosen": -884.0, "logps/rejected": -1002.0, "loss": 0.4838, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4970703125, "rewards/margins": 5.375, "rewards/rejected": -3.875, "step": 2903 }, { "epoch": 0.5487269119939534, "grad_norm": 1.4601799644462423, "learning_rate": 5.525228908143019e-07, "logits/chosen": 2.880859375, "logits/rejected": 2.8984375, "logps/chosen": -663.0, "logps/rejected": -1549.0, "loss": 0.6607, "rewards/accuracies": 0.6875, "rewards/chosen": 1.158203125, "rewards/margins": 6.201171875, "rewards/rejected": -5.0322265625, "step": 2904 }, { "epoch": 0.5489158675421607, "grad_norm": 1.9567776646589616, "learning_rate": 5.52226082712819e-07, "logits/chosen": 2.234375, "logits/rejected": 1.65869140625, "logps/chosen": -773.0, "logps/rejected": -16099.0, "loss": 0.6603, "rewards/accuracies": 0.6875, "rewards/chosen": 1.21875, "rewards/margins": -66.7265625, "rewards/rejected": 67.80078125, "step": 2905 }, { "epoch": 0.549104823090368, "grad_norm": 1.861511057906443, "learning_rate": 5.519292736428801e-07, "logits/chosen": 2.65625, "logits/rejected": 2.15234375, "logps/chosen": -709.0, "logps/rejected": -402.0, "loss": 0.6625, "rewards/accuracies": 0.78125, "rewards/chosen": 0.84375, "rewards/margins": 3.93359375, "rewards/rejected": -3.08984375, "step": 2906 }, { "epoch": 0.5492937786385753, "grad_norm": 3.650461212049953, "learning_rate": 5.516324637336119e-07, "logits/chosen": 2.29296875, "logits/rejected": 2.203125, "logps/chosen": -867.0, "logps/rejected": -1595.0, "loss": 0.6592, "rewards/accuracies": 0.75, "rewards/chosen": 0.73046875, "rewards/margins": 17.73828125, "rewards/rejected": -17.03515625, "step": 2907 }, { "epoch": 0.5494827341867825, "grad_norm": 1.9024202989800478, "learning_rate": 5.513356531141414e-07, "logits/chosen": 2.203125, "logits/rejected": 2.126953125, "logps/chosen": -855.0, "logps/rejected": -1442.0, "loss": 0.6064, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9921875, "rewards/margins": 5.984375, "rewards/rejected": -5.0, "step": 2908 }, { "epoch": 0.5496716897349898, "grad_norm": 1.6375107781970148, "learning_rate": 5.510388419135956e-07, "logits/chosen": 2.21484375, "logits/rejected": 1.8568115234375, "logps/chosen": -858.5, "logps/rejected": -1300.0, "loss": 0.5347, "rewards/accuracies": 0.875, "rewards/chosen": 0.28369140625, "rewards/margins": 8.5703125, "rewards/rejected": -8.296875, "step": 2909 }, { "epoch": 0.5498606452831971, "grad_norm": 1.7616031230367597, "learning_rate": 5.507420302611024e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.62890625, "logps/chosen": -914.5, "logps/rejected": -664.0, "loss": 0.5384, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1171875, "rewards/margins": 4.44140625, "rewards/rejected": -3.3173828125, "step": 2910 }, { "epoch": 0.5500496008314044, "grad_norm": 2.139740885804995, "learning_rate": 5.504452182857895e-07, "logits/chosen": 3.43359375, "logits/rejected": 3.060546875, "logps/chosen": -854.5, "logps/rejected": -744.0, "loss": 0.6448, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8818359375, "rewards/margins": 3.294921875, "rewards/rejected": -2.41748046875, "step": 2911 }, { "epoch": 0.5502385563796117, "grad_norm": 1.8486062596754824, "learning_rate": 5.501484061167846e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.279296875, "logps/chosen": -920.5, "logps/rejected": -799.0, "loss": 0.5924, "rewards/accuracies": 0.78125, "rewards/chosen": 0.87890625, "rewards/margins": 4.85546875, "rewards/rejected": -3.9755859375, "step": 2912 }, { "epoch": 0.550427511927819, "grad_norm": 1.8960011379438428, "learning_rate": 5.498515938832154e-07, "logits/chosen": 2.25390625, "logits/rejected": 2.337890625, "logps/chosen": -526.0, "logps/rejected": -569.0, "loss": 0.8162, "rewards/accuracies": 0.625, "rewards/chosen": -0.31640625, "rewards/margins": 2.9736328125, "rewards/rejected": -3.29443359375, "step": 2913 }, { "epoch": 0.5506164674760262, "grad_norm": 2.3368504336452074, "learning_rate": 5.495547817142106e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.2080078125, "logps/chosen": -697.5, "logps/rejected": -1037.0, "loss": 0.6937, "rewards/accuracies": 0.6875, "rewards/chosen": -0.029296875, "rewards/margins": 4.4140625, "rewards/rejected": -4.43359375, "step": 2914 }, { "epoch": 0.5508054230242335, "grad_norm": 1.6276893446280751, "learning_rate": 5.492579697388976e-07, "logits/chosen": 1.287109375, "logits/rejected": 0.793212890625, "logps/chosen": -614.0, "logps/rejected": -753.0, "loss": 0.489, "rewards/accuracies": 0.90625, "rewards/chosen": 0.61279296875, "rewards/margins": 5.359375, "rewards/rejected": -4.75, "step": 2915 }, { "epoch": 0.5509943785724408, "grad_norm": 2.0551996527144367, "learning_rate": 5.489611580864043e-07, "logits/chosen": 3.388671875, "logits/rejected": 3.01953125, "logps/chosen": -659.0, "logps/rejected": -864.0, "loss": 0.603, "rewards/accuracies": 0.8125, "rewards/chosen": 1.16845703125, "rewards/margins": 4.005859375, "rewards/rejected": -2.83984375, "step": 2916 }, { "epoch": 0.5511833341206481, "grad_norm": 2.213126349458722, "learning_rate": 5.486643468858587e-07, "logits/chosen": 2.4912109375, "logits/rejected": 2.251953125, "logps/chosen": -721.0, "logps/rejected": -733.0, "loss": 0.6478, "rewards/accuracies": 0.78125, "rewards/chosen": 0.431396484375, "rewards/margins": 3.396484375, "rewards/rejected": -2.96484375, "step": 2917 }, { "epoch": 0.5513722896688554, "grad_norm": 2.899497262046931, "learning_rate": 5.483675362663881e-07, "logits/chosen": 3.1796875, "logits/rejected": 2.42578125, "logps/chosen": -768.5, "logps/rejected": -839.0, "loss": 0.5157, "rewards/accuracies": 0.875, "rewards/chosen": 1.18896484375, "rewards/margins": 4.90234375, "rewards/rejected": -3.7109375, "step": 2918 }, { "epoch": 0.5515612452170627, "grad_norm": 2.2120707245407973, "learning_rate": 5.4807072635712e-07, "logits/chosen": 2.2177734375, "logits/rejected": 2.305419921875, "logps/chosen": -1286.0, "logps/rejected": -1930.0, "loss": 0.6918, "rewards/accuracies": 0.71875, "rewards/chosen": 0.23486328125, "rewards/margins": 6.810546875, "rewards/rejected": -6.587890625, "step": 2919 }, { "epoch": 0.5517502007652699, "grad_norm": 2.2974650815156727, "learning_rate": 5.47773917287181e-07, "logits/chosen": 2.2724609375, "logits/rejected": 2.458984375, "logps/chosen": -759.0, "logps/rejected": -872.0, "loss": 0.676, "rewards/accuracies": 0.75, "rewards/chosen": 0.008056640625, "rewards/margins": 4.213623046875, "rewards/rejected": -4.216796875, "step": 2920 }, { "epoch": 0.5519391563134772, "grad_norm": 2.3185038243125864, "learning_rate": 5.474771091856981e-07, "logits/chosen": 1.76904296875, "logits/rejected": 1.97412109375, "logps/chosen": -432.0, "logps/rejected": -527.5, "loss": 0.6791, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1513671875, "rewards/margins": 3.58984375, "rewards/rejected": -3.73046875, "step": 2921 }, { "epoch": 0.5521281118616845, "grad_norm": 1.7139885152827279, "learning_rate": 5.471803021817972e-07, "logits/chosen": 3.359375, "logits/rejected": 2.8115234375, "logps/chosen": -1102.0, "logps/rejected": -1242.0, "loss": 0.5572, "rewards/accuracies": 0.875, "rewards/chosen": 1.34417724609375, "rewards/margins": 7.1796875, "rewards/rejected": -5.8515625, "step": 2922 }, { "epoch": 0.5523170674098918, "grad_norm": 1.4813498286618303, "learning_rate": 5.468834964046041e-07, "logits/chosen": 1.978515625, "logits/rejected": 1.853515625, "logps/chosen": -773.5, "logps/rejected": -847.0, "loss": 0.6057, "rewards/accuracies": 0.8125, "rewards/chosen": -0.148193359375, "rewards/margins": 4.40234375, "rewards/rejected": -4.546875, "step": 2923 }, { "epoch": 0.5525060229580991, "grad_norm": 2.225677908176233, "learning_rate": 5.465866919832443e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.521484375, "logps/chosen": -503.25, "logps/rejected": -537.5, "loss": 0.5279, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8544921875, "rewards/margins": 4.46875, "rewards/rejected": -3.6171875, "step": 2924 }, { "epoch": 0.5526949785063064, "grad_norm": 1.9246214543457318, "learning_rate": 5.462898890468417e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.181640625, "logps/chosen": -673.625, "logps/rejected": -733.5, "loss": 0.7082, "rewards/accuracies": 0.75, "rewards/chosen": 0.091796875, "rewards/margins": 3.765625, "rewards/rejected": -3.669921875, "step": 2925 }, { "epoch": 0.5528839340545136, "grad_norm": 1.7588840895956441, "learning_rate": 5.459930877245208e-07, "logits/chosen": 3.20703125, "logits/rejected": 3.3359375, "logps/chosen": -962.0, "logps/rejected": -1591.0, "loss": 0.5721, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5225830078125, "rewards/margins": 11.78515625, "rewards/rejected": -10.27734375, "step": 2926 }, { "epoch": 0.5530728896027209, "grad_norm": 2.0247866876880978, "learning_rate": 5.456962881454045e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.9453125, "logps/chosen": -769.0, "logps/rejected": -1471.0, "loss": 0.5284, "rewards/accuracies": 0.84375, "rewards/chosen": 0.71875, "rewards/margins": 7.296875, "rewards/rejected": -6.5703125, "step": 2927 }, { "epoch": 0.5532618451509282, "grad_norm": 2.2513862295474105, "learning_rate": 5.453994904386157e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.6484375, "logps/chosen": -744.5, "logps/rejected": -1646.0, "loss": 0.6351, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9349365234375, "rewards/margins": 7.98828125, "rewards/rejected": -7.04296875, "step": 2928 }, { "epoch": 0.5534508006991355, "grad_norm": 1.725423219134925, "learning_rate": 5.451026947332756e-07, "logits/chosen": 2.98046875, "logits/rejected": 2.416015625, "logps/chosen": -652.0, "logps/rejected": -597.5, "loss": 0.6852, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9560546875, "rewards/margins": 3.04296875, "rewards/rejected": -2.080078125, "step": 2929 }, { "epoch": 0.5536397562473429, "grad_norm": 1.9935228574802062, "learning_rate": 5.448059011585052e-07, "logits/chosen": 1.427734375, "logits/rejected": 1.7073974609375, "logps/chosen": -1058.0, "logps/rejected": -1384.0, "loss": 0.5949, "rewards/accuracies": 0.78125, "rewards/chosen": 1.108642578125, "rewards/margins": 5.5234375, "rewards/rejected": -4.4140625, "step": 2930 }, { "epoch": 0.55382871179555, "grad_norm": 1.777016656768839, "learning_rate": 5.445091098434243e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.1806640625, "logps/chosen": -578.5, "logps/rejected": -608.0, "loss": 0.6002, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74609375, "rewards/margins": 4.02734375, "rewards/rejected": -3.2734375, "step": 2931 }, { "epoch": 0.5540176673437573, "grad_norm": 1.9242316604401581, "learning_rate": 5.44212320917152e-07, "logits/chosen": 2.359375, "logits/rejected": 2.5556640625, "logps/chosen": -756.5, "logps/rejected": -1122.0, "loss": 0.6993, "rewards/accuracies": 0.75, "rewards/chosen": 0.6083984375, "rewards/margins": 4.2890625, "rewards/rejected": -3.681640625, "step": 2932 }, { "epoch": 0.5542066228919647, "grad_norm": 2.4577491723758293, "learning_rate": 5.43915534508806e-07, "logits/chosen": 2.671875, "logits/rejected": 2.490234375, "logps/chosen": -902.0, "logps/rejected": -1027.0, "loss": 0.4955, "rewards/accuracies": 0.8125, "rewards/chosen": 1.7265625, "rewards/margins": 5.2890625, "rewards/rejected": -3.56640625, "step": 2933 }, { "epoch": 0.554395578440172, "grad_norm": 1.6486664501637718, "learning_rate": 5.436187507475028e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.8671875, "logps/chosen": -1268.0, "logps/rejected": -1498.0, "loss": 0.4038, "rewards/accuracies": 0.875, "rewards/chosen": 2.109375, "rewards/margins": 8.2578125, "rewards/rejected": -6.15625, "step": 2934 }, { "epoch": 0.5545845339883793, "grad_norm": 2.254296224745545, "learning_rate": 5.433219697623584e-07, "logits/chosen": 2.3046875, "logits/rejected": 2.18359375, "logps/chosen": -610.25, "logps/rejected": -562.0, "loss": 0.7168, "rewards/accuracies": 0.625, "rewards/chosen": -0.07177734375, "rewards/margins": 2.5634765625, "rewards/rejected": -2.64453125, "step": 2935 }, { "epoch": 0.5547734895365866, "grad_norm": 1.695529173284165, "learning_rate": 5.430251916824868e-07, "logits/chosen": 2.3125, "logits/rejected": 1.921875, "logps/chosen": -1107.0, "logps/rejected": -1593.0, "loss": 0.5586, "rewards/accuracies": 0.8125, "rewards/chosen": 1.095703125, "rewards/margins": 5.484375, "rewards/rejected": -4.388671875, "step": 2936 }, { "epoch": 0.5549624450847938, "grad_norm": 2.3534669027696333, "learning_rate": 5.427284166370013e-07, "logits/chosen": 1.6005859375, "logits/rejected": 1.6328125, "logps/chosen": -778.5, "logps/rejected": -963.5, "loss": 0.5535, "rewards/accuracies": 0.84375, "rewards/chosen": 1.306304931640625, "rewards/margins": 5.392578125, "rewards/rejected": -4.07421875, "step": 2937 }, { "epoch": 0.5551514006330011, "grad_norm": 1.3373185556728315, "learning_rate": 5.424316447550139e-07, "logits/chosen": 2.875, "logits/rejected": 2.443359375, "logps/chosen": -422.5, "logps/rejected": -665.0, "loss": 0.516, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45947265625, "rewards/margins": 5.265625, "rewards/rejected": -4.80078125, "step": 2938 }, { "epoch": 0.5553403561812084, "grad_norm": 2.761505080753374, "learning_rate": 5.421348761656348e-07, "logits/chosen": 2.583984375, "logits/rejected": 2.87890625, "logps/chosen": -779.0, "logps/rejected": -794.0, "loss": 0.6822, "rewards/accuracies": 0.8125, "rewards/chosen": 0.25341796875, "rewards/margins": 3.52734375, "rewards/rejected": -3.2734375, "step": 2939 }, { "epoch": 0.5555293117294157, "grad_norm": 2.4578171841490772, "learning_rate": 5.418381109979731e-07, "logits/chosen": 2.05810546875, "logits/rejected": 1.3115234375, "logps/chosen": -1024.0, "logps/rejected": -1033.0, "loss": 0.5546, "rewards/accuracies": 0.875, "rewards/chosen": 0.7265625, "rewards/margins": 5.8515625, "rewards/rejected": -5.12109375, "step": 2940 }, { "epoch": 0.555718267277623, "grad_norm": 2.4701221451937765, "learning_rate": 5.41541349381136e-07, "logits/chosen": 2.2421875, "logits/rejected": 1.6650390625, "logps/chosen": -633.0, "logps/rejected": -693.0, "loss": 0.5552, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5662841796875, "rewards/margins": 5.28125, "rewards/rejected": -4.71484375, "step": 2941 }, { "epoch": 0.5559072228258303, "grad_norm": 1.985700564162136, "learning_rate": 5.412445914442301e-07, "logits/chosen": 2.748046875, "logits/rejected": 2.21826171875, "logps/chosen": -871.0, "logps/rejected": -1629.0, "loss": 0.5973, "rewards/accuracies": 0.84375, "rewards/chosen": 0.763671875, "rewards/margins": 10.140625, "rewards/rejected": -9.390625, "step": 2942 }, { "epoch": 0.5560961783740375, "grad_norm": 2.017115768253479, "learning_rate": 5.409478373163592e-07, "logits/chosen": 2.720703125, "logits/rejected": 2.537109375, "logps/chosen": -967.0, "logps/rejected": -1298.0, "loss": 0.6425, "rewards/accuracies": 0.71875, "rewards/chosen": 0.962890625, "rewards/margins": 6.0078125, "rewards/rejected": -5.03125, "step": 2943 }, { "epoch": 0.5562851339222448, "grad_norm": 1.9566210438142395, "learning_rate": 5.40651087126626e-07, "logits/chosen": 3.5625, "logits/rejected": 3.171875, "logps/chosen": -870.5, "logps/rejected": -890.5, "loss": 0.5514, "rewards/accuracies": 0.875, "rewards/chosen": 0.6220703125, "rewards/margins": 5.73046875, "rewards/rejected": -5.1015625, "step": 2944 }, { "epoch": 0.5564740894704521, "grad_norm": 2.5320960917418183, "learning_rate": 5.40354341004132e-07, "logits/chosen": 1.900390625, "logits/rejected": 1.162109375, "logps/chosen": -945.0, "logps/rejected": -1077.5, "loss": 0.56, "rewards/accuracies": 0.84375, "rewards/chosen": 0.89599609375, "rewards/margins": 6.0, "rewards/rejected": -5.1015625, "step": 2945 }, { "epoch": 0.5566630450186594, "grad_norm": 3.049224494064219, "learning_rate": 5.400575990779758e-07, "logits/chosen": 3.06640625, "logits/rejected": 2.234375, "logps/chosen": -1574.0, "logps/rejected": -769.5, "loss": 0.6379, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1259765625, "rewards/margins": 5.046875, "rewards/rejected": -4.9140625, "step": 2946 }, { "epoch": 0.5568520005668667, "grad_norm": 1.6962676561612227, "learning_rate": 5.397608614772552e-07, "logits/chosen": 2.154296875, "logits/rejected": 1.931640625, "logps/chosen": -825.5, "logps/rejected": -830.5, "loss": 0.6034, "rewards/accuracies": 0.71875, "rewards/chosen": 1.31201171875, "rewards/margins": 5.23046875, "rewards/rejected": -3.9140625, "step": 2947 }, { "epoch": 0.557040956115074, "grad_norm": 2.4435803591754808, "learning_rate": 5.394641283310656e-07, "logits/chosen": 2.25390625, "logits/rejected": 1.986328125, "logps/chosen": -701.0, "logps/rejected": -822.5, "loss": 0.6269, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3349609375, "rewards/margins": 4.12890625, "rewards/rejected": -3.787109375, "step": 2948 }, { "epoch": 0.5572299116632812, "grad_norm": 1.9763194952426186, "learning_rate": 5.391673997685006e-07, "logits/chosen": 2.193359375, "logits/rejected": 2.240234375, "logps/chosen": -948.0, "logps/rejected": -800.0, "loss": 0.696, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6298828125, "rewards/margins": 3.57421875, "rewards/rejected": -2.953125, "step": 2949 }, { "epoch": 0.5574188672114885, "grad_norm": 1.7166841876646697, "learning_rate": 5.388706759186517e-07, "logits/chosen": 3.73828125, "logits/rejected": 2.9140625, "logps/chosen": -726.25, "logps/rejected": -704.5, "loss": 0.7138, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21142578125, "rewards/margins": 3.734375, "rewards/rejected": -3.947265625, "step": 2950 }, { "epoch": 0.5576078227596958, "grad_norm": 1.6201941829340307, "learning_rate": 5.385739569106084e-07, "logits/chosen": 2.53125, "logits/rejected": 1.763671875, "logps/chosen": -723.0, "logps/rejected": -1314.0, "loss": 0.5548, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1796875, "rewards/margins": 6.203125, "rewards/rejected": -6.03125, "step": 2951 }, { "epoch": 0.5577967783079031, "grad_norm": 2.122670465539804, "learning_rate": 5.382772428734583e-07, "logits/chosen": 1.6064453125, "logits/rejected": 1.63818359375, "logps/chosen": -682.5, "logps/rejected": -888.0, "loss": 0.5829, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8311767578125, "rewards/margins": 5.390625, "rewards/rejected": -4.560546875, "step": 2952 }, { "epoch": 0.5579857338561104, "grad_norm": 2.800701397495754, "learning_rate": 5.379805339362865e-07, "logits/chosen": 2.412109375, "logits/rejected": 2.05859375, "logps/chosen": -874.0, "logps/rejected": -743.0, "loss": 0.6061, "rewards/accuracies": 0.875, "rewards/chosen": 0.6279296875, "rewards/margins": 4.703125, "rewards/rejected": -4.087890625, "step": 2953 }, { "epoch": 0.5581746894043176, "grad_norm": 2.6960418230569574, "learning_rate": 5.376838302281762e-07, "logits/chosen": 1.235595703125, "logits/rejected": 0.97900390625, "logps/chosen": -963.5, "logps/rejected": -956.0, "loss": 0.5076, "rewards/accuracies": 0.90625, "rewards/chosen": 1.77587890625, "rewards/margins": 5.7421875, "rewards/rejected": -3.9736328125, "step": 2954 }, { "epoch": 0.5583636449525249, "grad_norm": 1.710775502393805, "learning_rate": 5.37387131878208e-07, "logits/chosen": 2.8916015625, "logits/rejected": 2.1396484375, "logps/chosen": -456.0, "logps/rejected": -1065.0, "loss": 0.596, "rewards/accuracies": 0.78125, "rewards/chosen": 0.57275390625, "rewards/margins": 8.015625, "rewards/rejected": -7.41796875, "step": 2955 }, { "epoch": 0.5585526005007322, "grad_norm": 1.6477066735819128, "learning_rate": 5.370904390154606e-07, "logits/chosen": 2.705078125, "logits/rejected": 2.7421875, "logps/chosen": -978.0, "logps/rejected": -1173.0, "loss": 0.6242, "rewards/accuracies": 0.78125, "rewards/chosen": 1.8671875, "rewards/margins": 4.64453125, "rewards/rejected": -2.76953125, "step": 2956 }, { "epoch": 0.5587415560489395, "grad_norm": 2.1297273206522656, "learning_rate": 5.367937517690095e-07, "logits/chosen": 3.13671875, "logits/rejected": 3.271484375, "logps/chosen": -956.0, "logps/rejected": -1383.0, "loss": 0.4508, "rewards/accuracies": 0.84375, "rewards/chosen": 1.8857421875, "rewards/margins": 7.43359375, "rewards/rejected": -5.5390625, "step": 2957 }, { "epoch": 0.5589305115971468, "grad_norm": 2.5916489587372022, "learning_rate": 5.364970702679289e-07, "logits/chosen": 2.375, "logits/rejected": 2.5546875, "logps/chosen": -713.5, "logps/rejected": -1226.0, "loss": 0.6838, "rewards/accuracies": 0.71875, "rewards/chosen": 0.357940673828125, "rewards/margins": 3.69140625, "rewards/rejected": -3.33203125, "step": 2958 }, { "epoch": 0.5591194671453541, "grad_norm": 3.7087052956726683, "learning_rate": 5.362003946412898e-07, "logits/chosen": 2.3505859375, "logits/rejected": 2.74609375, "logps/chosen": -796.0, "logps/rejected": -1713.0, "loss": 0.6543, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0679931640625, "rewards/margins": 5.40625, "rewards/rejected": -5.466796875, "step": 2959 }, { "epoch": 0.5593084226935613, "grad_norm": 1.552628328030376, "learning_rate": 5.359037250181605e-07, "logits/chosen": 2.71875, "logits/rejected": 2.7421875, "logps/chosen": -852.0, "logps/rejected": -1190.0, "loss": 0.585, "rewards/accuracies": 0.78125, "rewards/chosen": 1.18359375, "rewards/margins": 5.39453125, "rewards/rejected": -4.205078125, "step": 2960 }, { "epoch": 0.5594973782417686, "grad_norm": 2.719763819508849, "learning_rate": 5.356070615276072e-07, "logits/chosen": 2.935546875, "logits/rejected": 2.212890625, "logps/chosen": -837.0, "logps/rejected": -800.0, "loss": 0.5038, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3397216796875, "rewards/margins": 5.3828125, "rewards/rejected": -4.0390625, "step": 2961 }, { "epoch": 0.5596863337899759, "grad_norm": 1.2970982568228997, "learning_rate": 5.353104042986928e-07, "logits/chosen": 3.12109375, "logits/rejected": 1.8519287109375, "logps/chosen": -764.5, "logps/rejected": -11958.5, "loss": 0.4659, "rewards/accuracies": 0.875, "rewards/chosen": 1.2734375, "rewards/margins": 24.09375, "rewards/rejected": -22.841796875, "step": 2962 }, { "epoch": 0.5598752893381832, "grad_norm": 1.8617370959272637, "learning_rate": 5.350137534604783e-07, "logits/chosen": 1.8193359375, "logits/rejected": 1.814453125, "logps/chosen": -664.5, "logps/rejected": -719.0, "loss": 0.7469, "rewards/accuracies": 0.625, "rewards/chosen": 0.718475341796875, "rewards/margins": 3.466796875, "rewards/rejected": -2.75390625, "step": 2963 }, { "epoch": 0.5600642448863905, "grad_norm": 2.87964834540087, "learning_rate": 5.34717109142021e-07, "logits/chosen": 3.115234375, "logits/rejected": 2.6640625, "logps/chosen": -581.0, "logps/rejected": -904.0, "loss": 0.5285, "rewards/accuracies": 0.90625, "rewards/chosen": 0.28515625, "rewards/margins": 5.6875, "rewards/rejected": -5.40625, "step": 2964 }, { "epoch": 0.5602532004345978, "grad_norm": 3.391531024683644, "learning_rate": 5.344204714723763e-07, "logits/chosen": 2.3515625, "logits/rejected": 1.81640625, "logps/chosen": -1649.0, "logps/rejected": -1013.0, "loss": 0.5383, "rewards/accuracies": 0.84375, "rewards/chosen": -0.626953125, "rewards/margins": 4.01904296875, "rewards/rejected": -4.6484375, "step": 2965 }, { "epoch": 0.560442155982805, "grad_norm": 1.8138685443163325, "learning_rate": 5.34123840580596e-07, "logits/chosen": 2.265625, "logits/rejected": 1.6630859375, "logps/chosen": -777.0, "logps/rejected": -666.0, "loss": 0.6456, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4970703125, "rewards/margins": 3.591796875, "rewards/rejected": -3.08984375, "step": 2966 }, { "epoch": 0.5606311115310123, "grad_norm": 1.6258570457374066, "learning_rate": 5.338272165957291e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.390625, "logps/chosen": -682.0, "logps/rejected": -1117.0, "loss": 0.4927, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1646728515625, "rewards/margins": 7.1328125, "rewards/rejected": -6.96875, "step": 2967 }, { "epoch": 0.5608200670792196, "grad_norm": 3.690510451399294, "learning_rate": 5.33530599646822e-07, "logits/chosen": 1.92578125, "logits/rejected": 1.703125, "logps/chosen": -738.5, "logps/rejected": -918.0, "loss": 0.6126, "rewards/accuracies": 0.84375, "rewards/chosen": 0.32568359375, "rewards/margins": 4.2734375, "rewards/rejected": -3.94140625, "step": 2968 }, { "epoch": 0.5610090226274269, "grad_norm": 2.2926275639786944, "learning_rate": 5.332339898629173e-07, "logits/chosen": 3.16796875, "logits/rejected": 2.89453125, "logps/chosen": -786.5, "logps/rejected": -769.0, "loss": 0.5288, "rewards/accuracies": 0.84375, "rewards/chosen": 1.02099609375, "rewards/margins": 4.6484375, "rewards/rejected": -3.62890625, "step": 2969 }, { "epoch": 0.5611979781756342, "grad_norm": 2.4270407452153746, "learning_rate": 5.329373873730552e-07, "logits/chosen": 3.20703125, "logits/rejected": 2.615234375, "logps/chosen": -753.0, "logps/rejected": -752.0, "loss": 0.7497, "rewards/accuracies": 0.75, "rewards/chosen": 0.41357421875, "rewards/margins": 3.73828125, "rewards/rejected": -3.33203125, "step": 2970 }, { "epoch": 0.5613869337238415, "grad_norm": 2.37856223313587, "learning_rate": 5.326407923062723e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.361328125, "logps/chosen": -1142.5, "logps/rejected": -1330.5, "loss": 0.7249, "rewards/accuracies": 0.78125, "rewards/chosen": 0.228271484375, "rewards/margins": 5.466796875, "rewards/rejected": -5.23046875, "step": 2971 }, { "epoch": 0.5615758892720487, "grad_norm": 2.8650065679633494, "learning_rate": 5.323442047916023e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.3203125, "logps/chosen": -1097.0, "logps/rejected": -957.0, "loss": 0.4668, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7421875, "rewards/margins": 5.4609375, "rewards/rejected": -4.72265625, "step": 2972 }, { "epoch": 0.561764844820256, "grad_norm": 2.0731400709155134, "learning_rate": 5.320476249580753e-07, "logits/chosen": 2.478515625, "logits/rejected": 2.5482177734375, "logps/chosen": -960.0, "logps/rejected": -1886.0, "loss": 0.5554, "rewards/accuracies": 0.875, "rewards/chosen": 0.5712890625, "rewards/margins": 9.203125, "rewards/rejected": -8.640625, "step": 2973 }, { "epoch": 0.5619538003684633, "grad_norm": 1.9038388144821574, "learning_rate": 5.31751052934718e-07, "logits/chosen": 1.61328125, "logits/rejected": 1.494415283203125, "logps/chosen": -657.0, "logps/rejected": -604.5, "loss": 0.8295, "rewards/accuracies": 0.625, "rewards/chosen": -1.00390625, "rewards/margins": 2.5390625, "rewards/rejected": -3.5390625, "step": 2974 }, { "epoch": 0.5621427559166706, "grad_norm": 1.8714445244901685, "learning_rate": 5.314544888505542e-07, "logits/chosen": 2.544921875, "logits/rejected": 1.947265625, "logps/chosen": -649.0, "logps/rejected": -727.5, "loss": 0.5227, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58056640625, "rewards/margins": 6.703125, "rewards/rejected": -6.12890625, "step": 2975 }, { "epoch": 0.5623317114648779, "grad_norm": 2.3470287580186335, "learning_rate": 5.311579328346035e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.564453125, "logps/chosen": -488.0, "logps/rejected": -604.5, "loss": 0.6806, "rewards/accuracies": 0.75, "rewards/chosen": -0.92578125, "rewards/margins": 2.94140625, "rewards/rejected": -3.875, "step": 2976 }, { "epoch": 0.5625206670130851, "grad_norm": 2.50854860912531, "learning_rate": 5.308613850158831e-07, "logits/chosen": 2.96875, "logits/rejected": 2.38671875, "logps/chosen": -843.0, "logps/rejected": -821.0, "loss": 0.5936, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12939453125, "rewards/margins": 5.91796875, "rewards/rejected": -6.046875, "step": 2977 }, { "epoch": 0.5627096225612924, "grad_norm": 2.406932189538766, "learning_rate": 5.305648455234052e-07, "logits/chosen": 1.94921875, "logits/rejected": 1.59375, "logps/chosen": -850.0, "logps/rejected": -871.0, "loss": 0.4587, "rewards/accuracies": 0.90625, "rewards/chosen": 0.952392578125, "rewards/margins": 6.1796875, "rewards/rejected": -5.23046875, "step": 2978 }, { "epoch": 0.5628985781094997, "grad_norm": 2.246607177882641, "learning_rate": 5.302683144861794e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.58203125, "logps/chosen": -767.0, "logps/rejected": -1251.0, "loss": 0.5515, "rewards/accuracies": 0.78125, "rewards/chosen": 1.32568359375, "rewards/margins": 5.85546875, "rewards/rejected": -4.53125, "step": 2979 }, { "epoch": 0.563087533657707, "grad_norm": 2.639662076309948, "learning_rate": 5.299717920332116e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.5185546875, "logps/chosen": -719.25, "logps/rejected": -966.0, "loss": 0.6896, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7586669921875, "rewards/margins": 4.08984375, "rewards/rejected": -3.328125, "step": 2980 }, { "epoch": 0.5632764892059143, "grad_norm": 2.5064487792090118, "learning_rate": 5.296752782935034e-07, "logits/chosen": 2.923828125, "logits/rejected": 3.08203125, "logps/chosen": -1450.0, "logps/rejected": -1303.0, "loss": 0.568, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7718505859375, "rewards/margins": 5.90625, "rewards/rejected": -5.125, "step": 2981 }, { "epoch": 0.5634654447541216, "grad_norm": 3.6344497563032223, "learning_rate": 5.293787733960531e-07, "logits/chosen": 2.169921875, "logits/rejected": 2.328125, "logps/chosen": -848.5, "logps/rejected": -1403.0, "loss": 0.6154, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5919189453125, "rewards/margins": 4.76171875, "rewards/rejected": -4.171875, "step": 2982 }, { "epoch": 0.5636544003023288, "grad_norm": 2.01270670093636, "learning_rate": 5.290822774698548e-07, "logits/chosen": 1.587890625, "logits/rejected": 1.964599609375, "logps/chosen": -726.0, "logps/rejected": -1361.0, "loss": 0.5532, "rewards/accuracies": 0.84375, "rewards/chosen": 0.59228515625, "rewards/margins": 5.421875, "rewards/rejected": -4.841796875, "step": 2983 }, { "epoch": 0.5638433558505361, "grad_norm": 1.991815305120074, "learning_rate": 5.28785790643899e-07, "logits/chosen": 2.416015625, "logits/rejected": 2.52734375, "logps/chosen": -530.5, "logps/rejected": -634.0, "loss": 0.6888, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3409423828125, "rewards/margins": 3.55859375, "rewards/rejected": -3.90234375, "step": 2984 }, { "epoch": 0.5640323113987434, "grad_norm": 2.0905737554165427, "learning_rate": 5.284893130471721e-07, "logits/chosen": 2.955078125, "logits/rejected": 2.38671875, "logps/chosen": -582.5, "logps/rejected": -749.5, "loss": 0.5557, "rewards/accuracies": 0.78125, "rewards/chosen": 0.677978515625, "rewards/margins": 5.21484375, "rewards/rejected": -4.53125, "step": 2985 }, { "epoch": 0.5642212669469507, "grad_norm": 1.8423391515941399, "learning_rate": 5.281928448086566e-07, "logits/chosen": 2.6875, "logits/rejected": 2.603515625, "logps/chosen": -663.0, "logps/rejected": -838.0, "loss": 0.6334, "rewards/accuracies": 0.8125, "rewards/chosen": 0.947265625, "rewards/margins": 4.326171875, "rewards/rejected": -3.3779296875, "step": 2986 }, { "epoch": 0.564410222495158, "grad_norm": 2.195840624808198, "learning_rate": 5.278963860573308e-07, "logits/chosen": 1.6953125, "logits/rejected": 1.64453125, "logps/chosen": -728.5, "logps/rejected": -791.5, "loss": 0.6126, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1396484375, "rewards/margins": 3.7880859375, "rewards/rejected": -3.64453125, "step": 2987 }, { "epoch": 0.5645991780433653, "grad_norm": 2.4193952952246107, "learning_rate": 5.275999369221684e-07, "logits/chosen": 2.984375, "logits/rejected": 2.61328125, "logps/chosen": -848.5, "logps/rejected": -1579.0, "loss": 0.5486, "rewards/accuracies": 0.8125, "rewards/chosen": 0.720947265625, "rewards/margins": 7.3828125, "rewards/rejected": -6.68359375, "step": 2988 }, { "epoch": 0.5647881335915725, "grad_norm": 2.350282843413406, "learning_rate": 5.273034975321403e-07, "logits/chosen": 1.8544921875, "logits/rejected": 1.663330078125, "logps/chosen": -624.0, "logps/rejected": -1531.0, "loss": 0.4651, "rewards/accuracies": 0.90625, "rewards/chosen": 1.04718017578125, "rewards/margins": 7.3515625, "rewards/rejected": -6.3046875, "step": 2989 }, { "epoch": 0.5649770891397798, "grad_norm": 1.7648432683035502, "learning_rate": 5.270070680162117e-07, "logits/chosen": 3.2109375, "logits/rejected": 2.90234375, "logps/chosen": -797.0, "logps/rejected": -700.0, "loss": 0.5061, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5107421875, "rewards/margins": 4.36328125, "rewards/rejected": -4.88671875, "step": 2990 }, { "epoch": 0.5651660446879871, "grad_norm": 1.5800686494897986, "learning_rate": 5.267106485033441e-07, "logits/chosen": 2.18359375, "logits/rejected": 1.4794921875, "logps/chosen": -684.0, "logps/rejected": -774.5, "loss": 0.517, "rewards/accuracies": 0.9375, "rewards/chosen": 0.94140625, "rewards/margins": 5.07421875, "rewards/rejected": -4.126953125, "step": 2991 }, { "epoch": 0.5653550002361944, "grad_norm": 1.674548218196203, "learning_rate": 5.264142391224947e-07, "logits/chosen": 2.474609375, "logits/rejected": 1.861328125, "logps/chosen": -747.5, "logps/rejected": -1336.5, "loss": 0.5585, "rewards/accuracies": 0.84375, "rewards/chosen": -0.82666015625, "rewards/margins": 8.5751953125, "rewards/rejected": -9.39453125, "step": 2992 }, { "epoch": 0.5655439557844018, "grad_norm": 2.9897069406383263, "learning_rate": 5.261178400026161e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.119140625, "logps/chosen": -673.0, "logps/rejected": -762.5, "loss": 0.6279, "rewards/accuracies": 0.8125, "rewards/chosen": 0.283447265625, "rewards/margins": 5.046875, "rewards/rejected": -4.76171875, "step": 2993 }, { "epoch": 0.5657329113326091, "grad_norm": 2.0194480819117864, "learning_rate": 5.25821451272657e-07, "logits/chosen": 2.154296875, "logits/rejected": 1.77734375, "logps/chosen": -1280.0, "logps/rejected": -1038.0, "loss": 0.4871, "rewards/accuracies": 0.84375, "rewards/chosen": 1.32659912109375, "rewards/margins": 6.2265625, "rewards/rejected": -4.89453125, "step": 2994 }, { "epoch": 0.5659218668808162, "grad_norm": 2.147302830793205, "learning_rate": 5.255250730615606e-07, "logits/chosen": 3.046875, "logits/rejected": 2.62109375, "logps/chosen": -976.0, "logps/rejected": -1152.5, "loss": 0.5192, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2841796875, "rewards/margins": 9.3359375, "rewards/rejected": -8.04296875, "step": 2995 }, { "epoch": 0.5661108224290236, "grad_norm": 2.907522093733538, "learning_rate": 5.252287054982665e-07, "logits/chosen": 2.90234375, "logits/rejected": 1.9765625, "logps/chosen": -656.5, "logps/rejected": -933.0, "loss": 0.5257, "rewards/accuracies": 0.875, "rewards/chosen": 0.990966796875, "rewards/margins": 6.78125, "rewards/rejected": -5.7890625, "step": 2996 }, { "epoch": 0.5662997779772309, "grad_norm": 2.114428130887814, "learning_rate": 5.24932348711709e-07, "logits/chosen": 2.4208984375, "logits/rejected": 2.345703125, "logps/chosen": -741.0, "logps/rejected": -634.0, "loss": 0.6543, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6307373046875, "rewards/margins": 3.3671875, "rewards/rejected": -4.00390625, "step": 2997 }, { "epoch": 0.5664887335254382, "grad_norm": 2.632023192535414, "learning_rate": 5.24636002830818e-07, "logits/chosen": 1.45703125, "logits/rejected": 1.3819580078125, "logps/chosen": -660.0, "logps/rejected": -729.5, "loss": 0.5439, "rewards/accuracies": 0.8125, "rewards/chosen": 0.626953125, "rewards/margins": 4.8828125, "rewards/rejected": -4.25390625, "step": 2998 }, { "epoch": 0.5666776890736455, "grad_norm": 2.0184763789544653, "learning_rate": 5.243396679845184e-07, "logits/chosen": 2.384765625, "logits/rejected": 2.09765625, "logps/chosen": -677.0, "logps/rejected": -1132.5, "loss": 0.5475, "rewards/accuracies": 0.875, "rewards/chosen": 0.716064453125, "rewards/margins": 7.29296875, "rewards/rejected": -6.57421875, "step": 2999 }, { "epoch": 0.5668666446218527, "grad_norm": 9.673144123388123, "learning_rate": 5.240433443017306e-07, "logits/chosen": 2.892578125, "logits/rejected": 2.529296875, "logps/chosen": -1069.0, "logps/rejected": -831.0, "loss": 0.4356, "rewards/accuracies": 0.875, "rewards/chosen": 1.45751953125, "rewards/margins": 6.4453125, "rewards/rejected": -4.97265625, "step": 3000 }, { "epoch": 0.56705560017006, "grad_norm": 2.132913832795002, "learning_rate": 5.237470319113703e-07, "logits/chosen": 2.625, "logits/rejected": 1.99609375, "logps/chosen": -717.0, "logps/rejected": -1232.0, "loss": 0.6485, "rewards/accuracies": 0.78125, "rewards/chosen": 0.177734375, "rewards/margins": 5.3515625, "rewards/rejected": -5.171875, "step": 3001 }, { "epoch": 0.5672445557182673, "grad_norm": 2.5687491379661997, "learning_rate": 5.234507309423477e-07, "logits/chosen": 2.625, "logits/rejected": 2.349609375, "logps/chosen": -810.5, "logps/rejected": -752.0, "loss": 0.4936, "rewards/accuracies": 1.0, "rewards/chosen": 0.5379638671875, "rewards/margins": 6.0390625, "rewards/rejected": -5.4921875, "step": 3002 }, { "epoch": 0.5674335112664746, "grad_norm": 2.821959443557011, "learning_rate": 5.231544415235686e-07, "logits/chosen": 2.4140625, "logits/rejected": 1.974609375, "logps/chosen": -755.5, "logps/rejected": -753.5, "loss": 0.5278, "rewards/accuracies": 0.875, "rewards/chosen": 1.111328125, "rewards/margins": 5.5546875, "rewards/rejected": -4.4453125, "step": 3003 }, { "epoch": 0.5676224668146819, "grad_norm": 2.3011213933740855, "learning_rate": 5.228581637839333e-07, "logits/chosen": 3.154296875, "logits/rejected": 2.708984375, "logps/chosen": -642.0, "logps/rejected": -748.0, "loss": 0.633, "rewards/accuracies": 0.75, "rewards/chosen": -0.1328125, "rewards/margins": 3.99267578125, "rewards/rejected": -4.126953125, "step": 3004 }, { "epoch": 0.5678114223628892, "grad_norm": 3.367535637124011, "learning_rate": 5.225618978523374e-07, "logits/chosen": 2.884765625, "logits/rejected": 2.705078125, "logps/chosen": -813.0, "logps/rejected": -685.0, "loss": 0.6266, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0625, "rewards/margins": 3.99609375, "rewards/rejected": -3.9375, "step": 3005 }, { "epoch": 0.5680003779110964, "grad_norm": 2.572623721002194, "learning_rate": 5.222656438576711e-07, "logits/chosen": 3.033203125, "logits/rejected": 2.357421875, "logps/chosen": -882.0, "logps/rejected": -800.0, "loss": 0.5724, "rewards/accuracies": 0.71875, "rewards/chosen": 0.099609375, "rewards/margins": 5.01953125, "rewards/rejected": -4.92578125, "step": 3006 }, { "epoch": 0.5681893334593037, "grad_norm": 2.618319984044492, "learning_rate": 5.219694019288197e-07, "logits/chosen": 2.2001953125, "logits/rejected": 1.7724609375, "logps/chosen": -657.5, "logps/rejected": -593.0, "loss": 0.6284, "rewards/accuracies": 0.84375, "rewards/chosen": -0.015380859375, "rewards/margins": 4.15234375, "rewards/rejected": -4.1640625, "step": 3007 }, { "epoch": 0.568378289007511, "grad_norm": 1.5542751848381704, "learning_rate": 5.216731721946629e-07, "logits/chosen": 2.703125, "logits/rejected": 2.61328125, "logps/chosen": -682.0, "logps/rejected": -853.0, "loss": 0.6401, "rewards/accuracies": 0.75, "rewards/chosen": 0.3826904296875, "rewards/margins": 3.58203125, "rewards/rejected": -3.1953125, "step": 3008 }, { "epoch": 0.5685672445557183, "grad_norm": 2.0994064104706833, "learning_rate": 5.213769547840753e-07, "logits/chosen": 3.31640625, "logits/rejected": 3.2578125, "logps/chosen": -780.0, "logps/rejected": -1391.0, "loss": 0.5919, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34326171875, "rewards/margins": 7.609375, "rewards/rejected": -7.25, "step": 3009 }, { "epoch": 0.5687562001039256, "grad_norm": 2.5350654743846355, "learning_rate": 5.210807498259263e-07, "logits/chosen": 2.533203125, "logits/rejected": 2.533203125, "logps/chosen": -774.5, "logps/rejected": -1122.0, "loss": 0.6626, "rewards/accuracies": 0.75, "rewards/chosen": -0.21875, "rewards/margins": 4.6328125, "rewards/rejected": -4.85546875, "step": 3010 }, { "epoch": 0.5689451556521329, "grad_norm": 2.0734962107074257, "learning_rate": 5.207845574490792e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.6640625, "logps/chosen": -1208.0, "logps/rejected": -1168.0, "loss": 0.5727, "rewards/accuracies": 0.75, "rewards/chosen": 1.6572265625, "rewards/margins": 5.44921875, "rewards/rejected": -3.791015625, "step": 3011 }, { "epoch": 0.5691341112003401, "grad_norm": 2.4137696632701773, "learning_rate": 5.204883777823931e-07, "logits/chosen": 2.1875, "logits/rejected": 1.517578125, "logps/chosen": -774.5, "logps/rejected": -945.5, "loss": 0.5994, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6494140625, "rewards/margins": 4.75, "rewards/rejected": -4.09375, "step": 3012 }, { "epoch": 0.5693230667485474, "grad_norm": 1.6291334310376637, "learning_rate": 5.2019221095472e-07, "logits/chosen": 2.796875, "logits/rejected": 2.154296875, "logps/chosen": -803.0, "logps/rejected": -1352.0, "loss": 0.4705, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0859375, "rewards/margins": 9.1015625, "rewards/rejected": -7.99609375, "step": 3013 }, { "epoch": 0.5695120222967547, "grad_norm": 1.5498618130621165, "learning_rate": 5.198960570949075e-07, "logits/chosen": 3.43359375, "logits/rejected": 3.22265625, "logps/chosen": -560.5, "logps/rejected": -660.5, "loss": 0.7871, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4013671875, "rewards/margins": 3.294921875, "rewards/rejected": -3.681640625, "step": 3014 }, { "epoch": 0.569700977844962, "grad_norm": 3.0739693357781275, "learning_rate": 5.19599916331797e-07, "logits/chosen": 3.3359375, "logits/rejected": 2.47265625, "logps/chosen": -844.0, "logps/rejected": -841.5, "loss": 0.4045, "rewards/accuracies": 0.875, "rewards/chosen": 2.251953125, "rewards/margins": 8.0, "rewards/rejected": -5.734375, "step": 3015 }, { "epoch": 0.5698899333931693, "grad_norm": 1.8815825099268757, "learning_rate": 5.193037887942244e-07, "logits/chosen": 2.6875, "logits/rejected": 2.658203125, "logps/chosen": -1056.0, "logps/rejected": -1424.0, "loss": 0.5192, "rewards/accuracies": 0.90625, "rewards/chosen": 1.251953125, "rewards/margins": 6.3671875, "rewards/rejected": -5.1171875, "step": 3016 }, { "epoch": 0.5700788889413766, "grad_norm": 2.450847882500097, "learning_rate": 5.190076746110202e-07, "logits/chosen": 3.203125, "logits/rejected": 2.90625, "logps/chosen": -744.0, "logps/rejected": -858.0, "loss": 0.6559, "rewards/accuracies": 0.71875, "rewards/chosen": 1.38671875, "rewards/margins": 3.568359375, "rewards/rejected": -2.18359375, "step": 3017 }, { "epoch": 0.5702678444895838, "grad_norm": 2.114783604623853, "learning_rate": 5.187115739110078e-07, "logits/chosen": 3.078125, "logits/rejected": 2.3525390625, "logps/chosen": -805.5, "logps/rejected": -650.5, "loss": 0.4951, "rewards/accuracies": 0.9375, "rewards/chosen": 0.87451171875, "rewards/margins": 4.37890625, "rewards/rejected": -3.5, "step": 3018 }, { "epoch": 0.5704568000377911, "grad_norm": 2.194357960325047, "learning_rate": 5.184154868230065e-07, "logits/chosen": 2.234375, "logits/rejected": 1.978515625, "logps/chosen": -568.0, "logps/rejected": -820.0, "loss": 0.5821, "rewards/accuracies": 0.875, "rewards/chosen": 0.82177734375, "rewards/margins": 4.828125, "rewards/rejected": -4.00390625, "step": 3019 }, { "epoch": 0.5706457555859984, "grad_norm": 2.577695911240919, "learning_rate": 5.181194134758284e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.515625, "logps/chosen": -713.0, "logps/rejected": -864.0, "loss": 0.6336, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7119140625, "rewards/margins": 4.23828125, "rewards/rejected": -3.51953125, "step": 3020 }, { "epoch": 0.5708347111342057, "grad_norm": 2.105742750349348, "learning_rate": 5.1782335399828e-07, "logits/chosen": 2.349609375, "logits/rejected": 2.2373046875, "logps/chosen": -711.5, "logps/rejected": -1792.0, "loss": 0.5592, "rewards/accuracies": 0.8125, "rewards/chosen": 0.828125, "rewards/margins": 10.1796875, "rewards/rejected": -9.3515625, "step": 3021 }, { "epoch": 0.571023666682413, "grad_norm": 2.0188283244907073, "learning_rate": 5.175273085191621e-07, "logits/chosen": 3.109375, "logits/rejected": 2.26171875, "logps/chosen": -734.0, "logps/rejected": -667.0, "loss": 0.6358, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7880859375, "rewards/margins": 3.6484375, "rewards/rejected": -2.857421875, "step": 3022 }, { "epoch": 0.5712126222306202, "grad_norm": 1.9312269835030071, "learning_rate": 5.172312771672687e-07, "logits/chosen": 3.47265625, "logits/rejected": 3.1015625, "logps/chosen": -631.5, "logps/rejected": -782.0, "loss": 0.6392, "rewards/accuracies": 0.8125, "rewards/chosen": 1.02239990234375, "rewards/margins": 4.16796875, "rewards/rejected": -3.14453125, "step": 3023 }, { "epoch": 0.5714015777788275, "grad_norm": 3.234584022581935, "learning_rate": 5.169352600713884e-07, "logits/chosen": 2.103515625, "logits/rejected": 2.013671875, "logps/chosen": -770.5, "logps/rejected": -764.0, "loss": 0.6956, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4364013671875, "rewards/margins": 4.49609375, "rewards/rejected": -4.0625, "step": 3024 }, { "epoch": 0.5715905333270348, "grad_norm": 3.019032246963104, "learning_rate": 5.166392573603031e-07, "logits/chosen": 3.1796875, "logits/rejected": 2.73828125, "logps/chosen": -603.0, "logps/rejected": -918.0, "loss": 0.6098, "rewards/accuracies": 0.8125, "rewards/chosen": 0.48095703125, "rewards/margins": 4.12109375, "rewards/rejected": -3.634765625, "step": 3025 }, { "epoch": 0.5717794888752421, "grad_norm": 2.8073590988742856, "learning_rate": 5.163432691627887e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.6533203125, "logps/chosen": -972.0, "logps/rejected": -1851.0, "loss": 0.5957, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3525390625, "rewards/margins": 13.14453125, "rewards/rejected": -11.7890625, "step": 3026 }, { "epoch": 0.5719684444234494, "grad_norm": 2.0417036727767526, "learning_rate": 5.160472956076145e-07, "logits/chosen": 2.19921875, "logits/rejected": 1.98876953125, "logps/chosen": -674.5, "logps/rejected": -1128.0, "loss": 0.6175, "rewards/accuracies": 0.75, "rewards/chosen": 1.07177734375, "rewards/margins": 6.45703125, "rewards/rejected": -5.3671875, "step": 3027 }, { "epoch": 0.5721573999716567, "grad_norm": 1.580349020334709, "learning_rate": 5.157513368235435e-07, "logits/chosen": 2.0791015625, "logits/rejected": 1.8359375, "logps/chosen": -595.5, "logps/rejected": -786.0, "loss": 0.6739, "rewards/accuracies": 0.71875, "rewards/chosen": 1.01953125, "rewards/margins": 3.111328125, "rewards/rejected": -2.09228515625, "step": 3028 }, { "epoch": 0.5723463555198639, "grad_norm": 2.057765709951002, "learning_rate": 5.15455392939333e-07, "logits/chosen": 2.353515625, "logits/rejected": 2.298828125, "logps/chosen": -989.0, "logps/rejected": -948.5, "loss": 0.6774, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9296875, "rewards/margins": 5.1640625, "rewards/rejected": -4.234375, "step": 3029 }, { "epoch": 0.5725353110680712, "grad_norm": 2.053982090278472, "learning_rate": 5.151594640837327e-07, "logits/chosen": 3.00390625, "logits/rejected": 2.8359375, "logps/chosen": -638.0, "logps/rejected": -619.0, "loss": 0.6675, "rewards/accuracies": 0.75, "rewards/chosen": 0.1904296875, "rewards/margins": 3.099609375, "rewards/rejected": -2.9228515625, "step": 3030 }, { "epoch": 0.5727242666162785, "grad_norm": 1.755542977937641, "learning_rate": 5.148635503854863e-07, "logits/chosen": 2.75, "logits/rejected": 2.44921875, "logps/chosen": -791.5, "logps/rejected": -760.0, "loss": 0.5598, "rewards/accuracies": 0.6875, "rewards/chosen": 1.791015625, "rewards/margins": 4.45703125, "rewards/rejected": -2.67578125, "step": 3031 }, { "epoch": 0.5729132221644858, "grad_norm": 1.9434547757444256, "learning_rate": 5.145676519733309e-07, "logits/chosen": 2.57763671875, "logits/rejected": 2.078125, "logps/chosen": -658.5, "logps/rejected": -18010.0, "loss": 0.6074, "rewards/accuracies": 0.71875, "rewards/chosen": 1.36376953125, "rewards/margins": 23.08984375, "rewards/rejected": -21.7607421875, "step": 3032 }, { "epoch": 0.5731021777126931, "grad_norm": 1.7057186743501196, "learning_rate": 5.142717689759974e-07, "logits/chosen": 2.44921875, "logits/rejected": 1.8984375, "logps/chosen": -713.0, "logps/rejected": -624.0, "loss": 0.6421, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8544921875, "rewards/margins": 3.7060546875, "rewards/rejected": -2.845703125, "step": 3033 }, { "epoch": 0.5732911332609004, "grad_norm": 2.1980160534068856, "learning_rate": 5.139759015222087e-07, "logits/chosen": 3.44140625, "logits/rejected": 2.962890625, "logps/chosen": -582.5, "logps/rejected": -700.0, "loss": 0.6316, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1533203125, "rewards/margins": 4.6484375, "rewards/rejected": -3.498046875, "step": 3034 }, { "epoch": 0.5734800888091076, "grad_norm": 2.9524547802181296, "learning_rate": 5.136800497406821e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.84375, "logps/chosen": -837.0, "logps/rejected": -1437.5, "loss": 0.585, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2998046875, "rewards/margins": 4.8515625, "rewards/rejected": -3.5615234375, "step": 3035 }, { "epoch": 0.5736690443573149, "grad_norm": 1.917222159539496, "learning_rate": 5.13384213760128e-07, "logits/chosen": 2.3642578125, "logits/rejected": 2.1513671875, "logps/chosen": -1122.0, "logps/rejected": -1114.0, "loss": 0.6029, "rewards/accuracies": 0.75, "rewards/chosen": 1.6171875, "rewards/margins": 5.72265625, "rewards/rejected": -4.09765625, "step": 3036 }, { "epoch": 0.5738579999055222, "grad_norm": 1.8579648093106431, "learning_rate": 5.130883937092494e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.7109375, "logps/chosen": -775.0, "logps/rejected": -790.0, "loss": 0.566, "rewards/accuracies": 0.90625, "rewards/chosen": 1.361328125, "rewards/margins": 5.3515625, "rewards/rejected": -3.99609375, "step": 3037 }, { "epoch": 0.5740469554537295, "grad_norm": 2.3417647383858635, "learning_rate": 5.127925897167429e-07, "logits/chosen": 2.11328125, "logits/rejected": 1.4853515625, "logps/chosen": -981.0, "logps/rejected": -2127.0, "loss": 0.4894, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0673828125, "rewards/margins": 14.9375, "rewards/rejected": -13.875, "step": 3038 }, { "epoch": 0.5742359110019368, "grad_norm": 1.7926426556467983, "learning_rate": 5.124968019112974e-07, "logits/chosen": 2.90625, "logits/rejected": 2.638671875, "logps/chosen": -705.0, "logps/rejected": -1700.0, "loss": 0.5749, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9580078125, "rewards/margins": 14.3828125, "rewards/rejected": -13.42578125, "step": 3039 }, { "epoch": 0.5744248665501441, "grad_norm": 1.9677291591611157, "learning_rate": 5.122010304215956e-07, "logits/chosen": 2.0625, "logits/rejected": 1.345703125, "logps/chosen": -605.0, "logps/rejected": -632.5, "loss": 0.5528, "rewards/accuracies": 0.75, "rewards/chosen": 0.888671875, "rewards/margins": 4.7578125, "rewards/rejected": -3.8671875, "step": 3040 }, { "epoch": 0.5746138220983513, "grad_norm": 1.8652365338180754, "learning_rate": 5.119052753763124e-07, "logits/chosen": 3.8984375, "logits/rejected": 3.423828125, "logps/chosen": -1415.0, "logps/rejected": -962.5, "loss": 0.6963, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9495849609375, "rewards/margins": 1.5, "rewards/rejected": -3.447265625, "step": 3041 }, { "epoch": 0.5748027776465586, "grad_norm": 1.7615366470171157, "learning_rate": 5.116095369041161e-07, "logits/chosen": 2.34375, "logits/rejected": 1.720703125, "logps/chosen": -755.0, "logps/rejected": -670.0, "loss": 0.5967, "rewards/accuracies": 0.875, "rewards/chosen": -0.05810546875, "rewards/margins": 5.0859375, "rewards/rejected": -5.13671875, "step": 3042 }, { "epoch": 0.5749917331947659, "grad_norm": 2.302463301954398, "learning_rate": 5.113138151336674e-07, "logits/chosen": 1.39111328125, "logits/rejected": 0.80810546875, "logps/chosen": -939.5, "logps/rejected": -952.0, "loss": 0.5244, "rewards/accuracies": 0.90625, "rewards/chosen": 1.087890625, "rewards/margins": 5.59375, "rewards/rejected": -4.515625, "step": 3043 }, { "epoch": 0.5751806887429732, "grad_norm": 3.0081599727714607, "learning_rate": 5.1101811019362e-07, "logits/chosen": 1.3134765625, "logits/rejected": 0.737060546875, "logps/chosen": -840.5, "logps/rejected": -858.5, "loss": 0.535, "rewards/accuracies": 0.90625, "rewards/chosen": 0.92840576171875, "rewards/margins": 5.40234375, "rewards/rejected": -4.46875, "step": 3044 }, { "epoch": 0.5753696442911805, "grad_norm": 2.1509904202233643, "learning_rate": 5.1072242221262e-07, "logits/chosen": 2.845703125, "logits/rejected": 3.154296875, "logps/chosen": -685.0, "logps/rejected": -1284.5, "loss": 0.6633, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1439208984375, "rewards/margins": 5.3359375, "rewards/rejected": -5.4921875, "step": 3045 }, { "epoch": 0.5755585998393877, "grad_norm": 2.478300890836866, "learning_rate": 5.104267513193064e-07, "logits/chosen": 3.1875, "logits/rejected": 2.96875, "logps/chosen": -701.0, "logps/rejected": -743.5, "loss": 0.5847, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3974609375, "rewards/margins": 3.82421875, "rewards/rejected": -2.431640625, "step": 3046 }, { "epoch": 0.575747555387595, "grad_norm": 2.3811222478267484, "learning_rate": 5.101310976423106e-07, "logits/chosen": 2.2138671875, "logits/rejected": 1.8828125, "logps/chosen": -638.5, "logps/rejected": -719.0, "loss": 0.6701, "rewards/accuracies": 0.8125, "rewards/chosen": 0.276123046875, "rewards/margins": 4.45703125, "rewards/rejected": -4.18359375, "step": 3047 }, { "epoch": 0.5759365109358023, "grad_norm": 1.9184877039098327, "learning_rate": 5.098354613102566e-07, "logits/chosen": 1.693359375, "logits/rejected": 1.294921875, "logps/chosen": -610.5, "logps/rejected": -690.5, "loss": 0.4479, "rewards/accuracies": 0.875, "rewards/chosen": 0.646484375, "rewards/margins": 5.59375, "rewards/rejected": -4.953125, "step": 3048 }, { "epoch": 0.5761254664840096, "grad_norm": 2.5208535631850135, "learning_rate": 5.095398424517606e-07, "logits/chosen": 2.29296875, "logits/rejected": 1.806640625, "logps/chosen": -1112.0, "logps/rejected": -815.0, "loss": 0.6351, "rewards/accuracies": 0.78125, "rewards/chosen": -0.126220703125, "rewards/margins": 3.8203125, "rewards/rejected": -3.9375, "step": 3049 }, { "epoch": 0.5763144220322169, "grad_norm": 1.7340054980046178, "learning_rate": 5.092442411954318e-07, "logits/chosen": 1.794921875, "logits/rejected": 1.953125, "logps/chosen": -581.25, "logps/rejected": -549.0, "loss": 0.6682, "rewards/accuracies": 0.75, "rewards/chosen": 0.94482421875, "rewards/margins": 3.2451171875, "rewards/rejected": -2.296875, "step": 3050 }, { "epoch": 0.5765033775804242, "grad_norm": 1.4794917043022302, "learning_rate": 5.089486576698708e-07, "logits/chosen": 1.7392578125, "logits/rejected": 1.224609375, "logps/chosen": -525.25, "logps/rejected": -809.75, "loss": 0.538, "rewards/accuracies": 0.875, "rewards/chosen": 0.71484375, "rewards/margins": 5.48828125, "rewards/rejected": -4.78515625, "step": 3051 }, { "epoch": 0.5766923331286314, "grad_norm": 2.1354180508638994, "learning_rate": 5.086530920036717e-07, "logits/chosen": 2.1181640625, "logits/rejected": 2.546875, "logps/chosen": -650.0, "logps/rejected": -702.0, "loss": 0.5946, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5107421875, "rewards/margins": 3.86328125, "rewards/rejected": -3.361328125, "step": 3052 }, { "epoch": 0.5768812886768387, "grad_norm": 1.7226794281457836, "learning_rate": 5.083575443254196e-07, "logits/chosen": 2.09375, "logits/rejected": 1.51513671875, "logps/chosen": -533.5, "logps/rejected": -1206.0, "loss": 0.6023, "rewards/accuracies": 0.78125, "rewards/chosen": 0.755126953125, "rewards/margins": 9.2265625, "rewards/rejected": -8.47265625, "step": 3053 }, { "epoch": 0.577070244225046, "grad_norm": 2.1995189496164382, "learning_rate": 5.080620147636925e-07, "logits/chosen": 3.0625, "logits/rejected": 2.189453125, "logps/chosen": -609.25, "logps/rejected": -709.0, "loss": 0.6205, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17333984375, "rewards/margins": 3.921875, "rewards/rejected": -3.748046875, "step": 3054 }, { "epoch": 0.5772591997732534, "grad_norm": 1.8703670505955186, "learning_rate": 5.077665034470607e-07, "logits/chosen": 2.21728515625, "logits/rejected": 2.1005859375, "logps/chosen": -914.0, "logps/rejected": -852.0, "loss": 0.5919, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9208984375, "rewards/margins": 3.98046875, "rewards/rejected": -3.05859375, "step": 3055 }, { "epoch": 0.5774481553214607, "grad_norm": 2.0701149758480337, "learning_rate": 5.074710105040855e-07, "logits/chosen": 1.314453125, "logits/rejected": 1.1103515625, "logps/chosen": -854.0, "logps/rejected": -791.0, "loss": 0.6707, "rewards/accuracies": 0.75, "rewards/chosen": 0.032958984375, "rewards/margins": 3.734375, "rewards/rejected": -3.697265625, "step": 3056 }, { "epoch": 0.577637110869668, "grad_norm": 2.3673919241925514, "learning_rate": 5.071755360633216e-07, "logits/chosen": 2.087890625, "logits/rejected": 1.716796875, "logps/chosen": -526.0, "logps/rejected": -800.5, "loss": 0.597, "rewards/accuracies": 0.78125, "rewards/chosen": 0.19677734375, "rewards/margins": 4.615234375, "rewards/rejected": -4.4140625, "step": 3057 }, { "epoch": 0.5778260664178752, "grad_norm": 2.8465820222210825, "learning_rate": 5.068800802533148e-07, "logits/chosen": 0.96337890625, "logits/rejected": 1.3193359375, "logps/chosen": -853.0, "logps/rejected": -957.0, "loss": 0.4855, "rewards/accuracies": 0.84375, "rewards/chosen": 0.53369140625, "rewards/margins": 5.80078125, "rewards/rejected": -5.26953125, "step": 3058 }, { "epoch": 0.5780150219660825, "grad_norm": 2.4955785830095976, "learning_rate": 5.065846432026028e-07, "logits/chosen": 1.833984375, "logits/rejected": 1.626953125, "logps/chosen": -932.0, "logps/rejected": -971.0, "loss": 0.495, "rewards/accuracies": 0.8125, "rewards/chosen": 0.88916015625, "rewards/margins": 7.49609375, "rewards/rejected": -6.59375, "step": 3059 }, { "epoch": 0.5782039775142898, "grad_norm": 2.05434910584396, "learning_rate": 5.062892250397153e-07, "logits/chosen": 1.65625, "logits/rejected": 1.515625, "logps/chosen": -475.5, "logps/rejected": -571.0, "loss": 0.6838, "rewards/accuracies": 0.84375, "rewards/chosen": 0.548828125, "rewards/margins": 2.41796875, "rewards/rejected": -1.8621826171875, "step": 3060 }, { "epoch": 0.5783929330624971, "grad_norm": 1.5341607136837185, "learning_rate": 5.059938258931741e-07, "logits/chosen": 2.48876953125, "logits/rejected": 2.005859375, "logps/chosen": -649.0, "logps/rejected": -961.0, "loss": 0.4861, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9140625, "rewards/margins": 6.96875, "rewards/rejected": -6.05859375, "step": 3061 }, { "epoch": 0.5785818886107044, "grad_norm": 2.6128646027666678, "learning_rate": 5.056984458914919e-07, "logits/chosen": 2.0517578125, "logits/rejected": 1.57421875, "logps/chosen": -758.0, "logps/rejected": -628.0, "loss": 0.5969, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01171875, "rewards/margins": 3.86328125, "rewards/rejected": -3.880859375, "step": 3062 }, { "epoch": 0.5787708441589117, "grad_norm": 1.5573352650339742, "learning_rate": 5.05403085163174e-07, "logits/chosen": 2.373291015625, "logits/rejected": 2.4443359375, "logps/chosen": -766.75, "logps/rejected": -748.0, "loss": 0.7201, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4228515625, "rewards/margins": 3.447265625, "rewards/rejected": -3.873046875, "step": 3063 }, { "epoch": 0.5789597997071189, "grad_norm": 1.5674449734235403, "learning_rate": 5.051077438367171e-07, "logits/chosen": 1.31640625, "logits/rejected": 1.568359375, "logps/chosen": -622.5, "logps/rejected": -658.5, "loss": 0.6443, "rewards/accuracies": 0.78125, "rewards/chosen": 0.88037109375, "rewards/margins": 3.59765625, "rewards/rejected": -2.71484375, "step": 3064 }, { "epoch": 0.5791487552553262, "grad_norm": 2.2699635363119457, "learning_rate": 5.048124220406087e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.09375, "logps/chosen": -1085.0, "logps/rejected": -999.0, "loss": 0.5278, "rewards/accuracies": 0.84375, "rewards/chosen": 1.412353515625, "rewards/margins": 5.28125, "rewards/rejected": -3.875, "step": 3065 }, { "epoch": 0.5793377108035335, "grad_norm": 2.442086000144016, "learning_rate": 5.045171199033288e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.642578125, "logps/chosen": -638.0, "logps/rejected": -772.0, "loss": 0.6394, "rewards/accuracies": 0.875, "rewards/chosen": 0.04736328125, "rewards/margins": 6.1328125, "rewards/rejected": -6.078125, "step": 3066 }, { "epoch": 0.5795266663517408, "grad_norm": 1.7642653694623043, "learning_rate": 5.04221837553348e-07, "logits/chosen": 2.650390625, "logits/rejected": 1.60400390625, "logps/chosen": -636.0, "logps/rejected": -579.0, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": 0.636962890625, "rewards/margins": 4.5, "rewards/rejected": -3.8515625, "step": 3067 }, { "epoch": 0.5797156218999481, "grad_norm": 1.286249647261625, "learning_rate": 5.039265751191293e-07, "logits/chosen": 3.328125, "logits/rejected": 2.4296875, "logps/chosen": -647.0, "logps/rejected": -541.0, "loss": 0.561, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03076171875, "rewards/margins": 5.3125, "rewards/rejected": -5.3515625, "step": 3068 }, { "epoch": 0.5799045774481553, "grad_norm": 2.191374257994646, "learning_rate": 5.036313327291259e-07, "logits/chosen": 2.52734375, "logits/rejected": 1.931640625, "logps/chosen": -846.0, "logps/rejected": -1023.5, "loss": 0.551, "rewards/accuracies": 0.875, "rewards/chosen": 0.95556640625, "rewards/margins": 5.8828125, "rewards/rejected": -4.92578125, "step": 3069 }, { "epoch": 0.5800935329963626, "grad_norm": 2.371630651264496, "learning_rate": 5.033361105117831e-07, "logits/chosen": 1.42236328125, "logits/rejected": 1.36767578125, "logps/chosen": -839.0, "logps/rejected": -1868.0, "loss": 0.6249, "rewards/accuracies": 0.75, "rewards/chosen": 0.19140625, "rewards/margins": 6.01953125, "rewards/rejected": -5.828125, "step": 3070 }, { "epoch": 0.5802824885445699, "grad_norm": 2.443697448377852, "learning_rate": 5.030409085955371e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.4375, "logps/chosen": -541.5, "logps/rejected": -1148.0, "loss": 0.7318, "rewards/accuracies": 0.75, "rewards/chosen": -0.3994140625, "rewards/margins": 10.6982421875, "rewards/rejected": -11.08203125, "step": 3071 }, { "epoch": 0.5804714440927772, "grad_norm": 3.471207745068322, "learning_rate": 5.027457271088152e-07, "logits/chosen": 2.328125, "logits/rejected": 1.791015625, "logps/chosen": -924.0, "logps/rejected": -1041.75, "loss": 0.5293, "rewards/accuracies": 0.875, "rewards/chosen": 0.962890625, "rewards/margins": 6.22265625, "rewards/rejected": -5.26171875, "step": 3072 }, { "epoch": 0.5806603996409845, "grad_norm": 1.4041081881983184, "learning_rate": 5.024505661800361e-07, "logits/chosen": 2.115234375, "logits/rejected": 1.96484375, "logps/chosen": -644.0, "logps/rejected": -617.0, "loss": 0.6233, "rewards/accuracies": 0.78125, "rewards/chosen": -0.435791015625, "rewards/margins": 4.51171875, "rewards/rejected": -4.947265625, "step": 3073 }, { "epoch": 0.5808493551891918, "grad_norm": 2.3928933678718094, "learning_rate": 5.021554259376092e-07, "logits/chosen": 2.50390625, "logits/rejected": 2.337890625, "logps/chosen": -943.0, "logps/rejected": -1005.0, "loss": 0.5505, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0517578125, "rewards/margins": 6.109375, "rewards/rejected": -6.1484375, "step": 3074 }, { "epoch": 0.581038310737399, "grad_norm": 1.5850394761333708, "learning_rate": 5.018603065099352e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.58203125, "logps/chosen": -657.5, "logps/rejected": -1397.0, "loss": 0.6541, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19970703125, "rewards/margins": 12.53125, "rewards/rejected": -12.6953125, "step": 3075 }, { "epoch": 0.5812272662856063, "grad_norm": 2.5947783161073845, "learning_rate": 5.015652080254057e-07, "logits/chosen": 1.87109375, "logits/rejected": 1.6318359375, "logps/chosen": -574.5, "logps/rejected": -735.0, "loss": 0.6168, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8753662109375, "rewards/margins": 4.87890625, "rewards/rejected": -5.74609375, "step": 3076 }, { "epoch": 0.5814162218338136, "grad_norm": 2.690869063300569, "learning_rate": 5.01270130612403e-07, "logits/chosen": 2.359375, "logits/rejected": 1.75927734375, "logps/chosen": -877.0, "logps/rejected": -695.5, "loss": 0.4545, "rewards/accuracies": 0.875, "rewards/chosen": 0.822265625, "rewards/margins": 6.046875, "rewards/rejected": -5.2265625, "step": 3077 }, { "epoch": 0.5816051773820209, "grad_norm": 1.7583156523304395, "learning_rate": 5.009750743993005e-07, "logits/chosen": 2.73828125, "logits/rejected": 2.296875, "logps/chosen": -735.0, "logps/rejected": -735.0, "loss": 0.5543, "rewards/accuracies": 0.75, "rewards/chosen": 1.3021240234375, "rewards/margins": 5.93359375, "rewards/rejected": -4.62890625, "step": 3078 }, { "epoch": 0.5817941329302282, "grad_norm": 1.4634893090960326, "learning_rate": 5.006800395144619e-07, "logits/chosen": 3.171875, "logits/rejected": 2.73828125, "logps/chosen": -721.0, "logps/rejected": -690.0, "loss": 0.6151, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09765625, "rewards/margins": 4.32421875, "rewards/rejected": -4.228515625, "step": 3079 }, { "epoch": 0.5819830884784355, "grad_norm": 1.3244585085658718, "learning_rate": 5.003850260862423e-07, "logits/chosen": 2.87890625, "logits/rejected": 2.4482421875, "logps/chosen": -717.5, "logps/rejected": -801.5, "loss": 0.5172, "rewards/accuracies": 0.84375, "rewards/chosen": 1.10791015625, "rewards/margins": 7.6484375, "rewards/rejected": -6.546875, "step": 3080 }, { "epoch": 0.5821720440266427, "grad_norm": 2.372753773514353, "learning_rate": 5.000900342429868e-07, "logits/chosen": 1.80859375, "logits/rejected": 1.19921875, "logps/chosen": -862.0, "logps/rejected": -1249.0, "loss": 0.4327, "rewards/accuracies": 0.90625, "rewards/chosen": 1.08203125, "rewards/margins": 6.203125, "rewards/rejected": -5.1171875, "step": 3081 }, { "epoch": 0.58236099957485, "grad_norm": 1.856984254708326, "learning_rate": 4.997950641130318e-07, "logits/chosen": 2.822265625, "logits/rejected": 3.54296875, "logps/chosen": -897.0, "logps/rejected": -1829.0, "loss": 0.582, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8046875, "rewards/margins": 8.421875, "rewards/rejected": -7.62109375, "step": 3082 }, { "epoch": 0.5825499551230573, "grad_norm": 2.2790153270417903, "learning_rate": 4.995001158247034e-07, "logits/chosen": 2.583984375, "logits/rejected": 2.193359375, "logps/chosen": -873.5, "logps/rejected": -944.0, "loss": 0.5584, "rewards/accuracies": 0.875, "rewards/chosen": 0.55078125, "rewards/margins": 6.0859375, "rewards/rejected": -5.54296875, "step": 3083 }, { "epoch": 0.5827389106712646, "grad_norm": 2.5291532778646837, "learning_rate": 4.99205189506319e-07, "logits/chosen": 2.1552734375, "logits/rejected": 1.857421875, "logps/chosen": -626.5, "logps/rejected": -762.0, "loss": 0.5887, "rewards/accuracies": 0.875, "rewards/chosen": -0.40966796875, "rewards/margins": 4.54296875, "rewards/rejected": -4.95703125, "step": 3084 }, { "epoch": 0.5829278662194719, "grad_norm": 2.1426427605580805, "learning_rate": 4.989102852861863e-07, "logits/chosen": 2.583984375, "logits/rejected": 2.89453125, "logps/chosen": -645.5, "logps/rejected": -974.5, "loss": 0.5937, "rewards/accuracies": 0.71875, "rewards/chosen": 0.94775390625, "rewards/margins": 6.61328125, "rewards/rejected": -5.6796875, "step": 3085 }, { "epoch": 0.5831168217676792, "grad_norm": 1.6691809446665893, "learning_rate": 4.986154032926026e-07, "logits/chosen": 2.654296875, "logits/rejected": 1.744140625, "logps/chosen": -539.5, "logps/rejected": -561.5, "loss": 0.4964, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3125, "rewards/margins": 5.640625, "rewards/rejected": -5.328125, "step": 3086 }, { "epoch": 0.5833057773158864, "grad_norm": 2.020575023442613, "learning_rate": 4.983205436538567e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.03125, "logps/chosen": -1135.0, "logps/rejected": -916.0, "loss": 0.4325, "rewards/accuracies": 0.875, "rewards/chosen": 1.71258544921875, "rewards/margins": 5.65234375, "rewards/rejected": -3.939453125, "step": 3087 }, { "epoch": 0.5834947328640937, "grad_norm": 2.8185910398620275, "learning_rate": 4.980257064982269e-07, "logits/chosen": 1.8515625, "logits/rejected": 1.61395263671875, "logps/chosen": -769.5, "logps/rejected": -1142.0, "loss": 0.5172, "rewards/accuracies": 0.90625, "rewards/chosen": 1.070068359375, "rewards/margins": 6.453125, "rewards/rejected": -5.390625, "step": 3088 }, { "epoch": 0.583683688412301, "grad_norm": 2.186314070410514, "learning_rate": 4.97730891953982e-07, "logits/chosen": 2.615234375, "logits/rejected": 2.986328125, "logps/chosen": -644.5, "logps/rejected": -1034.5, "loss": 0.6281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2431640625, "rewards/margins": 4.2578125, "rewards/rejected": -4.50390625, "step": 3089 }, { "epoch": 0.5838726439605083, "grad_norm": 1.575482293065651, "learning_rate": 4.974361001493805e-07, "logits/chosen": 2.640625, "logits/rejected": 2.046875, "logps/chosen": -1041.0, "logps/rejected": -1231.0, "loss": 0.4721, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7021484375, "rewards/margins": 6.5703125, "rewards/rejected": -5.859375, "step": 3090 }, { "epoch": 0.5840615995087156, "grad_norm": 1.629493234325292, "learning_rate": 4.971413312126718e-07, "logits/chosen": 1.896484375, "logits/rejected": 1.9912109375, "logps/chosen": -779.5, "logps/rejected": -1234.0, "loss": 0.4635, "rewards/accuracies": 0.90625, "rewards/chosen": 0.698486328125, "rewards/margins": 7.3203125, "rewards/rejected": -6.61328125, "step": 3091 }, { "epoch": 0.5842505550569228, "grad_norm": 3.226319256763813, "learning_rate": 4.968465852720951e-07, "logits/chosen": 2.2607421875, "logits/rejected": 1.8172607421875, "logps/chosen": -725.0, "logps/rejected": -681.0, "loss": 0.609, "rewards/accuracies": 0.9375, "rewards/chosen": -0.208984375, "rewards/margins": 4.70703125, "rewards/rejected": -4.9140625, "step": 3092 }, { "epoch": 0.5844395106051301, "grad_norm": 2.4723955468827765, "learning_rate": 4.96551862455879e-07, "logits/chosen": 2.466796875, "logits/rejected": 2.345703125, "logps/chosen": -927.0, "logps/rejected": -813.0, "loss": 0.51, "rewards/accuracies": 0.84375, "rewards/chosen": 0.78125, "rewards/margins": 4.87109375, "rewards/rejected": -4.08984375, "step": 3093 }, { "epoch": 0.5846284661533374, "grad_norm": 2.4117277672245194, "learning_rate": 4.962571628922427e-07, "logits/chosen": 2.912109375, "logits/rejected": 2.357421875, "logps/chosen": -609.25, "logps/rejected": -589.0, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": 0.142333984375, "rewards/margins": 3.3515625, "rewards/rejected": -3.20703125, "step": 3094 }, { "epoch": 0.5848174217015447, "grad_norm": 1.679982799047129, "learning_rate": 4.95962486709395e-07, "logits/chosen": 2.546875, "logits/rejected": 2.580078125, "logps/chosen": -861.0, "logps/rejected": -1869.0, "loss": 0.5753, "rewards/accuracies": 0.75, "rewards/chosen": 0.79248046875, "rewards/margins": 7.68359375, "rewards/rejected": -6.88671875, "step": 3095 }, { "epoch": 0.585006377249752, "grad_norm": 1.9315536380910783, "learning_rate": 4.956678340355348e-07, "logits/chosen": 2.890625, "logits/rejected": 2.7109375, "logps/chosen": -997.0, "logps/rejected": -1267.5, "loss": 0.6126, "rewards/accuracies": 0.78125, "rewards/chosen": 0.60546875, "rewards/margins": 4.700927734375, "rewards/rejected": -4.08984375, "step": 3096 }, { "epoch": 0.5851953327979593, "grad_norm": 3.265741428272893, "learning_rate": 4.953732049988502e-07, "logits/chosen": 2.67578125, "logits/rejected": 2.634765625, "logps/chosen": -757.0, "logps/rejected": -763.5, "loss": 0.592, "rewards/accuracies": 0.8125, "rewards/chosen": 0.974609375, "rewards/margins": 4.515625, "rewards/rejected": -3.5390625, "step": 3097 }, { "epoch": 0.5853842883461665, "grad_norm": 2.0119142993994594, "learning_rate": 4.950785997275196e-07, "logits/chosen": 2.2275390625, "logits/rejected": 2.2763671875, "logps/chosen": -920.0, "logps/rejected": -1731.0, "loss": 0.4385, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1123046875, "rewards/margins": 7.6328125, "rewards/rejected": -6.5234375, "step": 3098 }, { "epoch": 0.5855732438943738, "grad_norm": 1.3813529463056091, "learning_rate": 4.947840183497108e-07, "logits/chosen": 3.18359375, "logits/rejected": 3.75390625, "logps/chosen": -778.0, "logps/rejected": -697.5, "loss": 0.5429, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9285888671875, "rewards/margins": 5.78125, "rewards/rejected": -4.86328125, "step": 3099 }, { "epoch": 0.5857621994425811, "grad_norm": 2.8972213616006504, "learning_rate": 4.944894609935814e-07, "logits/chosen": 2.474609375, "logits/rejected": 2.3310546875, "logps/chosen": -1093.0, "logps/rejected": -1168.0, "loss": 0.5406, "rewards/accuracies": 0.78125, "rewards/chosen": 2.3125, "rewards/margins": 5.6484375, "rewards/rejected": -3.34765625, "step": 3100 }, { "epoch": 0.5859511549907884, "grad_norm": 2.268921433176074, "learning_rate": 4.941949277872784e-07, "logits/chosen": 2.271484375, "logits/rejected": 2.6328125, "logps/chosen": -861.5, "logps/rejected": -1686.0, "loss": 0.6263, "rewards/accuracies": 0.75, "rewards/chosen": 0.861328125, "rewards/margins": 8.25, "rewards/rejected": -7.361328125, "step": 3101 }, { "epoch": 0.5861401105389957, "grad_norm": 2.0041455873390586, "learning_rate": 4.939004188589379e-07, "logits/chosen": 3.296875, "logits/rejected": 2.28125, "logps/chosen": -698.0, "logps/rejected": -618.0, "loss": 0.5798, "rewards/accuracies": 0.8125, "rewards/chosen": 0.62939453125, "rewards/margins": 3.8828125, "rewards/rejected": -3.24609375, "step": 3102 }, { "epoch": 0.586329066087203, "grad_norm": 2.7624643505274493, "learning_rate": 4.936059343366862e-07, "logits/chosen": 3.4609375, "logits/rejected": 3.1328125, "logps/chosen": -663.5, "logps/rejected": -741.0, "loss": 0.5723, "rewards/accuracies": 0.8125, "rewards/chosen": 0.81689453125, "rewards/margins": 4.13671875, "rewards/rejected": -3.310546875, "step": 3103 }, { "epoch": 0.5865180216354102, "grad_norm": 1.673068963358612, "learning_rate": 4.933114743486384e-07, "logits/chosen": 2.38671875, "logits/rejected": 2.38671875, "logps/chosen": -599.0, "logps/rejected": -1401.5, "loss": 0.5532, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3099365234375, "rewards/margins": 7.015625, "rewards/rejected": -6.703125, "step": 3104 }, { "epoch": 0.5867069771836175, "grad_norm": 2.7222809971627093, "learning_rate": 4.930170390228994e-07, "logits/chosen": 2.71484375, "logits/rejected": 2.666015625, "logps/chosen": -751.0, "logps/rejected": -1478.0, "loss": 0.5329, "rewards/accuracies": 0.90625, "rewards/chosen": 0.87158203125, "rewards/margins": 5.734375, "rewards/rejected": -4.85546875, "step": 3105 }, { "epoch": 0.5868959327318248, "grad_norm": 2.1349832870877052, "learning_rate": 4.927226284875628e-07, "logits/chosen": 2.71484375, "logits/rejected": 2.16015625, "logps/chosen": -698.0, "logps/rejected": -690.0, "loss": 0.6197, "rewards/accuracies": 0.78125, "rewards/chosen": 0.83154296875, "rewards/margins": 3.70703125, "rewards/rejected": -2.87890625, "step": 3106 }, { "epoch": 0.5870848882800321, "grad_norm": 3.3003814053826455, "learning_rate": 4.924282428707121e-07, "logits/chosen": 3.380859375, "logits/rejected": 3.099609375, "logps/chosen": -674.5, "logps/rejected": -752.0, "loss": 0.5152, "rewards/accuracies": 0.875, "rewards/chosen": 1.1455078125, "rewards/margins": 4.884765625, "rewards/rejected": -3.7421875, "step": 3107 }, { "epoch": 0.5872738438282394, "grad_norm": 3.1100605807840256, "learning_rate": 4.921338823004193e-07, "logits/chosen": 2.294921875, "logits/rejected": 1.91796875, "logps/chosen": -785.5, "logps/rejected": -926.0, "loss": 0.5813, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1005859375, "rewards/margins": 5.03125, "rewards/rejected": -3.927734375, "step": 3108 }, { "epoch": 0.5874627993764467, "grad_norm": 2.1603456047072536, "learning_rate": 4.918395469047458e-07, "logits/chosen": 2.955078125, "logits/rejected": 2.5625, "logps/chosen": -792.0, "logps/rejected": -619.0, "loss": 0.5489, "rewards/accuracies": 0.8125, "rewards/chosen": 0.73876953125, "rewards/margins": 4.4921875, "rewards/rejected": -3.75390625, "step": 3109 }, { "epoch": 0.5876517549246539, "grad_norm": 2.201199280059923, "learning_rate": 4.915452368117422e-07, "logits/chosen": 2.669921875, "logits/rejected": 2.830078125, "logps/chosen": -843.0, "logps/rejected": -982.0, "loss": 0.5812, "rewards/accuracies": 0.75, "rewards/chosen": 0.957275390625, "rewards/margins": 4.8671875, "rewards/rejected": -3.90625, "step": 3110 }, { "epoch": 0.5878407104728612, "grad_norm": 2.6831208259069586, "learning_rate": 4.912509521494477e-07, "logits/chosen": 3.748046875, "logits/rejected": 3.275390625, "logps/chosen": -705.5, "logps/rejected": -595.5, "loss": 0.6199, "rewards/accuracies": 0.875, "rewards/chosen": 1.17431640625, "rewards/margins": 3.92578125, "rewards/rejected": -2.75390625, "step": 3111 }, { "epoch": 0.5880296660210685, "grad_norm": 1.951199544118502, "learning_rate": 4.909566930458907e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.607421875, "logps/chosen": -598.0, "logps/rejected": -786.5, "loss": 0.6317, "rewards/accuracies": 0.8125, "rewards/chosen": 0.740478515625, "rewards/margins": 4.3125, "rewards/rejected": -3.58203125, "step": 3112 }, { "epoch": 0.5882186215692758, "grad_norm": 1.693826997908574, "learning_rate": 4.906624596290887e-07, "logits/chosen": 2.173828125, "logits/rejected": 1.634765625, "logps/chosen": -930.0, "logps/rejected": -892.0, "loss": 0.5028, "rewards/accuracies": 0.90625, "rewards/chosen": 1.18359375, "rewards/margins": 5.02734375, "rewards/rejected": -3.8359375, "step": 3113 }, { "epoch": 0.5884075771174831, "grad_norm": 1.7282993356624319, "learning_rate": 4.903682520270476e-07, "logits/chosen": 3.275390625, "logits/rejected": 2.43359375, "logps/chosen": -768.5, "logps/rejected": -555.5, "loss": 0.8239, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8463134765625, "rewards/margins": 2.041015625, "rewards/rejected": -2.888671875, "step": 3114 }, { "epoch": 0.5885965326656903, "grad_norm": 1.299484194160932, "learning_rate": 4.900740703677622e-07, "logits/chosen": 2.4892578125, "logits/rejected": 1.8359375, "logps/chosen": -1003.0, "logps/rejected": -820.0, "loss": 0.4947, "rewards/accuracies": 0.875, "rewards/chosen": 1.08203125, "rewards/margins": 5.90234375, "rewards/rejected": -4.82421875, "step": 3115 }, { "epoch": 0.5887854882138976, "grad_norm": 1.5438335657697446, "learning_rate": 4.89779914779216e-07, "logits/chosen": 3.20703125, "logits/rejected": 2.99609375, "logps/chosen": -833.5, "logps/rejected": -707.5, "loss": 0.5182, "rewards/accuracies": 0.84375, "rewards/chosen": 0.223876953125, "rewards/margins": 5.08984375, "rewards/rejected": -4.87109375, "step": 3116 }, { "epoch": 0.588974443762105, "grad_norm": 1.7286442528653838, "learning_rate": 4.894857853893815e-07, "logits/chosen": 2.921875, "logits/rejected": 2.318359375, "logps/chosen": -889.0, "logps/rejected": -798.0, "loss": 0.5933, "rewards/accuracies": 0.8125, "rewards/chosen": 0.855133056640625, "rewards/margins": 4.5078125, "rewards/rejected": -3.66796875, "step": 3117 }, { "epoch": 0.5891633993103123, "grad_norm": 2.511583670060111, "learning_rate": 4.89191682326219e-07, "logits/chosen": 3.6015625, "logits/rejected": 2.861328125, "logps/chosen": -771.5, "logps/rejected": -1020.0, "loss": 0.5096, "rewards/accuracies": 0.84375, "rewards/chosen": 1.34765625, "rewards/margins": 5.2890625, "rewards/rejected": -3.9296875, "step": 3118 }, { "epoch": 0.5893523548585196, "grad_norm": 2.4497836024969186, "learning_rate": 4.888976057176782e-07, "logits/chosen": 3.609375, "logits/rejected": 3.73828125, "logps/chosen": -734.25, "logps/rejected": -1842.0, "loss": 0.6441, "rewards/accuracies": 0.8125, "rewards/chosen": 0.481689453125, "rewards/margins": 7.5859375, "rewards/rejected": -7.109375, "step": 3119 }, { "epoch": 0.5895413104067269, "grad_norm": 3.15600426540011, "learning_rate": 4.886035556916972e-07, "logits/chosen": 2.2421875, "logits/rejected": 1.9443359375, "logps/chosen": -1100.0, "logps/rejected": -978.0, "loss": 0.6005, "rewards/accuracies": 0.78125, "rewards/chosen": 0.79547119140625, "rewards/margins": 4.771484375, "rewards/rejected": -3.96875, "step": 3120 }, { "epoch": 0.589730265954934, "grad_norm": 5.256631016341012, "learning_rate": 4.883095323762017e-07, "logits/chosen": 2.056640625, "logits/rejected": 1.5546875, "logps/chosen": -593.5, "logps/rejected": -669.0, "loss": 0.4947, "rewards/accuracies": 0.875, "rewards/chosen": 0.3251953125, "rewards/margins": 5.8125, "rewards/rejected": -5.4921875, "step": 3121 }, { "epoch": 0.5899192215031414, "grad_norm": 6.357365084352099, "learning_rate": 4.88015535899107e-07, "logits/chosen": 1.955078125, "logits/rejected": 1.568115234375, "logps/chosen": -489.0, "logps/rejected": -607.5, "loss": 0.7333, "rewards/accuracies": 0.8125, "rewards/chosen": -0.64453125, "rewards/margins": 3.130859375, "rewards/rejected": -3.77734375, "step": 3122 }, { "epoch": 0.5901081770513487, "grad_norm": 2.8223232097117057, "learning_rate": 4.877215663883155e-07, "logits/chosen": 2.216796875, "logits/rejected": 2.416015625, "logps/chosen": -627.5, "logps/rejected": -1182.5, "loss": 0.8022, "rewards/accuracies": 0.59375, "rewards/chosen": -0.870849609375, "rewards/margins": 3.19970703125, "rewards/rejected": -4.078125, "step": 3123 }, { "epoch": 0.590297132599556, "grad_norm": 2.15032774930259, "learning_rate": 4.874276239717188e-07, "logits/chosen": 2.39453125, "logits/rejected": 2.287109375, "logps/chosen": -845.5, "logps/rejected": -1953.0, "loss": 0.6185, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7568359375, "rewards/margins": 5.55078125, "rewards/rejected": -6.3046875, "step": 3124 }, { "epoch": 0.5904860881477633, "grad_norm": 2.031217633867272, "learning_rate": 4.871337087771963e-07, "logits/chosen": 1.42578125, "logits/rejected": 1.35009765625, "logps/chosen": -693.0, "logps/rejected": -885.5, "loss": 0.6639, "rewards/accuracies": 0.75, "rewards/chosen": -0.11767578125, "rewards/margins": 4.20703125, "rewards/rejected": -4.328125, "step": 3125 }, { "epoch": 0.5906750436959706, "grad_norm": 4.243522533171954, "learning_rate": 4.868398209326154e-07, "logits/chosen": 2.353515625, "logits/rejected": 2.228515625, "logps/chosen": -775.0, "logps/rejected": -1605.0, "loss": 0.4961, "rewards/accuracies": 0.9375, "rewards/chosen": 0.96484375, "rewards/margins": 6.6875, "rewards/rejected": -5.71875, "step": 3126 }, { "epoch": 0.5908639992441778, "grad_norm": 2.4835043943694597, "learning_rate": 4.865459605658324e-07, "logits/chosen": 2.111328125, "logits/rejected": 1.751953125, "logps/chosen": -859.5, "logps/rejected": -1246.5, "loss": 0.5861, "rewards/accuracies": 0.84375, "rewards/chosen": 0.05126953125, "rewards/margins": 4.96484375, "rewards/rejected": -4.91796875, "step": 3127 }, { "epoch": 0.5910529547923851, "grad_norm": 1.747084378087773, "learning_rate": 4.862521278046905e-07, "logits/chosen": 2.767578125, "logits/rejected": 2.458984375, "logps/chosen": -568.0, "logps/rejected": -746.0, "loss": 0.6763, "rewards/accuracies": 0.75, "rewards/chosen": 0.102294921875, "rewards/margins": 3.55078125, "rewards/rejected": -3.451171875, "step": 3128 }, { "epoch": 0.5912419103405924, "grad_norm": 1.7245319247662325, "learning_rate": 4.859583227770217e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.576171875, "logps/chosen": -830.0, "logps/rejected": -864.0, "loss": 0.53, "rewards/accuracies": 0.875, "rewards/chosen": 0.963134765625, "rewards/margins": 5.69140625, "rewards/rejected": -4.7265625, "step": 3129 }, { "epoch": 0.5914308658887997, "grad_norm": 2.1573334088359033, "learning_rate": 4.856645456106456e-07, "logits/chosen": 1.32763671875, "logits/rejected": 1.5234375, "logps/chosen": -919.5, "logps/rejected": -878.0, "loss": 0.5602, "rewards/accuracies": 0.875, "rewards/chosen": 1.25616455078125, "rewards/margins": 5.00390625, "rewards/rejected": -3.744140625, "step": 3130 }, { "epoch": 0.591619821437007, "grad_norm": 2.89570599808429, "learning_rate": 4.8537079643337e-07, "logits/chosen": 3.4921875, "logits/rejected": 3.57421875, "logps/chosen": -812.5, "logps/rejected": -781.0, "loss": 0.6048, "rewards/accuracies": 0.875, "rewards/chosen": 1.548828125, "rewards/margins": 4.513671875, "rewards/rejected": -2.9677734375, "step": 3131 }, { "epoch": 0.5918087769852143, "grad_norm": 2.9111735388318123, "learning_rate": 4.850770753729901e-07, "logits/chosen": 3.484375, "logits/rejected": 3.28515625, "logps/chosen": -646.75, "logps/rejected": -854.0, "loss": 0.5987, "rewards/accuracies": 0.78125, "rewards/chosen": 1.025390625, "rewards/margins": 4.890625, "rewards/rejected": -3.8671875, "step": 3132 }, { "epoch": 0.5919977325334215, "grad_norm": 2.6331177908387766, "learning_rate": 4.847833825572891e-07, "logits/chosen": 3.4375, "logits/rejected": 3.544921875, "logps/chosen": -692.0, "logps/rejected": -1221.0, "loss": 0.4969, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0830078125, "rewards/margins": 5.7578125, "rewards/rejected": -4.671875, "step": 3133 }, { "epoch": 0.5921866880816288, "grad_norm": 3.3085698355471824, "learning_rate": 4.844897181140379e-07, "logits/chosen": 3.30859375, "logits/rejected": 3.04296875, "logps/chosen": -783.5, "logps/rejected": -1024.0, "loss": 0.5323, "rewards/accuracies": 0.8125, "rewards/chosen": 0.530029296875, "rewards/margins": 5.359375, "rewards/rejected": -4.83203125, "step": 3134 }, { "epoch": 0.5923756436298361, "grad_norm": 2.244361424617773, "learning_rate": 4.84196082170995e-07, "logits/chosen": 2.775390625, "logits/rejected": 2.1943359375, "logps/chosen": -711.0, "logps/rejected": -584.0, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": 0.539306640625, "rewards/margins": 3.927734375, "rewards/rejected": -3.388671875, "step": 3135 }, { "epoch": 0.5925645991780434, "grad_norm": 1.7142747672013683, "learning_rate": 4.839024748559069e-07, "logits/chosen": 3.98046875, "logits/rejected": 3.62890625, "logps/chosen": -518.5, "logps/rejected": -808.0, "loss": 0.605, "rewards/accuracies": 0.78125, "rewards/chosen": 0.653076171875, "rewards/margins": 4.45703125, "rewards/rejected": -3.80859375, "step": 3136 }, { "epoch": 0.5927535547262507, "grad_norm": 2.4342749356861013, "learning_rate": 4.836088962965066e-07, "logits/chosen": 2.205078125, "logits/rejected": 1.466796875, "logps/chosen": -857.0, "logps/rejected": -695.0, "loss": 0.4699, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3740234375, "rewards/margins": 5.17578125, "rewards/rejected": -3.79296875, "step": 3137 }, { "epoch": 0.5929425102744579, "grad_norm": 1.4509804415802023, "learning_rate": 4.833153466205157e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.2802734375, "logps/chosen": -624.0, "logps/rejected": -1113.0, "loss": 0.6437, "rewards/accuracies": 0.75, "rewards/chosen": 0.578125, "rewards/margins": 5.310546875, "rewards/rejected": -4.734375, "step": 3138 }, { "epoch": 0.5931314658226652, "grad_norm": 2.7604287490424895, "learning_rate": 4.830218259556427e-07, "logits/chosen": 3.20703125, "logits/rejected": 2.98046875, "logps/chosen": -477.5, "logps/rejected": -1008.0, "loss": 0.5644, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9365234375, "rewards/margins": 6.125, "rewards/rejected": -5.18359375, "step": 3139 }, { "epoch": 0.5933204213708725, "grad_norm": 2.2819542955997782, "learning_rate": 4.827283344295837e-07, "logits/chosen": 2.7880859375, "logits/rejected": 2.86328125, "logps/chosen": -542.5, "logps/rejected": -1203.0, "loss": 0.5794, "rewards/accuracies": 0.78125, "rewards/chosen": 1.30908203125, "rewards/margins": 7.1953125, "rewards/rejected": -5.8828125, "step": 3140 }, { "epoch": 0.5935093769190798, "grad_norm": 2.438157131250929, "learning_rate": 4.824348721700221e-07, "logits/chosen": 1.97265625, "logits/rejected": 1.5537109375, "logps/chosen": -694.0, "logps/rejected": -903.5, "loss": 0.5419, "rewards/accuracies": 0.8125, "rewards/chosen": 1.23828125, "rewards/margins": 5.203125, "rewards/rejected": -3.96875, "step": 3141 }, { "epoch": 0.5936983324672871, "grad_norm": 2.456739403863369, "learning_rate": 4.821414393046281e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.90234375, "logps/chosen": -887.25, "logps/rejected": -920.0, "loss": 0.6316, "rewards/accuracies": 0.78125, "rewards/chosen": 0.392578125, "rewards/margins": 4.005859375, "rewards/rejected": -3.61279296875, "step": 3142 }, { "epoch": 0.5938872880154944, "grad_norm": 1.3436324231190329, "learning_rate": 4.8184803596106e-07, "logits/chosen": 2.291015625, "logits/rejected": 1.57275390625, "logps/chosen": -804.0, "logps/rejected": -839.0, "loss": 0.5941, "rewards/accuracies": 0.75, "rewards/chosen": 0.843017578125, "rewards/margins": 4.24609375, "rewards/rejected": -3.400390625, "step": 3143 }, { "epoch": 0.5940762435637016, "grad_norm": 2.2505916524390903, "learning_rate": 4.815546622669621e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.375, "logps/chosen": -412.0, "logps/rejected": -619.5, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": 1.0224609375, "rewards/margins": 3.55859375, "rewards/rejected": -2.53515625, "step": 3144 }, { "epoch": 0.5942651991119089, "grad_norm": 1.9547692017185943, "learning_rate": 4.812613183499668e-07, "logits/chosen": 1.898193359375, "logits/rejected": 1.73858642578125, "logps/chosen": -879.0, "logps/rejected": -17080.0, "loss": 0.6221, "rewards/accuracies": 0.78125, "rewards/chosen": 0.926025390625, "rewards/margins": -121.0859375, "rewards/rejected": 121.685546875, "step": 3145 }, { "epoch": 0.5944541546601162, "grad_norm": 1.5935299583099733, "learning_rate": 4.809680043376935e-07, "logits/chosen": 1.8779296875, "logits/rejected": 2.37109375, "logps/chosen": -884.5, "logps/rejected": -949.0, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": 1.5146484375, "rewards/margins": 3.6875, "rewards/rejected": -2.1806640625, "step": 3146 }, { "epoch": 0.5946431102083235, "grad_norm": 1.9421896426211605, "learning_rate": 4.806747203577479e-07, "logits/chosen": 3.41015625, "logits/rejected": 2.876953125, "logps/chosen": -634.0, "logps/rejected": -773.0, "loss": 0.5747, "rewards/accuracies": 0.8125, "rewards/chosen": 1.244140625, "rewards/margins": 4.46875, "rewards/rejected": -3.21875, "step": 3147 }, { "epoch": 0.5948320657565308, "grad_norm": 1.7557944955837734, "learning_rate": 4.803814665377231e-07, "logits/chosen": 3.078125, "logits/rejected": 3.35546875, "logps/chosen": -768.0, "logps/rejected": -1313.0, "loss": 0.6678, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1455078125, "rewards/margins": 7.13671875, "rewards/rejected": -5.994140625, "step": 3148 }, { "epoch": 0.5950210213047381, "grad_norm": 1.7594970793990052, "learning_rate": 4.800882430051988e-07, "logits/chosen": 1.55078125, "logits/rejected": 1.0791015625, "logps/chosen": -910.0, "logps/rejected": -796.0, "loss": 0.5333, "rewards/accuracies": 0.90625, "rewards/chosen": 0.826171875, "rewards/margins": 4.69140625, "rewards/rejected": -3.85546875, "step": 3149 }, { "epoch": 0.5952099768529453, "grad_norm": 1.907861037743811, "learning_rate": 4.797950498877421e-07, "logits/chosen": 1.998046875, "logits/rejected": 2.65234375, "logps/chosen": -704.5, "logps/rejected": -920.0, "loss": 0.6751, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9716796875, "rewards/margins": 4.59375, "rewards/rejected": -3.630859375, "step": 3150 }, { "epoch": 0.5953989324011526, "grad_norm": 2.191786446034914, "learning_rate": 4.795018873129062e-07, "logits/chosen": 2.78515625, "logits/rejected": 3.12109375, "logps/chosen": -642.0, "logps/rejected": -876.0, "loss": 0.5411, "rewards/accuracies": 0.90625, "rewards/chosen": 1.3994140625, "rewards/margins": 4.1875, "rewards/rejected": -2.79296875, "step": 3151 }, { "epoch": 0.5955878879493599, "grad_norm": 2.0580680981727255, "learning_rate": 4.792087554082313e-07, "logits/chosen": 2.61328125, "logits/rejected": 2.609375, "logps/chosen": -600.5, "logps/rejected": -1297.0, "loss": 0.5581, "rewards/accuracies": 0.875, "rewards/chosen": 1.07861328125, "rewards/margins": 5.0546875, "rewards/rejected": -3.9765625, "step": 3152 }, { "epoch": 0.5957768434975672, "grad_norm": 1.6602240217921493, "learning_rate": 4.789156543012447e-07, "logits/chosen": 2.796875, "logits/rejected": 3.26171875, "logps/chosen": -643.0, "logps/rejected": -2191.0, "loss": 0.6395, "rewards/accuracies": 0.875, "rewards/chosen": 0.681640625, "rewards/margins": 7.93359375, "rewards/rejected": -7.2578125, "step": 3153 }, { "epoch": 0.5959657990457745, "grad_norm": 1.9700656009125816, "learning_rate": 4.786225841194593e-07, "logits/chosen": 2.3994140625, "logits/rejected": 2.28125, "logps/chosen": -570.5, "logps/rejected": -1172.0, "loss": 0.6348, "rewards/accuracies": 0.78125, "rewards/chosen": 1.193359375, "rewards/margins": 4.390625, "rewards/rejected": -3.203125, "step": 3154 }, { "epoch": 0.5961547545939818, "grad_norm": 1.7647828302747266, "learning_rate": 4.783295449903753e-07, "logits/chosen": 2.67578125, "logits/rejected": 2.37109375, "logps/chosen": -767.0, "logps/rejected": -700.0, "loss": 0.6445, "rewards/accuracies": 0.75, "rewards/chosen": 0.47998046875, "rewards/margins": 3.53125, "rewards/rejected": -3.056640625, "step": 3155 }, { "epoch": 0.596343710142189, "grad_norm": 2.9004936826617005, "learning_rate": 4.780365370414793e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.259765625, "logps/chosen": -513.5, "logps/rejected": -671.0, "loss": 0.5474, "rewards/accuracies": 0.8125, "rewards/chosen": 0.92626953125, "rewards/margins": 4.96875, "rewards/rejected": -4.04296875, "step": 3156 }, { "epoch": 0.5965326656903963, "grad_norm": 3.1439124110799734, "learning_rate": 4.777435604002442e-07, "logits/chosen": 2.7880859375, "logits/rejected": 2.4326171875, "logps/chosen": -781.5, "logps/rejected": -885.0, "loss": 0.7451, "rewards/accuracies": 0.625, "rewards/chosen": 0.951171875, "rewards/margins": 2.958984375, "rewards/rejected": -2.00244140625, "step": 3157 }, { "epoch": 0.5967216212386036, "grad_norm": 2.1576691760854905, "learning_rate": 4.774506151941292e-07, "logits/chosen": 1.837890625, "logits/rejected": 1.783203125, "logps/chosen": -915.0, "logps/rejected": -769.0, "loss": 0.6369, "rewards/accuracies": 0.71875, "rewards/chosen": 1.22314453125, "rewards/margins": 4.2578125, "rewards/rejected": -3.02734375, "step": 3158 }, { "epoch": 0.5969105767868109, "grad_norm": 1.9272190837481236, "learning_rate": 4.771577015505799e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.623046875, "logps/chosen": -746.0, "logps/rejected": -1017.0, "loss": 0.6532, "rewards/accuracies": 0.75, "rewards/chosen": 0.7980728149414062, "rewards/margins": 4.08984375, "rewards/rejected": -3.28515625, "step": 3159 }, { "epoch": 0.5970995323350182, "grad_norm": 2.784920004233921, "learning_rate": 4.768648195970283e-07, "logits/chosen": 1.9921875, "logits/rejected": 1.7744140625, "logps/chosen": -932.0, "logps/rejected": -1007.5, "loss": 0.5501, "rewards/accuracies": 0.875, "rewards/chosen": 1.453125, "rewards/margins": 5.19921875, "rewards/rejected": -3.73828125, "step": 3160 }, { "epoch": 0.5972884878832254, "grad_norm": 2.4216624429500473, "learning_rate": 4.765719694608925e-07, "logits/chosen": 2.5859375, "logits/rejected": 2.2568359375, "logps/chosen": -540.0, "logps/rejected": -531.5, "loss": 0.6917, "rewards/accuracies": 0.78125, "rewards/chosen": 0.94677734375, "rewards/margins": 2.275390625, "rewards/rejected": -1.3291015625, "step": 3161 }, { "epoch": 0.5974774434314327, "grad_norm": 1.7369929477360189, "learning_rate": 4.76279151269577e-07, "logits/chosen": 1.65234375, "logits/rejected": 1.89453125, "logps/chosen": -1134.5, "logps/rejected": -2082.0, "loss": 0.4909, "rewards/accuracies": 0.875, "rewards/chosen": 1.322998046875, "rewards/margins": 9.4375, "rewards/rejected": -8.109375, "step": 3162 }, { "epoch": 0.59766639897964, "grad_norm": 1.8834648424364293, "learning_rate": 4.7598636515047176e-07, "logits/chosen": 2.23828125, "logits/rejected": 1.76953125, "logps/chosen": -857.0, "logps/rejected": -935.5, "loss": 0.544, "rewards/accuracies": 0.84375, "rewards/chosen": 1.107421875, "rewards/margins": 5.6875, "rewards/rejected": -4.5859375, "step": 3163 }, { "epoch": 0.5978553545278473, "grad_norm": 4.851137887992354, "learning_rate": 4.7569361123095355e-07, "logits/chosen": 1.48681640625, "logits/rejected": 0.95361328125, "logps/chosen": -736.0, "logps/rejected": -748.5, "loss": 0.5654, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0875244140625, "rewards/margins": 5.6875, "rewards/rejected": -5.578125, "step": 3164 }, { "epoch": 0.5980443100760546, "grad_norm": 1.9060768143258262, "learning_rate": 4.754008896383844e-07, "logits/chosen": 3.24609375, "logits/rejected": 2.646484375, "logps/chosen": -692.0, "logps/rejected": -722.0, "loss": 0.589, "rewards/accuracies": 0.8125, "rewards/chosen": 0.78466796875, "rewards/margins": 4.38671875, "rewards/rejected": -3.6015625, "step": 3165 }, { "epoch": 0.5982332656242619, "grad_norm": 2.1577519768326825, "learning_rate": 4.7510820050011303e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.037109375, "logps/chosen": -917.5, "logps/rejected": -959.5, "loss": 0.5719, "rewards/accuracies": 0.84375, "rewards/chosen": 0.50537109375, "rewards/margins": 4.11328125, "rewards/rejected": -3.6015625, "step": 3166 }, { "epoch": 0.5984222211724691, "grad_norm": 2.4436847165413305, "learning_rate": 4.748155439434737e-07, "logits/chosen": 2.78125, "logits/rejected": 2.51953125, "logps/chosen": -681.0, "logps/rejected": -550.5, "loss": 0.535, "rewards/accuracies": 0.875, "rewards/chosen": 0.28369140625, "rewards/margins": 5.4765625, "rewards/rejected": -5.1875, "step": 3167 }, { "epoch": 0.5986111767206764, "grad_norm": 2.2272387689524455, "learning_rate": 4.745229200957861e-07, "logits/chosen": 2.5625, "logits/rejected": 2.490234375, "logps/chosen": -941.5, "logps/rejected": -1848.0, "loss": 0.3683, "rewards/accuracies": 0.96875, "rewards/chosen": 1.41650390625, "rewards/margins": 7.2109375, "rewards/rejected": -5.79296875, "step": 3168 }, { "epoch": 0.5988001322688837, "grad_norm": 2.5016521292740435, "learning_rate": 4.7423032908435634e-07, "logits/chosen": 2.892578125, "logits/rejected": 2.662109375, "logps/chosen": -566.0, "logps/rejected": -611.5, "loss": 0.6663, "rewards/accuracies": 0.75, "rewards/chosen": -0.2716064453125, "rewards/margins": 3.6416015625, "rewards/rejected": -3.91015625, "step": 3169 }, { "epoch": 0.598989087817091, "grad_norm": 2.1634384049734283, "learning_rate": 4.739377710364758e-07, "logits/chosen": 2.26904296875, "logits/rejected": 1.6141357421875, "logps/chosen": -779.0, "logps/rejected": -688.0, "loss": 0.4946, "rewards/accuracies": 0.84375, "rewards/chosen": -0.173614501953125, "rewards/margins": 4.32421875, "rewards/rejected": -4.5, "step": 3170 }, { "epoch": 0.5991780433652983, "grad_norm": 2.044019188831645, "learning_rate": 4.73645246079422e-07, "logits/chosen": 1.17718505859375, "logits/rejected": 1.27001953125, "logps/chosen": -599.5, "logps/rejected": -831.5, "loss": 0.59, "rewards/accuracies": 0.75, "rewards/chosen": 0.224609375, "rewards/margins": 5.3359375, "rewards/rejected": -5.12109375, "step": 3171 }, { "epoch": 0.5993669989135056, "grad_norm": 3.1105971299100075, "learning_rate": 4.733527543404573e-07, "logits/chosen": 2.5654296875, "logits/rejected": 2.4345703125, "logps/chosen": -690.5, "logps/rejected": -617.5, "loss": 0.5592, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3934326171875, "rewards/margins": 4.58984375, "rewards/rejected": -3.201171875, "step": 3172 }, { "epoch": 0.5995559544617128, "grad_norm": 2.989850880923922, "learning_rate": 4.730602959468303e-07, "logits/chosen": 2.033203125, "logits/rejected": 1.8701171875, "logps/chosen": -808.0, "logps/rejected": -2671.0, "loss": 0.5324, "rewards/accuracies": 0.78125, "rewards/chosen": 0.57763671875, "rewards/margins": 10.97265625, "rewards/rejected": -10.4140625, "step": 3173 }, { "epoch": 0.5997449100099201, "grad_norm": 2.881197132852068, "learning_rate": 4.7276787102577487e-07, "logits/chosen": 2.501953125, "logits/rejected": 2.013671875, "logps/chosen": -655.0, "logps/rejected": -604.0, "loss": 0.4949, "rewards/accuracies": 0.90625, "rewards/chosen": 0.453125, "rewards/margins": 5.75, "rewards/rejected": -5.3046875, "step": 3174 }, { "epoch": 0.5999338655581274, "grad_norm": 1.9317113826590548, "learning_rate": 4.724754797045101e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.759765625, "logps/chosen": -747.0, "logps/rejected": -914.0, "loss": 0.4718, "rewards/accuracies": 0.90625, "rewards/chosen": 1.794921875, "rewards/margins": 6.5078125, "rewards/rejected": -4.7265625, "step": 3175 }, { "epoch": 0.6001228211063347, "grad_norm": 1.9131257883810844, "learning_rate": 4.7218312211024105e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.326171875, "logps/chosen": -618.25, "logps/rejected": -1026.5, "loss": 0.5515, "rewards/accuracies": 0.90625, "rewards/chosen": 0.486328125, "rewards/margins": 7.08203125, "rewards/rejected": -6.58984375, "step": 3176 }, { "epoch": 0.600311776654542, "grad_norm": 1.8290257643512873, "learning_rate": 4.718907983701572e-07, "logits/chosen": 2.025390625, "logits/rejected": 2.49609375, "logps/chosen": -1226.0, "logps/rejected": -1486.0, "loss": 0.5758, "rewards/accuracies": 0.65625, "rewards/chosen": 1.177734375, "rewards/margins": 6.578125, "rewards/rejected": -5.390625, "step": 3177 }, { "epoch": 0.6005007322027494, "grad_norm": 2.5746278798830007, "learning_rate": 4.7159850861143423e-07, "logits/chosen": 2.181396484375, "logits/rejected": 1.23291015625, "logps/chosen": -828.0, "logps/rejected": -1084.0, "loss": 0.6, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9404296875, "rewards/margins": 6.3515625, "rewards/rejected": -5.40234375, "step": 3178 }, { "epoch": 0.6006896877509565, "grad_norm": 1.3783410198246047, "learning_rate": 4.713062529612324e-07, "logits/chosen": 2.25390625, "logits/rejected": 1.6669921875, "logps/chosen": -671.0, "logps/rejected": -701.0, "loss": 0.676, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0029296875, "rewards/margins": 4.16015625, "rewards/rejected": -4.15625, "step": 3179 }, { "epoch": 0.6008786432991639, "grad_norm": 3.6023646341054705, "learning_rate": 4.710140315466974e-07, "logits/chosen": 2.30078125, "logits/rejected": 2.97265625, "logps/chosen": -895.5, "logps/rejected": -1054.0, "loss": 0.6715, "rewards/accuracies": 0.6875, "rewards/chosen": -0.560546875, "rewards/margins": 4.6796875, "rewards/rejected": -5.25, "step": 3180 }, { "epoch": 0.6010675988473712, "grad_norm": 1.8997824779502597, "learning_rate": 4.707218444949602e-07, "logits/chosen": 2.876953125, "logits/rejected": 2.041015625, "logps/chosen": -911.0, "logps/rejected": -697.5, "loss": 0.517, "rewards/accuracies": 0.8125, "rewards/chosen": 0.431640625, "rewards/margins": 5.44921875, "rewards/rejected": -5.01171875, "step": 3181 }, { "epoch": 0.6012565543955785, "grad_norm": 2.1192561928423483, "learning_rate": 4.7042969193313597e-07, "logits/chosen": 1.900390625, "logits/rejected": 1.39013671875, "logps/chosen": -1229.0, "logps/rejected": -1373.5, "loss": 0.5104, "rewards/accuracies": 0.875, "rewards/chosen": 0.9619140625, "rewards/margins": 6.0234375, "rewards/rejected": -5.0703125, "step": 3182 }, { "epoch": 0.6014455099437858, "grad_norm": 1.929155362441168, "learning_rate": 4.7013757398832634e-07, "logits/chosen": 1.921875, "logits/rejected": 1.0498046875, "logps/chosen": -988.5, "logps/rejected": -1014.0, "loss": 0.4274, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2734375, "rewards/margins": 7.08203125, "rewards/rejected": -5.8125, "step": 3183 }, { "epoch": 0.601634465491993, "grad_norm": 1.3792865170031876, "learning_rate": 4.698454907876165e-07, "logits/chosen": 2.8720703125, "logits/rejected": 2.380859375, "logps/chosen": -842.0, "logps/rejected": -743.0, "loss": 0.4711, "rewards/accuracies": 0.90625, "rewards/chosen": 0.98828125, "rewards/margins": 6.734375, "rewards/rejected": -5.75, "step": 3184 }, { "epoch": 0.6018234210402003, "grad_norm": 1.9107954002461764, "learning_rate": 4.695534424580772e-07, "logits/chosen": 3.109375, "logits/rejected": 2.1484375, "logps/chosen": -716.0, "logps/rejected": -1175.0, "loss": 0.5882, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4609375, "rewards/margins": 5.859375, "rewards/rejected": -5.4140625, "step": 3185 }, { "epoch": 0.6020123765884076, "grad_norm": 3.1373925525793394, "learning_rate": 4.6926142912676383e-07, "logits/chosen": 2.876953125, "logits/rejected": 2.986328125, "logps/chosen": -536.0, "logps/rejected": -592.0, "loss": 0.6251, "rewards/accuracies": 0.75, "rewards/chosen": 0.5792236328125, "rewards/margins": 2.9765625, "rewards/rejected": -2.396484375, "step": 3186 }, { "epoch": 0.6022013321366149, "grad_norm": 2.7925923382581845, "learning_rate": 4.689694509207166e-07, "logits/chosen": 2.3125, "logits/rejected": 2.29296875, "logps/chosen": -827.0, "logps/rejected": -1446.0, "loss": 0.6619, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7900390625, "rewards/margins": 10.4609375, "rewards/rejected": -9.654296875, "step": 3187 }, { "epoch": 0.6023902876848222, "grad_norm": 2.2963083204152643, "learning_rate": 4.6867750796696084e-07, "logits/chosen": 2.6171875, "logits/rejected": 1.91796875, "logps/chosen": -659.0, "logps/rejected": -15321.5, "loss": 0.5233, "rewards/accuracies": 0.875, "rewards/chosen": 0.947265625, "rewards/margins": -104.56640625, "rewards/rejected": 105.8671875, "step": 3188 }, { "epoch": 0.6025792432330295, "grad_norm": 2.2769506762678176, "learning_rate": 4.6838560039250565e-07, "logits/chosen": 3.17578125, "logits/rejected": 2.64453125, "logps/chosen": -740.0, "logps/rejected": -593.0, "loss": 0.6186, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9091796875, "rewards/margins": 5.62109375, "rewards/rejected": -4.70703125, "step": 3189 }, { "epoch": 0.6027681987812367, "grad_norm": 3.335579376851257, "learning_rate": 4.6809372832434546e-07, "logits/chosen": 1.75390625, "logits/rejected": 1.71484375, "logps/chosen": -596.0, "logps/rejected": -566.0, "loss": 0.6379, "rewards/accuracies": 0.875, "rewards/chosen": 0.828125, "rewards/margins": 3.34765625, "rewards/rejected": -2.513671875, "step": 3190 }, { "epoch": 0.602957154329444, "grad_norm": 2.001531825053185, "learning_rate": 4.6780189188945884e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.404296875, "logps/chosen": -695.0, "logps/rejected": -1031.0, "loss": 0.6733, "rewards/accuracies": 0.84375, "rewards/chosen": 0.359375, "rewards/margins": 4.798583984375, "rewards/rejected": -4.4609375, "step": 3191 }, { "epoch": 0.6031461098776513, "grad_norm": 1.770892169442012, "learning_rate": 4.6751009121480956e-07, "logits/chosen": 2.0546875, "logits/rejected": 2.2265625, "logps/chosen": -569.0, "logps/rejected": -1768.0, "loss": 0.6578, "rewards/accuracies": 0.75, "rewards/chosen": -0.560302734375, "rewards/margins": 9.6494140625, "rewards/rejected": -10.220703125, "step": 3192 }, { "epoch": 0.6033350654258586, "grad_norm": 2.1611716464335786, "learning_rate": 4.6721832642734484e-07, "logits/chosen": 1.97265625, "logits/rejected": 2.4140625, "logps/chosen": -541.5, "logps/rejected": -893.0, "loss": 0.6735, "rewards/accuracies": 0.71875, "rewards/chosen": 0.73046875, "rewards/margins": 4.0234375, "rewards/rejected": -3.296875, "step": 3193 }, { "epoch": 0.6035240209740659, "grad_norm": 3.2976614422847046, "learning_rate": 4.669265976539968e-07, "logits/chosen": 2.3642578125, "logits/rejected": 2.51171875, "logps/chosen": -791.5, "logps/rejected": -848.0, "loss": 0.5605, "rewards/accuracies": 0.71875, "rewards/chosen": 0.80908203125, "rewards/margins": 5.3984375, "rewards/rejected": -4.58203125, "step": 3194 }, { "epoch": 0.6037129765222732, "grad_norm": 3.063557183079208, "learning_rate": 4.6663490502168213e-07, "logits/chosen": 1.8388671875, "logits/rejected": 1.20849609375, "logps/chosen": -920.0, "logps/rejected": -1189.0, "loss": 0.571, "rewards/accuracies": 0.84375, "rewards/chosen": 1.19677734375, "rewards/margins": 5.51171875, "rewards/rejected": -4.31640625, "step": 3195 }, { "epoch": 0.6039019320704804, "grad_norm": 3.3085101627896023, "learning_rate": 4.6634324865730123e-07, "logits/chosen": 3.45703125, "logits/rejected": 3.015625, "logps/chosen": -1106.0, "logps/rejected": -692.5, "loss": 0.6652, "rewards/accuracies": 0.71875, "rewards/chosen": 1.14501953125, "rewards/margins": 4.93359375, "rewards/rejected": -3.78515625, "step": 3196 }, { "epoch": 0.6040908876186877, "grad_norm": 1.7703222434797445, "learning_rate": 4.660516286877395e-07, "logits/chosen": 1.4580078125, "logits/rejected": 1.6416015625, "logps/chosen": -987.0, "logps/rejected": -2005.0, "loss": 0.4489, "rewards/accuracies": 0.9375, "rewards/chosen": 1.58349609375, "rewards/margins": 9.8984375, "rewards/rejected": -8.34375, "step": 3197 }, { "epoch": 0.604279843166895, "grad_norm": 4.104630477820813, "learning_rate": 4.6576004523986535e-07, "logits/chosen": 2.5458984375, "logits/rejected": 2.18017578125, "logps/chosen": -1057.0, "logps/rejected": -1666.0, "loss": 0.556, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6107177734375, "rewards/margins": 7.99609375, "rewards/rejected": -6.3662109375, "step": 3198 }, { "epoch": 0.6044687987151023, "grad_norm": 2.67539877554565, "learning_rate": 4.654684984405326e-07, "logits/chosen": 2.90234375, "logits/rejected": 3.150390625, "logps/chosen": -841.0, "logps/rejected": -997.0, "loss": 0.5974, "rewards/accuracies": 0.75, "rewards/chosen": 0.681640625, "rewards/margins": 5.16015625, "rewards/rejected": -4.46875, "step": 3199 }, { "epoch": 0.6046577542633096, "grad_norm": 3.804599098684102, "learning_rate": 4.6517698841657813e-07, "logits/chosen": 2.65625, "logits/rejected": 2.5703125, "logps/chosen": -809.5, "logps/rejected": -716.0, "loss": 0.6043, "rewards/accuracies": 0.875, "rewards/chosen": 0.734375, "rewards/margins": 6.0859375, "rewards/rejected": -5.3671875, "step": 3200 }, { "epoch": 0.6048467098115169, "grad_norm": 3.4355788195441503, "learning_rate": 4.648855152948232e-07, "logits/chosen": 3.1611328125, "logits/rejected": 2.6435546875, "logps/chosen": -784.0, "logps/rejected": -837.0, "loss": 0.5612, "rewards/accuracies": 0.78125, "rewards/chosen": 0.447265625, "rewards/margins": 4.8515625, "rewards/rejected": -4.39453125, "step": 3201 }, { "epoch": 0.6050356653597241, "grad_norm": 1.9290020413085311, "learning_rate": 4.645940792020735e-07, "logits/chosen": 2.748046875, "logits/rejected": 2.576171875, "logps/chosen": -732.25, "logps/rejected": -733.5, "loss": 0.562, "rewards/accuracies": 0.8125, "rewards/chosen": 1.15966796875, "rewards/margins": 5.11328125, "rewards/rejected": -3.96875, "step": 3202 }, { "epoch": 0.6052246209079314, "grad_norm": 1.818796983350179, "learning_rate": 4.6430268026511753e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.357421875, "logps/chosen": -762.0, "logps/rejected": -1235.0, "loss": 0.5278, "rewards/accuracies": 0.875, "rewards/chosen": 1.33544921875, "rewards/margins": 5.9453125, "rewards/rejected": -4.60546875, "step": 3203 }, { "epoch": 0.6054135764561387, "grad_norm": 2.8336711483093913, "learning_rate": 4.640113186107285e-07, "logits/chosen": 2.880859375, "logits/rejected": 2.7578125, "logps/chosen": -845.5, "logps/rejected": -822.0, "loss": 0.508, "rewards/accuracies": 0.875, "rewards/chosen": 0.8330078125, "rewards/margins": 6.4921875, "rewards/rejected": -5.640625, "step": 3204 }, { "epoch": 0.605602532004346, "grad_norm": 3.1724576522218957, "learning_rate": 4.63719994365663e-07, "logits/chosen": 2.529296875, "logits/rejected": 2.466796875, "logps/chosen": -1062.0, "logps/rejected": -2120.0, "loss": 0.4997, "rewards/accuracies": 0.875, "rewards/chosen": 1.80224609375, "rewards/margins": 8.296875, "rewards/rejected": -6.4921875, "step": 3205 }, { "epoch": 0.6057914875525533, "grad_norm": 1.6483841787773785, "learning_rate": 4.6342870765666175e-07, "logits/chosen": 2.5, "logits/rejected": 2.4609375, "logps/chosen": -919.0, "logps/rejected": -1674.0, "loss": 0.62, "rewards/accuracies": 0.875, "rewards/chosen": 0.70654296875, "rewards/margins": 5.49609375, "rewards/rejected": -4.796875, "step": 3206 }, { "epoch": 0.6059804431007605, "grad_norm": 2.384483232629586, "learning_rate": 4.631374586104485e-07, "logits/chosen": 2.26953125, "logits/rejected": 2.1005859375, "logps/chosen": -742.0, "logps/rejected": -952.0, "loss": 0.5173, "rewards/accuracies": 0.875, "rewards/chosen": 0.925201416015625, "rewards/margins": 5.16015625, "rewards/rejected": -4.2265625, "step": 3207 }, { "epoch": 0.6061693986489678, "grad_norm": 2.241112194787016, "learning_rate": 4.628462473537311e-07, "logits/chosen": 3.0625, "logits/rejected": 3.02734375, "logps/chosen": -1096.5, "logps/rejected": -1232.0, "loss": 0.5757, "rewards/accuracies": 0.78125, "rewards/chosen": 1.381591796875, "rewards/margins": 6.1796875, "rewards/rejected": -4.80078125, "step": 3208 }, { "epoch": 0.6063583541971751, "grad_norm": 1.8129939313219137, "learning_rate": 4.625550740132009e-07, "logits/chosen": 3.671875, "logits/rejected": 3.125, "logps/chosen": -805.0, "logps/rejected": -883.5, "loss": 0.5642, "rewards/accuracies": 0.875, "rewards/chosen": 1.3330078125, "rewards/margins": 8.734375, "rewards/rejected": -7.4140625, "step": 3209 }, { "epoch": 0.6065473097453824, "grad_norm": 2.967381901847952, "learning_rate": 4.622639387155326e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.16796875, "logps/chosen": -686.0, "logps/rejected": -887.0, "loss": 0.7189, "rewards/accuracies": 0.75, "rewards/chosen": -0.30908203125, "rewards/margins": 4.28125, "rewards/rejected": -4.58984375, "step": 3210 }, { "epoch": 0.6067362652935897, "grad_norm": 2.0865052988536075, "learning_rate": 4.6197284158738456e-07, "logits/chosen": 3.01171875, "logits/rejected": 3.30078125, "logps/chosen": -601.0, "logps/rejected": -806.5, "loss": 0.746, "rewards/accuracies": 0.6875, "rewards/chosen": 0.173828125, "rewards/margins": 3.064453125, "rewards/rejected": -2.890625, "step": 3211 }, { "epoch": 0.606925220841797, "grad_norm": 1.4890018967634155, "learning_rate": 4.6168178275539827e-07, "logits/chosen": 2.81640625, "logits/rejected": 2.890625, "logps/chosen": -379.0, "logps/rejected": -674.0, "loss": 0.5592, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7373046875, "rewards/margins": 4.4375, "rewards/rejected": -3.703125, "step": 3212 }, { "epoch": 0.6071141763900042, "grad_norm": 1.8892067677644675, "learning_rate": 4.6139076234619866e-07, "logits/chosen": 2.8046875, "logits/rejected": 2.41796875, "logps/chosen": -1097.5, "logps/rejected": -957.5, "loss": 0.6108, "rewards/accuracies": 0.8125, "rewards/chosen": 1.26611328125, "rewards/margins": 5.310546875, "rewards/rejected": -4.03515625, "step": 3213 }, { "epoch": 0.6073031319382115, "grad_norm": 2.5376164430370154, "learning_rate": 4.610997804863941e-07, "logits/chosen": 3.15234375, "logits/rejected": 2.73046875, "logps/chosen": -1260.0, "logps/rejected": -1012.5, "loss": 0.4206, "rewards/accuracies": 0.90625, "rewards/chosen": 1.821533203125, "rewards/margins": 6.94921875, "rewards/rejected": -5.125, "step": 3214 }, { "epoch": 0.6074920874864188, "grad_norm": 2.285540376591301, "learning_rate": 4.60808837302576e-07, "logits/chosen": 3.01171875, "logits/rejected": 3.11328125, "logps/chosen": -631.75, "logps/rejected": -629.0, "loss": 0.5923, "rewards/accuracies": 0.8125, "rewards/chosen": 0.837890625, "rewards/margins": 4.09375, "rewards/rejected": -3.2578125, "step": 3215 }, { "epoch": 0.6076810430346261, "grad_norm": 2.096567796428213, "learning_rate": 4.6051793292131934e-07, "logits/chosen": 2.96484375, "logits/rejected": 2.5234375, "logps/chosen": -654.0, "logps/rejected": -797.0, "loss": 0.5622, "rewards/accuracies": 0.8125, "rewards/chosen": 0.27685546875, "rewards/margins": 6.01171875, "rewards/rejected": -5.75390625, "step": 3216 }, { "epoch": 0.6078699985828334, "grad_norm": 2.2441195316599645, "learning_rate": 4.602270674691813e-07, "logits/chosen": 2.44482421875, "logits/rejected": 2.09912109375, "logps/chosen": -634.0, "logps/rejected": -635.0, "loss": 0.5209, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3359375, "rewards/margins": 4.5078125, "rewards/rejected": -3.17578125, "step": 3217 }, { "epoch": 0.6080589541310407, "grad_norm": 2.328730259892204, "learning_rate": 4.599362410727031e-07, "logits/chosen": 2.5400390625, "logits/rejected": 2.23828125, "logps/chosen": -593.0, "logps/rejected": -997.0, "loss": 0.5136, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21331787109375, "rewards/margins": 5.93359375, "rewards/rejected": -5.71875, "step": 3218 }, { "epoch": 0.6082479096792479, "grad_norm": 3.4184484873307377, "learning_rate": 4.596454538584086e-07, "logits/chosen": 2.72265625, "logits/rejected": 2.4765625, "logps/chosen": -800.0, "logps/rejected": -713.0, "loss": 0.5152, "rewards/accuracies": 0.84375, "rewards/chosen": 0.47119140625, "rewards/margins": 5.00390625, "rewards/rejected": -4.5390625, "step": 3219 }, { "epoch": 0.6084368652274552, "grad_norm": 2.6926726605077245, "learning_rate": 4.5935470595280445e-07, "logits/chosen": 2.4921875, "logits/rejected": 2.0146484375, "logps/chosen": -914.0, "logps/rejected": -877.0, "loss": 0.4544, "rewards/accuracies": 0.90625, "rewards/chosen": 1.306640625, "rewards/margins": 5.84375, "rewards/rejected": -4.53125, "step": 3220 }, { "epoch": 0.6086258207756625, "grad_norm": 1.3138918454094313, "learning_rate": 4.590639974823803e-07, "logits/chosen": 3.140625, "logits/rejected": 2.578125, "logps/chosen": -405.5, "logps/rejected": -545.75, "loss": 0.5973, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5751953125, "rewards/margins": 3.80078125, "rewards/rejected": -4.37890625, "step": 3221 }, { "epoch": 0.6088147763238698, "grad_norm": 1.6326324318705363, "learning_rate": 4.5877332857360883e-07, "logits/chosen": 2.2421875, "logits/rejected": 1.986328125, "logps/chosen": -1004.0, "logps/rejected": -1116.0, "loss": 0.5513, "rewards/accuracies": 0.78125, "rewards/chosen": 1.28515625, "rewards/margins": 5.5546875, "rewards/rejected": -4.27734375, "step": 3222 }, { "epoch": 0.6090037318720771, "grad_norm": 2.9697152142831444, "learning_rate": 4.584826993529456e-07, "logits/chosen": 3.267578125, "logits/rejected": 3.125, "logps/chosen": -1044.0, "logps/rejected": -1256.0, "loss": 0.5147, "rewards/accuracies": 0.84375, "rewards/chosen": 1.80859375, "rewards/margins": 7.46875, "rewards/rejected": -5.654296875, "step": 3223 }, { "epoch": 0.6091926874202844, "grad_norm": 2.0048490641012875, "learning_rate": 4.5819210994682823e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.734375, "logps/chosen": -835.0, "logps/rejected": -953.0, "loss": 0.6395, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0849609375, "rewards/margins": 3.6513671875, "rewards/rejected": -3.5625, "step": 3224 }, { "epoch": 0.6093816429684916, "grad_norm": 3.7050256759126716, "learning_rate": 4.579015604816777e-07, "logits/chosen": 3.38671875, "logits/rejected": 2.89453125, "logps/chosen": -646.0, "logps/rejected": -707.0, "loss": 0.6204, "rewards/accuracies": 0.78125, "rewards/chosen": -0.083251953125, "rewards/margins": 4.23828125, "rewards/rejected": -4.32421875, "step": 3225 }, { "epoch": 0.6095705985166989, "grad_norm": 1.9371666254512547, "learning_rate": 4.576110510838973e-07, "logits/chosen": 3.8046875, "logits/rejected": 3.6640625, "logps/chosen": -1223.0, "logps/rejected": -1148.0, "loss": 0.5328, "rewards/accuracies": 0.90625, "rewards/chosen": 1.186279296875, "rewards/margins": 7.1171875, "rewards/rejected": -5.9296875, "step": 3226 }, { "epoch": 0.6097595540649062, "grad_norm": 2.389067540794849, "learning_rate": 4.573205818798732e-07, "logits/chosen": 3.013671875, "logits/rejected": 2.640625, "logps/chosen": -739.0, "logps/rejected": -1506.5, "loss": 0.5238, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5775146484375, "rewards/margins": 6.171875, "rewards/rejected": -5.59375, "step": 3227 }, { "epoch": 0.6099485096131135, "grad_norm": 2.232804476353557, "learning_rate": 4.5703015299597345e-07, "logits/chosen": 2.203125, "logits/rejected": 2.234375, "logps/chosen": -582.5, "logps/rejected": -707.0, "loss": 0.7166, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0419921875, "rewards/margins": 3.185546875, "rewards/rejected": -3.146484375, "step": 3228 }, { "epoch": 0.6101374651613208, "grad_norm": 2.282184609157376, "learning_rate": 4.5673976455854913e-07, "logits/chosen": 2.76953125, "logits/rejected": 2.21484375, "logps/chosen": -1048.5, "logps/rejected": -924.0, "loss": 0.5257, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5205078125, "rewards/margins": 6.03125, "rewards/rejected": -4.51171875, "step": 3229 }, { "epoch": 0.610326420709528, "grad_norm": 2.023652774501826, "learning_rate": 4.564494166939337e-07, "logits/chosen": 1.84765625, "logits/rejected": 1.697265625, "logps/chosen": -855.5, "logps/rejected": -1184.0, "loss": 0.5757, "rewards/accuracies": 0.78125, "rewards/chosen": 0.71435546875, "rewards/margins": 7.38671875, "rewards/rejected": -6.69140625, "step": 3230 }, { "epoch": 0.6105153762577353, "grad_norm": 1.6256076584780674, "learning_rate": 4.561591095284426e-07, "logits/chosen": 2.2255859375, "logits/rejected": 1.89501953125, "logps/chosen": -473.0, "logps/rejected": -565.0, "loss": 0.6824, "rewards/accuracies": 0.78125, "rewards/chosen": -0.02880859375, "rewards/margins": 3.7578125, "rewards/rejected": -3.787109375, "step": 3231 }, { "epoch": 0.6107043318059426, "grad_norm": 3.2411196144442718, "learning_rate": 4.5586884318837415e-07, "logits/chosen": 3.03125, "logits/rejected": 2.74609375, "logps/chosen": -634.5, "logps/rejected": -798.0, "loss": 0.6456, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17626953125, "rewards/margins": 3.7021484375, "rewards/rejected": -3.529296875, "step": 3232 }, { "epoch": 0.6108932873541499, "grad_norm": 3.710816286015468, "learning_rate": 4.555786178000081e-07, "logits/chosen": 3.5234375, "logits/rejected": 2.7734375, "logps/chosen": -626.0, "logps/rejected": -707.0, "loss": 0.7791, "rewards/accuracies": 0.71875, "rewards/chosen": 0.29736328125, "rewards/margins": 3.546875, "rewards/rejected": -3.24609375, "step": 3233 }, { "epoch": 0.6110822429023572, "grad_norm": 1.428842749164097, "learning_rate": 4.552884334896071e-07, "logits/chosen": 3.392578125, "logits/rejected": 2.65869140625, "logps/chosen": -991.0, "logps/rejected": -1159.0, "loss": 0.4925, "rewards/accuracies": 0.875, "rewards/chosen": 1.546875, "rewards/margins": 5.91796875, "rewards/rejected": -4.37109375, "step": 3234 }, { "epoch": 0.6112711984505645, "grad_norm": 4.7907686980242605, "learning_rate": 4.549982903834154e-07, "logits/chosen": 3.28125, "logits/rejected": 3.125, "logps/chosen": -683.0, "logps/rejected": -546.0, "loss": 0.6257, "rewards/accuracies": 0.78125, "rewards/chosen": 0.167633056640625, "rewards/margins": 3.7578125, "rewards/rejected": -3.59765625, "step": 3235 }, { "epoch": 0.6114601539987717, "grad_norm": 3.3124012036219113, "learning_rate": 4.547081886076599e-07, "logits/chosen": 2.119140625, "logits/rejected": 1.953125, "logps/chosen": -1024.0, "logps/rejected": -1052.0, "loss": 0.5825, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0234375, "rewards/margins": 5.73046875, "rewards/rejected": -4.71484375, "step": 3236 }, { "epoch": 0.611649109546979, "grad_norm": 3.1079304547474775, "learning_rate": 4.544181282885492e-07, "logits/chosen": 3.646484375, "logits/rejected": 3.404296875, "logps/chosen": -685.5, "logps/rejected": -590.5, "loss": 0.6627, "rewards/accuracies": 0.6875, "rewards/chosen": -0.080078125, "rewards/margins": 3.406982421875, "rewards/rejected": -3.494140625, "step": 3237 }, { "epoch": 0.6118380650951863, "grad_norm": 3.6988274161746237, "learning_rate": 4.5412810955227356e-07, "logits/chosen": 3.40234375, "logits/rejected": 3.42578125, "logps/chosen": -431.5, "logps/rejected": -1264.0, "loss": 0.7049, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2919921875, "rewards/margins": 3.7421875, "rewards/rejected": -3.451171875, "step": 3238 }, { "epoch": 0.6120270206433936, "grad_norm": 2.714215185133299, "learning_rate": 4.538381325250057e-07, "logits/chosen": 2.5625, "logits/rejected": 2.693359375, "logps/chosen": -922.0, "logps/rejected": -1825.0, "loss": 0.558, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2099609375, "rewards/margins": 8.875, "rewards/rejected": -7.6875, "step": 3239 }, { "epoch": 0.612215976191601, "grad_norm": 3.12323767945352, "learning_rate": 4.5354819733289973e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.4482421875, "logps/chosen": -927.0, "logps/rejected": -944.0, "loss": 0.6836, "rewards/accuracies": 0.84375, "rewards/chosen": -0.263671875, "rewards/margins": 5.5625, "rewards/rejected": -5.8359375, "step": 3240 }, { "epoch": 0.6124049317398083, "grad_norm": 2.274673477205489, "learning_rate": 4.532583041020921e-07, "logits/chosen": 2.802734375, "logits/rejected": 2.375, "logps/chosen": -958.0, "logps/rejected": -1265.0, "loss": 0.4723, "rewards/accuracies": 0.875, "rewards/chosen": 0.5675048828125, "rewards/margins": 5.23828125, "rewards/rejected": -4.66796875, "step": 3241 }, { "epoch": 0.6125938872880154, "grad_norm": 3.4515991165159163, "learning_rate": 4.5296845295870043e-07, "logits/chosen": 2.310546875, "logits/rejected": 1.916015625, "logps/chosen": -927.0, "logps/rejected": -793.0, "loss": 0.593, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3203125, "rewards/margins": 3.91796875, "rewards/rejected": -3.595703125, "step": 3242 }, { "epoch": 0.6127828428362228, "grad_norm": 2.7638014203318413, "learning_rate": 4.5267864402882427e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.4921875, "logps/chosen": -1009.0, "logps/rejected": -991.0, "loss": 0.5902, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7047119140625, "rewards/margins": 4.68359375, "rewards/rejected": -3.984375, "step": 3243 }, { "epoch": 0.61297179838443, "grad_norm": 1.9812790111593561, "learning_rate": 4.5238887743854505e-07, "logits/chosen": 2.08203125, "logits/rejected": 1.5712890625, "logps/chosen": -706.5, "logps/rejected": -528.5, "loss": 0.585, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4010009765625, "rewards/margins": 3.80078125, "rewards/rejected": -4.19921875, "step": 3244 }, { "epoch": 0.6131607539326374, "grad_norm": 2.431745418701393, "learning_rate": 4.520991533139252e-07, "logits/chosen": 1.8408203125, "logits/rejected": 1.41796875, "logps/chosen": -724.5, "logps/rejected": -1154.0, "loss": 0.7243, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7099609375, "rewards/margins": 3.255859375, "rewards/rejected": -2.541015625, "step": 3245 }, { "epoch": 0.6133497094808447, "grad_norm": 1.6866711612021372, "learning_rate": 4.518094717810096e-07, "logits/chosen": 3.53515625, "logits/rejected": 3.328125, "logps/chosen": -883.0, "logps/rejected": -885.0, "loss": 0.4442, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2900390625, "rewards/margins": 5.40625, "rewards/rejected": -4.1015625, "step": 3246 }, { "epoch": 0.613538665029052, "grad_norm": 2.2624870422473804, "learning_rate": 4.515198329658234e-07, "logits/chosen": 3.140625, "logits/rejected": 2.4375, "logps/chosen": -1012.0, "logps/rejected": -1026.5, "loss": 0.5324, "rewards/accuracies": 0.78125, "rewards/chosen": 1.252197265625, "rewards/margins": 6.37109375, "rewards/rejected": -5.125, "step": 3247 }, { "epoch": 0.6137276205772592, "grad_norm": 3.036976637141171, "learning_rate": 4.512302369943741e-07, "logits/chosen": 2.8095703125, "logits/rejected": 2.609375, "logps/chosen": -475.5, "logps/rejected": -822.0, "loss": 0.5992, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1474609375, "rewards/margins": 5.0703125, "rewards/rejected": -4.92578125, "step": 3248 }, { "epoch": 0.6139165761254665, "grad_norm": 2.160575261501197, "learning_rate": 4.5094068399265017e-07, "logits/chosen": 3.390625, "logits/rejected": 3.3046875, "logps/chosen": -926.0, "logps/rejected": -2123.0, "loss": 0.5667, "rewards/accuracies": 0.78125, "rewards/chosen": 1.5234375, "rewards/margins": 11.9921875, "rewards/rejected": -10.4609375, "step": 3249 }, { "epoch": 0.6141055316736738, "grad_norm": 2.959996217460523, "learning_rate": 4.5065117408662145e-07, "logits/chosen": 2.1640625, "logits/rejected": 2.05078125, "logps/chosen": -639.5, "logps/rejected": -847.0, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": 0.186767578125, "rewards/margins": 4.08203125, "rewards/rejected": -3.890625, "step": 3250 }, { "epoch": 0.6142944872218811, "grad_norm": 2.5303673062477445, "learning_rate": 4.5036170740223934e-07, "logits/chosen": 2.767578125, "logits/rejected": 3.275390625, "logps/chosen": -848.5, "logps/rejected": -672.5, "loss": 0.6621, "rewards/accuracies": 0.8125, "rewards/chosen": 0.85791015625, "rewards/margins": 4.96484375, "rewards/rejected": -4.09765625, "step": 3251 }, { "epoch": 0.6144834427700884, "grad_norm": 1.780340578403596, "learning_rate": 4.500722840654356e-07, "logits/chosen": 3.390625, "logits/rejected": 2.8671875, "logps/chosen": -1504.5, "logps/rejected": -968.5, "loss": 0.593, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8125, "rewards/margins": 2.0234375, "rewards/rejected": -3.84375, "step": 3252 }, { "epoch": 0.6146723983182957, "grad_norm": 2.5642374963451644, "learning_rate": 4.497829042021242e-07, "logits/chosen": 3.0625, "logits/rejected": 3.009765625, "logps/chosen": -792.0, "logps/rejected": -1072.0, "loss": 0.5524, "rewards/accuracies": 0.875, "rewards/chosen": 0.447021484375, "rewards/margins": 4.484375, "rewards/rejected": -4.0234375, "step": 3253 }, { "epoch": 0.6148613538665029, "grad_norm": 2.073348376089816, "learning_rate": 4.4949356793819927e-07, "logits/chosen": 3.72265625, "logits/rejected": 3.1953125, "logps/chosen": -864.0, "logps/rejected": -831.5, "loss": 0.6923, "rewards/accuracies": 0.8125, "rewards/chosen": 0.68017578125, "rewards/margins": 3.3671875, "rewards/rejected": -2.685546875, "step": 3254 }, { "epoch": 0.6150503094147102, "grad_norm": 2.276140483005835, "learning_rate": 4.492042753995368e-07, "logits/chosen": 2.48046875, "logits/rejected": 2.3515625, "logps/chosen": -510.0, "logps/rejected": -732.0, "loss": 0.6448, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5142822265625, "rewards/margins": 3.9296875, "rewards/rejected": -4.4375, "step": 3255 }, { "epoch": 0.6152392649629175, "grad_norm": 1.5635251497579246, "learning_rate": 4.489150267119929e-07, "logits/chosen": 2.71875, "logits/rejected": 2.341796875, "logps/chosen": -940.5, "logps/rejected": -1387.0, "loss": 0.496, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0283203125, "rewards/margins": 6.5078125, "rewards/rejected": -5.4765625, "step": 3256 }, { "epoch": 0.6154282205111248, "grad_norm": 3.1515456378379882, "learning_rate": 4.4862582200140507e-07, "logits/chosen": 3.6171875, "logits/rejected": 3.08984375, "logps/chosen": -1031.0, "logps/rejected": -617.5, "loss": 0.6587, "rewards/accuracies": 0.6875, "rewards/chosen": 0.96875, "rewards/margins": 3.44140625, "rewards/rejected": -2.4735107421875, "step": 3257 }, { "epoch": 0.6156171760593321, "grad_norm": 2.2726959493514016, "learning_rate": 4.483366613935922e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.302734375, "logps/chosen": -1346.0, "logps/rejected": -1534.0, "loss": 0.5303, "rewards/accuracies": 0.78125, "rewards/chosen": 1.171875, "rewards/margins": 7.21875, "rewards/rejected": -6.046875, "step": 3258 }, { "epoch": 0.6158061316075393, "grad_norm": 1.768637088316248, "learning_rate": 4.4804754501435285e-07, "logits/chosen": 1.7041015625, "logits/rejected": 1.48828125, "logps/chosen": -764.0, "logps/rejected": -849.0, "loss": 0.5025, "rewards/accuracies": 0.90625, "rewards/chosen": 1.55859375, "rewards/margins": 5.1875, "rewards/rejected": -3.63671875, "step": 3259 }, { "epoch": 0.6159950871557466, "grad_norm": 2.444330333311281, "learning_rate": 4.47758472989467e-07, "logits/chosen": 2.2294921875, "logits/rejected": 1.78759765625, "logps/chosen": -773.5, "logps/rejected": -745.0, "loss": 0.6165, "rewards/accuracies": 0.75, "rewards/chosen": 0.0126953125, "rewards/margins": 3.87890625, "rewards/rejected": -3.859375, "step": 3260 }, { "epoch": 0.6161840427039539, "grad_norm": 2.7153235494381462, "learning_rate": 4.4746944544469544e-07, "logits/chosen": 3.08203125, "logits/rejected": 2.935546875, "logps/chosen": -551.25, "logps/rejected": -8047.5, "loss": 0.5904, "rewards/accuracies": 0.78125, "rewards/chosen": 0.56494140625, "rewards/margins": -0.71875, "rewards/rejected": 1.26953125, "step": 3261 }, { "epoch": 0.6163729982521612, "grad_norm": 2.0195562207333624, "learning_rate": 4.4718046250577957e-07, "logits/chosen": 2.671875, "logits/rejected": 2.806640625, "logps/chosen": -698.0, "logps/rejected": -947.0, "loss": 0.6583, "rewards/accuracies": 0.84375, "rewards/chosen": 0.071533203125, "rewards/margins": 4.04296875, "rewards/rejected": -3.9765625, "step": 3262 }, { "epoch": 0.6165619538003685, "grad_norm": 2.7094618030392703, "learning_rate": 4.468915242984408e-07, "logits/chosen": 3.81640625, "logits/rejected": 3.26953125, "logps/chosen": -826.0, "logps/rejected": -1067.0, "loss": 0.5702, "rewards/accuracies": 0.875, "rewards/chosen": 0.8720703125, "rewards/margins": 6.4140625, "rewards/rejected": -5.5390625, "step": 3263 }, { "epoch": 0.6167509093485758, "grad_norm": 2.36230182680781, "learning_rate": 4.466026309483818e-07, "logits/chosen": 2.5859375, "logits/rejected": 2.369140625, "logps/chosen": -1138.5, "logps/rejected": -1023.0, "loss": 0.4417, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1064453125, "rewards/margins": 6.53125, "rewards/rejected": -5.41015625, "step": 3264 }, { "epoch": 0.616939864896783, "grad_norm": 1.8232497005240573, "learning_rate": 4.463137825812854e-07, "logits/chosen": 2.818359375, "logits/rejected": 3.14453125, "logps/chosen": -904.0, "logps/rejected": -1161.0, "loss": 0.5437, "rewards/accuracies": 0.875, "rewards/chosen": 0.2705078125, "rewards/margins": 7.4453125, "rewards/rejected": -7.1875, "step": 3265 }, { "epoch": 0.6171288204449903, "grad_norm": 3.475252531838494, "learning_rate": 4.460249793228148e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.58984375, "logps/chosen": -899.5, "logps/rejected": -863.0, "loss": 0.5378, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6474609375, "rewards/margins": 5.0625, "rewards/rejected": -4.41796875, "step": 3266 }, { "epoch": 0.6173177759931976, "grad_norm": 2.111073113129444, "learning_rate": 4.457362212986141e-07, "logits/chosen": 1.818359375, "logits/rejected": 1.41796875, "logps/chosen": -881.5, "logps/rejected": -745.5, "loss": 0.5717, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5999755859375, "rewards/margins": 5.265625, "rewards/rejected": -4.6640625, "step": 3267 }, { "epoch": 0.6175067315414049, "grad_norm": 3.1123068943375882, "learning_rate": 4.454475086343067e-07, "logits/chosen": 3.27734375, "logits/rejected": 2.611328125, "logps/chosen": -827.0, "logps/rejected": -15172.0, "loss": 0.6345, "rewards/accuracies": 0.78125, "rewards/chosen": 0.309814453125, "rewards/margins": -120.546875, "rewards/rejected": 120.85546875, "step": 3268 }, { "epoch": 0.6176956870896122, "grad_norm": 3.420197374495423, "learning_rate": 4.451588414554973e-07, "logits/chosen": 2.744140625, "logits/rejected": 2.701171875, "logps/chosen": -683.0, "logps/rejected": -1604.0, "loss": 0.6287, "rewards/accuracies": 0.75, "rewards/chosen": -0.62109375, "rewards/margins": 8.9296875, "rewards/rejected": -9.5546875, "step": 3269 }, { "epoch": 0.6178846426378195, "grad_norm": 2.240436182673527, "learning_rate": 4.448702198877702e-07, "logits/chosen": 2.630859375, "logits/rejected": 2.345703125, "logps/chosen": -763.0, "logps/rejected": -1775.5, "loss": 0.5355, "rewards/accuracies": 0.875, "rewards/chosen": 0.609375, "rewards/margins": 6.63671875, "rewards/rejected": -6.0390625, "step": 3270 }, { "epoch": 0.6180735981860267, "grad_norm": 3.715839893630756, "learning_rate": 4.4458164405668997e-07, "logits/chosen": 2.724609375, "logits/rejected": 2.828125, "logps/chosen": -645.0, "logps/rejected": -2258.0, "loss": 0.5694, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5732421875, "rewards/margins": 10.0078125, "rewards/rejected": -9.421875, "step": 3271 }, { "epoch": 0.618262553734234, "grad_norm": 2.690866486273661, "learning_rate": 4.4429311408780167e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.720703125, "logps/chosen": -696.5, "logps/rejected": -898.0, "loss": 0.6195, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1181640625, "rewards/margins": 5.82421875, "rewards/rejected": -5.6953125, "step": 3272 }, { "epoch": 0.6184515092824413, "grad_norm": 2.8402811539291735, "learning_rate": 4.440046301066297e-07, "logits/chosen": 2.7578125, "logits/rejected": 2.666015625, "logps/chosen": -610.0, "logps/rejected": -1524.0, "loss": 0.4874, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0003662109375, "rewards/margins": 7.7734375, "rewards/rejected": -6.77734375, "step": 3273 }, { "epoch": 0.6186404648306486, "grad_norm": 1.8308984875753602, "learning_rate": 4.437161922386792e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0390625, "logps/chosen": -663.0, "logps/rejected": -1263.5, "loss": 0.5772, "rewards/accuracies": 0.84375, "rewards/chosen": 0.89306640625, "rewards/margins": 4.908203125, "rewards/rejected": -4.01171875, "step": 3274 }, { "epoch": 0.6188294203788559, "grad_norm": 1.985501307076303, "learning_rate": 4.434278006094345e-07, "logits/chosen": 2.71484375, "logits/rejected": 2.349609375, "logps/chosen": -1080.75, "logps/rejected": -1218.0, "loss": 0.4317, "rewards/accuracies": 0.84375, "rewards/chosen": 1.880859375, "rewards/margins": 8.46875, "rewards/rejected": -6.59375, "step": 3275 }, { "epoch": 0.6190183759270632, "grad_norm": 1.4088935876295197, "learning_rate": 4.431394553443606e-07, "logits/chosen": 3.23046875, "logits/rejected": 3.62109375, "logps/chosen": -1329.0, "logps/rejected": -3358.0, "loss": 0.3812, "rewards/accuracies": 0.875, "rewards/chosen": 2.4013671875, "rewards/margins": 11.6953125, "rewards/rejected": -9.2734375, "step": 3276 }, { "epoch": 0.6192073314752704, "grad_norm": 1.8126935665194654, "learning_rate": 4.428511565689016e-07, "logits/chosen": 3.390625, "logits/rejected": 3.03515625, "logps/chosen": -475.0, "logps/rejected": -2213.0, "loss": 0.6501, "rewards/accuracies": 0.90625, "rewards/chosen": 0.417236328125, "rewards/margins": 8.60546875, "rewards/rejected": -8.16015625, "step": 3277 }, { "epoch": 0.6193962870234777, "grad_norm": 2.352786003736648, "learning_rate": 4.4256290440848186e-07, "logits/chosen": 2.3642578125, "logits/rejected": 2.349609375, "logps/chosen": -684.5, "logps/rejected": -1404.0, "loss": 0.6895, "rewards/accuracies": 0.71875, "rewards/chosen": 0.43701171875, "rewards/margins": 5.98828125, "rewards/rejected": -5.53515625, "step": 3278 }, { "epoch": 0.619585242571685, "grad_norm": 1.8340281190857148, "learning_rate": 4.4227469898850534e-07, "logits/chosen": 2.3232421875, "logits/rejected": 2.27105712890625, "logps/chosen": -881.0, "logps/rejected": -931.0, "loss": 0.5491, "rewards/accuracies": 0.9375, "rewards/chosen": 0.738525390625, "rewards/margins": 5.3671875, "rewards/rejected": -4.62890625, "step": 3279 }, { "epoch": 0.6197741981198923, "grad_norm": 3.2185873446499653, "learning_rate": 4.419865404343556e-07, "logits/chosen": 2.33984375, "logits/rejected": 2.59765625, "logps/chosen": -713.0, "logps/rejected": -1538.0, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": 1.33837890625, "rewards/margins": 9.2734375, "rewards/rejected": -7.921875, "step": 3280 }, { "epoch": 0.6199631536680996, "grad_norm": 3.0916114600560967, "learning_rate": 4.4169842887139595e-07, "logits/chosen": 2.333984375, "logits/rejected": 1.544189453125, "logps/chosen": -554.0, "logps/rejected": -588.0, "loss": 0.5701, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4832763671875, "rewards/margins": 3.82421875, "rewards/rejected": -3.33984375, "step": 3281 }, { "epoch": 0.6201521092163068, "grad_norm": 3.8826240644316274, "learning_rate": 4.414103644249688e-07, "logits/chosen": 3.28125, "logits/rejected": 3.5078125, "logps/chosen": -919.0, "logps/rejected": -980.5, "loss": 0.5725, "rewards/accuracies": 0.78125, "rewards/chosen": 0.951171875, "rewards/margins": 4.48828125, "rewards/rejected": -3.5390625, "step": 3282 }, { "epoch": 0.6203410647645141, "grad_norm": 1.99704833299739, "learning_rate": 4.4112234722039677e-07, "logits/chosen": 1.91015625, "logits/rejected": 1.519775390625, "logps/chosen": -12834.0, "logps/rejected": -978.5, "loss": 0.569, "rewards/accuracies": 0.875, "rewards/chosen": 131.55029296875, "rewards/margins": 136.06640625, "rewards/rejected": -4.17578125, "step": 3283 }, { "epoch": 0.6205300203127214, "grad_norm": 2.509464459274205, "learning_rate": 4.4083437738298145e-07, "logits/chosen": 1.5345001220703125, "logits/rejected": 2.105712890625, "logps/chosen": -568.5, "logps/rejected": -645.0, "loss": 0.6168, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0948486328125, "rewards/margins": 3.96484375, "rewards/rejected": -4.0546875, "step": 3284 }, { "epoch": 0.6207189758609287, "grad_norm": 2.779084758366098, "learning_rate": 4.405464550380038e-07, "logits/chosen": 2.94140625, "logits/rejected": 3.515625, "logps/chosen": -682.5, "logps/rejected": -1014.0, "loss": 0.7242, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0341796875, "rewards/margins": 4.435546875, "rewards/rejected": -4.412109375, "step": 3285 }, { "epoch": 0.620907931409136, "grad_norm": 1.980044387816763, "learning_rate": 4.402585803107246e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.0546875, "logps/chosen": -397.0, "logps/rejected": -560.0, "loss": 0.7088, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16064453125, "rewards/margins": 3.00390625, "rewards/rejected": -3.171875, "step": 3286 }, { "epoch": 0.6210968869573433, "grad_norm": 2.232242931131998, "learning_rate": 4.3997075332638336e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.255859375, "logps/chosen": -1279.0, "logps/rejected": -1195.0, "loss": 0.6047, "rewards/accuracies": 0.6875, "rewards/chosen": 0.671875, "rewards/margins": 5.34375, "rewards/rejected": -4.67578125, "step": 3287 }, { "epoch": 0.6212858425055505, "grad_norm": 2.287612869147886, "learning_rate": 4.39682974210199e-07, "logits/chosen": 1.86328125, "logits/rejected": 1.6953125, "logps/chosen": -971.5, "logps/rejected": -930.0, "loss": 0.5459, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6689453125, "rewards/margins": 4.703125, "rewards/rejected": -4.02734375, "step": 3288 }, { "epoch": 0.6214747980537578, "grad_norm": 1.8588845260626312, "learning_rate": 4.3939524308736963e-07, "logits/chosen": 3.33984375, "logits/rejected": 2.89453125, "logps/chosen": -825.0, "logps/rejected": -1272.0, "loss": 0.5221, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9951171875, "rewards/margins": 6.9453125, "rewards/rejected": -5.95703125, "step": 3289 }, { "epoch": 0.6216637536019651, "grad_norm": 2.264546691561343, "learning_rate": 4.391075600830727e-07, "logits/chosen": 3.359375, "logits/rejected": 2.91015625, "logps/chosen": -750.5, "logps/rejected": -672.0, "loss": 0.6105, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0859375, "rewards/margins": 4.171875, "rewards/rejected": -4.2578125, "step": 3290 }, { "epoch": 0.6218527091501724, "grad_norm": 3.9279646261338232, "learning_rate": 4.388199253224643e-07, "logits/chosen": 1.9580078125, "logits/rejected": 2.376953125, "logps/chosen": -659.0, "logps/rejected": -778.0, "loss": 0.6076, "rewards/accuracies": 0.8125, "rewards/chosen": 0.07373046875, "rewards/margins": 4.2890625, "rewards/rejected": -4.21875, "step": 3291 }, { "epoch": 0.6220416646983797, "grad_norm": 1.6556669231058498, "learning_rate": 4.3853233893067973e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.15625, "logps/chosen": -738.0, "logps/rejected": -702.5, "loss": 0.6466, "rewards/accuracies": 0.6875, "rewards/chosen": 0.560546875, "rewards/margins": 3.9765625, "rewards/rejected": -3.4140625, "step": 3292 }, { "epoch": 0.622230620246587, "grad_norm": 2.338011742447001, "learning_rate": 4.382448010328337e-07, "logits/chosen": 3.2421875, "logits/rejected": 2.98828125, "logps/chosen": -598.5, "logps/rejected": -1096.0, "loss": 0.6017, "rewards/accuracies": 0.875, "rewards/chosen": -0.0883941650390625, "rewards/margins": 4.60546875, "rewards/rejected": -4.69140625, "step": 3293 }, { "epoch": 0.6224195757947942, "grad_norm": 2.2956546058855816, "learning_rate": 4.3795731175401886e-07, "logits/chosen": 2.6640625, "logits/rejected": 2.759765625, "logps/chosen": -722.0, "logps/rejected": -727.0, "loss": 0.533, "rewards/accuracies": 0.875, "rewards/chosen": 0.11669921875, "rewards/margins": 4.6171875, "rewards/rejected": -4.50390625, "step": 3294 }, { "epoch": 0.6226085313430015, "grad_norm": 1.899830230736856, "learning_rate": 4.3766987121930767e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.20703125, "logps/chosen": -850.0, "logps/rejected": -683.0, "loss": 0.5203, "rewards/accuracies": 0.84375, "rewards/chosen": 0.355224609375, "rewards/margins": 5.5859375, "rewards/rejected": -5.234375, "step": 3295 }, { "epoch": 0.6227974868912088, "grad_norm": 1.9036406588192327, "learning_rate": 4.373824795537506e-07, "logits/chosen": 2.064453125, "logits/rejected": 1.826171875, "logps/chosen": -838.0, "logps/rejected": -830.0, "loss": 0.6264, "rewards/accuracies": 0.71875, "rewards/chosen": -0.34033203125, "rewards/margins": 4.4169921875, "rewards/rejected": -4.765625, "step": 3296 }, { "epoch": 0.6229864424394161, "grad_norm": 1.9235799726999112, "learning_rate": 4.370951368823775e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.8046875, "logps/chosen": -705.0, "logps/rejected": -1009.5, "loss": 0.5204, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2626953125, "rewards/margins": 5.4921875, "rewards/rejected": -5.234375, "step": 3297 }, { "epoch": 0.6231753979876234, "grad_norm": 3.0546610446047158, "learning_rate": 4.368078433301964e-07, "logits/chosen": 3.1796875, "logits/rejected": 2.349609375, "logps/chosen": -641.5, "logps/rejected": -559.0, "loss": 0.5641, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2735595703125, "rewards/margins": 4.58203125, "rewards/rejected": -4.3046875, "step": 3298 }, { "epoch": 0.6233643535358308, "grad_norm": 2.800427909424057, "learning_rate": 4.365205990221944e-07, "logits/chosen": 3.08984375, "logits/rejected": 2.828125, "logps/chosen": -542.25, "logps/rejected": -576.0, "loss": 0.606, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6103515625, "rewards/margins": 3.94140625, "rewards/rejected": -3.33203125, "step": 3299 }, { "epoch": 0.6235533090840379, "grad_norm": 1.6248587472991853, "learning_rate": 4.3623340408333687e-07, "logits/chosen": 2.3203125, "logits/rejected": 2.28515625, "logps/chosen": -790.5, "logps/rejected": -929.0, "loss": 0.6929, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0712890625, "rewards/margins": 3.7431640625, "rewards/rejected": -3.663818359375, "step": 3300 }, { "epoch": 0.6237422646322452, "grad_norm": 2.391807559186066, "learning_rate": 4.359462586385677e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.486328125, "logps/chosen": -928.0, "logps/rejected": -983.0, "loss": 0.5142, "rewards/accuracies": 0.8125, "rewards/chosen": 1.46630859375, "rewards/margins": 5.76171875, "rewards/rejected": -4.314453125, "step": 3301 }, { "epoch": 0.6239312201804526, "grad_norm": 1.8028971707778807, "learning_rate": 4.356591628128097e-07, "logits/chosen": 2.0087890625, "logits/rejected": 1.8837890625, "logps/chosen": -748.0, "logps/rejected": -1747.0, "loss": 0.5205, "rewards/accuracies": 0.84375, "rewards/chosen": 1.06396484375, "rewards/margins": 6.828125, "rewards/rejected": -5.7734375, "step": 3302 }, { "epoch": 0.6241201757286599, "grad_norm": 1.88877187088777, "learning_rate": 4.353721167309632e-07, "logits/chosen": 2.689453125, "logits/rejected": 2.02734375, "logps/chosen": -1170.0, "logps/rejected": -906.0, "loss": 0.5638, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4697265625, "rewards/margins": 4.66796875, "rewards/rejected": -4.1953125, "step": 3303 }, { "epoch": 0.6243091312768672, "grad_norm": 1.7067227427560334, "learning_rate": 4.350851205179077e-07, "logits/chosen": 2.1279296875, "logits/rejected": 2.01025390625, "logps/chosen": -789.0, "logps/rejected": -1760.0, "loss": 0.4213, "rewards/accuracies": 0.875, "rewards/chosen": 1.0849609375, "rewards/margins": 8.2109375, "rewards/rejected": -7.125, "step": 3304 }, { "epoch": 0.6244980868250743, "grad_norm": 3.0963465693606937, "learning_rate": 4.3479817429850076e-07, "logits/chosen": 3.61328125, "logits/rejected": 3.58203125, "logps/chosen": -478.0, "logps/rejected": -437.0, "loss": 0.7432, "rewards/accuracies": 0.59375, "rewards/chosen": 0.7386474609375, "rewards/margins": 2.142578125, "rewards/rejected": -1.40087890625, "step": 3305 }, { "epoch": 0.6246870423732817, "grad_norm": 1.7667295037662556, "learning_rate": 4.3451127819757803e-07, "logits/chosen": 2.375, "logits/rejected": 2.634765625, "logps/chosen": -729.5, "logps/rejected": -764.0, "loss": 0.5437, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5634765625, "rewards/margins": 5.453125, "rewards/rejected": -4.888671875, "step": 3306 }, { "epoch": 0.624875997921489, "grad_norm": 6.443554005803872, "learning_rate": 4.342244323399539e-07, "logits/chosen": 2.71484375, "logits/rejected": 2.7421875, "logps/chosen": -518.5, "logps/rejected": -712.0, "loss": 0.6419, "rewards/accuracies": 0.78125, "rewards/chosen": 0.279296875, "rewards/margins": 3.75, "rewards/rejected": -3.47265625, "step": 3307 }, { "epoch": 0.6250649534696963, "grad_norm": 1.9124917313267311, "learning_rate": 4.3393763685041974e-07, "logits/chosen": 2.267578125, "logits/rejected": 2.59375, "logps/chosen": -983.0, "logps/rejected": -987.0, "loss": 0.5784, "rewards/accuracies": 0.75, "rewards/chosen": 0.7998046875, "rewards/margins": 4.703125, "rewards/rejected": -3.8984375, "step": 3308 }, { "epoch": 0.6252539090179036, "grad_norm": 2.047361462494746, "learning_rate": 4.336508918537464e-07, "logits/chosen": 2.890625, "logits/rejected": 2.1240234375, "logps/chosen": -1152.0, "logps/rejected": -1206.0, "loss": 0.4758, "rewards/accuracies": 0.8125, "rewards/chosen": 1.312255859375, "rewards/margins": 7.2421875, "rewards/rejected": -5.9296875, "step": 3309 }, { "epoch": 0.6254428645661109, "grad_norm": 2.467920139217747, "learning_rate": 4.333641974746818e-07, "logits/chosen": 2.34375, "logits/rejected": 1.7861328125, "logps/chosen": -594.5, "logps/rejected": -575.0, "loss": 0.6416, "rewards/accuracies": 0.75, "rewards/chosen": 0.70654296875, "rewards/margins": 3.923828125, "rewards/rejected": -3.21875, "step": 3310 }, { "epoch": 0.6256318201143181, "grad_norm": 2.516925867504486, "learning_rate": 4.330775538379524e-07, "logits/chosen": 2.537109375, "logits/rejected": 2.3720703125, "logps/chosen": -469.0, "logps/rejected": -1729.0, "loss": 0.6133, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4530029296875, "rewards/margins": 8.0546875, "rewards/rejected": -7.59375, "step": 3311 }, { "epoch": 0.6258207756625254, "grad_norm": 2.1224840880066473, "learning_rate": 4.3279096106826195e-07, "logits/chosen": 2.13671875, "logits/rejected": 2.14453125, "logps/chosen": -1071.0, "logps/rejected": -934.0, "loss": 0.5486, "rewards/accuracies": 0.875, "rewards/chosen": 1.560546875, "rewards/margins": 5.51953125, "rewards/rejected": -3.970703125, "step": 3312 }, { "epoch": 0.6260097312107327, "grad_norm": 1.7408322353263717, "learning_rate": 4.325044192902928e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.88671875, "logps/chosen": -903.0, "logps/rejected": -1099.0, "loss": 0.5616, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6494140625, "rewards/margins": 5.125, "rewards/rejected": -4.4765625, "step": 3313 }, { "epoch": 0.62619868675894, "grad_norm": 3.108857840555722, "learning_rate": 4.3221792862870464e-07, "logits/chosen": 2.330078125, "logits/rejected": 2.84375, "logps/chosen": -676.5, "logps/rejected": -771.0, "loss": 0.7137, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0821533203125, "rewards/margins": 4.224609375, "rewards/rejected": -4.30078125, "step": 3314 }, { "epoch": 0.6263876423071473, "grad_norm": 2.059326933667393, "learning_rate": 4.3193148920813503e-07, "logits/chosen": 2.6015625, "logits/rejected": 2.1826171875, "logps/chosen": -930.5, "logps/rejected": -866.0, "loss": 0.615, "rewards/accuracies": 0.8125, "rewards/chosen": 0.043212890625, "rewards/margins": 4.3203125, "rewards/rejected": -4.27734375, "step": 3315 }, { "epoch": 0.6265765978553546, "grad_norm": 2.0762736272539604, "learning_rate": 4.316451011531996e-07, "logits/chosen": 2.99609375, "logits/rejected": 2.5859375, "logps/chosen": -773.5, "logps/rejected": -960.0, "loss": 0.5713, "rewards/accuracies": 0.75, "rewards/chosen": 0.79345703125, "rewards/margins": 7.259765625, "rewards/rejected": -6.46142578125, "step": 3316 }, { "epoch": 0.6267655534035618, "grad_norm": 5.617522144679025, "learning_rate": 4.3135876458849073e-07, "logits/chosen": 2.884765625, "logits/rejected": 2.40478515625, "logps/chosen": -1515.0, "logps/rejected": -1116.5, "loss": 0.5296, "rewards/accuracies": 0.8125, "rewards/chosen": 1.988525390625, "rewards/margins": 5.73046875, "rewards/rejected": -3.75, "step": 3317 }, { "epoch": 0.6269545089517691, "grad_norm": 1.7100627741273384, "learning_rate": 4.310724796385794e-07, "logits/chosen": 3.1796875, "logits/rejected": 2.833984375, "logps/chosen": -423.5, "logps/rejected": -486.0, "loss": 0.5628, "rewards/accuracies": 0.84375, "rewards/chosen": 0.58984375, "rewards/margins": 4.302734375, "rewards/rejected": -3.70703125, "step": 3318 }, { "epoch": 0.6271434644999764, "grad_norm": 3.2734681236838483, "learning_rate": 4.307862464280134e-07, "logits/chosen": 2.6748046875, "logits/rejected": 2.7607421875, "logps/chosen": -950.0, "logps/rejected": -1307.0, "loss": 0.5422, "rewards/accuracies": 0.875, "rewards/chosen": 1.49462890625, "rewards/margins": 5.76171875, "rewards/rejected": -4.28125, "step": 3319 }, { "epoch": 0.6273324200481837, "grad_norm": 3.80136842867361, "learning_rate": 4.305000650813186e-07, "logits/chosen": 2.33984375, "logits/rejected": 2.388671875, "logps/chosen": -859.5, "logps/rejected": -1503.0, "loss": 0.5319, "rewards/accuracies": 0.875, "rewards/chosen": 1.16455078125, "rewards/margins": 5.5625, "rewards/rejected": -4.404296875, "step": 3320 }, { "epoch": 0.627521375596391, "grad_norm": 2.904954570197036, "learning_rate": 4.30213935722998e-07, "logits/chosen": 2.296875, "logits/rejected": 2.404296875, "logps/chosen": -631.5, "logps/rejected": -593.5, "loss": 0.5718, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7559814453125, "rewards/margins": 4.62109375, "rewards/rejected": -3.8671875, "step": 3321 }, { "epoch": 0.6277103311445983, "grad_norm": 4.884995610828646, "learning_rate": 4.2992785847753176e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.279296875, "logps/chosen": -639.0, "logps/rejected": -845.0, "loss": 0.6125, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3057861328125, "rewards/margins": 4.203125, "rewards/rejected": -3.91015625, "step": 3322 }, { "epoch": 0.6278992866928055, "grad_norm": 3.7088561146557253, "learning_rate": 4.2964183346937786e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.56640625, "logps/chosen": -945.0, "logps/rejected": -992.0, "loss": 0.5895, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7159423828125, "rewards/margins": 5.00390625, "rewards/rejected": -4.296875, "step": 3323 }, { "epoch": 0.6280882422410128, "grad_norm": 2.60223071097804, "learning_rate": 4.2935586082297117e-07, "logits/chosen": 2.06640625, "logits/rejected": 2.189453125, "logps/chosen": -930.0, "logps/rejected": -952.0, "loss": 0.5679, "rewards/accuracies": 0.75, "rewards/chosen": 0.972900390625, "rewards/margins": 4.671875, "rewards/rejected": -3.69921875, "step": 3324 }, { "epoch": 0.6282771977892201, "grad_norm": 3.506313337296725, "learning_rate": 4.2906994066272394e-07, "logits/chosen": 2.1650390625, "logits/rejected": 1.9931640625, "logps/chosen": -415.5, "logps/rejected": -608.5, "loss": 0.5868, "rewards/accuracies": 0.78125, "rewards/chosen": 0.13427734375, "rewards/margins": 4.4375, "rewards/rejected": -4.30859375, "step": 3325 }, { "epoch": 0.6284661533374274, "grad_norm": 1.97129401584371, "learning_rate": 4.2878407311302544e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.24609375, "logps/chosen": -996.0, "logps/rejected": -931.0, "loss": 0.5073, "rewards/accuracies": 0.78125, "rewards/chosen": 0.85205078125, "rewards/margins": 4.7265625, "rewards/rejected": -3.87890625, "step": 3326 }, { "epoch": 0.6286551088856347, "grad_norm": 1.970368088399917, "learning_rate": 4.2849825829824235e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.662109375, "logps/chosen": -741.0, "logps/rejected": -834.5, "loss": 0.619, "rewards/accuracies": 0.75, "rewards/chosen": 0.2041015625, "rewards/margins": 4.984375, "rewards/rejected": -4.76171875, "step": 3327 }, { "epoch": 0.6288440644338419, "grad_norm": 2.646976885248606, "learning_rate": 4.2821249634271826e-07, "logits/chosen": 2.59375, "logits/rejected": 2.71875, "logps/chosen": -1012.5, "logps/rejected": -858.0, "loss": 0.6014, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4208984375, "rewards/margins": 2.234375, "rewards/rejected": -3.66015625, "step": 3328 }, { "epoch": 0.6290330199820492, "grad_norm": 3.0590042939240063, "learning_rate": 4.2792678737077357e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.30859375, "logps/chosen": -1114.5, "logps/rejected": -888.5, "loss": 0.6184, "rewards/accuracies": 0.71875, "rewards/chosen": 9.763671875, "rewards/margins": 12.501953125, "rewards/rejected": -2.78125, "step": 3329 }, { "epoch": 0.6292219755302565, "grad_norm": 3.142843068320741, "learning_rate": 4.2764113150670613e-07, "logits/chosen": 2.0390625, "logits/rejected": 2.4130859375, "logps/chosen": -659.0, "logps/rejected": -1313.0, "loss": 0.6403, "rewards/accuracies": 0.875, "rewards/chosen": 0.405548095703125, "rewards/margins": 4.955078125, "rewards/rejected": -4.546875, "step": 3330 }, { "epoch": 0.6294109310784638, "grad_norm": 2.01238290468048, "learning_rate": 4.2735552887478986e-07, "logits/chosen": 1.755859375, "logits/rejected": 2.0615234375, "logps/chosen": -14873.0, "logps/rejected": -941.0, "loss": 0.5663, "rewards/accuracies": 0.90625, "rewards/chosen": 138.69384765625, "rewards/margins": 142.390625, "rewards/rejected": -3.75390625, "step": 3331 }, { "epoch": 0.6295998866266711, "grad_norm": 2.774390935490397, "learning_rate": 4.270699795992763e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.3046875, "logps/chosen": -866.5, "logps/rejected": -1458.0, "loss": 0.5731, "rewards/accuracies": 0.78125, "rewards/chosen": 0.39013671875, "rewards/margins": 6.29296875, "rewards/rejected": -5.9140625, "step": 3332 }, { "epoch": 0.6297888421748784, "grad_norm": 2.2279488618924352, "learning_rate": 4.267844838043935e-07, "logits/chosen": 2.3564453125, "logits/rejected": 2.45947265625, "logps/chosen": -633.5, "logps/rejected": -1659.0, "loss": 0.5549, "rewards/accuracies": 0.875, "rewards/chosen": 0.31201171875, "rewards/margins": 5.4296875, "rewards/rejected": -5.12109375, "step": 3333 }, { "epoch": 0.6299777977230856, "grad_norm": 2.661895444007193, "learning_rate": 4.2649904161434604e-07, "logits/chosen": 2.05615234375, "logits/rejected": 1.91748046875, "logps/chosen": -740.0, "logps/rejected": -814.0, "loss": 0.4606, "rewards/accuracies": 0.84375, "rewards/chosen": 1.130859375, "rewards/margins": 5.65234375, "rewards/rejected": -4.51953125, "step": 3334 }, { "epoch": 0.6301667532712929, "grad_norm": 1.6368571516449402, "learning_rate": 4.262136531533157e-07, "logits/chosen": 2.126953125, "logits/rejected": 1.892578125, "logps/chosen": -830.0, "logps/rejected": -865.5, "loss": 0.4958, "rewards/accuracies": 0.875, "rewards/chosen": 1.0, "rewards/margins": 7.140625, "rewards/rejected": -6.12890625, "step": 3335 }, { "epoch": 0.6303557088195002, "grad_norm": 3.0447266682528795, "learning_rate": 4.259283185454602e-07, "logits/chosen": 1.63720703125, "logits/rejected": 1.5615234375, "logps/chosen": -758.5, "logps/rejected": -715.5, "loss": 0.5507, "rewards/accuracies": 0.875, "rewards/chosen": 0.75, "rewards/margins": 5.12109375, "rewards/rejected": -4.36328125, "step": 3336 }, { "epoch": 0.6305446643677075, "grad_norm": 2.032031027229107, "learning_rate": 4.2564303791491454e-07, "logits/chosen": 1.955078125, "logits/rejected": 1.845703125, "logps/chosen": -491.0, "logps/rejected": -625.5, "loss": 0.5842, "rewards/accuracies": 0.8125, "rewards/chosen": 0.029052734375, "rewards/margins": 4.58203125, "rewards/rejected": -4.5546875, "step": 3337 }, { "epoch": 0.6307336199159148, "grad_norm": 5.053765485642212, "learning_rate": 4.253578113857895e-07, "logits/chosen": 2.625, "logits/rejected": 2.419921875, "logps/chosen": -596.0, "logps/rejected": -752.0, "loss": 0.5901, "rewards/accuracies": 0.90625, "rewards/chosen": 0.457275390625, "rewards/margins": 4.703125, "rewards/rejected": -4.23828125, "step": 3338 }, { "epoch": 0.6309225754641221, "grad_norm": 2.401858211958623, "learning_rate": 4.2507263908217297e-07, "logits/chosen": 2.068359375, "logits/rejected": 1.69140625, "logps/chosen": -803.0, "logps/rejected": -1166.0, "loss": 0.4948, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5224609375, "rewards/margins": 4.9375, "rewards/rejected": -4.4140625, "step": 3339 }, { "epoch": 0.6311115310123293, "grad_norm": 1.8509419207306226, "learning_rate": 4.2478752112812886e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.83203125, "logps/chosen": -768.5, "logps/rejected": -1546.0, "loss": 0.4544, "rewards/accuracies": 0.9375, "rewards/chosen": 1.017578125, "rewards/margins": 6.8515625, "rewards/rejected": -5.83984375, "step": 3340 }, { "epoch": 0.6313004865605366, "grad_norm": 2.418982331533679, "learning_rate": 4.2450245764769776e-07, "logits/chosen": 3.3828125, "logits/rejected": 3.1171875, "logps/chosen": -1028.0, "logps/rejected": -1079.0, "loss": 0.435, "rewards/accuracies": 0.875, "rewards/chosen": 1.330078125, "rewards/margins": 6.4609375, "rewards/rejected": -5.125, "step": 3341 }, { "epoch": 0.6314894421087439, "grad_norm": 2.363602265516444, "learning_rate": 4.242174487648963e-07, "logits/chosen": 2.423828125, "logits/rejected": 2.23828125, "logps/chosen": -681.5, "logps/rejected": -1638.0, "loss": 0.5547, "rewards/accuracies": 0.78125, "rewards/chosen": 0.92041015625, "rewards/margins": 7.640625, "rewards/rejected": -6.734375, "step": 3342 }, { "epoch": 0.6316783976569512, "grad_norm": 2.721022588398586, "learning_rate": 4.239324946037172e-07, "logits/chosen": 2.80859375, "logits/rejected": 2.61328125, "logps/chosen": -1091.0, "logps/rejected": -1978.0, "loss": 0.5255, "rewards/accuracies": 0.84375, "rewards/chosen": 1.376953125, "rewards/margins": 8.24609375, "rewards/rejected": -6.86328125, "step": 3343 }, { "epoch": 0.6318673532051585, "grad_norm": 3.8857996191416175, "learning_rate": 4.2364759528812977e-07, "logits/chosen": 2.38671875, "logits/rejected": 2.6015625, "logps/chosen": -696.5, "logps/rejected": -750.0, "loss": 0.7215, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2109375, "rewards/margins": 2.607421875, "rewards/rejected": -2.396484375, "step": 3344 }, { "epoch": 0.6320563087533658, "grad_norm": 2.447880783118275, "learning_rate": 4.2336275094207917e-07, "logits/chosen": 2.982421875, "logits/rejected": 2.41796875, "logps/chosen": -1118.0, "logps/rejected": -704.5, "loss": 0.5322, "rewards/accuracies": 0.84375, "rewards/chosen": 0.041015625, "rewards/margins": 3.875, "rewards/rejected": -3.828125, "step": 3345 }, { "epoch": 0.632245264301573, "grad_norm": 1.6430101697275141, "learning_rate": 4.2307796168948685e-07, "logits/chosen": 2.208984375, "logits/rejected": 1.76025390625, "logps/chosen": -684.0, "logps/rejected": -705.5, "loss": 0.5635, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8701171875, "rewards/margins": 4.32421875, "rewards/rejected": -3.44921875, "step": 3346 }, { "epoch": 0.6324342198497803, "grad_norm": 3.064042672174086, "learning_rate": 4.227932276542504e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.20703125, "logps/chosen": -745.0, "logps/rejected": -874.0, "loss": 0.5907, "rewards/accuracies": 0.71875, "rewards/chosen": 0.61773681640625, "rewards/margins": 3.34765625, "rewards/rejected": -2.724609375, "step": 3347 }, { "epoch": 0.6326231753979876, "grad_norm": 3.2840049689721083, "learning_rate": 4.225085489602429e-07, "logits/chosen": 2.2109375, "logits/rejected": 2.298828125, "logps/chosen": -675.0, "logps/rejected": -625.0, "loss": 0.7251, "rewards/accuracies": 0.75, "rewards/chosen": -0.071044921875, "rewards/margins": 3.138671875, "rewards/rejected": -3.20703125, "step": 3348 }, { "epoch": 0.6328121309461949, "grad_norm": 2.2414002452705035, "learning_rate": 4.222239257313137e-07, "logits/chosen": 2.875, "logits/rejected": 2.9609375, "logps/chosen": -806.0, "logps/rejected": -907.0, "loss": 0.5524, "rewards/accuracies": 0.9375, "rewards/chosen": 0.74853515625, "rewards/margins": 7.734375, "rewards/rejected": -6.984375, "step": 3349 }, { "epoch": 0.6330010864944022, "grad_norm": 2.031423532526352, "learning_rate": 4.21939358091288e-07, "logits/chosen": 2.30859375, "logits/rejected": 1.869140625, "logps/chosen": -1061.0, "logps/rejected": -1573.0, "loss": 0.4653, "rewards/accuracies": 0.875, "rewards/chosen": 0.8359375, "rewards/margins": 7.20703125, "rewards/rejected": -6.3671875, "step": 3350 }, { "epoch": 0.6331900420426094, "grad_norm": 2.0172484974236884, "learning_rate": 4.2165484616396686e-07, "logits/chosen": 2.908203125, "logits/rejected": 2.375, "logps/chosen": -1081.0, "logps/rejected": -889.0, "loss": 0.4735, "rewards/accuracies": 0.90625, "rewards/chosen": 1.220703125, "rewards/margins": 5.0703125, "rewards/rejected": -3.84765625, "step": 3351 }, { "epoch": 0.6333789975908167, "grad_norm": 3.500161335168872, "learning_rate": 4.213703900731268e-07, "logits/chosen": 2.04296875, "logits/rejected": 2.01171875, "logps/chosen": -744.0, "logps/rejected": -1468.5, "loss": 0.6803, "rewards/accuracies": 0.75, "rewards/chosen": 0.5537109375, "rewards/margins": 4.212890625, "rewards/rejected": -3.66796875, "step": 3352 }, { "epoch": 0.633567953139024, "grad_norm": 1.749186514506694, "learning_rate": 4.2108598994252043e-07, "logits/chosen": 2.640625, "logits/rejected": 2.078125, "logps/chosen": -774.0, "logps/rejected": -1083.0, "loss": 0.4646, "rewards/accuracies": 0.84375, "rewards/chosen": 1.158203125, "rewards/margins": 9.84765625, "rewards/rejected": -8.68359375, "step": 3353 }, { "epoch": 0.6337569086872313, "grad_norm": 1.8282662254489912, "learning_rate": 4.20801645895876e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.4697265625, "logps/chosen": -999.0, "logps/rejected": -820.0, "loss": 0.599, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2412109375, "rewards/margins": 4.71484375, "rewards/rejected": -4.46484375, "step": 3354 }, { "epoch": 0.6339458642354386, "grad_norm": 1.3940630713035524, "learning_rate": 4.205173580568968e-07, "logits/chosen": 2.87890625, "logits/rejected": 2.408203125, "logps/chosen": -744.5, "logps/rejected": -820.5, "loss": 0.545, "rewards/accuracies": 0.84375, "rewards/chosen": 0.573486328125, "rewards/margins": 5.66015625, "rewards/rejected": -5.0859375, "step": 3355 }, { "epoch": 0.6341348197836459, "grad_norm": 1.994243741882459, "learning_rate": 4.2023312654926255e-07, "logits/chosen": 2.6796875, "logits/rejected": 2.958984375, "logps/chosen": -695.0, "logps/rejected": -856.0, "loss": 0.6042, "rewards/accuracies": 0.75, "rewards/chosen": 0.784423828125, "rewards/margins": 4.6171875, "rewards/rejected": -3.8359375, "step": 3356 }, { "epoch": 0.6343237753318531, "grad_norm": 1.7452171950592732, "learning_rate": 4.1994895149662754e-07, "logits/chosen": 3.33203125, "logits/rejected": 2.6328125, "logps/chosen": -821.5, "logps/rejected": -1235.0, "loss": 0.5399, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4697265625, "rewards/margins": 5.37890625, "rewards/rejected": -5.8671875, "step": 3357 }, { "epoch": 0.6345127308800604, "grad_norm": 3.2234147314088304, "learning_rate": 4.1966483302262245e-07, "logits/chosen": 2.6484375, "logits/rejected": 2.263671875, "logps/chosen": -1017.0, "logps/rejected": -904.0, "loss": 0.6498, "rewards/accuracies": 0.75, "rewards/chosen": 0.7470703125, "rewards/margins": 3.978515625, "rewards/rejected": -3.23046875, "step": 3358 }, { "epoch": 0.6347016864282677, "grad_norm": 5.024545488598471, "learning_rate": 4.193807712508524e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.8125, "logps/chosen": -1286.0, "logps/rejected": -912.0, "loss": 0.5452, "rewards/accuracies": 0.8125, "rewards/chosen": 7.708984375, "rewards/margins": 11.83203125, "rewards/rejected": -4.1171875, "step": 3359 }, { "epoch": 0.634890641976475, "grad_norm": 4.72683300298047, "learning_rate": 4.190967663048984e-07, "logits/chosen": 3.609375, "logits/rejected": 3.5625, "logps/chosen": -906.0, "logps/rejected": -1163.0, "loss": 0.7095, "rewards/accuracies": 0.71875, "rewards/chosen": -0.281494140625, "rewards/margins": 5.9375, "rewards/rejected": -6.21875, "step": 3360 }, { "epoch": 0.6350795975246823, "grad_norm": 3.6707887736995595, "learning_rate": 4.1881281830831696e-07, "logits/chosen": 2.845703125, "logits/rejected": 2.7890625, "logps/chosen": -1032.0, "logps/rejected": -1165.0, "loss": 0.5075, "rewards/accuracies": 0.90625, "rewards/chosen": 0.981201171875, "rewards/margins": 5.74609375, "rewards/rejected": -4.76171875, "step": 3361 }, { "epoch": 0.6352685530728897, "grad_norm": 2.371782926071866, "learning_rate": 4.185289273846391e-07, "logits/chosen": 3.5546875, "logits/rejected": 3.4375, "logps/chosen": -938.0, "logps/rejected": -1229.0, "loss": 0.4153, "rewards/accuracies": 0.96875, "rewards/chosen": 1.84765625, "rewards/margins": 7.1484375, "rewards/rejected": -5.29296875, "step": 3362 }, { "epoch": 0.6354575086210968, "grad_norm": 4.112833075778218, "learning_rate": 4.182450936573715e-07, "logits/chosen": 1.96484375, "logits/rejected": 1.8046875, "logps/chosen": -806.0, "logps/rejected": -755.5, "loss": 0.622, "rewards/accuracies": 0.78125, "rewards/chosen": 0.77056884765625, "rewards/margins": 5.0625, "rewards/rejected": -4.30078125, "step": 3363 }, { "epoch": 0.6356464641693041, "grad_norm": 3.645409444630861, "learning_rate": 4.179613172499958e-07, "logits/chosen": 2.31640625, "logits/rejected": 1.791015625, "logps/chosen": -637.5, "logps/rejected": -831.5, "loss": 0.6646, "rewards/accuracies": 0.78125, "rewards/chosen": 0.390380859375, "rewards/margins": 4.11328125, "rewards/rejected": -3.71484375, "step": 3364 }, { "epoch": 0.6358354197175115, "grad_norm": 3.653270305539192, "learning_rate": 4.1767759828596906e-07, "logits/chosen": 4.06640625, "logits/rejected": 3.69921875, "logps/chosen": -1484.0, "logps/rejected": -789.0, "loss": 0.5448, "rewards/accuracies": 0.84375, "rewards/chosen": -0.673095703125, "rewards/margins": 3.27734375, "rewards/rejected": -3.953125, "step": 3365 }, { "epoch": 0.6360243752657188, "grad_norm": 2.3789789292917862, "learning_rate": 4.173939368887226e-07, "logits/chosen": 2.529296875, "logits/rejected": 2.193359375, "logps/chosen": -694.0, "logps/rejected": -808.5, "loss": 0.5472, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5224609375, "rewards/margins": 5.390625, "rewards/rejected": -4.87890625, "step": 3366 }, { "epoch": 0.6362133308139261, "grad_norm": 2.5822053893660835, "learning_rate": 4.171103331816633e-07, "logits/chosen": 1.88623046875, "logits/rejected": 1.583984375, "logps/chosen": -463.5, "logps/rejected": -635.0, "loss": 0.6385, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3746337890625, "rewards/margins": 3.240234375, "rewards/rejected": -2.865234375, "step": 3367 }, { "epoch": 0.6364022863621334, "grad_norm": 2.285343608643748, "learning_rate": 4.16826787288173e-07, "logits/chosen": 3.142578125, "logits/rejected": 3.263671875, "logps/chosen": -575.0, "logps/rejected": -1378.0, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": -0.12841796875, "rewards/margins": 5.01171875, "rewards/rejected": -5.140625, "step": 3368 }, { "epoch": 0.6365912419103406, "grad_norm": 2.859323373085165, "learning_rate": 4.1654329933160795e-07, "logits/chosen": 3.34375, "logits/rejected": 3.037109375, "logps/chosen": -821.0, "logps/rejected": -747.5, "loss": 0.6961, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10888671875, "rewards/margins": 3.8564453125, "rewards/rejected": -3.751953125, "step": 3369 }, { "epoch": 0.6367801974585479, "grad_norm": 3.7790659237103816, "learning_rate": 4.1625986943529944e-07, "logits/chosen": 2.8203125, "logits/rejected": 2.357421875, "logps/chosen": -584.5, "logps/rejected": -789.5, "loss": 0.5463, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9111328125, "rewards/margins": 5.1953125, "rewards/rejected": -4.28125, "step": 3370 }, { "epoch": 0.6369691530067552, "grad_norm": 4.303547573014119, "learning_rate": 4.159764977225535e-07, "logits/chosen": 3.10546875, "logits/rejected": 2.2265625, "logps/chosen": -706.75, "logps/rejected": -528.5, "loss": 0.6099, "rewards/accuracies": 0.78125, "rewards/chosen": 0.212890625, "rewards/margins": 4.24609375, "rewards/rejected": -4.03515625, "step": 3371 }, { "epoch": 0.6371581085549625, "grad_norm": 2.3340331690724145, "learning_rate": 4.1569318431665093e-07, "logits/chosen": 2.318359375, "logits/rejected": 2.7734375, "logps/chosen": -797.0, "logps/rejected": -677.0, "loss": 0.4798, "rewards/accuracies": 0.875, "rewards/chosen": 0.48822021484375, "rewards/margins": 5.046875, "rewards/rejected": -4.5546875, "step": 3372 }, { "epoch": 0.6373470641031698, "grad_norm": 2.7981411194636197, "learning_rate": 4.1540992934084674e-07, "logits/chosen": 3.375, "logits/rejected": 2.59765625, "logps/chosen": -762.0, "logps/rejected": -1515.5, "loss": 0.4707, "rewards/accuracies": 0.84375, "rewards/chosen": 1.37890625, "rewards/margins": 6.05859375, "rewards/rejected": -4.6875, "step": 3373 }, { "epoch": 0.637536019651377, "grad_norm": 1.9335940507203464, "learning_rate": 4.1512673291837106e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.49609375, "logps/chosen": -668.0, "logps/rejected": -762.0, "loss": 0.6105, "rewards/accuracies": 0.78125, "rewards/chosen": 0.372802734375, "rewards/margins": 4.27734375, "rewards/rejected": -3.8984375, "step": 3374 }, { "epoch": 0.6377249751995843, "grad_norm": 2.4638635821119643, "learning_rate": 4.1484359517242837e-07, "logits/chosen": 2.490234375, "logits/rejected": 2.0234375, "logps/chosen": -770.0, "logps/rejected": -1048.0, "loss": 0.5502, "rewards/accuracies": 0.78125, "rewards/chosen": 1.044189453125, "rewards/margins": 6.87890625, "rewards/rejected": -5.8359375, "step": 3375 }, { "epoch": 0.6379139307477916, "grad_norm": 2.4305741464906316, "learning_rate": 4.145605162261975e-07, "logits/chosen": 2.19482421875, "logits/rejected": 1.361328125, "logps/chosen": -742.0, "logps/rejected": -516.5, "loss": 0.6553, "rewards/accuracies": 0.875, "rewards/chosen": 0.5673828125, "rewards/margins": 3.2353515625, "rewards/rejected": -2.666015625, "step": 3376 }, { "epoch": 0.6381028862959989, "grad_norm": 2.464479368295937, "learning_rate": 4.1427749620283183e-07, "logits/chosen": 2.8828125, "logits/rejected": 2.7109375, "logps/chosen": -692.0, "logps/rejected": -977.0, "loss": 0.5159, "rewards/accuracies": 0.8125, "rewards/chosen": 1.198486328125, "rewards/margins": 6.052734375, "rewards/rejected": -4.85546875, "step": 3377 }, { "epoch": 0.6382918418442062, "grad_norm": 2.3871638822635988, "learning_rate": 4.139945352254589e-07, "logits/chosen": 3.5859375, "logits/rejected": 3.60546875, "logps/chosen": -579.75, "logps/rejected": -709.5, "loss": 0.7339, "rewards/accuracies": 0.6875, "rewards/chosen": 0.603271484375, "rewards/margins": 2.849609375, "rewards/rejected": -2.24169921875, "step": 3378 }, { "epoch": 0.6384807973924135, "grad_norm": 3.7788716318684035, "learning_rate": 4.1371163341718084e-07, "logits/chosen": 3.109375, "logits/rejected": 3.35546875, "logps/chosen": -645.5, "logps/rejected": -870.0, "loss": 0.5022, "rewards/accuracies": 0.84375, "rewards/chosen": 1.361328125, "rewards/margins": 4.70703125, "rewards/rejected": -3.337890625, "step": 3379 }, { "epoch": 0.6386697529406207, "grad_norm": 2.4532890942914585, "learning_rate": 4.1342879090107383e-07, "logits/chosen": 2.6474609375, "logits/rejected": 2.255859375, "logps/chosen": -594.25, "logps/rejected": -1591.0, "loss": 0.566, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7001953125, "rewards/margins": 4.65234375, "rewards/rejected": -3.9453125, "step": 3380 }, { "epoch": 0.638858708488828, "grad_norm": 2.014936114840227, "learning_rate": 4.1314600780018836e-07, "logits/chosen": 2.556640625, "logits/rejected": 2.41796875, "logps/chosen": -639.5, "logps/rejected": -860.0, "loss": 0.5498, "rewards/accuracies": 0.875, "rewards/chosen": 1.580078125, "rewards/margins": 7.078125, "rewards/rejected": -5.4921875, "step": 3381 }, { "epoch": 0.6390476640370353, "grad_norm": 2.615348229833145, "learning_rate": 4.1286328423754934e-07, "logits/chosen": 2.531005859375, "logits/rejected": 2.84765625, "logps/chosen": -720.5, "logps/rejected": -2164.0, "loss": 0.6262, "rewards/accuracies": 0.75, "rewards/chosen": 0.62548828125, "rewards/margins": 6.8671875, "rewards/rejected": -6.234375, "step": 3382 }, { "epoch": 0.6392366195852426, "grad_norm": 2.184680323037557, "learning_rate": 4.125806203361549e-07, "logits/chosen": 3.40625, "logits/rejected": 2.9296875, "logps/chosen": -872.0, "logps/rejected": -926.0, "loss": 0.5727, "rewards/accuracies": 0.84375, "rewards/chosen": 1.34375, "rewards/margins": 4.9296875, "rewards/rejected": -3.591796875, "step": 3383 }, { "epoch": 0.6394255751334499, "grad_norm": 3.3325161396768643, "learning_rate": 4.122980162189783e-07, "logits/chosen": 2.78125, "logits/rejected": 2.5126953125, "logps/chosen": -1152.5, "logps/rejected": -593.0, "loss": 0.6076, "rewards/accuracies": 0.84375, "rewards/chosen": -1.638671875, "rewards/margins": 1.974609375, "rewards/rejected": -3.61328125, "step": 3384 }, { "epoch": 0.6396145306816572, "grad_norm": 2.157407627628336, "learning_rate": 4.120154720089659e-07, "logits/chosen": 3.33984375, "logits/rejected": 3.65234375, "logps/chosen": -629.5, "logps/rejected": -1330.0, "loss": 0.6444, "rewards/accuracies": 0.78125, "rewards/chosen": 1.30029296875, "rewards/margins": 6.3671875, "rewards/rejected": -5.0703125, "step": 3385 }, { "epoch": 0.6398034862298644, "grad_norm": 2.0936453773995223, "learning_rate": 4.117329878290389e-07, "logits/chosen": 3.1015625, "logits/rejected": 2.654296875, "logps/chosen": -494.5, "logps/rejected": -622.0, "loss": 0.5232, "rewards/accuracies": 0.8125, "rewards/chosen": 1.05322265625, "rewards/margins": 4.63671875, "rewards/rejected": -3.58984375, "step": 3386 }, { "epoch": 0.6399924417780717, "grad_norm": 1.701295217187966, "learning_rate": 4.114505638020913e-07, "logits/chosen": 2.181640625, "logits/rejected": 2.185546875, "logps/chosen": -698.0, "logps/rejected": -620.0, "loss": 0.6676, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2265625, "rewards/margins": 4.18359375, "rewards/rejected": -3.9609375, "step": 3387 }, { "epoch": 0.640181397326279, "grad_norm": 1.193401319914007, "learning_rate": 4.1116820005099184e-07, "logits/chosen": 2.796875, "logits/rejected": 2.2734375, "logps/chosen": -863.5, "logps/rejected": -997.0, "loss": 0.5044, "rewards/accuracies": 0.90625, "rewards/chosen": 1.048828125, "rewards/margins": 6.1171875, "rewards/rejected": -5.06640625, "step": 3388 }, { "epoch": 0.6403703528744863, "grad_norm": 1.6224843417365875, "learning_rate": 4.108858966985825e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.640625, "logps/chosen": -933.0, "logps/rejected": -2636.0, "loss": 0.5776, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4482421875, "rewards/margins": 11.46875, "rewards/rejected": -11.025390625, "step": 3389 }, { "epoch": 0.6405593084226936, "grad_norm": 1.7002495439363916, "learning_rate": 4.106036538676795e-07, "logits/chosen": 2.984375, "logits/rejected": 3.03125, "logps/chosen": -919.0, "logps/rejected": -1092.0, "loss": 0.551, "rewards/accuracies": 0.875, "rewards/chosen": 1.296875, "rewards/margins": 5.12109375, "rewards/rejected": -3.8359375, "step": 3390 }, { "epoch": 0.6407482639709009, "grad_norm": 1.904936497887942, "learning_rate": 4.103214716810722e-07, "logits/chosen": 2.486328125, "logits/rejected": 2.57421875, "logps/chosen": -674.0, "logps/rejected": -593.5, "loss": 0.6231, "rewards/accuracies": 0.84375, "rewards/chosen": 0.123046875, "rewards/margins": 4.05078125, "rewards/rejected": -3.91796875, "step": 3391 }, { "epoch": 0.6409372195191081, "grad_norm": 4.527783902411896, "learning_rate": 4.100393502615236e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.09765625, "logps/chosen": -736.0, "logps/rejected": -736.5, "loss": 0.4408, "rewards/accuracies": 1.0, "rewards/chosen": 0.739990234375, "rewards/margins": 6.6953125, "rewards/rejected": -5.9609375, "step": 3392 }, { "epoch": 0.6411261750673154, "grad_norm": 2.241493403850739, "learning_rate": 4.097572897317709e-07, "logits/chosen": 2.5625, "logits/rejected": 2.287109375, "logps/chosen": -751.0, "logps/rejected": -672.5, "loss": 0.5671, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2255859375, "rewards/margins": 4.77734375, "rewards/rejected": -3.55078125, "step": 3393 }, { "epoch": 0.6413151306155227, "grad_norm": 2.674565254035375, "learning_rate": 4.0947529021452385e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.5390625, "logps/chosen": -1025.0, "logps/rejected": -1323.0, "loss": 0.4966, "rewards/accuracies": 0.84375, "rewards/chosen": 0.560546875, "rewards/margins": 5.8203125, "rewards/rejected": -5.26171875, "step": 3394 }, { "epoch": 0.64150408616373, "grad_norm": 4.041403175003799, "learning_rate": 4.0919335183246663e-07, "logits/chosen": 2.51953125, "logits/rejected": 1.797119140625, "logps/chosen": -778.0, "logps/rejected": -751.0, "loss": 0.6215, "rewards/accuracies": 0.78125, "rewards/chosen": 0.17138671875, "rewards/margins": 4.200927734375, "rewards/rejected": -4.03125, "step": 3395 }, { "epoch": 0.6416930417119373, "grad_norm": 2.0906096078169805, "learning_rate": 4.0891147470825615e-07, "logits/chosen": 2.72265625, "logits/rejected": 2.09765625, "logps/chosen": -808.5, "logps/rejected": -779.0, "loss": 0.5946, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2939453125, "rewards/margins": 4.6103515625, "rewards/rejected": -3.314453125, "step": 3396 }, { "epoch": 0.6418819972601445, "grad_norm": 3.070642870809259, "learning_rate": 4.0862965896452283e-07, "logits/chosen": 2.931640625, "logits/rejected": 2.5318603515625, "logps/chosen": -1386.0, "logps/rejected": -1538.0, "loss": 0.4812, "rewards/accuracies": 0.8125, "rewards/chosen": 1.94140625, "rewards/margins": 7.8125, "rewards/rejected": -5.859375, "step": 3397 }, { "epoch": 0.6420709528083518, "grad_norm": 1.7208905630584084, "learning_rate": 4.083479047238705e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.3828125, "logps/chosen": -1227.0, "logps/rejected": -1298.0, "loss": 0.3471, "rewards/accuracies": 1.0, "rewards/chosen": 2.31640625, "rewards/margins": 8.296875, "rewards/rejected": -5.96875, "step": 3398 }, { "epoch": 0.6422599083565591, "grad_norm": 1.758471370155073, "learning_rate": 4.0806621210887613e-07, "logits/chosen": 2.572265625, "logits/rejected": 2.5859375, "logps/chosen": -860.0, "logps/rejected": -878.0, "loss": 0.5368, "rewards/accuracies": 0.9375, "rewards/chosen": 1.388671875, "rewards/margins": 4.423828125, "rewards/rejected": -3.041015625, "step": 3399 }, { "epoch": 0.6424488639047664, "grad_norm": 2.1710224790948565, "learning_rate": 4.0778458124209023e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.2392578125, "logps/chosen": -1231.5, "logps/rejected": -1190.5, "loss": 0.5947, "rewards/accuracies": 0.8125, "rewards/chosen": 0.193115234375, "rewards/margins": 5.00390625, "rewards/rejected": -4.80859375, "step": 3400 }, { "epoch": 0.6426378194529737, "grad_norm": 1.5515874840248307, "learning_rate": 4.075030122460356e-07, "logits/chosen": 3.28125, "logits/rejected": 2.8671875, "logps/chosen": -957.0, "logps/rejected": -1255.0, "loss": 0.511, "rewards/accuracies": 0.8125, "rewards/chosen": 1.9189453125, "rewards/margins": 7.21875, "rewards/rejected": -5.294921875, "step": 3401 }, { "epoch": 0.642826775001181, "grad_norm": 2.068903023724022, "learning_rate": 4.0722150524320896e-07, "logits/chosen": 2.085205078125, "logits/rejected": 1.95745849609375, "logps/chosen": -1065.0, "logps/rejected": -1847.0, "loss": 0.5938, "rewards/accuracies": 0.875, "rewards/chosen": 0.820556640625, "rewards/margins": 6.4140625, "rewards/rejected": -5.59375, "step": 3402 }, { "epoch": 0.6430157305493882, "grad_norm": 1.9866543140027266, "learning_rate": 4.0694006035607986e-07, "logits/chosen": 2.162109375, "logits/rejected": 1.673828125, "logps/chosen": -894.0, "logps/rejected": -868.0, "loss": 0.5477, "rewards/accuracies": 0.75, "rewards/chosen": 1.2939453125, "rewards/margins": 4.80078125, "rewards/rejected": -3.51171875, "step": 3403 }, { "epoch": 0.6432046860975955, "grad_norm": 2.249673391917069, "learning_rate": 4.066586777070905e-07, "logits/chosen": 3.00390625, "logits/rejected": 3.0078125, "logps/chosen": -585.75, "logps/rejected": -827.0, "loss": 0.6094, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4814453125, "rewards/margins": 5.12109375, "rewards/rejected": -4.62890625, "step": 3404 }, { "epoch": 0.6433936416458028, "grad_norm": 2.048537897079192, "learning_rate": 4.0637735741865653e-07, "logits/chosen": 2.96875, "logits/rejected": 3.146484375, "logps/chosen": -657.5, "logps/rejected": -2462.0, "loss": 0.6451, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37060546875, "rewards/margins": 6.94140625, "rewards/rejected": -6.55859375, "step": 3405 }, { "epoch": 0.6435825971940101, "grad_norm": 2.1265609786566615, "learning_rate": 4.060960996131657e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.62109375, "logps/chosen": -991.5, "logps/rejected": -1136.0, "loss": 0.531, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3994140625, "rewards/margins": 5.515625, "rewards/rejected": -4.109375, "step": 3406 }, { "epoch": 0.6437715527422174, "grad_norm": 1.6943176765029486, "learning_rate": 4.058149044129795e-07, "logits/chosen": 3.07421875, "logits/rejected": 2.615234375, "logps/chosen": -560.0, "logps/rejected": -797.0, "loss": 0.6491, "rewards/accuracies": 0.65625, "rewards/chosen": 1.17529296875, "rewards/margins": 3.375, "rewards/rejected": -2.1953125, "step": 3407 }, { "epoch": 0.6439605082904247, "grad_norm": 3.236531674178731, "learning_rate": 4.055337719404315e-07, "logits/chosen": 3.369140625, "logits/rejected": 3.482421875, "logps/chosen": -430.5, "logps/rejected": -570.0, "loss": 0.6379, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0166015625, "rewards/margins": 3.5703125, "rewards/rejected": -2.5546875, "step": 3408 }, { "epoch": 0.6441494638386319, "grad_norm": 2.3813761460686336, "learning_rate": 4.0525270231782816e-07, "logits/chosen": 2.5546875, "logits/rejected": 2.28125, "logps/chosen": -924.0, "logps/rejected": -891.0, "loss": 0.571, "rewards/accuracies": 0.8125, "rewards/chosen": 1.27294921875, "rewards/margins": 5.046875, "rewards/rejected": -3.7734375, "step": 3409 }, { "epoch": 0.6443384193868392, "grad_norm": 4.042382036522, "learning_rate": 4.0497169566744895e-07, "logits/chosen": 2.8125, "logits/rejected": 2.84375, "logps/chosen": -749.5, "logps/rejected": -863.0, "loss": 0.6017, "rewards/accuracies": 0.8125, "rewards/chosen": 1.521484375, "rewards/margins": 4.81640625, "rewards/rejected": -3.2890625, "step": 3410 }, { "epoch": 0.6445273749350465, "grad_norm": 2.026412021459701, "learning_rate": 4.046907521115452e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.98046875, "logps/chosen": -594.0, "logps/rejected": -1083.0, "loss": 0.6341, "rewards/accuracies": 0.8125, "rewards/chosen": 0.775390625, "rewards/margins": 4.2353515625, "rewards/rejected": -3.46484375, "step": 3411 }, { "epoch": 0.6447163304832538, "grad_norm": 2.9982991775832892, "learning_rate": 4.0440987177234176e-07, "logits/chosen": 2.84375, "logits/rejected": 2.978515625, "logps/chosen": -744.0, "logps/rejected": -1671.5, "loss": 0.5758, "rewards/accuracies": 0.78125, "rewards/chosen": 1.169708251953125, "rewards/margins": 8.0234375, "rewards/rejected": -6.857421875, "step": 3412 }, { "epoch": 0.6449052860314611, "grad_norm": 1.7556003495006913, "learning_rate": 4.0412905477203497e-07, "logits/chosen": 2.3359375, "logits/rejected": 2.189453125, "logps/chosen": -1036.0, "logps/rejected": -967.5, "loss": 0.5409, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1201171875, "rewards/margins": 5.75, "rewards/rejected": -4.63671875, "step": 3413 }, { "epoch": 0.6450942415796684, "grad_norm": 1.8361841982608453, "learning_rate": 4.0384830123279455e-07, "logits/chosen": 1.845703125, "logits/rejected": 1.5029296875, "logps/chosen": -851.0, "logps/rejected": -1025.0, "loss": 0.3975, "rewards/accuracies": 0.96875, "rewards/chosen": 1.439453125, "rewards/margins": 6.75, "rewards/rejected": -5.3125, "step": 3414 }, { "epoch": 0.6452831971278756, "grad_norm": 2.0488970849722565, "learning_rate": 4.0356761127676153e-07, "logits/chosen": 2.5546875, "logits/rejected": 1.9375, "logps/chosen": -977.5, "logps/rejected": -819.0, "loss": 0.6268, "rewards/accuracies": 0.78125, "rewards/chosen": 1.21875, "rewards/margins": 3.224609375, "rewards/rejected": -2.00439453125, "step": 3415 }, { "epoch": 0.6454721526760829, "grad_norm": 1.8021221302904364, "learning_rate": 4.032869850260504e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.87109375, "logps/chosen": -795.0, "logps/rejected": -996.0, "loss": 0.607, "rewards/accuracies": 0.78125, "rewards/chosen": 0.91796875, "rewards/margins": 4.81640625, "rewards/rejected": -3.89453125, "step": 3416 }, { "epoch": 0.6456611082242902, "grad_norm": 2.551880980987454, "learning_rate": 4.0300642260274753e-07, "logits/chosen": 3.2265625, "logits/rejected": 3.06640625, "logps/chosen": -822.0, "logps/rejected": -642.0, "loss": 0.5478, "rewards/accuracies": 0.84375, "rewards/chosen": 1.31591796875, "rewards/margins": 5.12890625, "rewards/rejected": -3.81640625, "step": 3417 }, { "epoch": 0.6458500637724975, "grad_norm": 3.416323816052779, "learning_rate": 4.0272592412891105e-07, "logits/chosen": 1.9921875, "logits/rejected": 2.16796875, "logps/chosen": -745.0, "logps/rejected": -681.0, "loss": 0.5619, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1728515625, "rewards/margins": 4.265625, "rewards/rejected": -3.09765625, "step": 3418 }, { "epoch": 0.6460390193207048, "grad_norm": 2.0020285919281404, "learning_rate": 4.024454897265719e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.6796875, "logps/chosen": -1050.0, "logps/rejected": -1885.0, "loss": 0.5532, "rewards/accuracies": 0.84375, "rewards/chosen": 0.618896484375, "rewards/margins": 9.4921875, "rewards/rejected": -8.91015625, "step": 3419 }, { "epoch": 0.646227974868912, "grad_norm": 2.0656101596132848, "learning_rate": 4.0216511951773267e-07, "logits/chosen": 3.10546875, "logits/rejected": 2.61328125, "logps/chosen": -885.5, "logps/rejected": -606.5, "loss": 0.633, "rewards/accuracies": 0.78125, "rewards/chosen": 0.33544921875, "rewards/margins": 3.85546875, "rewards/rejected": -3.5234375, "step": 3420 }, { "epoch": 0.6464169304171193, "grad_norm": 1.7024465625782674, "learning_rate": 4.0188481362436867e-07, "logits/chosen": 2.9326171875, "logits/rejected": 2.3671875, "logps/chosen": -743.5, "logps/rejected": -1037.0, "loss": 0.421, "rewards/accuracies": 0.875, "rewards/chosen": 1.43505859375, "rewards/margins": 6.640625, "rewards/rejected": -5.2109375, "step": 3421 }, { "epoch": 0.6466058859653266, "grad_norm": 1.9367872352769622, "learning_rate": 4.0160457216842627e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.3671875, "logps/chosen": -812.0, "logps/rejected": -508.0, "loss": 0.5915, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0615234375, "rewards/margins": 4.375, "rewards/rejected": -3.3125, "step": 3422 }, { "epoch": 0.646794841513534, "grad_norm": 2.076531567767365, "learning_rate": 4.013243952718246e-07, "logits/chosen": 2.84375, "logits/rejected": 2.20703125, "logps/chosen": -725.0, "logps/rejected": -683.5, "loss": 0.631, "rewards/accuracies": 0.78125, "rewards/chosen": 0.885498046875, "rewards/margins": 3.7373046875, "rewards/rejected": -2.84375, "step": 3423 }, { "epoch": 0.6469837970617413, "grad_norm": 2.1510276282847816, "learning_rate": 4.0104428305645456e-07, "logits/chosen": 3.80859375, "logits/rejected": 3.78515625, "logps/chosen": -914.5, "logps/rejected": -2198.0, "loss": 0.6542, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4971504211425781, "rewards/margins": 9.2265625, "rewards/rejected": -8.734375, "step": 3424 }, { "epoch": 0.6471727526099486, "grad_norm": 2.571537219036983, "learning_rate": 4.007642356441786e-07, "logits/chosen": 2.521484375, "logits/rejected": 2.775390625, "logps/chosen": -524.5, "logps/rejected": -754.5, "loss": 0.6541, "rewards/accuracies": 0.78125, "rewards/chosen": -0.138427734375, "rewards/margins": 4.494140625, "rewards/rejected": -4.626953125, "step": 3425 }, { "epoch": 0.6473617081581557, "grad_norm": 2.224117433864253, "learning_rate": 4.0048425315683156e-07, "logits/chosen": 3.515625, "logits/rejected": 3.28515625, "logps/chosen": -805.0, "logps/rejected": -708.5, "loss": 0.7588, "rewards/accuracies": 0.6875, "rewards/chosen": 0.96923828125, "rewards/margins": 3.1328125, "rewards/rejected": -2.1630859375, "step": 3426 }, { "epoch": 0.647550663706363, "grad_norm": 2.20746428376895, "learning_rate": 4.0020433571621914e-07, "logits/chosen": 3.28515625, "logits/rejected": 3.01953125, "logps/chosen": -858.0, "logps/rejected": -974.0, "loss": 0.5908, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2900390625, "rewards/margins": 5.48828125, "rewards/rejected": -5.1953125, "step": 3427 }, { "epoch": 0.6477396192545704, "grad_norm": 6.1819677368972465, "learning_rate": 3.9992448344411945e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.640625, "logps/chosen": -727.0, "logps/rejected": -1011.0, "loss": 0.4799, "rewards/accuracies": 0.8125, "rewards/chosen": 0.938720703125, "rewards/margins": 7.6171875, "rewards/rejected": -6.6640625, "step": 3428 }, { "epoch": 0.6479285748027777, "grad_norm": 2.419951146513318, "learning_rate": 3.9964469646228203e-07, "logits/chosen": 3.359375, "logits/rejected": 3.61328125, "logps/chosen": -604.0, "logps/rejected": -976.0, "loss": 0.7723, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10009765625, "rewards/margins": 2.509765625, "rewards/rejected": -2.404296875, "step": 3429 }, { "epoch": 0.648117530350985, "grad_norm": 2.300921421656479, "learning_rate": 3.9936497489242813e-07, "logits/chosen": 2.806640625, "logits/rejected": 2.52734375, "logps/chosen": -568.5, "logps/rejected": -705.5, "loss": 0.5651, "rewards/accuracies": 0.9375, "rewards/chosen": 0.25732421875, "rewards/margins": 4.8046875, "rewards/rejected": -4.54296875, "step": 3430 }, { "epoch": 0.6483064858991923, "grad_norm": 2.077863516428213, "learning_rate": 3.9908531885625064e-07, "logits/chosen": 3.30859375, "logits/rejected": 2.7978515625, "logps/chosen": -963.0, "logps/rejected": -862.5, "loss": 0.5676, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5771484375, "rewards/margins": 10.78125, "rewards/rejected": -10.18359375, "step": 3431 }, { "epoch": 0.6484954414473995, "grad_norm": 2.359205219257759, "learning_rate": 3.9880572847541325e-07, "logits/chosen": 3.99609375, "logits/rejected": 3.5859375, "logps/chosen": -818.0, "logps/rejected": -1190.5, "loss": 0.5386, "rewards/accuracies": 0.9375, "rewards/chosen": 1.16015625, "rewards/margins": 6.890625, "rewards/rejected": -5.7421875, "step": 3432 }, { "epoch": 0.6486843969956068, "grad_norm": 1.4660608506435757, "learning_rate": 3.98526203871552e-07, "logits/chosen": 2.546875, "logits/rejected": 2.185546875, "logps/chosen": -1108.0, "logps/rejected": -875.0, "loss": 0.4498, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41650390625, "rewards/margins": 6.109375, "rewards/rejected": -5.6953125, "step": 3433 }, { "epoch": 0.6488733525438141, "grad_norm": 1.9534119483496701, "learning_rate": 3.9824674516627353e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.390625, "logps/chosen": -991.0, "logps/rejected": -896.0, "loss": 0.4989, "rewards/accuracies": 0.84375, "rewards/chosen": 0.54052734375, "rewards/margins": 5.8671875, "rewards/rejected": -5.32421875, "step": 3434 }, { "epoch": 0.6490623080920214, "grad_norm": 1.9960634686065362, "learning_rate": 3.9796735248115663e-07, "logits/chosen": 3.001953125, "logits/rejected": 2.91015625, "logps/chosen": -592.0, "logps/rejected": -637.0, "loss": 0.6302, "rewards/accuracies": 0.75, "rewards/chosen": -0.0322265625, "rewards/margins": 3.888671875, "rewards/rejected": -3.9140625, "step": 3435 }, { "epoch": 0.6492512636402287, "grad_norm": 1.7854649874246176, "learning_rate": 3.9768802593775064e-07, "logits/chosen": 3.009765625, "logits/rejected": 2.94921875, "logps/chosen": -1041.0, "logps/rejected": -1172.0, "loss": 0.4775, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3935546875, "rewards/margins": 6.1328125, "rewards/rejected": -4.7421875, "step": 3436 }, { "epoch": 0.649440219188436, "grad_norm": 1.7704026818868355, "learning_rate": 3.974087656575763e-07, "logits/chosen": 3.93359375, "logits/rejected": 3.625, "logps/chosen": -659.5, "logps/rejected": -652.0, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": 0.453125, "rewards/margins": 4.62109375, "rewards/rejected": -4.17578125, "step": 3437 }, { "epoch": 0.6496291747366432, "grad_norm": 2.501126539373989, "learning_rate": 3.9712957176212593e-07, "logits/chosen": 3.03125, "logits/rejected": 3.04296875, "logps/chosen": -980.0, "logps/rejected": -896.5, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": 0.35107421875, "rewards/margins": 5.01953125, "rewards/rejected": -4.66796875, "step": 3438 }, { "epoch": 0.6498181302848505, "grad_norm": 1.9509570349393393, "learning_rate": 3.968504443728624e-07, "logits/chosen": 3.03515625, "logits/rejected": 3.35546875, "logps/chosen": -785.0, "logps/rejected": -992.0, "loss": 0.5309, "rewards/accuracies": 0.8125, "rewards/chosen": 1.37060546875, "rewards/margins": 6.4921875, "rewards/rejected": -5.125, "step": 3439 }, { "epoch": 0.6500070858330578, "grad_norm": 2.3157689729730833, "learning_rate": 3.9657138361122014e-07, "logits/chosen": 2.533203125, "logits/rejected": 1.849609375, "logps/chosen": -632.5, "logps/rejected": -541.5, "loss": 0.5994, "rewards/accuracies": 0.78125, "rewards/chosen": 0.59814453125, "rewards/margins": 3.66796875, "rewards/rejected": -3.072265625, "step": 3440 }, { "epoch": 0.6501960413812651, "grad_norm": 2.251243304535657, "learning_rate": 3.962923895986041e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.7265625, "logps/chosen": -963.5, "logps/rejected": -1849.0, "loss": 0.4762, "rewards/accuracies": 0.84375, "rewards/chosen": 0.96044921875, "rewards/margins": 7.0625, "rewards/rejected": -6.109375, "step": 3441 }, { "epoch": 0.6503849969294724, "grad_norm": 1.7234977908368923, "learning_rate": 3.960134624563907e-07, "logits/chosen": 2.115234375, "logits/rejected": 2.3203125, "logps/chosen": -909.5, "logps/rejected": -659.0, "loss": 0.593, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0654296875, "rewards/margins": 3.81640625, "rewards/rejected": -3.88671875, "step": 3442 }, { "epoch": 0.6505739524776796, "grad_norm": 2.0493089546962557, "learning_rate": 3.9573460230592684e-07, "logits/chosen": 3.4140625, "logits/rejected": 3.3046875, "logps/chosen": -925.0, "logps/rejected": -2326.0, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": 0.875, "rewards/margins": 10.46875, "rewards/rejected": -9.61328125, "step": 3443 }, { "epoch": 0.6507629080258869, "grad_norm": 2.7017197699940403, "learning_rate": 3.9545580926853053e-07, "logits/chosen": 3.32421875, "logits/rejected": 2.77734375, "logps/chosen": -590.75, "logps/rejected": -1530.0, "loss": 0.6877, "rewards/accuracies": 0.8125, "rewards/chosen": 0.064453125, "rewards/margins": 5.48046875, "rewards/rejected": -5.427734375, "step": 3444 }, { "epoch": 0.6509518635740942, "grad_norm": 2.3080765599350794, "learning_rate": 3.9517708346549074e-07, "logits/chosen": 2.544921875, "logits/rejected": 2.583984375, "logps/chosen": -1230.0, "logps/rejected": -1130.0, "loss": 0.5742, "rewards/accuracies": 0.8125, "rewards/chosen": 1.44921875, "rewards/margins": 5.83203125, "rewards/rejected": -4.3828125, "step": 3445 }, { "epoch": 0.6511408191223015, "grad_norm": 1.8509515104061198, "learning_rate": 3.948984250180665e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.62890625, "logps/chosen": -938.0, "logps/rejected": -1076.0, "loss": 0.46, "rewards/accuracies": 0.90625, "rewards/chosen": 1.26806640625, "rewards/margins": 7.1484375, "rewards/rejected": -5.86328125, "step": 3446 }, { "epoch": 0.6513297746705088, "grad_norm": 2.6508856074326936, "learning_rate": 3.9461983404748843e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.078125, "logps/chosen": -788.5, "logps/rejected": -717.5, "loss": 0.5537, "rewards/accuracies": 0.84375, "rewards/chosen": 1.05078125, "rewards/margins": 4.625, "rewards/rejected": -3.57421875, "step": 3447 }, { "epoch": 0.6515187302187161, "grad_norm": 1.6670735344265293, "learning_rate": 3.94341310674957e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.779296875, "logps/chosen": -1015.0, "logps/rejected": -879.0, "loss": 0.5804, "rewards/accuracies": 0.8125, "rewards/chosen": 0.96697998046875, "rewards/margins": 4.69140625, "rewards/rejected": -3.7265625, "step": 3448 }, { "epoch": 0.6517076857669233, "grad_norm": 2.3526862243842395, "learning_rate": 3.940628550216439e-07, "logits/chosen": 2.84375, "logits/rejected": 2.673828125, "logps/chosen": -831.5, "logps/rejected": -833.0, "loss": 0.7012, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3505859375, "rewards/margins": 3.421875, "rewards/rejected": -3.076171875, "step": 3449 }, { "epoch": 0.6518966413151306, "grad_norm": 2.545098223324085, "learning_rate": 3.9378446720869095e-07, "logits/chosen": 2.8203125, "logits/rejected": 2.3798828125, "logps/chosen": -1088.0, "logps/rejected": -1761.0, "loss": 0.5598, "rewards/accuracies": 0.78125, "rewards/chosen": 0.990234375, "rewards/margins": 6.58203125, "rewards/rejected": -5.5859375, "step": 3450 }, { "epoch": 0.6520855968633379, "grad_norm": 4.965044247052867, "learning_rate": 3.9350614735721056e-07, "logits/chosen": 2.583984375, "logits/rejected": 2.45703125, "logps/chosen": -12342.0, "logps/rejected": -917.0, "loss": 0.6129, "rewards/accuracies": 0.78125, "rewards/chosen": 16.63720703125, "rewards/margins": 21.0546875, "rewards/rejected": -4.37890625, "step": 3451 }, { "epoch": 0.6522745524115452, "grad_norm": 1.482308052973365, "learning_rate": 3.9322789558828606e-07, "logits/chosen": 2.259765625, "logits/rejected": 2.462890625, "logps/chosen": -787.0, "logps/rejected": -1574.0, "loss": 0.5326, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37451171875, "rewards/margins": 7.30859375, "rewards/rejected": -6.9375, "step": 3452 }, { "epoch": 0.6524635079597525, "grad_norm": 4.512821749662487, "learning_rate": 3.9294971202297e-07, "logits/chosen": 3.93359375, "logits/rejected": 3.482421875, "logps/chosen": -796.0, "logps/rejected": -886.0, "loss": 0.6162, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58935546875, "rewards/margins": 5.01953125, "rewards/rejected": -4.421875, "step": 3453 }, { "epoch": 0.6526524635079598, "grad_norm": 1.4851213418724865, "learning_rate": 3.9267159678228654e-07, "logits/chosen": 2.310546875, "logits/rejected": 2.513671875, "logps/chosen": -928.0, "logps/rejected": -1058.0, "loss": 0.6016, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6201171875, "rewards/margins": 4.125, "rewards/rejected": -3.4921875, "step": 3454 }, { "epoch": 0.652841419056167, "grad_norm": 3.434356298865597, "learning_rate": 3.923935499872292e-07, "logits/chosen": 3.62109375, "logits/rejected": 3.28125, "logps/chosen": -1230.0, "logps/rejected": -1803.0, "loss": 0.5078, "rewards/accuracies": 0.84375, "rewards/chosen": 1.71484375, "rewards/margins": 8.15625, "rewards/rejected": -6.4453125, "step": 3455 }, { "epoch": 0.6530303746043743, "grad_norm": 3.870233469581852, "learning_rate": 3.921155717587624e-07, "logits/chosen": 3.1484375, "logits/rejected": 3.10546875, "logps/chosen": -835.0, "logps/rejected": -15644.0, "loss": 0.4821, "rewards/accuracies": 0.8125, "rewards/chosen": 1.08984375, "rewards/margins": 60.48046875, "rewards/rejected": -59.421875, "step": 3456 }, { "epoch": 0.6532193301525816, "grad_norm": 2.4482418818677822, "learning_rate": 3.9183766221782e-07, "logits/chosen": 3.5546875, "logits/rejected": 3.21875, "logps/chosen": -1152.0, "logps/rejected": -954.0, "loss": 0.5263, "rewards/accuracies": 0.84375, "rewards/chosen": 1.02734375, "rewards/margins": 5.3984375, "rewards/rejected": -4.36328125, "step": 3457 }, { "epoch": 0.6534082857007889, "grad_norm": 2.4805279390631303, "learning_rate": 3.9155982148530655e-07, "logits/chosen": 2.875, "logits/rejected": 2.64453125, "logps/chosen": -694.0, "logps/rejected": -944.0, "loss": 0.657, "rewards/accuracies": 0.75, "rewards/chosen": 0.1456298828125, "rewards/margins": 3.970703125, "rewards/rejected": -3.82421875, "step": 3458 }, { "epoch": 0.6535972412489962, "grad_norm": 2.767936613166433, "learning_rate": 3.9128204968209664e-07, "logits/chosen": 3.1328125, "logits/rejected": 3.0234375, "logps/chosen": -865.0, "logps/rejected": -675.5, "loss": 0.6179, "rewards/accuracies": 0.75, "rewards/chosen": 0.2308349609375, "rewards/margins": 3.556640625, "rewards/rejected": -3.3203125, "step": 3459 }, { "epoch": 0.6537861967972035, "grad_norm": 2.2731311594353256, "learning_rate": 3.910043469290345e-07, "logits/chosen": 3.4921875, "logits/rejected": 3.16015625, "logps/chosen": -1266.0, "logps/rejected": -1431.0, "loss": 0.3919, "rewards/accuracies": 0.9375, "rewards/chosen": 2.9541015625, "rewards/margins": 13.328125, "rewards/rejected": -10.3828125, "step": 3460 }, { "epoch": 0.6539751523454107, "grad_norm": 2.394678097230197, "learning_rate": 3.9072671334693495e-07, "logits/chosen": 3.21484375, "logits/rejected": 2.509765625, "logps/chosen": -901.0, "logps/rejected": -728.5, "loss": 0.6289, "rewards/accuracies": 0.78125, "rewards/chosen": 0.30078125, "rewards/margins": 3.56689453125, "rewards/rejected": -3.265625, "step": 3461 }, { "epoch": 0.654164107893618, "grad_norm": 4.141895106426401, "learning_rate": 3.904491490565817e-07, "logits/chosen": 2.65625, "logits/rejected": 2.55859375, "logps/chosen": -874.5, "logps/rejected": -1570.0, "loss": 0.5671, "rewards/accuracies": 0.8125, "rewards/chosen": 1.12738037109375, "rewards/margins": 7.5, "rewards/rejected": -6.3828125, "step": 3462 }, { "epoch": 0.6543530634418253, "grad_norm": 2.580993543848566, "learning_rate": 3.901716541787293e-07, "logits/chosen": 2.640625, "logits/rejected": 2.59765625, "logps/chosen": -692.0, "logps/rejected": -974.5, "loss": 0.5721, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4140625, "rewards/margins": 5.25390625, "rewards/rejected": -3.83984375, "step": 3463 }, { "epoch": 0.6545420189900326, "grad_norm": 2.031005133278227, "learning_rate": 3.898942288341017e-07, "logits/chosen": 2.75390625, "logits/rejected": 3.0390625, "logps/chosen": -733.5, "logps/rejected": -1430.5, "loss": 0.5568, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2490234375, "rewards/margins": 6.23046875, "rewards/rejected": -4.9921875, "step": 3464 }, { "epoch": 0.6547309745382399, "grad_norm": 1.9463946339188238, "learning_rate": 3.896168731433925e-07, "logits/chosen": 2.693359375, "logits/rejected": 2.755859375, "logps/chosen": -925.0, "logps/rejected": -828.5, "loss": 0.52, "rewards/accuracies": 0.875, "rewards/chosen": 0.4404296875, "rewards/margins": 4.91796875, "rewards/rejected": -4.48046875, "step": 3465 }, { "epoch": 0.6549199300864471, "grad_norm": 1.4371062914002297, "learning_rate": 3.8933958722726545e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.701171875, "logps/chosen": -568.0, "logps/rejected": -611.0, "loss": 0.6645, "rewards/accuracies": 0.75, "rewards/chosen": 0.84619140625, "rewards/margins": 3.91015625, "rewards/rejected": -3.05908203125, "step": 3466 }, { "epoch": 0.6551088856346544, "grad_norm": 1.625979259278613, "learning_rate": 3.890623712063532e-07, "logits/chosen": 2.41796875, "logits/rejected": 2.388671875, "logps/chosen": -849.0, "logps/rejected": -989.5, "loss": 0.5274, "rewards/accuracies": 0.84375, "rewards/chosen": 1.36376953125, "rewards/margins": 5.484375, "rewards/rejected": -4.115234375, "step": 3467 }, { "epoch": 0.6552978411828617, "grad_norm": 1.4175495319526967, "learning_rate": 3.8878522520125865e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.357421875, "logps/chosen": -1056.0, "logps/rejected": -1045.0, "loss": 0.4394, "rewards/accuracies": 0.8125, "rewards/chosen": 2.50390625, "rewards/margins": 6.71875, "rewards/rejected": -4.2265625, "step": 3468 }, { "epoch": 0.655486796731069, "grad_norm": 3.246437007723214, "learning_rate": 3.8850814933255396e-07, "logits/chosen": 3.373046875, "logits/rejected": 3.3125, "logps/chosen": -678.5, "logps/rejected": -638.0, "loss": 0.5856, "rewards/accuracies": 0.75, "rewards/chosen": 1.9580078125, "rewards/margins": 3.8984375, "rewards/rejected": -1.93896484375, "step": 3469 }, { "epoch": 0.6556757522792763, "grad_norm": 1.7324893439611493, "learning_rate": 3.8823114372078093e-07, "logits/chosen": 3.640625, "logits/rejected": 3.17578125, "logps/chosen": -484.5, "logps/rejected": -669.5, "loss": 0.5981, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6903076171875, "rewards/margins": 4.005859375, "rewards/rejected": -3.30859375, "step": 3470 }, { "epoch": 0.6558647078274836, "grad_norm": 2.210079505951319, "learning_rate": 3.8795420848645033e-07, "logits/chosen": 2.251953125, "logits/rejected": 1.40087890625, "logps/chosen": -633.5, "logps/rejected": -538.5, "loss": 0.6461, "rewards/accuracies": 0.875, "rewards/chosen": 0.9189453125, "rewards/margins": 3.220703125, "rewards/rejected": -2.310546875, "step": 3471 }, { "epoch": 0.6560536633756908, "grad_norm": 1.9562354377163453, "learning_rate": 3.876773437500429e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.87890625, "logps/chosen": -513.5, "logps/rejected": -1000.0, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": 1.09765625, "rewards/margins": 6.1484375, "rewards/rejected": -5.05078125, "step": 3472 }, { "epoch": 0.6562426189238981, "grad_norm": 1.8424582548050146, "learning_rate": 3.874005496320086e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.828125, "logps/chosen": -655.5, "logps/rejected": -713.0, "loss": 0.7314, "rewards/accuracies": 0.71875, "rewards/chosen": 0.380096435546875, "rewards/margins": 2.4892578125, "rewards/rejected": -2.11328125, "step": 3473 }, { "epoch": 0.6564315744721054, "grad_norm": 1.9454101165669118, "learning_rate": 3.871238262527664e-07, "logits/chosen": 2.021484375, "logits/rejected": 1.958251953125, "logps/chosen": -807.0, "logps/rejected": -793.0, "loss": 0.498, "rewards/accuracies": 0.875, "rewards/chosen": 2.0556640625, "rewards/margins": 4.7265625, "rewards/rejected": -2.662109375, "step": 3474 }, { "epoch": 0.6566205300203127, "grad_norm": 1.4057373444537775, "learning_rate": 3.868471737327047e-07, "logits/chosen": 1.6962890625, "logits/rejected": 1.30322265625, "logps/chosen": -1206.0, "logps/rejected": -1031.0, "loss": 0.4231, "rewards/accuracies": 0.9375, "rewards/chosen": 2.091796875, "rewards/margins": 6.9375, "rewards/rejected": -4.8515625, "step": 3475 }, { "epoch": 0.65680948556852, "grad_norm": 2.6810391994939526, "learning_rate": 3.865705921921809e-07, "logits/chosen": 1.4375, "logits/rejected": 1.25048828125, "logps/chosen": -520.5, "logps/rejected": -14783.5, "loss": 0.6013, "rewards/accuracies": 0.75, "rewards/chosen": 0.52752685546875, "rewards/margins": -114.40234375, "rewards/rejected": 114.62109375, "step": 3476 }, { "epoch": 0.6569984411167273, "grad_norm": 1.953709923205291, "learning_rate": 3.8629408175152165e-07, "logits/chosen": 2.6640625, "logits/rejected": 3.12109375, "logps/chosen": -915.0, "logps/rejected": -1162.0, "loss": 0.6117, "rewards/accuracies": 0.71875, "rewards/chosen": 1.19873046875, "rewards/margins": 5.0859375, "rewards/rejected": -3.88671875, "step": 3477 }, { "epoch": 0.6571873966649345, "grad_norm": 1.528071806706184, "learning_rate": 3.860176425310228e-07, "logits/chosen": 2.83203125, "logits/rejected": 3.072265625, "logps/chosen": -1897.0, "logps/rejected": -1125.0, "loss": 0.4569, "rewards/accuracies": 0.875, "rewards/chosen": -2.3828125, "rewards/margins": 2.4453125, "rewards/rejected": -4.82421875, "step": 3478 }, { "epoch": 0.6573763522131418, "grad_norm": 3.3179014140472716, "learning_rate": 3.8574127465094876e-07, "logits/chosen": 3.015625, "logits/rejected": 2.181640625, "logps/chosen": -1016.0, "logps/rejected": -754.0, "loss": 0.458, "rewards/accuracies": 0.90625, "rewards/chosen": 1.634521484375, "rewards/margins": 6.09375, "rewards/rejected": -4.45703125, "step": 3479 }, { "epoch": 0.6575653077613491, "grad_norm": 4.189614912987583, "learning_rate": 3.8546497823153367e-07, "logits/chosen": 3.38671875, "logits/rejected": 2.583984375, "logps/chosen": -866.0, "logps/rejected": -668.5, "loss": 0.7413, "rewards/accuracies": 0.75, "rewards/chosen": 0.103515625, "rewards/margins": 3.728515625, "rewards/rejected": -3.62109375, "step": 3480 }, { "epoch": 0.6577542633095564, "grad_norm": 2.2173823946354942, "learning_rate": 3.851887533929796e-07, "logits/chosen": 3.00390625, "logits/rejected": 2.734375, "logps/chosen": -917.0, "logps/rejected": -933.0, "loss": 0.5426, "rewards/accuracies": 0.84375, "rewards/chosen": 1.423828125, "rewards/margins": 6.171875, "rewards/rejected": -4.75, "step": 3481 }, { "epoch": 0.6579432188577637, "grad_norm": 3.6692727496997275, "learning_rate": 3.8491260025545845e-07, "logits/chosen": 2.875, "logits/rejected": 2.8359375, "logps/chosen": -856.0, "logps/rejected": -893.5, "loss": 0.5582, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3817138671875, "rewards/margins": 4.560546875, "rewards/rejected": -3.1826171875, "step": 3482 }, { "epoch": 0.658132174405971, "grad_norm": 1.468674658067142, "learning_rate": 3.846365189391102e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.501953125, "logps/chosen": -713.5, "logps/rejected": -838.0, "loss": 0.4646, "rewards/accuracies": 0.875, "rewards/chosen": 1.04931640625, "rewards/margins": 5.96875, "rewards/rejected": -4.9296875, "step": 3483 }, { "epoch": 0.6583211299541782, "grad_norm": 2.1404193331116037, "learning_rate": 3.843605095640441e-07, "logits/chosen": 3.193359375, "logits/rejected": 2.98046875, "logps/chosen": -776.0, "logps/rejected": -900.0, "loss": 0.5255, "rewards/accuracies": 0.78125, "rewards/chosen": 1.158935546875, "rewards/margins": 4.1171875, "rewards/rejected": -2.95703125, "step": 3484 }, { "epoch": 0.6585100855023855, "grad_norm": 2.1575970983392274, "learning_rate": 3.840845722503375e-07, "logits/chosen": 3.4921875, "logits/rejected": 2.50390625, "logps/chosen": -707.5, "logps/rejected": -729.0, "loss": 0.5063, "rewards/accuracies": 0.875, "rewards/chosen": 1.2275390625, "rewards/margins": 5.2890625, "rewards/rejected": -4.0703125, "step": 3485 }, { "epoch": 0.6586990410505928, "grad_norm": 1.9081208173075348, "learning_rate": 3.838087071180369e-07, "logits/chosen": 2.759765625, "logits/rejected": 2.91015625, "logps/chosen": -580.5, "logps/rejected": -1481.5, "loss": 0.6698, "rewards/accuracies": 0.78125, "rewards/chosen": 0.321533203125, "rewards/margins": 5.21484375, "rewards/rejected": -4.90234375, "step": 3486 }, { "epoch": 0.6588879965988002, "grad_norm": 2.5747988511802267, "learning_rate": 3.835329142871575e-07, "logits/chosen": 3.9609375, "logits/rejected": 3.71875, "logps/chosen": -553.75, "logps/rejected": -499.25, "loss": 0.685, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6953125, "rewards/margins": 2.9609375, "rewards/rejected": -2.26953125, "step": 3487 }, { "epoch": 0.6590769521470075, "grad_norm": 2.9306070695360593, "learning_rate": 3.832571938776825e-07, "logits/chosen": 3.3515625, "logits/rejected": 2.525390625, "logps/chosen": -1236.0, "logps/rejected": -1238.0, "loss": 0.573, "rewards/accuracies": 0.78125, "rewards/chosen": 1.91552734375, "rewards/margins": 5.66015625, "rewards/rejected": -3.73828125, "step": 3488 }, { "epoch": 0.6592659076952146, "grad_norm": 3.5821069886749175, "learning_rate": 3.829815460095642e-07, "logits/chosen": 2.6201171875, "logits/rejected": 2.6806640625, "logps/chosen": -1237.0, "logps/rejected": -1177.0, "loss": 0.5825, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7802734375, "rewards/margins": 5.158203125, "rewards/rejected": -4.390625, "step": 3489 }, { "epoch": 0.659454863243422, "grad_norm": 2.2856297504128436, "learning_rate": 3.827059708027226e-07, "logits/chosen": 3.06640625, "logits/rejected": 3.107421875, "logps/chosen": -425.0, "logps/rejected": -570.0, "loss": 0.6214, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2861328125, "rewards/margins": 4.5078125, "rewards/rejected": -4.2109375, "step": 3490 }, { "epoch": 0.6596438187916293, "grad_norm": 2.7415572485931805, "learning_rate": 3.8243046837704705e-07, "logits/chosen": 2.916015625, "logits/rejected": 2.669921875, "logps/chosen": -658.5, "logps/rejected": -714.0, "loss": 0.7076, "rewards/accuracies": 0.6875, "rewards/chosen": -0.510009765625, "rewards/margins": 3.0068359375, "rewards/rejected": -3.515625, "step": 3491 }, { "epoch": 0.6598327743398366, "grad_norm": 1.9881906450716427, "learning_rate": 3.821550388523943e-07, "logits/chosen": 2.0, "logits/rejected": 1.4775390625, "logps/chosen": -926.5, "logps/rejected": -1304.0, "loss": 0.5437, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0247802734375, "rewards/margins": 5.71484375, "rewards/rejected": -4.6875, "step": 3492 }, { "epoch": 0.6600217298880439, "grad_norm": 3.020163367262508, "learning_rate": 3.818796823485898e-07, "logits/chosen": 2.603515625, "logits/rejected": 2.3740234375, "logps/chosen": -803.0, "logps/rejected": -884.0, "loss": 0.6468, "rewards/accuracies": 0.8125, "rewards/chosen": -0.230224609375, "rewards/margins": 4.3564453125, "rewards/rejected": -4.5859375, "step": 3493 }, { "epoch": 0.6602106854362512, "grad_norm": 7.461690315519121, "learning_rate": 3.8160439898542764e-07, "logits/chosen": 3.4140625, "logits/rejected": 3.5390625, "logps/chosen": -968.5, "logps/rejected": -855.0, "loss": 0.6492, "rewards/accuracies": 0.8125, "rewards/chosen": 0.697265625, "rewards/margins": 4.419921875, "rewards/rejected": -3.728515625, "step": 3494 }, { "epoch": 0.6603996409844584, "grad_norm": 3.1575585100287675, "learning_rate": 3.8132918888266927e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.578125, "logps/chosen": -666.0, "logps/rejected": -1654.0, "loss": 0.5032, "rewards/accuracies": 1.0, "rewards/chosen": 0.9736328125, "rewards/margins": 6.984375, "rewards/rejected": -6.0078125, "step": 3495 }, { "epoch": 0.6605885965326657, "grad_norm": 2.077590636410348, "learning_rate": 3.8105405216004504e-07, "logits/chosen": 3.15234375, "logits/rejected": 3.04296875, "logps/chosen": -559.5, "logps/rejected": -823.0, "loss": 0.7044, "rewards/accuracies": 0.71875, "rewards/chosen": 0.134521484375, "rewards/margins": 3.82421875, "rewards/rejected": -3.6923828125, "step": 3496 }, { "epoch": 0.660777552080873, "grad_norm": 1.9883492841973471, "learning_rate": 3.8077898893725247e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.169921875, "logps/chosen": -1032.0, "logps/rejected": -1159.0, "loss": 0.5186, "rewards/accuracies": 0.8125, "rewards/chosen": 0.88916015625, "rewards/margins": 9.59375, "rewards/rejected": -8.734375, "step": 3497 }, { "epoch": 0.6609665076290803, "grad_norm": 1.984853985511826, "learning_rate": 3.8050399933395814e-07, "logits/chosen": 2.234375, "logits/rejected": 1.587890625, "logps/chosen": -887.0, "logps/rejected": -737.5, "loss": 0.5686, "rewards/accuracies": 0.84375, "rewards/chosen": 0.45361328125, "rewards/margins": 5.32421875, "rewards/rejected": -4.875, "step": 3498 }, { "epoch": 0.6611554631772876, "grad_norm": 4.365808409945374, "learning_rate": 3.802290834697959e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.5888671875, "logps/chosen": -772.0, "logps/rejected": -821.0, "loss": 0.6425, "rewards/accuracies": 0.75, "rewards/chosen": 0.4140625, "rewards/margins": 3.27783203125, "rewards/rejected": -2.857421875, "step": 3499 }, { "epoch": 0.6613444187254949, "grad_norm": 2.4785261960881684, "learning_rate": 3.799542414643677e-07, "logits/chosen": 3.296875, "logits/rejected": 3.68359375, "logps/chosen": -829.5, "logps/rejected": -705.0, "loss": 0.5884, "rewards/accuracies": 0.75, "rewards/chosen": 0.75, "rewards/margins": 4.8671875, "rewards/rejected": -4.125, "step": 3500 }, { "epoch": 0.6615333742737021, "grad_norm": 5.624466233202155, "learning_rate": 3.796794734372437e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.662109375, "logps/chosen": -1236.0, "logps/rejected": -911.0, "loss": 0.6328, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6009521484375, "rewards/margins": 4.806640625, "rewards/rejected": -3.205078125, "step": 3501 }, { "epoch": 0.6617223298219094, "grad_norm": 2.3325039993071335, "learning_rate": 3.794047795079611e-07, "logits/chosen": 2.203125, "logits/rejected": 2.0765380859375, "logps/chosen": -903.0, "logps/rejected": -1587.0, "loss": 0.5753, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5478515625, "rewards/margins": 8.2578125, "rewards/rejected": -7.72265625, "step": 3502 }, { "epoch": 0.6619112853701167, "grad_norm": 1.671124007767491, "learning_rate": 3.791301597960257e-07, "logits/chosen": 3.07421875, "logits/rejected": 2.6953125, "logps/chosen": -986.0, "logps/rejected": -705.0, "loss": 0.6701, "rewards/accuracies": 0.78125, "rewards/chosen": 0.185791015625, "rewards/margins": 3.5234375, "rewards/rejected": -3.3359375, "step": 3503 }, { "epoch": 0.662100240918324, "grad_norm": 2.068773837011549, "learning_rate": 3.788556144209104e-07, "logits/chosen": 4.2421875, "logits/rejected": 3.52734375, "logps/chosen": -6184.5, "logps/rejected": -792.5, "loss": 0.698, "rewards/accuracies": 0.71875, "rewards/chosen": -23.750213623046875, "rewards/margins": -19.53955078125, "rewards/rejected": -4.24609375, "step": 3504 }, { "epoch": 0.6622891964665313, "grad_norm": 3.3365996085384158, "learning_rate": 3.785811435020563e-07, "logits/chosen": 3.05078125, "logits/rejected": 2.392578125, "logps/chosen": -604.5, "logps/rejected": -622.0, "loss": 0.6029, "rewards/accuracies": 0.78125, "rewards/chosen": 0.20263671875, "rewards/margins": 4.21875, "rewards/rejected": -4.01171875, "step": 3505 }, { "epoch": 0.6624781520147386, "grad_norm": 1.7594367591802844, "learning_rate": 3.7830674715887126e-07, "logits/chosen": 2.2548828125, "logits/rejected": 1.8984375, "logps/chosen": -945.0, "logps/rejected": -856.0, "loss": 0.4947, "rewards/accuracies": 0.875, "rewards/chosen": 0.81884765625, "rewards/margins": 5.30078125, "rewards/rejected": -4.48046875, "step": 3506 }, { "epoch": 0.6626671075629458, "grad_norm": 2.8363277924811756, "learning_rate": 3.7803242551073166e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.59375, "logps/chosen": -716.5, "logps/rejected": -726.0, "loss": 0.7098, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3837890625, "rewards/margins": 3.0859375, "rewards/rejected": -3.47265625, "step": 3507 }, { "epoch": 0.6628560631111531, "grad_norm": 3.904070726964344, "learning_rate": 3.77758178676981e-07, "logits/chosen": 2.40234375, "logits/rejected": 2.34375, "logps/chosen": -715.5, "logps/rejected": -1635.5, "loss": 0.6007, "rewards/accuracies": 0.875, "rewards/chosen": 0.22509765625, "rewards/margins": 8.6015625, "rewards/rejected": -8.3671875, "step": 3508 }, { "epoch": 0.6630450186593604, "grad_norm": 2.908717768558519, "learning_rate": 3.7748400677692995e-07, "logits/chosen": 2.740234375, "logits/rejected": 3.625, "logps/chosen": -970.5, "logps/rejected": -2282.0, "loss": 0.5521, "rewards/accuracies": 0.875, "rewards/chosen": 0.3828125, "rewards/margins": 9.7265625, "rewards/rejected": -9.35546875, "step": 3509 }, { "epoch": 0.6632339742075677, "grad_norm": 2.042860813189588, "learning_rate": 3.772099099298571e-07, "logits/chosen": 3.3984375, "logits/rejected": 2.6015625, "logps/chosen": -784.0, "logps/rejected": -911.0, "loss": 0.6381, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09228515625, "rewards/margins": 4.26953125, "rewards/rejected": -4.18359375, "step": 3510 }, { "epoch": 0.663422929755775, "grad_norm": 3.14633569947779, "learning_rate": 3.76935888255008e-07, "logits/chosen": 2.65234375, "logits/rejected": 2.9453125, "logps/chosen": -1087.0, "logps/rejected": -1204.0, "loss": 0.4298, "rewards/accuracies": 0.90625, "rewards/chosen": 0.73828125, "rewards/margins": 6.3671875, "rewards/rejected": -5.6328125, "step": 3511 }, { "epoch": 0.6636118853039822, "grad_norm": 2.6279152560048016, "learning_rate": 3.766619418715955e-07, "logits/chosen": 3.169921875, "logits/rejected": 3.275390625, "logps/chosen": -10331.0, "logps/rejected": -1178.0, "loss": 0.6828, "rewards/accuracies": 0.78125, "rewards/chosen": -15.296875, "rewards/margins": -11.5390625, "rewards/rejected": -3.765625, "step": 3512 }, { "epoch": 0.6638008408521895, "grad_norm": 2.020904626383631, "learning_rate": 3.763880708987999e-07, "logits/chosen": 2.439453125, "logits/rejected": 2.3359375, "logps/chosen": -807.0, "logps/rejected": -774.0, "loss": 0.5452, "rewards/accuracies": 0.875, "rewards/chosen": 0.3568115234375, "rewards/margins": 4.81640625, "rewards/rejected": -4.45703125, "step": 3513 }, { "epoch": 0.6639897964003968, "grad_norm": 1.9349538060352303, "learning_rate": 3.7611427545576857e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.63671875, "logps/chosen": -635.5, "logps/rejected": -764.0, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": 0.1982421875, "rewards/margins": 5.59375, "rewards/rejected": -5.40234375, "step": 3514 }, { "epoch": 0.6641787519486041, "grad_norm": 1.6627458098429753, "learning_rate": 3.758405556616161e-07, "logits/chosen": 2.0361328125, "logits/rejected": 2.341796875, "logps/chosen": -475.5, "logps/rejected": -595.5, "loss": 0.6397, "rewards/accuracies": 0.8125, "rewards/chosen": 0.18017578125, "rewards/margins": 3.88671875, "rewards/rejected": -3.703125, "step": 3515 }, { "epoch": 0.6643677074968114, "grad_norm": 4.076742270327738, "learning_rate": 3.755669116354241e-07, "logits/chosen": 2.95703125, "logits/rejected": 2.59765625, "logps/chosen": -482.0, "logps/rejected": -543.5, "loss": 0.7461, "rewards/accuracies": 0.625, "rewards/chosen": -1.01708984375, "rewards/margins": 2.880859375, "rewards/rejected": -3.89453125, "step": 3516 }, { "epoch": 0.6645566630450187, "grad_norm": 2.65285814777504, "learning_rate": 3.752933434962413e-07, "logits/chosen": 2.54296875, "logits/rejected": 2.5078125, "logps/chosen": -853.0, "logps/rejected": -1183.0, "loss": 0.5911, "rewards/accuracies": 0.75, "rewards/chosen": 0.375, "rewards/margins": 9.181640625, "rewards/rejected": -8.8203125, "step": 3517 }, { "epoch": 0.6647456185932259, "grad_norm": 2.3830230241789363, "learning_rate": 3.7501985136308324e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.79296875, "logps/chosen": -933.0, "logps/rejected": -1570.0, "loss": 0.6587, "rewards/accuracies": 0.75, "rewards/chosen": 0.156494140625, "rewards/margins": 8.5859375, "rewards/rejected": -8.4375, "step": 3518 }, { "epoch": 0.6649345741414332, "grad_norm": 1.9620639516813623, "learning_rate": 3.7474643535493267e-07, "logits/chosen": 2.8564453125, "logits/rejected": 2.40625, "logps/chosen": -1846.0, "logps/rejected": -1902.0, "loss": 0.3705, "rewards/accuracies": 0.96875, "rewards/chosen": 2.4111328125, "rewards/margins": 9.1015625, "rewards/rejected": -6.68359375, "step": 3519 }, { "epoch": 0.6651235296896405, "grad_norm": 2.3678627676138855, "learning_rate": 3.744730955907388e-07, "logits/chosen": 2.765625, "logits/rejected": 2.2265625, "logps/chosen": -427.0, "logps/rejected": -699.5, "loss": 0.6569, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1318359375, "rewards/margins": 4.44140625, "rewards/rejected": -4.296875, "step": 3520 }, { "epoch": 0.6653124852378478, "grad_norm": 1.8877890313176666, "learning_rate": 3.741998321894181e-07, "logits/chosen": 2.95703125, "logits/rejected": 2.15625, "logps/chosen": -646.5, "logps/rejected": -610.5, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": -0.47918701171875, "rewards/margins": 3.72265625, "rewards/rejected": -4.203125, "step": 3521 }, { "epoch": 0.6655014407860551, "grad_norm": 1.9516164361601827, "learning_rate": 3.739266452698536e-07, "logits/chosen": 2.9375, "logits/rejected": 2.91015625, "logps/chosen": -631.0, "logps/rejected": -862.0, "loss": 0.6802, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4757080078125, "rewards/margins": 5.8515625, "rewards/rejected": -6.33203125, "step": 3522 }, { "epoch": 0.6656903963342624, "grad_norm": 5.9495428202190075, "learning_rate": 3.7365353495089504e-07, "logits/chosen": 3.05078125, "logits/rejected": 2.73828125, "logps/chosen": -499.0, "logps/rejected": -893.5, "loss": 0.6003, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2020263671875, "rewards/margins": 5.390625, "rewards/rejected": -5.19140625, "step": 3523 }, { "epoch": 0.6658793518824696, "grad_norm": 6.113967925857097, "learning_rate": 3.733805013513592e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.03125, "logps/chosen": -718.0, "logps/rejected": -701.5, "loss": 0.6923, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05712890625, "rewards/margins": 4.68359375, "rewards/rejected": -4.62890625, "step": 3524 }, { "epoch": 0.6660683074306769, "grad_norm": 2.2472595000392, "learning_rate": 3.7310754459002893e-07, "logits/chosen": 1.841796875, "logits/rejected": 1.8046875, "logps/chosen": -1092.0, "logps/rejected": -1810.5, "loss": 0.5944, "rewards/accuracies": 0.875, "rewards/chosen": 0.6923828125, "rewards/margins": 8.16796875, "rewards/rejected": -7.4453125, "step": 3525 }, { "epoch": 0.6662572629788842, "grad_norm": 3.798717813610952, "learning_rate": 3.7283466478565383e-07, "logits/chosen": 3.26171875, "logits/rejected": 3.033203125, "logps/chosen": -671.0, "logps/rejected": -978.0, "loss": 0.6284, "rewards/accuracies": 0.78125, "rewards/chosen": 0.55126953125, "rewards/margins": 5.73046875, "rewards/rejected": -5.1796875, "step": 3526 }, { "epoch": 0.6664462185270915, "grad_norm": 1.813614464919864, "learning_rate": 3.725618620569502e-07, "logits/chosen": 2.873046875, "logits/rejected": 2.873046875, "logps/chosen": -686.0, "logps/rejected": -1074.0, "loss": 0.522, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3779296875, "rewards/margins": 6.4765625, "rewards/rejected": -5.1015625, "step": 3527 }, { "epoch": 0.6666351740752988, "grad_norm": 1.5443001239267697, "learning_rate": 3.7228913652260064e-07, "logits/chosen": 2.72265625, "logits/rejected": 2.3154296875, "logps/chosen": -631.5, "logps/rejected": -826.5, "loss": 0.7245, "rewards/accuracies": 0.625, "rewards/chosen": -0.25, "rewards/margins": 2.97802734375, "rewards/rejected": -3.232421875, "step": 3528 }, { "epoch": 0.6668241296235061, "grad_norm": 3.256006288149153, "learning_rate": 3.7201648830125456e-07, "logits/chosen": 2.326171875, "logits/rejected": 2.599609375, "logps/chosen": -877.5, "logps/rejected": -1263.0, "loss": 0.5064, "rewards/accuracies": 0.90625, "rewards/chosen": 1.5595703125, "rewards/margins": 7.0234375, "rewards/rejected": -5.46875, "step": 3529 }, { "epoch": 0.6670130851717133, "grad_norm": 2.353089937771551, "learning_rate": 3.717439175115268e-07, "logits/chosen": 2.18359375, "logits/rejected": 2.01171875, "logps/chosen": -803.0, "logps/rejected": -747.0, "loss": 0.655, "rewards/accuracies": 0.75, "rewards/chosen": 0.913330078125, "rewards/margins": 4.56640625, "rewards/rejected": -3.65625, "step": 3530 }, { "epoch": 0.6672020407199206, "grad_norm": 1.8594027754359412, "learning_rate": 3.7147142427199976e-07, "logits/chosen": 2.41015625, "logits/rejected": 1.810546875, "logps/chosen": -607.5, "logps/rejected": -740.5, "loss": 0.6671, "rewards/accuracies": 0.75, "rewards/chosen": 0.0992431640625, "rewards/margins": 3.185546875, "rewards/rejected": -3.091796875, "step": 3531 }, { "epoch": 0.6673909962681279, "grad_norm": 1.5818402373411904, "learning_rate": 3.711990087012209e-07, "logits/chosen": 3.125, "logits/rejected": 2.84375, "logps/chosen": -874.0, "logps/rejected": -798.0, "loss": 0.5489, "rewards/accuracies": 0.8125, "rewards/chosen": 0.208984375, "rewards/margins": 4.263671875, "rewards/rejected": -4.0546875, "step": 3532 }, { "epoch": 0.6675799518163352, "grad_norm": 4.012642224436559, "learning_rate": 3.709266709177049e-07, "logits/chosen": 3.345703125, "logits/rejected": 2.1689453125, "logps/chosen": -742.0, "logps/rejected": -890.5, "loss": 0.5027, "rewards/accuracies": 1.0, "rewards/chosen": 0.5087890625, "rewards/margins": 5.76953125, "rewards/rejected": -5.265625, "step": 3533 }, { "epoch": 0.6677689073645425, "grad_norm": 2.137792560911885, "learning_rate": 3.706544110399318e-07, "logits/chosen": 3.5859375, "logits/rejected": 3.33203125, "logps/chosen": -755.5, "logps/rejected": -666.0, "loss": 0.5177, "rewards/accuracies": 0.875, "rewards/chosen": 0.8095703125, "rewards/margins": 5.3515625, "rewards/rejected": -4.53515625, "step": 3534 }, { "epoch": 0.6679578629127497, "grad_norm": 3.9465791045051177, "learning_rate": 3.703822291863482e-07, "logits/chosen": 3.52734375, "logits/rejected": 3.125, "logps/chosen": -662.5, "logps/rejected": -587.5, "loss": 0.7798, "rewards/accuracies": 0.6875, "rewards/chosen": -0.537109375, "rewards/margins": 2.20703125, "rewards/rejected": -2.74609375, "step": 3535 }, { "epoch": 0.668146818460957, "grad_norm": 2.6290485318425043, "learning_rate": 3.701101254753669e-07, "logits/chosen": 3.28515625, "logits/rejected": 3.45703125, "logps/chosen": -1083.0, "logps/rejected": -1293.0, "loss": 0.5945, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5517578125, "rewards/margins": 6.51953125, "rewards/rejected": -5.96875, "step": 3536 }, { "epoch": 0.6683357740091643, "grad_norm": 4.532221219637518, "learning_rate": 3.6983810002536596e-07, "logits/chosen": 2.3046875, "logits/rejected": 2.00390625, "logps/chosen": -614.0, "logps/rejected": -535.0, "loss": 0.7474, "rewards/accuracies": 0.71875, "rewards/chosen": 0.106689453125, "rewards/margins": 2.880859375, "rewards/rejected": -2.76953125, "step": 3537 }, { "epoch": 0.6685247295573716, "grad_norm": 3.879955882254133, "learning_rate": 3.695661529546904e-07, "logits/chosen": 2.640625, "logits/rejected": 2.017578125, "logps/chosen": -791.0, "logps/rejected": -804.0, "loss": 0.6346, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6728515625, "rewards/margins": 5.23046875, "rewards/rejected": -5.8984375, "step": 3538 }, { "epoch": 0.6687136851055789, "grad_norm": 2.1942512989363117, "learning_rate": 3.692942843816502e-07, "logits/chosen": 2.6015625, "logits/rejected": 2.111328125, "logps/chosen": -1188.0, "logps/rejected": -1160.0, "loss": 0.5806, "rewards/accuracies": 0.8125, "rewards/chosen": 0.65478515625, "rewards/margins": 6.4296875, "rewards/rejected": -5.7734375, "step": 3539 }, { "epoch": 0.6689026406537862, "grad_norm": 2.8465761017389903, "learning_rate": 3.6902249442452195e-07, "logits/chosen": 3.25, "logits/rejected": 2.875, "logps/chosen": -953.0, "logps/rejected": -1158.0, "loss": 0.5821, "rewards/accuracies": 0.75, "rewards/chosen": 1.03515625, "rewards/margins": 5.7578125, "rewards/rejected": -4.71875, "step": 3540 }, { "epoch": 0.6690915962019934, "grad_norm": 2.649085213012825, "learning_rate": 3.687507832015476e-07, "logits/chosen": 3.23046875, "logits/rejected": 2.734375, "logps/chosen": -546.75, "logps/rejected": -981.0, "loss": 0.6726, "rewards/accuracies": 0.84375, "rewards/chosen": 0.10595703125, "rewards/margins": 4.376953125, "rewards/rejected": -4.2734375, "step": 3541 }, { "epoch": 0.6692805517502007, "grad_norm": 3.128767554630558, "learning_rate": 3.6847915083093494e-07, "logits/chosen": 2.41796875, "logits/rejected": 2.068359375, "logps/chosen": -457.0, "logps/rejected": -894.0, "loss": 0.5524, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58447265625, "rewards/margins": 6.1171875, "rewards/rejected": -5.515625, "step": 3542 }, { "epoch": 0.669469507298408, "grad_norm": 2.243732056893089, "learning_rate": 3.682075974308574e-07, "logits/chosen": 2.09765625, "logits/rejected": 2.24169921875, "logps/chosen": -533.0, "logps/rejected": -1188.0, "loss": 0.6415, "rewards/accuracies": 0.78125, "rewards/chosen": -0.10595703125, "rewards/margins": 6.69140625, "rewards/rejected": -6.8046875, "step": 3543 }, { "epoch": 0.6696584628466153, "grad_norm": 1.7495681999345785, "learning_rate": 3.67936123119454e-07, "logits/chosen": 3.1875, "logits/rejected": 3.021484375, "logps/chosen": -777.0, "logps/rejected": -953.0, "loss": 0.5993, "rewards/accuracies": 0.6875, "rewards/chosen": 0.336181640625, "rewards/margins": 5.14453125, "rewards/rejected": -4.796875, "step": 3544 }, { "epoch": 0.6698474183948226, "grad_norm": 1.2781440221521767, "learning_rate": 3.6766472801482996e-07, "logits/chosen": 3.421875, "logits/rejected": 2.609375, "logps/chosen": -922.5, "logps/rejected": -15329.0, "loss": 0.7089, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27978515625, "rewards/margins": -111.0, "rewards/rejected": 110.5, "step": 3545 }, { "epoch": 0.67003637394303, "grad_norm": 2.2069121636249758, "learning_rate": 3.6739341223505503e-07, "logits/chosen": 2.296875, "logits/rejected": 1.96484375, "logps/chosen": -596.5, "logps/rejected": -527.5, "loss": 0.6031, "rewards/accuracies": 0.8125, "rewards/chosen": 0.19873046875, "rewards/margins": 4.390625, "rewards/rejected": -4.1953125, "step": 3546 }, { "epoch": 0.6702253294912371, "grad_norm": 2.8586127130451606, "learning_rate": 3.6712217589816517e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.48828125, "logps/chosen": -828.5, "logps/rejected": -890.5, "loss": 0.5541, "rewards/accuracies": 0.875, "rewards/chosen": 1.099609375, "rewards/margins": 5.078125, "rewards/rejected": -3.98046875, "step": 3547 }, { "epoch": 0.6704142850394444, "grad_norm": 2.1211357272760942, "learning_rate": 3.6685101912216176e-07, "logits/chosen": 3.0859375, "logits/rejected": 2.673828125, "logps/chosen": -762.0, "logps/rejected": -894.0, "loss": 0.6316, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0107421875, "rewards/margins": 4.55078125, "rewards/rejected": -4.5546875, "step": 3548 }, { "epoch": 0.6706032405876517, "grad_norm": 2.8641268385302885, "learning_rate": 3.665799420250111e-07, "logits/chosen": 2.55078125, "logits/rejected": 1.46875, "logps/chosen": -959.0, "logps/rejected": -1350.0, "loss": 0.4439, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7607421875, "rewards/margins": -1.265625, "rewards/rejected": 2.015625, "step": 3549 }, { "epoch": 0.670792196135859, "grad_norm": 2.1309496792965805, "learning_rate": 3.6630894472464535e-07, "logits/chosen": 3.3447265625, "logits/rejected": 3.013671875, "logps/chosen": -993.0, "logps/rejected": -1633.0, "loss": 0.5063, "rewards/accuracies": 0.875, "rewards/chosen": 0.99365234375, "rewards/margins": 6.56640625, "rewards/rejected": -5.57421875, "step": 3550 }, { "epoch": 0.6709811516840664, "grad_norm": 1.981916473987569, "learning_rate": 3.660380273389615e-07, "logits/chosen": 2.205078125, "logits/rejected": 1.65234375, "logps/chosen": -1291.0, "logps/rejected": -1100.0, "loss": 0.4473, "rewards/accuracies": 0.875, "rewards/chosen": 1.556640625, "rewards/margins": 6.59375, "rewards/rejected": -5.0390625, "step": 3551 }, { "epoch": 0.6711701072322737, "grad_norm": 2.4031726543517524, "learning_rate": 3.6576718998582225e-07, "logits/chosen": 3.6953125, "logits/rejected": 2.796875, "logps/chosen": -1010.0, "logps/rejected": -944.0, "loss": 0.6189, "rewards/accuracies": 0.75, "rewards/chosen": 1.635498046875, "rewards/margins": 4.55078125, "rewards/rejected": -2.9091796875, "step": 3552 }, { "epoch": 0.6713590627804809, "grad_norm": 1.544133527700508, "learning_rate": 3.6549643278305487e-07, "logits/chosen": 1.978515625, "logits/rejected": 2.24609375, "logps/chosen": -638.5, "logps/rejected": -1809.0, "loss": 0.5994, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0732421875, "rewards/margins": 9.8828125, "rewards/rejected": -9.8203125, "step": 3553 }, { "epoch": 0.6715480183286882, "grad_norm": 3.1379928088688316, "learning_rate": 3.652257558484525e-07, "logits/chosen": 1.46875, "logits/rejected": 1.31201171875, "logps/chosen": -525.5, "logps/rejected": -782.5, "loss": 0.463, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5543212890625, "rewards/margins": 5.796875, "rewards/rejected": -5.2421875, "step": 3554 }, { "epoch": 0.6717369738768955, "grad_norm": 1.963717326969214, "learning_rate": 3.649551592997728e-07, "logits/chosen": 2.8623046875, "logits/rejected": 2.25048828125, "logps/chosen": -465.25, "logps/rejected": -624.0, "loss": 0.5699, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0703125, "rewards/margins": 6.62890625, "rewards/rejected": -5.55078125, "step": 3555 }, { "epoch": 0.6719259294251028, "grad_norm": 2.3657582571713527, "learning_rate": 3.646846432547387e-07, "logits/chosen": 2.72265625, "logits/rejected": 2.8359375, "logps/chosen": -957.0, "logps/rejected": -1013.5, "loss": 0.6881, "rewards/accuracies": 0.6875, "rewards/chosen": 0.619873046875, "rewards/margins": 4.205078125, "rewards/rejected": -3.583984375, "step": 3556 }, { "epoch": 0.6721148849733101, "grad_norm": 2.2341387801729344, "learning_rate": 3.6441420783103793e-07, "logits/chosen": 2.578125, "logits/rejected": 2.5703125, "logps/chosen": -597.5, "logps/rejected": -2374.5, "loss": 0.6705, "rewards/accuracies": 0.71875, "rewards/chosen": 0.40478515625, "rewards/margins": 5.4609375, "rewards/rejected": -5.05859375, "step": 3557 }, { "epoch": 0.6723038405215173, "grad_norm": 3.8859635767893668, "learning_rate": 3.641438531463232e-07, "logits/chosen": 2.939453125, "logits/rejected": 2.6484375, "logps/chosen": -995.0, "logps/rejected": -1444.0, "loss": 0.4871, "rewards/accuracies": 0.875, "rewards/chosen": 1.544921875, "rewards/margins": 9.8828125, "rewards/rejected": -8.3515625, "step": 3558 }, { "epoch": 0.6724927960697246, "grad_norm": 3.0606906158900804, "learning_rate": 3.638735793182125e-07, "logits/chosen": 2.26953125, "logits/rejected": 2.462890625, "logps/chosen": -760.0, "logps/rejected": -1567.5, "loss": 0.6606, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2646484375, "rewards/margins": 9.21484375, "rewards/rejected": -9.47265625, "step": 3559 }, { "epoch": 0.6726817516179319, "grad_norm": 1.727161805031776, "learning_rate": 3.636033864642879e-07, "logits/chosen": 2.0595703125, "logits/rejected": 1.744384765625, "logps/chosen": -534.5, "logps/rejected": -543.0, "loss": 0.6644, "rewards/accuracies": 0.78125, "rewards/chosen": -0.099365234375, "rewards/margins": 3.41796875, "rewards/rejected": -3.525390625, "step": 3560 }, { "epoch": 0.6728707071661392, "grad_norm": 2.406258665209072, "learning_rate": 3.6333327470209675e-07, "logits/chosen": 2.767578125, "logits/rejected": 2.13525390625, "logps/chosen": -558.5, "logps/rejected": -697.5, "loss": 0.5138, "rewards/accuracies": 0.90625, "rewards/chosen": 0.447265625, "rewards/margins": 5.6640625, "rewards/rejected": -5.2109375, "step": 3561 }, { "epoch": 0.6730596627143465, "grad_norm": 1.8383399283073625, "learning_rate": 3.630632441491511e-07, "logits/chosen": 1.638671875, "logits/rejected": 1.3040771484375, "logps/chosen": -496.0, "logps/rejected": -1240.5, "loss": 0.673, "rewards/accuracies": 0.78125, "rewards/chosen": -0.364013671875, "rewards/margins": 4.72265625, "rewards/rejected": -5.078125, "step": 3562 }, { "epoch": 0.6732486182625538, "grad_norm": 2.8644961905381976, "learning_rate": 3.6279329492292754e-07, "logits/chosen": 3.0625, "logits/rejected": 2.3984375, "logps/chosen": -580.0, "logps/rejected": -812.0, "loss": 0.526, "rewards/accuracies": 0.875, "rewards/chosen": 0.4384765625, "rewards/margins": 5.94921875, "rewards/rejected": -5.5078125, "step": 3563 }, { "epoch": 0.673437573810761, "grad_norm": 2.2591142304557272, "learning_rate": 3.625234271408674e-07, "logits/chosen": 1.724609375, "logits/rejected": 1.984375, "logps/chosen": -771.5, "logps/rejected": -692.5, "loss": 0.661, "rewards/accuracies": 0.75, "rewards/chosen": 0.272216796875, "rewards/margins": 3.859375, "rewards/rejected": -3.5859375, "step": 3564 }, { "epoch": 0.6736265293589683, "grad_norm": 1.9441553868386994, "learning_rate": 3.6225364092037634e-07, "logits/chosen": 2.1796875, "logits/rejected": 1.95751953125, "logps/chosen": -858.0, "logps/rejected": -853.5, "loss": 0.5685, "rewards/accuracies": 0.78125, "rewards/chosen": 0.63916015625, "rewards/margins": 4.94140625, "rewards/rejected": -4.30859375, "step": 3565 }, { "epoch": 0.6738154849071756, "grad_norm": 1.4355600097075396, "learning_rate": 3.6198393637882474e-07, "logits/chosen": 3.546875, "logits/rejected": 3.375, "logps/chosen": -722.0, "logps/rejected": -1253.5, "loss": 0.5208, "rewards/accuracies": 0.84375, "rewards/chosen": 1.19140625, "rewards/margins": 7.82421875, "rewards/rejected": -6.6328125, "step": 3566 }, { "epoch": 0.6740044404553829, "grad_norm": 3.484447334073615, "learning_rate": 3.617143136335473e-07, "logits/chosen": 2.921875, "logits/rejected": 2.9140625, "logps/chosen": -844.0, "logps/rejected": -1512.0, "loss": 0.5906, "rewards/accuracies": 0.875, "rewards/chosen": -0.11578369140625, "rewards/margins": 8.421875, "rewards/rejected": -8.51953125, "step": 3567 }, { "epoch": 0.6741933960035902, "grad_norm": 2.6941279756043945, "learning_rate": 3.614447728018435e-07, "logits/chosen": 2.775390625, "logits/rejected": 2.953125, "logps/chosen": -648.5, "logps/rejected": -979.5, "loss": 0.4887, "rewards/accuracies": 0.84375, "rewards/chosen": 0.960205078125, "rewards/margins": 7.03125, "rewards/rejected": -6.08203125, "step": 3568 }, { "epoch": 0.6743823515517975, "grad_norm": 1.7557931379756426, "learning_rate": 3.611753140009768e-07, "logits/chosen": 4.04296875, "logits/rejected": 2.8828125, "logps/chosen": -792.0, "logps/rejected": -966.0, "loss": 0.5673, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8759765625, "rewards/margins": 5.13671875, "rewards/rejected": -4.26953125, "step": 3569 }, { "epoch": 0.6745713071000047, "grad_norm": 3.021521295894817, "learning_rate": 3.6090593734817496e-07, "logits/chosen": 2.271484375, "logits/rejected": 2.08984375, "logps/chosen": -1221.0, "logps/rejected": -1083.0, "loss": 0.4596, "rewards/accuracies": 0.90625, "rewards/chosen": 0.79107666015625, "rewards/margins": 6.9140625, "rewards/rejected": -6.125, "step": 3570 }, { "epoch": 0.674760262648212, "grad_norm": 5.400560625673006, "learning_rate": 3.6063664296063026e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.68359375, "logps/chosen": -1083.0, "logps/rejected": -1091.0, "loss": 0.4695, "rewards/accuracies": 0.9375, "rewards/chosen": 1.671875, "rewards/margins": 6.328125, "rewards/rejected": -4.65625, "step": 3571 }, { "epoch": 0.6749492181964193, "grad_norm": 2.030884895758301, "learning_rate": 3.6036743095549884e-07, "logits/chosen": 2.576171875, "logits/rejected": 2.0859375, "logps/chosen": -917.0, "logps/rejected": -1024.5, "loss": 0.5388, "rewards/accuracies": 0.84375, "rewards/chosen": 0.453125, "rewards/margins": 5.85546875, "rewards/rejected": -5.390625, "step": 3572 }, { "epoch": 0.6751381737446266, "grad_norm": 2.4614394457900275, "learning_rate": 3.6009830144990174e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.677734375, "logps/chosen": -630.0, "logps/rejected": -879.5, "loss": 0.5529, "rewards/accuracies": 0.90625, "rewards/chosen": 0.548828125, "rewards/margins": 5.94921875, "rewards/rejected": -5.3984375, "step": 3573 }, { "epoch": 0.6753271292928339, "grad_norm": 2.8899719382260085, "learning_rate": 3.5982925456092296e-07, "logits/chosen": 2.953125, "logits/rejected": 2.021484375, "logps/chosen": -646.0, "logps/rejected": -783.0, "loss": 0.7008, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19287109375, "rewards/margins": 3.74609375, "rewards/rejected": -3.556640625, "step": 3574 }, { "epoch": 0.6755160848410412, "grad_norm": 2.8334346315401064, "learning_rate": 3.5956029040561154e-07, "logits/chosen": 3.22265625, "logits/rejected": 3.04296875, "logps/chosen": -808.0, "logps/rejected": -1918.0, "loss": 0.4127, "rewards/accuracies": 0.875, "rewards/chosen": 1.15655517578125, "rewards/margins": 11.2578125, "rewards/rejected": -10.1015625, "step": 3575 }, { "epoch": 0.6757050403892484, "grad_norm": 2.749508506911809, "learning_rate": 3.592914091009801e-07, "logits/chosen": 2.28125, "logits/rejected": 1.705078125, "logps/chosen": -604.5, "logps/rejected": -619.5, "loss": 0.643, "rewards/accuracies": 0.71875, "rewards/chosen": 0.39239501953125, "rewards/margins": 5.3671875, "rewards/rejected": -4.9765625, "step": 3576 }, { "epoch": 0.6758939959374557, "grad_norm": 4.036251177992468, "learning_rate": 3.590226107640052e-07, "logits/chosen": 2.390625, "logits/rejected": 1.71875, "logps/chosen": -583.5, "logps/rejected": -660.0, "loss": 0.5518, "rewards/accuracies": 0.875, "rewards/chosen": -0.130859375, "rewards/margins": 5.3828125, "rewards/rejected": -5.5234375, "step": 3577 }, { "epoch": 0.676082951485663, "grad_norm": 4.358514216810752, "learning_rate": 3.587538955116278e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.7421875, "logps/chosen": -831.5, "logps/rejected": -779.0, "loss": 0.7303, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46484375, "rewards/margins": 3.654296875, "rewards/rejected": -3.1875, "step": 3578 }, { "epoch": 0.6762719070338703, "grad_norm": 2.0681316229379516, "learning_rate": 3.584852634607517e-07, "logits/chosen": 2.162109375, "logits/rejected": 2.7421875, "logps/chosen": -648.5, "logps/rejected": -1052.0, "loss": 0.6061, "rewards/accuracies": 0.875, "rewards/chosen": -0.2119140625, "rewards/margins": 6.4375, "rewards/rejected": -6.6484375, "step": 3579 }, { "epoch": 0.6764608625820776, "grad_norm": 2.4369930493067895, "learning_rate": 3.5821671472824545e-07, "logits/chosen": 1.857421875, "logits/rejected": 1.20703125, "logps/chosen": -902.0, "logps/rejected": -863.0, "loss": 0.5317, "rewards/accuracies": 0.8125, "rewards/chosen": 0.51025390625, "rewards/margins": 4.89453125, "rewards/rejected": -4.3828125, "step": 3580 }, { "epoch": 0.6766498181302848, "grad_norm": 2.684027338903517, "learning_rate": 3.5794824943094104e-07, "logits/chosen": 3.55859375, "logits/rejected": 3.40625, "logps/chosen": -739.5, "logps/rejected": -1506.0, "loss": 0.652, "rewards/accuracies": 0.75, "rewards/chosen": 0.8248291015625, "rewards/margins": 5.09375, "rewards/rejected": -4.28515625, "step": 3581 }, { "epoch": 0.6768387736784921, "grad_norm": 2.481248456170019, "learning_rate": 3.5767986768563384e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.056640625, "logps/chosen": -768.5, "logps/rejected": -1815.5, "loss": 0.6145, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5572509765625, "rewards/margins": 6.408203125, "rewards/rejected": -5.84375, "step": 3582 }, { "epoch": 0.6770277292266994, "grad_norm": 3.2521642270336857, "learning_rate": 3.574115696090837e-07, "logits/chosen": 3.384765625, "logits/rejected": 3.001953125, "logps/chosen": -712.5, "logps/rejected": -1075.0, "loss": 0.5671, "rewards/accuracies": 0.8125, "rewards/chosen": 0.740234375, "rewards/margins": 5.8828125, "rewards/rejected": -5.125, "step": 3583 }, { "epoch": 0.6772166847749067, "grad_norm": 3.4106471508649476, "learning_rate": 3.57143355318013e-07, "logits/chosen": 2.5625, "logits/rejected": 2.146484375, "logps/chosen": -1044.0, "logps/rejected": -799.0, "loss": 0.539, "rewards/accuracies": 0.875, "rewards/chosen": 1.037109375, "rewards/margins": 4.73046875, "rewards/rejected": -3.703125, "step": 3584 }, { "epoch": 0.677405640323114, "grad_norm": 3.8178618703605722, "learning_rate": 3.5687522492910847e-07, "logits/chosen": 4.13671875, "logits/rejected": 3.421875, "logps/chosen": -514.125, "logps/rejected": -559.5, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": 0.78271484375, "rewards/margins": 3.4453125, "rewards/rejected": -2.6552734375, "step": 3585 }, { "epoch": 0.6775945958713213, "grad_norm": 1.833016818098378, "learning_rate": 3.5660717855902014e-07, "logits/chosen": 2.62060546875, "logits/rejected": 2.2421875, "logps/chosen": -620.0, "logps/rejected": -705.0, "loss": 0.6031, "rewards/accuracies": 0.78125, "rewards/chosen": 0.116943359375, "rewards/margins": 4.6875, "rewards/rejected": -4.5703125, "step": 3586 }, { "epoch": 0.6777835514195285, "grad_norm": 2.9108080034356107, "learning_rate": 3.5633921632436114e-07, "logits/chosen": 3.0, "logits/rejected": 3.205078125, "logps/chosen": -659.5, "logps/rejected": -642.0, "loss": 0.4974, "rewards/accuracies": 0.9375, "rewards/chosen": 1.185546875, "rewards/margins": 6.078125, "rewards/rejected": -4.8984375, "step": 3587 }, { "epoch": 0.6779725069677358, "grad_norm": 1.962315258568414, "learning_rate": 3.5607133834170857e-07, "logits/chosen": 2.388671875, "logits/rejected": 2.318359375, "logps/chosen": -753.0, "logps/rejected": -1491.0, "loss": 0.5645, "rewards/accuracies": 0.78125, "rewards/chosen": 0.31298828125, "rewards/margins": 8.98828125, "rewards/rejected": -8.66796875, "step": 3588 }, { "epoch": 0.6781614625159431, "grad_norm": 1.3983837566876396, "learning_rate": 3.5580354472760235e-07, "logits/chosen": 2.8671875, "logits/rejected": 2.3095703125, "logps/chosen": -606.5, "logps/rejected": -1591.5, "loss": 0.5338, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15582275390625, "rewards/margins": 6.671875, "rewards/rejected": -6.515625, "step": 3589 }, { "epoch": 0.6783504180641504, "grad_norm": 2.335629012174292, "learning_rate": 3.5553583559854604e-07, "logits/chosen": 3.09765625, "logits/rejected": 2.138671875, "logps/chosen": -566.5, "logps/rejected": -1541.0, "loss": 0.6917, "rewards/accuracies": 0.6875, "rewards/chosen": 0.075958251953125, "rewards/margins": 5.927734375, "rewards/rejected": -5.84765625, "step": 3590 }, { "epoch": 0.6785393736123577, "grad_norm": 2.1389047473384517, "learning_rate": 3.5526821107100645e-07, "logits/chosen": 2.36328125, "logits/rejected": 1.9755859375, "logps/chosen": -826.5, "logps/rejected": -865.5, "loss": 0.5434, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9173583984375, "rewards/margins": 5.21875, "rewards/rejected": -4.30078125, "step": 3591 }, { "epoch": 0.678728329160565, "grad_norm": 2.3119687240927624, "learning_rate": 3.550006712614132e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.00927734375, "logps/chosen": -889.0, "logps/rejected": -1354.0, "loss": 0.4071, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0087890625, "rewards/margins": 9.1328125, "rewards/rejected": -8.1328125, "step": 3592 }, { "epoch": 0.6789172847087722, "grad_norm": 2.9095602669096907, "learning_rate": 3.547332162861596e-07, "logits/chosen": 2.787109375, "logits/rejected": 2.828125, "logps/chosen": -955.0, "logps/rejected": -879.0, "loss": 0.5482, "rewards/accuracies": 0.8125, "rewards/chosen": 0.954864501953125, "rewards/margins": 5.84375, "rewards/rejected": -4.875, "step": 3593 }, { "epoch": 0.6791062402569795, "grad_norm": 1.865037587761789, "learning_rate": 3.544658462616017e-07, "logits/chosen": 1.75439453125, "logits/rejected": 1.9136962890625, "logps/chosen": -998.5, "logps/rejected": -1110.0, "loss": 0.6036, "rewards/accuracies": 0.78125, "rewards/chosen": 0.23193359375, "rewards/margins": 4.48828125, "rewards/rejected": -4.26171875, "step": 3594 }, { "epoch": 0.6792951958051868, "grad_norm": 4.052072770928681, "learning_rate": 3.5419856130405836e-07, "logits/chosen": 3.375, "logits/rejected": 3.234375, "logps/chosen": -1111.0, "logps/rejected": -1162.0, "loss": 0.5265, "rewards/accuracies": 0.875, "rewards/chosen": 1.96875, "rewards/margins": 6.19140625, "rewards/rejected": -4.22265625, "step": 3595 }, { "epoch": 0.6794841513533941, "grad_norm": 2.0522451992584116, "learning_rate": 3.539313615298121e-07, "logits/chosen": 1.876953125, "logits/rejected": 1.755859375, "logps/chosen": -627.0, "logps/rejected": -945.0, "loss": 0.5517, "rewards/accuracies": 0.9375, "rewards/chosen": 0.552734375, "rewards/margins": 4.89453125, "rewards/rejected": -4.33984375, "step": 3596 }, { "epoch": 0.6796731069016014, "grad_norm": 1.5614702039288526, "learning_rate": 3.53664247055108e-07, "logits/chosen": 2.53515625, "logits/rejected": 2.572265625, "logps/chosen": -842.0, "logps/rejected": -1695.0, "loss": 0.6064, "rewards/accuracies": 0.84375, "rewards/chosen": 1.08349609375, "rewards/margins": 5.17578125, "rewards/rejected": -4.09375, "step": 3597 }, { "epoch": 0.6798620624498087, "grad_norm": 3.631860008591747, "learning_rate": 3.53397217996154e-07, "logits/chosen": 2.37890625, "logits/rejected": 2.072265625, "logps/chosen": -774.0, "logps/rejected": -690.5, "loss": 0.5933, "rewards/accuracies": 0.84375, "rewards/chosen": 0.501953125, "rewards/margins": 4.39453125, "rewards/rejected": -3.892578125, "step": 3598 }, { "epoch": 0.6800510179980159, "grad_norm": 1.239412251204385, "learning_rate": 3.531302744691208e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.7109375, "logps/chosen": -702.0, "logps/rejected": -955.0, "loss": 0.4388, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1435546875, "rewards/margins": 6.3828125, "rewards/rejected": -5.23828125, "step": 3599 }, { "epoch": 0.6802399735462232, "grad_norm": 3.383999698222134, "learning_rate": 3.5286341659014183e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.97265625, "logps/chosen": -672.0, "logps/rejected": -1091.0, "loss": 0.5749, "rewards/accuracies": 0.8125, "rewards/chosen": 0.53466796875, "rewards/margins": 5.515625, "rewards/rejected": -4.98046875, "step": 3600 }, { "epoch": 0.6804289290944305, "grad_norm": 2.91768432279509, "learning_rate": 3.5259664447531426e-07, "logits/chosen": 1.958984375, "logits/rejected": 1.90625, "logps/chosen": -1108.0, "logps/rejected": -1046.0, "loss": 0.5277, "rewards/accuracies": 0.84375, "rewards/chosen": 1.25927734375, "rewards/margins": 5.96875, "rewards/rejected": -4.703125, "step": 3601 }, { "epoch": 0.6806178846426378, "grad_norm": 2.2938672513154286, "learning_rate": 3.523299582406962e-07, "logits/chosen": 2.09375, "logits/rejected": 1.568359375, "logps/chosen": -864.0, "logps/rejected": -573.5, "loss": 0.4979, "rewards/accuracies": 0.90625, "rewards/chosen": 0.83447265625, "rewards/margins": 5.08984375, "rewards/rejected": -4.26171875, "step": 3602 }, { "epoch": 0.6808068401908451, "grad_norm": 1.469779363893749, "learning_rate": 3.520633580023097e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.09765625, "logps/chosen": -723.0, "logps/rejected": -921.0, "loss": 0.5799, "rewards/accuracies": 0.8125, "rewards/chosen": 1.14404296875, "rewards/margins": 5.40625, "rewards/rejected": -4.26953125, "step": 3603 }, { "epoch": 0.6809957957390523, "grad_norm": 1.8073298551809216, "learning_rate": 3.517968438761395e-07, "logits/chosen": 2.3828125, "logits/rejected": 2.23046875, "logps/chosen": -508.0, "logps/rejected": -695.0, "loss": 0.6866, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3583984375, "rewards/margins": 3.47265625, "rewards/rejected": -3.107421875, "step": 3604 }, { "epoch": 0.6811847512872596, "grad_norm": 3.713363986760203, "learning_rate": 3.515304159781316e-07, "logits/chosen": 2.087890625, "logits/rejected": 2.27734375, "logps/chosen": -516.5, "logps/rejected": -561.5, "loss": 0.6458, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58837890625, "rewards/margins": 4.69921875, "rewards/rejected": -4.109375, "step": 3605 }, { "epoch": 0.6813737068354669, "grad_norm": 3.194353589246981, "learning_rate": 3.512640744241958e-07, "logits/chosen": 3.72265625, "logits/rejected": 3.1953125, "logps/chosen": -725.5, "logps/rejected": -1426.0, "loss": 0.6441, "rewards/accuracies": 0.75, "rewards/chosen": 0.77978515625, "rewards/margins": 4.68359375, "rewards/rejected": -3.91015625, "step": 3606 }, { "epoch": 0.6815626623836742, "grad_norm": 3.093880867232194, "learning_rate": 3.5099781933020376e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.765625, "logps/chosen": -939.0, "logps/rejected": -1226.0, "loss": 0.4214, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4697265625, "rewards/margins": 6.5625, "rewards/rejected": -5.09375, "step": 3607 }, { "epoch": 0.6817516179318815, "grad_norm": 1.3861373702370736, "learning_rate": 3.507316508119894e-07, "logits/chosen": 2.021484375, "logits/rejected": 2.078125, "logps/chosen": -14965.5, "logps/rejected": -2016.0, "loss": 0.5503, "rewards/accuracies": 0.84375, "rewards/chosen": 139.193359375, "rewards/margins": 151.125, "rewards/rejected": -12.27734375, "step": 3608 }, { "epoch": 0.6819405734800889, "grad_norm": 1.1629893328818208, "learning_rate": 3.504655689853495e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.15625, "logps/chosen": -760.75, "logps/rejected": -986.0, "loss": 0.5021, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4599609375, "rewards/margins": 5.98046875, "rewards/rejected": -4.517578125, "step": 3609 }, { "epoch": 0.682129529028296, "grad_norm": 2.2083015208758514, "learning_rate": 3.501995739660423e-07, "logits/chosen": 4.109375, "logits/rejected": 3.56640625, "logps/chosen": -1069.75, "logps/rejected": -1573.5, "loss": 0.4861, "rewards/accuracies": 0.875, "rewards/chosen": 0.8232421875, "rewards/margins": 6.046875, "rewards/rejected": -5.22265625, "step": 3610 }, { "epoch": 0.6823184845765033, "grad_norm": 2.3435616824140544, "learning_rate": 3.4993366586978935e-07, "logits/chosen": 3.248046875, "logits/rejected": 2.798828125, "logps/chosen": -900.0, "logps/rejected": -1154.0, "loss": 0.5893, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2109375, "rewards/margins": 4.984375, "rewards/rejected": -3.77734375, "step": 3611 }, { "epoch": 0.6825074401247107, "grad_norm": 2.2365110277029654, "learning_rate": 3.496678448122735e-07, "logits/chosen": 3.26171875, "logits/rejected": 3.02734375, "logps/chosen": -1013.0, "logps/rejected": -1416.0, "loss": 0.5605, "rewards/accuracies": 0.84375, "rewards/chosen": 0.80126953125, "rewards/margins": 6.890625, "rewards/rejected": -6.09375, "step": 3612 }, { "epoch": 0.682696395672918, "grad_norm": 1.8172115952676857, "learning_rate": 3.4940211090914007e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.830078125, "logps/chosen": -750.0, "logps/rejected": -1067.0, "loss": 0.6381, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3359375, "rewards/margins": 4.1259765625, "rewards/rejected": -3.7890625, "step": 3613 }, { "epoch": 0.6828853512211253, "grad_norm": 1.9953785307893723, "learning_rate": 3.4913646427599653e-07, "logits/chosen": 2.1708984375, "logits/rejected": 1.923828125, "logps/chosen": -998.0, "logps/rejected": -1446.0, "loss": 0.5478, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7587890625, "rewards/margins": 7.4296875, "rewards/rejected": -6.67578125, "step": 3614 }, { "epoch": 0.6830743067693326, "grad_norm": 2.7358654382670435, "learning_rate": 3.488709050284121e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.53125, "logps/chosen": -917.0, "logps/rejected": -884.0, "loss": 0.586, "rewards/accuracies": 0.75, "rewards/chosen": 1.580078125, "rewards/margins": 4.66015625, "rewards/rejected": -3.08203125, "step": 3615 }, { "epoch": 0.6832632623175398, "grad_norm": 2.0291501439238586, "learning_rate": 3.486054332819181e-07, "logits/chosen": 3.3125, "logits/rejected": 2.904296875, "logps/chosen": -978.0, "logps/rejected": -987.0, "loss": 0.557, "rewards/accuracies": 0.84375, "rewards/chosen": 1.32275390625, "rewards/margins": 4.8984375, "rewards/rejected": -3.578125, "step": 3616 }, { "epoch": 0.6834522178657471, "grad_norm": 1.6962335738263286, "learning_rate": 3.483400491520082e-07, "logits/chosen": 1.8505859375, "logits/rejected": 2.185546875, "logps/chosen": -1032.0, "logps/rejected": -1038.0, "loss": 0.4344, "rewards/accuracies": 0.9375, "rewards/chosen": 1.396484375, "rewards/margins": 6.4296875, "rewards/rejected": -5.03125, "step": 3617 }, { "epoch": 0.6836411734139544, "grad_norm": 1.6385367227042473, "learning_rate": 3.480747527541374e-07, "logits/chosen": 3.68359375, "logits/rejected": 3.4609375, "logps/chosen": -923.0, "logps/rejected": -952.5, "loss": 0.5313, "rewards/accuracies": 0.8125, "rewards/chosen": 1.654296875, "rewards/margins": 6.24609375, "rewards/rejected": -4.58984375, "step": 3618 }, { "epoch": 0.6838301289621617, "grad_norm": 10.506803035353004, "learning_rate": 3.4780954420372283e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.935546875, "logps/chosen": -1450.0, "logps/rejected": -1448.0, "loss": 0.5431, "rewards/accuracies": 0.84375, "rewards/chosen": -0.103515625, "rewards/margins": 4.13671875, "rewards/rejected": -4.234375, "step": 3619 }, { "epoch": 0.684019084510369, "grad_norm": 2.6108202296918663, "learning_rate": 3.4754442361614315e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.193359375, "logps/chosen": -786.5, "logps/rejected": -1037.0, "loss": 0.5636, "rewards/accuracies": 0.84375, "rewards/chosen": 1.21435546875, "rewards/margins": 5.33203125, "rewards/rejected": -4.109375, "step": 3620 }, { "epoch": 0.6842080400585763, "grad_norm": 2.544862078321657, "learning_rate": 3.472793911067389e-07, "logits/chosen": 3.23046875, "logits/rejected": 2.794921875, "logps/chosen": -610.5, "logps/rejected": -776.0, "loss": 0.585, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74951171875, "rewards/margins": 4.33203125, "rewards/rejected": -3.58203125, "step": 3621 }, { "epoch": 0.6843969956067835, "grad_norm": 2.103479012981375, "learning_rate": 3.470144467908127e-07, "logits/chosen": 3.578125, "logits/rejected": 3.486328125, "logps/chosen": -594.0, "logps/rejected": -813.0, "loss": 0.667, "rewards/accuracies": 0.8125, "rewards/chosen": 1.207275390625, "rewards/margins": 3.708984375, "rewards/rejected": -2.5078125, "step": 3622 }, { "epoch": 0.6845859511549908, "grad_norm": 2.4227071328933047, "learning_rate": 3.467495907836278e-07, "logits/chosen": 2.876953125, "logits/rejected": 2.72265625, "logps/chosen": -914.5, "logps/rejected": -1335.0, "loss": 0.6091, "rewards/accuracies": 0.78125, "rewards/chosen": 1.076171875, "rewards/margins": 3.8125, "rewards/rejected": -2.73828125, "step": 3623 }, { "epoch": 0.6847749067031981, "grad_norm": 1.8423806361774913, "learning_rate": 3.4648482320041015e-07, "logits/chosen": 3.70703125, "logits/rejected": 3.328125, "logps/chosen": -799.0, "logps/rejected": -793.0, "loss": 0.4707, "rewards/accuracies": 0.9375, "rewards/chosen": 0.99072265625, "rewards/margins": 5.12890625, "rewards/rejected": -4.140625, "step": 3624 }, { "epoch": 0.6849638622514054, "grad_norm": 1.9592381613718923, "learning_rate": 3.462201441563466e-07, "logits/chosen": 2.298828125, "logits/rejected": 2.33203125, "logps/chosen": -901.0, "logps/rejected": -694.5, "loss": 0.561, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1787109375, "rewards/margins": 3.47998046875, "rewards/rejected": -2.302734375, "step": 3625 }, { "epoch": 0.6851528177996127, "grad_norm": 2.79429972153802, "learning_rate": 3.459555537665853e-07, "logits/chosen": 2.279296875, "logits/rejected": 1.90625, "logps/chosen": -701.0, "logps/rejected": -762.0, "loss": 0.4317, "rewards/accuracies": 0.875, "rewards/chosen": 1.3759765625, "rewards/margins": 5.84375, "rewards/rejected": -4.46484375, "step": 3626 }, { "epoch": 0.6853417733478199, "grad_norm": 1.9865900204068998, "learning_rate": 3.4569105214623677e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.76953125, "logps/chosen": -718.0, "logps/rejected": -688.0, "loss": 0.5647, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9091796875, "rewards/margins": 3.99609375, "rewards/rejected": -3.083984375, "step": 3627 }, { "epoch": 0.6855307288960272, "grad_norm": 2.26265295331227, "learning_rate": 3.4542663941037156e-07, "logits/chosen": 3.30859375, "logits/rejected": 3.24609375, "logps/chosen": -756.0, "logps/rejected": -687.5, "loss": 0.6245, "rewards/accuracies": 0.71875, "rewards/chosen": 1.323974609375, "rewards/margins": 5.19921875, "rewards/rejected": -3.859375, "step": 3628 }, { "epoch": 0.6857196844442345, "grad_norm": 3.0327717852136726, "learning_rate": 3.4516231567402274e-07, "logits/chosen": 3.703125, "logits/rejected": 3.02734375, "logps/chosen": -524.5, "logps/rejected": -1408.0, "loss": 0.6038, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4892578125, "rewards/margins": 6.45703125, "rewards/rejected": -5.958984375, "step": 3629 }, { "epoch": 0.6859086399924418, "grad_norm": 4.4658087835902185, "learning_rate": 3.448980810521841e-07, "logits/chosen": 2.380859375, "logits/rejected": 2.79296875, "logps/chosen": -626.0, "logps/rejected": -578.5, "loss": 0.6242, "rewards/accuracies": 0.78125, "rewards/chosen": 0.89697265625, "rewards/margins": 3.17578125, "rewards/rejected": -2.275390625, "step": 3630 }, { "epoch": 0.6860975955406491, "grad_norm": 3.913296681984792, "learning_rate": 3.4463393565981057e-07, "logits/chosen": 3.1640625, "logits/rejected": 2.462890625, "logps/chosen": -653.0, "logps/rejected": -1007.5, "loss": 0.5714, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0419921875, "rewards/margins": 4.1953125, "rewards/rejected": -3.15625, "step": 3631 }, { "epoch": 0.6862865510888564, "grad_norm": 2.543414468709438, "learning_rate": 3.443698796118188e-07, "logits/chosen": 1.28076171875, "logits/rejected": 1.2685546875, "logps/chosen": -752.5, "logps/rejected": -885.5, "loss": 0.6458, "rewards/accuracies": 0.78125, "rewards/chosen": 0.32861328125, "rewards/margins": 5.25, "rewards/rejected": -4.9140625, "step": 3632 }, { "epoch": 0.6864755066370636, "grad_norm": 2.6244417950544516, "learning_rate": 3.441059130230858e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.484375, "logps/chosen": -1171.0, "logps/rejected": -1269.0, "loss": 0.5197, "rewards/accuracies": 0.6875, "rewards/chosen": 2.669921875, "rewards/margins": 4.5625, "rewards/rejected": -1.8984375, "step": 3633 }, { "epoch": 0.6866644621852709, "grad_norm": 1.3450402266079584, "learning_rate": 3.4384203600845044e-07, "logits/chosen": 3.24609375, "logits/rejected": 2.8671875, "logps/chosen": -775.0, "logps/rejected": -800.0, "loss": 0.4608, "rewards/accuracies": 0.9375, "rewards/chosen": 1.501922607421875, "rewards/margins": 5.8203125, "rewards/rejected": -4.3125, "step": 3634 }, { "epoch": 0.6868534177334782, "grad_norm": 1.6641661438064306, "learning_rate": 3.43578248682712e-07, "logits/chosen": 3.53125, "logits/rejected": 3.375, "logps/chosen": -1235.0, "logps/rejected": -1301.0, "loss": 0.446, "rewards/accuracies": 0.875, "rewards/chosen": 2.50390625, "rewards/margins": 7.90625, "rewards/rejected": -5.4140625, "step": 3635 }, { "epoch": 0.6870423732816855, "grad_norm": 1.577370740432624, "learning_rate": 3.4331455116063113e-07, "logits/chosen": 2.85546875, "logits/rejected": 2.7890625, "logps/chosen": -997.5, "logps/rejected": -1968.0, "loss": 0.4931, "rewards/accuracies": 0.875, "rewards/chosen": 1.171875, "rewards/margins": 8.2265625, "rewards/rejected": -7.05078125, "step": 3636 }, { "epoch": 0.6872313288298928, "grad_norm": 2.1868691654842385, "learning_rate": 3.430509435569293e-07, "logits/chosen": 2.099609375, "logits/rejected": 2.359375, "logps/chosen": -834.0, "logps/rejected": -843.0, "loss": 0.5498, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8338623046875, "rewards/margins": 4.21484375, "rewards/rejected": -3.375, "step": 3637 }, { "epoch": 0.6874202843781001, "grad_norm": 2.6638819523978783, "learning_rate": 3.427874259862885e-07, "logits/chosen": 3.25, "logits/rejected": 2.90625, "logps/chosen": -1058.5, "logps/rejected": -1798.0, "loss": 0.4765, "rewards/accuracies": 0.875, "rewards/chosen": 1.439208984375, "rewards/margins": 9.2265625, "rewards/rejected": -7.78125, "step": 3638 }, { "epoch": 0.6876092399263073, "grad_norm": 3.0312804668852893, "learning_rate": 3.425239985633524e-07, "logits/chosen": 3.1875, "logits/rejected": 2.99609375, "logps/chosen": -819.0, "logps/rejected": -926.0, "loss": 0.6338, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4306640625, "rewards/margins": 4.41796875, "rewards/rejected": -3.984375, "step": 3639 }, { "epoch": 0.6877981954745146, "grad_norm": 3.613931049634853, "learning_rate": 3.4226066140272463e-07, "logits/chosen": 3.28515625, "logits/rejected": 2.859375, "logps/chosen": -849.0, "logps/rejected": -871.0, "loss": 0.6742, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37994384765625, "rewards/margins": 6.31640625, "rewards/rejected": -5.93359375, "step": 3640 }, { "epoch": 0.6879871510227219, "grad_norm": 3.5960387077897646, "learning_rate": 3.4199741461897e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.203125, "logps/chosen": -716.5, "logps/rejected": -823.0, "loss": 0.5797, "rewards/accuracies": 0.875, "rewards/chosen": 0.8896484375, "rewards/margins": 5.9609375, "rewards/rejected": -5.0859375, "step": 3641 }, { "epoch": 0.6881761065709292, "grad_norm": 2.3749716599004103, "learning_rate": 3.417342583266135e-07, "logits/chosen": 2.87890625, "logits/rejected": 2.5625, "logps/chosen": -684.0, "logps/rejected": -955.5, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": 0.25537109375, "rewards/margins": 3.58984375, "rewards/rejected": -3.337890625, "step": 3642 }, { "epoch": 0.6883650621191365, "grad_norm": 2.608248173504481, "learning_rate": 3.414711926401419e-07, "logits/chosen": 3.23828125, "logits/rejected": 2.67578125, "logps/chosen": -901.0, "logps/rejected": -859.0, "loss": 0.5608, "rewards/accuracies": 0.78125, "rewards/chosen": 1.291259765625, "rewards/margins": 4.8984375, "rewards/rejected": -3.61328125, "step": 3643 }, { "epoch": 0.6885540176673438, "grad_norm": 2.5986346177185102, "learning_rate": 3.4120821767400086e-07, "logits/chosen": 2.66015625, "logits/rejected": 2.7578125, "logps/chosen": -726.5, "logps/rejected": -722.0, "loss": 0.5078, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4306640625, "rewards/margins": 5.26953125, "rewards/rejected": -3.84765625, "step": 3644 }, { "epoch": 0.688742973215551, "grad_norm": 1.8421891913720019, "learning_rate": 3.409453335425981e-07, "logits/chosen": 2.615234375, "logits/rejected": 2.39453125, "logps/chosen": -908.0, "logps/rejected": -1486.0, "loss": 0.4572, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7724609375, "rewards/margins": 6.62890625, "rewards/rejected": -5.8359375, "step": 3645 }, { "epoch": 0.6889319287637583, "grad_norm": 2.434696646208324, "learning_rate": 3.4068254036030093e-07, "logits/chosen": 3.2109375, "logits/rejected": 2.62109375, "logps/chosen": -749.0, "logps/rejected": -897.0, "loss": 0.518, "rewards/accuracies": 0.875, "rewards/chosen": 0.57080078125, "rewards/margins": 5.240234375, "rewards/rejected": -4.671875, "step": 3646 }, { "epoch": 0.6891208843119656, "grad_norm": 1.6235749944969096, "learning_rate": 3.404198382414373e-07, "logits/chosen": 2.771484375, "logits/rejected": 2.892578125, "logps/chosen": -717.0, "logps/rejected": -1156.0, "loss": 0.5272, "rewards/accuracies": 0.84375, "rewards/chosen": 1.03759765625, "rewards/margins": 7.9765625, "rewards/rejected": -6.9296875, "step": 3647 }, { "epoch": 0.6893098398601729, "grad_norm": 3.0239372313145942, "learning_rate": 3.4015722730029605e-07, "logits/chosen": 4.13671875, "logits/rejected": 4.19140625, "logps/chosen": -518.0, "logps/rejected": -1695.0, "loss": 0.5384, "rewards/accuracies": 0.84375, "rewards/chosen": 0.884765625, "rewards/margins": 7.7890625, "rewards/rejected": -6.890625, "step": 3648 }, { "epoch": 0.6894987954083802, "grad_norm": 2.5629077036668377, "learning_rate": 3.398947076511254e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.865234375, "logps/chosen": -1050.0, "logps/rejected": -1057.5, "loss": 0.5399, "rewards/accuracies": 0.8125, "rewards/chosen": 0.867462158203125, "rewards/margins": 5.671875, "rewards/rejected": -4.80859375, "step": 3649 }, { "epoch": 0.6896877509565874, "grad_norm": 3.2055353852328756, "learning_rate": 3.3963227940813455e-07, "logits/chosen": 2.49609375, "logits/rejected": 2.30078125, "logps/chosen": -892.0, "logps/rejected": -1647.0, "loss": 0.6565, "rewards/accuracies": 0.78125, "rewards/chosen": 0.573974609375, "rewards/margins": 5.88671875, "rewards/rejected": -5.30859375, "step": 3650 }, { "epoch": 0.6898767065047947, "grad_norm": 2.44178913841024, "learning_rate": 3.393699426854929e-07, "logits/chosen": 2.73828125, "logits/rejected": 2.6826171875, "logps/chosen": -878.0, "logps/rejected": -1930.0, "loss": 0.5633, "rewards/accuracies": 0.78125, "rewards/chosen": 0.98095703125, "rewards/margins": 7.38671875, "rewards/rejected": -6.39453125, "step": 3651 }, { "epoch": 0.690065662053002, "grad_norm": 2.371339880889141, "learning_rate": 3.391076975973294e-07, "logits/chosen": 2.009765625, "logits/rejected": 2.056640625, "logps/chosen": -545.0, "logps/rejected": -701.0, "loss": 0.6086, "rewards/accuracies": 0.78125, "rewards/chosen": -0.517822265625, "rewards/margins": 5.11328125, "rewards/rejected": -5.640625, "step": 3652 }, { "epoch": 0.6902546176012093, "grad_norm": 1.8445245261564713, "learning_rate": 3.3884554425773447e-07, "logits/chosen": 4.0234375, "logits/rejected": 3.33984375, "logps/chosen": -1062.5, "logps/rejected": -936.5, "loss": 0.5214, "rewards/accuracies": 0.75, "rewards/chosen": 1.18408203125, "rewards/margins": 6.2578125, "rewards/rejected": -5.0703125, "step": 3653 }, { "epoch": 0.6904435731494166, "grad_norm": 3.8325079918544556, "learning_rate": 3.3858348278075684e-07, "logits/chosen": 2.798828125, "logits/rejected": 2.5546875, "logps/chosen": -880.0, "logps/rejected": -1353.0, "loss": 0.6044, "rewards/accuracies": 0.75, "rewards/chosen": 0.561279296875, "rewards/margins": 5.4375, "rewards/rejected": -4.875, "step": 3654 }, { "epoch": 0.6906325286976239, "grad_norm": 1.904972680107956, "learning_rate": 3.383215132804068e-07, "logits/chosen": 2.63671875, "logits/rejected": 2.064453125, "logps/chosen": -891.0, "logps/rejected": -925.0, "loss": 0.572, "rewards/accuracies": 0.78125, "rewards/chosen": 0.67578125, "rewards/margins": 5.048828125, "rewards/rejected": -4.359375, "step": 3655 }, { "epoch": 0.6908214842458311, "grad_norm": 2.182162656369195, "learning_rate": 3.380596358706538e-07, "logits/chosen": 2.33984375, "logits/rejected": 1.943359375, "logps/chosen": -911.0, "logps/rejected": -783.0, "loss": 0.5242, "rewards/accuracies": 0.8125, "rewards/chosen": 0.40478515625, "rewards/margins": 4.98046875, "rewards/rejected": -4.576171875, "step": 3656 }, { "epoch": 0.6910104397940384, "grad_norm": 2.6012804074164118, "learning_rate": 3.377978506654275e-07, "logits/chosen": 2.53515625, "logits/rejected": 2.15234375, "logps/chosen": -688.5, "logps/rejected": -876.0, "loss": 0.6378, "rewards/accuracies": 0.78125, "rewards/chosen": -0.031005859375, "rewards/margins": 4.2265625, "rewards/rejected": -4.26171875, "step": 3657 }, { "epoch": 0.6911993953422457, "grad_norm": 2.3107583280283293, "learning_rate": 3.3753615777861733e-07, "logits/chosen": 2.2841796875, "logits/rejected": 1.974609375, "logps/chosen": -1238.0, "logps/rejected": -996.0, "loss": 0.5571, "rewards/accuracies": 0.75, "rewards/chosen": 1.148193359375, "rewards/margins": 4.78515625, "rewards/rejected": -3.6328125, "step": 3658 }, { "epoch": 0.691388350890453, "grad_norm": 3.4449548855384537, "learning_rate": 3.372745573240725e-07, "logits/chosen": 2.25, "logits/rejected": 1.900390625, "logps/chosen": -546.0, "logps/rejected": -657.5, "loss": 0.687, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0888671875, "rewards/margins": 3.400390625, "rewards/rejected": -3.3125, "step": 3659 }, { "epoch": 0.6915773064386603, "grad_norm": 1.9161696088852782, "learning_rate": 3.370130494156025e-07, "logits/chosen": 2.7265625, "logits/rejected": 2.439453125, "logps/chosen": -852.0, "logps/rejected": -12152.0, "loss": 0.5879, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0751953125, "rewards/margins": -11.2734375, "rewards/rejected": 11.3984375, "step": 3660 }, { "epoch": 0.6917662619868676, "grad_norm": 3.055996414539332, "learning_rate": 3.367516341669759e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.65625, "logps/chosen": -686.0, "logps/rejected": -676.0, "loss": 0.5329, "rewards/accuracies": 0.84375, "rewards/chosen": 0.462646484375, "rewards/margins": 5.64453125, "rewards/rejected": -5.1875, "step": 3661 }, { "epoch": 0.6919552175350748, "grad_norm": 2.573808679605482, "learning_rate": 3.3649031169192145e-07, "logits/chosen": 1.802734375, "logits/rejected": 1.4375, "logps/chosen": -1057.5, "logps/rejected": -913.0, "loss": 0.5745, "rewards/accuracies": 0.875, "rewards/chosen": 1.288818359375, "rewards/margins": 5.12890625, "rewards/rejected": -3.8359375, "step": 3662 }, { "epoch": 0.6921441730832821, "grad_norm": 3.3854866150743486, "learning_rate": 3.3622908210412713e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.96484375, "logps/chosen": -910.0, "logps/rejected": -1451.0, "loss": 0.6901, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0389404296875, "rewards/margins": 5.5478515625, "rewards/rejected": -5.59765625, "step": 3663 }, { "epoch": 0.6923331286314894, "grad_norm": 2.965515038940123, "learning_rate": 3.3596794551724085e-07, "logits/chosen": 3.8671875, "logits/rejected": 2.7421875, "logps/chosen": -825.0, "logps/rejected": -669.0, "loss": 0.5344, "rewards/accuracies": 0.84375, "rewards/chosen": 0.61376953125, "rewards/margins": 5.25, "rewards/rejected": -4.6484375, "step": 3664 }, { "epoch": 0.6925220841796967, "grad_norm": 1.9750993243397823, "learning_rate": 3.3570690204486963e-07, "logits/chosen": 1.951171875, "logits/rejected": 2.359375, "logps/chosen": -631.0, "logps/rejected": -1201.0, "loss": 0.585, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4332275390625, "rewards/margins": 4.19140625, "rewards/rejected": -3.759765625, "step": 3665 }, { "epoch": 0.692711039727904, "grad_norm": 3.807492329666491, "learning_rate": 3.354459518005807e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.505859375, "logps/chosen": -668.0, "logps/rejected": -715.5, "loss": 0.6662, "rewards/accuracies": 0.71875, "rewards/chosen": 0.20562744140625, "rewards/margins": 4.30859375, "rewards/rejected": -4.09765625, "step": 3666 }, { "epoch": 0.6928999952761113, "grad_norm": 1.7939779708556405, "learning_rate": 3.3518509489790005e-07, "logits/chosen": 3.162109375, "logits/rejected": 2.5400390625, "logps/chosen": -568.5, "logps/rejected": -626.0, "loss": 0.6136, "rewards/accuracies": 0.75, "rewards/chosen": -0.29541015625, "rewards/margins": 3.7099609375, "rewards/rejected": -4.0078125, "step": 3667 }, { "epoch": 0.6930889508243185, "grad_norm": 1.5880069421403193, "learning_rate": 3.3492433145031344e-07, "logits/chosen": 3.40625, "logits/rejected": 3.2734375, "logps/chosen": -654.75, "logps/rejected": -763.25, "loss": 0.6223, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1707763671875, "rewards/margins": 4.806640625, "rewards/rejected": -4.98828125, "step": 3668 }, { "epoch": 0.6932779063725258, "grad_norm": 3.9968415753922404, "learning_rate": 3.346636615712658e-07, "logits/chosen": 2.685546875, "logits/rejected": 2.5546875, "logps/chosen": -746.0, "logps/rejected": -868.0, "loss": 0.6341, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8994140625, "rewards/margins": 4.6875, "rewards/rejected": -3.78125, "step": 3669 }, { "epoch": 0.6934668619207331, "grad_norm": 1.991909364473993, "learning_rate": 3.344030853741612e-07, "logits/chosen": 3.35546875, "logits/rejected": 3.28125, "logps/chosen": -852.0, "logps/rejected": -740.0, "loss": 0.5198, "rewards/accuracies": 0.875, "rewards/chosen": 1.11865234375, "rewards/margins": 5.53125, "rewards/rejected": -4.40625, "step": 3670 }, { "epoch": 0.6936558174689404, "grad_norm": 2.1145902874878324, "learning_rate": 3.3414260297236366e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.953125, "logps/chosen": -927.0, "logps/rejected": -929.0, "loss": 0.5757, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4365234375, "rewards/margins": 4.78125, "rewards/rejected": -4.3515625, "step": 3671 }, { "epoch": 0.6938447730171478, "grad_norm": 2.208554911170181, "learning_rate": 3.338822144791953e-07, "logits/chosen": 2.431640625, "logits/rejected": 2.130859375, "logps/chosen": -1281.0, "logps/rejected": -2208.0, "loss": 0.538, "rewards/accuracies": 0.84375, "rewards/chosen": 0.78759765625, "rewards/margins": 7.15625, "rewards/rejected": -6.37890625, "step": 3672 }, { "epoch": 0.694033728565355, "grad_norm": 2.5824172830750864, "learning_rate": 3.336219200079384e-07, "logits/chosen": 2.46875, "logits/rejected": 2.1220703125, "logps/chosen": -599.5, "logps/rejected": -629.5, "loss": 0.582, "rewards/accuracies": 0.875, "rewards/chosen": 0.4189453125, "rewards/margins": 4.22265625, "rewards/rejected": -3.80078125, "step": 3673 }, { "epoch": 0.6942226841135622, "grad_norm": 2.519608536413446, "learning_rate": 3.333617196718337e-07, "logits/chosen": 2.537109375, "logits/rejected": 1.9189453125, "logps/chosen": -863.5, "logps/rejected": -685.0, "loss": 0.5883, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8759765625, "rewards/margins": 4.0078125, "rewards/rejected": -3.125, "step": 3674 }, { "epoch": 0.6944116396617696, "grad_norm": 1.4567023893463318, "learning_rate": 3.3310161358408127e-07, "logits/chosen": 2.08984375, "logits/rejected": 1.74853515625, "logps/chosen": -1139.0, "logps/rejected": -1171.0, "loss": 0.4639, "rewards/accuracies": 0.90625, "rewards/chosen": 1.45556640625, "rewards/margins": 6.296875, "rewards/rejected": -4.83203125, "step": 3675 }, { "epoch": 0.6946005952099769, "grad_norm": 2.1065942210812056, "learning_rate": 3.328416018578403e-07, "logits/chosen": 2.1796875, "logits/rejected": 2.3857421875, "logps/chosen": -682.0, "logps/rejected": -705.0, "loss": 0.6378, "rewards/accuracies": 0.78125, "rewards/chosen": 0.92578125, "rewards/margins": 4.26171875, "rewards/rejected": -3.34765625, "step": 3676 }, { "epoch": 0.6947895507581842, "grad_norm": 2.490793192381455, "learning_rate": 3.325816846062282e-07, "logits/chosen": 1.6513671875, "logits/rejected": 2.0068359375, "logps/chosen": -3049.0, "logps/rejected": -1642.0, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": 69.961181640625, "rewards/margins": 75.375, "rewards/rejected": -5.5546875, "step": 3677 }, { "epoch": 0.6949785063063915, "grad_norm": 1.3625240901689957, "learning_rate": 3.323218619423224e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.390625, "logps/chosen": -975.0, "logps/rejected": -996.5, "loss": 0.5622, "rewards/accuracies": 0.71875, "rewards/chosen": 1.4443359375, "rewards/margins": 5.48828125, "rewards/rejected": -4.04296875, "step": 3678 }, { "epoch": 0.6951674618545987, "grad_norm": 2.374378596056838, "learning_rate": 3.3206213397915826e-07, "logits/chosen": 2.3330078125, "logits/rejected": 2.189453125, "logps/chosen": -646.0, "logps/rejected": -798.0, "loss": 0.5406, "rewards/accuracies": 0.84375, "rewards/chosen": 0.27197265625, "rewards/margins": 4.9453125, "rewards/rejected": -4.6796875, "step": 3679 }, { "epoch": 0.695356417402806, "grad_norm": 1.843294753981675, "learning_rate": 3.3180250082973014e-07, "logits/chosen": 2.23187255859375, "logits/rejected": 2.31982421875, "logps/chosen": -560.0, "logps/rejected": -956.5, "loss": 0.5592, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3349609375, "rewards/margins": 5.2890625, "rewards/rejected": -3.953125, "step": 3680 }, { "epoch": 0.6955453729510133, "grad_norm": 2.0836900203625683, "learning_rate": 3.31542962606992e-07, "logits/chosen": 3.57421875, "logits/rejected": 3.64453125, "logps/chosen": -784.0, "logps/rejected": -998.0, "loss": 0.6355, "rewards/accuracies": 0.71875, "rewards/chosen": 1.322265625, "rewards/margins": 4.1484375, "rewards/rejected": -2.822265625, "step": 3681 }, { "epoch": 0.6957343284992206, "grad_norm": 2.8325583073258254, "learning_rate": 3.312835194238548e-07, "logits/chosen": 2.470703125, "logits/rejected": 2.6328125, "logps/chosen": -853.0, "logps/rejected": -741.0, "loss": 0.5564, "rewards/accuracies": 0.90625, "rewards/chosen": 0.91015625, "rewards/margins": 4.9296875, "rewards/rejected": -4.015625, "step": 3682 }, { "epoch": 0.6959232840474279, "grad_norm": 2.067687464584347, "learning_rate": 3.3102417139318983e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.640625, "logps/chosen": -925.5, "logps/rejected": -1038.0, "loss": 0.6713, "rewards/accuracies": 0.6875, "rewards/chosen": 1.70556640625, "rewards/margins": 4.4765625, "rewards/rejected": -2.7734375, "step": 3683 }, { "epoch": 0.6961122395956352, "grad_norm": 2.5858414692407665, "learning_rate": 3.307649186278261e-07, "logits/chosen": 2.970703125, "logits/rejected": 3.251953125, "logps/chosen": -989.5, "logps/rejected": -1322.0, "loss": 0.608, "rewards/accuracies": 0.8125, "rewards/chosen": 1.412109375, "rewards/margins": 6.9296875, "rewards/rejected": -5.515625, "step": 3684 }, { "epoch": 0.6963011951438424, "grad_norm": 2.9280974715050445, "learning_rate": 3.305057612405513e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.0, "logps/chosen": -846.0, "logps/rejected": -849.0, "loss": 0.6334, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7176513671875, "rewards/margins": 3.548828125, "rewards/rejected": -2.83203125, "step": 3685 }, { "epoch": 0.6964901506920497, "grad_norm": 2.1237536255876335, "learning_rate": 3.3024669934411166e-07, "logits/chosen": 2.21875, "logits/rejected": 2.0791015625, "logps/chosen": -802.0, "logps/rejected": -898.0, "loss": 0.6272, "rewards/accuracies": 0.75, "rewards/chosen": 1.1484375, "rewards/margins": 3.650390625, "rewards/rejected": -2.498046875, "step": 3686 }, { "epoch": 0.696679106240257, "grad_norm": 3.09336629754543, "learning_rate": 3.299877330512121e-07, "logits/chosen": 1.418548583984375, "logits/rejected": 1.7001953125, "logps/chosen": -648.5, "logps/rejected": -884.0, "loss": 0.558, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4322662353515625, "rewards/margins": 5.453125, "rewards/rejected": -5.0234375, "step": 3687 }, { "epoch": 0.6968680617884643, "grad_norm": 10.229923085370205, "learning_rate": 3.297288624745156e-07, "logits/chosen": 3.16015625, "logits/rejected": 2.498046875, "logps/chosen": -972.0, "logps/rejected": -879.5, "loss": 0.5698, "rewards/accuracies": 0.875, "rewards/chosen": 1.390625, "rewards/margins": 4.91015625, "rewards/rejected": -3.53515625, "step": 3688 }, { "epoch": 0.6970570173366716, "grad_norm": 1.9960623764855585, "learning_rate": 3.2947008772664373e-07, "logits/chosen": 2.7578125, "logits/rejected": 2.57421875, "logps/chosen": -665.0, "logps/rejected": -749.5, "loss": 0.5132, "rewards/accuracies": 0.84375, "rewards/chosen": 0.57275390625, "rewards/margins": 4.43359375, "rewards/rejected": -3.85546875, "step": 3689 }, { "epoch": 0.6972459728848789, "grad_norm": 1.7656054365037965, "learning_rate": 3.2921140892017625e-07, "logits/chosen": 2.9375, "logits/rejected": 2.7734375, "logps/chosen": -772.0, "logps/rejected": -825.5, "loss": 0.5649, "rewards/accuracies": 0.78125, "rewards/chosen": 1.333984375, "rewards/margins": 4.6171875, "rewards/rejected": -3.2890625, "step": 3690 }, { "epoch": 0.6974349284330861, "grad_norm": 1.6712761144669475, "learning_rate": 3.28952826167651e-07, "logits/chosen": 2.5, "logits/rejected": 2.25390625, "logps/chosen": -751.5, "logps/rejected": -698.0, "loss": 0.5813, "rewards/accuracies": 0.75, "rewards/chosen": 0.792236328125, "rewards/margins": 5.50390625, "rewards/rejected": -4.7109375, "step": 3691 }, { "epoch": 0.6976238839812934, "grad_norm": 2.1563364754168615, "learning_rate": 3.2869433958156486e-07, "logits/chosen": 2.9296875, "logits/rejected": 2.927734375, "logps/chosen": -598.75, "logps/rejected": -897.5, "loss": 0.7038, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1796875, "rewards/margins": 4.3125, "rewards/rejected": -4.1328125, "step": 3692 }, { "epoch": 0.6978128395295007, "grad_norm": 1.737690766589768, "learning_rate": 3.2843594927437146e-07, "logits/chosen": 2.7451171875, "logits/rejected": 3.03125, "logps/chosen": -569.0, "logps/rejected": -652.5, "loss": 0.7655, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6669921875, "rewards/margins": 2.541015625, "rewards/rejected": -3.205078125, "step": 3693 }, { "epoch": 0.698001795077708, "grad_norm": 1.9788773393110688, "learning_rate": 3.281776553584839e-07, "logits/chosen": 2.2509765625, "logits/rejected": 1.7880859375, "logps/chosen": -1140.5, "logps/rejected": -836.0, "loss": 0.5204, "rewards/accuracies": 0.9375, "rewards/chosen": 1.480224609375, "rewards/margins": 5.66796875, "rewards/rejected": -4.17578125, "step": 3694 }, { "epoch": 0.6981907506259153, "grad_norm": 2.313358632812381, "learning_rate": 3.279194579462727e-07, "logits/chosen": 2.162109375, "logits/rejected": 2.4140625, "logps/chosen": -509.0, "logps/rejected": -537.5, "loss": 0.6246, "rewards/accuracies": 0.75, "rewards/chosen": 0.558349609375, "rewards/margins": 3.17578125, "rewards/rejected": -2.611328125, "step": 3695 }, { "epoch": 0.6983797061741225, "grad_norm": 2.2270950050887115, "learning_rate": 3.2766135715006614e-07, "logits/chosen": 3.15625, "logits/rejected": 2.16015625, "logps/chosen": -807.5, "logps/rejected": -1002.5, "loss": 0.5345, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0479736328125, "rewards/margins": 7.134765625, "rewards/rejected": -7.19921875, "step": 3696 }, { "epoch": 0.6985686617223298, "grad_norm": 1.6541480233746717, "learning_rate": 3.274033530821516e-07, "logits/chosen": 2.08349609375, "logits/rejected": 1.880859375, "logps/chosen": -440.5, "logps/rejected": -1688.0, "loss": 0.7143, "rewards/accuracies": 0.65625, "rewards/chosen": -0.132568359375, "rewards/margins": 4.68359375, "rewards/rejected": -4.8203125, "step": 3697 }, { "epoch": 0.6987576172705371, "grad_norm": 2.377026944622249, "learning_rate": 3.2714544585477267e-07, "logits/chosen": 2.45703125, "logits/rejected": 1.873046875, "logps/chosen": -782.5, "logps/rejected": -833.0, "loss": 0.5072, "rewards/accuracies": 0.84375, "rewards/chosen": 1.498046875, "rewards/margins": 5.140625, "rewards/rejected": -3.65234375, "step": 3698 }, { "epoch": 0.6989465728187444, "grad_norm": 2.8720410155945215, "learning_rate": 3.268876355801323e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.94140625, "logps/chosen": -917.5, "logps/rejected": -1275.0, "loss": 0.6075, "rewards/accuracies": 0.71875, "rewards/chosen": 0.97412109375, "rewards/margins": 5.25, "rewards/rejected": -4.27734375, "step": 3699 }, { "epoch": 0.6991355283669517, "grad_norm": 1.999619967587976, "learning_rate": 3.2662992237039056e-07, "logits/chosen": 2.865234375, "logits/rejected": 2.5, "logps/chosen": -901.0, "logps/rejected": -991.0, "loss": 0.4145, "rewards/accuracies": 0.875, "rewards/chosen": 1.13671875, "rewards/margins": 7.3125, "rewards/rejected": -6.1640625, "step": 3700 }, { "epoch": 0.699324483915159, "grad_norm": 2.2836527794965016, "learning_rate": 3.263723063376652e-07, "logits/chosen": 2.29296875, "logits/rejected": 1.990234375, "logps/chosen": -714.0, "logps/rejected": -735.5, "loss": 0.6059, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5294189453125, "rewards/margins": 5.3671875, "rewards/rejected": -4.82421875, "step": 3701 }, { "epoch": 0.6995134394633662, "grad_norm": 2.7395450431464865, "learning_rate": 3.2611478759403257e-07, "logits/chosen": 2.560546875, "logits/rejected": 2.3515625, "logps/chosen": -962.0, "logps/rejected": -639.0, "loss": 0.6334, "rewards/accuracies": 0.75, "rewards/chosen": 0.51708984375, "rewards/margins": 3.9765625, "rewards/rejected": -3.451171875, "step": 3702 }, { "epoch": 0.6997023950115735, "grad_norm": 3.230195134756048, "learning_rate": 3.258573662515252e-07, "logits/chosen": 2.38671875, "logits/rejected": 2.23828125, "logps/chosen": -580.0, "logps/rejected": -588.0, "loss": 0.7254, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07373046875, "rewards/margins": 3.685546875, "rewards/rejected": -3.60546875, "step": 3703 }, { "epoch": 0.6998913505597808, "grad_norm": 2.134510540873606, "learning_rate": 3.2560004242213464e-07, "logits/chosen": 2.067138671875, "logits/rejected": 1.77490234375, "logps/chosen": -1170.0, "logps/rejected": -1185.0, "loss": 0.473, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5732421875, "rewards/margins": 7.0859375, "rewards/rejected": -5.51171875, "step": 3704 }, { "epoch": 0.7000803061079881, "grad_norm": 1.1824606613064552, "learning_rate": 3.2534281621780936e-07, "logits/chosen": 2.509765625, "logits/rejected": 2.326171875, "logps/chosen": -832.5, "logps/rejected": -1152.0, "loss": 0.5989, "rewards/accuracies": 0.8125, "rewards/chosen": 0.99609375, "rewards/margins": 5.9453125, "rewards/rejected": -4.94140625, "step": 3705 }, { "epoch": 0.7002692616561954, "grad_norm": 1.8010367107864234, "learning_rate": 3.250856877504554e-07, "logits/chosen": 1.9765625, "logits/rejected": 2.0048828125, "logps/chosen": -1085.5, "logps/rejected": -926.5, "loss": 0.5999, "rewards/accuracies": 0.75, "rewards/chosen": 0.88671875, "rewards/margins": 5.138671875, "rewards/rejected": -4.2578125, "step": 3706 }, { "epoch": 0.7004582172044027, "grad_norm": 2.3025867363422248, "learning_rate": 3.248286571319364e-07, "logits/chosen": 3.330078125, "logits/rejected": 2.787109375, "logps/chosen": -790.5, "logps/rejected": -678.5, "loss": 0.5926, "rewards/accuracies": 0.71875, "rewards/chosen": 0.994140625, "rewards/margins": 4.833984375, "rewards/rejected": -3.84375, "step": 3707 }, { "epoch": 0.7006471727526099, "grad_norm": 2.4881350987087094, "learning_rate": 3.245717244740732e-07, "logits/chosen": 2.9833984375, "logits/rejected": 2.8685302734375, "logps/chosen": -668.5, "logps/rejected": -865.0, "loss": 0.6205, "rewards/accuracies": 0.75, "rewards/chosen": 0.3857421875, "rewards/margins": 4.5859375, "rewards/rejected": -4.20703125, "step": 3708 }, { "epoch": 0.7008361283008172, "grad_norm": 1.8122939494211465, "learning_rate": 3.243148898886444e-07, "logits/chosen": 2.98828125, "logits/rejected": 3.375, "logps/chosen": -426.5, "logps/rejected": -1193.0, "loss": 0.6189, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13818359375, "rewards/margins": 7.28515625, "rewards/rejected": -7.12890625, "step": 3709 }, { "epoch": 0.7010250838490245, "grad_norm": 1.9053089408763468, "learning_rate": 3.2405815348738575e-07, "logits/chosen": 2.638671875, "logits/rejected": 2.19921875, "logps/chosen": -1510.5, "logps/rejected": -1055.0, "loss": 0.586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3291015625, "rewards/margins": 3.43359375, "rewards/rejected": -4.76171875, "step": 3710 }, { "epoch": 0.7012140393972318, "grad_norm": 4.565417409656437, "learning_rate": 3.238015153819901e-07, "logits/chosen": 3.078125, "logits/rejected": 2.93359375, "logps/chosen": -973.0, "logps/rejected": -1206.0, "loss": 0.5535, "rewards/accuracies": 0.875, "rewards/chosen": 1.121826171875, "rewards/margins": 5.765625, "rewards/rejected": -4.64453125, "step": 3711 }, { "epoch": 0.7014029949454391, "grad_norm": 2.2133350920323043, "learning_rate": 3.235449756841079e-07, "logits/chosen": 2.66015625, "logits/rejected": 2.2568359375, "logps/chosen": -1228.5, "logps/rejected": -1258.0, "loss": 0.4915, "rewards/accuracies": 0.875, "rewards/chosen": 2.021484375, "rewards/margins": 6.875, "rewards/rejected": -4.84765625, "step": 3712 }, { "epoch": 0.7015919504936464, "grad_norm": 1.5217427574259905, "learning_rate": 3.2328853450534643e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.876953125, "logps/chosen": -612.0, "logps/rejected": -591.5, "loss": 0.6252, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2939453125, "rewards/margins": 3.65625, "rewards/rejected": -3.36328125, "step": 3713 }, { "epoch": 0.7017809060418536, "grad_norm": 2.1445662868165147, "learning_rate": 3.2303219195727006e-07, "logits/chosen": 1.998046875, "logits/rejected": 1.711181640625, "logps/chosen": -880.5, "logps/rejected": -797.0, "loss": 0.5042, "rewards/accuracies": 0.875, "rewards/chosen": 1.315673828125, "rewards/margins": 5.69140625, "rewards/rejected": -4.3671875, "step": 3714 }, { "epoch": 0.7019698615900609, "grad_norm": 2.6000308165211123, "learning_rate": 3.2277594815140095e-07, "logits/chosen": 2.431640625, "logits/rejected": 2.4296875, "logps/chosen": -1010.0, "logps/rejected": -1783.0, "loss": 0.5369, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47509765625, "rewards/margins": 9.0390625, "rewards/rejected": -8.578125, "step": 3715 }, { "epoch": 0.7021588171382682, "grad_norm": 2.6181247831819308, "learning_rate": 3.225198031992176e-07, "logits/chosen": 3.39453125, "logits/rejected": 2.80078125, "logps/chosen": -704.5, "logps/rejected": -916.5, "loss": 0.4244, "rewards/accuracies": 0.90625, "rewards/chosen": 1.00537109375, "rewards/margins": 6.9765625, "rewards/rejected": -5.9609375, "step": 3716 }, { "epoch": 0.7023477726864755, "grad_norm": 1.9455260900582099, "learning_rate": 3.222637572121556e-07, "logits/chosen": 3.46484375, "logits/rejected": 3.81640625, "logps/chosen": -998.0, "logps/rejected": -1009.0, "loss": 0.5809, "rewards/accuracies": 0.6875, "rewards/chosen": 0.923828125, "rewards/margins": 6.45703125, "rewards/rejected": -5.53515625, "step": 3717 }, { "epoch": 0.7025367282346828, "grad_norm": 2.6020389445665018, "learning_rate": 3.220078103016078e-07, "logits/chosen": 2.978515625, "logits/rejected": 2.826171875, "logps/chosen": -1034.0, "logps/rejected": -1282.0, "loss": 0.5971, "rewards/accuracies": 0.75, "rewards/chosen": 0.7271728515625, "rewards/margins": 4.6875, "rewards/rejected": -3.953125, "step": 3718 }, { "epoch": 0.70272568378289, "grad_norm": 2.4190425702053924, "learning_rate": 3.2175196257892346e-07, "logits/chosen": 3.435546875, "logits/rejected": 3.171875, "logps/chosen": -871.0, "logps/rejected": -791.0, "loss": 0.4991, "rewards/accuracies": 0.90625, "rewards/chosen": 1.435546875, "rewards/margins": 5.84375, "rewards/rejected": -4.4140625, "step": 3719 }, { "epoch": 0.7029146393310973, "grad_norm": 2.2035708968385674, "learning_rate": 3.214962141554094e-07, "logits/chosen": 3.78125, "logits/rejected": 3.57421875, "logps/chosen": -871.0, "logps/rejected": -775.5, "loss": 0.5347, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2425537109375, "rewards/margins": 5.3359375, "rewards/rejected": -5.0859375, "step": 3720 }, { "epoch": 0.7031035948793046, "grad_norm": 3.5828991015591103, "learning_rate": 3.2124056514232834e-07, "logits/chosen": 3.1875, "logits/rejected": 2.9140625, "logps/chosen": -985.0, "logps/rejected": -971.0, "loss": 0.5709, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4091796875, "rewards/margins": 5.04296875, "rewards/rejected": -3.640625, "step": 3721 }, { "epoch": 0.7032925504275119, "grad_norm": 2.2769118228653653, "learning_rate": 3.209850156509004e-07, "logits/chosen": 2.431640625, "logits/rejected": 1.9609375, "logps/chosen": -630.0, "logps/rejected": -11889.0, "loss": 0.7378, "rewards/accuracies": 0.6875, "rewards/chosen": 1.014892578125, "rewards/margins": -48.3984375, "rewards/rejected": 49.34375, "step": 3722 }, { "epoch": 0.7034815059757192, "grad_norm": 2.1583193420005777, "learning_rate": 3.2072956579230265e-07, "logits/chosen": 1.82421875, "logits/rejected": 1.332763671875, "logps/chosen": -1521.0, "logps/rejected": -932.5, "loss": 0.4758, "rewards/accuracies": 0.875, "rewards/chosen": 0.5478515625, "rewards/margins": 6.328125, "rewards/rejected": -5.7734375, "step": 3723 }, { "epoch": 0.7036704615239265, "grad_norm": 2.418416337776677, "learning_rate": 3.204742156776677e-07, "logits/chosen": 3.5625, "logits/rejected": 3.087890625, "logps/chosen": -763.0, "logps/rejected": -687.5, "loss": 0.7579, "rewards/accuracies": 0.75, "rewards/chosen": -0.69287109375, "rewards/margins": 3.20361328125, "rewards/rejected": -3.892578125, "step": 3724 }, { "epoch": 0.7038594170721337, "grad_norm": 2.4834631725300014, "learning_rate": 3.20218965418086e-07, "logits/chosen": 2.101806640625, "logits/rejected": 2.7890625, "logps/chosen": -469.75, "logps/rejected": -1329.5, "loss": 0.708, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3564453125, "rewards/margins": 5.234375, "rewards/rejected": -5.59765625, "step": 3725 }, { "epoch": 0.704048372620341, "grad_norm": 1.82593956887901, "learning_rate": 3.199638151246037e-07, "logits/chosen": 1.9749755859375, "logits/rejected": 1.95361328125, "logps/chosen": -540.0, "logps/rejected": -820.5, "loss": 0.5702, "rewards/accuracies": 0.9375, "rewards/chosen": 1.296875, "rewards/margins": 5.1015625, "rewards/rejected": -3.80859375, "step": 3726 }, { "epoch": 0.7042373281685483, "grad_norm": 2.6954917644324827, "learning_rate": 3.19708764908224e-07, "logits/chosen": 3.51171875, "logits/rejected": 3.5, "logps/chosen": -439.75, "logps/rejected": -750.5, "loss": 0.6385, "rewards/accuracies": 0.875, "rewards/chosen": 0.4345703125, "rewards/margins": 3.759765625, "rewards/rejected": -3.3359375, "step": 3727 }, { "epoch": 0.7044262837167556, "grad_norm": 2.733403583552215, "learning_rate": 3.1945381487990617e-07, "logits/chosen": 2.6328125, "logits/rejected": 3.16015625, "logps/chosen": -565.5, "logps/rejected": -782.0, "loss": 0.6298, "rewards/accuracies": 0.78125, "rewards/chosen": 0.54638671875, "rewards/margins": 3.85546875, "rewards/rejected": -3.296875, "step": 3728 }, { "epoch": 0.704615239264963, "grad_norm": 2.0811403074284796, "learning_rate": 3.191989651505661e-07, "logits/chosen": 3.38671875, "logits/rejected": 3.39453125, "logps/chosen": -689.0, "logps/rejected": -845.5, "loss": 0.6361, "rewards/accuracies": 0.78125, "rewards/chosen": 1.583984375, "rewards/margins": 4.90234375, "rewards/rejected": -3.307861328125, "step": 3729 }, { "epoch": 0.7048041948131702, "grad_norm": 3.074219377356746, "learning_rate": 3.1894421583107613e-07, "logits/chosen": 1.763671875, "logits/rejected": 1.525390625, "logps/chosen": -589.0, "logps/rejected": -757.5, "loss": 0.6201, "rewards/accuracies": 0.78125, "rewards/chosen": 0.53515625, "rewards/margins": 4.3505859375, "rewards/rejected": -3.8125, "step": 3730 }, { "epoch": 0.7049931503613774, "grad_norm": 2.5341101384044205, "learning_rate": 3.1868956703226467e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.89453125, "logps/chosen": -989.5, "logps/rejected": -1274.0, "loss": 0.5952, "rewards/accuracies": 0.78125, "rewards/chosen": 1.751953125, "rewards/margins": 6.46875, "rewards/rejected": -4.71484375, "step": 3731 }, { "epoch": 0.7051821059095847, "grad_norm": 2.292341912996544, "learning_rate": 3.184350188649167e-07, "logits/chosen": 3.169921875, "logits/rejected": 3.328125, "logps/chosen": -640.5, "logps/rejected": -881.0, "loss": 0.4942, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2333984375, "rewards/margins": 5.5859375, "rewards/rejected": -4.35546875, "step": 3732 }, { "epoch": 0.705371061457792, "grad_norm": 2.6859728463284527, "learning_rate": 3.1818057143977296e-07, "logits/chosen": 3.296875, "logits/rejected": 3.85546875, "logps/chosen": -751.5, "logps/rejected": -1884.0, "loss": 0.5562, "rewards/accuracies": 0.71875, "rewards/chosen": 1.630126953125, "rewards/margins": 8.890625, "rewards/rejected": -7.24609375, "step": 3733 }, { "epoch": 0.7055600170059994, "grad_norm": 3.3806559585143385, "learning_rate": 3.179262248675309e-07, "logits/chosen": 2.591796875, "logits/rejected": 2.3779296875, "logps/chosen": -719.0, "logps/rejected": -704.5, "loss": 0.5721, "rewards/accuracies": 0.875, "rewards/chosen": 0.73779296875, "rewards/margins": 4.02734375, "rewards/rejected": -3.294921875, "step": 3734 }, { "epoch": 0.7057489725542067, "grad_norm": 1.0904440292367772, "learning_rate": 3.1767197925884357e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.396484375, "logps/chosen": -798.5, "logps/rejected": -757.0, "loss": 0.425, "rewards/accuracies": 0.8125, "rewards/chosen": 1.9560546875, "rewards/margins": 6.12109375, "rewards/rejected": -4.166015625, "step": 3735 }, { "epoch": 0.705937928102414, "grad_norm": 3.741858074004889, "learning_rate": 3.174178347243207e-07, "logits/chosen": 2.626953125, "logits/rejected": 2.85546875, "logps/chosen": -766.5, "logps/rejected": -790.0, "loss": 0.5455, "rewards/accuracies": 0.8125, "rewards/chosen": 1.267333984375, "rewards/margins": 4.4609375, "rewards/rejected": -3.19921875, "step": 3736 }, { "epoch": 0.7061268836506212, "grad_norm": 1.9206921269412598, "learning_rate": 3.1716379137452753e-07, "logits/chosen": 3.66796875, "logits/rejected": 3.30078125, "logps/chosen": -880.5, "logps/rejected": -1410.5, "loss": 0.4694, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4697265625, "rewards/margins": 8.359375, "rewards/rejected": -6.87890625, "step": 3737 }, { "epoch": 0.7063158391988285, "grad_norm": 1.8146750747940181, "learning_rate": 3.1690984931998547e-07, "logits/chosen": 2.4375, "logits/rejected": 1.962890625, "logps/chosen": -762.0, "logps/rejected": -740.5, "loss": 0.535, "rewards/accuracies": 0.875, "rewards/chosen": 1.79296875, "rewards/margins": 5.15625, "rewards/rejected": -3.36328125, "step": 3738 }, { "epoch": 0.7065047947470358, "grad_norm": 1.9752784545104836, "learning_rate": 3.1665600867117197e-07, "logits/chosen": 1.669921875, "logits/rejected": 1.392578125, "logps/chosen": -971.0, "logps/rejected": -878.0, "loss": 0.4677, "rewards/accuracies": 0.9375, "rewards/chosen": 1.38916015625, "rewards/margins": 6.4765625, "rewards/rejected": -5.08203125, "step": 3739 }, { "epoch": 0.7066937502952431, "grad_norm": 1.3935991718092682, "learning_rate": 3.164022695385199e-07, "logits/chosen": 2.36328125, "logits/rejected": 1.591796875, "logps/chosen": -576.5, "logps/rejected": -730.0, "loss": 0.6162, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41650390625, "rewards/margins": 4.984375, "rewards/rejected": -4.578125, "step": 3740 }, { "epoch": 0.7068827058434504, "grad_norm": 4.81397507330893, "learning_rate": 3.161486320324188e-07, "logits/chosen": 2.982421875, "logits/rejected": 2.998046875, "logps/chosen": -773.5, "logps/rejected": -1023.0, "loss": 0.6513, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3291015625, "rewards/margins": 6.2265625, "rewards/rejected": -5.890625, "step": 3741 }, { "epoch": 0.7070716613916576, "grad_norm": 2.8544774380590496, "learning_rate": 3.158950962632133e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.68359375, "logps/chosen": -790.0, "logps/rejected": -941.0, "loss": 0.7422, "rewards/accuracies": 0.65625, "rewards/chosen": 0.44903564453125, "rewards/margins": 3.18359375, "rewards/rejected": -2.7421875, "step": 3742 }, { "epoch": 0.7072606169398649, "grad_norm": 2.191495232351462, "learning_rate": 3.1564166234120386e-07, "logits/chosen": 3.21875, "logits/rejected": 2.927734375, "logps/chosen": -728.0, "logps/rejected": -771.0, "loss": 0.7281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1094970703125, "rewards/margins": 3.369140625, "rewards/rejected": -3.48046875, "step": 3743 }, { "epoch": 0.7074495724880722, "grad_norm": 2.6703655028366264, "learning_rate": 3.1538833037664696e-07, "logits/chosen": 2.076171875, "logits/rejected": 2.62109375, "logps/chosen": -939.5, "logps/rejected": -1745.5, "loss": 0.6827, "rewards/accuracies": 0.78125, "rewards/chosen": 0.23046875, "rewards/margins": 6.740234375, "rewards/rejected": -6.509765625, "step": 3744 }, { "epoch": 0.7076385280362795, "grad_norm": 1.658721223542841, "learning_rate": 3.1513510047975423e-07, "logits/chosen": 2.734375, "logits/rejected": 2.759765625, "logps/chosen": -697.0, "logps/rejected": -842.0, "loss": 0.6227, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7353515625, "rewards/margins": 4.59765625, "rewards/rejected": -3.86328125, "step": 3745 }, { "epoch": 0.7078274835844868, "grad_norm": 2.443272543636628, "learning_rate": 3.1488197276069375e-07, "logits/chosen": 3.609375, "logits/rejected": 3.11328125, "logps/chosen": -1217.0, "logps/rejected": -733.5, "loss": 0.54, "rewards/accuracies": 0.8125, "rewards/chosen": 1.33740234375, "rewards/margins": 5.44921875, "rewards/rejected": -4.10546875, "step": 3746 }, { "epoch": 0.7080164391326941, "grad_norm": 1.886600026519524, "learning_rate": 3.1462894732958777e-07, "logits/chosen": 2.583984375, "logits/rejected": 2.4638671875, "logps/chosen": -1066.5, "logps/rejected": -1185.0, "loss": 0.6678, "rewards/accuracies": 0.75, "rewards/chosen": 1.255615234375, "rewards/margins": 4.3984375, "rewards/rejected": -3.15234375, "step": 3747 }, { "epoch": 0.7082053946809013, "grad_norm": 3.173199740177874, "learning_rate": 3.1437602429651555e-07, "logits/chosen": 2.8125, "logits/rejected": 2.900390625, "logps/chosen": -933.0, "logps/rejected": -1341.0, "loss": 0.6552, "rewards/accuracies": 0.71875, "rewards/chosen": 0.7288818359375, "rewards/margins": 4.22265625, "rewards/rejected": -3.498046875, "step": 3748 }, { "epoch": 0.7083943502291086, "grad_norm": 1.5218490089466978, "learning_rate": 3.141232037715108e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.14453125, "logps/chosen": -893.0, "logps/rejected": -1131.0, "loss": 0.5725, "rewards/accuracies": 0.84375, "rewards/chosen": 0.812255859375, "rewards/margins": 5.3828125, "rewards/rejected": -4.56640625, "step": 3749 }, { "epoch": 0.7085833057773159, "grad_norm": 3.2875239392736955, "learning_rate": 3.138704858645628e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.56640625, "logps/chosen": -901.0, "logps/rejected": -1193.0, "loss": 0.4695, "rewards/accuracies": 0.90625, "rewards/chosen": 1.13720703125, "rewards/margins": 5.95703125, "rewards/rejected": -4.8203125, "step": 3750 }, { "epoch": 0.7087722613255232, "grad_norm": 1.992653266427276, "learning_rate": 3.1361787068561687e-07, "logits/chosen": 2.548828125, "logits/rejected": 2.345703125, "logps/chosen": -1129.5, "logps/rejected": -2090.0, "loss": 0.5322, "rewards/accuracies": 0.84375, "rewards/chosen": 0.748046875, "rewards/margins": 12.28515625, "rewards/rejected": -11.578125, "step": 3751 }, { "epoch": 0.7089612168737305, "grad_norm": 2.497535467593395, "learning_rate": 3.1336535834457226e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.5859375, "logps/chosen": -616.5, "logps/rejected": -932.0, "loss": 0.6852, "rewards/accuracies": 0.6875, "rewards/chosen": 0.673828125, "rewards/margins": 4.4769287109375, "rewards/rejected": -3.796875, "step": 3752 }, { "epoch": 0.7091501724219378, "grad_norm": 1.996952303211873, "learning_rate": 3.1311294895128505e-07, "logits/chosen": 2.83203125, "logits/rejected": 3.2265625, "logps/chosen": -642.0, "logps/rejected": -840.0, "loss": 0.5935, "rewards/accuracies": 0.78125, "rewards/chosen": 1.11572265625, "rewards/margins": 5.4453125, "rewards/rejected": -4.3203125, "step": 3753 }, { "epoch": 0.709339127970145, "grad_norm": 2.2582805475987535, "learning_rate": 3.1286064261556543e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.36328125, "logps/chosen": -651.5, "logps/rejected": -1011.5, "loss": 0.5972, "rewards/accuracies": 0.65625, "rewards/chosen": 0.7076416015625, "rewards/margins": 4.6171875, "rewards/rejected": -3.91796875, "step": 3754 }, { "epoch": 0.7095280835183523, "grad_norm": 2.1618107605587253, "learning_rate": 3.126084394471791e-07, "logits/chosen": 3.30078125, "logits/rejected": 2.96875, "logps/chosen": -796.0, "logps/rejected": -1072.5, "loss": 0.4466, "rewards/accuracies": 0.875, "rewards/chosen": 1.66015625, "rewards/margins": 7.5234375, "rewards/rejected": -5.87109375, "step": 3755 }, { "epoch": 0.7097170390665596, "grad_norm": 3.415778463904962, "learning_rate": 3.1235633955584725e-07, "logits/chosen": 3.65625, "logits/rejected": 3.5859375, "logps/chosen": -618.5, "logps/rejected": -1270.0, "loss": 0.5062, "rewards/accuracies": 0.78125, "rewards/chosen": 1.169921875, "rewards/margins": 5.3984375, "rewards/rejected": -4.21875, "step": 3756 }, { "epoch": 0.7099059946147669, "grad_norm": 3.088524052062482, "learning_rate": 3.121043430512452e-07, "logits/chosen": 1.9775390625, "logits/rejected": 1.416015625, "logps/chosen": -866.5, "logps/rejected": -587.0, "loss": 0.6013, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3916015625, "rewards/margins": 4.515625, "rewards/rejected": -4.12109375, "step": 3757 }, { "epoch": 0.7100949501629742, "grad_norm": 3.453164706799058, "learning_rate": 3.118524500430044e-07, "logits/chosen": 2.802734375, "logits/rejected": 2.310546875, "logps/chosen": -608.0, "logps/rejected": -867.0, "loss": 0.472, "rewards/accuracies": 0.84375, "rewards/chosen": 0.503997802734375, "rewards/margins": 6.41796875, "rewards/rejected": -5.91015625, "step": 3758 }, { "epoch": 0.7102839057111815, "grad_norm": 1.8020647812930477, "learning_rate": 3.1160066064071057e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -736.5, "logps/rejected": -1609.5, "loss": 0.6568, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24560546875, "rewards/margins": 4.97265625, "rewards/rejected": -4.73828125, "step": 3759 }, { "epoch": 0.7104728612593887, "grad_norm": 2.075119181864686, "learning_rate": 3.113489749539045e-07, "logits/chosen": 3.1328125, "logits/rejected": 2.490234375, "logps/chosen": -891.0, "logps/rejected": -978.0, "loss": 0.3641, "rewards/accuracies": 1.0, "rewards/chosen": 1.18212890625, "rewards/margins": 6.46875, "rewards/rejected": -5.2890625, "step": 3760 }, { "epoch": 0.710661816807596, "grad_norm": 2.7107541223651044, "learning_rate": 3.1109739309208174e-07, "logits/chosen": 2.564453125, "logits/rejected": 2.50390625, "logps/chosen": -813.5, "logps/rejected": -819.5, "loss": 0.6117, "rewards/accuracies": 0.75, "rewards/chosen": 0.24609375, "rewards/margins": 4.046875, "rewards/rejected": -3.8046875, "step": 3761 }, { "epoch": 0.7108507723558033, "grad_norm": 1.9972150491266696, "learning_rate": 3.108459151646932e-07, "logits/chosen": 2.84375, "logits/rejected": 2.412109375, "logps/chosen": -544.0, "logps/rejected": -603.5, "loss": 0.618, "rewards/accuracies": 0.84375, "rewards/chosen": 0.334716796875, "rewards/margins": 4.10546875, "rewards/rejected": -3.76953125, "step": 3762 }, { "epoch": 0.7110397279040106, "grad_norm": 3.225244364670436, "learning_rate": 3.1059454128114393e-07, "logits/chosen": 2.240234375, "logits/rejected": 1.87744140625, "logps/chosen": -811.0, "logps/rejected": -750.5, "loss": 0.5039, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9091796875, "rewards/margins": 5.328125, "rewards/rejected": -4.41796875, "step": 3763 }, { "epoch": 0.7112286834522179, "grad_norm": 2.5978530765015653, "learning_rate": 3.103432715507941e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.27734375, "logps/chosen": -560.0, "logps/rejected": -746.0, "loss": 0.6754, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09765625, "rewards/margins": 2.861328125, "rewards/rejected": -2.9638671875, "step": 3764 }, { "epoch": 0.7114176390004251, "grad_norm": 2.641028717400201, "learning_rate": 3.1009210608295836e-07, "logits/chosen": 2.42578125, "logits/rejected": 2.134765625, "logps/chosen": -781.0, "logps/rejected": -1893.0, "loss": 0.5982, "rewards/accuracies": 0.8125, "rewards/chosen": 0.78076171875, "rewards/margins": 7.27734375, "rewards/rejected": -6.48828125, "step": 3765 }, { "epoch": 0.7116065945486324, "grad_norm": 1.1464178695480993, "learning_rate": 3.0984104498690604e-07, "logits/chosen": 2.0888671875, "logits/rejected": 2.0009765625, "logps/chosen": -665.5, "logps/rejected": -1416.0, "loss": 0.5394, "rewards/accuracies": 0.875, "rewards/chosen": 0.373779296875, "rewards/margins": 5.08984375, "rewards/rejected": -4.7109375, "step": 3766 }, { "epoch": 0.7117955500968397, "grad_norm": 2.432626152107229, "learning_rate": 3.095900883718615e-07, "logits/chosen": 2.3056640625, "logits/rejected": 2.384765625, "logps/chosen": -754.0, "logps/rejected": -993.0, "loss": 0.6554, "rewards/accuracies": 0.75, "rewards/chosen": 0.783447265625, "rewards/margins": 4.67578125, "rewards/rejected": -3.9111328125, "step": 3767 }, { "epoch": 0.711984505645047, "grad_norm": 1.8591194074988355, "learning_rate": 3.0933923634700274e-07, "logits/chosen": 2.3818359375, "logits/rejected": 2.19921875, "logps/chosen": -1095.0, "logps/rejected": -884.0, "loss": 0.6358, "rewards/accuracies": 0.6875, "rewards/chosen": 0.487060546875, "rewards/margins": 4.16015625, "rewards/rejected": -3.68359375, "step": 3768 }, { "epoch": 0.7121734611932543, "grad_norm": 2.5082840529156747, "learning_rate": 3.090884890214631e-07, "logits/chosen": 2.1328125, "logits/rejected": 2.123046875, "logps/chosen": -596.5, "logps/rejected": -666.5, "loss": 0.7352, "rewards/accuracies": 0.84375, "rewards/chosen": -0.43359375, "rewards/margins": 2.875, "rewards/rejected": -3.306640625, "step": 3769 }, { "epoch": 0.7123624167414616, "grad_norm": 2.9446810319548273, "learning_rate": 3.0883784650432995e-07, "logits/chosen": 2.5654296875, "logits/rejected": 2.6168212890625, "logps/chosen": -643.0, "logps/rejected": -789.5, "loss": 0.6789, "rewards/accuracies": 0.59375, "rewards/chosen": 0.38043212890625, "rewards/margins": 5.095703125, "rewards/rejected": -4.72265625, "step": 3770 }, { "epoch": 0.7125513722896688, "grad_norm": 3.2304582171837755, "learning_rate": 3.085873089046451e-07, "logits/chosen": 1.471435546875, "logits/rejected": 1.1259765625, "logps/chosen": -751.0, "logps/rejected": -718.0, "loss": 0.626, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1494140625, "rewards/margins": 4.23828125, "rewards/rejected": -4.09375, "step": 3771 }, { "epoch": 0.7127403278378761, "grad_norm": 2.26747309934376, "learning_rate": 3.0833687633140526e-07, "logits/chosen": 3.39453125, "logits/rejected": 2.345703125, "logps/chosen": -753.5, "logps/rejected": -676.5, "loss": 0.5904, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74395751953125, "rewards/margins": 5.359375, "rewards/rejected": -4.60546875, "step": 3772 }, { "epoch": 0.7129292833860834, "grad_norm": 2.141337637396326, "learning_rate": 3.080865488935601e-07, "logits/chosen": 2.05859375, "logits/rejected": 1.75439453125, "logps/chosen": -549.0, "logps/rejected": -725.0, "loss": 0.6915, "rewards/accuracies": 0.71875, "rewards/chosen": -0.643798828125, "rewards/margins": 3.662109375, "rewards/rejected": -4.30859375, "step": 3773 }, { "epoch": 0.7131182389342907, "grad_norm": 2.833288939225671, "learning_rate": 3.0783632670001513e-07, "logits/chosen": 3.3828125, "logits/rejected": 3.20703125, "logps/chosen": -823.5, "logps/rejected": -740.5, "loss": 0.7798, "rewards/accuracies": 0.5625, "rewards/chosen": -0.118408203125, "rewards/margins": 3.38818359375, "rewards/rejected": -3.5029296875, "step": 3774 }, { "epoch": 0.713307194482498, "grad_norm": 3.644397139621834, "learning_rate": 3.0758620985962915e-07, "logits/chosen": 2.857421875, "logits/rejected": 2.4296875, "logps/chosen": -792.0, "logps/rejected": -767.0, "loss": 0.4272, "rewards/accuracies": 0.96875, "rewards/chosen": 1.263671875, "rewards/margins": 5.4921875, "rewards/rejected": -4.2265625, "step": 3775 }, { "epoch": 0.7134961500307053, "grad_norm": 2.9901371035566813, "learning_rate": 3.0733619848121516e-07, "logits/chosen": 3.4921875, "logits/rejected": 3.3984375, "logps/chosen": -1232.0, "logps/rejected": -1302.0, "loss": 0.4893, "rewards/accuracies": 0.875, "rewards/chosen": 1.6015625, "rewards/margins": 8.1640625, "rewards/rejected": -6.546875, "step": 3776 }, { "epoch": 0.7136851055789125, "grad_norm": 6.4034863445659465, "learning_rate": 3.0708629267354093e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.79296875, "logps/chosen": -1680.0, "logps/rejected": -909.0, "loss": 0.608, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3487548828125, "rewards/margins": 4.65625, "rewards/rejected": -4.30078125, "step": 3777 }, { "epoch": 0.7138740611271198, "grad_norm": 2.3874641862276036, "learning_rate": 3.0683649254532716e-07, "logits/chosen": 2.89453125, "logits/rejected": 2.904296875, "logps/chosen": -957.0, "logps/rejected": -918.0, "loss": 0.6226, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0791015625, "rewards/margins": 5.3203125, "rewards/rejected": -4.248046875, "step": 3778 }, { "epoch": 0.7140630166753271, "grad_norm": 2.329571310224148, "learning_rate": 3.065867982052498e-07, "logits/chosen": 2.70703125, "logits/rejected": 2.28515625, "logps/chosen": -1296.0, "logps/rejected": -1486.0, "loss": 0.5584, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5654296875, "rewards/margins": 5.140625, "rewards/rejected": -4.57421875, "step": 3779 }, { "epoch": 0.7142519722235344, "grad_norm": 5.356498173197204, "learning_rate": 3.063372097619381e-07, "logits/chosen": 3.19921875, "logits/rejected": 2.248046875, "logps/chosen": -638.5, "logps/rejected": -573.5, "loss": 0.5017, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4111328125, "rewards/margins": 5.0390625, "rewards/rejected": -4.62109375, "step": 3780 }, { "epoch": 0.7144409277717417, "grad_norm": 1.821725171498265, "learning_rate": 3.060877273239752e-07, "logits/chosen": 2.283203125, "logits/rejected": 1.4130859375, "logps/chosen": -643.0, "logps/rejected": -632.5, "loss": 0.4566, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1044921875, "rewards/margins": 5.734375, "rewards/rejected": -4.625, "step": 3781 }, { "epoch": 0.714629883319949, "grad_norm": 3.0363662599420227, "learning_rate": 3.0583835099989835e-07, "logits/chosen": 3.025390625, "logits/rejected": 2.787109375, "logps/chosen": -640.5, "logps/rejected": -831.0, "loss": 0.4986, "rewards/accuracies": 0.9375, "rewards/chosen": 0.76123046875, "rewards/margins": 5.0859375, "rewards/rejected": -4.33203125, "step": 3782 }, { "epoch": 0.7148188388681562, "grad_norm": 3.056516537424236, "learning_rate": 3.0558908089819847e-07, "logits/chosen": 1.9052734375, "logits/rejected": 1.62939453125, "logps/chosen": -1177.5, "logps/rejected": -1718.5, "loss": 0.5523, "rewards/accuracies": 0.875, "rewards/chosen": -0.767822265625, "rewards/margins": 6.66796875, "rewards/rejected": -7.41796875, "step": 3783 }, { "epoch": 0.7150077944163635, "grad_norm": 3.0642699505101296, "learning_rate": 3.0533991712732067e-07, "logits/chosen": 3.40625, "logits/rejected": 2.99609375, "logps/chosen": -831.0, "logps/rejected": -871.0, "loss": 0.7344, "rewards/accuracies": 0.8125, "rewards/chosen": 0.27978515625, "rewards/margins": 3.51171875, "rewards/rejected": -3.220703125, "step": 3784 }, { "epoch": 0.7151967499645708, "grad_norm": 2.430867257298885, "learning_rate": 3.0509085979566324e-07, "logits/chosen": 2.9921875, "logits/rejected": 3.6953125, "logps/chosen": -592.0, "logps/rejected": -2578.0, "loss": 0.6639, "rewards/accuracies": 0.8125, "rewards/chosen": 0.35986328125, "rewards/margins": 13.765625, "rewards/rejected": -13.3984375, "step": 3785 }, { "epoch": 0.7153857055127781, "grad_norm": 2.819510351855539, "learning_rate": 3.0484190901157853e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.99609375, "logps/chosen": -825.5, "logps/rejected": -1998.0, "loss": 0.5655, "rewards/accuracies": 0.78125, "rewards/chosen": 0.63720703125, "rewards/margins": 6.1328125, "rewards/rejected": -5.5, "step": 3786 }, { "epoch": 0.7155746610609854, "grad_norm": 1.8091993556065575, "learning_rate": 3.045930648833724e-07, "logits/chosen": 3.173828125, "logits/rejected": 2.4921875, "logps/chosen": -945.5, "logps/rejected": -1256.5, "loss": 0.468, "rewards/accuracies": 0.84375, "rewards/chosen": 0.943359375, "rewards/margins": 6.42578125, "rewards/rejected": -5.4765625, "step": 3787 }, { "epoch": 0.7157636166091926, "grad_norm": 2.3939659310426147, "learning_rate": 3.0434432751930426e-07, "logits/chosen": 2.1131591796875, "logits/rejected": 1.8974609375, "logps/chosen": -648.0, "logps/rejected": -934.0, "loss": 0.5495, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4970703125, "rewards/margins": 5.173828125, "rewards/rejected": -4.671875, "step": 3788 }, { "epoch": 0.7159525721573999, "grad_norm": 3.034365791813145, "learning_rate": 3.0409569702758716e-07, "logits/chosen": 4.23046875, "logits/rejected": 4.0546875, "logps/chosen": -741.0, "logps/rejected": -921.0, "loss": 0.7134, "rewards/accuracies": 0.75, "rewards/chosen": 0.666015625, "rewards/margins": 3.701171875, "rewards/rejected": -3.037109375, "step": 3789 }, { "epoch": 0.7161415277056072, "grad_norm": 1.7727901040819887, "learning_rate": 3.038471735163878e-07, "logits/chosen": 2.734375, "logits/rejected": 2.099609375, "logps/chosen": -679.0, "logps/rejected": -738.0, "loss": 0.4745, "rewards/accuracies": 0.84375, "rewards/chosen": 0.74951171875, "rewards/margins": 5.671875, "rewards/rejected": -4.91796875, "step": 3790 }, { "epoch": 0.7163304832538145, "grad_norm": 2.104455624266269, "learning_rate": 3.035987570938261e-07, "logits/chosen": 3.80078125, "logits/rejected": 3.53125, "logps/chosen": -733.0, "logps/rejected": -1165.0, "loss": 0.6408, "rewards/accuracies": 0.84375, "rewards/chosen": 0.82275390625, "rewards/margins": 5.265625, "rewards/rejected": -4.44140625, "step": 3791 }, { "epoch": 0.7165194388020218, "grad_norm": 5.6907582028985715, "learning_rate": 3.0335044786797537e-07, "logits/chosen": 2.7578125, "logits/rejected": 2.13671875, "logps/chosen": -927.0, "logps/rejected": -903.0, "loss": 0.5783, "rewards/accuracies": 0.6875, "rewards/chosen": 0.48974609375, "rewards/margins": 4.173828125, "rewards/rejected": -3.68359375, "step": 3792 }, { "epoch": 0.7167083943502291, "grad_norm": 3.0597753962139693, "learning_rate": 3.031022459468626e-07, "logits/chosen": 2.7734375, "logits/rejected": 2.955078125, "logps/chosen": -829.5, "logps/rejected": -1167.0, "loss": 0.4652, "rewards/accuracies": 0.875, "rewards/chosen": 1.34130859375, "rewards/margins": 5.859375, "rewards/rejected": -4.515625, "step": 3793 }, { "epoch": 0.7168973498984363, "grad_norm": 2.0877788092403633, "learning_rate": 3.0285415143846746e-07, "logits/chosen": 3.2109375, "logits/rejected": 2.55078125, "logps/chosen": -925.0, "logps/rejected": -695.5, "loss": 0.5287, "rewards/accuracies": 0.875, "rewards/chosen": -1.07568359375, "rewards/margins": 2.9765625, "rewards/rejected": -4.0546875, "step": 3794 }, { "epoch": 0.7170863054466436, "grad_norm": 2.605510635428749, "learning_rate": 3.0260616445072406e-07, "logits/chosen": 3.0546875, "logits/rejected": 2.548828125, "logps/chosen": -795.5, "logps/rejected": -699.5, "loss": 0.6086, "rewards/accuracies": 0.875, "rewards/chosen": 0.4228515625, "rewards/margins": 3.92578125, "rewards/rejected": -3.5078125, "step": 3795 }, { "epoch": 0.717275260994851, "grad_norm": 2.4689833262775336, "learning_rate": 3.023582850915182e-07, "logits/chosen": 2.673828125, "logits/rejected": 2.94921875, "logps/chosen": -955.0, "logps/rejected": -806.0, "loss": 0.5222, "rewards/accuracies": 0.84375, "rewards/chosen": 1.048828125, "rewards/margins": 5.21875, "rewards/rejected": -4.17578125, "step": 3796 }, { "epoch": 0.7174642165430583, "grad_norm": 2.564171018574239, "learning_rate": 3.021105134686899e-07, "logits/chosen": 1.9853515625, "logits/rejected": 2.1217041015625, "logps/chosen": -699.5, "logps/rejected": -1038.0, "loss": 0.5465, "rewards/accuracies": 0.84375, "rewards/chosen": 0.89599609375, "rewards/margins": 5.8046875, "rewards/rejected": -4.91796875, "step": 3797 }, { "epoch": 0.7176531720912656, "grad_norm": 2.2870623622411688, "learning_rate": 3.0186284969003263e-07, "logits/chosen": 1.83056640625, "logits/rejected": 2.169921875, "logps/chosen": -834.0, "logps/rejected": -1168.5, "loss": 0.5292, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6611328125, "rewards/margins": 6.0, "rewards/rejected": -5.32421875, "step": 3798 }, { "epoch": 0.7178421276394729, "grad_norm": 2.5511425810862924, "learning_rate": 3.016152938632914e-07, "logits/chosen": 2.6953125, "logits/rejected": 1.82373046875, "logps/chosen": -803.0, "logps/rejected": -660.0, "loss": 0.471, "rewards/accuracies": 0.90625, "rewards/chosen": 1.01220703125, "rewards/margins": 5.296875, "rewards/rejected": -4.296875, "step": 3799 }, { "epoch": 0.71803108318768, "grad_norm": 1.8838863412699145, "learning_rate": 3.0136784609616595e-07, "logits/chosen": 3.068359375, "logits/rejected": 2.8232421875, "logps/chosen": -859.0, "logps/rejected": -979.0, "loss": 0.5267, "rewards/accuracies": 0.8125, "rewards/chosen": 1.118896484375, "rewards/margins": 6.921875, "rewards/rejected": -5.8046875, "step": 3800 }, { "epoch": 0.7182200387358874, "grad_norm": 3.085938324819154, "learning_rate": 3.011205064963079e-07, "logits/chosen": 1.650390625, "logits/rejected": 1.5712890625, "logps/chosen": -638.0, "logps/rejected": -603.0, "loss": 0.5847, "rewards/accuracies": 0.84375, "rewards/chosen": -0.053955078125, "rewards/margins": 4.02734375, "rewards/rejected": -4.078125, "step": 3801 }, { "epoch": 0.7184089942840947, "grad_norm": 2.7464203360586974, "learning_rate": 3.008732751713225e-07, "logits/chosen": 2.640625, "logits/rejected": 2.578125, "logps/chosen": -902.5, "logps/rejected": -866.0, "loss": 0.5152, "rewards/accuracies": 0.875, "rewards/chosen": 0.8466796875, "rewards/margins": 5.3203125, "rewards/rejected": -4.46484375, "step": 3802 }, { "epoch": 0.718597949832302, "grad_norm": 4.585811708725518, "learning_rate": 3.006261522287673e-07, "logits/chosen": 2.4453125, "logits/rejected": 2.22265625, "logps/chosen": -937.5, "logps/rejected": -881.0, "loss": 0.6728, "rewards/accuracies": 0.71875, "rewards/chosen": 0.80029296875, "rewards/margins": 4.322265625, "rewards/rejected": -3.515625, "step": 3803 }, { "epoch": 0.7187869053805093, "grad_norm": 2.6011461610744697, "learning_rate": 3.00379137776153e-07, "logits/chosen": 2.26220703125, "logits/rejected": 2.154296875, "logps/chosen": -700.0, "logps/rejected": -845.5, "loss": 0.7112, "rewards/accuracies": 0.75, "rewards/chosen": 0.6170654296875, "rewards/margins": 4.1357421875, "rewards/rejected": -3.525390625, "step": 3804 }, { "epoch": 0.7189758609287166, "grad_norm": 2.3977021107020304, "learning_rate": 3.0013223192094326e-07, "logits/chosen": 3.376953125, "logits/rejected": 2.8515625, "logps/chosen": -683.0, "logps/rejected": -767.0, "loss": 0.6743, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1400146484375, "rewards/margins": 3.5546875, "rewards/rejected": -3.421875, "step": 3805 }, { "epoch": 0.7191648164769238, "grad_norm": 1.8049642761965035, "learning_rate": 2.998854347705543e-07, "logits/chosen": 2.318359375, "logits/rejected": 1.998046875, "logps/chosen": -927.0, "logps/rejected": -998.0, "loss": 0.545, "rewards/accuracies": 0.875, "rewards/chosen": 0.5888671875, "rewards/margins": 5.26171875, "rewards/rejected": -4.66796875, "step": 3806 }, { "epoch": 0.7193537720251311, "grad_norm": 1.9950378451774762, "learning_rate": 2.9963874643235506e-07, "logits/chosen": 2.728515625, "logits/rejected": 2.263671875, "logps/chosen": -987.0, "logps/rejected": -752.0, "loss": 0.4649, "rewards/accuracies": 0.84375, "rewards/chosen": 1.02734375, "rewards/margins": 4.9609375, "rewards/rejected": -3.92578125, "step": 3807 }, { "epoch": 0.7195427275733384, "grad_norm": 2.0066782497368814, "learning_rate": 2.9939216701366704e-07, "logits/chosen": 3.3046875, "logits/rejected": 2.4345703125, "logps/chosen": -773.0, "logps/rejected": -895.0, "loss": 0.5788, "rewards/accuracies": 0.84375, "rewards/chosen": 1.729248046875, "rewards/margins": 5.2890625, "rewards/rejected": -3.560546875, "step": 3808 }, { "epoch": 0.7197316831215457, "grad_norm": 3.6208827411681024, "learning_rate": 2.9914569662176457e-07, "logits/chosen": 3.3671875, "logits/rejected": 3.265625, "logps/chosen": -621.0, "logps/rejected": -628.0, "loss": 0.6248, "rewards/accuracies": 0.84375, "rewards/chosen": 0.761962890625, "rewards/margins": 4.00390625, "rewards/rejected": -3.24609375, "step": 3809 }, { "epoch": 0.719920638669753, "grad_norm": 2.8070797305016084, "learning_rate": 2.988993353638742e-07, "logits/chosen": 3.412109375, "logits/rejected": 3.51171875, "logps/chosen": -635.0, "logps/rejected": -690.0, "loss": 0.6854, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25146484375, "rewards/margins": 3.15869140625, "rewards/rejected": -2.90625, "step": 3810 }, { "epoch": 0.7201095942179602, "grad_norm": 2.7987747997714436, "learning_rate": 2.986530833471757e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.4375, "logps/chosen": -839.0, "logps/rejected": -831.0, "loss": 0.5536, "rewards/accuracies": 0.875, "rewards/chosen": 1.579833984375, "rewards/margins": 4.6953125, "rewards/rejected": -3.119140625, "step": 3811 }, { "epoch": 0.7202985497661675, "grad_norm": 2.417596436950549, "learning_rate": 2.9840694067880056e-07, "logits/chosen": 2.515625, "logits/rejected": 2.892578125, "logps/chosen": -889.0, "logps/rejected": -958.0, "loss": 0.5812, "rewards/accuracies": 0.71875, "rewards/chosen": 1.751953125, "rewards/margins": 4.66796875, "rewards/rejected": -2.9140625, "step": 3812 }, { "epoch": 0.7204875053143748, "grad_norm": 3.4896181742636245, "learning_rate": 2.9816090746583305e-07, "logits/chosen": 2.640625, "logits/rejected": 2.7890625, "logps/chosen": -849.5, "logps/rejected": -930.5, "loss": 0.4579, "rewards/accuracies": 0.90625, "rewards/chosen": 1.11181640625, "rewards/margins": 5.140625, "rewards/rejected": -4.0234375, "step": 3813 }, { "epoch": 0.7206764608625821, "grad_norm": 2.122508704205951, "learning_rate": 2.979149838153098e-07, "logits/chosen": 3.91015625, "logits/rejected": 4.15234375, "logps/chosen": -844.5, "logps/rejected": -1717.0, "loss": 0.6127, "rewards/accuracies": 0.8125, "rewards/chosen": 0.935546875, "rewards/margins": 7.57421875, "rewards/rejected": -6.625, "step": 3814 }, { "epoch": 0.7208654164107894, "grad_norm": 2.011286758851699, "learning_rate": 2.976691698342195e-07, "logits/chosen": 3.2734375, "logits/rejected": 3.390625, "logps/chosen": -665.75, "logps/rejected": -844.0, "loss": 0.6675, "rewards/accuracies": 0.875, "rewards/chosen": 0.76318359375, "rewards/margins": 4.419921875, "rewards/rejected": -3.662109375, "step": 3815 }, { "epoch": 0.7210543719589967, "grad_norm": 2.5695694378434326, "learning_rate": 2.97423465629504e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.845703125, "logps/chosen": -919.0, "logps/rejected": -718.0, "loss": 0.7324, "rewards/accuracies": 0.6875, "rewards/chosen": 0.850830078125, "rewards/margins": 3.18359375, "rewards/rejected": -2.337890625, "step": 3816 }, { "epoch": 0.7212433275072039, "grad_norm": 2.4491683638626878, "learning_rate": 2.97177871308056e-07, "logits/chosen": 1.990234375, "logits/rejected": 2.150390625, "logps/chosen": -813.0, "logps/rejected": -1176.0, "loss": 0.5962, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5802001953125, "rewards/margins": 5.4453125, "rewards/rejected": -4.859375, "step": 3817 }, { "epoch": 0.7214322830554112, "grad_norm": 2.055034622164483, "learning_rate": 2.969323869767216e-07, "logits/chosen": 2.65234375, "logits/rejected": 2.564697265625, "logps/chosen": -701.5, "logps/rejected": -822.0, "loss": 0.5515, "rewards/accuracies": 0.84375, "rewards/chosen": 0.555908203125, "rewards/margins": 5.20703125, "rewards/rejected": -4.6484375, "step": 3818 }, { "epoch": 0.7216212386036185, "grad_norm": 1.6517566082341761, "learning_rate": 2.966870127422986e-07, "logits/chosen": 2.044921875, "logits/rejected": 2.041015625, "logps/chosen": -919.0, "logps/rejected": -1360.0, "loss": 0.5651, "rewards/accuracies": 0.8125, "rewards/chosen": 1.25927734375, "rewards/margins": 5.859375, "rewards/rejected": -4.603515625, "step": 3819 }, { "epoch": 0.7218101941518258, "grad_norm": 2.7852636733337213, "learning_rate": 2.964417487115367e-07, "logits/chosen": 2.884765625, "logits/rejected": 2.494140625, "logps/chosen": -755.0, "logps/rejected": -882.0, "loss": 0.5828, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1961669921875, "rewards/margins": 5.34375, "rewards/rejected": -5.14453125, "step": 3820 }, { "epoch": 0.7219991497000331, "grad_norm": 2.735741972128254, "learning_rate": 2.961965949911383e-07, "logits/chosen": 2.17578125, "logits/rejected": 1.9296875, "logps/chosen": -1023.0, "logps/rejected": -1800.0, "loss": 0.5545, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7001953125, "rewards/margins": 5.7578125, "rewards/rejected": -5.0546875, "step": 3821 }, { "epoch": 0.7221881052482404, "grad_norm": 2.599085869554891, "learning_rate": 2.9595155168775673e-07, "logits/chosen": 1.76904296875, "logits/rejected": 1.912109375, "logps/chosen": -14127.0, "logps/rejected": -685.0, "loss": 0.5011, "rewards/accuracies": 0.875, "rewards/chosen": 160.8671875, "rewards/margins": 165.6484375, "rewards/rejected": -5.22265625, "step": 3822 }, { "epoch": 0.7223770607964476, "grad_norm": 2.4811685214392423, "learning_rate": 2.9570661890799847e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.177734375, "logps/chosen": -717.0, "logps/rejected": -653.5, "loss": 0.5244, "rewards/accuracies": 0.875, "rewards/chosen": 0.083984375, "rewards/margins": 5.2890625, "rewards/rejected": -5.203125, "step": 3823 }, { "epoch": 0.7225660163446549, "grad_norm": 2.1862297748746227, "learning_rate": 2.954617967584212e-07, "logits/chosen": 3.359375, "logits/rejected": 3.0625, "logps/chosen": -947.0, "logps/rejected": -1793.0, "loss": 0.5252, "rewards/accuracies": 0.875, "rewards/chosen": 0.7904052734375, "rewards/margins": 9.0, "rewards/rejected": -8.2109375, "step": 3824 }, { "epoch": 0.7227549718928622, "grad_norm": 1.6694582197252723, "learning_rate": 2.9521708534553447e-07, "logits/chosen": 3.037109375, "logits/rejected": 2.41796875, "logps/chosen": -868.0, "logps/rejected": -706.0, "loss": 0.4933, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1943359375, "rewards/margins": 5.609375, "rewards/rejected": -4.4140625, "step": 3825 }, { "epoch": 0.7229439274410695, "grad_norm": 1.9139091987476986, "learning_rate": 2.9497248477580035e-07, "logits/chosen": 2.1708984375, "logits/rejected": 2.10791015625, "logps/chosen": -735.0, "logps/rejected": -1181.0, "loss": 0.527, "rewards/accuracies": 0.875, "rewards/chosen": 0.581787109375, "rewards/margins": 7.15625, "rewards/rejected": -6.57421875, "step": 3826 }, { "epoch": 0.7231328829892768, "grad_norm": 2.2661210128762925, "learning_rate": 2.947279951556314e-07, "logits/chosen": 3.046875, "logits/rejected": 2.7109375, "logps/chosen": -854.5, "logps/rejected": -746.0, "loss": 0.4614, "rewards/accuracies": 0.9375, "rewards/chosen": 1.152099609375, "rewards/margins": 6.5703125, "rewards/rejected": -5.42578125, "step": 3827 }, { "epoch": 0.7233218385374841, "grad_norm": 1.5422343058264094, "learning_rate": 2.9448361659139334e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.9921875, "logps/chosen": -932.0, "logps/rejected": -818.0, "loss": 0.5086, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6748046875, "rewards/margins": 5.65625, "rewards/rejected": -3.98046875, "step": 3828 }, { "epoch": 0.7235107940856913, "grad_norm": 2.0313030024008603, "learning_rate": 2.9423934918940264e-07, "logits/chosen": 2.80859375, "logits/rejected": 2.986328125, "logps/chosen": -796.5, "logps/rejected": -891.0, "loss": 0.6565, "rewards/accuracies": 0.75, "rewards/chosen": 0.5791015625, "rewards/margins": 5.779296875, "rewards/rejected": -5.20703125, "step": 3829 }, { "epoch": 0.7236997496338986, "grad_norm": 3.223455910281482, "learning_rate": 2.939951930559278e-07, "logits/chosen": 3.16796875, "logits/rejected": 3.03125, "logps/chosen": -885.0, "logps/rejected": -746.5, "loss": 0.6446, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3251953125, "rewards/margins": 3.984375, "rewards/rejected": -3.66015625, "step": 3830 }, { "epoch": 0.7238887051821059, "grad_norm": 3.9524922759694188, "learning_rate": 2.9375114829718863e-07, "logits/chosen": 2.572265625, "logits/rejected": 2.1767578125, "logps/chosen": -982.0, "logps/rejected": -732.0, "loss": 0.5753, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4501953125, "rewards/margins": 4.921875, "rewards/rejected": -4.48046875, "step": 3831 }, { "epoch": 0.7240776607303132, "grad_norm": 4.205668233677781, "learning_rate": 2.9350721501935655e-07, "logits/chosen": 3.28515625, "logits/rejected": 2.482421875, "logps/chosen": -507.5, "logps/rejected": -587.5, "loss": 0.6197, "rewards/accuracies": 0.8125, "rewards/chosen": 0.49609375, "rewards/margins": 4.828125, "rewards/rejected": -4.33984375, "step": 3832 }, { "epoch": 0.7242666162785205, "grad_norm": 1.6388529405671908, "learning_rate": 2.9326339332855497e-07, "logits/chosen": 2.314453125, "logits/rejected": 1.760009765625, "logps/chosen": -549.5, "logps/rejected": -559.5, "loss": 0.6102, "rewards/accuracies": 0.78125, "rewards/chosen": 0.86865234375, "rewards/margins": 4.32421875, "rewards/rejected": -3.45703125, "step": 3833 }, { "epoch": 0.7244555718267278, "grad_norm": 1.8649816296027004, "learning_rate": 2.9301968333085824e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.447265625, "logps/chosen": -1163.0, "logps/rejected": -1025.5, "loss": 0.5013, "rewards/accuracies": 0.875, "rewards/chosen": 1.3701171875, "rewards/margins": 6.09765625, "rewards/rejected": -4.734375, "step": 3834 }, { "epoch": 0.724644527374935, "grad_norm": 1.8781331086367994, "learning_rate": 2.9277608513229214e-07, "logits/chosen": 3.171875, "logits/rejected": 3.109375, "logps/chosen": -915.0, "logps/rejected": -1080.0, "loss": 0.6441, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3046875, "rewards/margins": 4.92578125, "rewards/rejected": -3.6171875, "step": 3835 }, { "epoch": 0.7248334829231423, "grad_norm": 2.3033892377722496, "learning_rate": 2.9253259883883376e-07, "logits/chosen": 3.40234375, "logits/rejected": 3.44921875, "logps/chosen": -1226.0, "logps/rejected": -1110.0, "loss": 0.5272, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5, "rewards/margins": 4.8154296875, "rewards/rejected": -4.302734375, "step": 3836 }, { "epoch": 0.7250224384713496, "grad_norm": 3.4413577532431727, "learning_rate": 2.922892245564122e-07, "logits/chosen": 2.865234375, "logits/rejected": 2.802734375, "logps/chosen": -473.0, "logps/rejected": -792.0, "loss": 0.6183, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27880859375, "rewards/margins": 5.72265625, "rewards/rejected": -5.4375, "step": 3837 }, { "epoch": 0.7252113940195569, "grad_norm": 2.323107265532857, "learning_rate": 2.920459623909066e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.623046875, "logps/chosen": -533.0, "logps/rejected": -670.5, "loss": 0.6182, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8330078125, "rewards/margins": 4.60546875, "rewards/rejected": -3.77734375, "step": 3838 }, { "epoch": 0.7254003495677642, "grad_norm": 2.1603752618220455, "learning_rate": 2.9180281244814866e-07, "logits/chosen": 3.046875, "logits/rejected": 2.23828125, "logps/chosen": -869.0, "logps/rejected": -728.0, "loss": 0.5619, "rewards/accuracies": 0.75, "rewards/chosen": 0.955078125, "rewards/margins": 4.72265625, "rewards/rejected": -3.76171875, "step": 3839 }, { "epoch": 0.7255893051159714, "grad_norm": 1.6751146270558464, "learning_rate": 2.9155977483392027e-07, "logits/chosen": 3.09375, "logits/rejected": 3.404296875, "logps/chosen": -547.0, "logps/rejected": -920.5, "loss": 0.6812, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34375, "rewards/margins": 4.2421875, "rewards/rejected": -3.900390625, "step": 3840 }, { "epoch": 0.7257782606641787, "grad_norm": 3.3117714864407595, "learning_rate": 2.9131684965395464e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.94140625, "logps/chosen": -1205.0, "logps/rejected": -1298.0, "loss": 0.4798, "rewards/accuracies": 0.84375, "rewards/chosen": 1.724609375, "rewards/margins": 7.0, "rewards/rejected": -5.27734375, "step": 3841 }, { "epoch": 0.725967216212386, "grad_norm": 2.130766221933048, "learning_rate": 2.9107403701393677e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.50390625, "logps/chosen": -675.0, "logps/rejected": -686.0, "loss": 0.5869, "rewards/accuracies": 0.84375, "rewards/chosen": 0.734375, "rewards/margins": 4.2998046875, "rewards/rejected": -3.56591796875, "step": 3842 }, { "epoch": 0.7261561717605933, "grad_norm": 2.9433909760763957, "learning_rate": 2.908313370195015e-07, "logits/chosen": 3.11328125, "logits/rejected": 2.83203125, "logps/chosen": -990.0, "logps/rejected": -1071.0, "loss": 0.548, "rewards/accuracies": 0.875, "rewards/chosen": 0.9814453125, "rewards/margins": 5.66015625, "rewards/rejected": -4.685546875, "step": 3843 }, { "epoch": 0.7263451273088006, "grad_norm": 1.7500469539813397, "learning_rate": 2.905887497762359e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.66796875, "logps/chosen": -1000.0, "logps/rejected": -902.5, "loss": 0.441, "rewards/accuracies": 0.90625, "rewards/chosen": 2.2734375, "rewards/margins": 5.8125, "rewards/rejected": -3.53515625, "step": 3844 }, { "epoch": 0.7265340828570079, "grad_norm": 2.1919571681035843, "learning_rate": 2.90346275389677e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.03125, "logps/chosen": -561.0, "logps/rejected": -986.0, "loss": 0.6563, "rewards/accuracies": 0.65625, "rewards/chosen": 1.1787109375, "rewards/margins": 4.025390625, "rewards/rejected": -2.841796875, "step": 3845 }, { "epoch": 0.7267230384052151, "grad_norm": 1.8198067182746247, "learning_rate": 2.901039139653132e-07, "logits/chosen": 3.314453125, "logits/rejected": 3.4296875, "logps/chosen": -454.5, "logps/rejected": -576.5, "loss": 0.6815, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8271484375, "rewards/margins": 3.6630859375, "rewards/rejected": -2.83984375, "step": 3846 }, { "epoch": 0.7269119939534224, "grad_norm": 2.1879763740139024, "learning_rate": 2.8986166560858405e-07, "logits/chosen": 2.7734375, "logits/rejected": 2.51953125, "logps/chosen": -1433.0, "logps/rejected": -1013.0, "loss": 0.4943, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6416015625, "rewards/margins": 4.9609375, "rewards/rejected": -3.3203125, "step": 3847 }, { "epoch": 0.7271009495016297, "grad_norm": 2.939495255522608, "learning_rate": 2.8961953042487883e-07, "logits/chosen": 2.8916015625, "logits/rejected": 2.88427734375, "logps/chosen": -588.0, "logps/rejected": -508.5, "loss": 0.6643, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9560546875, "rewards/margins": 2.98046875, "rewards/rejected": -2.0244140625, "step": 3848 }, { "epoch": 0.727289905049837, "grad_norm": 3.4374742544736057, "learning_rate": 2.8937750851953907e-07, "logits/chosen": 3.3203125, "logits/rejected": 2.666015625, "logps/chosen": -1169.0, "logps/rejected": -1975.0, "loss": 0.3647, "rewards/accuracies": 0.9375, "rewards/chosen": 2.0419921875, "rewards/margins": 6.68359375, "rewards/rejected": -4.640625, "step": 3849 }, { "epoch": 0.7274788605980443, "grad_norm": 2.628557152725895, "learning_rate": 2.891355999978559e-07, "logits/chosen": 3.140625, "logits/rejected": 3.03515625, "logps/chosen": -863.0, "logps/rejected": -2041.0, "loss": 0.5618, "rewards/accuracies": 0.875, "rewards/chosen": 1.24420166015625, "rewards/margins": 7.890625, "rewards/rejected": -6.66015625, "step": 3850 }, { "epoch": 0.7276678161462516, "grad_norm": 1.9133297500211959, "learning_rate": 2.888938049650714e-07, "logits/chosen": 3.03125, "logits/rejected": 2.662109375, "logps/chosen": -984.0, "logps/rejected": -864.0, "loss": 0.4723, "rewards/accuracies": 0.875, "rewards/chosen": 1.857421875, "rewards/margins": 5.953125, "rewards/rejected": -4.10546875, "step": 3851 }, { "epoch": 0.7278567716944588, "grad_norm": 2.3419167904675517, "learning_rate": 2.886521235263785e-07, "logits/chosen": 3.16796875, "logits/rejected": 3.2265625, "logps/chosen": -521.0, "logps/rejected": -637.0, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.485595703125, "rewards/margins": 3.640625, "rewards/rejected": -3.15625, "step": 3852 }, { "epoch": 0.7280457272426661, "grad_norm": 3.8231561171359743, "learning_rate": 2.884105557869204e-07, "logits/chosen": 4.0703125, "logits/rejected": 3.31640625, "logps/chosen": -829.0, "logps/rejected": -811.5, "loss": 0.713, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5263671875, "rewards/margins": 3.595703125, "rewards/rejected": -3.0703125, "step": 3853 }, { "epoch": 0.7282346827908734, "grad_norm": 1.932885336869457, "learning_rate": 2.8816910185179114e-07, "logits/chosen": 3.39453125, "logits/rejected": 3.24609375, "logps/chosen": -750.0, "logps/rejected": -712.5, "loss": 0.6457, "rewards/accuracies": 0.75, "rewards/chosen": 1.5, "rewards/margins": 4.12109375, "rewards/rejected": -2.6220703125, "step": 3854 }, { "epoch": 0.7284236383390807, "grad_norm": 1.5049396745380894, "learning_rate": 2.879277618260352e-07, "logits/chosen": 3.79296875, "logits/rejected": 3.16015625, "logps/chosen": -688.5, "logps/rejected": -799.0, "loss": 0.5657, "rewards/accuracies": 0.84375, "rewards/chosen": 1.099609375, "rewards/margins": 4.75390625, "rewards/rejected": -3.65625, "step": 3855 }, { "epoch": 0.728612593887288, "grad_norm": 2.0593637009294663, "learning_rate": 2.8768653581464726e-07, "logits/chosen": 2.244140625, "logits/rejected": 1.970703125, "logps/chosen": -822.0, "logps/rejected": -854.0, "loss": 0.6017, "rewards/accuracies": 0.875, "rewards/chosen": 1.927734375, "rewards/margins": 4.8203125, "rewards/rejected": -2.88671875, "step": 3856 }, { "epoch": 0.7288015494354954, "grad_norm": 1.955063442222115, "learning_rate": 2.8744542392257253e-07, "logits/chosen": 3.32421875, "logits/rejected": 2.9765625, "logps/chosen": -814.0, "logps/rejected": -923.0, "loss": 0.4486, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5087890625, "rewards/margins": 5.671875, "rewards/rejected": -4.15625, "step": 3857 }, { "epoch": 0.7289905049837025, "grad_norm": 2.288911700272575, "learning_rate": 2.8720442625470666e-07, "logits/chosen": 3.17578125, "logits/rejected": 2.75, "logps/chosen": -371.5, "logps/rejected": -14113.5, "loss": 0.7613, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6474609375, "rewards/margins": -157.4873046875, "rewards/rejected": 157.990234375, "step": 3858 }, { "epoch": 0.7291794605319099, "grad_norm": 1.949596332748488, "learning_rate": 2.8696354291589536e-07, "logits/chosen": 3.72265625, "logits/rejected": 2.962890625, "logps/chosen": -565.0, "logps/rejected": -1162.0, "loss": 0.6156, "rewards/accuracies": 0.78125, "rewards/chosen": 1.18359375, "rewards/margins": 4.9345703125, "rewards/rejected": -3.75439453125, "step": 3859 }, { "epoch": 0.7293684160801172, "grad_norm": 2.4411357731659282, "learning_rate": 2.8672277401093516e-07, "logits/chosen": 3.6875, "logits/rejected": 3.51953125, "logps/chosen": -453.5, "logps/rejected": -1265.0, "loss": 0.6471, "rewards/accuracies": 0.6875, "rewards/chosen": 1.263671875, "rewards/margins": 4.7265625, "rewards/rejected": -3.45361328125, "step": 3860 }, { "epoch": 0.7295573716283245, "grad_norm": 2.0622151350495064, "learning_rate": 2.8648211964457213e-07, "logits/chosen": 3.71875, "logits/rejected": 3.828125, "logps/chosen": -593.5, "logps/rejected": -939.0, "loss": 0.7596, "rewards/accuracies": 0.6875, "rewards/chosen": 0.54833984375, "rewards/margins": 3.271484375, "rewards/rejected": -2.7236328125, "step": 3861 }, { "epoch": 0.7297463271765318, "grad_norm": 2.491385150915123, "learning_rate": 2.862415799215029e-07, "logits/chosen": 3.146484375, "logits/rejected": 2.791015625, "logps/chosen": -722.5, "logps/rejected": -760.0, "loss": 0.5873, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0244140625, "rewards/margins": 4.94921875, "rewards/rejected": -3.91796875, "step": 3862 }, { "epoch": 0.729935282724739, "grad_norm": 1.7554845032907838, "learning_rate": 2.8600115494637413e-07, "logits/chosen": 2.63671875, "logits/rejected": 1.906494140625, "logps/chosen": -1242.0, "logps/rejected": -1247.0, "loss": 0.5002, "rewards/accuracies": 0.75, "rewards/chosen": 1.7880859375, "rewards/margins": 5.75390625, "rewards/rejected": -3.97265625, "step": 3863 }, { "epoch": 0.7301242382729463, "grad_norm": 2.2029885564724707, "learning_rate": 2.857608448237824e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.92578125, "logps/chosen": -834.0, "logps/rejected": -753.5, "loss": 0.62, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10693359375, "rewards/margins": 3.32421875, "rewards/rejected": -3.42578125, "step": 3864 }, { "epoch": 0.7303131938211536, "grad_norm": 1.5596372993817056, "learning_rate": 2.8552064965827483e-07, "logits/chosen": 2.98046875, "logits/rejected": 2.71484375, "logps/chosen": -1084.0, "logps/rejected": -895.5, "loss": 0.5498, "rewards/accuracies": 0.78125, "rewards/chosen": 1.537109375, "rewards/margins": 5.0546875, "rewards/rejected": -3.5234375, "step": 3865 }, { "epoch": 0.7305021493693609, "grad_norm": 2.7084080607375958, "learning_rate": 2.852805695543478e-07, "logits/chosen": 4.39453125, "logits/rejected": 4.5234375, "logps/chosen": -704.0, "logps/rejected": -773.0, "loss": 0.6321, "rewards/accuracies": 0.71875, "rewards/chosen": 1.494140625, "rewards/margins": 4.009765625, "rewards/rejected": -2.517822265625, "step": 3866 }, { "epoch": 0.7306911049175682, "grad_norm": 2.9620896695431775, "learning_rate": 2.8504060461644817e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.95703125, "logps/chosen": -787.0, "logps/rejected": -867.0, "loss": 0.6553, "rewards/accuracies": 0.78125, "rewards/chosen": 0.40771484375, "rewards/margins": 3.392578125, "rewards/rejected": -2.9765625, "step": 3867 }, { "epoch": 0.7308800604657755, "grad_norm": 2.559617901580296, "learning_rate": 2.848007549489727e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.357421875, "logps/chosen": -772.0, "logps/rejected": -696.5, "loss": 0.637, "rewards/accuracies": 0.78125, "rewards/chosen": 1.28466796875, "rewards/margins": 4.7421875, "rewards/rejected": -3.458984375, "step": 3868 }, { "epoch": 0.7310690160139827, "grad_norm": 3.0588801435626056, "learning_rate": 2.845610206562675e-07, "logits/chosen": 3.1015625, "logits/rejected": 2.984375, "logps/chosen": -641.5, "logps/rejected": -732.0, "loss": 0.5914, "rewards/accuracies": 0.8125, "rewards/chosen": 0.814453125, "rewards/margins": 4.55078125, "rewards/rejected": -3.736328125, "step": 3869 }, { "epoch": 0.73125797156219, "grad_norm": 2.0912031812686824, "learning_rate": 2.843214018426294e-07, "logits/chosen": 2.978515625, "logits/rejected": 3.08203125, "logps/chosen": -637.0, "logps/rejected": -1075.0, "loss": 0.6546, "rewards/accuracies": 0.75, "rewards/chosen": 0.29425048828125, "rewards/margins": 4.70703125, "rewards/rejected": -4.41015625, "step": 3870 }, { "epoch": 0.7314469271103973, "grad_norm": 2.6872955944067756, "learning_rate": 2.840818986123037e-07, "logits/chosen": 2.57666015625, "logits/rejected": 2.41015625, "logps/chosen": -795.0, "logps/rejected": -995.0, "loss": 0.5037, "rewards/accuracies": 0.90625, "rewards/chosen": 1.189453125, "rewards/margins": 5.99609375, "rewards/rejected": -4.80859375, "step": 3871 }, { "epoch": 0.7316358826586046, "grad_norm": 2.0543051313850365, "learning_rate": 2.838425110694866e-07, "logits/chosen": 3.4453125, "logits/rejected": 3.578125, "logps/chosen": -14417.0, "logps/rejected": -998.0, "loss": 0.7573, "rewards/accuracies": 0.65625, "rewards/chosen": 156.995361328125, "rewards/margins": 161.5, "rewards/rejected": -4.10546875, "step": 3872 }, { "epoch": 0.7318248382068119, "grad_norm": 1.6313675561494545, "learning_rate": 2.836032393183234e-07, "logits/chosen": 3.03125, "logits/rejected": 3.08203125, "logps/chosen": -14716.5, "logps/rejected": -14837.0, "loss": 0.5488, "rewards/accuracies": 0.8125, "rewards/chosen": 159.166015625, "rewards/margins": 5.44140625, "rewards/rejected": 153.79296875, "step": 3873 }, { "epoch": 0.7320137937550192, "grad_norm": 2.8229279455850476, "learning_rate": 2.8336408346290896e-07, "logits/chosen": 2.48046875, "logits/rejected": 2.21484375, "logps/chosen": -572.5, "logps/rejected": -1064.5, "loss": 0.6465, "rewards/accuracies": 0.8125, "rewards/chosen": 0.072998046875, "rewards/margins": 4.28515625, "rewards/rejected": -4.21484375, "step": 3874 }, { "epoch": 0.7322027493032264, "grad_norm": 2.0827358102992846, "learning_rate": 2.831250436072883e-07, "logits/chosen": 3.33203125, "logits/rejected": 3.16015625, "logps/chosen": -573.5, "logps/rejected": -12615.5, "loss": 0.6838, "rewards/accuracies": 0.78125, "rewards/chosen": 0.690673828125, "rewards/margins": -149.015625, "rewards/rejected": 149.578125, "step": 3875 }, { "epoch": 0.7323917048514337, "grad_norm": 1.348162629868235, "learning_rate": 2.8288611985545517e-07, "logits/chosen": 2.486328125, "logits/rejected": 2.8125, "logps/chosen": -884.5, "logps/rejected": -517.5, "loss": 0.6106, "rewards/accuracies": 0.78125, "rewards/chosen": 0.99658203125, "rewards/margins": 4.119140625, "rewards/rejected": -3.125, "step": 3876 }, { "epoch": 0.732580660399641, "grad_norm": 1.9100461788571572, "learning_rate": 2.826473123113534e-07, "logits/chosen": 3.2890625, "logits/rejected": 3.12890625, "logps/chosen": -975.0, "logps/rejected": -1283.0, "loss": 0.5912, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0439453125, "rewards/margins": 5.21875, "rewards/rejected": -4.17578125, "step": 3877 }, { "epoch": 0.7327696159478483, "grad_norm": 2.335441627566683, "learning_rate": 2.824086210788761e-07, "logits/chosen": 2.60205078125, "logits/rejected": 2.68359375, "logps/chosen": -745.0, "logps/rejected": -1341.0, "loss": 0.5463, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58984375, "rewards/margins": 6.7265625, "rewards/rejected": -6.1328125, "step": 3878 }, { "epoch": 0.7329585714960556, "grad_norm": 2.1213331487628464, "learning_rate": 2.8217004626186553e-07, "logits/chosen": 2.837890625, "logits/rejected": 2.8671875, "logps/chosen": -561.25, "logps/rejected": -858.0, "loss": 0.5527, "rewards/accuracies": 0.8125, "rewards/chosen": 0.52734375, "rewards/margins": 5.15234375, "rewards/rejected": -4.6328125, "step": 3879 }, { "epoch": 0.7331475270442629, "grad_norm": 2.3848136754512215, "learning_rate": 2.819315879641135e-07, "logits/chosen": 2.87890625, "logits/rejected": 2.84765625, "logps/chosen": -636.5, "logps/rejected": -853.0, "loss": 0.5701, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9755859375, "rewards/margins": 4.34765625, "rewards/rejected": -3.37890625, "step": 3880 }, { "epoch": 0.7333364825924701, "grad_norm": 1.9912513548945754, "learning_rate": 2.8169324628936154e-07, "logits/chosen": 4.171875, "logits/rejected": 3.7578125, "logps/chosen": -806.0, "logps/rejected": -1124.0, "loss": 0.5608, "rewards/accuracies": 0.8125, "rewards/chosen": 1.254150390625, "rewards/margins": 7.62890625, "rewards/rejected": -6.376953125, "step": 3881 }, { "epoch": 0.7335254381406774, "grad_norm": 2.2917733039070196, "learning_rate": 2.814550213412997e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.474609375, "logps/chosen": -851.0, "logps/rejected": -814.0, "loss": 0.6365, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6787109375, "rewards/margins": 3.96875, "rewards/rejected": -3.28515625, "step": 3882 }, { "epoch": 0.7337143936888847, "grad_norm": 2.454025607181992, "learning_rate": 2.812169132235678e-07, "logits/chosen": 3.875, "logits/rejected": 3.46484375, "logps/chosen": -551.0, "logps/rejected": -667.5, "loss": 0.687, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1572265625, "rewards/margins": 3.140625, "rewards/rejected": -2.98046875, "step": 3883 }, { "epoch": 0.733903349237092, "grad_norm": 1.934935639657547, "learning_rate": 2.8097892203975446e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.806640625, "logps/chosen": -574.75, "logps/rejected": -746.0, "loss": 0.5876, "rewards/accuracies": 0.8125, "rewards/chosen": 0.46630859375, "rewards/margins": 4.51171875, "rewards/rejected": -4.04296875, "step": 3884 }, { "epoch": 0.7340923047852993, "grad_norm": 1.9807202969364996, "learning_rate": 2.8074104789339757e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.873046875, "logps/chosen": -667.5, "logps/rejected": -1385.0, "loss": 0.5837, "rewards/accuracies": 0.8125, "rewards/chosen": 0.45703125, "rewards/margins": 5.7109375, "rewards/rejected": -5.25390625, "step": 3885 }, { "epoch": 0.7342812603335065, "grad_norm": 2.0079035734747595, "learning_rate": 2.8050329088798453e-07, "logits/chosen": 4.046875, "logits/rejected": 3.70703125, "logps/chosen": -650.0, "logps/rejected": -738.5, "loss": 0.5612, "rewards/accuracies": 0.84375, "rewards/chosen": 0.763671875, "rewards/margins": 4.73828125, "rewards/rejected": -3.96484375, "step": 3886 }, { "epoch": 0.7344702158817138, "grad_norm": 2.9171243063794545, "learning_rate": 2.8026565112695076e-07, "logits/chosen": 2.697265625, "logits/rejected": 2.494140625, "logps/chosen": -1157.0, "logps/rejected": -1494.0, "loss": 0.5361, "rewards/accuracies": 0.875, "rewards/chosen": 1.3125, "rewards/margins": 5.71875, "rewards/rejected": -4.4140625, "step": 3887 }, { "epoch": 0.7346591714299211, "grad_norm": 3.249302312505241, "learning_rate": 2.800281287136819e-07, "logits/chosen": 3.1015625, "logits/rejected": 2.5, "logps/chosen": -1009.0, "logps/rejected": -1198.0, "loss": 0.5339, "rewards/accuracies": 0.78125, "rewards/chosen": 0.873046875, "rewards/margins": 6.203125, "rewards/rejected": -5.32421875, "step": 3888 }, { "epoch": 0.7348481269781284, "grad_norm": 2.1406517760465626, "learning_rate": 2.797907237515115e-07, "logits/chosen": 2.556640625, "logits/rejected": 2.4765625, "logps/chosen": -1349.0, "logps/rejected": -996.0, "loss": 0.498, "rewards/accuracies": 0.84375, "rewards/chosen": 1.244140625, "rewards/margins": 7.5546875, "rewards/rejected": -6.296875, "step": 3889 }, { "epoch": 0.7350370825263357, "grad_norm": 1.791522244014171, "learning_rate": 2.7955343634372254e-07, "logits/chosen": 2.76953125, "logits/rejected": 2.5, "logps/chosen": -662.5, "logps/rejected": -643.0, "loss": 0.5776, "rewards/accuracies": 0.875, "rewards/chosen": -0.11846923828125, "rewards/margins": 4.90234375, "rewards/rejected": -5.01953125, "step": 3890 }, { "epoch": 0.735226038074543, "grad_norm": 1.6909341068348254, "learning_rate": 2.7931626659354706e-07, "logits/chosen": 3.1015625, "logits/rejected": 2.93359375, "logps/chosen": -913.0, "logps/rejected": -730.5, "loss": 0.5483, "rewards/accuracies": 0.8125, "rewards/chosen": 0.46484375, "rewards/margins": 4.484375, "rewards/rejected": -4.015625, "step": 3891 }, { "epoch": 0.7354149936227502, "grad_norm": 1.533214326094687, "learning_rate": 2.7907921460416514e-07, "logits/chosen": 2.541015625, "logits/rejected": 2.203125, "logps/chosen": -628.25, "logps/rejected": -620.5, "loss": 0.5347, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6063232421875, "rewards/margins": 5.0703125, "rewards/rejected": -4.4609375, "step": 3892 }, { "epoch": 0.7356039491709575, "grad_norm": 1.9579001966396978, "learning_rate": 2.788422804787064e-07, "logits/chosen": 2.763671875, "logits/rejected": 2.796875, "logps/chosen": -528.0, "logps/rejected": -594.5, "loss": 0.6584, "rewards/accuracies": 0.75, "rewards/chosen": 0.51171875, "rewards/margins": 3.6640625, "rewards/rejected": -3.146484375, "step": 3893 }, { "epoch": 0.7357929047191648, "grad_norm": 7.011291305657586, "learning_rate": 2.786054643202489e-07, "logits/chosen": 2.873046875, "logits/rejected": 3.1484375, "logps/chosen": -624.0, "logps/rejected": -642.5, "loss": 0.6923, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1138916015625, "rewards/margins": 3.35546875, "rewards/rejected": -3.474609375, "step": 3894 }, { "epoch": 0.7359818602673721, "grad_norm": 3.184265525694399, "learning_rate": 2.7836876623181915e-07, "logits/chosen": 3.607421875, "logits/rejected": 2.8828125, "logps/chosen": -979.0, "logps/rejected": -1512.5, "loss": 0.4994, "rewards/accuracies": 0.875, "rewards/chosen": 1.2294921875, "rewards/margins": 6.59375, "rewards/rejected": -5.36328125, "step": 3895 }, { "epoch": 0.7361708158155794, "grad_norm": 2.2845382671473646, "learning_rate": 2.78132186316393e-07, "logits/chosen": 2.28125, "logits/rejected": 2.11328125, "logps/chosen": -645.5, "logps/rejected": -712.0, "loss": 0.4631, "rewards/accuracies": 0.90625, "rewards/chosen": 0.86328125, "rewards/margins": 5.6875, "rewards/rejected": -4.82421875, "step": 3896 }, { "epoch": 0.7363597713637867, "grad_norm": 3.128822995238264, "learning_rate": 2.7789572467689364e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.3046875, "logps/chosen": -514.0, "logps/rejected": -1525.0, "loss": 0.5222, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7275390625, "rewards/margins": 7.9296875, "rewards/rejected": -7.1875, "step": 3897 }, { "epoch": 0.7365487269119939, "grad_norm": 2.360746271010166, "learning_rate": 2.776593814161942e-07, "logits/chosen": 3.76171875, "logits/rejected": 3.35546875, "logps/chosen": -534.5, "logps/rejected": -506.0, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": -0.12640380859375, "rewards/margins": 3.50390625, "rewards/rejected": -3.625, "step": 3898 }, { "epoch": 0.7367376824602012, "grad_norm": 3.3232823138017467, "learning_rate": 2.7742315663711533e-07, "logits/chosen": 2.0, "logits/rejected": 1.6875, "logps/chosen": -598.5, "logps/rejected": -552.0, "loss": 0.7426, "rewards/accuracies": 0.71875, "rewards/chosen": -0.66845703125, "rewards/margins": 3.12109375, "rewards/rejected": -3.7890625, "step": 3899 }, { "epoch": 0.7369266380084085, "grad_norm": 1.2563243239976631, "learning_rate": 2.7718705044242675e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.46826171875, "logps/chosen": -399.75, "logps/rejected": -431.5, "loss": 0.6761, "rewards/accuracies": 0.78125, "rewards/chosen": -0.228515625, "rewards/margins": 3.1875, "rewards/rejected": -3.421875, "step": 3900 }, { "epoch": 0.7371155935566158, "grad_norm": 3.0348824656058784, "learning_rate": 2.7695106293484617e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.7421875, "logps/chosen": -932.0, "logps/rejected": -1004.0, "loss": 0.6157, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4130859375, "rewards/margins": 5.25390625, "rewards/rejected": -3.84375, "step": 3901 }, { "epoch": 0.7373045491048231, "grad_norm": 2.861104680812805, "learning_rate": 2.767151942170395e-07, "logits/chosen": 3.6875, "logits/rejected": 3.1640625, "logps/chosen": -675.0, "logps/rejected": -562.5, "loss": 0.6431, "rewards/accuracies": 0.84375, "rewards/chosen": 0.79638671875, "rewards/margins": 2.9931640625, "rewards/rejected": -2.19140625, "step": 3902 }, { "epoch": 0.7374935046530304, "grad_norm": 2.1700383573948288, "learning_rate": 2.7647944439162186e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.5078125, "logps/chosen": -1057.0, "logps/rejected": -1217.0, "loss": 0.5947, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1943359375, "rewards/margins": 5.68359375, "rewards/rejected": -4.48828125, "step": 3903 }, { "epoch": 0.7376824602012376, "grad_norm": 3.136913819951117, "learning_rate": 2.762438135611558e-07, "logits/chosen": 2.5859375, "logits/rejected": 1.927734375, "logps/chosen": -642.5, "logps/rejected": -720.0, "loss": 0.5701, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3916015625, "rewards/margins": 4.361328125, "rewards/rejected": -3.966796875, "step": 3904 }, { "epoch": 0.7378714157494449, "grad_norm": 3.0295417491572096, "learning_rate": 2.760083018281523e-07, "logits/chosen": 1.95703125, "logits/rejected": 2.068359375, "logps/chosen": -926.0, "logps/rejected": -1139.0, "loss": 0.4629, "rewards/accuracies": 0.90625, "rewards/chosen": 1.458984375, "rewards/margins": 6.21875, "rewards/rejected": -4.7578125, "step": 3905 }, { "epoch": 0.7380603712976522, "grad_norm": 1.8356574001187376, "learning_rate": 2.757729092950707e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.828125, "logps/chosen": -584.0, "logps/rejected": -717.0, "loss": 0.4899, "rewards/accuracies": 0.84375, "rewards/chosen": 0.64794921875, "rewards/margins": 4.953125, "rewards/rejected": -4.30859375, "step": 3906 }, { "epoch": 0.7382493268458595, "grad_norm": 2.8474393409089402, "learning_rate": 2.755376360643182e-07, "logits/chosen": 2.796875, "logits/rejected": 2.244140625, "logps/chosen": -1200.0, "logps/rejected": -1328.0, "loss": 0.4667, "rewards/accuracies": 0.875, "rewards/chosen": 1.775390625, "rewards/margins": 7.51953125, "rewards/rejected": -5.72265625, "step": 3907 }, { "epoch": 0.7384382823940668, "grad_norm": 3.2016464425469544, "learning_rate": 2.753024822382505e-07, "logits/chosen": 3.2578125, "logits/rejected": 3.2890625, "logps/chosen": -964.5, "logps/rejected": -1156.0, "loss": 0.6048, "rewards/accuracies": 0.75, "rewards/chosen": 1.25994873046875, "rewards/margins": 4.25, "rewards/rejected": -2.98828125, "step": 3908 }, { "epoch": 0.738627237942274, "grad_norm": 4.203237160907314, "learning_rate": 2.7506744791917103e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.55078125, "logps/chosen": -698.5, "logps/rejected": -661.5, "loss": 0.5603, "rewards/accuracies": 0.84375, "rewards/chosen": 0.95361328125, "rewards/margins": 5.0, "rewards/rejected": -4.0546875, "step": 3909 }, { "epoch": 0.7388161934904813, "grad_norm": 1.7113642010480608, "learning_rate": 2.7483253320933144e-07, "logits/chosen": 3.05078125, "logits/rejected": 2.712890625, "logps/chosen": -949.5, "logps/rejected": -1049.0, "loss": 0.5241, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1357421875, "rewards/margins": 5.109375, "rewards/rejected": -3.98046875, "step": 3910 }, { "epoch": 0.7390051490386886, "grad_norm": 3.474194145622612, "learning_rate": 2.745977382109309e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.67578125, "logps/chosen": -1170.0, "logps/rejected": -1702.0, "loss": 0.4873, "rewards/accuracies": 0.84375, "rewards/chosen": 1.521484375, "rewards/margins": 6.26171875, "rewards/rejected": -4.75, "step": 3911 }, { "epoch": 0.7391941045868959, "grad_norm": 3.340085721735814, "learning_rate": 2.7436306302611733e-07, "logits/chosen": 2.421875, "logits/rejected": 1.8828125, "logps/chosen": -560.0, "logps/rejected": -608.0, "loss": 0.6869, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0675048828125, "rewards/margins": 3.515625, "rewards/rejected": -3.58203125, "step": 3912 }, { "epoch": 0.7393830601351032, "grad_norm": 4.236231163616142, "learning_rate": 2.741285077569856e-07, "logits/chosen": 3.09765625, "logits/rejected": 2.892578125, "logps/chosen": -1553.0, "logps/rejected": -858.5, "loss": 0.6451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.623046875, "rewards/margins": 1.8203125, "rewards/rejected": -2.451171875, "step": 3913 }, { "epoch": 0.7395720156833105, "grad_norm": 2.855232395530281, "learning_rate": 2.7389407250557906e-07, "logits/chosen": 3.17578125, "logits/rejected": 2.9296875, "logps/chosen": -597.0, "logps/rejected": -822.5, "loss": 0.5039, "rewards/accuracies": 0.8125, "rewards/chosen": 0.85009765625, "rewards/margins": 5.32421875, "rewards/rejected": -4.4609375, "step": 3914 }, { "epoch": 0.7397609712315177, "grad_norm": 2.410538810299556, "learning_rate": 2.736597573738886e-07, "logits/chosen": 3.546875, "logits/rejected": 3.7109375, "logps/chosen": -868.5, "logps/rejected": -1046.5, "loss": 0.5959, "rewards/accuracies": 0.75, "rewards/chosen": 0.919921875, "rewards/margins": 6.046875, "rewards/rejected": -5.11328125, "step": 3915 }, { "epoch": 0.739949926779725, "grad_norm": 2.264570172594206, "learning_rate": 2.7342556246385263e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.2158203125, "logps/chosen": -688.0, "logps/rejected": -669.0, "loss": 0.556, "rewards/accuracies": 0.84375, "rewards/chosen": 0.364501953125, "rewards/margins": 4.07421875, "rewards/rejected": -3.71484375, "step": 3916 }, { "epoch": 0.7401388823279323, "grad_norm": 2.31239037061194, "learning_rate": 2.73191487877358e-07, "logits/chosen": 3.23828125, "logits/rejected": 2.69140625, "logps/chosen": -297.25, "logps/rejected": -404.0, "loss": 0.7272, "rewards/accuracies": 0.75, "rewards/chosen": -0.24322509765625, "rewards/margins": 3.3125, "rewards/rejected": -3.55078125, "step": 3917 }, { "epoch": 0.7403278378761396, "grad_norm": 2.089673288279216, "learning_rate": 2.729575337162382e-07, "logits/chosen": 3.44921875, "logits/rejected": 3.29296875, "logps/chosen": -794.5, "logps/rejected": -878.5, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": 1.17822265625, "rewards/margins": 6.0, "rewards/rejected": -4.8359375, "step": 3918 }, { "epoch": 0.740516793424347, "grad_norm": 2.3205821362156787, "learning_rate": 2.7272370008227517e-07, "logits/chosen": 3.21875, "logits/rejected": 2.32958984375, "logps/chosen": -782.0, "logps/rejected": -883.0, "loss": 0.524, "rewards/accuracies": 0.90625, "rewards/chosen": 1.79296875, "rewards/margins": 7.0234375, "rewards/rejected": -5.2392578125, "step": 3919 }, { "epoch": 0.7407057489725543, "grad_norm": 2.8290743246800205, "learning_rate": 2.7248998707719803e-07, "logits/chosen": 3.072265625, "logits/rejected": 2.5673828125, "logps/chosen": -663.5, "logps/rejected": -772.0, "loss": 0.7025, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9619140625, "rewards/margins": 2.857421875, "rewards/rejected": -1.890625, "step": 3920 }, { "epoch": 0.7408947045207614, "grad_norm": 1.9551944576113416, "learning_rate": 2.722563948026835e-07, "logits/chosen": 2.015625, "logits/rejected": 1.510986328125, "logps/chosen": -836.5, "logps/rejected": -674.0, "loss": 0.6645, "rewards/accuracies": 0.8125, "rewards/chosen": 1.451171875, "rewards/margins": 3.34765625, "rewards/rejected": -1.8916015625, "step": 3921 }, { "epoch": 0.7410836600689688, "grad_norm": 1.9534947467256103, "learning_rate": 2.720229233603556e-07, "logits/chosen": 3.18359375, "logits/rejected": 2.94140625, "logps/chosen": -555.5, "logps/rejected": -676.0, "loss": 0.751, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2081298828125, "rewards/margins": 3.14453125, "rewards/rejected": -2.9453125, "step": 3922 }, { "epoch": 0.7412726156171761, "grad_norm": 3.613554449139748, "learning_rate": 2.717895728517859e-07, "logits/chosen": 3.36328125, "logits/rejected": 2.82421875, "logps/chosen": -750.5, "logps/rejected": -1084.0, "loss": 0.5974, "rewards/accuracies": 0.8125, "rewards/chosen": 1.134765625, "rewards/margins": 4.515625, "rewards/rejected": -3.38671875, "step": 3923 }, { "epoch": 0.7414615711653834, "grad_norm": 4.168968762724525, "learning_rate": 2.7155634337849376e-07, "logits/chosen": 3.3203125, "logits/rejected": 2.9765625, "logps/chosen": -700.0, "logps/rejected": -870.5, "loss": 0.5863, "rewards/accuracies": 0.78125, "rewards/chosen": 1.09814453125, "rewards/margins": 4.7265625, "rewards/rejected": -3.623046875, "step": 3924 }, { "epoch": 0.7416505267135907, "grad_norm": 2.36648575428291, "learning_rate": 2.713232350419453e-07, "logits/chosen": 3.36328125, "logits/rejected": 3.1015625, "logps/chosen": -1089.0, "logps/rejected": -1520.0, "loss": 0.5255, "rewards/accuracies": 0.71875, "rewards/chosen": 1.5184326171875, "rewards/margins": 6.33203125, "rewards/rejected": -4.80859375, "step": 3925 }, { "epoch": 0.741839482261798, "grad_norm": 2.571771838125108, "learning_rate": 2.710902479435541e-07, "logits/chosen": 2.515625, "logits/rejected": 2.404296875, "logps/chosen": -622.5, "logps/rejected": -986.0, "loss": 0.6402, "rewards/accuracies": 0.75, "rewards/chosen": 0.926910400390625, "rewards/margins": 4.9765625, "rewards/rejected": -4.04296875, "step": 3926 }, { "epoch": 0.7420284378100052, "grad_norm": 1.5967478666326957, "learning_rate": 2.7085738218468115e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.42578125, "logps/chosen": -814.0, "logps/rejected": -1421.0, "loss": 0.6076, "rewards/accuracies": 0.75, "rewards/chosen": 1.63671875, "rewards/margins": 5.44921875, "rewards/rejected": -3.814453125, "step": 3927 }, { "epoch": 0.7422173933582125, "grad_norm": 2.0217056777470526, "learning_rate": 2.7062463786663446e-07, "logits/chosen": 2.26171875, "logits/rejected": 1.9296875, "logps/chosen": -670.5, "logps/rejected": -622.5, "loss": 0.632, "rewards/accuracies": 0.75, "rewards/chosen": 0.881591796875, "rewards/margins": 3.96484375, "rewards/rejected": -3.087890625, "step": 3928 }, { "epoch": 0.7424063489064198, "grad_norm": 2.8643941105927646, "learning_rate": 2.703920150906693e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.23046875, "logps/chosen": -1559.0, "logps/rejected": -1214.0, "loss": 0.7478, "rewards/accuracies": 0.75, "rewards/chosen": -5.07373046875, "rewards/margins": 1.56591796875, "rewards/rejected": -6.619140625, "step": 3929 }, { "epoch": 0.7425953044546271, "grad_norm": 2.55703277213363, "learning_rate": 2.7015951395798824e-07, "logits/chosen": 1.8076171875, "logits/rejected": 1.962890625, "logps/chosen": -847.0, "logps/rejected": -982.5, "loss": 0.628, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47021484375, "rewards/margins": 3.775390625, "rewards/rejected": -3.29296875, "step": 3930 }, { "epoch": 0.7427842600028344, "grad_norm": 1.6754155891255245, "learning_rate": 2.699271345697407e-07, "logits/chosen": 3.390625, "logits/rejected": 3.890625, "logps/chosen": -964.0, "logps/rejected": -1429.5, "loss": 0.6479, "rewards/accuracies": 0.78125, "rewards/chosen": 1.789794921875, "rewards/margins": 7.4482421875, "rewards/rejected": -5.66357421875, "step": 3931 }, { "epoch": 0.7429732155510416, "grad_norm": 3.168651335371008, "learning_rate": 2.6969487702702304e-07, "logits/chosen": 2.6630859375, "logits/rejected": 2.5400390625, "logps/chosen": -773.0, "logps/rejected": -689.0, "loss": 0.4719, "rewards/accuracies": 0.90625, "rewards/chosen": 0.69207763671875, "rewards/margins": 5.625, "rewards/rejected": -4.92578125, "step": 3932 }, { "epoch": 0.7431621710992489, "grad_norm": 2.392604474677304, "learning_rate": 2.69462741430879e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.052734375, "logps/chosen": -534.5, "logps/rejected": -543.0, "loss": 0.538, "rewards/accuracies": 0.875, "rewards/chosen": 0.8857421875, "rewards/margins": 5.2734375, "rewards/rejected": -4.37890625, "step": 3933 }, { "epoch": 0.7433511266474562, "grad_norm": 1.6795076755873417, "learning_rate": 2.692307278822986e-07, "logits/chosen": 2.435546875, "logits/rejected": 2.337890625, "logps/chosen": -599.0, "logps/rejected": -651.5, "loss": 0.5802, "rewards/accuracies": 0.78125, "rewards/chosen": 0.734375, "rewards/margins": 4.298828125, "rewards/rejected": -3.556640625, "step": 3934 }, { "epoch": 0.7435400821956635, "grad_norm": 1.8046277807795612, "learning_rate": 2.689988364822199e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.474609375, "logps/chosen": -754.0, "logps/rejected": -1287.0, "loss": 0.4755, "rewards/accuracies": 0.78125, "rewards/chosen": 0.771484375, "rewards/margins": 9.4453125, "rewards/rejected": -8.6796875, "step": 3935 }, { "epoch": 0.7437290377438708, "grad_norm": 3.0200731753707166, "learning_rate": 2.687670673315263e-07, "logits/chosen": 2.396484375, "logits/rejected": 2.267578125, "logps/chosen": -887.5, "logps/rejected": -956.0, "loss": 0.4794, "rewards/accuracies": 0.9375, "rewards/chosen": 1.376953125, "rewards/margins": 6.28125, "rewards/rejected": -4.8984375, "step": 3936 }, { "epoch": 0.7439179932920781, "grad_norm": 2.7938045531122024, "learning_rate": 2.685354205310495e-07, "logits/chosen": 3.44140625, "logits/rejected": 3.33203125, "logps/chosen": -817.0, "logps/rejected": -781.0, "loss": 0.6493, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1962890625, "rewards/margins": 3.671875, "rewards/rejected": -2.4716796875, "step": 3937 }, { "epoch": 0.7441069488402853, "grad_norm": 1.6604230920943186, "learning_rate": 2.683038961815669e-07, "logits/chosen": 2.7265625, "logits/rejected": 2.779296875, "logps/chosen": -690.5, "logps/rejected": -1744.0, "loss": 0.6062, "rewards/accuracies": 0.84375, "rewards/chosen": 0.33056640625, "rewards/margins": 6.38671875, "rewards/rejected": -6.0546875, "step": 3938 }, { "epoch": 0.7442959043884926, "grad_norm": 4.4346713982203605, "learning_rate": 2.680724943838031e-07, "logits/chosen": 3.294921875, "logits/rejected": 2.9326171875, "logps/chosen": -929.5, "logps/rejected": -994.0, "loss": 0.5603, "rewards/accuracies": 0.71875, "rewards/chosen": 1.1572265625, "rewards/margins": 6.22265625, "rewards/rejected": -5.0703125, "step": 3939 }, { "epoch": 0.7444848599366999, "grad_norm": 3.5918829663323546, "learning_rate": 2.6784121523842967e-07, "logits/chosen": 2.54296875, "logits/rejected": 1.96875, "logps/chosen": -783.5, "logps/rejected": -816.0, "loss": 0.4859, "rewards/accuracies": 0.875, "rewards/chosen": 0.7764892578125, "rewards/margins": 5.7734375, "rewards/rejected": -5.00390625, "step": 3940 }, { "epoch": 0.7446738154849072, "grad_norm": 2.591177440541708, "learning_rate": 2.676100588460638e-07, "logits/chosen": 2.974609375, "logits/rejected": 2.845703125, "logps/chosen": -537.5, "logps/rejected": -823.0, "loss": 0.5171, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8330078125, "rewards/margins": 5.64453125, "rewards/rejected": -4.81640625, "step": 3941 }, { "epoch": 0.7448627710331145, "grad_norm": 1.600953867970274, "learning_rate": 2.6737902530727053e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.822265625, "logps/chosen": -731.0, "logps/rejected": -1073.0, "loss": 0.592, "rewards/accuracies": 0.75, "rewards/chosen": -0.0908203125, "rewards/margins": 6.0546875, "rewards/rejected": -6.14453125, "step": 3942 }, { "epoch": 0.7450517265813218, "grad_norm": 5.130945413767741, "learning_rate": 2.6714811472256055e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.353515625, "logps/chosen": -635.5, "logps/rejected": -611.0, "loss": 0.5783, "rewards/accuracies": 0.75, "rewards/chosen": 0.7021484375, "rewards/margins": 4.642578125, "rewards/rejected": -3.93359375, "step": 3943 }, { "epoch": 0.745240682129529, "grad_norm": 5.148806616116733, "learning_rate": 2.6691732719239136e-07, "logits/chosen": 2.087890625, "logits/rejected": 1.8251953125, "logps/chosen": -855.5, "logps/rejected": -708.5, "loss": 0.4873, "rewards/accuracies": 0.96875, "rewards/chosen": 1.05859375, "rewards/margins": 5.3828125, "rewards/rejected": -4.3203125, "step": 3944 }, { "epoch": 0.7454296376777363, "grad_norm": 2.6547254266884757, "learning_rate": 2.6668666281716723e-07, "logits/chosen": 3.046875, "logits/rejected": 2.29296875, "logps/chosen": -618.0, "logps/rejected": -510.0, "loss": 0.6828, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2978515625, "rewards/margins": 2.970703125, "rewards/rejected": -3.265625, "step": 3945 }, { "epoch": 0.7456185932259436, "grad_norm": 2.3714318844831688, "learning_rate": 2.6645612169723806e-07, "logits/chosen": 1.7822265625, "logits/rejected": 1.412109375, "logps/chosen": -713.0, "logps/rejected": -811.0, "loss": 0.5954, "rewards/accuracies": 0.75, "rewards/chosen": 0.070068359375, "rewards/margins": 4.39453125, "rewards/rejected": -4.3203125, "step": 3946 }, { "epoch": 0.7458075487741509, "grad_norm": 2.245440717396635, "learning_rate": 2.662257039329012e-07, "logits/chosen": 3.044921875, "logits/rejected": 2.9140625, "logps/chosen": -766.0, "logps/rejected": -1155.0, "loss": 0.6206, "rewards/accuracies": 0.875, "rewards/chosen": 0.6934814453125, "rewards/margins": 6.5859375, "rewards/rejected": -5.88671875, "step": 3947 }, { "epoch": 0.7459965043223582, "grad_norm": 1.6597563608440264, "learning_rate": 2.659954096243994e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.37890625, "logps/chosen": -879.0, "logps/rejected": -1181.5, "loss": 0.6023, "rewards/accuracies": 0.78125, "rewards/chosen": 0.525390625, "rewards/margins": 5.59765625, "rewards/rejected": -5.07421875, "step": 3948 }, { "epoch": 0.7461854598705655, "grad_norm": 2.3589354605639894, "learning_rate": 2.657652388719219e-07, "logits/chosen": 3.35546875, "logits/rejected": 3.05078125, "logps/chosen": -627.5, "logps/rejected": -1070.0, "loss": 0.6224, "rewards/accuracies": 0.75, "rewards/chosen": 0.46728515625, "rewards/margins": 5.0078125, "rewards/rejected": -4.54296875, "step": 3949 }, { "epoch": 0.7463744154187727, "grad_norm": 2.6212097725286783, "learning_rate": 2.6553519177560494e-07, "logits/chosen": 2.65625, "logits/rejected": 2.44921875, "logps/chosen": -650.0, "logps/rejected": -980.5, "loss": 0.5892, "rewards/accuracies": 0.75, "rewards/chosen": -0.2529296875, "rewards/margins": 5.0859375, "rewards/rejected": -5.33984375, "step": 3950 }, { "epoch": 0.74656337096698, "grad_norm": 1.7764274119914645, "learning_rate": 2.653052684355297e-07, "logits/chosen": 2.59765625, "logits/rejected": 2.6796875, "logps/chosen": -723.0, "logps/rejected": -1813.0, "loss": 0.571, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4453125, "rewards/margins": 7.67578125, "rewards/rejected": -6.23046875, "step": 3951 }, { "epoch": 0.7467523265151873, "grad_norm": 3.154492165612287, "learning_rate": 2.6507546895172464e-07, "logits/chosen": 4.3203125, "logits/rejected": 4.01953125, "logps/chosen": -538.75, "logps/rejected": -1771.5, "loss": 0.5653, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5179443359375, "rewards/margins": 7.06640625, "rewards/rejected": -6.5625, "step": 3952 }, { "epoch": 0.7469412820633946, "grad_norm": 1.9711586688047575, "learning_rate": 2.648457934241638e-07, "logits/chosen": 2.03125, "logits/rejected": 2.25, "logps/chosen": -708.5, "logps/rejected": -1255.0, "loss": 0.5806, "rewards/accuracies": 0.90625, "rewards/chosen": 0.291015625, "rewards/margins": 6.7890625, "rewards/rejected": -6.49609375, "step": 3953 }, { "epoch": 0.7471302376116019, "grad_norm": 2.3679530398609514, "learning_rate": 2.646162419527673e-07, "logits/chosen": 2.6630859375, "logits/rejected": 2.11865234375, "logps/chosen": -973.0, "logps/rejected": -2020.0, "loss": 0.4586, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1337890625, "rewards/margins": 8.859375, "rewards/rejected": -7.73046875, "step": 3954 }, { "epoch": 0.7473191931598091, "grad_norm": 2.520719962743343, "learning_rate": 2.643868146374012e-07, "logits/chosen": 1.732421875, "logits/rejected": 1.55859375, "logps/chosen": -909.5, "logps/rejected": -1076.0, "loss": 0.4607, "rewards/accuracies": 0.90625, "rewards/chosen": 0.923583984375, "rewards/margins": 6.05859375, "rewards/rejected": -5.125, "step": 3955 }, { "epoch": 0.7475081487080164, "grad_norm": 2.346976255203243, "learning_rate": 2.6415751157787804e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.359375, "logps/chosen": -745.0, "logps/rejected": -1046.5, "loss": 0.5806, "rewards/accuracies": 0.71875, "rewards/chosen": 0.90765380859375, "rewards/margins": 5.19921875, "rewards/rejected": -4.296875, "step": 3956 }, { "epoch": 0.7476971042562237, "grad_norm": 1.5310813438921456, "learning_rate": 2.639283328739559e-07, "logits/chosen": 3.27734375, "logits/rejected": 3.078125, "logps/chosen": -993.0, "logps/rejected": -1811.0, "loss": 0.7177, "rewards/accuracies": 0.75, "rewards/chosen": -0.26953125, "rewards/margins": 6.95703125, "rewards/rejected": -7.23046875, "step": 3957 }, { "epoch": 0.747886059804431, "grad_norm": 1.6795830082753729, "learning_rate": 2.636992786253387e-07, "logits/chosen": 2.69140625, "logits/rejected": 2.314453125, "logps/chosen": -1152.5, "logps/rejected": -883.0, "loss": 0.5501, "rewards/accuracies": 0.78125, "rewards/chosen": 2.0625, "rewards/margins": 5.17578125, "rewards/rejected": -3.109375, "step": 3958 }, { "epoch": 0.7480750153526383, "grad_norm": 2.305408125887276, "learning_rate": 2.6347034893167653e-07, "logits/chosen": 3.30078125, "logits/rejected": 2.3798828125, "logps/chosen": -421.25, "logps/rejected": -556.0, "loss": 0.6613, "rewards/accuracies": 0.75, "rewards/chosen": 0.68701171875, "rewards/margins": 3.10546875, "rewards/rejected": -2.4140625, "step": 3959 }, { "epoch": 0.7482639709008456, "grad_norm": 2.3336058457370936, "learning_rate": 2.6324154389256473e-07, "logits/chosen": 2.759765625, "logits/rejected": 2.765625, "logps/chosen": -463.5, "logps/rejected": -623.0, "loss": 0.7219, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0982666015625, "rewards/margins": 2.955078125, "rewards/rejected": -3.05859375, "step": 3960 }, { "epoch": 0.7484529264490528, "grad_norm": 1.6541520454561223, "learning_rate": 2.630128636075455e-07, "logits/chosen": 4.19140625, "logits/rejected": 3.5078125, "logps/chosen": -690.5, "logps/rejected": -889.0, "loss": 0.714, "rewards/accuracies": 0.78125, "rewards/chosen": 0.221343994140625, "rewards/margins": 4.6123046875, "rewards/rejected": -4.390625, "step": 3961 }, { "epoch": 0.7486418819972601, "grad_norm": 3.725005655194003, "learning_rate": 2.627843081761053e-07, "logits/chosen": 3.189453125, "logits/rejected": 3.21875, "logps/chosen": -942.5, "logps/rejected": -908.5, "loss": 0.6517, "rewards/accuracies": 0.78125, "rewards/chosen": 0.26953125, "rewards/margins": 4.94140625, "rewards/rejected": -4.671875, "step": 3962 }, { "epoch": 0.7488308375454674, "grad_norm": 3.507836425535229, "learning_rate": 2.625558776976775e-07, "logits/chosen": 3.15234375, "logits/rejected": 3.068359375, "logps/chosen": -879.0, "logps/rejected": -780.0, "loss": 0.6812, "rewards/accuracies": 0.71875, "rewards/chosen": 0.152099609375, "rewards/margins": 3.34765625, "rewards/rejected": -3.19921875, "step": 3963 }, { "epoch": 0.7490197930936747, "grad_norm": 3.3609447366264003, "learning_rate": 2.623275722716405e-07, "logits/chosen": 3.0625, "logits/rejected": 3.2421875, "logps/chosen": -661.0, "logps/rejected": -1084.5, "loss": 0.7151, "rewards/accuracies": 0.75, "rewards/chosen": 0.2021484375, "rewards/margins": 5.357421875, "rewards/rejected": -5.138671875, "step": 3964 }, { "epoch": 0.749208748641882, "grad_norm": 2.050678571581539, "learning_rate": 2.620993919973183e-07, "logits/chosen": 3.40234375, "logits/rejected": 3.07421875, "logps/chosen": -365.5, "logps/rejected": -463.5, "loss": 0.7265, "rewards/accuracies": 0.8125, "rewards/chosen": 0.869384765625, "rewards/margins": 2.580078125, "rewards/rejected": -1.71484375, "step": 3965 }, { "epoch": 0.7493977041900893, "grad_norm": 3.6625894030310278, "learning_rate": 2.61871336973981e-07, "logits/chosen": 2.337890625, "logits/rejected": 2.046875, "logps/chosen": -1017.0, "logps/rejected": -967.0, "loss": 0.4619, "rewards/accuracies": 0.9375, "rewards/chosen": 1.560546875, "rewards/margins": 5.748046875, "rewards/rejected": -4.1875, "step": 3966 }, { "epoch": 0.7495866597382965, "grad_norm": 3.3201147277892717, "learning_rate": 2.616434073008432e-07, "logits/chosen": 3.82421875, "logits/rejected": 3.263671875, "logps/chosen": -677.5, "logps/rejected": -628.0, "loss": 0.6263, "rewards/accuracies": 0.75, "rewards/chosen": 0.99609375, "rewards/margins": 4.427734375, "rewards/rejected": -3.421875, "step": 3967 }, { "epoch": 0.7497756152865038, "grad_norm": 2.8318057506474994, "learning_rate": 2.6141560307706624e-07, "logits/chosen": 3.19921875, "logits/rejected": 3.009765625, "logps/chosen": -911.5, "logps/rejected": -1021.0, "loss": 0.6184, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4736328125, "rewards/margins": 4.33203125, "rewards/rejected": -2.861328125, "step": 3968 }, { "epoch": 0.7499645708347111, "grad_norm": 1.5066644922852812, "learning_rate": 2.6118792440175573e-07, "logits/chosen": 2.495849609375, "logits/rejected": 2.7076416015625, "logps/chosen": -672.5, "logps/rejected": -822.5, "loss": 0.4644, "rewards/accuracies": 0.9375, "rewards/chosen": 1.509765625, "rewards/margins": 5.9765625, "rewards/rejected": -4.46484375, "step": 3969 }, { "epoch": 0.7501535263829184, "grad_norm": 4.90215037694615, "learning_rate": 2.609603713739631e-07, "logits/chosen": 3.25390625, "logits/rejected": 3.2734375, "logps/chosen": -620.5, "logps/rejected": -784.0, "loss": 0.6595, "rewards/accuracies": 0.71875, "rewards/chosen": 0.072021484375, "rewards/margins": 4.16796875, "rewards/rejected": -4.09375, "step": 3970 }, { "epoch": 0.7503424819311257, "grad_norm": 2.8021184106313424, "learning_rate": 2.6073294409268565e-07, "logits/chosen": 3.52734375, "logits/rejected": 3.46875, "logps/chosen": -718.0, "logps/rejected": -1330.0, "loss": 0.6862, "rewards/accuracies": 0.625, "rewards/chosen": 0.845703125, "rewards/margins": 4.84375, "rewards/rejected": -4.0, "step": 3971 }, { "epoch": 0.750531437479333, "grad_norm": 3.3569361349152036, "learning_rate": 2.605056426568649e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.212890625, "logps/chosen": -1021.0, "logps/rejected": -840.0, "loss": 0.6285, "rewards/accuracies": 0.78125, "rewards/chosen": 0.384765625, "rewards/margins": 3.9268798828125, "rewards/rejected": -3.546875, "step": 3972 }, { "epoch": 0.7507203930275402, "grad_norm": 2.6626057606171893, "learning_rate": 2.602784671653886e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.4609375, "logps/chosen": -650.0, "logps/rejected": -713.5, "loss": 0.5441, "rewards/accuracies": 0.90625, "rewards/chosen": 0.88232421875, "rewards/margins": 5.25, "rewards/rejected": -4.37109375, "step": 3973 }, { "epoch": 0.7509093485757475, "grad_norm": 2.244312656593647, "learning_rate": 2.6005141771708915e-07, "logits/chosen": 2.900390625, "logits/rejected": 2.98046875, "logps/chosen": -734.0, "logps/rejected": -762.5, "loss": 0.6847, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4033203125, "rewards/margins": 4.546875, "rewards/rejected": -4.13671875, "step": 3974 }, { "epoch": 0.7510983041239548, "grad_norm": 2.8314133125836123, "learning_rate": 2.598244944107444e-07, "logits/chosen": 1.985107421875, "logits/rejected": 2.289794921875, "logps/chosen": -926.0, "logps/rejected": -1063.0, "loss": 0.4064, "rewards/accuracies": 0.96875, "rewards/chosen": 1.80078125, "rewards/margins": 6.265625, "rewards/rejected": -4.4609375, "step": 3975 }, { "epoch": 0.7512872596721621, "grad_norm": 3.671261274724483, "learning_rate": 2.5959769734507697e-07, "logits/chosen": 2.9453125, "logits/rejected": 2.62109375, "logps/chosen": -857.0, "logps/rejected": -889.5, "loss": 0.6065, "rewards/accuracies": 0.84375, "rewards/chosen": 0.922607421875, "rewards/margins": 3.8720703125, "rewards/rejected": -2.955078125, "step": 3976 }, { "epoch": 0.7514762152203694, "grad_norm": 3.552992992896849, "learning_rate": 2.593710266187548e-07, "logits/chosen": 3.30078125, "logits/rejected": 3.171875, "logps/chosen": -956.0, "logps/rejected": -1123.5, "loss": 0.5401, "rewards/accuracies": 0.875, "rewards/chosen": 1.3331298828125, "rewards/margins": 6.08984375, "rewards/rejected": -4.76953125, "step": 3977 }, { "epoch": 0.7516651707685766, "grad_norm": 2.3042590819025177, "learning_rate": 2.5914448233039117e-07, "logits/chosen": 3.26171875, "logits/rejected": 3.3046875, "logps/chosen": -736.0, "logps/rejected": -735.5, "loss": 0.6099, "rewards/accuracies": 0.78125, "rewards/chosen": 0.46588134765625, "rewards/margins": 4.78125, "rewards/rejected": -4.3203125, "step": 3978 }, { "epoch": 0.7518541263167839, "grad_norm": 2.4194554672694335, "learning_rate": 2.5891806457854374e-07, "logits/chosen": 3.419921875, "logits/rejected": 2.4609375, "logps/chosen": -992.5, "logps/rejected": -819.0, "loss": 0.6043, "rewards/accuracies": 0.875, "rewards/chosen": 1.1376953125, "rewards/margins": 5.66796875, "rewards/rejected": -4.53515625, "step": 3979 }, { "epoch": 0.7520430818649912, "grad_norm": 2.466142693769741, "learning_rate": 2.5869177346171556e-07, "logits/chosen": 2.888671875, "logits/rejected": 2.701171875, "logps/chosen": -693.0, "logps/rejected": -910.0, "loss": 0.6816, "rewards/accuracies": 0.8125, "rewards/chosen": -0.50146484375, "rewards/margins": 4.6455078125, "rewards/rejected": -5.14453125, "step": 3980 }, { "epoch": 0.7522320374131986, "grad_norm": 2.7799126813518815, "learning_rate": 2.584656090783544e-07, "logits/chosen": 3.125, "logits/rejected": 3.18359375, "logps/chosen": -424.5, "logps/rejected": -1972.0, "loss": 0.6709, "rewards/accuracies": 0.84375, "rewards/chosen": 0.34765625, "rewards/margins": 4.18359375, "rewards/rejected": -3.82421875, "step": 3981 }, { "epoch": 0.7524209929614059, "grad_norm": 3.2248385365196373, "learning_rate": 2.582395715268529e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.572265625, "logps/chosen": -772.5, "logps/rejected": -1443.0, "loss": 0.6158, "rewards/accuracies": 0.8125, "rewards/chosen": 0.80712890625, "rewards/margins": 6.1796875, "rewards/rejected": -5.375, "step": 3982 }, { "epoch": 0.7526099485096132, "grad_norm": 2.9848680387003346, "learning_rate": 2.580136609055484e-07, "logits/chosen": 2.974609375, "logits/rejected": 2.875, "logps/chosen": -764.0, "logps/rejected": -998.5, "loss": 0.6257, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26513671875, "rewards/margins": 7.0634765625, "rewards/rejected": -6.78515625, "step": 3983 }, { "epoch": 0.7527989040578204, "grad_norm": 3.07210553586229, "learning_rate": 2.5778787731272357e-07, "logits/chosen": 2.51025390625, "logits/rejected": 2.263671875, "logps/chosen": -685.5, "logps/rejected": -759.5, "loss": 0.5748, "rewards/accuracies": 0.875, "rewards/chosen": -0.0374755859375, "rewards/margins": 5.3125, "rewards/rejected": -5.359375, "step": 3984 }, { "epoch": 0.7529878596060277, "grad_norm": 4.393524903685142, "learning_rate": 2.575622208466052e-07, "logits/chosen": 2.880859375, "logits/rejected": 2.962890625, "logps/chosen": -958.0, "logps/rejected": -953.0, "loss": 0.5311, "rewards/accuracies": 0.84375, "rewards/chosen": 0.04296875, "rewards/margins": 4.931640625, "rewards/rejected": -4.890625, "step": 3985 }, { "epoch": 0.753176815154235, "grad_norm": 5.11770541064594, "learning_rate": 2.57336691605365e-07, "logits/chosen": 3.49609375, "logits/rejected": 3.12109375, "logps/chosen": -927.5, "logps/rejected": -829.0, "loss": 0.6301, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5546875, "rewards/margins": 4.7890625, "rewards/rejected": -4.2265625, "step": 3986 }, { "epoch": 0.7533657707024423, "grad_norm": 1.816084873194359, "learning_rate": 2.571112896871194e-07, "logits/chosen": 2.80078125, "logits/rejected": 2.97265625, "logps/chosen": -955.5, "logps/rejected": -975.5, "loss": 0.578, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4638671875, "rewards/margins": 4.29296875, "rewards/rejected": -2.828125, "step": 3987 }, { "epoch": 0.7535547262506496, "grad_norm": 3.2795860400737102, "learning_rate": 2.568860151899291e-07, "logits/chosen": 3.66796875, "logits/rejected": 2.9296875, "logps/chosen": -562.5, "logps/rejected": -718.0, "loss": 0.5454, "rewards/accuracies": 0.90625, "rewards/chosen": 0.63037109375, "rewards/margins": 6.1484375, "rewards/rejected": -5.5234375, "step": 3988 }, { "epoch": 0.7537436817988569, "grad_norm": 1.936617760036313, "learning_rate": 2.566608682118002e-07, "logits/chosen": 2.056640625, "logits/rejected": 1.654296875, "logps/chosen": -1145.5, "logps/rejected": -1072.0, "loss": 0.376, "rewards/accuracies": 0.96875, "rewards/chosen": 2.03515625, "rewards/margins": 6.890625, "rewards/rejected": -4.84765625, "step": 3989 }, { "epoch": 0.7539326373470641, "grad_norm": 2.689824758016074, "learning_rate": 2.5643584885068203e-07, "logits/chosen": 3.41796875, "logits/rejected": 2.923828125, "logps/chosen": -730.5, "logps/rejected": -735.0, "loss": 0.613, "rewards/accuracies": 0.8125, "rewards/chosen": 0.74365234375, "rewards/margins": 4.3720703125, "rewards/rejected": -3.6240234375, "step": 3990 }, { "epoch": 0.7541215928952714, "grad_norm": 5.1286402318850115, "learning_rate": 2.5621095720446954e-07, "logits/chosen": 2.244140625, "logits/rejected": 2.197265625, "logps/chosen": -898.0, "logps/rejected": -1028.5, "loss": 0.5596, "rewards/accuracies": 0.875, "rewards/chosen": 0.734375, "rewards/margins": 5.42578125, "rewards/rejected": -4.69921875, "step": 3991 }, { "epoch": 0.7543105484434787, "grad_norm": 2.722479807397377, "learning_rate": 2.55986193371002e-07, "logits/chosen": 2.263671875, "logits/rejected": 2.19921875, "logps/chosen": -637.0, "logps/rejected": -798.75, "loss": 0.5593, "rewards/accuracies": 0.71875, "rewards/chosen": 0.61767578125, "rewards/margins": 3.931640625, "rewards/rejected": -3.314453125, "step": 3992 }, { "epoch": 0.754499503991686, "grad_norm": 2.4127974508667913, "learning_rate": 2.557615574480621e-07, "logits/chosen": 2.5302734375, "logits/rejected": 2.486328125, "logps/chosen": -1216.5, "logps/rejected": -1205.0, "loss": 0.6172, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46484375, "rewards/margins": 5.3447265625, "rewards/rejected": -5.79296875, "step": 3993 }, { "epoch": 0.7546884595398933, "grad_norm": 1.6762198263518788, "learning_rate": 2.5553704953337804e-07, "logits/chosen": 2.6171875, "logits/rejected": 1.8232421875, "logps/chosen": -6622.0, "logps/rejected": -1242.0, "loss": 0.5705, "rewards/accuracies": 0.78125, "rewards/chosen": -27.29296875, "rewards/margins": -22.78515625, "rewards/rejected": -4.49609375, "step": 3994 }, { "epoch": 0.7548774150881006, "grad_norm": 3.0650073950711536, "learning_rate": 2.553126697246217e-07, "logits/chosen": 1.80810546875, "logits/rejected": 1.517578125, "logps/chosen": -744.0, "logps/rejected": -826.0, "loss": 0.5509, "rewards/accuracies": 0.9375, "rewards/chosen": 0.51123046875, "rewards/margins": 5.26953125, "rewards/rejected": -4.7578125, "step": 3995 }, { "epoch": 0.7550663706363078, "grad_norm": 2.10965271212029, "learning_rate": 2.550884181194095e-07, "logits/chosen": 3.25, "logits/rejected": 3.23828125, "logps/chosen": -991.5, "logps/rejected": -1488.0, "loss": 0.451, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7431640625, "rewards/margins": 7.6875, "rewards/rejected": -5.9453125, "step": 3996 }, { "epoch": 0.7552553261845151, "grad_norm": 1.7065368118663844, "learning_rate": 2.5486429481530183e-07, "logits/chosen": 3.099609375, "logits/rejected": 2.95703125, "logps/chosen": -795.0, "logps/rejected": -746.5, "loss": 0.5673, "rewards/accuracies": 0.84375, "rewards/chosen": 1.05029296875, "rewards/margins": 4.87109375, "rewards/rejected": -3.8125, "step": 3997 }, { "epoch": 0.7554442817327224, "grad_norm": 3.6184172086693622, "learning_rate": 2.546402999098032e-07, "logits/chosen": 2.9140625, "logits/rejected": 3.08203125, "logps/chosen": -883.5, "logps/rejected": -1465.0, "loss": 0.5752, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6123046875, "rewards/margins": 4.9609375, "rewards/rejected": -4.33984375, "step": 3998 }, { "epoch": 0.7556332372809297, "grad_norm": 1.2580639816651529, "learning_rate": 2.544164335003628e-07, "logits/chosen": 1.67578125, "logits/rejected": 1.580078125, "logps/chosen": -881.0, "logps/rejected": -8863.0, "loss": 0.5318, "rewards/accuracies": 0.84375, "rewards/chosen": 1.04840087890625, "rewards/margins": -78.8046875, "rewards/rejected": 80.1875, "step": 3999 }, { "epoch": 0.755822192829137, "grad_norm": 3.1703192264909004, "learning_rate": 2.5419269568437357e-07, "logits/chosen": 3.208984375, "logits/rejected": 2.98046875, "logps/chosen": -697.0, "logps/rejected": -857.0, "loss": 0.5489, "rewards/accuracies": 0.78125, "rewards/chosen": 1.15234375, "rewards/margins": 6.4765625, "rewards/rejected": -5.31640625, "step": 4000 }, { "epoch": 0.7560111483773442, "grad_norm": 2.712477665187353, "learning_rate": 2.5396908655917236e-07, "logits/chosen": 2.09619140625, "logits/rejected": 1.88232421875, "logps/chosen": -806.0, "logps/rejected": -10971.0, "loss": 0.6129, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09228515625, "rewards/margins": -59.28125, "rewards/rejected": 59.35546875, "step": 4001 }, { "epoch": 0.7562001039255515, "grad_norm": 2.7541011480784614, "learning_rate": 2.537456062220402e-07, "logits/chosen": 2.8408203125, "logits/rejected": 3.009765625, "logps/chosen": -619.5, "logps/rejected": -1350.0, "loss": 0.4261, "rewards/accuracies": 0.875, "rewards/chosen": 0.93896484375, "rewards/margins": 8.5, "rewards/rejected": -7.546875, "step": 4002 }, { "epoch": 0.7563890594737588, "grad_norm": 2.518102355590074, "learning_rate": 2.5352225477020217e-07, "logits/chosen": 3.09375, "logits/rejected": 2.685546875, "logps/chosen": -690.0, "logps/rejected": -770.0, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.42578125, "rewards/margins": 4.16796875, "rewards/rejected": -3.7421875, "step": 4003 }, { "epoch": 0.7565780150219661, "grad_norm": 3.7055715192366327, "learning_rate": 2.5329903230082694e-07, "logits/chosen": 2.630859375, "logits/rejected": 2.232421875, "logps/chosen": -839.0, "logps/rejected": -1000.0, "loss": 0.5192, "rewards/accuracies": 0.875, "rewards/chosen": 1.05413818359375, "rewards/margins": 8.1796875, "rewards/rejected": -7.125, "step": 4004 }, { "epoch": 0.7567669705701734, "grad_norm": 2.019394105211626, "learning_rate": 2.5307593891102763e-07, "logits/chosen": 2.078125, "logits/rejected": 1.65234375, "logps/chosen": -1213.0, "logps/rejected": -1058.0, "loss": 0.5621, "rewards/accuracies": 0.875, "rewards/chosen": 0.95703125, "rewards/margins": 5.8515625, "rewards/rejected": -4.8984375, "step": 4005 }, { "epoch": 0.7569559261183807, "grad_norm": 2.3527443933422854, "learning_rate": 2.528529746978607e-07, "logits/chosen": 2.484375, "logits/rejected": 2.181640625, "logps/chosen": -575.0, "logps/rejected": -772.0, "loss": 0.6439, "rewards/accuracies": 0.6875, "rewards/chosen": 0.060546875, "rewards/margins": 6.6171875, "rewards/rejected": -6.5390625, "step": 4006 }, { "epoch": 0.7571448816665879, "grad_norm": 1.0927567534864957, "learning_rate": 2.526301397583267e-07, "logits/chosen": 3.31640625, "logits/rejected": 3.96875, "logps/chosen": -782.5, "logps/rejected": -871.0, "loss": 0.4321, "rewards/accuracies": 0.875, "rewards/chosen": 1.2607421875, "rewards/margins": 6.734375, "rewards/rejected": -5.46875, "step": 4007 }, { "epoch": 0.7573338372147952, "grad_norm": 1.9974885942073746, "learning_rate": 2.524074341893697e-07, "logits/chosen": 2.515625, "logits/rejected": 2.533203125, "logps/chosen": -661.5, "logps/rejected": -590.0, "loss": 0.7197, "rewards/accuracies": 0.625, "rewards/chosen": 0.913330078125, "rewards/margins": 3.161865234375, "rewards/rejected": -2.25048828125, "step": 4008 }, { "epoch": 0.7575227927630025, "grad_norm": 5.487901708542564, "learning_rate": 2.5218485808787757e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.47265625, "logps/chosen": -1037.5, "logps/rejected": -760.5, "loss": 0.5536, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0439453125, "rewards/margins": 5.44921875, "rewards/rejected": -4.41015625, "step": 4009 }, { "epoch": 0.7577117483112098, "grad_norm": 2.3613890438697203, "learning_rate": 2.519624115506823e-07, "logits/chosen": 3.34375, "logits/rejected": 2.90234375, "logps/chosen": -891.0, "logps/rejected": -784.0, "loss": 0.6015, "rewards/accuracies": 0.75, "rewards/chosen": 0.111328125, "rewards/margins": 5.8203125, "rewards/rejected": -5.71484375, "step": 4010 }, { "epoch": 0.7579007038594171, "grad_norm": 2.504384738163255, "learning_rate": 2.517400946745586e-07, "logits/chosen": 1.9091796875, "logits/rejected": 1.90234375, "logps/chosen": -1710.0, "logps/rejected": -1345.0, "loss": 0.5514, "rewards/accuracies": 0.84375, "rewards/chosen": 0.744140625, "rewards/margins": 5.5859375, "rewards/rejected": -4.84765625, "step": 4011 }, { "epoch": 0.7580896594076244, "grad_norm": 2.450377283945475, "learning_rate": 2.5151790755622573e-07, "logits/chosen": 3.6796875, "logits/rejected": 3.08984375, "logps/chosen": -629.0, "logps/rejected": -1014.5, "loss": 0.6045, "rewards/accuracies": 0.875, "rewards/chosen": 0.3013916015625, "rewards/margins": 6.23828125, "rewards/rejected": -5.939453125, "step": 4012 }, { "epoch": 0.7582786149558316, "grad_norm": 2.4928337829958798, "learning_rate": 2.5129585029234576e-07, "logits/chosen": 2.3203125, "logits/rejected": 3.015625, "logps/chosen": -726.0, "logps/rejected": -773.5, "loss": 0.699, "rewards/accuracies": 0.71875, "rewards/chosen": 0.248291015625, "rewards/margins": 4.69921875, "rewards/rejected": -4.453125, "step": 4013 }, { "epoch": 0.7584675705040389, "grad_norm": 2.1199285161193147, "learning_rate": 2.5107392297952454e-07, "logits/chosen": 3.41796875, "logits/rejected": 3.26171875, "logps/chosen": -1030.0, "logps/rejected": -859.0, "loss": 0.5613, "rewards/accuracies": 0.71875, "rewards/chosen": 0.46875, "rewards/margins": 5.58203125, "rewards/rejected": -5.109375, "step": 4014 }, { "epoch": 0.7586565260522462, "grad_norm": 2.553905709546698, "learning_rate": 2.508521257143119e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.14453125, "logps/chosen": -901.0, "logps/rejected": -710.0, "loss": 0.5694, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1318359375, "rewards/margins": 5.4765625, "rewards/rejected": -4.35546875, "step": 4015 }, { "epoch": 0.7588454816004535, "grad_norm": 1.8065112146945463, "learning_rate": 2.5063045859319997e-07, "logits/chosen": 1.9951171875, "logits/rejected": 1.86962890625, "logps/chosen": -894.5, "logps/rejected": -939.5, "loss": 0.6455, "rewards/accuracies": 0.78125, "rewards/chosen": 0.99609375, "rewards/margins": 4.353515625, "rewards/rejected": -3.3583984375, "step": 4016 }, { "epoch": 0.7590344371486608, "grad_norm": 3.111857987962942, "learning_rate": 2.5040892171262526e-07, "logits/chosen": 3.4296875, "logits/rejected": 2.853515625, "logps/chosen": -901.5, "logps/rejected": -915.0, "loss": 0.5778, "rewards/accuracies": 0.875, "rewards/chosen": 1.27392578125, "rewards/margins": 5.3828125, "rewards/rejected": -4.1171875, "step": 4017 }, { "epoch": 0.7592233926968681, "grad_norm": 2.8261510878785923, "learning_rate": 2.501875151689673e-07, "logits/chosen": 3.01171875, "logits/rejected": 2.994140625, "logps/chosen": -819.0, "logps/rejected": -714.5, "loss": 0.6607, "rewards/accuracies": 0.75, "rewards/chosen": 1.3701171875, "rewards/margins": 4.5703125, "rewards/rejected": -3.20068359375, "step": 4018 }, { "epoch": 0.7594123482450753, "grad_norm": 1.998963836432145, "learning_rate": 2.499662390585485e-07, "logits/chosen": 3.37109375, "logits/rejected": 2.73046875, "logps/chosen": -886.0, "logps/rejected": -735.5, "loss": 0.5355, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1143798828125, "rewards/margins": 5.71875, "rewards/rejected": -4.59765625, "step": 4019 }, { "epoch": 0.7596013037932826, "grad_norm": 2.4031334103774946, "learning_rate": 2.497450934776355e-07, "logits/chosen": 2.228515625, "logits/rejected": 2.287109375, "logps/chosen": -14079.0, "logps/rejected": -27983.0, "loss": 0.5676, "rewards/accuracies": 0.8125, "rewards/chosen": 166.80322265625, "rewards/margins": -159.3125, "rewards/rejected": 326.65625, "step": 4020 }, { "epoch": 0.7597902593414899, "grad_norm": 1.8606305186428203, "learning_rate": 2.4952407852243684e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.783203125, "logps/chosen": -589.0, "logps/rejected": -734.0, "loss": 0.5886, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1396484375, "rewards/margins": 4.884765625, "rewards/rejected": -4.74609375, "step": 4021 }, { "epoch": 0.7599792148896972, "grad_norm": 1.820816184350471, "learning_rate": 2.493031942891055e-07, "logits/chosen": 2.29296875, "logits/rejected": 1.84765625, "logps/chosen": -692.0, "logps/rejected": -763.0, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": 1.12451171875, "rewards/margins": 5.443359375, "rewards/rejected": -4.3046875, "step": 4022 }, { "epoch": 0.7601681704379045, "grad_norm": 2.5451430395012955, "learning_rate": 2.4908244087373684e-07, "logits/chosen": 1.9091796875, "logits/rejected": 1.23468017578125, "logps/chosen": -483.0, "logps/rejected": -484.0, "loss": 0.5923, "rewards/accuracies": 0.78125, "rewards/chosen": 0.855712890625, "rewards/margins": 4.2734375, "rewards/rejected": -3.421875, "step": 4023 }, { "epoch": 0.7603571259861117, "grad_norm": 2.060324458180642, "learning_rate": 2.4886181837236944e-07, "logits/chosen": 2.90625, "logits/rejected": 2.7734375, "logps/chosen": -920.0, "logps/rejected": -1828.5, "loss": 0.4859, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4742431640625, "rewards/margins": 10.8046875, "rewards/rejected": -9.3046875, "step": 4024 }, { "epoch": 0.760546081534319, "grad_norm": 3.367198238291662, "learning_rate": 2.486413268809851e-07, "logits/chosen": 3.78515625, "logits/rejected": 3.4375, "logps/chosen": -571.0, "logps/rejected": -667.5, "loss": 0.6371, "rewards/accuracies": 0.8125, "rewards/chosen": 0.34814453125, "rewards/margins": 3.5390625, "rewards/rejected": -3.189453125, "step": 4025 }, { "epoch": 0.7607350370825263, "grad_norm": 2.4700771563893515, "learning_rate": 2.484209664955083e-07, "logits/chosen": 3.23046875, "logits/rejected": 2.443359375, "logps/chosen": -947.0, "logps/rejected": -819.0, "loss": 0.5619, "rewards/accuracies": 0.875, "rewards/chosen": 0.92730712890625, "rewards/margins": 4.5859375, "rewards/rejected": -3.66015625, "step": 4026 }, { "epoch": 0.7609239926307336, "grad_norm": 2.4801947135336593, "learning_rate": 2.4820073731180695e-07, "logits/chosen": 2.01171875, "logits/rejected": 1.802978515625, "logps/chosen": -845.0, "logps/rejected": -1766.0, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 1.22607421875, "rewards/margins": 11.0078125, "rewards/rejected": -9.7578125, "step": 4027 }, { "epoch": 0.7611129481789409, "grad_norm": 2.248759927277178, "learning_rate": 2.479806394256915e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.120849609375, "logps/chosen": -504.5, "logps/rejected": -898.5, "loss": 0.5274, "rewards/accuracies": 0.875, "rewards/chosen": 0.3209228515625, "rewards/margins": 5.234375, "rewards/rejected": -4.90625, "step": 4028 }, { "epoch": 0.7613019037271482, "grad_norm": 1.862118648841359, "learning_rate": 2.4776067293291546e-07, "logits/chosen": 1.3798828125, "logits/rejected": 1.40966796875, "logps/chosen": -594.75, "logps/rejected": -907.0, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1533203125, "rewards/margins": 4.44140625, "rewards/rejected": -3.296875, "step": 4029 }, { "epoch": 0.7614908592753554, "grad_norm": 1.778230103173364, "learning_rate": 2.4754083792917494e-07, "logits/chosen": 2.55224609375, "logits/rejected": 1.92138671875, "logps/chosen": -806.0, "logps/rejected": -747.5, "loss": 0.5225, "rewards/accuracies": 0.875, "rewards/chosen": 1.087158203125, "rewards/margins": 5.1953125, "rewards/rejected": -4.11328125, "step": 4030 }, { "epoch": 0.7616798148235627, "grad_norm": 1.9706366457977738, "learning_rate": 2.473211345101094e-07, "logits/chosen": 3.25, "logits/rejected": 2.86328125, "logps/chosen": -817.5, "logps/rejected": -988.0, "loss": 0.6536, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6226806640625, "rewards/margins": 4.16796875, "rewards/rejected": -3.55078125, "step": 4031 }, { "epoch": 0.76186877037177, "grad_norm": 1.5858621060237243, "learning_rate": 2.471015627713002e-07, "logits/chosen": 3.248046875, "logits/rejected": 3.0546875, "logps/chosen": -697.5, "logps/rejected": -729.5, "loss": 0.5507, "rewards/accuracies": 0.84375, "rewards/chosen": 1.404052734375, "rewards/margins": 5.6015625, "rewards/rejected": -4.19921875, "step": 4032 }, { "epoch": 0.7620577259199773, "grad_norm": 3.376945579622721, "learning_rate": 2.4688212280827216e-07, "logits/chosen": 2.87109375, "logits/rejected": 3.0166015625, "logps/chosen": -524.5, "logps/rejected": -1005.5, "loss": 0.6882, "rewards/accuracies": 0.71875, "rewards/chosen": -0.100341796875, "rewards/margins": 8.76953125, "rewards/rejected": -8.859375, "step": 4033 }, { "epoch": 0.7622466814681846, "grad_norm": 2.9170090830675472, "learning_rate": 2.466628147164926e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.7877197265625, "logps/chosen": -666.0, "logps/rejected": -6090.0, "loss": 0.5206, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33056640625, "rewards/margins": -78.3359375, "rewards/rejected": 78.6328125, "step": 4034 }, { "epoch": 0.7624356370163919, "grad_norm": 2.6160924851106966, "learning_rate": 2.4644363859137097e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.435546875, "logps/chosen": -787.5, "logps/rejected": -1572.5, "loss": 0.4772, "rewards/accuracies": 0.875, "rewards/chosen": 0.96875, "rewards/margins": 9.6328125, "rewards/rejected": -8.64453125, "step": 4035 }, { "epoch": 0.7626245925645991, "grad_norm": 2.1166870975361496, "learning_rate": 2.462245945282604e-07, "logits/chosen": 1.85986328125, "logits/rejected": 1.7237548828125, "logps/chosen": -768.0, "logps/rejected": -1277.0, "loss": 0.4532, "rewards/accuracies": 0.875, "rewards/chosen": 1.0753173828125, "rewards/margins": 5.859375, "rewards/rejected": -4.796875, "step": 4036 }, { "epoch": 0.7628135481128064, "grad_norm": 3.8357162422798194, "learning_rate": 2.460056826224551e-07, "logits/chosen": 3.17578125, "logits/rejected": 2.5625, "logps/chosen": -612.0, "logps/rejected": -838.0, "loss": 0.4531, "rewards/accuracies": 0.9375, "rewards/chosen": 1.72265625, "rewards/margins": 6.296875, "rewards/rejected": -4.57421875, "step": 4037 }, { "epoch": 0.7630025036610137, "grad_norm": 2.3711667559131744, "learning_rate": 2.4578690296919306e-07, "logits/chosen": 2.4560546875, "logits/rejected": 2.71875, "logps/chosen": -852.0, "logps/rejected": -799.0, "loss": 0.6235, "rewards/accuracies": 0.75, "rewards/chosen": 0.6749267578125, "rewards/margins": 4.109375, "rewards/rejected": -3.44140625, "step": 4038 }, { "epoch": 0.763191459209221, "grad_norm": 3.555070357915233, "learning_rate": 2.455682556636541e-07, "logits/chosen": 3.271484375, "logits/rejected": 2.8525390625, "logps/chosen": -681.0, "logps/rejected": -819.0, "loss": 0.6009, "rewards/accuracies": 0.8125, "rewards/chosen": 0.97216796875, "rewards/margins": 4.962890625, "rewards/rejected": -3.98828125, "step": 4039 }, { "epoch": 0.7633804147574283, "grad_norm": 2.30804770528488, "learning_rate": 2.453497408009605e-07, "logits/chosen": 3.259765625, "logits/rejected": 2.9609375, "logps/chosen": -698.0, "logps/rejected": -691.0, "loss": 0.5705, "rewards/accuracies": 0.8125, "rewards/chosen": 0.726806640625, "rewards/margins": 3.84765625, "rewards/rejected": -3.1171875, "step": 4040 }, { "epoch": 0.7635693703056357, "grad_norm": 3.6036256924151835, "learning_rate": 2.4513135847617733e-07, "logits/chosen": 3.216796875, "logits/rejected": 2.505859375, "logps/chosen": -506.5, "logps/rejected": -783.0, "loss": 0.4803, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0380859375, "rewards/margins": 6.125, "rewards/rejected": -5.08203125, "step": 4041 }, { "epoch": 0.7637583258538428, "grad_norm": 2.8677765356248335, "learning_rate": 2.4491310878431125e-07, "logits/chosen": 2.455078125, "logits/rejected": 2.17041015625, "logps/chosen": -697.5, "logps/rejected": -2102.0, "loss": 0.5889, "rewards/accuracies": 0.8125, "rewards/chosen": -0.146484375, "rewards/margins": 5.98828125, "rewards/rejected": -6.10546875, "step": 4042 }, { "epoch": 0.7639472814020501, "grad_norm": 2.5410319958502154, "learning_rate": 2.4469499182031197e-07, "logits/chosen": 2.5390625, "logits/rejected": 2.341796875, "logps/chosen": -783.0, "logps/rejected": -998.0, "loss": 0.5048, "rewards/accuracies": 0.90625, "rewards/chosen": 1.4873046875, "rewards/margins": 6.7890625, "rewards/rejected": -5.310546875, "step": 4043 }, { "epoch": 0.7641362369502575, "grad_norm": 2.029984801164637, "learning_rate": 2.444770076790711e-07, "logits/chosen": 2.533203125, "logits/rejected": 2.3984375, "logps/chosen": -797.5, "logps/rejected": -720.5, "loss": 0.6208, "rewards/accuracies": 0.75, "rewards/chosen": 0.826171875, "rewards/margins": 4.69140625, "rewards/rejected": -3.85546875, "step": 4044 }, { "epoch": 0.7643251924984648, "grad_norm": 1.8455362062608547, "learning_rate": 2.442591564554224e-07, "logits/chosen": 2.8515625, "logits/rejected": 3.03515625, "logps/chosen": -1028.0, "logps/rejected": -1612.0, "loss": 0.7235, "rewards/accuracies": 0.65625, "rewards/chosen": 0.684326171875, "rewards/margins": 5.52734375, "rewards/rejected": -4.845703125, "step": 4045 }, { "epoch": 0.7645141480466721, "grad_norm": 4.706460753732784, "learning_rate": 2.44041438244142e-07, "logits/chosen": 3.095703125, "logits/rejected": 2.962890625, "logps/chosen": -574.0, "logps/rejected": -781.0, "loss": 0.4945, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7890625, "rewards/margins": 6.7578125, "rewards/rejected": -5.953125, "step": 4046 }, { "epoch": 0.7647031035948793, "grad_norm": 2.1099790143481534, "learning_rate": 2.4382385313994787e-07, "logits/chosen": 2.830078125, "logits/rejected": 2.625, "logps/chosen": -813.5, "logps/rejected": -773.5, "loss": 0.6011, "rewards/accuracies": 0.78125, "rewards/chosen": 0.715576171875, "rewards/margins": 3.69921875, "rewards/rejected": -2.98828125, "step": 4047 }, { "epoch": 0.7648920591430866, "grad_norm": 2.4288909328950488, "learning_rate": 2.4360640123750064e-07, "logits/chosen": 2.6015625, "logits/rejected": 2.6640625, "logps/chosen": -642.0, "logps/rejected": -1047.5, "loss": 0.6696, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9462890625, "rewards/margins": 4.689453125, "rewards/rejected": -5.6484375, "step": 4048 }, { "epoch": 0.7650810146912939, "grad_norm": 4.19261300525415, "learning_rate": 2.433890826314024e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.90234375, "logps/chosen": -885.0, "logps/rejected": -934.0, "loss": 0.6571, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7998046875, "rewards/margins": 5.8359375, "rewards/rejected": -5.046875, "step": 4049 }, { "epoch": 0.7652699702395012, "grad_norm": 2.087028293137788, "learning_rate": 2.431718974161975e-07, "logits/chosen": 1.93359375, "logits/rejected": 1.548828125, "logps/chosen": -652.5, "logps/rejected": -728.5, "loss": 0.5816, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4248046875, "rewards/margins": 4.34375, "rewards/rejected": -3.9140625, "step": 4050 }, { "epoch": 0.7654589257877085, "grad_norm": 3.120175239014812, "learning_rate": 2.4295484568637233e-07, "logits/chosen": 2.189453125, "logits/rejected": 2.1171875, "logps/chosen": -646.5, "logps/rejected": -984.0, "loss": 0.5146, "rewards/accuracies": 0.875, "rewards/chosen": 0.8203125, "rewards/margins": 6.7890625, "rewards/rejected": -5.98046875, "step": 4051 }, { "epoch": 0.7656478813359158, "grad_norm": 3.809299453713412, "learning_rate": 2.42737927536355e-07, "logits/chosen": 2.54296875, "logits/rejected": 2.083984375, "logps/chosen": -730.0, "logps/rejected": -655.0, "loss": 0.4791, "rewards/accuracies": 0.84375, "rewards/chosen": 0.39501953125, "rewards/margins": 5.2421875, "rewards/rejected": -4.84765625, "step": 4052 }, { "epoch": 0.765836836884123, "grad_norm": 1.9108487474400504, "learning_rate": 2.4252114306051547e-07, "logits/chosen": 2.3173828125, "logits/rejected": 2.0, "logps/chosen": -732.0, "logps/rejected": -1799.5, "loss": 0.6003, "rewards/accuracies": 0.75, "rewards/chosen": -0.0482177734375, "rewards/margins": 7.830078125, "rewards/rejected": -7.8515625, "step": 4053 }, { "epoch": 0.7660257924323303, "grad_norm": 2.465870974048348, "learning_rate": 2.4230449235316603e-07, "logits/chosen": 3.41796875, "logits/rejected": 3.15625, "logps/chosen": -1010.0, "logps/rejected": -1001.0, "loss": 0.5698, "rewards/accuracies": 0.8125, "rewards/chosen": 0.77545166015625, "rewards/margins": 5.3046875, "rewards/rejected": -4.51171875, "step": 4054 }, { "epoch": 0.7662147479805376, "grad_norm": 3.3651233528740137, "learning_rate": 2.4208797550856036e-07, "logits/chosen": 2.79296875, "logits/rejected": 3.296875, "logps/chosen": -738.0, "logps/rejected": -789.5, "loss": 0.7447, "rewards/accuracies": 0.75, "rewards/chosen": -0.30029296875, "rewards/margins": 2.470703125, "rewards/rejected": -2.76953125, "step": 4055 }, { "epoch": 0.7664037035287449, "grad_norm": 2.048052318422377, "learning_rate": 2.418715926208939e-07, "logits/chosen": 3.640625, "logits/rejected": 3.8046875, "logps/chosen": -667.5, "logps/rejected": -858.0, "loss": 0.641, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1441650390625, "rewards/margins": 4.3828125, "rewards/rejected": -4.24609375, "step": 4056 }, { "epoch": 0.7665926590769522, "grad_norm": 1.4233811916280843, "learning_rate": 2.416553437843038e-07, "logits/chosen": 1.353515625, "logits/rejected": 1.560546875, "logps/chosen": -738.0, "logps/rejected": -849.0, "loss": 0.5836, "rewards/accuracies": 0.78125, "rewards/chosen": 0.725341796875, "rewards/margins": 5.251953125, "rewards/rejected": -4.52734375, "step": 4057 }, { "epoch": 0.7667816146251595, "grad_norm": 2.245917850911325, "learning_rate": 2.4143922909286904e-07, "logits/chosen": 2.8515625, "logits/rejected": 2.9140625, "logps/chosen": -750.0, "logps/rejected": -905.0, "loss": 0.6105, "rewards/accuracies": 0.84375, "rewards/chosen": 0.71435546875, "rewards/margins": 5.27734375, "rewards/rejected": -4.57421875, "step": 4058 }, { "epoch": 0.7669705701733667, "grad_norm": 1.8932687669645887, "learning_rate": 2.412232486406105e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.296875, "logps/chosen": -817.0, "logps/rejected": -928.0, "loss": 0.573, "rewards/accuracies": 0.8125, "rewards/chosen": 0.603515625, "rewards/margins": 5.8984375, "rewards/rejected": -5.29296875, "step": 4059 }, { "epoch": 0.767159525721574, "grad_norm": 2.8245665525366412, "learning_rate": 2.410074025214899e-07, "logits/chosen": 3.50390625, "logits/rejected": 2.673828125, "logps/chosen": -968.5, "logps/rejected": -880.5, "loss": 0.4822, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9599609375, "rewards/margins": 6.0234375, "rewards/rejected": -5.0546875, "step": 4060 }, { "epoch": 0.7673484812697813, "grad_norm": 2.623934245371445, "learning_rate": 2.4079169082941113e-07, "logits/chosen": 2.890625, "logits/rejected": 2.255859375, "logps/chosen": -962.0, "logps/rejected": -858.0, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": 0.60693359375, "rewards/margins": 4.59765625, "rewards/rejected": -4.0, "step": 4061 }, { "epoch": 0.7675374368179886, "grad_norm": 2.0449628863232827, "learning_rate": 2.405761136582195e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.814453125, "logps/chosen": -674.5, "logps/rejected": -713.0, "loss": 0.6236, "rewards/accuracies": 0.78125, "rewards/chosen": -0.162109375, "rewards/margins": 4.140625, "rewards/rejected": -4.3046875, "step": 4062 }, { "epoch": 0.7677263923661959, "grad_norm": 4.154713582134358, "learning_rate": 2.403606711017015e-07, "logits/chosen": 2.05859375, "logits/rejected": 2.28515625, "logps/chosen": -663.0, "logps/rejected": -662.5, "loss": 0.5782, "rewards/accuracies": 0.75, "rewards/chosen": -0.1162109375, "rewards/margins": 3.875, "rewards/rejected": -3.98828125, "step": 4063 }, { "epoch": 0.7679153479144032, "grad_norm": 2.2128699566476056, "learning_rate": 2.4014536325358573e-07, "logits/chosen": 2.068359375, "logits/rejected": 2.48291015625, "logps/chosen": -533.0, "logps/rejected": -673.0, "loss": 0.611, "rewards/accuracies": 0.78125, "rewards/chosen": -0.77685546875, "rewards/margins": 4.06640625, "rewards/rejected": -4.8359375, "step": 4064 }, { "epoch": 0.7681043034626104, "grad_norm": 3.061600620091067, "learning_rate": 2.399301902075413e-07, "logits/chosen": 2.134765625, "logits/rejected": 1.873046875, "logps/chosen": -967.0, "logps/rejected": -1640.0, "loss": 0.4416, "rewards/accuracies": 0.90625, "rewards/chosen": 0.99951171875, "rewards/margins": 7.21875, "rewards/rejected": -6.234375, "step": 4065 }, { "epoch": 0.7682932590108177, "grad_norm": 4.889631414683493, "learning_rate": 2.397151520571793e-07, "logits/chosen": 3.40625, "logits/rejected": 3.0390625, "logps/chosen": -8926.0, "logps/rejected": -1862.0, "loss": 0.6394, "rewards/accuracies": 0.875, "rewards/chosen": 59.8115234375, "rewards/margins": 70.36328125, "rewards/rejected": -10.46875, "step": 4066 }, { "epoch": 0.768482214559025, "grad_norm": 2.1180795183968, "learning_rate": 2.3950024889605196e-07, "logits/chosen": 2.537109375, "logits/rejected": 2.3720703125, "logps/chosen": -781.0, "logps/rejected": -707.5, "loss": 0.5079, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4830322265625, "rewards/margins": 5.2578125, "rewards/rejected": -4.77734375, "step": 4067 }, { "epoch": 0.7686711701072323, "grad_norm": 4.562179690434601, "learning_rate": 2.392854808176526e-07, "logits/chosen": 2.578125, "logits/rejected": 2.392578125, "logps/chosen": -1261.0, "logps/rejected": -1139.0, "loss": 0.4905, "rewards/accuracies": 0.90625, "rewards/chosen": 0.590576171875, "rewards/margins": 5.703125, "rewards/rejected": -5.10546875, "step": 4068 }, { "epoch": 0.7688601256554396, "grad_norm": 2.5256939848577633, "learning_rate": 2.390708479154161e-07, "logits/chosen": 1.9111328125, "logits/rejected": 2.361328125, "logps/chosen": -901.0, "logps/rejected": -941.0, "loss": 0.4621, "rewards/accuracies": 0.84375, "rewards/chosen": 0.78955078125, "rewards/margins": 6.44921875, "rewards/rejected": -5.65625, "step": 4069 }, { "epoch": 0.7690490812036468, "grad_norm": 3.2143089930449755, "learning_rate": 2.388563502827185e-07, "logits/chosen": 3.0859375, "logits/rejected": 2.908203125, "logps/chosen": -794.0, "logps/rejected": -1404.0, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": 0.041015625, "rewards/margins": 7.078125, "rewards/rejected": -7.02734375, "step": 4070 }, { "epoch": 0.7692380367518541, "grad_norm": 2.4887677825448793, "learning_rate": 2.386419880128767e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1328125, "logps/chosen": -1344.0, "logps/rejected": -1112.0, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": -1.25537109375, "rewards/margins": 3.082275390625, "rewards/rejected": -4.33203125, "step": 4071 }, { "epoch": 0.7694269923000614, "grad_norm": 2.630574537381171, "learning_rate": 2.384277611991488e-07, "logits/chosen": 3.0458984375, "logits/rejected": 2.61669921875, "logps/chosen": -472.5, "logps/rejected": -558.0, "loss": 0.5534, "rewards/accuracies": 0.875, "rewards/chosen": 0.4443359375, "rewards/margins": 5.671875, "rewards/rejected": -5.234375, "step": 4072 }, { "epoch": 0.7696159478482687, "grad_norm": 2.9039957148874813, "learning_rate": 2.382136699347342e-07, "logits/chosen": 1.7333984375, "logits/rejected": 1.9375, "logps/chosen": -623.0, "logps/rejected": -669.5, "loss": 0.6514, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3330078125, "rewards/margins": 3.51953125, "rewards/rejected": -3.84765625, "step": 4073 }, { "epoch": 0.769804903396476, "grad_norm": 1.7727100084642202, "learning_rate": 2.379997143127729e-07, "logits/chosen": 2.6875, "logits/rejected": 3.0234375, "logps/chosen": -912.5, "logps/rejected": -1339.0, "loss": 0.5863, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10546875, "rewards/margins": 5.5, "rewards/rejected": -5.3984375, "step": 4074 }, { "epoch": 0.7699938589446833, "grad_norm": 1.4093771781656594, "learning_rate": 2.3778589442634662e-07, "logits/chosen": 2.5546875, "logits/rejected": 2.513671875, "logps/chosen": -829.0, "logps/rejected": -1170.0, "loss": 0.5459, "rewards/accuracies": 0.8125, "rewards/chosen": 0.788818359375, "rewards/margins": 6.203125, "rewards/rejected": -5.4140625, "step": 4075 }, { "epoch": 0.7701828144928905, "grad_norm": 2.531868616715728, "learning_rate": 2.3757221036847722e-07, "logits/chosen": 2.7578125, "logits/rejected": 2.966796875, "logps/chosen": -587.0, "logps/rejected": -1482.0, "loss": 0.7313, "rewards/accuracies": 0.84375, "rewards/chosen": -0.275390625, "rewards/margins": 4.3984375, "rewards/rejected": -4.671875, "step": 4076 }, { "epoch": 0.7703717700410978, "grad_norm": 4.604599080536469, "learning_rate": 2.3735866223212797e-07, "logits/chosen": 2.662109375, "logits/rejected": 2.310546875, "logps/chosen": -696.5, "logps/rejected": -668.0, "loss": 0.6715, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6046142578125, "rewards/margins": 3.419921875, "rewards/rejected": -4.0244140625, "step": 4077 }, { "epoch": 0.7705607255893051, "grad_norm": 1.5598831851435795, "learning_rate": 2.3714525011020283e-07, "logits/chosen": 2.212890625, "logits/rejected": 2.056640625, "logps/chosen": -883.0, "logps/rejected": -1170.0, "loss": 0.7254, "rewards/accuracies": 0.8125, "rewards/chosen": -0.572265625, "rewards/margins": 5.6328125, "rewards/rejected": -6.1875, "step": 4078 }, { "epoch": 0.7707496811375124, "grad_norm": 2.9742996993982844, "learning_rate": 2.369319740955464e-07, "logits/chosen": 3.58203125, "logits/rejected": 3.130859375, "logps/chosen": -1128.0, "logps/rejected": -1136.0, "loss": 0.5654, "rewards/accuracies": 0.75, "rewards/chosen": 0.43414306640625, "rewards/margins": 4.578125, "rewards/rejected": -4.1484375, "step": 4079 }, { "epoch": 0.7709386366857197, "grad_norm": 2.6323206400619426, "learning_rate": 2.367188342809448e-07, "logits/chosen": 3.08203125, "logits/rejected": 3.32421875, "logps/chosen": -865.0, "logps/rejected": -1440.0, "loss": 0.5927, "rewards/accuracies": 0.8125, "rewards/chosen": 0.200927734375, "rewards/margins": 7.1640625, "rewards/rejected": -6.96484375, "step": 4080 }, { "epoch": 0.771127592233927, "grad_norm": 4.607566805329909, "learning_rate": 2.3650583075912372e-07, "logits/chosen": 3.2734375, "logits/rejected": 2.720703125, "logps/chosen": -975.5, "logps/rejected": -1269.0, "loss": 0.4574, "rewards/accuracies": 0.875, "rewards/chosen": 1.06494140625, "rewards/margins": 6.6875, "rewards/rejected": -5.6171875, "step": 4081 }, { "epoch": 0.7713165477821342, "grad_norm": 2.6263086965585436, "learning_rate": 2.3629296362275073e-07, "logits/chosen": 2.9296875, "logits/rejected": 2.94140625, "logps/chosen": -416.5, "logps/rejected": -487.0, "loss": 0.6898, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07080078125, "rewards/margins": 3.08984375, "rewards/rejected": -3.15234375, "step": 4082 }, { "epoch": 0.7715055033303415, "grad_norm": 3.330382219980237, "learning_rate": 2.3608023296443336e-07, "logits/chosen": 2.998046875, "logits/rejected": 2.8125, "logps/chosen": -641.5, "logps/rejected": -704.5, "loss": 0.5219, "rewards/accuracies": 0.84375, "rewards/chosen": 0.60546875, "rewards/margins": 5.46875, "rewards/rejected": -4.87109375, "step": 4083 }, { "epoch": 0.7716944588785488, "grad_norm": 1.857496590280493, "learning_rate": 2.358676388767198e-07, "logits/chosen": 2.474609375, "logits/rejected": 2.484375, "logps/chosen": -748.0, "logps/rejected": -777.5, "loss": 0.5841, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0185546875, "rewards/margins": 4.84765625, "rewards/rejected": -3.828125, "step": 4084 }, { "epoch": 0.7718834144267561, "grad_norm": 4.354590094510454, "learning_rate": 2.356551814520994e-07, "logits/chosen": 3.3125, "logits/rejected": 3.24609375, "logps/chosen": -965.5, "logps/rejected": -841.5, "loss": 0.7311, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09619140625, "rewards/margins": 3.64453125, "rewards/rejected": -3.73828125, "step": 4085 }, { "epoch": 0.7720723699749634, "grad_norm": 3.161446746489387, "learning_rate": 2.3544286078300117e-07, "logits/chosen": 3.21484375, "logits/rejected": 2.63671875, "logps/chosen": -1090.0, "logps/rejected": -1031.0, "loss": 0.506, "rewards/accuracies": 0.875, "rewards/chosen": 0.7060546875, "rewards/margins": 5.9609375, "rewards/rejected": -5.265625, "step": 4086 }, { "epoch": 0.7722613255231707, "grad_norm": 1.8484187185389511, "learning_rate": 2.352306769617953e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.52734375, "logps/chosen": -922.0, "logps/rejected": -954.5, "loss": 0.6176, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2763671875, "rewards/margins": 4.73828125, "rewards/rejected": -4.4609375, "step": 4087 }, { "epoch": 0.7724502810713779, "grad_norm": 3.23939717584333, "learning_rate": 2.3501863008079228e-07, "logits/chosen": 2.669921875, "logits/rejected": 2.22265625, "logps/chosen": -901.5, "logps/rejected": -1079.0, "loss": 0.5282, "rewards/accuracies": 0.8125, "rewards/chosen": 1.05224609375, "rewards/margins": 5.05078125, "rewards/rejected": -3.99609375, "step": 4088 }, { "epoch": 0.7726392366195852, "grad_norm": 3.5649450736994908, "learning_rate": 2.3480672023224279e-07, "logits/chosen": 3.1171875, "logits/rejected": 2.71484375, "logps/chosen": -758.5, "logps/rejected": -800.5, "loss": 0.4672, "rewards/accuracies": 0.875, "rewards/chosen": 0.26953125, "rewards/margins": 5.5078125, "rewards/rejected": -5.25, "step": 4089 }, { "epoch": 0.7728281921677925, "grad_norm": 1.6910649040445334, "learning_rate": 2.345949475083385e-07, "logits/chosen": 2.166015625, "logits/rejected": 2.111328125, "logps/chosen": -1022.0, "logps/rejected": -894.0, "loss": 0.5509, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1190185546875, "rewards/margins": 5.328125, "rewards/rejected": -5.1953125, "step": 4090 }, { "epoch": 0.7730171477159998, "grad_norm": 4.045608032752789, "learning_rate": 2.343833120012104e-07, "logits/chosen": 3.12109375, "logits/rejected": 2.666015625, "logps/chosen": -865.0, "logps/rejected": -891.0, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37841796875, "rewards/margins": 6.21875, "rewards/rejected": -6.5859375, "step": 4091 }, { "epoch": 0.7732061032642071, "grad_norm": 2.223560798520867, "learning_rate": 2.3417181380293082e-07, "logits/chosen": 3.59375, "logits/rejected": 3.3046875, "logps/chosen": -689.5, "logps/rejected": -617.0, "loss": 0.6561, "rewards/accuracies": 0.75, "rewards/chosen": 0.2117919921875, "rewards/margins": 3.53125, "rewards/rejected": -3.32421875, "step": 4092 }, { "epoch": 0.7733950588124143, "grad_norm": 3.8433545916497214, "learning_rate": 2.3396045300551184e-07, "logits/chosen": 3.55859375, "logits/rejected": 3.23046875, "logps/chosen": -780.5, "logps/rejected": -1074.0, "loss": 0.557, "rewards/accuracies": 0.8125, "rewards/chosen": 0.085693359375, "rewards/margins": 6.1171875, "rewards/rejected": -6.0390625, "step": 4093 }, { "epoch": 0.7735840143606216, "grad_norm": 3.9764923981562776, "learning_rate": 2.3374922970090584e-07, "logits/chosen": 3.12890625, "logits/rejected": 3.05859375, "logps/chosen": -811.0, "logps/rejected": -897.0, "loss": 0.4854, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9130859375, "rewards/margins": 5.6015625, "rewards/rejected": -4.6875, "step": 4094 }, { "epoch": 0.7737729699088289, "grad_norm": 2.0035995015481864, "learning_rate": 2.3353814398100524e-07, "logits/chosen": 2.7265625, "logits/rejected": 2.8251953125, "logps/chosen": -1216.0, "logps/rejected": -1946.0, "loss": 0.6316, "rewards/accuracies": 0.6875, "rewards/chosen": 0.86767578125, "rewards/margins": 6.0703125, "rewards/rejected": -5.1875, "step": 4095 }, { "epoch": 0.7739619254570362, "grad_norm": 4.462478831398825, "learning_rate": 2.333271959376428e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.6328125, "logps/chosen": -779.5, "logps/rejected": -816.5, "loss": 0.5177, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5751953125, "rewards/margins": 5.97265625, "rewards/rejected": -5.40234375, "step": 4096 }, { "epoch": 0.7741508810052435, "grad_norm": 3.9258306771838956, "learning_rate": 2.331163856625916e-07, "logits/chosen": 1.9775390625, "logits/rejected": 1.29296875, "logps/chosen": -674.5, "logps/rejected": -705.0, "loss": 0.5037, "rewards/accuracies": 0.875, "rewards/chosen": 0.40673828125, "rewards/margins": 6.1171875, "rewards/rejected": -5.6953125, "step": 4097 }, { "epoch": 0.7743398365534508, "grad_norm": 3.1998483520617844, "learning_rate": 2.329057132475643e-07, "logits/chosen": 2.853515625, "logits/rejected": 2.48046875, "logps/chosen": -699.5, "logps/rejected": -696.0, "loss": 0.5429, "rewards/accuracies": 0.84375, "rewards/chosen": 0.439453125, "rewards/margins": 4.7109375, "rewards/rejected": -4.28125, "step": 4098 }, { "epoch": 0.774528792101658, "grad_norm": 5.316227031168214, "learning_rate": 2.3269517878421385e-07, "logits/chosen": 2.9287109375, "logits/rejected": 2.3095703125, "logps/chosen": -797.0, "logps/rejected": -735.5, "loss": 0.5667, "rewards/accuracies": 0.84375, "rewards/chosen": 0.344970703125, "rewards/margins": 4.984375, "rewards/rejected": -4.640625, "step": 4099 }, { "epoch": 0.7747177476498653, "grad_norm": 2.533611597003657, "learning_rate": 2.324847823641331e-07, "logits/chosen": 3.390625, "logits/rejected": 3.0234375, "logps/chosen": -1326.0, "logps/rejected": -1669.0, "loss": 0.4927, "rewards/accuracies": 0.875, "rewards/chosen": 0.1328125, "rewards/margins": 7.1796875, "rewards/rejected": -7.0390625, "step": 4100 }, { "epoch": 0.7749067031980726, "grad_norm": 2.2106637244253537, "learning_rate": 2.3227452407885493e-07, "logits/chosen": 3.328125, "logits/rejected": 2.880859375, "logps/chosen": -972.0, "logps/rejected": -964.0, "loss": 0.5754, "rewards/accuracies": 0.84375, "rewards/chosen": -0.17333984375, "rewards/margins": 4.73046875, "rewards/rejected": -4.8984375, "step": 4101 }, { "epoch": 0.77509565874628, "grad_norm": 3.9324856674700337, "learning_rate": 2.320644040198519e-07, "logits/chosen": 3.16796875, "logits/rejected": 3.576171875, "logps/chosen": -1018.0, "logps/rejected": -1073.0, "loss": 0.6254, "rewards/accuracies": 0.875, "rewards/chosen": 0.5791015625, "rewards/margins": 5.65625, "rewards/rejected": -5.078125, "step": 4102 }, { "epoch": 0.7752846142944873, "grad_norm": 4.4843358722058175, "learning_rate": 2.31854422278537e-07, "logits/chosen": 2.74609375, "logits/rejected": 2.6796875, "logps/chosen": -850.0, "logps/rejected": -673.5, "loss": 0.5992, "rewards/accuracies": 0.875, "rewards/chosen": 1.08447265625, "rewards/margins": 4.80078125, "rewards/rejected": -3.7109375, "step": 4103 }, { "epoch": 0.7754735698426946, "grad_norm": 2.8319792238691854, "learning_rate": 2.3164457894626237e-07, "logits/chosen": 2.91015625, "logits/rejected": 2.5234375, "logps/chosen": -1018.5, "logps/rejected": -1024.0, "loss": 0.4029, "rewards/accuracies": 0.8125, "rewards/chosen": 1.375, "rewards/margins": 6.0, "rewards/rejected": -4.6171875, "step": 4104 }, { "epoch": 0.7756625253909017, "grad_norm": 2.1441679476558257, "learning_rate": 2.3143487411432012e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.0107421875, "logps/chosen": -962.0, "logps/rejected": -1744.0, "loss": 0.6206, "rewards/accuracies": 0.875, "rewards/chosen": -0.651123046875, "rewards/margins": 4.99609375, "rewards/rejected": -5.6484375, "step": 4105 }, { "epoch": 0.775851480939109, "grad_norm": 2.1901514564942954, "learning_rate": 2.3122530787394272e-07, "logits/chosen": 2.4072265625, "logits/rejected": 2.609375, "logps/chosen": -606.0, "logps/rejected": -947.5, "loss": 0.6149, "rewards/accuracies": 0.75, "rewards/chosen": 0.23046875, "rewards/margins": 3.73828125, "rewards/rejected": -3.5078125, "step": 4106 }, { "epoch": 0.7760404364873164, "grad_norm": 2.34514771073029, "learning_rate": 2.310158803163012e-07, "logits/chosen": 3.4140625, "logits/rejected": 3.04296875, "logps/chosen": -811.0, "logps/rejected": -1449.0, "loss": 0.5872, "rewards/accuracies": 0.75, "rewards/chosen": 0.513671875, "rewards/margins": 6.48046875, "rewards/rejected": -5.96484375, "step": 4107 }, { "epoch": 0.7762293920355237, "grad_norm": 1.521086750962843, "learning_rate": 2.3080659153250734e-07, "logits/chosen": 3.390625, "logits/rejected": 3.42578125, "logps/chosen": -692.0, "logps/rejected": -1535.5, "loss": 0.5949, "rewards/accuracies": 0.71875, "rewards/chosen": -0.341796875, "rewards/margins": 10.4921875, "rewards/rejected": -10.8203125, "step": 4108 }, { "epoch": 0.776418347583731, "grad_norm": 5.610034033193165, "learning_rate": 2.3059744161361194e-07, "logits/chosen": 3.1875, "logits/rejected": 2.64453125, "logps/chosen": -896.0, "logps/rejected": -962.0, "loss": 0.475, "rewards/accuracies": 0.875, "rewards/chosen": 0.6656494140625, "rewards/margins": 7.4296875, "rewards/rejected": -6.76171875, "step": 4109 }, { "epoch": 0.7766073031319383, "grad_norm": 4.143085154271623, "learning_rate": 2.303884306506054e-07, "logits/chosen": 2.892578125, "logits/rejected": 2.796875, "logps/chosen": -851.0, "logps/rejected": -1033.0, "loss": 0.6238, "rewards/accuracies": 0.875, "rewards/chosen": -0.38916015625, "rewards/margins": 4.65625, "rewards/rejected": -5.0390625, "step": 4110 }, { "epoch": 0.7767962586801455, "grad_norm": 3.9163874320957595, "learning_rate": 2.301795587344183e-07, "logits/chosen": 3.03125, "logits/rejected": 2.83984375, "logps/chosen": -736.0, "logps/rejected": -850.5, "loss": 0.5411, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1435546875, "rewards/margins": 5.296875, "rewards/rejected": -5.1484375, "step": 4111 }, { "epoch": 0.7769852142283528, "grad_norm": 1.84789754746799, "learning_rate": 2.2997082595591955e-07, "logits/chosen": 3.44921875, "logits/rejected": 3.96875, "logps/chosen": -1162.0, "logps/rejected": -865.0, "loss": 0.5987, "rewards/accuracies": 0.78125, "rewards/chosen": -0.320831298828125, "rewards/margins": 4.42578125, "rewards/rejected": -4.7578125, "step": 4112 }, { "epoch": 0.7771741697765601, "grad_norm": 3.186133490544675, "learning_rate": 2.2976223240591874e-07, "logits/chosen": 2.88671875, "logits/rejected": 3.01171875, "logps/chosen": -1240.5, "logps/rejected": -941.5, "loss": 0.6238, "rewards/accuracies": 0.8125, "rewards/chosen": 1.078857421875, "rewards/margins": 4.8671875, "rewards/rejected": -3.783203125, "step": 4113 }, { "epoch": 0.7773631253247674, "grad_norm": 3.526501423607217, "learning_rate": 2.2955377817516422e-07, "logits/chosen": 2.984375, "logits/rejected": 2.7421875, "logps/chosen": -936.0, "logps/rejected": -1293.0, "loss": 0.5657, "rewards/accuracies": 0.75, "rewards/chosen": 0.87646484375, "rewards/margins": 6.06640625, "rewards/rejected": -5.18359375, "step": 4114 }, { "epoch": 0.7775520808729747, "grad_norm": 2.067349309819726, "learning_rate": 2.2934546335434385e-07, "logits/chosen": 3.064453125, "logits/rejected": 2.65625, "logps/chosen": -867.0, "logps/rejected": -1163.5, "loss": 0.6621, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3817138671875, "rewards/margins": 6.60546875, "rewards/rejected": -6.23828125, "step": 4115 }, { "epoch": 0.7777410364211819, "grad_norm": 4.492187755537079, "learning_rate": 2.2913728803408485e-07, "logits/chosen": 3.7734375, "logits/rejected": 3.45703125, "logps/chosen": -1059.0, "logps/rejected": -1399.0, "loss": 0.6262, "rewards/accuracies": 0.625, "rewards/chosen": 1.025390625, "rewards/margins": 8.140625, "rewards/rejected": -7.1171875, "step": 4116 }, { "epoch": 0.7779299919693892, "grad_norm": 3.437838613176367, "learning_rate": 2.2892925230495368e-07, "logits/chosen": 1.89404296875, "logits/rejected": 2.041015625, "logps/chosen": -558.5, "logps/rejected": -1656.0, "loss": 0.718, "rewards/accuracies": 0.75, "rewards/chosen": -0.22509765625, "rewards/margins": 5.14453125, "rewards/rejected": -5.37890625, "step": 4117 }, { "epoch": 0.7781189475175965, "grad_norm": 4.0320129356273, "learning_rate": 2.287213562574564e-07, "logits/chosen": 2.58203125, "logits/rejected": 2.548828125, "logps/chosen": -880.0, "logps/rejected": -825.5, "loss": 0.5279, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9036865234375, "rewards/margins": 5.12890625, "rewards/rejected": -4.21875, "step": 4118 }, { "epoch": 0.7783079030658038, "grad_norm": 5.1777481924664395, "learning_rate": 2.2851359998203796e-07, "logits/chosen": 2.330078125, "logits/rejected": 2.134521484375, "logps/chosen": -926.5, "logps/rejected": -796.0, "loss": 0.6589, "rewards/accuracies": 0.8125, "rewards/chosen": 1.3017578125, "rewards/margins": 4.6669921875, "rewards/rejected": -3.3681640625, "step": 4119 }, { "epoch": 0.7784968586140111, "grad_norm": 2.8332192274997166, "learning_rate": 2.283059835690825e-07, "logits/chosen": 2.9091796875, "logits/rejected": 3.33203125, "logps/chosen": -886.5, "logps/rejected": -1658.0, "loss": 0.528, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0283203125, "rewards/margins": 6.5078125, "rewards/rejected": -5.46875, "step": 4120 }, { "epoch": 0.7786858141622184, "grad_norm": 2.699552621391277, "learning_rate": 2.2809850710891353e-07, "logits/chosen": 3.2265625, "logits/rejected": 3.4609375, "logps/chosen": -827.0, "logps/rejected": -1761.0, "loss": 0.5767, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0947265625, "rewards/margins": 8.86328125, "rewards/rejected": -7.75390625, "step": 4121 }, { "epoch": 0.7788747697104256, "grad_norm": 4.378856095093868, "learning_rate": 2.2789117069179345e-07, "logits/chosen": 2.7421875, "logits/rejected": 2.81640625, "logps/chosen": -515.5, "logps/rejected": -1527.5, "loss": 0.7801, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8994140625, "rewards/margins": 7.66796875, "rewards/rejected": -8.5703125, "step": 4122 }, { "epoch": 0.7790637252586329, "grad_norm": 2.4576740267437156, "learning_rate": 2.2768397440792384e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.97265625, "logps/chosen": -641.0, "logps/rejected": -1021.0, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": 0.72900390625, "rewards/margins": 5.765625, "rewards/rejected": -5.04833984375, "step": 4123 }, { "epoch": 0.7792526808068402, "grad_norm": 1.6663031866065752, "learning_rate": 2.274769183474455e-07, "logits/chosen": 2.4375, "logits/rejected": 2.05859375, "logps/chosen": -524.5, "logps/rejected": -810.0, "loss": 0.6184, "rewards/accuracies": 0.875, "rewards/chosen": 0.250244140625, "rewards/margins": 4.74609375, "rewards/rejected": -4.4921875, "step": 4124 }, { "epoch": 0.7794416363550475, "grad_norm": 2.3944140031020904, "learning_rate": 2.2727000260043792e-07, "logits/chosen": 3.80078125, "logits/rejected": 3.330078125, "logps/chosen": -560.5, "logps/rejected": -715.5, "loss": 0.7066, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0146484375, "rewards/margins": 3.40576171875, "rewards/rejected": -3.419921875, "step": 4125 }, { "epoch": 0.7796305919032548, "grad_norm": 7.407285585432965, "learning_rate": 2.2706322725691968e-07, "logits/chosen": 3.70703125, "logits/rejected": 3.4609375, "logps/chosen": -944.5, "logps/rejected": -1045.0, "loss": 0.5662, "rewards/accuracies": 0.84375, "rewards/chosen": 1.470703125, "rewards/margins": 5.0859375, "rewards/rejected": -3.607421875, "step": 4126 }, { "epoch": 0.7798195474514621, "grad_norm": 1.314611260912561, "learning_rate": 2.268565924068482e-07, "logits/chosen": 2.841796875, "logits/rejected": 2.7578125, "logps/chosen": -895.0, "logps/rejected": -1212.0, "loss": 0.5425, "rewards/accuracies": 0.90625, "rewards/chosen": 1.1787109375, "rewards/margins": 6.61328125, "rewards/rejected": -5.4375, "step": 4127 }, { "epoch": 0.7800085029996693, "grad_norm": 2.7004682431922595, "learning_rate": 2.2665009814011984e-07, "logits/chosen": 2.76220703125, "logits/rejected": 2.686279296875, "logps/chosen": -925.5, "logps/rejected": -1082.0, "loss": 0.5592, "rewards/accuracies": 0.78125, "rewards/chosen": 1.08154296875, "rewards/margins": 5.904296875, "rewards/rejected": -4.828125, "step": 4128 }, { "epoch": 0.7801974585478766, "grad_norm": 3.694108564944772, "learning_rate": 2.2644374454657012e-07, "logits/chosen": 3.171875, "logits/rejected": 2.978515625, "logps/chosen": -559.0, "logps/rejected": -724.5, "loss": 0.6894, "rewards/accuracies": 0.65625, "rewards/chosen": 0.44287109375, "rewards/margins": 2.83203125, "rewards/rejected": -2.38671875, "step": 4129 }, { "epoch": 0.7803864140960839, "grad_norm": 3.1611682336853195, "learning_rate": 2.262375317159725e-07, "logits/chosen": 2.171875, "logits/rejected": 2.095458984375, "logps/chosen": -1091.5, "logps/rejected": -1161.0, "loss": 0.586, "rewards/accuracies": 0.84375, "rewards/chosen": 1.12109375, "rewards/margins": 5.95703125, "rewards/rejected": -4.83203125, "step": 4130 }, { "epoch": 0.7805753696442912, "grad_norm": 2.6416277544286983, "learning_rate": 2.2603145973804026e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.478515625, "logps/chosen": -811.0, "logps/rejected": -819.0, "loss": 0.681, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4443359375, "rewards/margins": 4.0546875, "rewards/rejected": -3.609375, "step": 4131 }, { "epoch": 0.7807643251924985, "grad_norm": 3.7357575370371263, "learning_rate": 2.258255287024246e-07, "logits/chosen": 2.1171875, "logits/rejected": 1.8076171875, "logps/chosen": -949.0, "logps/rejected": -794.0, "loss": 0.5671, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5037841796875, "rewards/margins": 4.96875, "rewards/rejected": -4.47265625, "step": 4132 }, { "epoch": 0.7809532807407058, "grad_norm": 2.531965748713176, "learning_rate": 2.2561973869871569e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.404296875, "logps/chosen": -942.0, "logps/rejected": -1663.0, "loss": 0.481, "rewards/accuracies": 0.78125, "rewards/chosen": 0.90380859375, "rewards/margins": 6.3515625, "rewards/rejected": -5.4453125, "step": 4133 }, { "epoch": 0.781142236288913, "grad_norm": 1.7361646454563742, "learning_rate": 2.2541408981644266e-07, "logits/chosen": 2.943359375, "logits/rejected": 2.5703125, "logps/chosen": -770.0, "logps/rejected": -1459.5, "loss": 0.5672, "rewards/accuracies": 0.8125, "rewards/chosen": 1.70703125, "rewards/margins": 6.828125, "rewards/rejected": -5.12109375, "step": 4134 }, { "epoch": 0.7813311918371203, "grad_norm": 1.8694496251479542, "learning_rate": 2.252085821450724e-07, "logits/chosen": 2.47265625, "logits/rejected": 2.4775390625, "logps/chosen": -825.0, "logps/rejected": -866.0, "loss": 0.5833, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5400390625, "rewards/margins": 4.28125, "rewards/rejected": -3.734375, "step": 4135 }, { "epoch": 0.7815201473853276, "grad_norm": 2.6525104146605587, "learning_rate": 2.2500321577401133e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.84765625, "logps/chosen": -760.0, "logps/rejected": -1024.0, "loss": 0.6495, "rewards/accuracies": 0.75, "rewards/chosen": -0.41259765625, "rewards/margins": 6.09375, "rewards/rejected": -6.5078125, "step": 4136 }, { "epoch": 0.7817091029335349, "grad_norm": 2.1561765853676995, "learning_rate": 2.2479799079260387e-07, "logits/chosen": 3.091796875, "logits/rejected": 2.8251953125, "logps/chosen": -612.0, "logps/rejected": -718.5, "loss": 0.6026, "rewards/accuracies": 0.71875, "rewards/chosen": 0.313232421875, "rewards/margins": 3.50390625, "rewards/rejected": -3.189453125, "step": 4137 }, { "epoch": 0.7818980584817422, "grad_norm": 2.3800178562364267, "learning_rate": 2.245929072901328e-07, "logits/chosen": 2.96484375, "logits/rejected": 2.541015625, "logps/chosen": -627.5, "logps/rejected": -626.5, "loss": 0.5212, "rewards/accuracies": 0.84375, "rewards/chosen": 0.871826171875, "rewards/margins": 5.47265625, "rewards/rejected": -4.59765625, "step": 4138 }, { "epoch": 0.7820870140299494, "grad_norm": 2.4042997116333193, "learning_rate": 2.2438796535582005e-07, "logits/chosen": 2.7734375, "logits/rejected": 3.02734375, "logps/chosen": -582.0, "logps/rejected": -790.0, "loss": 0.6422, "rewards/accuracies": 0.8125, "rewards/chosen": 0.633056640625, "rewards/margins": 4.037109375, "rewards/rejected": -3.39453125, "step": 4139 }, { "epoch": 0.7822759695781567, "grad_norm": 2.5488806565636337, "learning_rate": 2.241831650788249e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.76953125, "logps/chosen": -534.0, "logps/rejected": -531.5, "loss": 0.6569, "rewards/accuracies": 0.78125, "rewards/chosen": 0.392578125, "rewards/margins": 3.95703125, "rewards/rejected": -3.5546875, "step": 4140 }, { "epoch": 0.782464925126364, "grad_norm": 2.267321254869409, "learning_rate": 2.2397850654824608e-07, "logits/chosen": 2.8828125, "logits/rejected": 2.87109375, "logps/chosen": -579.0, "logps/rejected": -645.0, "loss": 0.7417, "rewards/accuracies": 0.75, "rewards/chosen": -0.0224609375, "rewards/margins": 2.9560546875, "rewards/rejected": -2.986328125, "step": 4141 }, { "epoch": 0.7826538806745713, "grad_norm": 1.674624363759356, "learning_rate": 2.2377398985311985e-07, "logits/chosen": 4.41015625, "logits/rejected": 4.43359375, "logps/chosen": -512.0, "logps/rejected": -750.0, "loss": 0.6879, "rewards/accuracies": 0.78125, "rewards/chosen": 0.478515625, "rewards/margins": 4.203125, "rewards/rejected": -3.72265625, "step": 4142 }, { "epoch": 0.7828428362227786, "grad_norm": 1.8513687639734242, "learning_rate": 2.2356961508242122e-07, "logits/chosen": 2.14453125, "logits/rejected": 2.56640625, "logps/chosen": -748.0, "logps/rejected": -841.0, "loss": 0.5466, "rewards/accuracies": 0.8125, "rewards/chosen": 0.46484375, "rewards/margins": 5.361328125, "rewards/rejected": -4.8984375, "step": 4143 }, { "epoch": 0.7830317917709859, "grad_norm": 2.1190086847620044, "learning_rate": 2.2336538232506308e-07, "logits/chosen": 2.3359375, "logits/rejected": 2.234375, "logps/chosen": -603.0, "logps/rejected": -664.5, "loss": 0.5873, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2861328125, "rewards/margins": 4.6171875, "rewards/rejected": -4.3359375, "step": 4144 }, { "epoch": 0.7832207473191931, "grad_norm": 2.4850234260645316, "learning_rate": 2.2316129166989717e-07, "logits/chosen": 3.8515625, "logits/rejected": 2.931640625, "logps/chosen": -818.0, "logps/rejected": -829.0, "loss": 0.5129, "rewards/accuracies": 0.875, "rewards/chosen": 0.21044921875, "rewards/margins": 5.45703125, "rewards/rejected": -5.2578125, "step": 4145 }, { "epoch": 0.7834097028674004, "grad_norm": 2.096426969033016, "learning_rate": 2.2295734320571275e-07, "logits/chosen": 2.35546875, "logits/rejected": 1.83203125, "logps/chosen": -803.0, "logps/rejected": -854.0, "loss": 0.5643, "rewards/accuracies": 0.8125, "rewards/chosen": 0.889892578125, "rewards/margins": 4.9765625, "rewards/rejected": -4.072265625, "step": 4146 }, { "epoch": 0.7835986584156077, "grad_norm": 2.10215783998423, "learning_rate": 2.227535370212375e-07, "logits/chosen": 2.775390625, "logits/rejected": 1.8857421875, "logps/chosen": -881.0, "logps/rejected": -676.5, "loss": 0.5357, "rewards/accuracies": 0.90625, "rewards/chosen": 0.51544189453125, "rewards/margins": 5.11328125, "rewards/rejected": -4.59375, "step": 4147 }, { "epoch": 0.783787613963815, "grad_norm": 2.567513000481293, "learning_rate": 2.2254987320513725e-07, "logits/chosen": 2.4140625, "logits/rejected": 2.23046875, "logps/chosen": -1029.0, "logps/rejected": -885.0, "loss": 0.6064, "rewards/accuracies": 0.90625, "rewards/chosen": 0.734375, "rewards/margins": 5.234375, "rewards/rejected": -4.4921875, "step": 4148 }, { "epoch": 0.7839765695120223, "grad_norm": 2.9343926249103767, "learning_rate": 2.2234635184601553e-07, "logits/chosen": 2.9296875, "logits/rejected": 2.69140625, "logps/chosen": -853.5, "logps/rejected": -945.5, "loss": 0.5733, "rewards/accuracies": 0.8125, "rewards/chosen": 1.076171875, "rewards/margins": 4.84765625, "rewards/rejected": -3.76953125, "step": 4149 }, { "epoch": 0.7841655250602296, "grad_norm": 1.0652413898578588, "learning_rate": 2.2214297303241475e-07, "logits/chosen": 3.3515625, "logits/rejected": 3.689453125, "logps/chosen": -838.0, "logps/rejected": -1099.0, "loss": 0.5845, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0361328125, "rewards/margins": 5.9453125, "rewards/rejected": -4.9140625, "step": 4150 }, { "epoch": 0.7843544806084368, "grad_norm": 2.0592691668485363, "learning_rate": 2.2193973685281414e-07, "logits/chosen": 3.38671875, "logits/rejected": 3.53515625, "logps/chosen": -773.5, "logps/rejected": -581.5, "loss": 0.5775, "rewards/accuracies": 0.78125, "rewards/chosen": 0.206787109375, "rewards/margins": 5.0546875, "rewards/rejected": -4.8359375, "step": 4151 }, { "epoch": 0.7845434361566441, "grad_norm": 1.9468760710104318, "learning_rate": 2.217366433956318e-07, "logits/chosen": 3.30078125, "logits/rejected": 3.0625, "logps/chosen": -1124.0, "logps/rejected": -1099.5, "loss": 0.5788, "rewards/accuracies": 0.875, "rewards/chosen": 0.8720703125, "rewards/margins": 6.37109375, "rewards/rejected": -5.5, "step": 4152 }, { "epoch": 0.7847323917048514, "grad_norm": 1.944308395241838, "learning_rate": 2.2153369274922346e-07, "logits/chosen": 2.498046875, "logits/rejected": 2.193359375, "logps/chosen": -937.0, "logps/rejected": -734.0, "loss": 0.5471, "rewards/accuracies": 0.84375, "rewards/chosen": 1.419921875, "rewards/margins": 5.203125, "rewards/rejected": -3.78515625, "step": 4153 }, { "epoch": 0.7849213472530587, "grad_norm": 1.8158784706399544, "learning_rate": 2.2133088500188236e-07, "logits/chosen": 2.7158203125, "logits/rejected": 2.48046875, "logps/chosen": -965.0, "logps/rejected": -1010.0, "loss": 0.3913, "rewards/accuracies": 0.9375, "rewards/chosen": 1.10546875, "rewards/margins": 7.734375, "rewards/rejected": -6.640625, "step": 4154 }, { "epoch": 0.785110302801266, "grad_norm": 2.224531338576728, "learning_rate": 2.2112822024184042e-07, "logits/chosen": 3.01953125, "logits/rejected": 3.083984375, "logps/chosen": -969.0, "logps/rejected": -1751.0, "loss": 0.5414, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2734375, "rewards/margins": 7.546875, "rewards/rejected": -7.2734375, "step": 4155 }, { "epoch": 0.7852992583494733, "grad_norm": 3.6931723654742323, "learning_rate": 2.2092569855726617e-07, "logits/chosen": 2.3203125, "logits/rejected": 1.7265625, "logps/chosen": -782.0, "logps/rejected": -869.5, "loss": 0.555, "rewards/accuracies": 0.78125, "rewards/chosen": 0.38104248046875, "rewards/margins": 4.7109375, "rewards/rejected": -4.32421875, "step": 4156 }, { "epoch": 0.7854882138976805, "grad_norm": 3.689737782858233, "learning_rate": 2.2072332003626705e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.1767578125, "logps/chosen": -882.0, "logps/rejected": -827.0, "loss": 0.5881, "rewards/accuracies": 0.78125, "rewards/chosen": 0.148651123046875, "rewards/margins": 5.15234375, "rewards/rejected": -5.01171875, "step": 4157 }, { "epoch": 0.7856771694458878, "grad_norm": 4.146788144843757, "learning_rate": 2.2052108476688754e-07, "logits/chosen": 2.193359375, "logits/rejected": 2.3203125, "logps/chosen": -852.0, "logps/rejected": -1128.0, "loss": 0.5323, "rewards/accuracies": 0.90625, "rewards/chosen": 0.88134765625, "rewards/margins": 5.34375, "rewards/rejected": -4.46875, "step": 4158 }, { "epoch": 0.7858661249940951, "grad_norm": 2.216794603444979, "learning_rate": 2.2031899283710982e-07, "logits/chosen": 2.31640625, "logits/rejected": 2.33984375, "logps/chosen": -775.0, "logps/rejected": -702.5, "loss": 0.5955, "rewards/accuracies": 0.78125, "rewards/chosen": 0.73046875, "rewards/margins": 4.3359375, "rewards/rejected": -3.59765625, "step": 4159 }, { "epoch": 0.7860550805423024, "grad_norm": 3.1169325788611886, "learning_rate": 2.201170443348543e-07, "logits/chosen": 2.6142578125, "logits/rejected": 2.8046875, "logps/chosen": -718.0, "logps/rejected": -1574.5, "loss": 0.5965, "rewards/accuracies": 0.84375, "rewards/chosen": 0.546539306640625, "rewards/margins": 7.171875, "rewards/rejected": -6.625, "step": 4160 }, { "epoch": 0.7862440360905097, "grad_norm": 2.3692244261540476, "learning_rate": 2.1991523934797796e-07, "logits/chosen": 2.3359375, "logits/rejected": 1.974609375, "logps/chosen": -834.0, "logps/rejected": -735.0, "loss": 0.494, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6337890625, "rewards/margins": 5.34375, "rewards/rejected": -4.71875, "step": 4161 }, { "epoch": 0.7864329916387169, "grad_norm": 2.847283592327029, "learning_rate": 2.1971357796427637e-07, "logits/chosen": 1.841796875, "logits/rejected": 1.49609375, "logps/chosen": -1013.0, "logps/rejected": -1165.0, "loss": 0.5441, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66064453125, "rewards/margins": 6.12109375, "rewards/rejected": -5.46875, "step": 4162 }, { "epoch": 0.7866219471869242, "grad_norm": 2.2903759324490687, "learning_rate": 2.1951206027148205e-07, "logits/chosen": 2.2900390625, "logits/rejected": 2.0625, "logps/chosen": -565.0, "logps/rejected": -671.5, "loss": 0.5724, "rewards/accuracies": 0.84375, "rewards/chosen": 0.017578125, "rewards/margins": 4.58203125, "rewards/rejected": -4.55078125, "step": 4163 }, { "epoch": 0.7868109027351315, "grad_norm": 2.2371859050715566, "learning_rate": 2.1931068635726507e-07, "logits/chosen": 2.5859375, "logits/rejected": 2.109375, "logps/chosen": -607.5, "logps/rejected": -550.0, "loss": 0.5596, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8037109375, "rewards/margins": 3.94140625, "rewards/rejected": -3.1328125, "step": 4164 }, { "epoch": 0.7869998582833388, "grad_norm": 1.9192612921384655, "learning_rate": 2.1910945630923338e-07, "logits/chosen": 2.490234375, "logits/rejected": 2.28125, "logps/chosen": -747.5, "logps/rejected": -889.0, "loss": 0.5565, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0052490234375, "rewards/margins": 5.69140625, "rewards/rejected": -4.69140625, "step": 4165 }, { "epoch": 0.7871888138315462, "grad_norm": 1.8067565599263578, "learning_rate": 2.1890837021493147e-07, "logits/chosen": 2.67578125, "logits/rejected": 2.3115234375, "logps/chosen": -977.5, "logps/rejected": -1007.0, "loss": 0.5677, "rewards/accuracies": 0.6875, "rewards/chosen": 1.2589111328125, "rewards/margins": 5.1953125, "rewards/rejected": -3.93359375, "step": 4166 }, { "epoch": 0.7873777693797535, "grad_norm": 2.725696852194184, "learning_rate": 2.1870742816184213e-07, "logits/chosen": 3.0703125, "logits/rejected": 3.05078125, "logps/chosen": -792.0, "logps/rejected": -769.0, "loss": 0.6472, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5908203125, "rewards/margins": 3.888427734375, "rewards/rejected": -3.30078125, "step": 4167 }, { "epoch": 0.7875667249279606, "grad_norm": 4.161079040789319, "learning_rate": 2.1850663023738492e-07, "logits/chosen": 3.107421875, "logits/rejected": 3.26953125, "logps/chosen": -529.0, "logps/rejected": -1598.5, "loss": 0.6474, "rewards/accuracies": 0.78125, "rewards/chosen": -0.262451171875, "rewards/margins": 5.7578125, "rewards/rejected": -6.01953125, "step": 4168 }, { "epoch": 0.787755680476168, "grad_norm": 4.420420552882745, "learning_rate": 2.1830597652891685e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.4921875, "logps/chosen": -540.0, "logps/rejected": -609.0, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": 0.927520751953125, "rewards/margins": 4.66015625, "rewards/rejected": -3.73828125, "step": 4169 }, { "epoch": 0.7879446360243753, "grad_norm": 2.7388504913856937, "learning_rate": 2.1810546712373217e-07, "logits/chosen": 3.37109375, "logits/rejected": 2.8828125, "logps/chosen": -760.0, "logps/rejected": -811.0, "loss": 0.5399, "rewards/accuracies": 0.75, "rewards/chosen": 0.461181640625, "rewards/margins": 5.1328125, "rewards/rejected": -4.671875, "step": 4170 }, { "epoch": 0.7881335915725826, "grad_norm": 2.7406207428135194, "learning_rate": 2.179051021090623e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.8828125, "logps/chosen": -933.0, "logps/rejected": -1305.0, "loss": 0.4692, "rewards/accuracies": 0.875, "rewards/chosen": 1.69140625, "rewards/margins": 7.5703125, "rewards/rejected": -5.890625, "step": 4171 }, { "epoch": 0.7883225471207899, "grad_norm": 2.876324154952636, "learning_rate": 2.1770488157207625e-07, "logits/chosen": 3.2578125, "logits/rejected": 3.06640625, "logps/chosen": -888.0, "logps/rejected": -1314.5, "loss": 0.5844, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5938720703125, "rewards/margins": 6.0205078125, "rewards/rejected": -6.6328125, "step": 4172 }, { "epoch": 0.7885115026689972, "grad_norm": 1.7108591227262508, "learning_rate": 2.1750480559987955e-07, "logits/chosen": 3.3671875, "logits/rejected": 3.0390625, "logps/chosen": -645.5, "logps/rejected": -800.0, "loss": 0.6778, "rewards/accuracies": 0.75, "rewards/chosen": 1.301025390625, "rewards/margins": 4.42578125, "rewards/rejected": -3.1328125, "step": 4173 }, { "epoch": 0.7887004582172044, "grad_norm": 2.454252359947136, "learning_rate": 2.1730487427951528e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.11328125, "logps/chosen": -598.0, "logps/rejected": -645.0, "loss": 0.586, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37353515625, "rewards/margins": 3.93359375, "rewards/rejected": -3.55859375, "step": 4174 }, { "epoch": 0.7888894137654117, "grad_norm": 2.121741725764535, "learning_rate": 2.1710508769796326e-07, "logits/chosen": 4.078125, "logits/rejected": 3.5, "logps/chosen": -982.0, "logps/rejected": -1350.0, "loss": 0.597, "rewards/accuracies": 0.78125, "rewards/chosen": 0.60302734375, "rewards/margins": 4.71875, "rewards/rejected": -4.109375, "step": 4175 }, { "epoch": 0.789078369313619, "grad_norm": 1.9840459920648712, "learning_rate": 2.1690544594214074e-07, "logits/chosen": 2.376953125, "logits/rejected": 2.8359375, "logps/chosen": -990.0, "logps/rejected": -1680.0, "loss": 0.6745, "rewards/accuracies": 0.8125, "rewards/chosen": 0.342529296875, "rewards/margins": 4.828125, "rewards/rejected": -4.48828125, "step": 4176 }, { "epoch": 0.7892673248618263, "grad_norm": 17.02307055977743, "learning_rate": 2.1670594909890143e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.16015625, "logps/chosen": -739.0, "logps/rejected": -961.0, "loss": 0.591, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5386962890625, "rewards/margins": 5.2890625, "rewards/rejected": -4.751953125, "step": 4177 }, { "epoch": 0.7894562804100336, "grad_norm": 3.247480642426645, "learning_rate": 2.1650659725503666e-07, "logits/chosen": 3.490234375, "logits/rejected": 3.435546875, "logps/chosen": -752.0, "logps/rejected": -1191.5, "loss": 0.6829, "rewards/accuracies": 0.71875, "rewards/chosen": 0.626953125, "rewards/margins": 7.728515625, "rewards/rejected": -7.125, "step": 4178 }, { "epoch": 0.7896452359582409, "grad_norm": 3.322832704798343, "learning_rate": 2.1630739049727416e-07, "logits/chosen": 2.91015625, "logits/rejected": 2.455078125, "logps/chosen": -1260.0, "logps/rejected": -789.5, "loss": 0.5769, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1533203125, "rewards/margins": 5.765625, "rewards/rejected": -4.6171875, "step": 4179 }, { "epoch": 0.7898341915064481, "grad_norm": 4.200332507928342, "learning_rate": 2.1610832891227864e-07, "logits/chosen": 3.216796875, "logits/rejected": 3.087890625, "logps/chosen": -1168.0, "logps/rejected": -1439.0, "loss": 0.5912, "rewards/accuracies": 0.84375, "rewards/chosen": 1.84375, "rewards/margins": 9.546875, "rewards/rejected": -7.7109375, "step": 4180 }, { "epoch": 0.7900231470546554, "grad_norm": 2.914876833610068, "learning_rate": 2.1590941258665207e-07, "logits/chosen": 3.3125, "logits/rejected": 2.9296875, "logps/chosen": -846.0, "logps/rejected": -1112.0, "loss": 0.649, "rewards/accuracies": 0.75, "rewards/chosen": 0.34619140625, "rewards/margins": 3.7421875, "rewards/rejected": -3.39453125, "step": 4181 }, { "epoch": 0.7902121026028627, "grad_norm": 3.2264854712571167, "learning_rate": 2.157106416069323e-07, "logits/chosen": 3.16015625, "logits/rejected": 2.89453125, "logps/chosen": -766.0, "logps/rejected": -736.5, "loss": 0.6102, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0341796875, "rewards/margins": 3.59375, "rewards/rejected": -2.556640625, "step": 4182 }, { "epoch": 0.79040105815107, "grad_norm": 4.38296545625669, "learning_rate": 2.1551201605959508e-07, "logits/chosen": 2.52734375, "logits/rejected": 2.46875, "logps/chosen": -664.5, "logps/rejected": -991.0, "loss": 0.5724, "rewards/accuracies": 0.84375, "rewards/chosen": 0.803955078125, "rewards/margins": 5.712890625, "rewards/rejected": -4.91796875, "step": 4183 }, { "epoch": 0.7905900136992773, "grad_norm": 3.7753596384024632, "learning_rate": 2.1531353603105205e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.92578125, "logps/chosen": -956.0, "logps/rejected": -825.0, "loss": 0.5162, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9794921875, "rewards/margins": 5.421875, "rewards/rejected": -4.4453125, "step": 4184 }, { "epoch": 0.7907789692474845, "grad_norm": 2.5508232482236517, "learning_rate": 2.1511520160765174e-07, "logits/chosen": 2.640625, "logits/rejected": 2.06640625, "logps/chosen": -964.0, "logps/rejected": -1015.0, "loss": 0.5475, "rewards/accuracies": 0.71875, "rewards/chosen": 0.613311767578125, "rewards/margins": 5.552734375, "rewards/rejected": -4.94140625, "step": 4185 }, { "epoch": 0.7909679247956918, "grad_norm": 2.254146183806959, "learning_rate": 2.1491701287567977e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.40234375, "logps/chosen": -1152.0, "logps/rejected": -1910.5, "loss": 0.7034, "rewards/accuracies": 0.6875, "rewards/chosen": -2.17724609375, "rewards/margins": 2.232421875, "rewards/rejected": -4.40625, "step": 4186 }, { "epoch": 0.7911568803438991, "grad_norm": 3.4076229221828807, "learning_rate": 2.1471896992135757e-07, "logits/chosen": 2.0458984375, "logits/rejected": 1.76953125, "logps/chosen": -962.0, "logps/rejected": -940.0, "loss": 0.6781, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6396484375, "rewards/margins": 4.365234375, "rewards/rejected": -3.72265625, "step": 4187 }, { "epoch": 0.7913458358921064, "grad_norm": 2.2819503951626916, "learning_rate": 2.14521072830844e-07, "logits/chosen": 2.95703125, "logits/rejected": 2.4140625, "logps/chosen": -815.5, "logps/rejected": -1329.5, "loss": 0.6658, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6083984375, "rewards/margins": 4.1171875, "rewards/rejected": -3.509765625, "step": 4188 }, { "epoch": 0.7915347914403137, "grad_norm": 5.238158452684041, "learning_rate": 2.143233216902339e-07, "logits/chosen": 3.765625, "logits/rejected": 3.921875, "logps/chosen": -669.0, "logps/rejected": -956.5, "loss": 0.5462, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7275390625, "rewards/margins": 6.875, "rewards/rejected": -5.15234375, "step": 4189 }, { "epoch": 0.791723746988521, "grad_norm": 5.0301749577779225, "learning_rate": 2.1412571658555877e-07, "logits/chosen": 1.96875, "logits/rejected": 2.431640625, "logps/chosen": -918.0, "logps/rejected": -845.0, "loss": 0.6177, "rewards/accuracies": 0.8125, "rewards/chosen": 0.75518798828125, "rewards/margins": 4.39453125, "rewards/rejected": -3.63671875, "step": 4190 }, { "epoch": 0.7919127025367282, "grad_norm": 3.709482008089718, "learning_rate": 2.1392825760278664e-07, "logits/chosen": 2.63671875, "logits/rejected": 2.328125, "logps/chosen": -1150.0, "logps/rejected": -1128.0, "loss": 0.6691, "rewards/accuracies": 0.78125, "rewards/chosen": 0.876220703125, "rewards/margins": 4.0234375, "rewards/rejected": -3.140625, "step": 4191 }, { "epoch": 0.7921016580849355, "grad_norm": 2.2883830632652633, "learning_rate": 2.1373094482782185e-07, "logits/chosen": 2.787109375, "logits/rejected": 2.484375, "logps/chosen": -519.5, "logps/rejected": -669.0, "loss": 0.5021, "rewards/accuracies": 0.875, "rewards/chosen": 0.94921875, "rewards/margins": 4.6875, "rewards/rejected": -3.73828125, "step": 4192 }, { "epoch": 0.7922906136331428, "grad_norm": 3.551438897712589, "learning_rate": 2.1353377834650532e-07, "logits/chosen": 2.23828125, "logits/rejected": 1.623046875, "logps/chosen": -804.0, "logps/rejected": -783.0, "loss": 0.5621, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7353515625, "rewards/margins": 4.47265625, "rewards/rejected": -3.7421875, "step": 4193 }, { "epoch": 0.7924795691813501, "grad_norm": 1.738927732552113, "learning_rate": 2.133367582446141e-07, "logits/chosen": 2.720703125, "logits/rejected": 2.583984375, "logps/chosen": -534.5, "logps/rejected": -2108.0, "loss": 0.604, "rewards/accuracies": 0.78125, "rewards/chosen": 0.670654296875, "rewards/margins": 6.79296875, "rewards/rejected": -6.1171875, "step": 4194 }, { "epoch": 0.7926685247295574, "grad_norm": 2.0655000501567264, "learning_rate": 2.1313988460786175e-07, "logits/chosen": 3.1484375, "logits/rejected": 3.6953125, "logps/chosen": -739.25, "logps/rejected": -762.5, "loss": 0.6134, "rewards/accuracies": 0.875, "rewards/chosen": 0.853515625, "rewards/margins": 3.76171875, "rewards/rejected": -2.90234375, "step": 4195 }, { "epoch": 0.7928574802777647, "grad_norm": 1.759360790212517, "learning_rate": 2.1294315752189802e-07, "logits/chosen": 3.65625, "logits/rejected": 3.23046875, "logps/chosen": -473.0, "logps/rejected": -403.0, "loss": 0.6279, "rewards/accuracies": 0.71875, "rewards/chosen": 0.843505859375, "rewards/margins": 3.48046875, "rewards/rejected": -2.634765625, "step": 4196 }, { "epoch": 0.7930464358259719, "grad_norm": 1.6695563427361968, "learning_rate": 2.1274657707230881e-07, "logits/chosen": 3.55078125, "logits/rejected": 2.9921875, "logps/chosen": -741.75, "logps/rejected": -592.5, "loss": 0.5508, "rewards/accuracies": 0.71875, "rewards/chosen": 1.14453125, "rewards/margins": 4.828125, "rewards/rejected": -3.68359375, "step": 4197 }, { "epoch": 0.7932353913741792, "grad_norm": 2.3900700433143203, "learning_rate": 2.125501433446163e-07, "logits/chosen": 3.515625, "logits/rejected": 3.216796875, "logps/chosen": -611.5, "logps/rejected": -699.0, "loss": 0.5603, "rewards/accuracies": 0.8125, "rewards/chosen": 0.93017578125, "rewards/margins": 4.2109375, "rewards/rejected": -3.28125, "step": 4198 }, { "epoch": 0.7934243469223865, "grad_norm": 1.9260522571569592, "learning_rate": 2.1235385642427907e-07, "logits/chosen": 2.40625, "logits/rejected": 2.2578125, "logps/chosen": -887.0, "logps/rejected": -1153.0, "loss": 0.5145, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0712890625, "rewards/margins": 4.7578125, "rewards/rejected": -3.6875, "step": 4199 }, { "epoch": 0.7936133024705938, "grad_norm": 1.856809080027446, "learning_rate": 2.1215771639669148e-07, "logits/chosen": 3.04296875, "logits/rejected": 2.337890625, "logps/chosen": -727.5, "logps/rejected": -747.5, "loss": 0.5215, "rewards/accuracies": 0.84375, "rewards/chosen": 0.98876953125, "rewards/margins": 5.5859375, "rewards/rejected": -4.587890625, "step": 4200 }, { "epoch": 0.7938022580188011, "grad_norm": 2.4953535849606325, "learning_rate": 2.119617233471842e-07, "logits/chosen": 3.90625, "logits/rejected": 3.5390625, "logps/chosen": -593.5, "logps/rejected": -666.5, "loss": 0.6686, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5048828125, "rewards/margins": 3.19921875, "rewards/rejected": -2.6845703125, "step": 4201 }, { "epoch": 0.7939912135670084, "grad_norm": 2.2825191443983366, "learning_rate": 2.117658773610238e-07, "logits/chosen": 3.26953125, "logits/rejected": 3.2734375, "logps/chosen": -832.0, "logps/rejected": -951.0, "loss": 0.7685, "rewards/accuracies": 0.6875, "rewards/chosen": 1.5009765625, "rewards/margins": 3.3671875, "rewards/rejected": -1.869140625, "step": 4202 }, { "epoch": 0.7941801691152156, "grad_norm": 2.0027277649517474, "learning_rate": 2.115701785234129e-07, "logits/chosen": 2.294921875, "logits/rejected": 1.953125, "logps/chosen": -572.0, "logps/rejected": -450.5, "loss": 0.5377, "rewards/accuracies": 0.78125, "rewards/chosen": 0.369384765625, "rewards/margins": 4.8828125, "rewards/rejected": -4.52734375, "step": 4203 }, { "epoch": 0.7943691246634229, "grad_norm": 2.6666029275144014, "learning_rate": 2.1137462691949055e-07, "logits/chosen": 3.828125, "logits/rejected": 3.2265625, "logps/chosen": -646.5, "logps/rejected": -915.0, "loss": 0.5618, "rewards/accuracies": 0.8125, "rewards/chosen": 0.300537109375, "rewards/margins": 4.2109375, "rewards/rejected": -3.90234375, "step": 4204 }, { "epoch": 0.7945580802116302, "grad_norm": 2.2320864820929076, "learning_rate": 2.1117922263433074e-07, "logits/chosen": 4.296875, "logits/rejected": 3.4375, "logps/chosen": -548.0, "logps/rejected": -581.5, "loss": 0.7408, "rewards/accuracies": 0.65625, "rewards/chosen": 0.666259765625, "rewards/margins": 2.86328125, "rewards/rejected": -2.197265625, "step": 4205 }, { "epoch": 0.7947470357598375, "grad_norm": 4.26666445228336, "learning_rate": 2.1098396575294454e-07, "logits/chosen": 3.158203125, "logits/rejected": 3.11328125, "logps/chosen": -542.25, "logps/rejected": -813.0, "loss": 0.5307, "rewards/accuracies": 0.84375, "rewards/chosen": 0.353515625, "rewards/margins": 4.375, "rewards/rejected": -4.0234375, "step": 4206 }, { "epoch": 0.7949359913080448, "grad_norm": 1.7174763244458602, "learning_rate": 2.1078885636027803e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.66015625, "logps/chosen": -791.0, "logps/rejected": -613.0, "loss": 0.4937, "rewards/accuracies": 0.875, "rewards/chosen": 0.72265625, "rewards/margins": 4.765625, "rewards/rejected": -4.0390625, "step": 4207 }, { "epoch": 0.795124946856252, "grad_norm": 2.187746914957449, "learning_rate": 2.105938945412134e-07, "logits/chosen": 2.609375, "logits/rejected": 2.265625, "logps/chosen": -739.0, "logps/rejected": -1092.0, "loss": 0.6063, "rewards/accuracies": 0.71875, "rewards/chosen": 1.3623046875, "rewards/margins": 4.47265625, "rewards/rejected": -3.1171875, "step": 4208 }, { "epoch": 0.7953139024044593, "grad_norm": 2.4943968509448102, "learning_rate": 2.1039908038056907e-07, "logits/chosen": 2.79296875, "logits/rejected": 2.21875, "logps/chosen": -1076.0, "logps/rejected": -895.0, "loss": 0.4752, "rewards/accuracies": 0.875, "rewards/chosen": 1.576171875, "rewards/margins": 5.40625, "rewards/rejected": -3.828125, "step": 4209 }, { "epoch": 0.7955028579526666, "grad_norm": 3.289115367881881, "learning_rate": 2.1020441396309823e-07, "logits/chosen": 3.21875, "logits/rejected": 3.28515625, "logps/chosen": -820.5, "logps/rejected": -1125.0, "loss": 0.6094, "rewards/accuracies": 0.78125, "rewards/chosen": 1.041015625, "rewards/margins": 4.9921875, "rewards/rejected": -3.955078125, "step": 4210 }, { "epoch": 0.7956918135008739, "grad_norm": 1.7008665220500356, "learning_rate": 2.1000989537349087e-07, "logits/chosen": 3.3125, "logits/rejected": 2.8125, "logps/chosen": -463.5, "logps/rejected": -510.0, "loss": 0.5884, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7890625, "rewards/margins": 4.234375, "rewards/rejected": -3.44140625, "step": 4211 }, { "epoch": 0.7958807690490812, "grad_norm": 2.554513091898862, "learning_rate": 2.09815524696372e-07, "logits/chosen": 2.9765625, "logits/rejected": 2.91015625, "logps/chosen": -851.5, "logps/rejected": -855.5, "loss": 0.5204, "rewards/accuracies": 0.8125, "rewards/chosen": 1.490234375, "rewards/margins": 5.828125, "rewards/rejected": -4.349609375, "step": 4212 }, { "epoch": 0.7960697245972885, "grad_norm": 3.9255685117224624, "learning_rate": 2.0962130201630235e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.708984375, "logps/chosen": -789.0, "logps/rejected": -874.5, "loss": 0.5613, "rewards/accuracies": 0.875, "rewards/chosen": 1.1181640625, "rewards/margins": 5.27734375, "rewards/rejected": -4.150390625, "step": 4213 }, { "epoch": 0.7962586801454957, "grad_norm": 4.107400584916853, "learning_rate": 2.0942722741777874e-07, "logits/chosen": 1.970703125, "logits/rejected": 2.3720703125, "logps/chosen": -823.5, "logps/rejected": -2332.0, "loss": 0.576, "rewards/accuracies": 0.8125, "rewards/chosen": 1.296875, "rewards/margins": 8.3359375, "rewards/rejected": -7.0546875, "step": 4214 }, { "epoch": 0.796447635693703, "grad_norm": 2.990630045648702, "learning_rate": 2.0923330098523273e-07, "logits/chosen": 2.349609375, "logits/rejected": 2.021484375, "logps/chosen": -818.5, "logps/rejected": -870.5, "loss": 0.5301, "rewards/accuracies": 0.84375, "rewards/chosen": 0.853515625, "rewards/margins": 5.15625, "rewards/rejected": -4.30859375, "step": 4215 }, { "epoch": 0.7966365912419103, "grad_norm": 3.324838316849701, "learning_rate": 2.0903952280303227e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.62890625, "logps/chosen": -1126.0, "logps/rejected": -1286.0, "loss": 0.536, "rewards/accuracies": 0.875, "rewards/chosen": 2.0283203125, "rewards/margins": 5.328125, "rewards/rejected": -3.302734375, "step": 4216 }, { "epoch": 0.7968255467901176, "grad_norm": 2.387748568019654, "learning_rate": 2.0884589295548026e-07, "logits/chosen": 2.482421875, "logits/rejected": 2.455078125, "logps/chosen": -674.5, "logps/rejected": -850.5, "loss": 0.5869, "rewards/accuracies": 0.84375, "rewards/chosen": 0.701904296875, "rewards/margins": 4.0703125, "rewards/rejected": -3.37109375, "step": 4217 }, { "epoch": 0.7970145023383249, "grad_norm": 2.4800855915297397, "learning_rate": 2.0865241152681533e-07, "logits/chosen": 3.66796875, "logits/rejected": 3.53125, "logps/chosen": -547.5, "logps/rejected": -777.5, "loss": 0.6651, "rewards/accuracies": 0.75, "rewards/chosen": 0.214599609375, "rewards/margins": 4.3359375, "rewards/rejected": -4.125, "step": 4218 }, { "epoch": 0.7972034578865322, "grad_norm": 3.5782064957476103, "learning_rate": 2.0845907860121133e-07, "logits/chosen": 2.517578125, "logits/rejected": 2.3974609375, "logps/chosen": -822.0, "logps/rejected": -1126.0, "loss": 0.6359, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1064453125, "rewards/margins": 6.109375, "rewards/rejected": -5.0, "step": 4219 }, { "epoch": 0.7973924134347394, "grad_norm": 2.060107787109723, "learning_rate": 2.0826589426277774e-07, "logits/chosen": 3.359375, "logits/rejected": 2.869140625, "logps/chosen": -589.25, "logps/rejected": -748.0, "loss": 0.5498, "rewards/accuracies": 0.875, "rewards/chosen": 0.98681640625, "rewards/margins": 4.921875, "rewards/rejected": -3.953125, "step": 4220 }, { "epoch": 0.7975813689829467, "grad_norm": 2.3239922486722806, "learning_rate": 2.0807285859555937e-07, "logits/chosen": 3.162109375, "logits/rejected": 2.85546875, "logps/chosen": -579.5, "logps/rejected": -748.5, "loss": 0.5477, "rewards/accuracies": 0.875, "rewards/chosen": 0.80810546875, "rewards/margins": 4.1875, "rewards/rejected": -3.37890625, "step": 4221 }, { "epoch": 0.797770324531154, "grad_norm": 1.937634215983856, "learning_rate": 2.078799716835362e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.18359375, "logps/chosen": -539.0, "logps/rejected": -624.0, "loss": 0.7249, "rewards/accuracies": 0.6875, "rewards/chosen": 0.69171142578125, "rewards/margins": 2.70703125, "rewards/rejected": -2.017578125, "step": 4222 }, { "epoch": 0.7979592800793613, "grad_norm": 2.073173994117828, "learning_rate": 2.0768723361062345e-07, "logits/chosen": 2.80859375, "logits/rejected": 2.2548828125, "logps/chosen": -677.0, "logps/rejected": -849.0, "loss": 0.603, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65185546875, "rewards/margins": 4.1796875, "rewards/rejected": -3.537109375, "step": 4223 }, { "epoch": 0.7981482356275686, "grad_norm": 1.8982733499756377, "learning_rate": 2.0749464446067173e-07, "logits/chosen": 3.25, "logits/rejected": 3.47265625, "logps/chosen": -845.5, "logps/rejected": -1106.0, "loss": 0.4646, "rewards/accuracies": 0.84375, "rewards/chosen": 1.537841796875, "rewards/margins": 8.19921875, "rewards/rejected": -6.6796875, "step": 4224 }, { "epoch": 0.798337191175776, "grad_norm": 2.802272386251645, "learning_rate": 2.0730220431746706e-07, "logits/chosen": 2.79296875, "logits/rejected": 2.96875, "logps/chosen": -725.5, "logps/rejected": -763.5, "loss": 0.5546, "rewards/accuracies": 0.75, "rewards/chosen": 1.810546875, "rewards/margins": 4.57421875, "rewards/rejected": -2.7593994140625, "step": 4225 }, { "epoch": 0.7985261467239831, "grad_norm": 2.1804491643115895, "learning_rate": 2.0710991326473004e-07, "logits/chosen": 3.62890625, "logits/rejected": 3.8125, "logps/chosen": -607.0, "logps/rejected": -909.0, "loss": 0.6038, "rewards/accuracies": 0.8125, "rewards/chosen": 0.634765625, "rewards/margins": 4.84765625, "rewards/rejected": -4.2138671875, "step": 4226 }, { "epoch": 0.7987151022721904, "grad_norm": 1.5737884709137697, "learning_rate": 2.0691777138611707e-07, "logits/chosen": 2.81640625, "logits/rejected": 2.57421875, "logps/chosen": -1164.0, "logps/rejected": -2258.0, "loss": 0.4603, "rewards/accuracies": 0.8125, "rewards/chosen": 2.3037109375, "rewards/margins": 7.71875, "rewards/rejected": -5.41015625, "step": 4227 }, { "epoch": 0.7989040578203978, "grad_norm": 3.1257777812738183, "learning_rate": 2.067257787652193e-07, "logits/chosen": 3.015625, "logits/rejected": 3.015625, "logps/chosen": -959.0, "logps/rejected": -1566.0, "loss": 0.7693, "rewards/accuracies": 0.65625, "rewards/chosen": 0.09423828125, "rewards/margins": 5.50341796875, "rewards/rejected": -5.404541015625, "step": 4228 }, { "epoch": 0.799093013368605, "grad_norm": 2.272006619408011, "learning_rate": 2.0653393548556287e-07, "logits/chosen": 3.556640625, "logits/rejected": 3.169921875, "logps/chosen": -751.0, "logps/rejected": -882.0, "loss": 0.6266, "rewards/accuracies": 0.78125, "rewards/chosen": 0.26953125, "rewards/margins": 4.2421875, "rewards/rejected": -3.970703125, "step": 4229 }, { "epoch": 0.7992819689168124, "grad_norm": 2.7072134952969575, "learning_rate": 2.0634224163060934e-07, "logits/chosen": 3.52734375, "logits/rejected": 2.982421875, "logps/chosen": -719.0, "logps/rejected": -724.0, "loss": 0.6571, "rewards/accuracies": 0.65625, "rewards/chosen": 0.50830078125, "rewards/margins": 4.52734375, "rewards/rejected": -4.02734375, "step": 4230 }, { "epoch": 0.7994709244650196, "grad_norm": 2.073502180296965, "learning_rate": 2.0615069728375468e-07, "logits/chosen": 3.06640625, "logits/rejected": 2.8203125, "logps/chosen": -645.0, "logps/rejected": -867.0, "loss": 0.5758, "rewards/accuracies": 0.8125, "rewards/chosen": 0.83251953125, "rewards/margins": 5.1015625, "rewards/rejected": -4.25390625, "step": 4231 }, { "epoch": 0.7996598800132269, "grad_norm": 2.134026236336989, "learning_rate": 2.0595930252833035e-07, "logits/chosen": 3.6171875, "logits/rejected": 3.064453125, "logps/chosen": -866.0, "logps/rejected": -862.0, "loss": 0.568, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0859375, "rewards/margins": 5.310546875, "rewards/rejected": -5.40625, "step": 4232 }, { "epoch": 0.7998488355614342, "grad_norm": 1.8900766921439431, "learning_rate": 2.057680574476025e-07, "logits/chosen": 3.08984375, "logits/rejected": 3.43359375, "logps/chosen": -784.0, "logps/rejected": -773.0, "loss": 0.5708, "rewards/accuracies": 0.8125, "rewards/chosen": 0.423828125, "rewards/margins": 5.2890625, "rewards/rejected": -4.8671875, "step": 4233 }, { "epoch": 0.8000377911096415, "grad_norm": 3.975572327494668, "learning_rate": 2.0557696212477204e-07, "logits/chosen": 3.8046875, "logits/rejected": 3.77734375, "logps/chosen": -625.5, "logps/rejected": -801.5, "loss": 0.5508, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8519287109375, "rewards/margins": 4.90234375, "rewards/rejected": -4.056640625, "step": 4234 }, { "epoch": 0.8002267466578488, "grad_norm": 3.500078658937488, "learning_rate": 2.0538601664297526e-07, "logits/chosen": 3.2734375, "logits/rejected": 3.31640625, "logps/chosen": -750.75, "logps/rejected": -909.0, "loss": 0.6494, "rewards/accuracies": 0.75, "rewards/chosen": 0.6724853515625, "rewards/margins": 7.4609375, "rewards/rejected": -6.7734375, "step": 4235 }, { "epoch": 0.8004157022060561, "grad_norm": 1.9841292161050426, "learning_rate": 2.0519522108528233e-07, "logits/chosen": 3.05078125, "logits/rejected": 3.3125, "logps/chosen": -590.0, "logps/rejected": -1456.0, "loss": 0.8193, "rewards/accuracies": 0.625, "rewards/chosen": -0.8837890625, "rewards/margins": 2.8779296875, "rewards/rejected": -3.759765625, "step": 4236 }, { "epoch": 0.8006046577542633, "grad_norm": 2.954803237561683, "learning_rate": 2.0500457553469918e-07, "logits/chosen": 2.8828125, "logits/rejected": 2.234375, "logps/chosen": -691.0, "logps/rejected": -8666.0, "loss": 0.5526, "rewards/accuracies": 0.75, "rewards/chosen": 0.2840576171875, "rewards/margins": -21.05078125, "rewards/rejected": 21.3046875, "step": 4237 }, { "epoch": 0.8007936133024706, "grad_norm": 3.5714985959532766, "learning_rate": 2.0481408007416588e-07, "logits/chosen": 2.14453125, "logits/rejected": 1.716796875, "logps/chosen": -790.5, "logps/rejected": -1156.0, "loss": 0.4041, "rewards/accuracies": 0.90625, "rewards/chosen": 1.51171875, "rewards/margins": 8.9921875, "rewards/rejected": -7.4765625, "step": 4238 }, { "epoch": 0.8009825688506779, "grad_norm": 2.4709689277931486, "learning_rate": 2.0462373478655737e-07, "logits/chosen": 2.16796875, "logits/rejected": 2.123046875, "logps/chosen": -557.5, "logps/rejected": -989.0, "loss": 0.6008, "rewards/accuracies": 0.875, "rewards/chosen": -0.1370849609375, "rewards/margins": 5.44921875, "rewards/rejected": -5.59375, "step": 4239 }, { "epoch": 0.8011715243988852, "grad_norm": 2.1108476794158375, "learning_rate": 2.0443353975468326e-07, "logits/chosen": 3.39453125, "logits/rejected": 3.30859375, "logps/chosen": -1085.0, "logps/rejected": -1207.0, "loss": 0.501, "rewards/accuracies": 0.78125, "rewards/chosen": 1.031494140625, "rewards/margins": 7.08203125, "rewards/rejected": -6.05078125, "step": 4240 }, { "epoch": 0.8013604799470925, "grad_norm": 2.328285587824618, "learning_rate": 2.0424349506128763e-07, "logits/chosen": 2.28515625, "logits/rejected": 2.234375, "logps/chosen": -1070.0, "logps/rejected": -1244.0, "loss": 0.513, "rewards/accuracies": 0.875, "rewards/chosen": 0.77099609375, "rewards/margins": 6.171875, "rewards/rejected": -5.3984375, "step": 4241 }, { "epoch": 0.8015494354952998, "grad_norm": 2.026525876610239, "learning_rate": 2.0405360078904958e-07, "logits/chosen": 3.451171875, "logits/rejected": 2.5078125, "logps/chosen": -665.0, "logps/rejected": -756.0, "loss": 0.4454, "rewards/accuracies": 0.9375, "rewards/chosen": 2.080078125, "rewards/margins": 6.0078125, "rewards/rejected": -3.92578125, "step": 4242 }, { "epoch": 0.801738391043507, "grad_norm": 2.1066424176558503, "learning_rate": 2.0386385702058236e-07, "logits/chosen": 2.248046875, "logits/rejected": 1.748046875, "logps/chosen": -978.0, "logps/rejected": -744.0, "loss": 0.5487, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7591552734375, "rewards/margins": 5.11328125, "rewards/rejected": -4.35546875, "step": 4243 }, { "epoch": 0.8019273465917143, "grad_norm": 1.9695960677212672, "learning_rate": 2.036742638384339e-07, "logits/chosen": 2.30859375, "logits/rejected": 2.33203125, "logps/chosen": -694.5, "logps/rejected": -812.0, "loss": 0.5457, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3642578125, "rewards/margins": 4.48828125, "rewards/rejected": -4.140625, "step": 4244 }, { "epoch": 0.8021163021399216, "grad_norm": 2.673152706560901, "learning_rate": 2.034848213250866e-07, "logits/chosen": 2.23046875, "logits/rejected": 2.009765625, "logps/chosen": -899.0, "logps/rejected": -1128.0, "loss": 0.5262, "rewards/accuracies": 0.84375, "rewards/chosen": 0.18603515625, "rewards/margins": 4.78125, "rewards/rejected": -4.59765625, "step": 4245 }, { "epoch": 0.8023052576881289, "grad_norm": 2.0675952881371273, "learning_rate": 2.032955295629573e-07, "logits/chosen": 2.07379150390625, "logits/rejected": 2.1611328125, "logps/chosen": -808.0, "logps/rejected": -1096.0, "loss": 0.4845, "rewards/accuracies": 0.84375, "rewards/chosen": 1.258056640625, "rewards/margins": 6.3515625, "rewards/rejected": -5.0859375, "step": 4246 }, { "epoch": 0.8024942132363362, "grad_norm": 2.0415220475722218, "learning_rate": 2.031063886343971e-07, "logits/chosen": 3.052734375, "logits/rejected": 2.8974609375, "logps/chosen": -517.5, "logps/rejected": -437.5, "loss": 0.5187, "rewards/accuracies": 0.875, "rewards/chosen": 0.58984375, "rewards/margins": 4.62109375, "rewards/rejected": -4.03515625, "step": 4247 }, { "epoch": 0.8026831687845435, "grad_norm": 3.066701517884461, "learning_rate": 2.0291739862169199e-07, "logits/chosen": 3.064453125, "logits/rejected": 2.7333984375, "logps/chosen": -942.0, "logps/rejected": -807.5, "loss": 0.6745, "rewards/accuracies": 0.71875, "rewards/chosen": 1.0206298828125, "rewards/margins": 4.505859375, "rewards/rejected": -3.484375, "step": 4248 }, { "epoch": 0.8028721243327507, "grad_norm": 3.1575805757777333, "learning_rate": 2.0272855960706177e-07, "logits/chosen": 3.0625, "logits/rejected": 3.23828125, "logps/chosen": -703.0, "logps/rejected": -706.0, "loss": 0.6207, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9140625, "rewards/margins": 3.84375, "rewards/rejected": -2.9296875, "step": 4249 }, { "epoch": 0.803061079880958, "grad_norm": 3.475065953353337, "learning_rate": 2.0253987167266073e-07, "logits/chosen": 2.576171875, "logits/rejected": 2.458984375, "logps/chosen": -788.5, "logps/rejected": -727.0, "loss": 0.6041, "rewards/accuracies": 0.75, "rewards/chosen": 1.22802734375, "rewards/margins": 3.984375, "rewards/rejected": -2.75, "step": 4250 }, { "epoch": 0.8032500354291653, "grad_norm": 2.155603430374156, "learning_rate": 2.0235133490057748e-07, "logits/chosen": 2.685546875, "logits/rejected": 2.318359375, "logps/chosen": -725.0, "logps/rejected": -785.0, "loss": 0.5913, "rewards/accuracies": 0.78125, "rewards/chosen": 1.19140625, "rewards/margins": 4.671875, "rewards/rejected": -3.48046875, "step": 4251 }, { "epoch": 0.8034389909773726, "grad_norm": 3.7453034253571995, "learning_rate": 2.021629493728347e-07, "logits/chosen": 1.9052734375, "logits/rejected": 1.884765625, "logps/chosen": -974.5, "logps/rejected": -1067.0, "loss": 0.4271, "rewards/accuracies": 0.9375, "rewards/chosen": 1.595703125, "rewards/margins": 6.4140625, "rewards/rejected": -4.83203125, "step": 4252 }, { "epoch": 0.8036279465255799, "grad_norm": 2.7759639372753555, "learning_rate": 2.019747151713897e-07, "logits/chosen": 2.25, "logits/rejected": 2.0107421875, "logps/chosen": -967.0, "logps/rejected": -883.0, "loss": 0.5472, "rewards/accuracies": 0.9375, "rewards/chosen": 0.249786376953125, "rewards/margins": 5.578125, "rewards/rejected": -5.328125, "step": 4253 }, { "epoch": 0.8038169020737871, "grad_norm": 4.240078272786469, "learning_rate": 2.0178663237813326e-07, "logits/chosen": 3.0078125, "logits/rejected": 3.3046875, "logps/chosen": -710.0, "logps/rejected": -878.0, "loss": 0.6142, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7666015625, "rewards/margins": 5.8984375, "rewards/rejected": -5.14453125, "step": 4254 }, { "epoch": 0.8040058576219944, "grad_norm": 2.6527428644291757, "learning_rate": 2.015987010748909e-07, "logits/chosen": 2.498046875, "logits/rejected": 2.5546875, "logps/chosen": -719.0, "logps/rejected": -908.0, "loss": 0.6509, "rewards/accuracies": 0.71875, "rewards/chosen": 0.27838134765625, "rewards/margins": 4.26953125, "rewards/rejected": -3.9921875, "step": 4255 }, { "epoch": 0.8041948131702017, "grad_norm": 2.1941539520187368, "learning_rate": 2.0141092134342224e-07, "logits/chosen": 3.16796875, "logits/rejected": 2.921875, "logps/chosen": -917.0, "logps/rejected": -1042.0, "loss": 0.6296, "rewards/accuracies": 0.90625, "rewards/chosen": 0.102447509765625, "rewards/margins": 5.6875, "rewards/rejected": -5.58203125, "step": 4256 }, { "epoch": 0.804383768718409, "grad_norm": 2.3378760817712605, "learning_rate": 2.0122329326542027e-07, "logits/chosen": 3.3125, "logits/rejected": 3.07421875, "logps/chosen": -786.5, "logps/rejected": -1106.5, "loss": 0.4891, "rewards/accuracies": 0.84375, "rewards/chosen": 1.09912109375, "rewards/margins": 5.4453125, "rewards/rejected": -4.33203125, "step": 4257 }, { "epoch": 0.8045727242666163, "grad_norm": 1.781613711627137, "learning_rate": 2.0103581692251297e-07, "logits/chosen": 1.96533203125, "logits/rejected": 1.4541015625, "logps/chosen": -981.0, "logps/rejected": -977.0, "loss": 0.5533, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7568359375, "rewards/margins": 6.828125, "rewards/rejected": -6.0546875, "step": 4258 }, { "epoch": 0.8047616798148236, "grad_norm": 2.1945932574233353, "learning_rate": 2.008484923962613e-07, "logits/chosen": 3.390625, "logits/rejected": 3.8046875, "logps/chosen": -600.0, "logps/rejected": -1069.0, "loss": 0.6266, "rewards/accuracies": 0.75, "rewards/chosen": 0.6962890625, "rewards/margins": 4.888671875, "rewards/rejected": -4.185546875, "step": 4259 }, { "epoch": 0.8049506353630308, "grad_norm": 1.8840588453283038, "learning_rate": 2.0066131976816096e-07, "logits/chosen": 4.234375, "logits/rejected": 3.953125, "logps/chosen": -1021.0, "logps/rejected": -908.0, "loss": 0.5412, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3675537109375, "rewards/margins": 6.0390625, "rewards/rejected": -4.671875, "step": 4260 }, { "epoch": 0.8051395909112381, "grad_norm": 4.243333987219946, "learning_rate": 2.0047429911964136e-07, "logits/chosen": 2.685546875, "logits/rejected": 2.73046875, "logps/chosen": -751.5, "logps/rejected": -806.0, "loss": 0.5416, "rewards/accuracies": 0.84375, "rewards/chosen": 0.55029296875, "rewards/margins": 5.3046875, "rewards/rejected": -4.75, "step": 4261 }, { "epoch": 0.8053285464594454, "grad_norm": 3.919929320113871, "learning_rate": 2.002874305320655e-07, "logits/chosen": 2.712890625, "logits/rejected": 2.828125, "logps/chosen": -780.0, "logps/rejected": -701.0, "loss": 0.5704, "rewards/accuracies": 0.8125, "rewards/chosen": -0.00146484375, "rewards/margins": 4.46484375, "rewards/rejected": -4.47265625, "step": 4262 }, { "epoch": 0.8055175020076527, "grad_norm": 2.209199035567818, "learning_rate": 2.001007140867306e-07, "logits/chosen": 3.47265625, "logits/rejected": 3.5703125, "logps/chosen": -739.0, "logps/rejected": -683.5, "loss": 0.5695, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6103515625, "rewards/margins": 4.65625, "rewards/rejected": -4.05078125, "step": 4263 }, { "epoch": 0.80570645755586, "grad_norm": 2.3690352668035493, "learning_rate": 1.9991414986486755e-07, "logits/chosen": 2.3359375, "logits/rejected": 1.875, "logps/chosen": -689.0, "logps/rejected": -700.0, "loss": 0.6751, "rewards/accuracies": 0.75, "rewards/chosen": 0.1103515625, "rewards/margins": 3.513671875, "rewards/rejected": -3.40576171875, "step": 4264 }, { "epoch": 0.8058954131040673, "grad_norm": 1.8582601069018143, "learning_rate": 1.9972773794764095e-07, "logits/chosen": 3.001953125, "logits/rejected": 2.96875, "logps/chosen": -819.0, "logps/rejected": -990.0, "loss": 0.6535, "rewards/accuracies": 0.78125, "rewards/chosen": 0.599609375, "rewards/margins": 4.69140625, "rewards/rejected": -4.08984375, "step": 4265 }, { "epoch": 0.8060843686522745, "grad_norm": 2.3811551502579604, "learning_rate": 1.9954147841614918e-07, "logits/chosen": 2.6103515625, "logits/rejected": 2.400390625, "logps/chosen": -949.0, "logps/rejected": -1083.0, "loss": 0.5295, "rewards/accuracies": 0.875, "rewards/chosen": 1.048828125, "rewards/margins": 5.46875, "rewards/rejected": -4.4140625, "step": 4266 }, { "epoch": 0.8062733242004818, "grad_norm": 1.4919379368381684, "learning_rate": 1.9935537135142438e-07, "logits/chosen": 3.626953125, "logits/rejected": 3.37109375, "logps/chosen": -1126.0, "logps/rejected": -1137.5, "loss": 0.4379, "rewards/accuracies": 0.9375, "rewards/chosen": 1.75146484375, "rewards/margins": 6.7578125, "rewards/rejected": -5.015625, "step": 4267 }, { "epoch": 0.8064622797486891, "grad_norm": 2.6899523141130515, "learning_rate": 1.9916941683443212e-07, "logits/chosen": 3.405426025390625, "logits/rejected": 2.52587890625, "logps/chosen": -502.5, "logps/rejected": -563.0, "loss": 0.6127, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3740234375, "rewards/margins": 4.15625, "rewards/rejected": -3.78125, "step": 4268 }, { "epoch": 0.8066512352968964, "grad_norm": 3.8682113802792943, "learning_rate": 1.9898361494607207e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.791015625, "logps/chosen": -892.5, "logps/rejected": -1194.0, "loss": 0.5798, "rewards/accuracies": 0.8125, "rewards/chosen": 0.692138671875, "rewards/margins": 5.0234375, "rewards/rejected": -4.328125, "step": 4269 }, { "epoch": 0.8068401908451037, "grad_norm": 3.330066812750589, "learning_rate": 1.9879796576717718e-07, "logits/chosen": 2.19921875, "logits/rejected": 1.93359375, "logps/chosen": -847.0, "logps/rejected": -660.5, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": -0.00390625, "rewards/margins": 3.8330078125, "rewards/rejected": -3.84375, "step": 4270 }, { "epoch": 0.807029146393311, "grad_norm": 4.383872324712623, "learning_rate": 1.9861246937851394e-07, "logits/chosen": 2.96875, "logits/rejected": 2.7578125, "logps/chosen": -672.5, "logps/rejected": -971.0, "loss": 0.5183, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3828125, "rewards/margins": 5.3125, "rewards/rejected": -4.9296875, "step": 4271 }, { "epoch": 0.8072181019415182, "grad_norm": 4.502596594670311, "learning_rate": 1.9842712586078242e-07, "logits/chosen": 4.2265625, "logits/rejected": 3.8125, "logps/chosen": -623.5, "logps/rejected": -830.0, "loss": 0.5727, "rewards/accuracies": 0.71875, "rewards/chosen": 0.400634765625, "rewards/margins": 4.74609375, "rewards/rejected": -4.337890625, "step": 4272 }, { "epoch": 0.8074070574897255, "grad_norm": 1.5654514290114763, "learning_rate": 1.9824193529461618e-07, "logits/chosen": 2.9140625, "logits/rejected": 2.390625, "logps/chosen": -990.0, "logps/rejected": -824.0, "loss": 0.4871, "rewards/accuracies": 0.84375, "rewards/chosen": 1.64453125, "rewards/margins": 5.734375, "rewards/rejected": -4.08984375, "step": 4273 }, { "epoch": 0.8075960130379328, "grad_norm": 4.0537704286813865, "learning_rate": 1.9805689776058255e-07, "logits/chosen": 2.99609375, "logits/rejected": 2.8984375, "logps/chosen": -704.0, "logps/rejected": -1388.0, "loss": 0.5206, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0029296875, "rewards/margins": 6.0390625, "rewards/rejected": -5.0390625, "step": 4274 }, { "epoch": 0.8077849685861401, "grad_norm": 1.9233037771040262, "learning_rate": 1.9787201333918146e-07, "logits/chosen": 2.5869140625, "logits/rejected": 2.12109375, "logps/chosen": -654.75, "logps/rejected": -628.0, "loss": 0.5253, "rewards/accuracies": 0.90625, "rewards/chosen": 0.934478759765625, "rewards/margins": 4.560546875, "rewards/rejected": -3.625, "step": 4275 }, { "epoch": 0.8079739241343474, "grad_norm": 2.7408648383358085, "learning_rate": 1.9768728211084724e-07, "logits/chosen": 2.0859375, "logits/rejected": 1.91015625, "logps/chosen": -611.0, "logps/rejected": -1224.0, "loss": 0.5831, "rewards/accuracies": 0.875, "rewards/chosen": 0.5048828125, "rewards/margins": 7.828125, "rewards/rejected": -7.314453125, "step": 4276 }, { "epoch": 0.8081628796825546, "grad_norm": 2.248367388259149, "learning_rate": 1.9750270415594692e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.12890625, "logps/chosen": -601.5, "logps/rejected": -749.5, "loss": 0.6168, "rewards/accuracies": 0.71875, "rewards/chosen": 1.07421875, "rewards/margins": 4.32421875, "rewards/rejected": -3.25, "step": 4277 }, { "epoch": 0.8083518352307619, "grad_norm": 4.586708518257455, "learning_rate": 1.9731827955478093e-07, "logits/chosen": 3.28515625, "logits/rejected": 3.31640625, "logps/chosen": -722.0, "logps/rejected": -669.0, "loss": 0.5517, "rewards/accuracies": 0.78125, "rewards/chosen": 0.92138671875, "rewards/margins": 4.9296875, "rewards/rejected": -4.0078125, "step": 4278 }, { "epoch": 0.8085407907789692, "grad_norm": 3.4110852067356556, "learning_rate": 1.9713400838758343e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.5986328125, "logps/chosen": -853.0, "logps/rejected": -951.0, "loss": 0.5447, "rewards/accuracies": 0.90625, "rewards/chosen": 0.93841552734375, "rewards/margins": 5.21875, "rewards/rejected": -4.2734375, "step": 4279 }, { "epoch": 0.8087297463271765, "grad_norm": 2.9572400111993535, "learning_rate": 1.9694989073452103e-07, "logits/chosen": 2.435546875, "logits/rejected": 2.7421875, "logps/chosen": -1032.0, "logps/rejected": -1927.0, "loss": 0.5542, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7216644287109375, "rewards/margins": 8.2734375, "rewards/rejected": -7.55078125, "step": 4280 }, { "epoch": 0.8089187018753838, "grad_norm": 4.5295940483116475, "learning_rate": 1.967659266756943e-07, "logits/chosen": 3.322265625, "logits/rejected": 3.099609375, "logps/chosen": -358.0, "logps/rejected": -651.0, "loss": 0.77, "rewards/accuracies": 0.6875, "rewards/chosen": -0.59515380859375, "rewards/margins": 3.7578125, "rewards/rejected": -4.36328125, "step": 4281 }, { "epoch": 0.8091076574235911, "grad_norm": 3.037658988969617, "learning_rate": 1.965821162911366e-07, "logits/chosen": 1.927734375, "logits/rejected": 1.923828125, "logps/chosen": -694.5, "logps/rejected": -975.0, "loss": 0.4754, "rewards/accuracies": 0.875, "rewards/chosen": 0.857177734375, "rewards/margins": 4.9765625, "rewards/rejected": -4.125, "step": 4282 }, { "epoch": 0.8092966129717983, "grad_norm": 2.629681544830164, "learning_rate": 1.9639845966081443e-07, "logits/chosen": 3.5625, "logits/rejected": 3.15234375, "logps/chosen": -762.5, "logps/rejected": -1541.0, "loss": 0.5665, "rewards/accuracies": 0.71875, "rewards/chosen": 1.5263671875, "rewards/margins": 6.962890625, "rewards/rejected": -5.458984375, "step": 4283 }, { "epoch": 0.8094855685200056, "grad_norm": 2.1548254513354066, "learning_rate": 1.9621495686462785e-07, "logits/chosen": 3.53125, "logits/rejected": 2.79296875, "logps/chosen": -950.0, "logps/rejected": -782.5, "loss": 0.4952, "rewards/accuracies": 0.90625, "rewards/chosen": 1.263671875, "rewards/margins": 6.0546875, "rewards/rejected": -4.7890625, "step": 4284 }, { "epoch": 0.8096745240682129, "grad_norm": 1.9979892564032797, "learning_rate": 1.9603160798240914e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.861328125, "logps/chosen": -1222.0, "logps/rejected": -1275.0, "loss": 0.4557, "rewards/accuracies": 0.8125, "rewards/chosen": 2.47265625, "rewards/margins": 6.95703125, "rewards/rejected": -4.482421875, "step": 4285 }, { "epoch": 0.8098634796164202, "grad_norm": 2.9162271859299898, "learning_rate": 1.9584841309392448e-07, "logits/chosen": 2.69921875, "logits/rejected": 2.5087890625, "logps/chosen": -757.5, "logps/rejected": -832.0, "loss": 0.6129, "rewards/accuracies": 0.84375, "rewards/chosen": 1.138671875, "rewards/margins": 4.52734375, "rewards/rejected": -3.384765625, "step": 4286 }, { "epoch": 0.8100524351646275, "grad_norm": 3.306488857892054, "learning_rate": 1.956653722788726e-07, "logits/chosen": 2.8203125, "logits/rejected": 3.0546875, "logps/chosen": -992.5, "logps/rejected": -1352.5, "loss": 0.5971, "rewards/accuracies": 0.71875, "rewards/chosen": 0.83099365234375, "rewards/margins": 5.056640625, "rewards/rejected": -4.216796875, "step": 4287 }, { "epoch": 0.8102413907128349, "grad_norm": 1.578848949156543, "learning_rate": 1.9548248561688524e-07, "logits/chosen": 1.84765625, "logits/rejected": 2.1103515625, "logps/chosen": -629.5, "logps/rejected": -902.0, "loss": 0.6577, "rewards/accuracies": 0.84375, "rewards/chosen": 0.74560546875, "rewards/margins": 4.04296875, "rewards/rejected": -3.30078125, "step": 4288 }, { "epoch": 0.810430346261042, "grad_norm": 3.6596901682654837, "learning_rate": 1.9529975318752718e-07, "logits/chosen": 3.74609375, "logits/rejected": 3.4375, "logps/chosen": -934.5, "logps/rejected": -933.5, "loss": 0.5267, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0390625, "rewards/margins": 5.859375, "rewards/rejected": -4.80078125, "step": 4289 }, { "epoch": 0.8106193018092493, "grad_norm": 2.4403004004178572, "learning_rate": 1.9511717507029596e-07, "logits/chosen": 1.70703125, "logits/rejected": 1.7578125, "logps/chosen": -1154.0, "logps/rejected": -1024.0, "loss": 0.4565, "rewards/accuracies": 0.8125, "rewards/chosen": 1.84765625, "rewards/margins": 6.21875, "rewards/rejected": -4.375, "step": 4290 }, { "epoch": 0.8108082573574567, "grad_norm": 3.316499003489167, "learning_rate": 1.9493475134462223e-07, "logits/chosen": 3.037109375, "logits/rejected": 2.88671875, "logps/chosen": -14161.0, "logps/rejected": -833.5, "loss": 0.5558, "rewards/accuracies": 0.8125, "rewards/chosen": 170.87847900390625, "rewards/margins": 175.279296875, "rewards/rejected": -4.00390625, "step": 4291 }, { "epoch": 0.810997212905664, "grad_norm": 1.705342034683956, "learning_rate": 1.9475248208986927e-07, "logits/chosen": 2.42578125, "logits/rejected": 2.240234375, "logps/chosen": -732.0, "logps/rejected": -1260.0, "loss": 0.6543, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3564453125, "rewards/margins": 4.29296875, "rewards/rejected": -3.9375, "step": 4292 }, { "epoch": 0.8111861684538713, "grad_norm": 5.364403220134511, "learning_rate": 1.945703673853332e-07, "logits/chosen": 4.609375, "logits/rejected": 4.25, "logps/chosen": -758.0, "logps/rejected": -856.0, "loss": 0.5029, "rewards/accuracies": 0.8125, "rewards/chosen": 0.896240234375, "rewards/margins": 6.25, "rewards/rejected": -5.359375, "step": 4293 }, { "epoch": 0.8113751240020786, "grad_norm": 2.739404138407326, "learning_rate": 1.9438840731024286e-07, "logits/chosen": 2.517578125, "logits/rejected": 2.541015625, "logps/chosen": -575.0, "logps/rejected": -1032.0, "loss": 0.5704, "rewards/accuracies": 0.78125, "rewards/chosen": 1.09765625, "rewards/margins": 6.234375, "rewards/rejected": -5.13671875, "step": 4294 }, { "epoch": 0.8115640795502858, "grad_norm": 3.7772145059481974, "learning_rate": 1.9420660194375998e-07, "logits/chosen": 2.3564453125, "logits/rejected": 2.0966796875, "logps/chosen": -929.5, "logps/rejected": -957.0, "loss": 0.4919, "rewards/accuracies": 0.84375, "rewards/chosen": 0.716796875, "rewards/margins": 5.07421875, "rewards/rejected": -4.359375, "step": 4295 }, { "epoch": 0.8117530350984931, "grad_norm": 2.5104715959401984, "learning_rate": 1.9402495136497866e-07, "logits/chosen": 2.1953125, "logits/rejected": 1.8359375, "logps/chosen": -710.5, "logps/rejected": -690.5, "loss": 0.4734, "rewards/accuracies": 0.9375, "rewards/chosen": 1.16455078125, "rewards/margins": 4.96484375, "rewards/rejected": -3.80078125, "step": 4296 }, { "epoch": 0.8119419906467004, "grad_norm": 1.506852804832212, "learning_rate": 1.9384345565292612e-07, "logits/chosen": 2.2421875, "logits/rejected": 2.23828125, "logps/chosen": -964.0, "logps/rejected": -1083.0, "loss": 0.5356, "rewards/accuracies": 0.75, "rewards/chosen": 1.3956298828125, "rewards/margins": 7.015625, "rewards/rejected": -5.62109375, "step": 4297 }, { "epoch": 0.8121309461949077, "grad_norm": 2.2785803391328585, "learning_rate": 1.9366211488656193e-07, "logits/chosen": 3.58203125, "logits/rejected": 3.265625, "logps/chosen": -784.0, "logps/rejected": -755.0, "loss": 0.5517, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39453125, "rewards/margins": 5.2734375, "rewards/rejected": -4.87109375, "step": 4298 }, { "epoch": 0.812319901743115, "grad_norm": 4.072765394925949, "learning_rate": 1.93480929144778e-07, "logits/chosen": 3.58203125, "logits/rejected": 3.21484375, "logps/chosen": -713.5, "logps/rejected": -791.0, "loss": 0.6136, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3173828125, "rewards/margins": 5.00390625, "rewards/rejected": -4.68359375, "step": 4299 }, { "epoch": 0.8125088572913222, "grad_norm": 4.124335277208303, "learning_rate": 1.9329989850639954e-07, "logits/chosen": 2.78125, "logits/rejected": 2.404296875, "logps/chosen": -790.0, "logps/rejected": -898.0, "loss": 0.5628, "rewards/accuracies": 0.78125, "rewards/chosen": 1.17236328125, "rewards/margins": 4.99609375, "rewards/rejected": -3.82421875, "step": 4300 }, { "epoch": 0.8126978128395295, "grad_norm": 2.0564467434213083, "learning_rate": 1.9311902305018334e-07, "logits/chosen": 2.869140625, "logits/rejected": 2.41064453125, "logps/chosen": -1010.0, "logps/rejected": -1004.5, "loss": 0.5626, "rewards/accuracies": 0.78125, "rewards/chosen": 0.974029541015625, "rewards/margins": 6.166015625, "rewards/rejected": -5.1953125, "step": 4301 }, { "epoch": 0.8128867683877368, "grad_norm": 4.64740220918794, "learning_rate": 1.9293830285481952e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.150390625, "logps/chosen": -1160.0, "logps/rejected": -1180.0, "loss": 0.5523, "rewards/accuracies": 0.8125, "rewards/chosen": 1.09765625, "rewards/margins": 6.796875, "rewards/rejected": -5.68359375, "step": 4302 }, { "epoch": 0.8130757239359441, "grad_norm": 2.2673501633564315, "learning_rate": 1.9275773799893018e-07, "logits/chosen": 3.2421875, "logits/rejected": 3.34375, "logps/chosen": -624.0, "logps/rejected": -827.0, "loss": 0.6385, "rewards/accuracies": 0.75, "rewards/chosen": 0.1453857421875, "rewards/margins": 5.03515625, "rewards/rejected": -4.8828125, "step": 4303 }, { "epoch": 0.8132646794841514, "grad_norm": 2.9439613521780483, "learning_rate": 1.9257732856106972e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.6875, "logps/chosen": -669.5, "logps/rejected": -1565.0, "loss": 0.6669, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7041015625, "rewards/margins": 8.7734375, "rewards/rejected": -9.4609375, "step": 4304 }, { "epoch": 0.8134536350323587, "grad_norm": 2.4722793392263402, "learning_rate": 1.9239707461972565e-07, "logits/chosen": 3.6796875, "logits/rejected": 2.783203125, "logps/chosen": -797.5, "logps/rejected": -707.0, "loss": 0.6048, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0810546875, "rewards/margins": 4.8125, "rewards/rejected": -4.7265625, "step": 4305 }, { "epoch": 0.8136425905805659, "grad_norm": 2.5059509797126793, "learning_rate": 1.9221697625331685e-07, "logits/chosen": 3.375, "logits/rejected": 3.1953125, "logps/chosen": -621.5, "logps/rejected": -703.5, "loss": 0.6308, "rewards/accuracies": 0.78125, "rewards/chosen": 0.44091796875, "rewards/margins": 4.375, "rewards/rejected": -3.93359375, "step": 4306 }, { "epoch": 0.8138315461287732, "grad_norm": 3.109723310633072, "learning_rate": 1.9203703354019524e-07, "logits/chosen": 2.765625, "logits/rejected": 2.28515625, "logps/chosen": -802.0, "logps/rejected": -636.0, "loss": 0.5087, "rewards/accuracies": 0.90625, "rewards/chosen": 0.734375, "rewards/margins": 5.80859375, "rewards/rejected": -5.078125, "step": 4307 }, { "epoch": 0.8140205016769805, "grad_norm": 2.3475339357820775, "learning_rate": 1.9185724655864482e-07, "logits/chosen": 3.1484375, "logits/rejected": 3.40625, "logps/chosen": -696.0, "logps/rejected": -614.5, "loss": 0.6694, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1474609375, "rewards/margins": 4.1650390625, "rewards/rejected": -4.015625, "step": 4308 }, { "epoch": 0.8142094572251878, "grad_norm": 2.0340392872533, "learning_rate": 1.916776153868817e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.064453125, "logps/chosen": -740.0, "logps/rejected": -1026.0, "loss": 0.5724, "rewards/accuracies": 0.84375, "rewards/chosen": 0.240234375, "rewards/margins": 5.453125, "rewards/rejected": -5.21484375, "step": 4309 }, { "epoch": 0.8143984127733951, "grad_norm": 2.3071765744425172, "learning_rate": 1.9149814010305439e-07, "logits/chosen": 2.4453125, "logits/rejected": 2.59375, "logps/chosen": -1124.5, "logps/rejected": -1266.0, "loss": 0.4751, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5732421875, "rewards/margins": 6.296875, "rewards/rejected": -4.73828125, "step": 4310 }, { "epoch": 0.8145873683216024, "grad_norm": 3.2707062721814553, "learning_rate": 1.913188207852433e-07, "logits/chosen": 3.171875, "logits/rejected": 2.4609375, "logps/chosen": -912.0, "logps/rejected": -1807.0, "loss": 0.565, "rewards/accuracies": 0.875, "rewards/chosen": 0.13037109375, "rewards/margins": 6.921875, "rewards/rejected": -6.7890625, "step": 4311 }, { "epoch": 0.8147763238698096, "grad_norm": 4.236478561080051, "learning_rate": 1.9113965751146153e-07, "logits/chosen": 3.453125, "logits/rejected": 3.28515625, "logps/chosen": -644.0, "logps/rejected": -1473.5, "loss": 0.6049, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4486083984375, "rewards/margins": 6.834228515625, "rewards/rejected": -6.3984375, "step": 4312 }, { "epoch": 0.8149652794180169, "grad_norm": 1.313310996077736, "learning_rate": 1.909606503596537e-07, "logits/chosen": 3.208984375, "logits/rejected": 3.234375, "logps/chosen": -849.0, "logps/rejected": -1061.0, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": 0.8489990234375, "rewards/margins": 5.75, "rewards/rejected": -4.89453125, "step": 4313 }, { "epoch": 0.8151542349662242, "grad_norm": 4.558570784049996, "learning_rate": 1.907817994076969e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.63671875, "logps/chosen": -766.0, "logps/rejected": -1085.0, "loss": 0.634, "rewards/accuracies": 0.75, "rewards/chosen": 0.63330078125, "rewards/margins": 4.8603515625, "rewards/rejected": -4.2275390625, "step": 4314 }, { "epoch": 0.8153431905144315, "grad_norm": 2.844169670498573, "learning_rate": 1.9060310473340002e-07, "logits/chosen": 3.599609375, "logits/rejected": 2.953125, "logps/chosen": -442.75, "logps/rejected": -489.0, "loss": 0.6606, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0625, "rewards/margins": 3.49609375, "rewards/rejected": -3.4375, "step": 4315 }, { "epoch": 0.8155321460626388, "grad_norm": 2.9445918755512275, "learning_rate": 1.9042456641450423e-07, "logits/chosen": 2.4127197265625, "logits/rejected": 1.92578125, "logps/chosen": -830.5, "logps/rejected": -773.5, "loss": 0.4625, "rewards/accuracies": 0.9375, "rewards/chosen": 1.416015625, "rewards/margins": 6.0859375, "rewards/rejected": -4.66796875, "step": 4316 }, { "epoch": 0.8157211016108461, "grad_norm": 3.9507312180493, "learning_rate": 1.9024618452868218e-07, "logits/chosen": 2.634765625, "logits/rejected": 2.763671875, "logps/chosen": -1756.0, "logps/rejected": -1536.0, "loss": 0.5668, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5244140625, "rewards/margins": 5.48828125, "rewards/rejected": -4.96484375, "step": 4317 }, { "epoch": 0.8159100571590533, "grad_norm": 1.5855403404297022, "learning_rate": 1.900679591535392e-07, "logits/chosen": 3.234375, "logits/rejected": 3.025390625, "logps/chosen": -552.0, "logps/rejected": -1130.0, "loss": 0.6074, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1839599609375, "rewards/margins": 5.0625, "rewards/rejected": -5.2421875, "step": 4318 }, { "epoch": 0.8160990127072606, "grad_norm": 2.827818764718889, "learning_rate": 1.898898903666119e-07, "logits/chosen": 3.66796875, "logits/rejected": 3.0390625, "logps/chosen": -701.0, "logps/rejected": -650.5, "loss": 0.5387, "rewards/accuracies": 0.875, "rewards/chosen": 1.33984375, "rewards/margins": 4.9921875, "rewards/rejected": -3.662109375, "step": 4319 }, { "epoch": 0.8162879682554679, "grad_norm": 2.2297154618850956, "learning_rate": 1.897119782453691e-07, "logits/chosen": 3.37109375, "logits/rejected": 2.75, "logps/chosen": -971.0, "logps/rejected": -641.5, "loss": 0.517, "rewards/accuracies": 0.8125, "rewards/chosen": 0.35693359375, "rewards/margins": 4.744140625, "rewards/rejected": -4.37890625, "step": 4320 }, { "epoch": 0.8164769238036752, "grad_norm": 2.1988774324966385, "learning_rate": 1.8953422286721133e-07, "logits/chosen": 2.5859375, "logits/rejected": 2.2109375, "logps/chosen": -776.0, "logps/rejected": -951.0, "loss": 0.6471, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2109375, "rewards/margins": 4.234375, "rewards/rejected": -3.025390625, "step": 4321 }, { "epoch": 0.8166658793518825, "grad_norm": 2.570413370530113, "learning_rate": 1.8935662430947077e-07, "logits/chosen": 2.70703125, "logits/rejected": 2.36328125, "logps/chosen": -1200.0, "logps/rejected": -1971.0, "loss": 0.3874, "rewards/accuracies": 0.84375, "rewards/chosen": 2.396484375, "rewards/margins": 11.25, "rewards/rejected": -8.875, "step": 4322 }, { "epoch": 0.8168548349000897, "grad_norm": 3.944862538568741, "learning_rate": 1.8917918264941198e-07, "logits/chosen": 3.5625, "logits/rejected": 3.390625, "logps/chosen": -479.5, "logps/rejected": -436.0, "loss": 0.6824, "rewards/accuracies": 0.71875, "rewards/chosen": 0.379638671875, "rewards/margins": 3.0712890625, "rewards/rejected": -2.6953125, "step": 4323 }, { "epoch": 0.817043790448297, "grad_norm": 2.561340399384687, "learning_rate": 1.890018979642304e-07, "logits/chosen": 2.1953125, "logits/rejected": 2.04296875, "logps/chosen": -352.5, "logps/rejected": -445.0, "loss": 0.7089, "rewards/accuracies": 0.78125, "rewards/chosen": -0.298828125, "rewards/margins": 3.203125, "rewards/rejected": -3.5, "step": 4324 }, { "epoch": 0.8172327459965043, "grad_norm": 2.560293305981916, "learning_rate": 1.8882477033105397e-07, "logits/chosen": 2.33984375, "logits/rejected": 2.61328125, "logps/chosen": -982.0, "logps/rejected": -998.0, "loss": 0.6003, "rewards/accuracies": 0.71875, "rewards/chosen": 0.5419921875, "rewards/margins": 4.80078125, "rewards/rejected": -4.26171875, "step": 4325 }, { "epoch": 0.8174217015447116, "grad_norm": 1.438235075442842, "learning_rate": 1.886477998269418e-07, "logits/chosen": 3.6171875, "logits/rejected": 2.99609375, "logps/chosen": -811.0, "logps/rejected": -705.5, "loss": 0.5992, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19622802734375, "rewards/margins": 4.41796875, "rewards/rejected": -4.62109375, "step": 4326 }, { "epoch": 0.8176106570929189, "grad_norm": 1.693640684340878, "learning_rate": 1.884709865288847e-07, "logits/chosen": 3.6015625, "logits/rejected": 3.24609375, "logps/chosen": -704.0, "logps/rejected": -669.0, "loss": 0.5622, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74658203125, "rewards/margins": 4.4296875, "rewards/rejected": -3.6796875, "step": 4327 }, { "epoch": 0.8177996126411262, "grad_norm": 1.4359529949783638, "learning_rate": 1.8829433051380556e-07, "logits/chosen": 2.84765625, "logits/rejected": 3.1171875, "logps/chosen": -764.0, "logps/rejected": -1966.0, "loss": 0.5904, "rewards/accuracies": 0.78125, "rewards/chosen": 0.083984375, "rewards/margins": 6.953125, "rewards/rejected": -6.87109375, "step": 4328 }, { "epoch": 0.8179885681893334, "grad_norm": 2.172287825016398, "learning_rate": 1.8811783185855796e-07, "logits/chosen": 3.31640625, "logits/rejected": 3.046875, "logps/chosen": -698.0, "logps/rejected": -829.0, "loss": 0.6456, "rewards/accuracies": 0.78125, "rewards/chosen": 0.806640625, "rewards/margins": 4.859375, "rewards/rejected": -4.046875, "step": 4329 }, { "epoch": 0.8181775237375407, "grad_norm": 2.002352442453492, "learning_rate": 1.8794149063992792e-07, "logits/chosen": 1.94482421875, "logits/rejected": 2.2763671875, "logps/chosen": -552.5, "logps/rejected": -580.0, "loss": 0.6372, "rewards/accuracies": 0.78125, "rewards/chosen": 1.345703125, "rewards/margins": 4.0, "rewards/rejected": -2.642578125, "step": 4330 }, { "epoch": 0.818366479285748, "grad_norm": 4.767745003011707, "learning_rate": 1.8776530693463245e-07, "logits/chosen": 2.33203125, "logits/rejected": 1.7705078125, "logps/chosen": -762.5, "logps/rejected": -679.0, "loss": 0.4815, "rewards/accuracies": 0.8125, "rewards/chosen": 1.14453125, "rewards/margins": 5.51953125, "rewards/rejected": -4.3671875, "step": 4331 }, { "epoch": 0.8185554348339553, "grad_norm": 2.233189657190124, "learning_rate": 1.8758928081932008e-07, "logits/chosen": 2.81640625, "logits/rejected": 2.6171875, "logps/chosen": -515.5, "logps/rejected": -740.0, "loss": 0.6179, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6484375, "rewards/margins": 3.99609375, "rewards/rejected": -3.3349609375, "step": 4332 }, { "epoch": 0.8187443903821626, "grad_norm": 1.7679338957961166, "learning_rate": 1.8741341237057117e-07, "logits/chosen": 2.5361328125, "logits/rejected": 2.2001953125, "logps/chosen": -728.0, "logps/rejected": -832.0, "loss": 0.5211, "rewards/accuracies": 0.875, "rewards/chosen": 0.8505859375, "rewards/margins": 5.23046875, "rewards/rejected": -4.375, "step": 4333 }, { "epoch": 0.8189333459303699, "grad_norm": 3.5289197335380407, "learning_rate": 1.8723770166489672e-07, "logits/chosen": 2.3662109375, "logits/rejected": 1.759765625, "logps/chosen": -1088.0, "logps/rejected": -868.0, "loss": 0.5173, "rewards/accuracies": 0.90625, "rewards/chosen": 1.306640625, "rewards/margins": 5.546875, "rewards/rejected": -4.24560546875, "step": 4334 }, { "epoch": 0.8191223014785771, "grad_norm": 2.9734311206868536, "learning_rate": 1.8706214877873988e-07, "logits/chosen": 2.9453125, "logits/rejected": 2.408203125, "logps/chosen": -583.0, "logps/rejected": -501.0, "loss": 0.6923, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34814453125, "rewards/margins": 3.24609375, "rewards/rejected": -2.890625, "step": 4335 }, { "epoch": 0.8193112570267844, "grad_norm": 1.8482441680212867, "learning_rate": 1.8688675378847478e-07, "logits/chosen": 3.0546875, "logits/rejected": 2.998046875, "logps/chosen": -6130.0, "logps/rejected": -750.0, "loss": 0.5852, "rewards/accuracies": 0.78125, "rewards/chosen": 21.94189453125, "rewards/margins": 26.19921875, "rewards/rejected": -4.25390625, "step": 4336 }, { "epoch": 0.8195002125749917, "grad_norm": 2.898495586072489, "learning_rate": 1.867115167704069e-07, "logits/chosen": 3.1640625, "logits/rejected": 3.4609375, "logps/chosen": -845.0, "logps/rejected": -853.0, "loss": 0.6161, "rewards/accuracies": 0.75, "rewards/chosen": 0.3101806640625, "rewards/margins": 3.80859375, "rewards/rejected": -3.50390625, "step": 4337 }, { "epoch": 0.819689168123199, "grad_norm": 3.432361354949007, "learning_rate": 1.8653643780077287e-07, "logits/chosen": 2.56640625, "logits/rejected": 2.1875, "logps/chosen": -874.0, "logps/rejected": -800.0, "loss": 0.4437, "rewards/accuracies": 0.8125, "rewards/chosen": 1.62890625, "rewards/margins": 5.265625, "rewards/rejected": -3.63671875, "step": 4338 }, { "epoch": 0.8198781236714063, "grad_norm": 3.7327953771670606, "learning_rate": 1.8636151695574092e-07, "logits/chosen": 1.8359375, "logits/rejected": 1.970703125, "logps/chosen": -705.5, "logps/rejected": -1684.5, "loss": 0.6707, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4384765625, "rewards/margins": 6.09375, "rewards/rejected": -5.6640625, "step": 4339 }, { "epoch": 0.8200670792196136, "grad_norm": 3.171842391808866, "learning_rate": 1.8618675431141013e-07, "logits/chosen": 2.99609375, "logits/rejected": 2.517578125, "logps/chosen": -917.0, "logps/rejected": -1560.0, "loss": 0.575, "rewards/accuracies": 0.8125, "rewards/chosen": 0.66796875, "rewards/margins": 6.181640625, "rewards/rejected": -5.515625, "step": 4340 }, { "epoch": 0.8202560347678208, "grad_norm": 2.7526034217942006, "learning_rate": 1.860121499438109e-07, "logits/chosen": 3.37109375, "logits/rejected": 3.17578125, "logps/chosen": -722.5, "logps/rejected": -1131.0, "loss": 0.4979, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7421875, "rewards/margins": 7.25390625, "rewards/rejected": -6.5234375, "step": 4341 }, { "epoch": 0.8204449903160281, "grad_norm": 1.8309511044828155, "learning_rate": 1.858377039289047e-07, "logits/chosen": 1.9501953125, "logits/rejected": 1.7545166015625, "logps/chosen": -674.0, "logps/rejected": -1035.0, "loss": 0.6715, "rewards/accuracies": 0.75, "rewards/chosen": 0.2099609375, "rewards/margins": 4.244140625, "rewards/rejected": -4.041015625, "step": 4342 }, { "epoch": 0.8206339458642354, "grad_norm": 2.321519259569573, "learning_rate": 1.8566341634258408e-07, "logits/chosen": 2.4609375, "logits/rejected": 2.830078125, "logps/chosen": -909.5, "logps/rejected": -805.5, "loss": 0.4945, "rewards/accuracies": 0.90625, "rewards/chosen": 1.076171875, "rewards/margins": 5.96484375, "rewards/rejected": -4.890625, "step": 4343 }, { "epoch": 0.8208229014124427, "grad_norm": 1.9898722467348868, "learning_rate": 1.8548928726067316e-07, "logits/chosen": 3.435546875, "logits/rejected": 3.3125, "logps/chosen": -624.5, "logps/rejected": -888.0, "loss": 0.5678, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3447265625, "rewards/margins": 5.15625, "rewards/rejected": -5.5078125, "step": 4344 }, { "epoch": 0.82101185696065, "grad_norm": 4.702633603102914, "learning_rate": 1.8531531675892614e-07, "logits/chosen": 4.0625, "logits/rejected": 3.51171875, "logps/chosen": -1157.5, "logps/rejected": -1620.0, "loss": 0.6156, "rewards/accuracies": 0.84375, "rewards/chosen": -0.65313720703125, "rewards/margins": 8.2578125, "rewards/rejected": -8.9140625, "step": 4345 }, { "epoch": 0.8212008125088572, "grad_norm": 3.695904584588893, "learning_rate": 1.8514150491302915e-07, "logits/chosen": 2.30859375, "logits/rejected": 1.837890625, "logps/chosen": -504.0, "logps/rejected": -647.0, "loss": 0.5389, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4765625, "rewards/margins": 5.296875, "rewards/rejected": -4.818359375, "step": 4346 }, { "epoch": 0.8213897680570645, "grad_norm": 2.7991155031513313, "learning_rate": 1.8496785179859884e-07, "logits/chosen": 3.453125, "logits/rejected": 3.41796875, "logps/chosen": -864.0, "logps/rejected": -950.0, "loss": 0.5818, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2357177734375, "rewards/margins": 4.796875, "rewards/rejected": -4.5625, "step": 4347 }, { "epoch": 0.8215787236052718, "grad_norm": 2.176437329968213, "learning_rate": 1.8479435749118276e-07, "logits/chosen": 2.666015625, "logits/rejected": 2.76953125, "logps/chosen": -796.0, "logps/rejected": -885.0, "loss": 0.5962, "rewards/accuracies": 0.75, "rewards/chosen": 0.66943359375, "rewards/margins": 6.125, "rewards/rejected": -5.453125, "step": 4348 }, { "epoch": 0.8217676791534791, "grad_norm": 7.929897498779052, "learning_rate": 1.846210220662599e-07, "logits/chosen": 2.984375, "logits/rejected": 2.9140625, "logps/chosen": -604.0, "logps/rejected": -1047.0, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": 0.0927734375, "rewards/margins": 4.8203125, "rewards/rejected": -4.734375, "step": 4349 }, { "epoch": 0.8219566347016865, "grad_norm": 3.8727908366573667, "learning_rate": 1.844478455992391e-07, "logits/chosen": 3.48046875, "logits/rejected": 2.84375, "logps/chosen": -719.5, "logps/rejected": -670.0, "loss": 0.6009, "rewards/accuracies": 0.78125, "rewards/chosen": -0.148681640625, "rewards/margins": 3.9375, "rewards/rejected": -4.09375, "step": 4350 }, { "epoch": 0.8221455902498938, "grad_norm": 3.6634572051768264, "learning_rate": 1.8427482816546125e-07, "logits/chosen": 2.98248291015625, "logits/rejected": 2.76171875, "logps/chosen": -617.0, "logps/rejected": -647.5, "loss": 0.6599, "rewards/accuracies": 0.875, "rewards/chosen": 0.5947265625, "rewards/margins": 3.65234375, "rewards/rejected": -3.060546875, "step": 4351 }, { "epoch": 0.822334545798101, "grad_norm": 2.6656676448544765, "learning_rate": 1.8410196984019722e-07, "logits/chosen": 2.5390625, "logits/rejected": 2.5849609375, "logps/chosen": -589.0, "logps/rejected": -2035.0, "loss": 0.6564, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30029296875, "rewards/margins": 6.6484375, "rewards/rejected": -6.953125, "step": 4352 }, { "epoch": 0.8225235013463083, "grad_norm": 1.7107569482148473, "learning_rate": 1.8392927069864882e-07, "logits/chosen": 2.140625, "logits/rejected": 1.939453125, "logps/chosen": -1358.5, "logps/rejected": -1209.5, "loss": 0.5836, "rewards/accuracies": 0.78125, "rewards/chosen": 1.18798828125, "rewards/margins": 5.45703125, "rewards/rejected": -4.265625, "step": 4353 }, { "epoch": 0.8227124568945156, "grad_norm": 1.8608595579403244, "learning_rate": 1.8375673081594906e-07, "logits/chosen": 2.482421875, "logits/rejected": 2.109375, "logps/chosen": -724.0, "logps/rejected": -750.5, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": 0.6083984375, "rewards/margins": 3.9951171875, "rewards/rejected": -3.39453125, "step": 4354 }, { "epoch": 0.8229014124427229, "grad_norm": 3.488901780765845, "learning_rate": 1.835843502671609e-07, "logits/chosen": 3.11328125, "logits/rejected": 2.8203125, "logps/chosen": -1157.0, "logps/rejected": -975.0, "loss": 0.5857, "rewards/accuracies": 0.75, "rewards/chosen": 0.09423828125, "rewards/margins": 5.3984375, "rewards/rejected": -5.3046875, "step": 4355 }, { "epoch": 0.8230903679909302, "grad_norm": 2.917106885387685, "learning_rate": 1.8341212912727866e-07, "logits/chosen": 2.779296875, "logits/rejected": 2.984375, "logps/chosen": -1061.0, "logps/rejected": -1078.0, "loss": 0.6412, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02978515625, "rewards/margins": 4.42578125, "rewards/rejected": -4.44921875, "step": 4356 }, { "epoch": 0.8232793235391375, "grad_norm": 2.5761772248904085, "learning_rate": 1.83240067471227e-07, "logits/chosen": 2.7578125, "logits/rejected": 2.78125, "logps/chosen": -617.5, "logps/rejected": -652.5, "loss": 0.4924, "rewards/accuracies": 0.90625, "rewards/chosen": 0.51611328125, "rewards/margins": 5.546875, "rewards/rejected": -5.04296875, "step": 4357 }, { "epoch": 0.8234682790873447, "grad_norm": 3.590972889019534, "learning_rate": 1.830681653738612e-07, "logits/chosen": 2.34765625, "logits/rejected": 2.40625, "logps/chosen": -715.5, "logps/rejected": -967.0, "loss": 0.5857, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7027587890625, "rewards/margins": 6.26953125, "rewards/rejected": -5.5625, "step": 4358 }, { "epoch": 0.823657234635552, "grad_norm": 2.7440831438449163, "learning_rate": 1.828964229099672e-07, "logits/chosen": 2.32080078125, "logits/rejected": 1.54296875, "logps/chosen": -1356.5, "logps/rejected": -557.0, "loss": 0.6588, "rewards/accuracies": 0.78125, "rewards/chosen": -1.604736328125, "rewards/margins": 1.8828125, "rewards/rejected": -3.5, "step": 4359 }, { "epoch": 0.8238461901837593, "grad_norm": 3.304861848817366, "learning_rate": 1.827248401542613e-07, "logits/chosen": 3.15234375, "logits/rejected": 3.1875, "logps/chosen": -915.0, "logps/rejected": -944.0, "loss": 0.5988, "rewards/accuracies": 0.71875, "rewards/chosen": 1.4189453125, "rewards/margins": 4.66015625, "rewards/rejected": -3.234375, "step": 4360 }, { "epoch": 0.8240351457319666, "grad_norm": 4.266311377141404, "learning_rate": 1.8255341718139066e-07, "logits/chosen": 3.220703125, "logits/rejected": 3.2421875, "logps/chosen": -706.0, "logps/rejected": -935.0, "loss": 0.5084, "rewards/accuracies": 0.8125, "rewards/chosen": 0.98291015625, "rewards/margins": 5.4296875, "rewards/rejected": -4.44140625, "step": 4361 }, { "epoch": 0.8242241012801739, "grad_norm": 1.6021497326253893, "learning_rate": 1.823821540659327e-07, "logits/chosen": 2.552734375, "logits/rejected": 2.3984375, "logps/chosen": -679.5, "logps/rejected": -771.5, "loss": 0.507, "rewards/accuracies": 0.8125, "rewards/chosen": 0.765625, "rewards/margins": 5.5390625, "rewards/rejected": -4.76953125, "step": 4362 }, { "epoch": 0.8244130568283812, "grad_norm": 2.8315784093426983, "learning_rate": 1.8221105088239537e-07, "logits/chosen": 2.466796875, "logits/rejected": 1.814453125, "logps/chosen": -1060.0, "logps/rejected": -835.0, "loss": 0.4456, "rewards/accuracies": 0.90625, "rewards/chosen": 1.31298828125, "rewards/margins": 5.984375, "rewards/rejected": -4.67578125, "step": 4363 }, { "epoch": 0.8246020123765884, "grad_norm": 2.732919022317697, "learning_rate": 1.820401077052169e-07, "logits/chosen": 2.994140625, "logits/rejected": 3.20703125, "logps/chosen": -807.0, "logps/rejected": -1692.0, "loss": 0.6105, "rewards/accuracies": 0.84375, "rewards/chosen": 0.125732421875, "rewards/margins": 6.8671875, "rewards/rejected": -6.73828125, "step": 4364 }, { "epoch": 0.8247909679247957, "grad_norm": 3.005402205735649, "learning_rate": 1.8186932460876592e-07, "logits/chosen": 2.212890625, "logits/rejected": 2.4375, "logps/chosen": -610.0, "logps/rejected": -867.0, "loss": 0.6151, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0302734375, "rewards/margins": 4.57421875, "rewards/rejected": -4.59765625, "step": 4365 }, { "epoch": 0.824979923473003, "grad_norm": 2.393533476058979, "learning_rate": 1.8169870166734183e-07, "logits/chosen": 2.42529296875, "logits/rejected": 2.2529296875, "logps/chosen": -651.5, "logps/rejected": -692.5, "loss": 0.5819, "rewards/accuracies": 0.90625, "rewards/chosen": 0.47802734375, "rewards/margins": 4.6953125, "rewards/rejected": -4.21484375, "step": 4366 }, { "epoch": 0.8251688790212103, "grad_norm": 3.248999056837836, "learning_rate": 1.8152823895517383e-07, "logits/chosen": 2.306640625, "logits/rejected": 2.12109375, "logps/chosen": -836.5, "logps/rejected": -840.5, "loss": 0.6457, "rewards/accuracies": 0.78125, "rewards/chosen": 0.58056640625, "rewards/margins": 4.10595703125, "rewards/rejected": -3.529296875, "step": 4367 }, { "epoch": 0.8253578345694176, "grad_norm": 2.1056851794216427, "learning_rate": 1.8135793654642168e-07, "logits/chosen": 2.4296875, "logits/rejected": 1.912109375, "logps/chosen": -689.0, "logps/rejected": -670.0, "loss": 0.6777, "rewards/accuracies": 0.875, "rewards/chosen": -0.08673095703125, "rewards/margins": 3.953125, "rewards/rejected": -4.0390625, "step": 4368 }, { "epoch": 0.8255467901176248, "grad_norm": 3.0046793870583945, "learning_rate": 1.811877945151753e-07, "logits/chosen": 2.517578125, "logits/rejected": 2.69140625, "logps/chosen": -447.0, "logps/rejected": -696.0, "loss": 0.5943, "rewards/accuracies": 0.75, "rewards/chosen": 0.533203125, "rewards/margins": 4.3359375, "rewards/rejected": -3.8046875, "step": 4369 }, { "epoch": 0.8257357456658321, "grad_norm": 1.6956340854089909, "learning_rate": 1.8101781293545487e-07, "logits/chosen": 3.625, "logits/rejected": 3.4765625, "logps/chosen": -847.0, "logps/rejected": -2121.0, "loss": 0.487, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5048828125, "rewards/margins": 8.546875, "rewards/rejected": -7.05859375, "step": 4370 }, { "epoch": 0.8259247012140394, "grad_norm": 5.462846581116353, "learning_rate": 1.808479918812107e-07, "logits/chosen": 3.04296875, "logits/rejected": 2.51904296875, "logps/chosen": -657.0, "logps/rejected": -775.0, "loss": 0.5022, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0224609375, "rewards/margins": 5.421875, "rewards/rejected": -5.40625, "step": 4371 }, { "epoch": 0.8261136567622467, "grad_norm": 5.131925474670093, "learning_rate": 1.8067833142632355e-07, "logits/chosen": 3.16015625, "logits/rejected": 3.046875, "logps/chosen": -651.5, "logps/rejected": -999.5, "loss": 0.634, "rewards/accuracies": 0.75, "rewards/chosen": 0.28857421875, "rewards/margins": 3.72265625, "rewards/rejected": -3.43359375, "step": 4372 }, { "epoch": 0.826302612310454, "grad_norm": 3.2335690584392296, "learning_rate": 1.8050883164460392e-07, "logits/chosen": 3.09375, "logits/rejected": 2.4814453125, "logps/chosen": -576.5, "logps/rejected": -959.0, "loss": 0.5359, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6982421875, "rewards/margins": 5.390625, "rewards/rejected": -4.6796875, "step": 4373 }, { "epoch": 0.8264915678586613, "grad_norm": 2.101299104451195, "learning_rate": 1.803394926097926e-07, "logits/chosen": 2.84375, "logits/rejected": 2.287109375, "logps/chosen": -695.0, "logps/rejected": -1791.0, "loss": 0.4744, "rewards/accuracies": 0.875, "rewards/chosen": 0.899658203125, "rewards/margins": 10.2265625, "rewards/rejected": -9.30859375, "step": 4374 }, { "epoch": 0.8266805234068685, "grad_norm": 1.8893783723151323, "learning_rate": 1.8017031439556069e-07, "logits/chosen": 3.1875, "logits/rejected": 2.9375, "logps/chosen": -498.0, "logps/rejected": -843.5, "loss": 0.6346, "rewards/accuracies": 0.75, "rewards/chosen": 0.1787109375, "rewards/margins": 4.16015625, "rewards/rejected": -3.98046875, "step": 4375 }, { "epoch": 0.8268694789550758, "grad_norm": 3.342911779552249, "learning_rate": 1.8000129707550858e-07, "logits/chosen": 2.830078125, "logits/rejected": 2.59375, "logps/chosen": -537.0, "logps/rejected": -544.0, "loss": 0.5707, "rewards/accuracies": 0.875, "rewards/chosen": 0.419921875, "rewards/margins": 4.48046875, "rewards/rejected": -4.0625, "step": 4376 }, { "epoch": 0.8270584345032831, "grad_norm": 2.1415634145372877, "learning_rate": 1.7983244072316761e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.619140625, "logps/chosen": -826.0, "logps/rejected": -734.0, "loss": 0.7157, "rewards/accuracies": 0.71875, "rewards/chosen": 0.693603515625, "rewards/margins": 3.74853515625, "rewards/rejected": -3.056640625, "step": 4377 }, { "epoch": 0.8272473900514904, "grad_norm": 2.354566010117636, "learning_rate": 1.796637454119984e-07, "logits/chosen": 3.671875, "logits/rejected": 3.625, "logps/chosen": -709.0, "logps/rejected": -1759.0, "loss": 0.5806, "rewards/accuracies": 0.9375, "rewards/chosen": 1.52734375, "rewards/margins": 8.70703125, "rewards/rejected": -7.169921875, "step": 4378 }, { "epoch": 0.8274363455996977, "grad_norm": 2.2745920440380534, "learning_rate": 1.794952112153918e-07, "logits/chosen": 3.64453125, "logits/rejected": 3.03125, "logps/chosen": -573.0, "logps/rejected": -794.0, "loss": 0.5813, "rewards/accuracies": 0.75, "rewards/chosen": 0.367431640625, "rewards/margins": 5.484375, "rewards/rejected": -5.1171875, "step": 4379 }, { "epoch": 0.827625301147905, "grad_norm": 2.0552649773852134, "learning_rate": 1.793268382066686e-07, "logits/chosen": 2.953125, "logits/rejected": 2.947265625, "logps/chosen": -644.0, "logps/rejected": -1096.0, "loss": 0.5106, "rewards/accuracies": 0.84375, "rewards/chosen": 1.181640625, "rewards/margins": 5.359375, "rewards/rejected": -4.18359375, "step": 4380 }, { "epoch": 0.8278142566961122, "grad_norm": 3.1080772410608395, "learning_rate": 1.791586264590791e-07, "logits/chosen": 3.1640625, "logits/rejected": 3.3359375, "logps/chosen": -877.0, "logps/rejected": -1250.0, "loss": 0.7386, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2353515625, "rewards/margins": 3.33984375, "rewards/rejected": -3.10546875, "step": 4381 }, { "epoch": 0.8280032122443195, "grad_norm": 3.1960727027059876, "learning_rate": 1.78990576045804e-07, "logits/chosen": 3.091796875, "logits/rejected": 2.150390625, "logps/chosen": -635.0, "logps/rejected": -599.5, "loss": 0.5885, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3941650390625, "rewards/margins": 4.3125, "rewards/rejected": -3.921875, "step": 4382 }, { "epoch": 0.8281921677925268, "grad_norm": 2.082344954026819, "learning_rate": 1.7882268703995345e-07, "logits/chosen": 3.328125, "logits/rejected": 3.40625, "logps/chosen": -985.0, "logps/rejected": -994.0, "loss": 0.5914, "rewards/accuracies": 0.6875, "rewards/chosen": 0.74951171875, "rewards/margins": 5.14453125, "rewards/rejected": -4.40234375, "step": 4383 }, { "epoch": 0.8283811233407341, "grad_norm": 3.7717173487897, "learning_rate": 1.7865495951456745e-07, "logits/chosen": 2.98046875, "logits/rejected": 2.66796875, "logps/chosen": -928.0, "logps/rejected": -1227.0, "loss": 0.444, "rewards/accuracies": 0.875, "rewards/chosen": 8.27667236328125, "rewards/margins": 14.8046875, "rewards/rejected": -6.5625, "step": 4384 }, { "epoch": 0.8285700788889414, "grad_norm": 4.228897054923696, "learning_rate": 1.7848739354261574e-07, "logits/chosen": 2.85546875, "logits/rejected": 2.3671875, "logps/chosen": -871.5, "logps/rejected": -735.0, "loss": 0.4331, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3125, "rewards/margins": 6.0703125, "rewards/rejected": -4.7578125, "step": 4385 }, { "epoch": 0.8287590344371487, "grad_norm": 3.0770974057021734, "learning_rate": 1.7831998919699764e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.01702880859375, "logps/chosen": -724.5, "logps/rejected": -836.0, "loss": 0.5553, "rewards/accuracies": 0.8125, "rewards/chosen": 0.84912109375, "rewards/margins": 7.4453125, "rewards/rejected": -6.59765625, "step": 4386 }, { "epoch": 0.8289479899853559, "grad_norm": 3.821443349076312, "learning_rate": 1.781527465505427e-07, "logits/chosen": 3.048828125, "logits/rejected": 3.671875, "logps/chosen": -673.5, "logps/rejected": -1130.5, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": -1.138671875, "rewards/margins": 9.1484375, "rewards/rejected": -10.296875, "step": 4387 }, { "epoch": 0.8291369455335632, "grad_norm": 3.3148504692915863, "learning_rate": 1.7798566567600942e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.904296875, "logps/chosen": -1039.5, "logps/rejected": -1169.0, "loss": 0.5363, "rewards/accuracies": 0.875, "rewards/chosen": 0.9365234375, "rewards/margins": 5.91796875, "rewards/rejected": -4.98828125, "step": 4388 }, { "epoch": 0.8293259010817705, "grad_norm": 2.7559336937419703, "learning_rate": 1.778187466460863e-07, "logits/chosen": 2.623046875, "logits/rejected": 2.78515625, "logps/chosen": -590.0, "logps/rejected": -817.0, "loss": 0.6099, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05126953125, "rewards/margins": 4.943359375, "rewards/rejected": -5.0, "step": 4389 }, { "epoch": 0.8295148566299778, "grad_norm": 2.947491745854447, "learning_rate": 1.7765198953339144e-07, "logits/chosen": 2.412109375, "logits/rejected": 2.73828125, "logps/chosen": -765.0, "logps/rejected": -568.5, "loss": 0.6509, "rewards/accuracies": 0.78125, "rewards/chosen": -0.537109375, "rewards/margins": 4.349609375, "rewards/rejected": -4.88671875, "step": 4390 }, { "epoch": 0.8297038121781851, "grad_norm": 2.4761783674636133, "learning_rate": 1.774853944104723e-07, "logits/chosen": 2.86328125, "logits/rejected": 2.7421875, "logps/chosen": -813.0, "logps/rejected": -1800.0, "loss": 0.5319, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2138671875, "rewards/margins": 10.81640625, "rewards/rejected": -9.609375, "step": 4391 }, { "epoch": 0.8298927677263923, "grad_norm": 3.3845317755986017, "learning_rate": 1.7731896134980594e-07, "logits/chosen": 2.615234375, "logits/rejected": 2.103515625, "logps/chosen": -724.5, "logps/rejected": -703.5, "loss": 0.6678, "rewards/accuracies": 0.71875, "rewards/chosen": -0.84423828125, "rewards/margins": 3.48046875, "rewards/rejected": -4.328125, "step": 4392 }, { "epoch": 0.8300817232745996, "grad_norm": 1.9594889779932092, "learning_rate": 1.771526904237992e-07, "logits/chosen": 1.21923828125, "logits/rejected": 0.99920654296875, "logps/chosen": -759.0, "logps/rejected": -891.5, "loss": 0.5695, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3515625, "rewards/margins": 5.712890625, "rewards/rejected": -5.365234375, "step": 4393 }, { "epoch": 0.8302706788228069, "grad_norm": 3.1591944661486346, "learning_rate": 1.7698658170478804e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.29296875, "logps/chosen": -1047.0, "logps/rejected": -973.0, "loss": 0.5626, "rewards/accuracies": 0.78125, "rewards/chosen": 0.673828125, "rewards/margins": 5.080078125, "rewards/rejected": -4.40234375, "step": 4394 }, { "epoch": 0.8304596343710142, "grad_norm": 2.524344146455002, "learning_rate": 1.7682063526503787e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.568359375, "logps/chosen": -991.0, "logps/rejected": -1141.0, "loss": 0.4377, "rewards/accuracies": 0.90625, "rewards/chosen": 0.619140625, "rewards/margins": 6.703125, "rewards/rejected": -6.1015625, "step": 4395 }, { "epoch": 0.8306485899192215, "grad_norm": 2.685549806978576, "learning_rate": 1.7665485117674373e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.3349609375, "logps/chosen": -770.0, "logps/rejected": -720.5, "loss": 0.575, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1708984375, "rewards/margins": 6.12890625, "rewards/rejected": -6.29296875, "step": 4396 }, { "epoch": 0.8308375454674288, "grad_norm": 4.97788156022319, "learning_rate": 1.7648922951202964e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.2109375, "logps/chosen": -1033.5, "logps/rejected": -1042.5, "loss": 0.5513, "rewards/accuracies": 0.75, "rewards/chosen": 1.16796875, "rewards/margins": 6.173828125, "rewards/rejected": -5.01171875, "step": 4397 }, { "epoch": 0.831026501015636, "grad_norm": 2.4461038923407488, "learning_rate": 1.763237703429496e-07, "logits/chosen": 4.31640625, "logits/rejected": 3.53515625, "logps/chosen": -729.0, "logps/rejected": -763.0, "loss": 0.5504, "rewards/accuracies": 0.78125, "rewards/chosen": 1.01904296875, "rewards/margins": 5.3125, "rewards/rejected": -4.29296875, "step": 4398 }, { "epoch": 0.8312154565638433, "grad_norm": 2.1267972115635927, "learning_rate": 1.7615847374148608e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.44140625, "logps/chosen": -527.5, "logps/rejected": -652.0, "loss": 0.7921, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1187744140625, "rewards/margins": 2.8193359375, "rewards/rejected": -2.939697265625, "step": 4399 }, { "epoch": 0.8314044121120506, "grad_norm": 2.8757086795577953, "learning_rate": 1.759933397795516e-07, "logits/chosen": 3.203125, "logits/rejected": 2.67578125, "logps/chosen": -602.5, "logps/rejected": -605.5, "loss": 0.7032, "rewards/accuracies": 0.75, "rewards/chosen": 0.03857421875, "rewards/margins": 3.5615234375, "rewards/rejected": -3.529296875, "step": 4400 }, { "epoch": 0.8315933676602579, "grad_norm": 2.118842973865934, "learning_rate": 1.7582836852898744e-07, "logits/chosen": 2.767578125, "logits/rejected": 2.83203125, "logps/chosen": -674.0, "logps/rejected": -698.0, "loss": 0.5956, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2529296875, "rewards/margins": 4.48046875, "rewards/rejected": -4.73046875, "step": 4401 }, { "epoch": 0.8317823232084652, "grad_norm": 3.3104057554944726, "learning_rate": 1.7566356006156415e-07, "logits/chosen": 1.779296875, "logits/rejected": 1.4111328125, "logps/chosen": -659.0, "logps/rejected": -766.0, "loss": 0.5539, "rewards/accuracies": 0.84375, "rewards/chosen": 0.609375, "rewards/margins": 5.18359375, "rewards/rejected": -4.5703125, "step": 4402 }, { "epoch": 0.8319712787566725, "grad_norm": 2.2310368613652103, "learning_rate": 1.7549891444898194e-07, "logits/chosen": 2.90625, "logits/rejected": 2.90234375, "logps/chosen": -924.5, "logps/rejected": -1011.0, "loss": 0.4973, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1650390625, "rewards/margins": 5.5703125, "rewards/rejected": -5.40625, "step": 4403 }, { "epoch": 0.8321602343048797, "grad_norm": 0.9568144235531334, "learning_rate": 1.7533443176286921e-07, "logits/chosen": 2.208984375, "logits/rejected": 2.162109375, "logps/chosen": -634.5, "logps/rejected": -1119.5, "loss": 0.4974, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8311767578125, "rewards/margins": 6.921875, "rewards/rejected": -6.0859375, "step": 4404 }, { "epoch": 0.832349189853087, "grad_norm": 4.038665441009706, "learning_rate": 1.7517011207478455e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -489.0, "logps/rejected": -565.5, "loss": 0.6002, "rewards/accuracies": 0.75, "rewards/chosen": 0.11181640625, "rewards/margins": 4.109375, "rewards/rejected": -4.0, "step": 4405 }, { "epoch": 0.8325381454012943, "grad_norm": 2.0817076924984512, "learning_rate": 1.7500595545621496e-07, "logits/chosen": 3.20703125, "logits/rejected": 2.3828125, "logps/chosen": -994.0, "logps/rejected": -886.0, "loss": 0.6898, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6728515625, "rewards/margins": 3.50341796875, "rewards/rejected": -4.17578125, "step": 4406 }, { "epoch": 0.8327271009495016, "grad_norm": 2.243100610519649, "learning_rate": 1.7484196197857654e-07, "logits/chosen": 3.3671875, "logits/rejected": 2.802734375, "logps/chosen": -995.0, "logps/rejected": -712.5, "loss": 0.4403, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1595458984375, "rewards/margins": 5.37109375, "rewards/rejected": -4.21484375, "step": 4407 }, { "epoch": 0.832916056497709, "grad_norm": 3.3909458842465257, "learning_rate": 1.7467813171321487e-07, "logits/chosen": 2.28125, "logits/rejected": 1.9140625, "logps/chosen": -791.0, "logps/rejected": -750.0, "loss": 0.6659, "rewards/accuracies": 0.71875, "rewards/chosen": 0.038330078125, "rewards/margins": 3.9765625, "rewards/rejected": -3.94921875, "step": 4408 }, { "epoch": 0.8331050120459162, "grad_norm": 2.4723843627632847, "learning_rate": 1.745144647314038e-07, "logits/chosen": 3.08984375, "logits/rejected": 3.78125, "logps/chosen": -822.0, "logps/rejected": -1259.0, "loss": 0.5863, "rewards/accuracies": 0.84375, "rewards/chosen": 0.583740234375, "rewards/margins": 4.76171875, "rewards/rejected": -4.16796875, "step": 4409 }, { "epoch": 0.8332939675941234, "grad_norm": 2.208548934786225, "learning_rate": 1.7435096110434693e-07, "logits/chosen": 3.51171875, "logits/rejected": 3.49609375, "logps/chosen": -995.0, "logps/rejected": -1387.0, "loss": 0.5245, "rewards/accuracies": 0.8125, "rewards/chosen": 0.98583984375, "rewards/margins": 7.6796875, "rewards/rejected": -6.6953125, "step": 4410 }, { "epoch": 0.8334829231423307, "grad_norm": 1.6860566846338647, "learning_rate": 1.741876209031762e-07, "logits/chosen": 2.75, "logits/rejected": 2.4375, "logps/chosen": -704.0, "logps/rejected": -594.5, "loss": 0.6313, "rewards/accuracies": 0.875, "rewards/chosen": -0.1572265625, "rewards/margins": 3.8759765625, "rewards/rejected": -4.03515625, "step": 4411 }, { "epoch": 0.833671878690538, "grad_norm": 1.873585391809612, "learning_rate": 1.740244441989527e-07, "logits/chosen": 2.703125, "logits/rejected": 2.412109375, "logps/chosen": -830.5, "logps/rejected": -933.0, "loss": 0.5987, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03662109375, "rewards/margins": 5.21484375, "rewards/rejected": -5.177734375, "step": 4412 }, { "epoch": 0.8338608342387454, "grad_norm": 3.956833799928229, "learning_rate": 1.7386143106266633e-07, "logits/chosen": 3.044921875, "logits/rejected": 2.8359375, "logps/chosen": -722.5, "logps/rejected": -765.0, "loss": 0.5514, "rewards/accuracies": 0.84375, "rewards/chosen": 0.486328125, "rewards/margins": 5.0234375, "rewards/rejected": -4.54296875, "step": 4413 }, { "epoch": 0.8340497897869527, "grad_norm": 2.7089022528256073, "learning_rate": 1.7369858156523598e-07, "logits/chosen": 1.9521484375, "logits/rejected": 1.5576171875, "logps/chosen": -1022.0, "logps/rejected": -940.0, "loss": 0.5602, "rewards/accuracies": 0.90625, "rewards/chosen": 7.3037109375, "rewards/margins": 12.4453125, "rewards/rejected": -5.1328125, "step": 4414 }, { "epoch": 0.83423874533516, "grad_norm": 5.49861462510225, "learning_rate": 1.7353589577750916e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.2578125, "logps/chosen": -735.0, "logps/rejected": -796.0, "loss": 0.649, "rewards/accuracies": 0.84375, "rewards/chosen": 1.34326171875, "rewards/margins": 4.86328125, "rewards/rejected": -3.5078125, "step": 4415 }, { "epoch": 0.8344277008833672, "grad_norm": 2.7222953514621513, "learning_rate": 1.7337337377026217e-07, "logits/chosen": 2.59375, "logits/rejected": 2.61328125, "logps/chosen": -706.0, "logps/rejected": -726.5, "loss": 0.5922, "rewards/accuracies": 0.78125, "rewards/chosen": 0.560302734375, "rewards/margins": 4.4453125, "rewards/rejected": -3.8828125, "step": 4416 }, { "epoch": 0.8346166564315745, "grad_norm": 2.7428325159185585, "learning_rate": 1.7321101561420022e-07, "logits/chosen": 3.212890625, "logits/rejected": 2.955078125, "logps/chosen": -624.5, "logps/rejected": -518.0, "loss": 0.6849, "rewards/accuracies": 0.75, "rewards/chosen": 0.674560546875, "rewards/margins": 3.1328125, "rewards/rejected": -2.455078125, "step": 4417 }, { "epoch": 0.8348056119797818, "grad_norm": 2.698990430326649, "learning_rate": 1.7304882137995687e-07, "logits/chosen": 2.982421875, "logits/rejected": 2.716796875, "logps/chosen": -775.0, "logps/rejected": -948.5, "loss": 0.4736, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9775390625, "rewards/margins": 5.46875, "rewards/rejected": -4.49609375, "step": 4418 }, { "epoch": 0.8349945675279891, "grad_norm": 1.7885004128443818, "learning_rate": 1.72886791138095e-07, "logits/chosen": 3.6484375, "logits/rejected": 3.640625, "logps/chosen": -1391.0, "logps/rejected": -913.0, "loss": 0.8387, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0439453125, "rewards/margins": 0.01513671875, "rewards/rejected": -2.0615234375, "step": 4419 }, { "epoch": 0.8351835230761964, "grad_norm": 4.397058356064342, "learning_rate": 1.727249249591054e-07, "logits/chosen": 3.228515625, "logits/rejected": 2.9609375, "logps/chosen": -746.0, "logps/rejected": -1310.0, "loss": 0.604, "rewards/accuracies": 0.90625, "rewards/chosen": 0.587890625, "rewards/margins": 7.24609375, "rewards/rejected": -6.6640625, "step": 4420 }, { "epoch": 0.8353724786244036, "grad_norm": 1.9330956933867514, "learning_rate": 1.725632229134082e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.19140625, "logps/chosen": -789.0, "logps/rejected": -597.0, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": 0.43603515625, "rewards/margins": 3.24609375, "rewards/rejected": -2.8125, "step": 4421 }, { "epoch": 0.8355614341726109, "grad_norm": 2.839393224333862, "learning_rate": 1.7240168507135155e-07, "logits/chosen": 3.43359375, "logits/rejected": 2.595703125, "logps/chosen": -1212.0, "logps/rejected": -1186.0, "loss": 0.4887, "rewards/accuracies": 0.8125, "rewards/chosen": 1.27099609375, "rewards/margins": 6.484375, "rewards/rejected": -5.21875, "step": 4422 }, { "epoch": 0.8357503897208182, "grad_norm": 2.758616972272362, "learning_rate": 1.7224031150321232e-07, "logits/chosen": 3.19921875, "logits/rejected": 3.515625, "logps/chosen": -652.5, "logps/rejected": -2233.0, "loss": 0.6742, "rewards/accuracies": 0.71875, "rewards/chosen": 0.54931640625, "rewards/margins": 7.95703125, "rewards/rejected": -7.41015625, "step": 4423 }, { "epoch": 0.8359393452690255, "grad_norm": 3.198359489914892, "learning_rate": 1.7207910227919637e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.20703125, "logps/chosen": -956.0, "logps/rejected": -621.5, "loss": 0.6843, "rewards/accuracies": 0.78125, "rewards/chosen": 0.16357421875, "rewards/margins": 3.07421875, "rewards/rejected": -2.912109375, "step": 4424 }, { "epoch": 0.8361283008172328, "grad_norm": 2.8187542192763106, "learning_rate": 1.7191805746943719e-07, "logits/chosen": 2.177490234375, "logits/rejected": 2.109375, "logps/chosen": -870.5, "logps/rejected": -719.0, "loss": 0.576, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6181640625, "rewards/margins": 5.11328125, "rewards/rejected": -4.49609375, "step": 4425 }, { "epoch": 0.8363172563654401, "grad_norm": 4.590741294782876, "learning_rate": 1.7175717714399753e-07, "logits/chosen": 2.31640625, "logits/rejected": 2.3671875, "logps/chosen": -706.0, "logps/rejected": -993.0, "loss": 0.4356, "rewards/accuracies": 0.875, "rewards/chosen": 0.29931640625, "rewards/margins": 7.09375, "rewards/rejected": -6.8125, "step": 4426 }, { "epoch": 0.8365062119136473, "grad_norm": 2.4434237794713103, "learning_rate": 1.7159646137286822e-07, "logits/chosen": 2.787109375, "logits/rejected": 2.8984375, "logps/chosen": -829.0, "logps/rejected": -756.5, "loss": 0.5682, "rewards/accuracies": 0.875, "rewards/chosen": 1.23974609375, "rewards/margins": 4.6484375, "rewards/rejected": -3.41650390625, "step": 4427 }, { "epoch": 0.8366951674618546, "grad_norm": 2.0594550581288846, "learning_rate": 1.7143591022596842e-07, "logits/chosen": 2.875, "logits/rejected": 2.8046875, "logps/chosen": -841.0, "logps/rejected": -860.0, "loss": 0.5825, "rewards/accuracies": 0.78125, "rewards/chosen": 0.90936279296875, "rewards/margins": 5.5625, "rewards/rejected": -4.6484375, "step": 4428 }, { "epoch": 0.8368841230100619, "grad_norm": 2.1141088562874457, "learning_rate": 1.7127552377314615e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.5703125, "logps/chosen": -606.5, "logps/rejected": -11619.0, "loss": 0.6386, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17626953125, "rewards/margins": -62.38671875, "rewards/rejected": 62.140625, "step": 4429 }, { "epoch": 0.8370730785582692, "grad_norm": 2.3127843956580953, "learning_rate": 1.71115302084177e-07, "logits/chosen": 2.33203125, "logits/rejected": 2.4765625, "logps/chosen": -458.0, "logps/rejected": -785.0, "loss": 0.5286, "rewards/accuracies": 0.84375, "rewards/chosen": 1.015625, "rewards/margins": 5.5703125, "rewards/rejected": -4.5625, "step": 4430 }, { "epoch": 0.8372620341064765, "grad_norm": 3.9793283010841987, "learning_rate": 1.7095524522876565e-07, "logits/chosen": 2.279296875, "logits/rejected": 2.484375, "logps/chosen": -570.0, "logps/rejected": -689.0, "loss": 0.5254, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7646484375, "rewards/margins": 4.71875, "rewards/rejected": -3.9638671875, "step": 4431 }, { "epoch": 0.8374509896546838, "grad_norm": 1.5674086797464248, "learning_rate": 1.707953532765446e-07, "logits/chosen": 3.2021484375, "logits/rejected": 3.5126953125, "logps/chosen": -1076.0, "logps/rejected": -2290.0, "loss": 0.5748, "rewards/accuracies": 0.90625, "rewards/chosen": 0.34130859375, "rewards/margins": 8.90625, "rewards/rejected": -8.578125, "step": 4432 }, { "epoch": 0.837639945202891, "grad_norm": 3.9173054006362413, "learning_rate": 1.706356262970748e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.0849609375, "logps/chosen": -605.0, "logps/rejected": -14026.0, "loss": 0.6169, "rewards/accuracies": 0.78125, "rewards/chosen": 0.114013671875, "rewards/margins": -162.20703125, "rewards/rejected": 161.99609375, "step": 4433 }, { "epoch": 0.8378289007510983, "grad_norm": 1.468190556240773, "learning_rate": 1.704760643598453e-07, "logits/chosen": 3.1796875, "logits/rejected": 2.8828125, "logps/chosen": -803.0, "logps/rejected": -908.0, "loss": 0.5388, "rewards/accuracies": 0.875, "rewards/chosen": 1.1845703125, "rewards/margins": 6.1328125, "rewards/rejected": -4.9609375, "step": 4434 }, { "epoch": 0.8380178562993056, "grad_norm": 2.121185112234995, "learning_rate": 1.7031666753427338e-07, "logits/chosen": 3.162109375, "logits/rejected": 2.7177734375, "logps/chosen": -577.5, "logps/rejected": -552.5, "loss": 0.6215, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3387451171875, "rewards/margins": 3.771484375, "rewards/rejected": -3.43603515625, "step": 4435 }, { "epoch": 0.8382068118475129, "grad_norm": 2.2375125478886244, "learning_rate": 1.7015743588970472e-07, "logits/chosen": 4.296875, "logits/rejected": 4.2734375, "logps/chosen": -994.0, "logps/rejected": -1589.5, "loss": 0.8083, "rewards/accuracies": 0.75, "rewards/chosen": 1.117431640625, "rewards/margins": 6.01953125, "rewards/rejected": -4.9228515625, "step": 4436 }, { "epoch": 0.8383957673957202, "grad_norm": 1.26409083059645, "learning_rate": 1.6999836949541284e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.84375, "logps/chosen": -831.0, "logps/rejected": -695.0, "loss": 0.6105, "rewards/accuracies": 0.84375, "rewards/chosen": 1.294921875, "rewards/margins": 4.2265625, "rewards/rejected": -2.9296875, "step": 4437 }, { "epoch": 0.8385847229439275, "grad_norm": 2.9262310775520275, "learning_rate": 1.6983946842059955e-07, "logits/chosen": 3.2109375, "logits/rejected": 3.17578125, "logps/chosen": -370.0, "logps/rejected": -600.0, "loss": 0.6278, "rewards/accuracies": 0.875, "rewards/chosen": 0.1005859375, "rewards/margins": 4.34375, "rewards/rejected": -4.2421875, "step": 4438 }, { "epoch": 0.8387736784921347, "grad_norm": 2.331881141214796, "learning_rate": 1.6968073273439456e-07, "logits/chosen": 2.146484375, "logits/rejected": 2.2998046875, "logps/chosen": -1239.5, "logps/rejected": -1104.0, "loss": 0.51, "rewards/accuracies": 0.8125, "rewards/chosen": 1.63134765625, "rewards/margins": 6.6484375, "rewards/rejected": -5.015625, "step": 4439 }, { "epoch": 0.838962634040342, "grad_norm": 1.5939675703903542, "learning_rate": 1.6952216250585573e-07, "logits/chosen": 3.162109375, "logits/rejected": 3.3828125, "logps/chosen": -767.5, "logps/rejected": -1561.0, "loss": 0.5297, "rewards/accuracies": 0.84375, "rewards/chosen": 0.544189453125, "rewards/margins": 8.2109375, "rewards/rejected": -7.69921875, "step": 4440 }, { "epoch": 0.8391515895885493, "grad_norm": 3.1933459062558236, "learning_rate": 1.6936375780396894e-07, "logits/chosen": 2.578125, "logits/rejected": 1.982421875, "logps/chosen": -595.0, "logps/rejected": -740.0, "loss": 0.5022, "rewards/accuracies": 0.90625, "rewards/chosen": 1.14208984375, "rewards/margins": 5.125, "rewards/rejected": -3.982421875, "step": 4441 }, { "epoch": 0.8393405451367566, "grad_norm": 3.4027338730098466, "learning_rate": 1.6920551869764822e-07, "logits/chosen": 2.892578125, "logits/rejected": 2.93359375, "logps/chosen": -954.0, "logps/rejected": -956.0, "loss": 0.5315, "rewards/accuracies": 0.8125, "rewards/chosen": 1.45166015625, "rewards/margins": 5.81640625, "rewards/rejected": -4.3671875, "step": 4442 }, { "epoch": 0.8395295006849639, "grad_norm": 2.7813473325077003, "learning_rate": 1.6904744525573528e-07, "logits/chosen": 3.125, "logits/rejected": 3.015625, "logps/chosen": -658.0, "logps/rejected": -1283.0, "loss": 0.5433, "rewards/accuracies": 0.8125, "rewards/chosen": 0.833984375, "rewards/margins": 5.69140625, "rewards/rejected": -4.86328125, "step": 4443 }, { "epoch": 0.8397184562331711, "grad_norm": 2.4570661603154615, "learning_rate": 1.6888953754699986e-07, "logits/chosen": 3.6015625, "logits/rejected": 2.775390625, "logps/chosen": -729.0, "logps/rejected": -1124.0, "loss": 0.5685, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9833984375, "rewards/margins": 5.33984375, "rewards/rejected": -4.349609375, "step": 4444 }, { "epoch": 0.8399074117813784, "grad_norm": 14.24494003168861, "learning_rate": 1.6873179564013967e-07, "logits/chosen": 2.861328125, "logits/rejected": 2.51953125, "logps/chosen": -889.0, "logps/rejected": -1313.0, "loss": 0.4247, "rewards/accuracies": 0.875, "rewards/chosen": 1.466796875, "rewards/margins": 6.9765625, "rewards/rejected": -5.5234375, "step": 4445 }, { "epoch": 0.8400963673295857, "grad_norm": 3.0949457208157733, "learning_rate": 1.6857421960378005e-07, "logits/chosen": 2.5234375, "logits/rejected": 2.224609375, "logps/chosen": -627.5, "logps/rejected": -1636.5, "loss": 0.4804, "rewards/accuracies": 0.9375, "rewards/chosen": 0.87890625, "rewards/margins": 10.2734375, "rewards/rejected": -9.37109375, "step": 4446 }, { "epoch": 0.840285322877793, "grad_norm": 2.6114616878240025, "learning_rate": 1.6841680950647476e-07, "logits/chosen": 2.671875, "logits/rejected": 2.5546875, "logps/chosen": -1198.5, "logps/rejected": -2229.0, "loss": 0.5011, "rewards/accuracies": 0.8125, "rewards/chosen": 1.330322265625, "rewards/margins": 8.7578125, "rewards/rejected": -7.4375, "step": 4447 }, { "epoch": 0.8404742784260003, "grad_norm": 2.1216176854594004, "learning_rate": 1.6825956541670443e-07, "logits/chosen": 2.90625, "logits/rejected": 2.98046875, "logps/chosen": -730.5, "logps/rejected": -758.5, "loss": 0.5524, "rewards/accuracies": 0.875, "rewards/chosen": 1.376953125, "rewards/margins": 5.04296875, "rewards/rejected": -3.669921875, "step": 4448 }, { "epoch": 0.8406632339742076, "grad_norm": 3.2687910516808443, "learning_rate": 1.6810248740287836e-07, "logits/chosen": 3.119140625, "logits/rejected": 3.46875, "logps/chosen": -704.5, "logps/rejected": -1675.0, "loss": 0.6731, "rewards/accuracies": 0.75, "rewards/chosen": 0.51220703125, "rewards/margins": 5.12109375, "rewards/rejected": -4.6171875, "step": 4449 }, { "epoch": 0.8408521895224148, "grad_norm": 2.516930614520533, "learning_rate": 1.6794557553333332e-07, "logits/chosen": 2.4296875, "logits/rejected": 2.349609375, "logps/chosen": -817.0, "logps/rejected": -890.5, "loss": 0.5891, "rewards/accuracies": 0.75, "rewards/chosen": 0.26025390625, "rewards/margins": 4.1640625, "rewards/rejected": -3.908203125, "step": 4450 }, { "epoch": 0.8410411450706221, "grad_norm": 2.221797745684497, "learning_rate": 1.6778882987633341e-07, "logits/chosen": 3.33203125, "logits/rejected": 2.572265625, "logps/chosen": -690.5, "logps/rejected": -712.5, "loss": 0.7162, "rewards/accuracies": 0.75, "rewards/chosen": 0.0322265625, "rewards/margins": 3.3017578125, "rewards/rejected": -3.2734375, "step": 4451 }, { "epoch": 0.8412301006188294, "grad_norm": 1.8241249065553349, "learning_rate": 1.67632250500071e-07, "logits/chosen": 3.5, "logits/rejected": 2.751953125, "logps/chosen": -944.0, "logps/rejected": -692.0, "loss": 0.6102, "rewards/accuracies": 0.8125, "rewards/chosen": 0.066314697265625, "rewards/margins": 3.859375, "rewards/rejected": -3.78515625, "step": 4452 }, { "epoch": 0.8414190561670367, "grad_norm": 3.9705898507898945, "learning_rate": 1.674758374726657e-07, "logits/chosen": 3.48828125, "logits/rejected": 3.51171875, "logps/chosen": -549.5, "logps/rejected": -612.0, "loss": 0.6861, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1710205078125, "rewards/margins": 4.09765625, "rewards/rejected": -3.91796875, "step": 4453 }, { "epoch": 0.841608011715244, "grad_norm": 1.6597853607919981, "learning_rate": 1.6731959086216502e-07, "logits/chosen": 3.359375, "logits/rejected": 3.41015625, "logps/chosen": -758.0, "logps/rejected": -1206.0, "loss": 0.5602, "rewards/accuracies": 0.78125, "rewards/chosen": 0.720947265625, "rewards/margins": 5.671875, "rewards/rejected": -4.953125, "step": 4454 }, { "epoch": 0.8417969672634513, "grad_norm": 3.3408180890852535, "learning_rate": 1.6716351073654385e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.84375, "logps/chosen": -509.5, "logps/rejected": -811.0, "loss": 0.6336, "rewards/accuracies": 0.8125, "rewards/chosen": 0.00390625, "rewards/margins": 6.1328125, "rewards/rejected": -6.12109375, "step": 4455 }, { "epoch": 0.8419859228116585, "grad_norm": 3.1945202307276763, "learning_rate": 1.6700759716370466e-07, "logits/chosen": 3.61328125, "logits/rejected": 4.19140625, "logps/chosen": -1567.0, "logps/rejected": -1476.0, "loss": 0.5497, "rewards/accuracies": 0.78125, "rewards/chosen": 1.852783203125, "rewards/margins": 7.1640625, "rewards/rejected": -5.31640625, "step": 4456 }, { "epoch": 0.8421748783598658, "grad_norm": 2.2499217391947193, "learning_rate": 1.6685185021147782e-07, "logits/chosen": 3.015625, "logits/rejected": 2.771484375, "logps/chosen": -1401.0, "logps/rejected": -1552.0, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0869140625, "rewards/margins": 6.6953125, "rewards/rejected": -5.609375, "step": 4457 }, { "epoch": 0.8423638339080731, "grad_norm": 2.2392665930734177, "learning_rate": 1.6669626994762083e-07, "logits/chosen": 3.92578125, "logits/rejected": 3.51953125, "logps/chosen": -984.0, "logps/rejected": -841.5, "loss": 0.5355, "rewards/accuracies": 0.78125, "rewards/chosen": 0.75537109375, "rewards/margins": 5.8984375, "rewards/rejected": -5.1484375, "step": 4458 }, { "epoch": 0.8425527894562804, "grad_norm": 4.123278227002363, "learning_rate": 1.6654085643981875e-07, "logits/chosen": 2.603515625, "logits/rejected": 2.359375, "logps/chosen": -622.0, "logps/rejected": -699.5, "loss": 0.6813, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0831298828125, "rewards/margins": 3.900390625, "rewards/rejected": -3.81640625, "step": 4459 }, { "epoch": 0.8427417450044877, "grad_norm": 2.330296539899698, "learning_rate": 1.6638560975568415e-07, "logits/chosen": 2.306640625, "logits/rejected": 2.17578125, "logps/chosen": -513.0, "logps/rejected": -718.0, "loss": 0.7448, "rewards/accuracies": 0.625, "rewards/chosen": -0.47802734375, "rewards/margins": 2.484375, "rewards/rejected": -2.966796875, "step": 4460 }, { "epoch": 0.842930700552695, "grad_norm": 3.8955133396208645, "learning_rate": 1.6623052996275698e-07, "logits/chosen": 3.271484375, "logits/rejected": 3.2265625, "logps/chosen": -497.5, "logps/rejected": -603.0, "loss": 0.6279, "rewards/accuracies": 0.75, "rewards/chosen": -0.052734375, "rewards/margins": 4.3671875, "rewards/rejected": -4.42578125, "step": 4461 }, { "epoch": 0.8431196561009022, "grad_norm": 2.8587364240922084, "learning_rate": 1.6607561712850456e-07, "logits/chosen": 2.107421875, "logits/rejected": 1.77734375, "logps/chosen": -701.0, "logps/rejected": -769.75, "loss": 0.662, "rewards/accuracies": 0.75, "rewards/chosen": 0.8564453125, "rewards/margins": 3.435546875, "rewards/rejected": -2.5791015625, "step": 4462 }, { "epoch": 0.8433086116491095, "grad_norm": 2.2006049481980945, "learning_rate": 1.6592087132032184e-07, "logits/chosen": 2.388671875, "logits/rejected": 1.822265625, "logps/chosen": -1041.0, "logps/rejected": -771.0, "loss": 0.5458, "rewards/accuracies": 0.78125, "rewards/chosen": 1.58935546875, "rewards/margins": 5.2734375, "rewards/rejected": -3.6796875, "step": 4463 }, { "epoch": 0.8434975671973168, "grad_norm": 2.2790510872868364, "learning_rate": 1.657662926055307e-07, "logits/chosen": 3.181640625, "logits/rejected": 3.13671875, "logps/chosen": -909.0, "logps/rejected": -1711.0, "loss": 0.5466, "rewards/accuracies": 0.875, "rewards/chosen": 1.0517578125, "rewards/margins": 9.1171875, "rewards/rejected": -8.0625, "step": 4464 }, { "epoch": 0.8436865227455241, "grad_norm": 1.861489229245885, "learning_rate": 1.6561188105138057e-07, "logits/chosen": 2.361328125, "logits/rejected": 1.9296875, "logps/chosen": -921.0, "logps/rejected": -872.0, "loss": 0.4354, "rewards/accuracies": 0.875, "rewards/chosen": 2.14453125, "rewards/margins": 6.234375, "rewards/rejected": -4.08984375, "step": 4465 }, { "epoch": 0.8438754782937314, "grad_norm": 2.305612533787874, "learning_rate": 1.6545763672504813e-07, "logits/chosen": 2.53515625, "logits/rejected": 2.78125, "logps/chosen": -704.0, "logps/rejected": -1276.5, "loss": 0.6145, "rewards/accuracies": 0.75, "rewards/chosen": 0.98583984375, "rewards/margins": 8.671875, "rewards/rejected": -7.689453125, "step": 4466 }, { "epoch": 0.8440644338419386, "grad_norm": 1.9644499487941036, "learning_rate": 1.6530355969363713e-07, "logits/chosen": 2.37890625, "logits/rejected": 1.2939453125, "logps/chosen": -812.0, "logps/rejected": -615.0, "loss": 0.4374, "rewards/accuracies": 0.90625, "rewards/chosen": 1.549072265625, "rewards/margins": 5.40234375, "rewards/rejected": -3.861328125, "step": 4467 }, { "epoch": 0.8442533893901459, "grad_norm": 2.4815831909928625, "learning_rate": 1.6514965002417908e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.423828125, "logps/chosen": -903.5, "logps/rejected": -637.5, "loss": 0.6713, "rewards/accuracies": 0.75, "rewards/chosen": 0.8779296875, "rewards/margins": 3.811279296875, "rewards/rejected": -2.93701171875, "step": 4468 }, { "epoch": 0.8444423449383532, "grad_norm": 3.0716484166466307, "learning_rate": 1.6499590778363175e-07, "logits/chosen": 2.8818359375, "logits/rejected": 3.1357421875, "logps/chosen": -974.5, "logps/rejected": -1509.0, "loss": 0.4557, "rewards/accuracies": 0.875, "rewards/chosen": 1.02734375, "rewards/margins": 7.484375, "rewards/rejected": -6.44921875, "step": 4469 }, { "epoch": 0.8446313004865605, "grad_norm": 2.9908249533766864, "learning_rate": 1.6484233303888096e-07, "logits/chosen": 3.2578125, "logits/rejected": 3.2109375, "logps/chosen": -541.5, "logps/rejected": -894.0, "loss": 0.7964, "rewards/accuracies": 0.625, "rewards/chosen": 0.1119384765625, "rewards/margins": 2.31298828125, "rewards/rejected": -2.20703125, "step": 4470 }, { "epoch": 0.8448202560347678, "grad_norm": 2.979239157074677, "learning_rate": 1.6468892585673928e-07, "logits/chosen": 2.49609375, "logits/rejected": 2.50390625, "logps/chosen": -981.0, "logps/rejected": -1059.0, "loss": 0.5678, "rewards/accuracies": 0.8125, "rewards/chosen": 1.51171875, "rewards/margins": 5.46875, "rewards/rejected": -3.947265625, "step": 4471 }, { "epoch": 0.8450092115829752, "grad_norm": 2.29164793766525, "learning_rate": 1.6453568630394627e-07, "logits/chosen": 2.021484375, "logits/rejected": 1.859375, "logps/chosen": -779.0, "logps/rejected": -698.0, "loss": 0.5868, "rewards/accuracies": 0.75, "rewards/chosen": 0.04541015625, "rewards/margins": 4.09765625, "rewards/rejected": -4.05078125, "step": 4472 }, { "epoch": 0.8451981671311823, "grad_norm": 2.819103777569961, "learning_rate": 1.6438261444716892e-07, "logits/chosen": 3.412109375, "logits/rejected": 3.3193359375, "logps/chosen": -731.0, "logps/rejected": -9853.0, "loss": 0.7409, "rewards/accuracies": 0.75, "rewards/chosen": 0.052734375, "rewards/margins": -21.236328125, "rewards/rejected": 21.23046875, "step": 4473 }, { "epoch": 0.8453871226793896, "grad_norm": 1.4730097795228636, "learning_rate": 1.6422971035300073e-07, "logits/chosen": 3.73828125, "logits/rejected": 4.20703125, "logps/chosen": -888.0, "logps/rejected": -2021.0, "loss": 0.6631, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6197509765625, "rewards/margins": 7.03125, "rewards/rejected": -6.41015625, "step": 4474 }, { "epoch": 0.845576078227597, "grad_norm": 2.4671482215886242, "learning_rate": 1.6407697408796284e-07, "logits/chosen": 3.171875, "logits/rejected": 2.91796875, "logps/chosen": -763.5, "logps/rejected": -899.0, "loss": 0.6619, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2249755859375, "rewards/margins": 4.4921875, "rewards/rejected": -4.275390625, "step": 4475 }, { "epoch": 0.8457650337758043, "grad_norm": 1.3322887760183666, "learning_rate": 1.6392440571850302e-07, "logits/chosen": 2.26171875, "logits/rejected": 2.3515625, "logps/chosen": -697.5, "logps/rejected": -1590.0, "loss": 0.6199, "rewards/accuracies": 0.78125, "rewards/chosen": 0.38134765625, "rewards/margins": 8.5703125, "rewards/rejected": -8.21484375, "step": 4476 }, { "epoch": 0.8459539893240116, "grad_norm": 2.0297752669182647, "learning_rate": 1.6377200531099578e-07, "logits/chosen": 2.5078125, "logits/rejected": 2.236328125, "logps/chosen": -10549.0, "logps/rejected": -1208.0, "loss": 0.5201, "rewards/accuracies": 0.8125, "rewards/chosen": 13.80078125, "rewards/margins": 17.69921875, "rewards/rejected": -3.9609375, "step": 4477 }, { "epoch": 0.8461429448722189, "grad_norm": 2.7215503866851516, "learning_rate": 1.6361977293174328e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.919921875, "logps/chosen": -736.5, "logps/rejected": -792.0, "loss": 0.6002, "rewards/accuracies": 0.71875, "rewards/chosen": 0.407470703125, "rewards/margins": 4.3203125, "rewards/rejected": -3.9140625, "step": 4478 }, { "epoch": 0.846331900420426, "grad_norm": 1.2971055248304841, "learning_rate": 1.6346770864697368e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.11328125, "logps/chosen": -675.5, "logps/rejected": -920.0, "loss": 0.5613, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7958984375, "rewards/margins": 5.484375, "rewards/rejected": -4.69140625, "step": 4479 }, { "epoch": 0.8465208559686334, "grad_norm": 1.4622110114246403, "learning_rate": 1.633158125228427e-07, "logits/chosen": 2.943359375, "logits/rejected": 2.732421875, "logps/chosen": -797.0, "logps/rejected": -722.5, "loss": 0.4535, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2783203125, "rewards/margins": 5.828125, "rewards/rejected": -4.546875, "step": 4480 }, { "epoch": 0.8467098115168407, "grad_norm": 2.502120630476388, "learning_rate": 1.6316408462543256e-07, "logits/chosen": 3.67578125, "logits/rejected": 3.0859375, "logps/chosen": -564.0, "logps/rejected": -642.5, "loss": 0.6152, "rewards/accuracies": 0.65625, "rewards/chosen": 0.12158203125, "rewards/margins": 4.1806640625, "rewards/rejected": -4.0625, "step": 4481 }, { "epoch": 0.846898767065048, "grad_norm": 2.550285544482662, "learning_rate": 1.6301252502075257e-07, "logits/chosen": 2.69140625, "logits/rejected": 3.1953125, "logps/chosen": -1130.0, "logps/rejected": -2296.0, "loss": 0.6103, "rewards/accuracies": 0.71875, "rewards/chosen": 0.51123046875, "rewards/margins": 8.95703125, "rewards/rejected": -8.4453125, "step": 4482 }, { "epoch": 0.8470877226132553, "grad_norm": 3.0698561782706, "learning_rate": 1.6286113377473837e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.392578125, "logps/chosen": -938.0, "logps/rejected": -930.0, "loss": 0.4305, "rewards/accuracies": 0.90625, "rewards/chosen": 1.7470703125, "rewards/margins": 6.1875, "rewards/rejected": -4.4453125, "step": 4483 }, { "epoch": 0.8472766781614626, "grad_norm": 3.2513203168072637, "learning_rate": 1.6270991095325273e-07, "logits/chosen": 3.12109375, "logits/rejected": 2.40234375, "logps/chosen": -637.0, "logps/rejected": -728.0, "loss": 0.4903, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9830322265625, "rewards/margins": 6.54296875, "rewards/rejected": -5.5703125, "step": 4484 }, { "epoch": 0.8474656337096698, "grad_norm": 1.8259880362767864, "learning_rate": 1.625588566220852e-07, "logits/chosen": 3.97265625, "logits/rejected": 3.8359375, "logps/chosen": -552.5, "logps/rejected": -429.0, "loss": 0.6788, "rewards/accuracies": 0.6875, "rewards/chosen": 0.439697265625, "rewards/margins": 3.56640625, "rewards/rejected": -3.125, "step": 4485 }, { "epoch": 0.8476545892578771, "grad_norm": 2.292823913786126, "learning_rate": 1.6240797084695172e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.6875, "logps/chosen": -786.0, "logps/rejected": -1003.0, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": 0.537109375, "rewards/margins": 5.0234375, "rewards/rejected": -4.4921875, "step": 4486 }, { "epoch": 0.8478435448060844, "grad_norm": 1.1591385588241259, "learning_rate": 1.6225725369349506e-07, "logits/chosen": 2.87109375, "logits/rejected": 2.818359375, "logps/chosen": -426.75, "logps/rejected": -1570.0, "loss": 0.5993, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9462890625, "rewards/margins": 7.953125, "rewards/rejected": -7.0078125, "step": 4487 }, { "epoch": 0.8480325003542917, "grad_norm": 2.366273749054407, "learning_rate": 1.6210670522728456e-07, "logits/chosen": 2.4375, "logits/rejected": 2.099609375, "logps/chosen": -619.0, "logps/rejected": -558.0, "loss": 0.6053, "rewards/accuracies": 0.8125, "rewards/chosen": 0.203125, "rewards/margins": 3.8173828125, "rewards/rejected": -3.61328125, "step": 4488 }, { "epoch": 0.848221455902499, "grad_norm": 2.4118234009486517, "learning_rate": 1.6195632551381655e-07, "logits/chosen": 1.86328125, "logits/rejected": 2.2939453125, "logps/chosen": -424.0, "logps/rejected": -960.0, "loss": 0.6606, "rewards/accuracies": 0.8125, "rewards/chosen": -0.120361328125, "rewards/margins": 4.3203125, "rewards/rejected": -4.4453125, "step": 4489 }, { "epoch": 0.8484104114507062, "grad_norm": 2.656733909442291, "learning_rate": 1.6180611461851325e-07, "logits/chosen": 2.375, "logits/rejected": 2.349853515625, "logps/chosen": -763.5, "logps/rejected": -779.5, "loss": 0.6148, "rewards/accuracies": 0.75, "rewards/chosen": 0.956787109375, "rewards/margins": 4.86328125, "rewards/rejected": -3.9140625, "step": 4490 }, { "epoch": 0.8485993669989135, "grad_norm": 2.5132983301925997, "learning_rate": 1.616560726067241e-07, "logits/chosen": 2.84765625, "logits/rejected": 1.892578125, "logps/chosen": -646.5, "logps/rejected": -622.5, "loss": 0.548, "rewards/accuracies": 0.84375, "rewards/chosen": 0.955078125, "rewards/margins": 4.921875, "rewards/rejected": -3.96875, "step": 4491 }, { "epoch": 0.8487883225471208, "grad_norm": 4.7395531842272955, "learning_rate": 1.615061995437246e-07, "logits/chosen": 2.123046875, "logits/rejected": 2.462890625, "logps/chosen": -1067.5, "logps/rejected": -1607.0, "loss": 0.6846, "rewards/accuracies": 0.6875, "rewards/chosen": 0.640625, "rewards/margins": 7.8515625, "rewards/rejected": -7.203125, "step": 4492 }, { "epoch": 0.8489772780953281, "grad_norm": 2.230354078291231, "learning_rate": 1.6135649549471695e-07, "logits/chosen": 3.67578125, "logits/rejected": 3.2109375, "logps/chosen": -627.0, "logps/rejected": -681.0, "loss": 0.552, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0645751953125, "rewards/margins": 4.65625, "rewards/rejected": -3.603515625, "step": 4493 }, { "epoch": 0.8491662336435354, "grad_norm": 4.358947087972451, "learning_rate": 1.6120696052483001e-07, "logits/chosen": 3.009765625, "logits/rejected": 2.3056640625, "logps/chosen": -870.0, "logps/rejected": -1061.0, "loss": 0.3974, "rewards/accuracies": 0.9375, "rewards/chosen": 2.15234375, "rewards/margins": 6.75, "rewards/rejected": -4.5859375, "step": 4494 }, { "epoch": 0.8493551891917427, "grad_norm": 4.868613893557007, "learning_rate": 1.6105759469911862e-07, "logits/chosen": 3.765625, "logits/rejected": 3.73828125, "logps/chosen": -638.5, "logps/rejected": -814.0, "loss": 0.5405, "rewards/accuracies": 0.71875, "rewards/chosen": 1.39794921875, "rewards/margins": 4.30859375, "rewards/rejected": -2.8984375, "step": 4495 }, { "epoch": 0.8495441447399499, "grad_norm": 1.6818389887047802, "learning_rate": 1.6090839808256445e-07, "logits/chosen": 1.900390625, "logits/rejected": 1.62109375, "logps/chosen": -713.0, "logps/rejected": -1060.0, "loss": 0.4481, "rewards/accuracies": 0.90625, "rewards/chosen": 1.30859375, "rewards/margins": 8.4453125, "rewards/rejected": -7.1328125, "step": 4496 }, { "epoch": 0.8497331002881572, "grad_norm": 3.6319932844757448, "learning_rate": 1.6075937074007533e-07, "logits/chosen": 2.408203125, "logits/rejected": 2.654296875, "logps/chosen": -565.0, "logps/rejected": -1199.0, "loss": 0.5529, "rewards/accuracies": 0.875, "rewards/chosen": 0.328857421875, "rewards/margins": 5.08203125, "rewards/rejected": -4.76953125, "step": 4497 }, { "epoch": 0.8499220558363645, "grad_norm": 1.3997377601698062, "learning_rate": 1.6061051273648547e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.427734375, "logps/chosen": -972.0, "logps/rejected": -807.5, "loss": 0.4899, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4169921875, "rewards/margins": 5.35546875, "rewards/rejected": -3.931640625, "step": 4498 }, { "epoch": 0.8501110113845718, "grad_norm": 3.3270172831733675, "learning_rate": 1.604618241365557e-07, "logits/chosen": 3.056640625, "logits/rejected": 2.234375, "logps/chosen": -675.0, "logps/rejected": -725.5, "loss": 0.6678, "rewards/accuracies": 0.8125, "rewards/chosen": 0.345947265625, "rewards/margins": 3.61328125, "rewards/rejected": -3.2734375, "step": 4499 }, { "epoch": 0.8502999669327791, "grad_norm": 2.335810812739971, "learning_rate": 1.6031330500497257e-07, "logits/chosen": 3.8671875, "logits/rejected": 3.90625, "logps/chosen": -451.0, "logps/rejected": -563.0, "loss": 0.7179, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5654296875, "rewards/margins": 2.7890625, "rewards/rejected": -3.359375, "step": 4500 }, { "epoch": 0.8504889224809864, "grad_norm": 2.915436894053474, "learning_rate": 1.6016495540634945e-07, "logits/chosen": 2.421875, "logits/rejected": 2.84375, "logps/chosen": -883.0, "logps/rejected": -869.5, "loss": 0.6425, "rewards/accuracies": 0.75, "rewards/chosen": 0.85565185546875, "rewards/margins": 4.2265625, "rewards/rejected": -3.3681640625, "step": 4501 }, { "epoch": 0.8506778780291936, "grad_norm": 1.7869396730401323, "learning_rate": 1.6001677540522567e-07, "logits/chosen": 2.767578125, "logits/rejected": 3.10546875, "logps/chosen": -772.0, "logps/rejected": -1166.0, "loss": 0.5815, "rewards/accuracies": 0.84375, "rewards/chosen": 0.71484375, "rewards/margins": 8.22265625, "rewards/rejected": -7.5, "step": 4502 }, { "epoch": 0.8508668335774009, "grad_norm": 2.27406016102441, "learning_rate": 1.5986876506606686e-07, "logits/chosen": 2.78125, "logits/rejected": 2.626953125, "logps/chosen": -723.0, "logps/rejected": -946.0, "loss": 0.5368, "rewards/accuracies": 0.84375, "rewards/chosen": 1.003662109375, "rewards/margins": 7.4921875, "rewards/rejected": -6.48828125, "step": 4503 }, { "epoch": 0.8510557891256082, "grad_norm": 4.181944132527012, "learning_rate": 1.597209244532649e-07, "logits/chosen": 2.330078125, "logits/rejected": 2.0244140625, "logps/chosen": -680.0, "logps/rejected": -617.5, "loss": 0.6858, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3154296875, "rewards/margins": 3.0703125, "rewards/rejected": -2.751953125, "step": 4504 }, { "epoch": 0.8512447446738155, "grad_norm": 2.382518260893342, "learning_rate": 1.5957325363113752e-07, "logits/chosen": 2.9375, "logits/rejected": 2.068359375, "logps/chosen": -637.0, "logps/rejected": -737.0, "loss": 0.6185, "rewards/accuracies": 0.78125, "rewards/chosen": 0.919921875, "rewards/margins": 4.8203125, "rewards/rejected": -3.890625, "step": 4505 }, { "epoch": 0.8514337002220228, "grad_norm": 3.8226150583122873, "learning_rate": 1.594257526639291e-07, "logits/chosen": 3.25, "logits/rejected": 3.046875, "logps/chosen": -976.0, "logps/rejected": -909.0, "loss": 0.5734, "rewards/accuracies": 0.75, "rewards/chosen": 0.982421875, "rewards/margins": 5.21875, "rewards/rejected": -4.24609375, "step": 4506 }, { "epoch": 0.8516226557702301, "grad_norm": 3.6884260918261726, "learning_rate": 1.5927842161580976e-07, "logits/chosen": 2.358642578125, "logits/rejected": 2.3125, "logps/chosen": -814.5, "logps/rejected": -911.0, "loss": 0.5534, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04345703125, "rewards/margins": 5.1796875, "rewards/rejected": -5.140625, "step": 4507 }, { "epoch": 0.8518116113184373, "grad_norm": 2.590733694799785, "learning_rate": 1.5913126055087578e-07, "logits/chosen": 2.2236328125, "logits/rejected": 1.75732421875, "logps/chosen": -728.5, "logps/rejected": -818.0, "loss": 0.6115, "rewards/accuracies": 0.90625, "rewards/chosen": 0.56689453125, "rewards/margins": 4.3671875, "rewards/rejected": -3.8046875, "step": 4508 }, { "epoch": 0.8520005668666446, "grad_norm": 2.4064514122783693, "learning_rate": 1.5898426953314942e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.650390625, "logps/chosen": -1143.0, "logps/rejected": -973.0, "loss": 0.4702, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6583251953125, "rewards/margins": 6.0546875, "rewards/rejected": -5.390625, "step": 4509 }, { "epoch": 0.8521895224148519, "grad_norm": 2.966643198107903, "learning_rate": 1.5883744862657917e-07, "logits/chosen": 2.095703125, "logits/rejected": 2.060546875, "logps/chosen": -698.0, "logps/rejected": -1248.0, "loss": 0.5553, "rewards/accuracies": 0.875, "rewards/chosen": 0.669830322265625, "rewards/margins": 5.5625, "rewards/rejected": -4.8984375, "step": 4510 }, { "epoch": 0.8523784779630592, "grad_norm": 2.4643902037462295, "learning_rate": 1.5869079789503915e-07, "logits/chosen": 3.10546875, "logits/rejected": 2.63671875, "logps/chosen": -816.5, "logps/rejected": -848.5, "loss": 0.4932, "rewards/accuracies": 0.84375, "rewards/chosen": 0.790283203125, "rewards/margins": 5.5546875, "rewards/rejected": -4.765625, "step": 4511 }, { "epoch": 0.8525674335112665, "grad_norm": 5.150325978541068, "learning_rate": 1.5854431740232988e-07, "logits/chosen": 2.244140625, "logits/rejected": 1.8701171875, "logps/chosen": -696.0, "logps/rejected": -735.5, "loss": 0.7045, "rewards/accuracies": 0.6875, "rewards/chosen": 0.606201171875, "rewards/margins": 3.3046875, "rewards/rejected": -2.69921875, "step": 4512 }, { "epoch": 0.8527563890594737, "grad_norm": 2.3264441542978025, "learning_rate": 1.5839800721217756e-07, "logits/chosen": 2.6640625, "logits/rejected": 2.779296875, "logps/chosen": -14203.0, "logps/rejected": -865.0, "loss": 0.582, "rewards/accuracies": 0.8125, "rewards/chosen": 170.69140625, "rewards/margins": 174.86328125, "rewards/rejected": -3.677734375, "step": 4513 }, { "epoch": 0.852945344607681, "grad_norm": 2.508934138776303, "learning_rate": 1.5825186738823438e-07, "logits/chosen": 3.3984375, "logits/rejected": 2.9609375, "logps/chosen": -729.0, "logps/rejected": -1020.0, "loss": 0.5843, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9827880859375, "rewards/margins": 7.8671875, "rewards/rejected": -6.87109375, "step": 4514 }, { "epoch": 0.8531343001558883, "grad_norm": 3.046461923676444, "learning_rate": 1.5810589799407826e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.1484375, "logps/chosen": -695.0, "logps/rejected": -753.0, "loss": 0.5138, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6171875, "rewards/margins": 5.25, "rewards/rejected": -3.642578125, "step": 4515 }, { "epoch": 0.8533232557040956, "grad_norm": 2.628852616562037, "learning_rate": 1.579600990932131e-07, "logits/chosen": 1.8076171875, "logits/rejected": 2.015625, "logps/chosen": -1114.0, "logps/rejected": -1616.0, "loss": 0.5721, "rewards/accuracies": 0.78125, "rewards/chosen": 1.76611328125, "rewards/margins": 7.5625, "rewards/rejected": -5.80078125, "step": 4516 }, { "epoch": 0.8535122112523029, "grad_norm": 2.1783333700957175, "learning_rate": 1.578144707490689e-07, "logits/chosen": 2.86328125, "logits/rejected": 2.609375, "logps/chosen": -792.0, "logps/rejected": -1127.5, "loss": 0.5375, "rewards/accuracies": 0.875, "rewards/chosen": 1.0718994140625, "rewards/margins": 6.3515625, "rewards/rejected": -5.2734375, "step": 4517 }, { "epoch": 0.8537011668005102, "grad_norm": 2.3283333213687927, "learning_rate": 1.576690130250008e-07, "logits/chosen": 2.251953125, "logits/rejected": 1.60546875, "logps/chosen": -1092.0, "logps/rejected": -920.0, "loss": 0.473, "rewards/accuracies": 0.875, "rewards/chosen": 0.44921875, "rewards/margins": 4.78515625, "rewards/rejected": -4.34765625, "step": 4518 }, { "epoch": 0.8538901223487174, "grad_norm": 3.3786336238318904, "learning_rate": 1.5752372598429033e-07, "logits/chosen": 3.515625, "logits/rejected": 3.03515625, "logps/chosen": -645.0, "logps/rejected": -836.5, "loss": 0.5788, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6015625, "rewards/margins": 5.1328125, "rewards/rejected": -4.52734375, "step": 4519 }, { "epoch": 0.8540790778969247, "grad_norm": 2.7311891469750593, "learning_rate": 1.5737860969014445e-07, "logits/chosen": 3.6640625, "logits/rejected": 3.935546875, "logps/chosen": -1029.0, "logps/rejected": -1919.0, "loss": 0.5606, "rewards/accuracies": 0.8125, "rewards/chosen": -0.62109375, "rewards/margins": 5.4296875, "rewards/rejected": -6.04296875, "step": 4520 }, { "epoch": 0.854268033445132, "grad_norm": 2.0622330025030906, "learning_rate": 1.5723366420569596e-07, "logits/chosen": 2.642578125, "logits/rejected": 2.522705078125, "logps/chosen": -796.5, "logps/rejected": -791.0, "loss": 0.4941, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4453125, "rewards/margins": 5.12890625, "rewards/rejected": -4.6953125, "step": 4521 }, { "epoch": 0.8544569889933393, "grad_norm": 2.373924987373645, "learning_rate": 1.5708888959400335e-07, "logits/chosen": 3.46484375, "logits/rejected": 2.796875, "logps/chosen": -955.5, "logps/rejected": -1592.0, "loss": 0.3893, "rewards/accuracies": 0.96875, "rewards/chosen": 2.263671875, "rewards/margins": 9.5, "rewards/rejected": -7.2421875, "step": 4522 }, { "epoch": 0.8546459445415466, "grad_norm": 2.4354826526299758, "learning_rate": 1.5694428591805047e-07, "logits/chosen": 2.388671875, "logits/rejected": 1.962890625, "logps/chosen": -818.0, "logps/rejected": -642.0, "loss": 0.5135, "rewards/accuracies": 0.9375, "rewards/chosen": 1.125, "rewards/margins": 4.81640625, "rewards/rejected": -3.69140625, "step": 4523 }, { "epoch": 0.8548349000897539, "grad_norm": 2.419816065002025, "learning_rate": 1.567998532407474e-07, "logits/chosen": 1.818359375, "logits/rejected": 1.5009765625, "logps/chosen": -868.5, "logps/rejected": -928.0, "loss": 0.4576, "rewards/accuracies": 0.9375, "rewards/chosen": 1.193359375, "rewards/margins": 6.4296875, "rewards/rejected": -5.2265625, "step": 4524 }, { "epoch": 0.8550238556379611, "grad_norm": 1.5785847791873329, "learning_rate": 1.5665559162492936e-07, "logits/chosen": 2.146484375, "logits/rejected": 2.263671875, "logps/chosen": -780.5, "logps/rejected": -836.0, "loss": 0.5812, "rewards/accuracies": 0.8125, "rewards/chosen": 0.326171875, "rewards/margins": 4.4921875, "rewards/rejected": -4.1640625, "step": 4525 }, { "epoch": 0.8552128111861684, "grad_norm": 2.234291476591501, "learning_rate": 1.5651150113335706e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.1123046875, "logps/chosen": -680.5, "logps/rejected": -613.0, "loss": 0.5421, "rewards/accuracies": 0.875, "rewards/chosen": 0.5256500244140625, "rewards/margins": 4.87890625, "rewards/rejected": -4.3515625, "step": 4526 }, { "epoch": 0.8554017667343757, "grad_norm": 2.3665718447098603, "learning_rate": 1.5636758182871737e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.224609375, "logps/chosen": -970.0, "logps/rejected": -908.0, "loss": 0.5972, "rewards/accuracies": 0.78125, "rewards/chosen": 1.136474609375, "rewards/margins": 4.94921875, "rewards/rejected": -3.8125, "step": 4527 }, { "epoch": 0.855590722282583, "grad_norm": 2.196319534776292, "learning_rate": 1.5622383377362207e-07, "logits/chosen": 2.97265625, "logits/rejected": 2.494140625, "logps/chosen": -786.5, "logps/rejected": -754.5, "loss": 0.6677, "rewards/accuracies": 0.71875, "rewards/chosen": 9.4425048828125, "rewards/margins": 12.7578125, "rewards/rejected": -3.2734375, "step": 4528 }, { "epoch": 0.8557796778307903, "grad_norm": 2.8730447779003274, "learning_rate": 1.5608025703060868e-07, "logits/chosen": 2.064453125, "logits/rejected": 1.599609375, "logps/chosen": -539.5, "logps/rejected": -556.0, "loss": 0.6005, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1983642578125, "rewards/margins": 3.72265625, "rewards/rejected": -3.53515625, "step": 4529 }, { "epoch": 0.8559686333789976, "grad_norm": 4.884723047946635, "learning_rate": 1.559368516621402e-07, "logits/chosen": 1.84521484375, "logits/rejected": 1.55029296875, "logps/chosen": -852.0, "logps/rejected": -1043.0, "loss": 0.6171, "rewards/accuracies": 0.75, "rewards/chosen": 0.1121826171875, "rewards/margins": 5.2109375, "rewards/rejected": -5.1015625, "step": 4530 }, { "epoch": 0.8561575889272048, "grad_norm": 2.8650044271670834, "learning_rate": 1.5579361773060505e-07, "logits/chosen": 2.66796875, "logits/rejected": 2.359375, "logps/chosen": -782.0, "logps/rejected": -861.0, "loss": 0.6055, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9840087890625, "rewards/margins": 5.0078125, "rewards/rejected": -4.0234375, "step": 4531 }, { "epoch": 0.8563465444754121, "grad_norm": 1.6054394863354027, "learning_rate": 1.5565055529831695e-07, "logits/chosen": 3.296875, "logits/rejected": 3.17578125, "logps/chosen": -463.75, "logps/rejected": -1527.0, "loss": 0.5111, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51800537109375, "rewards/margins": 9.6171875, "rewards/rejected": -9.0625, "step": 4532 }, { "epoch": 0.8565355000236194, "grad_norm": 3.1848283438779292, "learning_rate": 1.5550766442751533e-07, "logits/chosen": 2.994140625, "logits/rejected": 3.125, "logps/chosen": -608.5, "logps/rejected": -1222.0, "loss": 0.522, "rewards/accuracies": 0.8125, "rewards/chosen": 1.083984375, "rewards/margins": 6.78125, "rewards/rejected": -5.6875, "step": 4533 }, { "epoch": 0.8567244555718267, "grad_norm": 1.9336889023809094, "learning_rate": 1.5536494518036466e-07, "logits/chosen": 3.28125, "logits/rejected": 2.83203125, "logps/chosen": -880.5, "logps/rejected": -833.5, "loss": 0.632, "rewards/accuracies": 0.71875, "rewards/chosen": 0.83642578125, "rewards/margins": 4.453125, "rewards/rejected": -3.61328125, "step": 4534 }, { "epoch": 0.856913411120034, "grad_norm": 2.9827904267228087, "learning_rate": 1.5522239761895492e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.5859375, "logps/chosen": -675.5, "logps/rejected": -1047.0, "loss": 0.5767, "rewards/accuracies": 0.84375, "rewards/chosen": 1.041015625, "rewards/margins": 6.6015625, "rewards/rejected": -5.5546875, "step": 4535 }, { "epoch": 0.8571023666682412, "grad_norm": 2.879565686105358, "learning_rate": 1.5508002180530125e-07, "logits/chosen": 3.19921875, "logits/rejected": 2.7890625, "logps/chosen": -945.0, "logps/rejected": -797.0, "loss": 0.59, "rewards/accuracies": 0.78125, "rewards/chosen": 0.92822265625, "rewards/margins": 3.82421875, "rewards/rejected": -2.8984375, "step": 4536 }, { "epoch": 0.8572913222164485, "grad_norm": 1.7360728671802055, "learning_rate": 1.5493781780134413e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.38671875, "logps/chosen": -757.0, "logps/rejected": -607.0, "loss": 0.641, "rewards/accuracies": 0.78125, "rewards/chosen": 0.30908203125, "rewards/margins": 3.671875, "rewards/rejected": -3.3671875, "step": 4537 }, { "epoch": 0.8574802777646559, "grad_norm": 2.54531706581794, "learning_rate": 1.547957856689496e-07, "logits/chosen": 2.478515625, "logits/rejected": 2.697265625, "logps/chosen": -635.0, "logps/rejected": -678.0, "loss": 0.5851, "rewards/accuracies": 0.84375, "rewards/chosen": 0.531982421875, "rewards/margins": 4.0703125, "rewards/rejected": -3.52734375, "step": 4538 }, { "epoch": 0.8576692333128632, "grad_norm": 2.23920481332418, "learning_rate": 1.546539254699083e-07, "logits/chosen": 2.201171875, "logits/rejected": 2.216796875, "logps/chosen": -594.5, "logps/rejected": -824.5, "loss": 0.6065, "rewards/accuracies": 0.84375, "rewards/chosen": 0.63916015625, "rewards/margins": 7.0859375, "rewards/rejected": -6.4609375, "step": 4539 }, { "epoch": 0.8578581888610705, "grad_norm": 1.5563122751592395, "learning_rate": 1.545122372659366e-07, "logits/chosen": 2.216796875, "logits/rejected": 1.62109375, "logps/chosen": -785.0, "logps/rejected": -786.0, "loss": 0.6282, "rewards/accuracies": 0.78125, "rewards/chosen": 0.33447265625, "rewards/margins": 4.671875, "rewards/rejected": -4.33203125, "step": 4540 }, { "epoch": 0.8580471444092778, "grad_norm": 2.544464695603014, "learning_rate": 1.5437072111867587e-07, "logits/chosen": 2.734375, "logits/rejected": 2.837890625, "logps/chosen": -475.5, "logps/rejected": -14094.5, "loss": 0.656, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7216796875, "rewards/margins": -158.07421875, "rewards/rejected": 158.390625, "step": 4541 }, { "epoch": 0.858236099957485, "grad_norm": 3.0476767873508543, "learning_rate": 1.5422937708969257e-07, "logits/chosen": 1.658935546875, "logits/rejected": 1.41943359375, "logps/chosen": -607.75, "logps/rejected": -945.0, "loss": 0.5678, "rewards/accuracies": 0.8125, "rewards/chosen": 0.73193359375, "rewards/margins": 5.34375, "rewards/rejected": -4.60546875, "step": 4542 }, { "epoch": 0.8584250555056923, "grad_norm": 2.0219825827613245, "learning_rate": 1.540882052404785e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.9296875, "logps/chosen": -1065.5, "logps/rejected": -1105.5, "loss": 0.437, "rewards/accuracies": 0.875, "rewards/chosen": 1.44140625, "rewards/margins": 8.80078125, "rewards/rejected": -7.33984375, "step": 4543 }, { "epoch": 0.8586140110538996, "grad_norm": 2.6035160551511534, "learning_rate": 1.5394720563245007e-07, "logits/chosen": 2.05859375, "logits/rejected": 1.458984375, "logps/chosen": -955.0, "logps/rejected": -869.0, "loss": 0.4631, "rewards/accuracies": 0.84375, "rewards/chosen": 1.1923828125, "rewards/margins": 5.921875, "rewards/rejected": -4.734375, "step": 4544 }, { "epoch": 0.8588029666021069, "grad_norm": 3.287222438184677, "learning_rate": 1.5380637832694933e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.0, "logps/chosen": -810.5, "logps/rejected": -895.5, "loss": 0.5606, "rewards/accuracies": 0.84375, "rewards/chosen": 1.128173828125, "rewards/margins": 4.41796875, "rewards/rejected": -3.29296875, "step": 4545 }, { "epoch": 0.8589919221503142, "grad_norm": 3.1324857085978324, "learning_rate": 1.5366572338524305e-07, "logits/chosen": 3.6796875, "logits/rejected": 2.77734375, "logps/chosen": -813.0, "logps/rejected": -909.0, "loss": 0.4808, "rewards/accuracies": 0.78125, "rewards/chosen": 1.22607421875, "rewards/margins": 6.9453125, "rewards/rejected": -5.7109375, "step": 4546 }, { "epoch": 0.8591808776985215, "grad_norm": 2.378932552058748, "learning_rate": 1.53525240868523e-07, "logits/chosen": 3.2734375, "logits/rejected": 3.06640625, "logps/chosen": -734.0, "logps/rejected": -683.0, "loss": 0.744, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5679931640625, "rewards/margins": 2.77734375, "rewards/rejected": -2.20703125, "step": 4547 }, { "epoch": 0.8593698332467287, "grad_norm": 1.6345485206795716, "learning_rate": 1.5338493083790628e-07, "logits/chosen": 2.23828125, "logits/rejected": 2.421875, "logps/chosen": -1022.0, "logps/rejected": -2698.0, "loss": 0.4885, "rewards/accuracies": 0.875, "rewards/chosen": 1.6162109375, "rewards/margins": 17.78515625, "rewards/rejected": -16.12109375, "step": 4548 }, { "epoch": 0.859558788794936, "grad_norm": 2.5034325991255884, "learning_rate": 1.532447933544343e-07, "logits/chosen": 3.53125, "logits/rejected": 3.25390625, "logps/chosen": -1113.0, "logps/rejected": -1598.0, "loss": 0.5821, "rewards/accuracies": 0.8125, "rewards/chosen": 0.52392578125, "rewards/margins": 6.5859375, "rewards/rejected": -6.05859375, "step": 4549 }, { "epoch": 0.8597477443431433, "grad_norm": 2.183979362677969, "learning_rate": 1.5310482847907403e-07, "logits/chosen": 2.65234375, "logits/rejected": 2.21484375, "logps/chosen": -610.5, "logps/rejected": -716.5, "loss": 0.6021, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9013671875, "rewards/margins": 4.759765625, "rewards/rejected": -3.869140625, "step": 4550 }, { "epoch": 0.8599366998913506, "grad_norm": 1.7547939611227577, "learning_rate": 1.5296503627271705e-07, "logits/chosen": 3.73046875, "logits/rejected": 3.0859375, "logps/chosen": -622.5, "logps/rejected": -1842.5, "loss": 0.5455, "rewards/accuracies": 0.84375, "rewards/chosen": 0.521484375, "rewards/margins": 10.41796875, "rewards/rejected": -9.8828125, "step": 4551 }, { "epoch": 0.8601256554395579, "grad_norm": 2.0139558637778343, "learning_rate": 1.5282541679617986e-07, "logits/chosen": 3.37109375, "logits/rejected": 3.19140625, "logps/chosen": -524.0, "logps/rejected": -939.5, "loss": 0.5901, "rewards/accuracies": 0.8125, "rewards/chosen": 0.83935546875, "rewards/margins": 5.2578125, "rewards/rejected": -4.41796875, "step": 4552 }, { "epoch": 0.8603146109877652, "grad_norm": 1.7772777036126834, "learning_rate": 1.5268597011020385e-07, "logits/chosen": 3.478515625, "logits/rejected": 2.59765625, "logps/chosen": -906.0, "logps/rejected": -1013.0, "loss": 0.4689, "rewards/accuracies": 0.78125, "rewards/chosen": 0.921875, "rewards/margins": 8.703125, "rewards/rejected": -7.77734375, "step": 4553 }, { "epoch": 0.8605035665359724, "grad_norm": 2.512689294364122, "learning_rate": 1.525466962754551e-07, "logits/chosen": 3.109375, "logits/rejected": 2.533203125, "logps/chosen": -580.0, "logps/rejected": -589.0, "loss": 0.5891, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8909912109375, "rewards/margins": 4.71875, "rewards/rejected": -3.83203125, "step": 4554 }, { "epoch": 0.8606925220841797, "grad_norm": 3.28194420018539, "learning_rate": 1.5240759535252485e-07, "logits/chosen": 3.80859375, "logits/rejected": 3.49609375, "logps/chosen": -844.5, "logps/rejected": -789.5, "loss": 0.5919, "rewards/accuracies": 0.8125, "rewards/chosen": 1.310302734375, "rewards/margins": 5.7578125, "rewards/rejected": -4.4453125, "step": 4555 }, { "epoch": 0.860881477632387, "grad_norm": 1.8963077111003805, "learning_rate": 1.5226866740192868e-07, "logits/chosen": 2.53125, "logits/rejected": 2.361328125, "logps/chosen": -757.0, "logps/rejected": -952.0, "loss": 0.6024, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0487060546875, "rewards/margins": 5.78125, "rewards/rejected": -5.734375, "step": 4556 }, { "epoch": 0.8610704331805943, "grad_norm": 5.344370874945946, "learning_rate": 1.521299124841072e-07, "logits/chosen": 2.302734375, "logits/rejected": 2.34375, "logps/chosen": -537.0, "logps/rejected": -1124.0, "loss": 0.7068, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6904296875, "rewards/margins": 4.1484375, "rewards/rejected": -3.466796875, "step": 4557 }, { "epoch": 0.8612593887288016, "grad_norm": 2.8440490251845967, "learning_rate": 1.5199133065942564e-07, "logits/chosen": 3.4453125, "logits/rejected": 3.91015625, "logps/chosen": -550.75, "logps/rejected": -1468.0, "loss": 0.6409, "rewards/accuracies": 0.75, "rewards/chosen": 0.418212890625, "rewards/margins": 6.41015625, "rewards/rejected": -5.982421875, "step": 4558 }, { "epoch": 0.8614483442770088, "grad_norm": 3.4658464414225487, "learning_rate": 1.5185292198817382e-07, "logits/chosen": 3.017578125, "logits/rejected": 2.619140625, "logps/chosen": -552.5, "logps/rejected": -971.5, "loss": 0.447, "rewards/accuracies": 1.0, "rewards/chosen": 1.005859375, "rewards/margins": 5.265625, "rewards/rejected": -4.2734375, "step": 4559 }, { "epoch": 0.8616372998252161, "grad_norm": 2.284418552219393, "learning_rate": 1.5171468653056646e-07, "logits/chosen": 3.0625, "logits/rejected": 2.93359375, "logps/chosen": -703.5, "logps/rejected": -861.0, "loss": 0.5849, "rewards/accuracies": 0.8125, "rewards/chosen": 1.612060546875, "rewards/margins": 4.720703125, "rewards/rejected": -3.10986328125, "step": 4560 }, { "epoch": 0.8618262553734234, "grad_norm": 2.605574035987811, "learning_rate": 1.515766243467428e-07, "logits/chosen": 3.17578125, "logits/rejected": 3.03125, "logps/chosen": -508.5, "logps/rejected": -907.0, "loss": 0.5244, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0205078125, "rewards/margins": 6.55078125, "rewards/rejected": -5.5390625, "step": 4561 }, { "epoch": 0.8620152109216307, "grad_norm": 1.927379794679582, "learning_rate": 1.5143873549676675e-07, "logits/chosen": 3.1328125, "logits/rejected": 2.833984375, "logps/chosen": -684.5, "logps/rejected": -629.5, "loss": 0.7427, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2679443359375, "rewards/margins": 2.71142578125, "rewards/rejected": -2.451171875, "step": 4562 }, { "epoch": 0.862204166469838, "grad_norm": 2.9106997012977103, "learning_rate": 1.5130102004062667e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.638671875, "logps/chosen": -591.0, "logps/rejected": -788.0, "loss": 0.5162, "rewards/accuracies": 0.875, "rewards/chosen": 1.0625, "rewards/margins": 4.74609375, "rewards/rejected": -3.68359375, "step": 4563 }, { "epoch": 0.8623931220180453, "grad_norm": 1.878210628877901, "learning_rate": 1.5116347803823576e-07, "logits/chosen": 2.10546875, "logits/rejected": 2.19140625, "logps/chosen": -567.5, "logps/rejected": -919.0, "loss": 0.6113, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6943359375, "rewards/margins": 4.19921875, "rewards/rejected": -3.5078125, "step": 4564 }, { "epoch": 0.8625820775662525, "grad_norm": 2.3890373536638037, "learning_rate": 1.510261095494313e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.6171875, "logps/chosen": -783.0, "logps/rejected": -841.0, "loss": 0.611, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7158203125, "rewards/margins": 4.08984375, "rewards/rejected": -3.37109375, "step": 4565 }, { "epoch": 0.8627710331144598, "grad_norm": 3.6760093460634264, "learning_rate": 1.508889146339757e-07, "logits/chosen": 3.0703125, "logits/rejected": 2.25390625, "logps/chosen": -968.0, "logps/rejected": -1003.0, "loss": 0.586, "rewards/accuracies": 0.75, "rewards/chosen": 1.26953125, "rewards/margins": 4.16015625, "rewards/rejected": -2.89453125, "step": 4566 }, { "epoch": 0.8629599886626671, "grad_norm": 2.8843602434493056, "learning_rate": 1.507518933515552e-07, "logits/chosen": 2.568359375, "logits/rejected": 2.7021484375, "logps/chosen": -446.0, "logps/rejected": -989.5, "loss": 0.6353, "rewards/accuracies": 0.875, "rewards/chosen": 0.35107421875, "rewards/margins": 8.828125, "rewards/rejected": -8.46484375, "step": 4567 }, { "epoch": 0.8631489442108744, "grad_norm": 2.5779824647583034, "learning_rate": 1.5061504576178102e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.560546875, "logps/chosen": -705.0, "logps/rejected": -695.0, "loss": 0.5768, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0361328125, "rewards/margins": 4.5234375, "rewards/rejected": -3.4951171875, "step": 4568 }, { "epoch": 0.8633378997590817, "grad_norm": 3.5095433157823095, "learning_rate": 1.5047837192418873e-07, "logits/chosen": 3.46484375, "logits/rejected": 3.4609375, "logps/chosen": -480.5, "logps/rejected": -581.0, "loss": 0.5838, "rewards/accuracies": 0.90625, "rewards/chosen": 1.126953125, "rewards/margins": 4.02734375, "rewards/rejected": -2.900390625, "step": 4569 }, { "epoch": 0.863526855307289, "grad_norm": 2.5241587035818682, "learning_rate": 1.5034187189823788e-07, "logits/chosen": 3.759765625, "logits/rejected": 3.51953125, "logps/chosen": -484.0, "logps/rejected": -553.0, "loss": 0.6568, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4892578125, "rewards/margins": 4.21484375, "rewards/rejected": -3.73046875, "step": 4570 }, { "epoch": 0.8637158108554962, "grad_norm": 2.391791063717196, "learning_rate": 1.5020554574331295e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.4921875, "logps/chosen": -956.5, "logps/rejected": -937.5, "loss": 0.6387, "rewards/accuracies": 0.8125, "rewards/chosen": 0.84814453125, "rewards/margins": 4.947265625, "rewards/rejected": -4.099609375, "step": 4571 }, { "epoch": 0.8639047664037035, "grad_norm": 1.964397025642275, "learning_rate": 1.5006939351872252e-07, "logits/chosen": 2.130859375, "logits/rejected": 2.021484375, "logps/chosen": -528.0, "logps/rejected": -893.5, "loss": 0.5352, "rewards/accuracies": 0.8125, "rewards/chosen": 1.34521484375, "rewards/margins": 5.56640625, "rewards/rejected": -4.21875, "step": 4572 }, { "epoch": 0.8640937219519108, "grad_norm": 1.7689599190936258, "learning_rate": 1.4993341528369946e-07, "logits/chosen": 4.1015625, "logits/rejected": 4.03515625, "logps/chosen": -513.5, "logps/rejected": -1071.5, "loss": 0.517, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8568115234375, "rewards/margins": 7.09765625, "rewards/rejected": -6.25, "step": 4573 }, { "epoch": 0.8642826775001181, "grad_norm": 2.7538863459817713, "learning_rate": 1.4979761109740119e-07, "logits/chosen": 3.21875, "logits/rejected": 2.77734375, "logps/chosen": -1602.0, "logps/rejected": -1409.0, "loss": 0.4106, "rewards/accuracies": 0.90625, "rewards/chosen": 1.95703125, "rewards/margins": 8.796875, "rewards/rejected": -6.8359375, "step": 4574 }, { "epoch": 0.8644716330483254, "grad_norm": 6.038645368236627, "learning_rate": 1.4966198101890894e-07, "logits/chosen": 1.439453125, "logits/rejected": 1.01904296875, "logps/chosen": -674.0, "logps/rejected": -673.5, "loss": 0.6346, "rewards/accuracies": 0.6875, "rewards/chosen": 0.98046875, "rewards/margins": 3.8984375, "rewards/rejected": -2.916015625, "step": 4575 }, { "epoch": 0.8646605885965327, "grad_norm": 1.9871028784605977, "learning_rate": 1.4952652510722882e-07, "logits/chosen": 2.474609375, "logits/rejected": 2.251953125, "logps/chosen": -721.0, "logps/rejected": -880.5, "loss": 0.5657, "rewards/accuracies": 0.84375, "rewards/chosen": 1.408203125, "rewards/margins": 4.85546875, "rewards/rejected": -3.44140625, "step": 4576 }, { "epoch": 0.8648495441447399, "grad_norm": 2.4001416580587964, "learning_rate": 1.4939124342129068e-07, "logits/chosen": 2.36328125, "logits/rejected": 2.267578125, "logps/chosen": -1168.0, "logps/rejected": -1803.0, "loss": 0.4848, "rewards/accuracies": 0.875, "rewards/chosen": 1.0126953125, "rewards/margins": 6.9140625, "rewards/rejected": -5.90234375, "step": 4577 }, { "epoch": 0.8650384996929472, "grad_norm": 2.3696093233954643, "learning_rate": 1.492561360199487e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.734375, "logps/chosen": -650.0, "logps/rejected": -591.5, "loss": 0.7071, "rewards/accuracies": 0.8125, "rewards/chosen": 0.84814453125, "rewards/margins": 3.515625, "rewards/rejected": -2.677734375, "step": 4578 }, { "epoch": 0.8652274552411545, "grad_norm": 2.4249958696245244, "learning_rate": 1.4912120296198137e-07, "logits/chosen": 3.6484375, "logits/rejected": 3.1171875, "logps/chosen": -725.0, "logps/rejected": -693.5, "loss": 0.7392, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21923828125, "rewards/margins": 3.95703125, "rewards/rejected": -4.171875, "step": 4579 }, { "epoch": 0.8654164107893618, "grad_norm": 1.7834937817473542, "learning_rate": 1.4898644430609108e-07, "logits/chosen": 3.38671875, "logits/rejected": 3.17578125, "logps/chosen": -13946.0, "logps/rejected": -691.5, "loss": 0.5791, "rewards/accuracies": 0.78125, "rewards/chosen": 170.25390625, "rewards/margins": 174.576171875, "rewards/rejected": -4.8203125, "step": 4580 }, { "epoch": 0.8656053663375691, "grad_norm": 2.9088845981940716, "learning_rate": 1.488518601109047e-07, "logits/chosen": 3.9453125, "logits/rejected": 3.373046875, "logps/chosen": -493.5, "logps/rejected": -758.0, "loss": 0.5645, "rewards/accuracies": 0.875, "rewards/chosen": 1.0654296875, "rewards/margins": 4.27734375, "rewards/rejected": -3.2109375, "step": 4581 }, { "epoch": 0.8657943218857763, "grad_norm": 5.031802156880292, "learning_rate": 1.4871745043497295e-07, "logits/chosen": 2.5703125, "logits/rejected": 2.060546875, "logps/chosen": -1155.5, "logps/rejected": -1066.5, "loss": 0.4972, "rewards/accuracies": 0.90625, "rewards/chosen": 1.748779296875, "rewards/margins": 6.6796875, "rewards/rejected": -4.9296875, "step": 4582 }, { "epoch": 0.8659832774339836, "grad_norm": 2.8002819705965325, "learning_rate": 1.4858321533677055e-07, "logits/chosen": 2.212890625, "logits/rejected": 1.94873046875, "logps/chosen": -845.0, "logps/rejected": -1660.0, "loss": 0.4485, "rewards/accuracies": 0.90625, "rewards/chosen": 1.672515869140625, "rewards/margins": 9.96875, "rewards/rejected": -8.296875, "step": 4583 }, { "epoch": 0.8661722329821909, "grad_norm": 1.8111106521991345, "learning_rate": 1.4844915487469655e-07, "logits/chosen": 3.78125, "logits/rejected": 3.41015625, "logps/chosen": -782.5, "logps/rejected": -953.5, "loss": 0.5263, "rewards/accuracies": 0.8125, "rewards/chosen": 1.275390625, "rewards/margins": 6.5546875, "rewards/rejected": -5.28125, "step": 4584 }, { "epoch": 0.8663611885303982, "grad_norm": 1.8617884092859311, "learning_rate": 1.4831526910707382e-07, "logits/chosen": 2.826171875, "logits/rejected": 2.75390625, "logps/chosen": -1222.0, "logps/rejected": -1148.0, "loss": 0.4398, "rewards/accuracies": 0.84375, "rewards/chosen": 1.265625, "rewards/margins": 6.73046875, "rewards/rejected": -5.4609375, "step": 4585 }, { "epoch": 0.8665501440786055, "grad_norm": 2.2804499298727094, "learning_rate": 1.4818155809214915e-07, "logits/chosen": 3.70703125, "logits/rejected": 3.201171875, "logps/chosen": -780.0, "logps/rejected": -754.0, "loss": 0.5739, "rewards/accuracies": 0.71875, "rewards/chosen": 0.388671875, "rewards/margins": 5.46875, "rewards/rejected": -5.0859375, "step": 4586 }, { "epoch": 0.8667390996268128, "grad_norm": 2.536695775938421, "learning_rate": 1.480480218880937e-07, "logits/chosen": 2.484375, "logits/rejected": 2.197265625, "logps/chosen": -814.0, "logps/rejected": -1229.0, "loss": 0.6465, "rewards/accuracies": 0.71875, "rewards/chosen": 0.972900390625, "rewards/margins": 5.84375, "rewards/rejected": -4.87109375, "step": 4587 }, { "epoch": 0.86692805517502, "grad_norm": 2.7387004110148587, "learning_rate": 1.4791466055300208e-07, "logits/chosen": 2.630859375, "logits/rejected": 2.3037109375, "logps/chosen": -996.0, "logps/rejected": -908.0, "loss": 0.5703, "rewards/accuracies": 0.84375, "rewards/chosen": 1.14453125, "rewards/margins": 5.546875, "rewards/rejected": -4.41015625, "step": 4588 }, { "epoch": 0.8671170107232273, "grad_norm": 2.0202642532687234, "learning_rate": 1.4778147414489318e-07, "logits/chosen": 3.453125, "logits/rejected": 3.58984375, "logps/chosen": -972.5, "logps/rejected": -1651.0, "loss": 0.6581, "rewards/accuracies": 0.84375, "rewards/chosen": -0.66455078125, "rewards/margins": 7.613037109375, "rewards/rejected": -8.30078125, "step": 4589 }, { "epoch": 0.8673059662714346, "grad_norm": 2.679608226702204, "learning_rate": 1.4764846272170967e-07, "logits/chosen": 2.509765625, "logits/rejected": 2.2060546875, "logps/chosen": -992.5, "logps/rejected": -1106.0, "loss": 0.4818, "rewards/accuracies": 0.78125, "rewards/chosen": 1.939208984375, "rewards/margins": 7.59375, "rewards/rejected": -5.65625, "step": 4590 }, { "epoch": 0.8674949218196419, "grad_norm": 1.3710085461660888, "learning_rate": 1.4751562634131796e-07, "logits/chosen": 2.515625, "logits/rejected": 1.884765625, "logps/chosen": -550.0, "logps/rejected": -574.0, "loss": 0.6089, "rewards/accuracies": 0.78125, "rewards/chosen": -0.194976806640625, "rewards/margins": 4.3359375, "rewards/rejected": -4.541015625, "step": 4591 }, { "epoch": 0.8676838773678492, "grad_norm": 2.90409641941482, "learning_rate": 1.4738296506150867e-07, "logits/chosen": 3.05078125, "logits/rejected": 2.599609375, "logps/chosen": -647.5, "logps/rejected": -677.0, "loss": 0.6887, "rewards/accuracies": 0.75, "rewards/chosen": -0.24609375, "rewards/margins": 3.927734375, "rewards/rejected": -4.171875, "step": 4592 }, { "epoch": 0.8678728329160565, "grad_norm": 2.3732773256431807, "learning_rate": 1.472504789399957e-07, "logits/chosen": 2.591796875, "logits/rejected": 2.6650390625, "logps/chosen": -573.5, "logps/rejected": -708.0, "loss": 0.5614, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01080322265625, "rewards/margins": 4.546875, "rewards/rejected": -4.5625, "step": 4593 }, { "epoch": 0.8680617884642637, "grad_norm": 2.252773546950152, "learning_rate": 1.4711816803441734e-07, "logits/chosen": 2.594482421875, "logits/rejected": 2.71435546875, "logps/chosen": -930.5, "logps/rejected": -757.0, "loss": 0.5529, "rewards/accuracies": 0.90625, "rewards/chosen": 0.34930419921875, "rewards/margins": 7.734375, "rewards/rejected": -7.3671875, "step": 4594 }, { "epoch": 0.868250744012471, "grad_norm": 2.2828793672508882, "learning_rate": 1.4698603240233522e-07, "logits/chosen": 2.203125, "logits/rejected": 1.86328125, "logps/chosen": -937.0, "logps/rejected": -916.0, "loss": 0.5734, "rewards/accuracies": 0.84375, "rewards/chosen": 1.294189453125, "rewards/margins": 5.69140625, "rewards/rejected": -4.38671875, "step": 4595 }, { "epoch": 0.8684396995606783, "grad_norm": 1.7997190343914546, "learning_rate": 1.4685407210123489e-07, "logits/chosen": 3.39453125, "logits/rejected": 2.9765625, "logps/chosen": -994.5, "logps/rejected": -1078.0, "loss": 0.5755, "rewards/accuracies": 0.78125, "rewards/chosen": 1.625732421875, "rewards/margins": 5.640625, "rewards/rejected": -4.02734375, "step": 4596 }, { "epoch": 0.8686286551088857, "grad_norm": 3.364078442548574, "learning_rate": 1.4672228718852575e-07, "logits/chosen": 2.20703125, "logits/rejected": 1.857421875, "logps/chosen": -894.0, "logps/rejected": -1110.0, "loss": 0.4418, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4736328125, "rewards/margins": 9.6328125, "rewards/rejected": -9.1796875, "step": 4597 }, { "epoch": 0.868817610657093, "grad_norm": 1.232758231878803, "learning_rate": 1.465906777215404e-07, "logits/chosen": 2.396484375, "logits/rejected": 1.33880615234375, "logps/chosen": -629.5, "logps/rejected": -711.0, "loss": 0.5861, "rewards/accuracies": 0.71875, "rewards/chosen": 0.47265625, "rewards/margins": 4.671875, "rewards/rejected": -4.203125, "step": 4598 }, { "epoch": 0.8690065662053003, "grad_norm": 3.1099921823770633, "learning_rate": 1.4645924375753577e-07, "logits/chosen": 3.22265625, "logits/rejected": 2.89453125, "logps/chosen": -1058.0, "logps/rejected": -1152.0, "loss": 0.5898, "rewards/accuracies": 0.84375, "rewards/chosen": 0.49267578125, "rewards/margins": 6.3984375, "rewards/rejected": -5.8984375, "step": 4599 }, { "epoch": 0.8691955217535074, "grad_norm": 2.3290205249848097, "learning_rate": 1.4632798535369204e-07, "logits/chosen": 2.78515625, "logits/rejected": 1.552734375, "logps/chosen": -961.0, "logps/rejected": -757.0, "loss": 0.4227, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1611328125, "rewards/margins": 6.640625, "rewards/rejected": -6.484375, "step": 4600 }, { "epoch": 0.8693844773017148, "grad_norm": 2.34583430833943, "learning_rate": 1.4619690256711292e-07, "logits/chosen": 2.9296875, "logits/rejected": 2.55078125, "logps/chosen": -990.0, "logps/rejected": -756.5, "loss": 0.5672, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5419921875, "rewards/margins": 5.4921875, "rewards/rejected": -3.94921875, "step": 4601 }, { "epoch": 0.8695734328499221, "grad_norm": 1.8282079179128896, "learning_rate": 1.4606599545482635e-07, "logits/chosen": 2.212890625, "logits/rejected": 2.376953125, "logps/chosen": -928.0, "logps/rejected": -1288.0, "loss": 0.5276, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1845703125, "rewards/margins": 6.27734375, "rewards/rejected": -5.0859375, "step": 4602 }, { "epoch": 0.8697623883981294, "grad_norm": 2.016681554116284, "learning_rate": 1.459352640737828e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.041015625, "logps/chosen": -949.0, "logps/rejected": -1166.0, "loss": 0.5771, "rewards/accuracies": 0.75, "rewards/chosen": 0.649871826171875, "rewards/margins": 5.33984375, "rewards/rejected": -4.6796875, "step": 4603 }, { "epoch": 0.8699513439463367, "grad_norm": 5.118378881047146, "learning_rate": 1.4580470848085735e-07, "logits/chosen": 2.23681640625, "logits/rejected": 2.1328125, "logps/chosen": -872.0, "logps/rejected": -868.5, "loss": 0.7248, "rewards/accuracies": 0.78125, "rewards/chosen": 0.28125, "rewards/margins": 3.671875, "rewards/rejected": -3.38671875, "step": 4604 }, { "epoch": 0.8701402994945439, "grad_norm": 1.9696463099132693, "learning_rate": 1.4567432873284797e-07, "logits/chosen": 3.2109375, "logits/rejected": 2.92578125, "logps/chosen": -743.0, "logps/rejected": -1036.0, "loss": 0.5634, "rewards/accuracies": 0.8125, "rewards/chosen": 0.58984375, "rewards/margins": 9.87109375, "rewards/rejected": -9.26953125, "step": 4605 }, { "epoch": 0.8703292550427512, "grad_norm": 2.327244632296069, "learning_rate": 1.4554412488647622e-07, "logits/chosen": 3.01953125, "logits/rejected": 3.0703125, "logps/chosen": -962.0, "logps/rejected": -1282.0, "loss": 0.5393, "rewards/accuracies": 0.75, "rewards/chosen": 1.11865234375, "rewards/margins": 10.484375, "rewards/rejected": -9.3515625, "step": 4606 }, { "epoch": 0.8705182105909585, "grad_norm": 3.1995667274641306, "learning_rate": 1.4541409699838719e-07, "logits/chosen": 3.08984375, "logits/rejected": 2.8203125, "logps/chosen": -776.5, "logps/rejected": -1132.0, "loss": 0.6009, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9970703125, "rewards/margins": 5.87109375, "rewards/rejected": -4.8671875, "step": 4607 }, { "epoch": 0.8707071661391658, "grad_norm": 2.331377610639887, "learning_rate": 1.4528424512514958e-07, "logits/chosen": 3.126953125, "logits/rejected": 2.775390625, "logps/chosen": -965.0, "logps/rejected": -1153.5, "loss": 0.4729, "rewards/accuracies": 0.90625, "rewards/chosen": 1.45941162109375, "rewards/margins": 5.359375, "rewards/rejected": -3.91015625, "step": 4608 }, { "epoch": 0.8708961216873731, "grad_norm": 2.4349860338127955, "learning_rate": 1.4515456932325528e-07, "logits/chosen": 2.84375, "logits/rejected": 2.15234375, "logps/chosen": -628.0, "logps/rejected": -625.0, "loss": 0.6067, "rewards/accuracies": 0.78125, "rewards/chosen": 1.177734375, "rewards/margins": 4.171875, "rewards/rejected": -2.99609375, "step": 4609 }, { "epoch": 0.8710850772355804, "grad_norm": 3.6982780055339495, "learning_rate": 1.4502506964911973e-07, "logits/chosen": 2.37109375, "logits/rejected": 2.490234375, "logps/chosen": -691.5, "logps/rejected": -1042.5, "loss": 0.6234, "rewards/accuracies": 0.75, "rewards/chosen": 0.2646484375, "rewards/margins": 5.0234375, "rewards/rejected": -4.75390625, "step": 4610 }, { "epoch": 0.8712740327837876, "grad_norm": 1.8573097243811545, "learning_rate": 1.448957461590816e-07, "logits/chosen": 2.666015625, "logits/rejected": 2.685546875, "logps/chosen": -907.0, "logps/rejected": -799.0, "loss": 0.5619, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2158203125, "rewards/margins": 4.84375, "rewards/rejected": -3.62890625, "step": 4611 }, { "epoch": 0.8714629883319949, "grad_norm": 1.7775865294852242, "learning_rate": 1.44766598909403e-07, "logits/chosen": 2.61328125, "logits/rejected": 2.37109375, "logps/chosen": -725.0, "logps/rejected": -700.5, "loss": 0.5976, "rewards/accuracies": 0.84375, "rewards/chosen": 1.50390625, "rewards/margins": 4.359375, "rewards/rejected": -2.859375, "step": 4612 }, { "epoch": 0.8716519438802022, "grad_norm": 1.4951607694883808, "learning_rate": 1.4463762795626954e-07, "logits/chosen": 2.3515625, "logits/rejected": 2.126953125, "logps/chosen": -765.5, "logps/rejected": -1644.0, "loss": 0.3917, "rewards/accuracies": 0.875, "rewards/chosen": 1.0921630859375, "rewards/margins": 7.912109375, "rewards/rejected": -6.83984375, "step": 4613 }, { "epoch": 0.8718408994284095, "grad_norm": 4.542682064476002, "learning_rate": 1.445088333557896e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.385986328125, "logps/chosen": -848.0, "logps/rejected": -1574.0, "loss": 0.4317, "rewards/accuracies": 0.96875, "rewards/chosen": 1.5927734375, "rewards/margins": 6.9453125, "rewards/rejected": -5.34375, "step": 4614 }, { "epoch": 0.8720298549766168, "grad_norm": 3.2277194125284034, "learning_rate": 1.4438021516399554e-07, "logits/chosen": 2.404296875, "logits/rejected": 2.2109375, "logps/chosen": -1003.5, "logps/rejected": -746.0, "loss": 0.6509, "rewards/accuracies": 0.75, "rewards/chosen": 0.093994140625, "rewards/margins": 3.845703125, "rewards/rejected": -3.759765625, "step": 4615 }, { "epoch": 0.8722188105248241, "grad_norm": 4.840333968656178, "learning_rate": 1.4425177343684246e-07, "logits/chosen": 3.53515625, "logits/rejected": 2.625, "logps/chosen": -930.5, "logps/rejected": -862.0, "loss": 0.6227, "rewards/accuracies": 0.65625, "rewards/chosen": 0.612548828125, "rewards/margins": 4.6953125, "rewards/rejected": -4.08203125, "step": 4616 }, { "epoch": 0.8724077660730313, "grad_norm": 2.8362437658799107, "learning_rate": 1.441235082302088e-07, "logits/chosen": 1.91796875, "logits/rejected": 1.873046875, "logps/chosen": -1052.0, "logps/rejected": -2467.0, "loss": 0.5545, "rewards/accuracies": 0.8125, "rewards/chosen": 0.71929931640625, "rewards/margins": 7.50390625, "rewards/rejected": -6.787109375, "step": 4617 }, { "epoch": 0.8725967216212386, "grad_norm": 3.9428352590610514, "learning_rate": 1.4399541959989652e-07, "logits/chosen": 2.89453125, "logits/rejected": 2.259765625, "logps/chosen": -784.0, "logps/rejected": -1082.0, "loss": 0.5244, "rewards/accuracies": 0.78125, "rewards/chosen": 0.880615234375, "rewards/margins": 6.625, "rewards/rejected": -5.7421875, "step": 4618 }, { "epoch": 0.8727856771694459, "grad_norm": 2.810295073596575, "learning_rate": 1.4386750760163012e-07, "logits/chosen": 2.845703125, "logits/rejected": 2.556640625, "logps/chosen": -1084.0, "logps/rejected": -1149.0, "loss": 0.5574, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4169921875, "rewards/margins": 5.859375, "rewards/rejected": -4.451171875, "step": 4619 }, { "epoch": 0.8729746327176532, "grad_norm": 1.7326050721461344, "learning_rate": 1.43739772291058e-07, "logits/chosen": 3.44921875, "logits/rejected": 2.68359375, "logps/chosen": -1171.0, "logps/rejected": -1792.0, "loss": 0.4618, "rewards/accuracies": 0.78125, "rewards/chosen": 1.7735595703125, "rewards/margins": 10.515625, "rewards/rejected": -8.765625, "step": 4620 }, { "epoch": 0.8731635882658605, "grad_norm": 2.4029309549041535, "learning_rate": 1.4361221372375115e-07, "logits/chosen": 2.96484375, "logits/rejected": 2.6953125, "logps/chosen": -801.0, "logps/rejected": -579.0, "loss": 0.5351, "rewards/accuracies": 0.8125, "rewards/chosen": 0.76904296875, "rewards/margins": 4.8515625, "rewards/rejected": -4.0859375, "step": 4621 }, { "epoch": 0.8733525438140678, "grad_norm": 2.414797601900073, "learning_rate": 1.4348483195520373e-07, "logits/chosen": 2.654296875, "logits/rejected": 1.9560546875, "logps/chosen": -1025.0, "logps/rejected": -804.0, "loss": 0.5941, "rewards/accuracies": 0.78125, "rewards/chosen": 1.23583984375, "rewards/margins": 5.1015625, "rewards/rejected": -3.853515625, "step": 4622 }, { "epoch": 0.873541499362275, "grad_norm": 2.109490618449091, "learning_rate": 1.4335762704083348e-07, "logits/chosen": 2.6484375, "logits/rejected": 1.936279296875, "logps/chosen": -951.0, "logps/rejected": -967.0, "loss": 0.3989, "rewards/accuracies": 0.90625, "rewards/chosen": 0.953125, "rewards/margins": 7.203125, "rewards/rejected": -6.2421875, "step": 4623 }, { "epoch": 0.8737304549104823, "grad_norm": 2.25363241686608, "learning_rate": 1.4323059903598037e-07, "logits/chosen": 2.107421875, "logits/rejected": 1.263671875, "logps/chosen": -828.5, "logps/rejected": -887.5, "loss": 0.4728, "rewards/accuracies": 0.875, "rewards/chosen": 0.9912109375, "rewards/margins": 5.796875, "rewards/rejected": -4.8125, "step": 4624 }, { "epoch": 0.8739194104586896, "grad_norm": 3.161401618358115, "learning_rate": 1.4310374799590816e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.81640625, "logps/chosen": -615.5, "logps/rejected": -831.0, "loss": 0.6867, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12841796875, "rewards/margins": 3.9443359375, "rewards/rejected": -3.80859375, "step": 4625 }, { "epoch": 0.8741083660068969, "grad_norm": 1.8076653146246109, "learning_rate": 1.429770739758032e-07, "logits/chosen": 3.125, "logits/rejected": 2.26953125, "logps/chosen": -969.0, "logps/rejected": -2077.0, "loss": 0.4936, "rewards/accuracies": 0.8125, "rewards/chosen": 0.916015625, "rewards/margins": 10.6796875, "rewards/rejected": -9.7578125, "step": 4626 }, { "epoch": 0.8742973215551042, "grad_norm": 2.239518571816042, "learning_rate": 1.4285057703077502e-07, "logits/chosen": 2.455078125, "logits/rejected": 2.390625, "logps/chosen": -604.5, "logps/rejected": -910.0, "loss": 0.5345, "rewards/accuracies": 0.75, "rewards/chosen": 1.0712890625, "rewards/margins": 5.3046875, "rewards/rejected": -4.23828125, "step": 4627 }, { "epoch": 0.8744862771033114, "grad_norm": 4.30505934206995, "learning_rate": 1.4272425721585592e-07, "logits/chosen": 3.234375, "logits/rejected": 2.65625, "logps/chosen": -672.0, "logps/rejected": -688.0, "loss": 0.6166, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8790283203125, "rewards/margins": 4.744140625, "rewards/rejected": -3.8525390625, "step": 4628 }, { "epoch": 0.8746752326515187, "grad_norm": 2.8582084810811628, "learning_rate": 1.4259811458600133e-07, "logits/chosen": 3.474609375, "logits/rejected": 2.8974609375, "logps/chosen": -909.0, "logps/rejected": -812.0, "loss": 0.6565, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0576171875, "rewards/margins": 4.47265625, "rewards/rejected": -3.416015625, "step": 4629 }, { "epoch": 0.874864188199726, "grad_norm": 2.775227326259188, "learning_rate": 1.4247214919608955e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.68359375, "logps/chosen": -857.0, "logps/rejected": -1322.0, "loss": 0.4995, "rewards/accuracies": 0.875, "rewards/chosen": 1.59375, "rewards/margins": 11.5859375, "rewards/rejected": -10.0, "step": 4630 }, { "epoch": 0.8750531437479333, "grad_norm": 3.293819837367901, "learning_rate": 1.4234636110092173e-07, "logits/chosen": 3.4296875, "logits/rejected": 2.94140625, "logps/chosen": -855.0, "logps/rejected": -937.0, "loss": 0.5673, "rewards/accuracies": 0.75, "rewards/chosen": 1.876953125, "rewards/margins": 4.859375, "rewards/rejected": -2.986328125, "step": 4631 }, { "epoch": 0.8752420992961406, "grad_norm": 1.7733660970873053, "learning_rate": 1.422207503552219e-07, "logits/chosen": 1.9189453125, "logits/rejected": 1.9345703125, "logps/chosen": -921.0, "logps/rejected": -738.5, "loss": 0.7002, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9801025390625, "rewards/margins": 4.11328125, "rewards/rejected": -3.1279296875, "step": 4632 }, { "epoch": 0.8754310548443479, "grad_norm": 1.6366037273992282, "learning_rate": 1.4209531701363695e-07, "logits/chosen": 3.181640625, "logits/rejected": 2.990234375, "logps/chosen": -525.5, "logps/rejected": -810.0, "loss": 0.5814, "rewards/accuracies": 0.875, "rewards/chosen": 0.3271484375, "rewards/margins": 5.734375, "rewards/rejected": -5.4140625, "step": 4633 }, { "epoch": 0.8756200103925551, "grad_norm": 2.1715668464192706, "learning_rate": 1.419700611307365e-07, "logits/chosen": 2.734375, "logits/rejected": 2.494140625, "logps/chosen": -787.0, "logps/rejected": -715.0, "loss": 0.6187, "rewards/accuracies": 0.78125, "rewards/chosen": 0.10009765625, "rewards/margins": 3.625, "rewards/rejected": -3.515625, "step": 4634 }, { "epoch": 0.8758089659407624, "grad_norm": 1.9428493158672193, "learning_rate": 1.4184498276101307e-07, "logits/chosen": 3.52734375, "logits/rejected": 2.880859375, "logps/chosen": -818.25, "logps/rejected": -13422.5, "loss": 0.5417, "rewards/accuracies": 0.875, "rewards/chosen": 1.4326171875, "rewards/margins": -80.56640625, "rewards/rejected": 82.234375, "step": 4635 }, { "epoch": 0.8759979214889697, "grad_norm": 1.9759797565705102, "learning_rate": 1.4172008195888206e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.541015625, "logps/chosen": -631.5, "logps/rejected": -619.5, "loss": 0.747, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3289794921875, "rewards/margins": 3.31640625, "rewards/rejected": -2.9765625, "step": 4636 }, { "epoch": 0.876186877037177, "grad_norm": 3.4611988745424114, "learning_rate": 1.415953587786814e-07, "logits/chosen": 3.23046875, "logits/rejected": 2.94140625, "logps/chosen": -843.0, "logps/rejected": -755.0, "loss": 0.5359, "rewards/accuracies": 0.84375, "rewards/chosen": 0.71234130859375, "rewards/margins": 4.890625, "rewards/rejected": -4.1875, "step": 4637 }, { "epoch": 0.8763758325853843, "grad_norm": 3.068727637924911, "learning_rate": 1.4147081327467181e-07, "logits/chosen": 2.6796875, "logits/rejected": 2.78125, "logps/chosen": -921.0, "logps/rejected": -959.0, "loss": 0.4819, "rewards/accuracies": 0.875, "rewards/chosen": 1.8623046875, "rewards/margins": 6.05078125, "rewards/rejected": -4.1953125, "step": 4638 }, { "epoch": 0.8765647881335916, "grad_norm": 1.5481966783438699, "learning_rate": 1.4134644550103674e-07, "logits/chosen": 3.34765625, "logits/rejected": 3.33203125, "logps/chosen": -545.0, "logps/rejected": -467.5, "loss": 0.6789, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1669921875, "rewards/margins": 3.1796875, "rewards/rejected": -2.0078125, "step": 4639 }, { "epoch": 0.8767537436817988, "grad_norm": 2.9585030833584343, "learning_rate": 1.4122225551188222e-07, "logits/chosen": 2.73828125, "logits/rejected": 2.314453125, "logps/chosen": -583.0, "logps/rejected": -571.5, "loss": 0.664, "rewards/accuracies": 0.75, "rewards/chosen": 0.40966796875, "rewards/margins": 3.509765625, "rewards/rejected": -3.10546875, "step": 4640 }, { "epoch": 0.8769426992300061, "grad_norm": 2.420506083975038, "learning_rate": 1.410982433612374e-07, "logits/chosen": 3.119140625, "logits/rejected": 2.521484375, "logps/chosen": -614.0, "logps/rejected": -569.5, "loss": 0.535, "rewards/accuracies": 0.90625, "rewards/chosen": 0.83203125, "rewards/margins": 4.99609375, "rewards/rejected": -4.171875, "step": 4641 }, { "epoch": 0.8771316547782134, "grad_norm": 2.32772821758831, "learning_rate": 1.4097440910305313e-07, "logits/chosen": 2.91015625, "logits/rejected": 2.90625, "logps/chosen": -648.0, "logps/rejected": -1777.5, "loss": 0.5643, "rewards/accuracies": 0.84375, "rewards/chosen": 0.818359375, "rewards/margins": 15.80078125, "rewards/rejected": -14.98046875, "step": 4642 }, { "epoch": 0.8773206103264207, "grad_norm": 1.6361073423357584, "learning_rate": 1.408507527912038e-07, "logits/chosen": 2.12890625, "logits/rejected": 2.0625, "logps/chosen": -698.0, "logps/rejected": -798.5, "loss": 0.6314, "rewards/accuracies": 0.71875, "rewards/chosen": 0.63134765625, "rewards/margins": 4.9609375, "rewards/rejected": -4.328125, "step": 4643 }, { "epoch": 0.877509565874628, "grad_norm": 3.7616544065269006, "learning_rate": 1.4072727447948614e-07, "logits/chosen": 3.171875, "logits/rejected": 2.580078125, "logps/chosen": -900.0, "logps/rejected": -811.5, "loss": 0.5623, "rewards/accuracies": 0.875, "rewards/chosen": 0.9541015625, "rewards/margins": 4.984375, "rewards/rejected": -4.0234375, "step": 4644 }, { "epoch": 0.8776985214228353, "grad_norm": 2.423854398967356, "learning_rate": 1.406039742216189e-07, "logits/chosen": 2.30078125, "logits/rejected": 1.447265625, "logps/chosen": -711.0, "logps/rejected": -670.5, "loss": 0.5472, "rewards/accuracies": 0.84375, "rewards/chosen": 0.72314453125, "rewards/margins": 5.1875, "rewards/rejected": -4.46484375, "step": 4645 }, { "epoch": 0.8778874769710425, "grad_norm": 3.5033567091020754, "learning_rate": 1.4048085207124407e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.2744140625, "logps/chosen": -866.5, "logps/rejected": -813.0, "loss": 0.5806, "rewards/accuracies": 0.84375, "rewards/chosen": 0.08380126953125, "rewards/margins": 5.44921875, "rewards/rejected": -5.375, "step": 4646 }, { "epoch": 0.8780764325192498, "grad_norm": 1.602672384554087, "learning_rate": 1.403579080819258e-07, "logits/chosen": 2.9443359375, "logits/rejected": 2.4755859375, "logps/chosen": -1059.0, "logps/rejected": -1797.0, "loss": 0.6193, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9296875, "rewards/margins": 6.109375, "rewards/rejected": -8.0546875, "step": 4647 }, { "epoch": 0.8782653880674571, "grad_norm": 1.9872217121070046, "learning_rate": 1.402351423071508e-07, "logits/chosen": 2.736328125, "logits/rejected": 3.095703125, "logps/chosen": -806.0, "logps/rejected": -1075.0, "loss": 0.4345, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5184326171875, "rewards/margins": 6.5859375, "rewards/rejected": -6.0625, "step": 4648 }, { "epoch": 0.8784543436156644, "grad_norm": 1.0828093479880045, "learning_rate": 1.4011255480032827e-07, "logits/chosen": 3.7890625, "logits/rejected": 3.24609375, "logps/chosen": -704.0, "logps/rejected": -614.5, "loss": 0.5507, "rewards/accuracies": 0.875, "rewards/chosen": 1.470703125, "rewards/margins": 5.890625, "rewards/rejected": -4.41796875, "step": 4649 }, { "epoch": 0.8786432991638717, "grad_norm": 1.564265793722945, "learning_rate": 1.3999014561478966e-07, "logits/chosen": 2.99609375, "logits/rejected": 2.63671875, "logps/chosen": -762.0, "logps/rejected": -649.0, "loss": 0.623, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09814453125, "rewards/margins": 4.025390625, "rewards/rejected": -3.9296875, "step": 4650 }, { "epoch": 0.8788322547120789, "grad_norm": 2.994380300678197, "learning_rate": 1.3986791480378932e-07, "logits/chosen": 3.44921875, "logits/rejected": 2.9296875, "logps/chosen": -630.0, "logps/rejected": -720.0, "loss": 0.5728, "rewards/accuracies": 0.8125, "rewards/chosen": 0.99365234375, "rewards/margins": 5.36328125, "rewards/rejected": -4.37109375, "step": 4651 }, { "epoch": 0.8790212102602862, "grad_norm": 3.2219265117244693, "learning_rate": 1.3974586242050346e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.86328125, "logps/chosen": -879.0, "logps/rejected": -812.0, "loss": 0.6831, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6142578125, "rewards/margins": 4.8828125, "rewards/rejected": -4.26171875, "step": 4652 }, { "epoch": 0.8792101658084935, "grad_norm": 2.0279747682060245, "learning_rate": 1.3962398851803091e-07, "logits/chosen": 3.74609375, "logits/rejected": 3.27734375, "logps/chosen": -834.0, "logps/rejected": -1210.0, "loss": 0.6217, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1337890625, "rewards/margins": 4.64453125, "rewards/rejected": -4.50390625, "step": 4653 }, { "epoch": 0.8793991213567008, "grad_norm": 3.3040794011114394, "learning_rate": 1.3950229314939295e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.5068359375, "logps/chosen": -896.0, "logps/rejected": -1017.0, "loss": 0.572, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9931640625, "rewards/margins": 5.537109375, "rewards/rejected": -4.53125, "step": 4654 }, { "epoch": 0.8795880769049081, "grad_norm": 2.6517081487247207, "learning_rate": 1.3938077636753295e-07, "logits/chosen": 2.228515625, "logits/rejected": 1.109375, "logps/chosen": -1052.0, "logps/rejected": -811.0, "loss": 0.5287, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3349609375, "rewards/margins": 5.21875, "rewards/rejected": -3.888671875, "step": 4655 }, { "epoch": 0.8797770324531154, "grad_norm": 2.2087607689426516, "learning_rate": 1.3925943822531674e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.134765625, "logps/chosen": -887.0, "logps/rejected": -794.0, "loss": 0.5251, "rewards/accuracies": 0.875, "rewards/chosen": 0.9013671875, "rewards/margins": 5.4765625, "rewards/rejected": -4.57421875, "step": 4656 }, { "epoch": 0.8799659880013226, "grad_norm": 3.1702875928856495, "learning_rate": 1.3913827877553253e-07, "logits/chosen": 2.591796875, "logits/rejected": 2.4970703125, "logps/chosen": -722.0, "logps/rejected": -770.0, "loss": 0.623, "rewards/accuracies": 0.8125, "rewards/chosen": 0.67724609375, "rewards/margins": 4.39453125, "rewards/rejected": -3.71484375, "step": 4657 }, { "epoch": 0.88015494354953, "grad_norm": 1.8195395588562397, "learning_rate": 1.3901729807089052e-07, "logits/chosen": 3.353515625, "logits/rejected": 2.8828125, "logps/chosen": -666.0, "logps/rejected": -938.0, "loss": 0.7167, "rewards/accuracies": 0.75, "rewards/chosen": -0.421142578125, "rewards/margins": 7.890625, "rewards/rejected": -8.33984375, "step": 4658 }, { "epoch": 0.8803438990977372, "grad_norm": 4.311926145443934, "learning_rate": 1.3889649616402338e-07, "logits/chosen": 2.4609375, "logits/rejected": 2.333984375, "logps/chosen": -724.0, "logps/rejected": -741.0, "loss": 0.6412, "rewards/accuracies": 0.71875, "rewards/chosen": 0.623779296875, "rewards/margins": 3.66796875, "rewards/rejected": -3.044921875, "step": 4659 }, { "epoch": 0.8805328546459446, "grad_norm": 3.629349673502007, "learning_rate": 1.3877587310748585e-07, "logits/chosen": 3.14453125, "logits/rejected": 2.84765625, "logps/chosen": -916.0, "logps/rejected": -1443.0, "loss": 0.5927, "rewards/accuracies": 0.84375, "rewards/chosen": 1.15234375, "rewards/margins": 4.48828125, "rewards/rejected": -3.333984375, "step": 4660 }, { "epoch": 0.8807218101941519, "grad_norm": 3.600141543561962, "learning_rate": 1.3865542895375487e-07, "logits/chosen": 1.908203125, "logits/rejected": 1.771484375, "logps/chosen": -938.0, "logps/rejected": -1170.0, "loss": 0.5542, "rewards/accuracies": 0.8125, "rewards/chosen": 0.859375, "rewards/margins": 6.53125, "rewards/rejected": -5.6796875, "step": 4661 }, { "epoch": 0.8809107657423592, "grad_norm": 3.831208067624774, "learning_rate": 1.3853516375522977e-07, "logits/chosen": 2.82421875, "logits/rejected": 2.73681640625, "logps/chosen": -444.0, "logps/rejected": -553.5, "loss": 0.5399, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7578125, "rewards/margins": 5.2109375, "rewards/rejected": -4.44921875, "step": 4662 }, { "epoch": 0.8810997212905664, "grad_norm": 5.088648217705422, "learning_rate": 1.3841507756423165e-07, "logits/chosen": 2.201171875, "logits/rejected": 2.0625, "logps/chosen": -942.0, "logps/rejected": -1167.5, "loss": 0.5319, "rewards/accuracies": 0.90625, "rewards/chosen": 0.123504638671875, "rewards/margins": 7.1171875, "rewards/rejected": -6.9921875, "step": 4663 }, { "epoch": 0.8812886768387737, "grad_norm": 2.1985246200061024, "learning_rate": 1.382951704330041e-07, "logits/chosen": 3.33203125, "logits/rejected": 3.51171875, "logps/chosen": -888.5, "logps/rejected": -1113.0, "loss": 0.6308, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2918701171875, "rewards/margins": 6.296875, "rewards/rejected": -6.0, "step": 4664 }, { "epoch": 0.881477632386981, "grad_norm": 2.5726380861873355, "learning_rate": 1.381754424137125e-07, "logits/chosen": 3.609375, "logits/rejected": 3.453125, "logps/chosen": -428.0, "logps/rejected": -675.0, "loss": 0.7268, "rewards/accuracies": 0.71875, "rewards/chosen": 0.38330078125, "rewards/margins": 4.07421875, "rewards/rejected": -3.6953125, "step": 4665 }, { "epoch": 0.8816665879351883, "grad_norm": 1.7381686619482422, "learning_rate": 1.3805589355844455e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.45458984375, "logps/chosen": -979.0, "logps/rejected": -1064.5, "loss": 0.5626, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7578125, "rewards/margins": 7.888671875, "rewards/rejected": -6.14306640625, "step": 4666 }, { "epoch": 0.8818555434833956, "grad_norm": 2.1369861266097283, "learning_rate": 1.3793652391921e-07, "logits/chosen": 3.7421875, "logits/rejected": 3.90625, "logps/chosen": -678.5, "logps/rejected": -883.0, "loss": 0.7278, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3994140625, "rewards/margins": 5.39599609375, "rewards/rejected": -6.80078125, "step": 4667 }, { "epoch": 0.8820444990316029, "grad_norm": 1.5991480907468707, "learning_rate": 1.3781733354794023e-07, "logits/chosen": 2.86328125, "logits/rejected": 2.44189453125, "logps/chosen": -555.0, "logps/rejected": -718.5, "loss": 0.5731, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4287109375, "rewards/margins": 5.13671875, "rewards/rejected": -4.69921875, "step": 4668 }, { "epoch": 0.8822334545798101, "grad_norm": 3.106661886085157, "learning_rate": 1.376983224964892e-07, "logits/chosen": 2.94921875, "logits/rejected": 3.0546875, "logps/chosen": -579.0, "logps/rejected": -1003.0, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": 0.97216796875, "rewards/margins": 4.650390625, "rewards/rejected": -3.669921875, "step": 4669 }, { "epoch": 0.8824224101280174, "grad_norm": 3.0001364289426773, "learning_rate": 1.3757949081663256e-07, "logits/chosen": 2.2099609375, "logits/rejected": 2.029296875, "logps/chosen": -937.0, "logps/rejected": -1193.0, "loss": 0.5485, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9383544921875, "rewards/margins": 6.26953125, "rewards/rejected": -5.31640625, "step": 4670 }, { "epoch": 0.8826113656762247, "grad_norm": 3.1290639441286117, "learning_rate": 1.374608385600679e-07, "logits/chosen": 3.109375, "logits/rejected": 2.529296875, "logps/chosen": -612.5, "logps/rejected": -765.0, "loss": 0.5604, "rewards/accuracies": 0.875, "rewards/chosen": 0.13916015625, "rewards/margins": 7.4921875, "rewards/rejected": -7.3359375, "step": 4671 }, { "epoch": 0.882800321224432, "grad_norm": 4.061341950734445, "learning_rate": 1.373423657784149e-07, "logits/chosen": 2.265625, "logits/rejected": 2.345703125, "logps/chosen": -466.5, "logps/rejected": -571.5, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": 0.123291015625, "rewards/margins": 3.62109375, "rewards/rejected": -3.49609375, "step": 4672 }, { "epoch": 0.8829892767726393, "grad_norm": 2.0639285849197453, "learning_rate": 1.3722407252321488e-07, "logits/chosen": 3.2890625, "logits/rejected": 2.92578125, "logps/chosen": -846.0, "logps/rejected": -1166.0, "loss": 0.4871, "rewards/accuracies": 0.84375, "rewards/chosen": 1.611328125, "rewards/margins": 8.1796875, "rewards/rejected": -6.58203125, "step": 4673 }, { "epoch": 0.8831782323208465, "grad_norm": 2.2738218445064664, "learning_rate": 1.3710595884593148e-07, "logits/chosen": 2.806640625, "logits/rejected": 2.763671875, "logps/chosen": -954.0, "logps/rejected": -786.5, "loss": 0.5917, "rewards/accuracies": 0.84375, "rewards/chosen": 0.942626953125, "rewards/margins": 4.49609375, "rewards/rejected": -3.5546875, "step": 4674 }, { "epoch": 0.8833671878690538, "grad_norm": 2.9269491385183826, "learning_rate": 1.3698802479794984e-07, "logits/chosen": 3.41796875, "logits/rejected": 3.1640625, "logps/chosen": -565.0, "logps/rejected": -793.0, "loss": 0.5148, "rewards/accuracies": 0.875, "rewards/chosen": 1.033203125, "rewards/margins": 6.0234375, "rewards/rejected": -4.984375, "step": 4675 }, { "epoch": 0.8835561434172611, "grad_norm": 2.446907600819277, "learning_rate": 1.3687027043057722e-07, "logits/chosen": 2.29296875, "logits/rejected": 2.13671875, "logps/chosen": -882.5, "logps/rejected": -976.0, "loss": 0.5731, "rewards/accuracies": 0.84375, "rewards/chosen": 0.4013671875, "rewards/margins": 4.5859375, "rewards/rejected": -4.18359375, "step": 4676 }, { "epoch": 0.8837450989654684, "grad_norm": 3.858562715825515, "learning_rate": 1.3675269579504245e-07, "logits/chosen": 3.515625, "logits/rejected": 3.52734375, "logps/chosen": -721.0, "logps/rejected": -2116.0, "loss": 0.7358, "rewards/accuracies": 0.71875, "rewards/chosen": -0.572998046875, "rewards/margins": 7.6484375, "rewards/rejected": -8.220703125, "step": 4677 }, { "epoch": 0.8839340545136757, "grad_norm": 2.1879119195506065, "learning_rate": 1.3663530094249627e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.80859375, "logps/chosen": -514.5, "logps/rejected": -636.0, "loss": 0.5544, "rewards/accuracies": 0.84375, "rewards/chosen": 0.28759765625, "rewards/margins": 4.55859375, "rewards/rejected": -4.2578125, "step": 4678 }, { "epoch": 0.884123010061883, "grad_norm": 3.654969602912065, "learning_rate": 1.3651808592401138e-07, "logits/chosen": 1.794921875, "logits/rejected": 1.80859375, "logps/chosen": -822.0, "logps/rejected": -1070.5, "loss": 0.5398, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6708984375, "rewards/margins": 7.0859375, "rewards/rejected": -6.41015625, "step": 4679 }, { "epoch": 0.8843119656100902, "grad_norm": 4.0353852150143386, "learning_rate": 1.3640105079058212e-07, "logits/chosen": 3.505859375, "logits/rejected": 3.291015625, "logps/chosen": -718.0, "logps/rejected": -875.5, "loss": 0.7162, "rewards/accuracies": 0.71875, "rewards/chosen": 0.288360595703125, "rewards/margins": 3.76171875, "rewards/rejected": -3.4677734375, "step": 4680 }, { "epoch": 0.8845009211582975, "grad_norm": 1.9224427200693528, "learning_rate": 1.3628419559312442e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.4375, "logps/chosen": -632.0, "logps/rejected": -820.0, "loss": 0.6051, "rewards/accuracies": 0.75, "rewards/chosen": -0.49365234375, "rewards/margins": 4.05078125, "rewards/rejected": -4.53125, "step": 4681 }, { "epoch": 0.8846898767065048, "grad_norm": 3.5910949392055818, "learning_rate": 1.361675203824761e-07, "logits/chosen": 2.68359375, "logits/rejected": 2.55078125, "logps/chosen": -922.5, "logps/rejected": -747.5, "loss": 0.6322, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9420166015625, "rewards/margins": 4.3125, "rewards/rejected": -3.37109375, "step": 4682 }, { "epoch": 0.8848788322547121, "grad_norm": 4.27119922646286, "learning_rate": 1.3605102520939673e-07, "logits/chosen": 3.1640625, "logits/rejected": 2.95703125, "logps/chosen": -1008.0, "logps/rejected": -962.5, "loss": 0.5944, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6337890625, "rewards/margins": 4.88671875, "rewards/rejected": -4.2578125, "step": 4683 }, { "epoch": 0.8850677878029194, "grad_norm": 3.144806989407308, "learning_rate": 1.3593471012456735e-07, "logits/chosen": 2.859375, "logits/rejected": 2.1494140625, "logps/chosen": -738.0, "logps/rejected": -1021.0, "loss": 0.5591, "rewards/accuracies": 0.75, "rewards/chosen": 0.4635009765625, "rewards/margins": 6.8515625, "rewards/rejected": -6.390625, "step": 4684 }, { "epoch": 0.8852567433511267, "grad_norm": 3.218753335835549, "learning_rate": 1.3581857517859086e-07, "logits/chosen": 3.00390625, "logits/rejected": 2.96875, "logps/chosen": -793.0, "logps/rejected": -679.0, "loss": 0.6442, "rewards/accuracies": 0.75, "rewards/chosen": -0.310546875, "rewards/margins": 3.865234375, "rewards/rejected": -4.1796875, "step": 4685 }, { "epoch": 0.8854456988993339, "grad_norm": 3.051185157445127, "learning_rate": 1.357026204219916e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.609375, "logps/chosen": -866.0, "logps/rejected": -798.0, "loss": 0.7119, "rewards/accuracies": 0.6875, "rewards/chosen": -3.305419921875, "rewards/margins": -0.05078125, "rewards/rejected": -3.2734375, "step": 4686 }, { "epoch": 0.8856346544475412, "grad_norm": 1.6993734913738139, "learning_rate": 1.3558684590521564e-07, "logits/chosen": 2.44140625, "logits/rejected": 2.71484375, "logps/chosen": -520.0, "logps/rejected": -730.0, "loss": 0.539, "rewards/accuracies": 0.8125, "rewards/chosen": 0.630859375, "rewards/margins": 5.734375, "rewards/rejected": -5.08203125, "step": 4687 }, { "epoch": 0.8858236099957485, "grad_norm": 2.775202203316871, "learning_rate": 1.354712516786308e-07, "logits/chosen": 2.427734375, "logits/rejected": 1.892578125, "logps/chosen": -952.0, "logps/rejected": -1152.0, "loss": 0.524, "rewards/accuracies": 0.8125, "rewards/chosen": 1.287109375, "rewards/margins": 5.92578125, "rewards/rejected": -4.642578125, "step": 4688 }, { "epoch": 0.8860125655439558, "grad_norm": 3.2940073349590313, "learning_rate": 1.3535583779252596e-07, "logits/chosen": 3.078125, "logits/rejected": 2.6640625, "logps/chosen": -572.0, "logps/rejected": -521.5, "loss": 0.6299, "rewards/accuracies": 0.84375, "rewards/chosen": 0.438232421875, "rewards/margins": 3.626953125, "rewards/rejected": -3.18359375, "step": 4689 }, { "epoch": 0.8862015210921631, "grad_norm": 2.469062750225022, "learning_rate": 1.3524060429711209e-07, "logits/chosen": 2.59033203125, "logits/rejected": 2.47265625, "logps/chosen": -716.0, "logps/rejected": -562.5, "loss": 0.4871, "rewards/accuracies": 0.9375, "rewards/chosen": 0.59033203125, "rewards/margins": 5.26953125, "rewards/rejected": -4.68359375, "step": 4690 }, { "epoch": 0.8863904766403704, "grad_norm": 2.0257124148847843, "learning_rate": 1.3512555124252142e-07, "logits/chosen": 2.66015625, "logits/rejected": 2.83984375, "logps/chosen": -659.0, "logps/rejected": -768.0, "loss": 0.6499, "rewards/accuracies": 0.84375, "rewards/chosen": 0.513671875, "rewards/margins": 3.869140625, "rewards/rejected": -3.349609375, "step": 4691 }, { "epoch": 0.8865794321885776, "grad_norm": 4.147678240351515, "learning_rate": 1.350106786788076e-07, "logits/chosen": 1.869140625, "logits/rejected": 2.1552734375, "logps/chosen": -656.0, "logps/rejected": -1124.0, "loss": 0.5408, "rewards/accuracies": 0.78125, "rewards/chosen": 0.77679443359375, "rewards/margins": 5.265625, "rewards/rejected": -4.4921875, "step": 4692 }, { "epoch": 0.8867683877367849, "grad_norm": 2.291223361208005, "learning_rate": 1.3489598665594604e-07, "logits/chosen": 3.33203125, "logits/rejected": 3.16015625, "logps/chosen": -661.5, "logps/rejected": -615.5, "loss": 0.7321, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1259765625, "rewards/margins": 3.4658203125, "rewards/rejected": -3.58984375, "step": 4693 }, { "epoch": 0.8869573432849922, "grad_norm": 1.8490159757860531, "learning_rate": 1.3478147522383324e-07, "logits/chosen": 3.44140625, "logits/rejected": 3.013671875, "logps/chosen": -843.0, "logps/rejected": -757.5, "loss": 0.6415, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8046875, "rewards/margins": 4.43359375, "rewards/rejected": -3.63671875, "step": 4694 }, { "epoch": 0.8871462988331995, "grad_norm": 5.7359453194208445, "learning_rate": 1.3466714443228746e-07, "logits/chosen": 2.890625, "logits/rejected": 2.953125, "logps/chosen": -682.5, "logps/rejected": -845.5, "loss": 0.6482, "rewards/accuracies": 0.875, "rewards/chosen": 0.428466796875, "rewards/margins": 4.25, "rewards/rejected": -3.82421875, "step": 4695 }, { "epoch": 0.8873352543814068, "grad_norm": 3.752187713614584, "learning_rate": 1.3455299433104825e-07, "logits/chosen": 3.1953125, "logits/rejected": 2.408203125, "logps/chosen": -1172.0, "logps/rejected": -1015.0, "loss": 0.5558, "rewards/accuracies": 0.84375, "rewards/chosen": 0.749267578125, "rewards/margins": 5.291015625, "rewards/rejected": -4.54296875, "step": 4696 }, { "epoch": 0.887524209929614, "grad_norm": 5.026746069433398, "learning_rate": 1.344390249697765e-07, "logits/chosen": 3.2421875, "logits/rejected": 2.859375, "logps/chosen": -1005.0, "logps/rejected": -960.0, "loss": 0.5033, "rewards/accuracies": 0.875, "rewards/chosen": 1.037109375, "rewards/margins": 5.7109375, "rewards/rejected": -4.66796875, "step": 4697 }, { "epoch": 0.8877131654778213, "grad_norm": 2.562764185645776, "learning_rate": 1.343252363980545e-07, "logits/chosen": 2.658203125, "logits/rejected": 3.0, "logps/chosen": -783.0, "logps/rejected": -834.0, "loss": 0.6165, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3338623046875, "rewards/margins": 4.75390625, "rewards/rejected": -4.42578125, "step": 4698 }, { "epoch": 0.8879021210260286, "grad_norm": 2.911281471256076, "learning_rate": 1.3421162866538584e-07, "logits/chosen": 2.48046875, "logits/rejected": 2.484375, "logps/chosen": -449.5, "logps/rejected": -668.0, "loss": 0.7456, "rewards/accuracies": 0.6875, "rewards/chosen": -0.212646484375, "rewards/margins": 3.1064453125, "rewards/rejected": -3.318359375, "step": 4699 }, { "epoch": 0.8880910765742359, "grad_norm": 2.1191890077262516, "learning_rate": 1.3409820182119567e-07, "logits/chosen": 3.58984375, "logits/rejected": 3.44921875, "logps/chosen": -910.0, "logps/rejected": -841.5, "loss": 0.7049, "rewards/accuracies": 0.8125, "rewards/chosen": 1.09521484375, "rewards/margins": 3.333984375, "rewards/rejected": -2.228118896484375, "step": 4700 }, { "epoch": 0.8882800321224432, "grad_norm": 2.764487457148902, "learning_rate": 1.3398495591483011e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.599609375, "logps/chosen": -933.0, "logps/rejected": -901.0, "loss": 0.6569, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4888916015625, "rewards/margins": 3.923828125, "rewards/rejected": -3.435546875, "step": 4701 }, { "epoch": 0.8884689876706505, "grad_norm": 3.523283509660111, "learning_rate": 1.338718909955568e-07, "logits/chosen": 2.75, "logits/rejected": 2.0234375, "logps/chosen": -5962.0, "logps/rejected": -752.5, "loss": 0.5369, "rewards/accuracies": 0.875, "rewards/chosen": -25.59765625, "rewards/margins": -20.0390625, "rewards/rejected": -5.5, "step": 4702 }, { "epoch": 0.8886579432188577, "grad_norm": 2.080596025068459, "learning_rate": 1.3375900711256456e-07, "logits/chosen": 3.17578125, "logits/rejected": 3.63671875, "logps/chosen": -531.0, "logps/rejected": -1221.0, "loss": 0.7411, "rewards/accuracies": 0.75, "rewards/chosen": 1.02783203125, "rewards/margins": 6.609375, "rewards/rejected": -5.58203125, "step": 4703 }, { "epoch": 0.888846898767065, "grad_norm": 2.2981629262956225, "learning_rate": 1.3364630431496348e-07, "logits/chosen": 2.42578125, "logits/rejected": 2.234375, "logps/chosen": -942.0, "logps/rejected": -1006.0, "loss": 0.6348, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1376953125, "rewards/margins": 4.9296875, "rewards/rejected": -3.794921875, "step": 4704 }, { "epoch": 0.8890358543152723, "grad_norm": 3.347454815344686, "learning_rate": 1.335337826517847e-07, "logits/chosen": 2.83203125, "logits/rejected": 2.80078125, "logps/chosen": -560.0, "logps/rejected": -727.0, "loss": 0.7216, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8759765625, "rewards/margins": 3.4375, "rewards/rejected": -4.3125, "step": 4705 }, { "epoch": 0.8892248098634796, "grad_norm": 2.5344688952704435, "learning_rate": 1.3342144217198086e-07, "logits/chosen": 2.33984375, "logits/rejected": 3.03515625, "logps/chosen": -461.0, "logps/rejected": -620.5, "loss": 0.7131, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2734375, "rewards/margins": 3.10400390625, "rewards/rejected": -3.36962890625, "step": 4706 }, { "epoch": 0.8894137654116869, "grad_norm": 3.2134034797151956, "learning_rate": 1.333092829244256e-07, "logits/chosen": 3.177734375, "logits/rejected": 2.236328125, "logps/chosen": -446.5, "logps/rejected": -596.0, "loss": 0.547, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2724609375, "rewards/margins": 6.66796875, "rewards/rejected": -6.390625, "step": 4707 }, { "epoch": 0.8896027209598942, "grad_norm": 2.0620930507825674, "learning_rate": 1.3319730495791363e-07, "logits/chosen": 3.015625, "logits/rejected": 2.564453125, "logps/chosen": -883.0, "logps/rejected": -870.0, "loss": 0.4788, "rewards/accuracies": 0.75, "rewards/chosen": 1.3101806640625, "rewards/margins": 5.2890625, "rewards/rejected": -3.9765625, "step": 4708 }, { "epoch": 0.8897916765081014, "grad_norm": 1.9577946413352503, "learning_rate": 1.33085508321161e-07, "logits/chosen": 2.595703125, "logits/rejected": 1.97265625, "logps/chosen": -827.5, "logps/rejected": -1510.0, "loss": 0.5493, "rewards/accuracies": 0.84375, "rewards/chosen": 1.111328125, "rewards/margins": 5.5, "rewards/rejected": -4.3818359375, "step": 4709 }, { "epoch": 0.8899806320563087, "grad_norm": 1.3313532808180297, "learning_rate": 1.3297389306280465e-07, "logits/chosen": 3.16796875, "logits/rejected": 2.4130859375, "logps/chosen": -2062.0, "logps/rejected": -1439.0, "loss": 0.5079, "rewards/accuracies": 0.84375, "rewards/chosen": -0.79638671875, "rewards/margins": 6.0546875, "rewards/rejected": -6.87109375, "step": 4710 }, { "epoch": 0.890169587604516, "grad_norm": 2.2161670751270606, "learning_rate": 1.3286245923140296e-07, "logits/chosen": 2.787109375, "logits/rejected": 2.8828125, "logps/chosen": -720.0, "logps/rejected": -1031.0, "loss": 0.5745, "rewards/accuracies": 0.8125, "rewards/chosen": -0.76416015625, "rewards/margins": 6.95703125, "rewards/rejected": -7.7265625, "step": 4711 }, { "epoch": 0.8903585431527233, "grad_norm": 2.1126940534508742, "learning_rate": 1.3275120687543486e-07, "logits/chosen": 2.26171875, "logits/rejected": 2.21875, "logps/chosen": -904.0, "logps/rejected": -2019.0, "loss": 0.4634, "rewards/accuracies": 0.8125, "rewards/chosen": 1.568359375, "rewards/margins": 9.0078125, "rewards/rejected": -7.421875, "step": 4712 }, { "epoch": 0.8905474987009306, "grad_norm": 4.976677966937132, "learning_rate": 1.3264013604330082e-07, "logits/chosen": 2.6669921875, "logits/rejected": 2.6171875, "logps/chosen": -609.75, "logps/rejected": -934.75, "loss": 0.674, "rewards/accuracies": 0.71875, "rewards/chosen": 0.40478515625, "rewards/margins": 6.072265625, "rewards/rejected": -5.666015625, "step": 4713 }, { "epoch": 0.8907364542491379, "grad_norm": 2.9774757615518546, "learning_rate": 1.3252924678332212e-07, "logits/chosen": 3.37109375, "logits/rejected": 3.5625, "logps/chosen": -715.5, "logps/rejected": -601.0, "loss": 0.6555, "rewards/accuracies": 0.71875, "rewards/chosen": 0.80224609375, "rewards/margins": 4.12109375, "rewards/rejected": -3.326171875, "step": 4714 }, { "epoch": 0.8909254097973451, "grad_norm": 2.122953309009996, "learning_rate": 1.324185391437409e-07, "logits/chosen": 3.80859375, "logits/rejected": 3.546875, "logps/chosen": -743.5, "logps/rejected": -587.5, "loss": 0.6003, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3017578125, "rewards/margins": 4.609375, "rewards/rejected": -4.30859375, "step": 4715 }, { "epoch": 0.8911143653455524, "grad_norm": 2.8311157597508205, "learning_rate": 1.3230801317272075e-07, "logits/chosen": 3.453125, "logits/rejected": 3.080078125, "logps/chosen": -719.0, "logps/rejected": -606.5, "loss": 0.6319, "rewards/accuracies": 0.78125, "rewards/chosen": 0.626220703125, "rewards/margins": 4.42578125, "rewards/rejected": -3.80322265625, "step": 4716 }, { "epoch": 0.8913033208937597, "grad_norm": 2.10567868778794, "learning_rate": 1.3219766891834554e-07, "logits/chosen": 2.841796875, "logits/rejected": 3.103515625, "logps/chosen": -800.25, "logps/rejected": -1925.0, "loss": 0.5932, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9765625, "rewards/margins": 8.3125, "rewards/rejected": -7.32421875, "step": 4717 }, { "epoch": 0.891492276441967, "grad_norm": 2.340227802611602, "learning_rate": 1.3208750642862082e-07, "logits/chosen": 3.095703125, "logits/rejected": 2.49853515625, "logps/chosen": -646.5, "logps/rejected": -1371.0, "loss": 0.5475, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9578857421875, "rewards/margins": 10.8828125, "rewards/rejected": -9.93359375, "step": 4718 }, { "epoch": 0.8916812319901743, "grad_norm": 2.8080004367083613, "learning_rate": 1.319775257514726e-07, "logits/chosen": 3.54296875, "logits/rejected": 3.27734375, "logps/chosen": -445.0, "logps/rejected": -497.25, "loss": 0.6121, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3304443359375, "rewards/margins": 3.796875, "rewards/rejected": -4.12890625, "step": 4719 }, { "epoch": 0.8918701875383815, "grad_norm": 2.332056713048347, "learning_rate": 1.318677269347478e-07, "logits/chosen": 1.7705078125, "logits/rejected": 1.40478515625, "logps/chosen": -757.0, "logps/rejected": -788.0, "loss": 0.5587, "rewards/accuracies": 0.8125, "rewards/chosen": 1.31640625, "rewards/margins": 4.92578125, "rewards/rejected": -3.603515625, "step": 4720 }, { "epoch": 0.8920591430865888, "grad_norm": 3.3491206314121635, "learning_rate": 1.3175811002621446e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.0244140625, "logps/chosen": -734.0, "logps/rejected": -829.0, "loss": 0.5377, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3427734375, "rewards/margins": 5.74609375, "rewards/rejected": -4.39453125, "step": 4721 }, { "epoch": 0.8922480986347961, "grad_norm": 3.2034769325336576, "learning_rate": 1.3164867507356132e-07, "logits/chosen": 3.11328125, "logits/rejected": 2.82421875, "logps/chosen": -694.0, "logps/rejected": -1393.0, "loss": 0.5473, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9140625, "rewards/margins": 5.2578125, "rewards/rejected": -4.34375, "step": 4722 }, { "epoch": 0.8924370541830035, "grad_norm": 2.545496466980221, "learning_rate": 1.3153942212439793e-07, "logits/chosen": 2.994140625, "logits/rejected": 2.873046875, "logps/chosen": -525.5, "logps/rejected": -572.5, "loss": 0.6759, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2421875, "rewards/margins": 3.12890625, "rewards/rejected": -3.37109375, "step": 4723 }, { "epoch": 0.8926260097312108, "grad_norm": 2.0456839029275855, "learning_rate": 1.314303512262548e-07, "logits/chosen": 3.46875, "logits/rejected": 3.02734375, "logps/chosen": -881.0, "logps/rejected": -832.5, "loss": 0.5192, "rewards/accuracies": 0.875, "rewards/chosen": 1.2265625, "rewards/margins": 5.34375, "rewards/rejected": -4.1171875, "step": 4724 }, { "epoch": 0.8928149652794181, "grad_norm": 3.2591115783640605, "learning_rate": 1.313214624265831e-07, "logits/chosen": 2.8203125, "logits/rejected": 2.82421875, "logps/chosen": -434.5, "logps/rejected": -849.0, "loss": 0.5637, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0234375, "rewards/margins": 6.0390625, "rewards/rejected": -6.01171875, "step": 4725 }, { "epoch": 0.8930039208276253, "grad_norm": 2.1601405929622826, "learning_rate": 1.3121275577275476e-07, "logits/chosen": 3.3515625, "logits/rejected": 3.505859375, "logps/chosen": -632.0, "logps/rejected": -14555.0, "loss": 0.5771, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0390625, "rewards/margins": -160.4140625, "rewards/rejected": 160.92578125, "step": 4726 }, { "epoch": 0.8931928763758326, "grad_norm": 3.999880106385728, "learning_rate": 1.3110423131206276e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -821.0, "logps/rejected": -2707.0, "loss": 0.5921, "rewards/accuracies": 0.90625, "rewards/chosen": 0.125, "rewards/margins": 12.2734375, "rewards/rejected": -12.140625, "step": 4727 }, { "epoch": 0.8933818319240399, "grad_norm": 3.13325893950642, "learning_rate": 1.3099588909172047e-07, "logits/chosen": 2.94140625, "logits/rejected": 2.033203125, "logps/chosen": -469.5, "logps/rejected": -858.0, "loss": 0.5132, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6640625, "rewards/margins": 6.6953125, "rewards/rejected": -6.02734375, "step": 4728 }, { "epoch": 0.8935707874722472, "grad_norm": 1.8950951858788119, "learning_rate": 1.3088772915886208e-07, "logits/chosen": 2.86328125, "logits/rejected": 2.505859375, "logps/chosen": -1019.0, "logps/rejected": -1167.0, "loss": 0.5047, "rewards/accuracies": 0.78125, "rewards/chosen": 1.115234375, "rewards/margins": 5.7265625, "rewards/rejected": -4.611328125, "step": 4729 }, { "epoch": 0.8937597430204545, "grad_norm": 1.9081868475216364, "learning_rate": 1.3077975156054262e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.74609375, "logps/chosen": -635.0, "logps/rejected": -821.5, "loss": 0.6233, "rewards/accuracies": 0.78125, "rewards/chosen": 0.513671875, "rewards/margins": 4.3828125, "rewards/rejected": -3.86328125, "step": 4730 }, { "epoch": 0.8939486985686618, "grad_norm": 1.9445121665245537, "learning_rate": 1.3067195634373745e-07, "logits/chosen": 2.216796875, "logits/rejected": 1.703125, "logps/chosen": -523.5, "logps/rejected": -529.5, "loss": 0.6038, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0260009765625, "rewards/margins": 4.35546875, "rewards/rejected": -4.37890625, "step": 4731 }, { "epoch": 0.894137654116869, "grad_norm": 5.046827733282564, "learning_rate": 1.3056434355534315e-07, "logits/chosen": 2.560546875, "logits/rejected": 2.083984375, "logps/chosen": -682.0, "logps/rejected": -764.0, "loss": 0.575, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4736328125, "rewards/margins": 4.5078125, "rewards/rejected": -4.03125, "step": 4732 }, { "epoch": 0.8943266096650763, "grad_norm": 8.572438979620218, "learning_rate": 1.3045691324217627e-07, "logits/chosen": 2.796875, "logits/rejected": 2.8359375, "logps/chosen": -852.0, "logps/rejected": -935.0, "loss": 0.5571, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5830078125, "rewards/margins": 5.1796875, "rewards/rejected": -4.6015625, "step": 4733 }, { "epoch": 0.8945155652132836, "grad_norm": 2.239758421870965, "learning_rate": 1.3034966545097445e-07, "logits/chosen": 3.4375, "logits/rejected": 3.05859375, "logps/chosen": -723.0, "logps/rejected": -739.0, "loss": 0.5732, "rewards/accuracies": 0.75, "rewards/chosen": 1.3759765625, "rewards/margins": 5.19921875, "rewards/rejected": -3.810546875, "step": 4734 }, { "epoch": 0.8947045207614909, "grad_norm": 2.622220446268294, "learning_rate": 1.302426002283958e-07, "logits/chosen": 3.5390625, "logits/rejected": 2.890625, "logps/chosen": -792.0, "logps/rejected": -1621.0, "loss": 0.5911, "rewards/accuracies": 0.8125, "rewards/chosen": 0.50634765625, "rewards/margins": 7.453125, "rewards/rejected": -6.9453125, "step": 4735 }, { "epoch": 0.8948934763096982, "grad_norm": 4.776910743677649, "learning_rate": 1.3013571762101888e-07, "logits/chosen": 3.0078125, "logits/rejected": 2.5859375, "logps/chosen": -926.0, "logps/rejected": -944.0, "loss": 0.4279, "rewards/accuracies": 0.90625, "rewards/chosen": 1.040283203125, "rewards/margins": 6.01953125, "rewards/rejected": -4.984375, "step": 4736 }, { "epoch": 0.8950824318579055, "grad_norm": 2.7261691528586174, "learning_rate": 1.3002901767534317e-07, "logits/chosen": 3.09765625, "logits/rejected": 2.416015625, "logps/chosen": -743.0, "logps/rejected": -647.0, "loss": 0.6248, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30322265625, "rewards/margins": 4.46484375, "rewards/rejected": -4.7734375, "step": 4737 }, { "epoch": 0.8952713874061127, "grad_norm": 4.671766757686652, "learning_rate": 1.29922500437788e-07, "logits/chosen": 3.3125, "logits/rejected": 3.83984375, "logps/chosen": -610.5, "logps/rejected": -1553.0, "loss": 0.5936, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0224609375, "rewards/margins": 6.73828125, "rewards/rejected": -6.71484375, "step": 4738 }, { "epoch": 0.89546034295432, "grad_norm": 2.7006722147138804, "learning_rate": 1.2981616595469398e-07, "logits/chosen": 3.2578125, "logits/rejected": 2.8359375, "logps/chosen": -729.5, "logps/rejected": -745.0, "loss": 0.605, "rewards/accuracies": 0.75, "rewards/chosen": -0.0914306640625, "rewards/margins": 5.1796875, "rewards/rejected": -5.26953125, "step": 4739 }, { "epoch": 0.8956492985025273, "grad_norm": 1.6799484784569099, "learning_rate": 1.2971001427232166e-07, "logits/chosen": 3.3828125, "logits/rejected": 3.498046875, "logps/chosen": -768.5, "logps/rejected": -1857.0, "loss": 0.578, "rewards/accuracies": 0.78125, "rewards/chosen": 1.2606201171875, "rewards/margins": 9.7421875, "rewards/rejected": -8.46484375, "step": 4740 }, { "epoch": 0.8958382540507346, "grad_norm": 2.7967276584706102, "learning_rate": 1.2960404543685232e-07, "logits/chosen": 3.96875, "logits/rejected": 3.267578125, "logps/chosen": -676.5, "logps/rejected": -598.0, "loss": 0.6552, "rewards/accuracies": 0.6875, "rewards/chosen": -2.212890625, "rewards/margins": 2.05859375, "rewards/rejected": -4.2734375, "step": 4741 }, { "epoch": 0.8960272095989419, "grad_norm": 5.641687754421674, "learning_rate": 1.2949825949438777e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.802734375, "logps/chosen": -824.0, "logps/rejected": -865.0, "loss": 0.6085, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2841796875, "rewards/margins": 4.953125, "rewards/rejected": -5.234375, "step": 4742 }, { "epoch": 0.8962161651471491, "grad_norm": 3.4048977206963635, "learning_rate": 1.293926564909499e-07, "logits/chosen": 1.86083984375, "logits/rejected": 1.720703125, "logps/chosen": -592.0, "logps/rejected": -767.0, "loss": 0.5002, "rewards/accuracies": 0.78125, "rewards/chosen": 0.354736328125, "rewards/margins": 5.53515625, "rewards/rejected": -5.1796875, "step": 4743 }, { "epoch": 0.8964051206953564, "grad_norm": 3.105543673165002, "learning_rate": 1.2928723647248135e-07, "logits/chosen": 2.4658203125, "logits/rejected": 2.33203125, "logps/chosen": -735.0, "logps/rejected": -861.0, "loss": 0.5817, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0068359375, "rewards/margins": 5.70703125, "rewards/rejected": -5.6953125, "step": 4744 }, { "epoch": 0.8965940762435637, "grad_norm": 3.550073399759978, "learning_rate": 1.2918199948484507e-07, "logits/chosen": 2.80859375, "logits/rejected": 2.9296875, "logps/chosen": -614.0, "logps/rejected": -713.0, "loss": 0.6161, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1748046875, "rewards/margins": 4.2734375, "rewards/rejected": -4.09375, "step": 4745 }, { "epoch": 0.896783031791771, "grad_norm": 2.137832552357234, "learning_rate": 1.290769455738243e-07, "logits/chosen": 3.29296875, "logits/rejected": 3.765625, "logps/chosen": -978.0, "logps/rejected": -1206.0, "loss": 0.7124, "rewards/accuracies": 0.6875, "rewards/chosen": 0.224609375, "rewards/margins": 5.4205322265625, "rewards/rejected": -5.2109375, "step": 4746 }, { "epoch": 0.8969719873399783, "grad_norm": 3.352582899906579, "learning_rate": 1.2897207478512265e-07, "logits/chosen": 2.724609375, "logits/rejected": 2.373046875, "logps/chosen": -641.0, "logps/rejected": -636.5, "loss": 0.4823, "rewards/accuracies": 0.90625, "rewards/chosen": 0.6806640625, "rewards/margins": 5.62890625, "rewards/rejected": -4.94140625, "step": 4747 }, { "epoch": 0.8971609428881856, "grad_norm": 2.8155189558748663, "learning_rate": 1.2886738716436406e-07, "logits/chosen": 2.1949462890625, "logits/rejected": 1.6708984375, "logps/chosen": -487.0, "logps/rejected": -908.0, "loss": 0.6536, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02508544921875, "rewards/margins": 4.125, "rewards/rejected": -4.1015625, "step": 4748 }, { "epoch": 0.8973498984363928, "grad_norm": 2.5273745715268388, "learning_rate": 1.287628827570929e-07, "logits/chosen": 3.068359375, "logits/rejected": 2.80078125, "logps/chosen": -692.5, "logps/rejected": -700.0, "loss": 0.668, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0263671875, "rewards/margins": 3.68359375, "rewards/rejected": -3.66015625, "step": 4749 }, { "epoch": 0.8975388539846001, "grad_norm": 3.3899513345701484, "learning_rate": 1.2865856160877379e-07, "logits/chosen": 2.6796875, "logits/rejected": 2.3125, "logps/chosen": -774.5, "logps/rejected": -1436.0, "loss": 0.4851, "rewards/accuracies": 0.84375, "rewards/chosen": 0.673828125, "rewards/margins": 9.8984375, "rewards/rejected": -9.234375, "step": 4750 }, { "epoch": 0.8977278095328074, "grad_norm": 3.870285394839719, "learning_rate": 1.2855442376479147e-07, "logits/chosen": 3.91015625, "logits/rejected": 3.5390625, "logps/chosen": -687.0, "logps/rejected": -870.0, "loss": 0.6043, "rewards/accuracies": 0.8125, "rewards/chosen": -0.065673828125, "rewards/margins": 4.671875, "rewards/rejected": -4.73828125, "step": 4751 }, { "epoch": 0.8979167650810147, "grad_norm": 3.285954161421931, "learning_rate": 1.2845046927045113e-07, "logits/chosen": 2.498046875, "logits/rejected": 2.349609375, "logps/chosen": -847.5, "logps/rejected": -904.0, "loss": 0.6133, "rewards/accuracies": 0.78125, "rewards/chosen": 0.291015625, "rewards/margins": 4.8046875, "rewards/rejected": -4.51953125, "step": 4752 }, { "epoch": 0.898105720629222, "grad_norm": 2.2378348719946604, "learning_rate": 1.2834669817097807e-07, "logits/chosen": 2.765625, "logits/rejected": 2.94140625, "logps/chosen": -727.5, "logps/rejected": -766.0, "loss": 0.6731, "rewards/accuracies": 0.65625, "rewards/chosen": 0.33837890625, "rewards/margins": 3.236328125, "rewards/rejected": -2.900390625, "step": 4753 }, { "epoch": 0.8982946761774293, "grad_norm": 2.648578170760166, "learning_rate": 1.2824311051151776e-07, "logits/chosen": 2.25, "logits/rejected": 2.37109375, "logps/chosen": -895.0, "logps/rejected": -835.0, "loss": 0.6176, "rewards/accuracies": 0.84375, "rewards/chosen": -0.58544921875, "rewards/margins": 4.625, "rewards/rejected": -5.2109375, "step": 4754 }, { "epoch": 0.8984836317256365, "grad_norm": 4.50021061480257, "learning_rate": 1.281397063371361e-07, "logits/chosen": 2.662109375, "logits/rejected": 2.228515625, "logps/chosen": -656.0, "logps/rejected": -567.5, "loss": 0.7521, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3408203125, "rewards/margins": 2.532958984375, "rewards/rejected": -2.875, "step": 4755 }, { "epoch": 0.8986725872738438, "grad_norm": 2.6861897008236975, "learning_rate": 1.2803648569281901e-07, "logits/chosen": 2.5703125, "logits/rejected": 2.6015625, "logps/chosen": -1194.0, "logps/rejected": -1039.0, "loss": 0.562, "rewards/accuracies": 0.78125, "rewards/chosen": 0.84765625, "rewards/margins": 6.4375, "rewards/rejected": -5.59375, "step": 4756 }, { "epoch": 0.8988615428220511, "grad_norm": 2.437801197108662, "learning_rate": 1.2793344862347241e-07, "logits/chosen": 2.9453125, "logits/rejected": 2.890625, "logps/chosen": -1154.5, "logps/rejected": -1740.0, "loss": 0.5278, "rewards/accuracies": 0.8125, "rewards/chosen": 1.314453125, "rewards/margins": 10.171875, "rewards/rejected": -8.84375, "step": 4757 }, { "epoch": 0.8990504983702584, "grad_norm": 3.5643013514538544, "learning_rate": 1.2783059517392272e-07, "logits/chosen": 2.875, "logits/rejected": 2.3466796875, "logps/chosen": -997.0, "logps/rejected": -998.0, "loss": 0.4907, "rewards/accuracies": 0.84375, "rewards/chosen": 0.83251953125, "rewards/margins": 5.953125, "rewards/rejected": -5.1171875, "step": 4758 }, { "epoch": 0.8992394539184657, "grad_norm": 1.2931383390264195, "learning_rate": 1.2772792538891603e-07, "logits/chosen": 2.796875, "logits/rejected": 2.525390625, "logps/chosen": -963.5, "logps/rejected": -1047.0, "loss": 0.5229, "rewards/accuracies": 0.78125, "rewards/chosen": 1.51611328125, "rewards/margins": 7.52734375, "rewards/rejected": -6.015625, "step": 4759 }, { "epoch": 0.899428409466673, "grad_norm": 2.5035580171335052, "learning_rate": 1.27625439313119e-07, "logits/chosen": 2.2265625, "logits/rejected": 2.83984375, "logps/chosen": -523.0, "logps/rejected": -908.5, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": 0.431640625, "rewards/margins": 5.6875, "rewards/rejected": -5.25390625, "step": 4760 }, { "epoch": 0.8996173650148802, "grad_norm": 2.5461861977120073, "learning_rate": 1.27523136991118e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.39453125, "logps/chosen": -1021.5, "logps/rejected": -776.0, "loss": 0.4595, "rewards/accuracies": 0.78125, "rewards/chosen": 1.32421875, "rewards/margins": 5.4375, "rewards/rejected": -4.11328125, "step": 4761 }, { "epoch": 0.8998063205630875, "grad_norm": 3.8719978413366176, "learning_rate": 1.2742101846741957e-07, "logits/chosen": 2.759765625, "logits/rejected": 2.720703125, "logps/chosen": -487.5, "logps/rejected": -513.0, "loss": 0.7322, "rewards/accuracies": 0.65625, "rewards/chosen": 0.646484375, "rewards/margins": 2.49169921875, "rewards/rejected": -1.84716796875, "step": 4762 }, { "epoch": 0.8999952761112948, "grad_norm": 2.1569895287253282, "learning_rate": 1.2731908378645054e-07, "logits/chosen": 2.703125, "logits/rejected": 2.177734375, "logps/chosen": -683.5, "logps/rejected": -692.0, "loss": 0.7278, "rewards/accuracies": 0.6875, "rewards/chosen": 0.378662109375, "rewards/margins": 4.025390625, "rewards/rejected": -3.64794921875, "step": 4763 }, { "epoch": 0.9001842316595021, "grad_norm": 3.932895691045859, "learning_rate": 1.2721733299255718e-07, "logits/chosen": 3.013671875, "logits/rejected": 2.3671875, "logps/chosen": -598.5, "logps/rejected": -493.0, "loss": 0.7245, "rewards/accuracies": 0.75, "rewards/chosen": -0.3369140625, "rewards/margins": 1.986328125, "rewards/rejected": -2.3125, "step": 4764 }, { "epoch": 0.9003731872077094, "grad_norm": 2.74518577665905, "learning_rate": 1.2711576613000636e-07, "logits/chosen": 2.501953125, "logits/rejected": 1.867767333984375, "logps/chosen": -1142.0, "logps/rejected": -950.0, "loss": 0.4208, "rewards/accuracies": 0.90625, "rewards/chosen": 0.95587158203125, "rewards/margins": 6.2890625, "rewards/rejected": -5.3359375, "step": 4765 }, { "epoch": 0.9005621427559166, "grad_norm": 2.8694708221391494, "learning_rate": 1.2701438324298465e-07, "logits/chosen": 2.1171875, "logits/rejected": 1.8369140625, "logps/chosen": -640.5, "logps/rejected": -15607.0, "loss": 0.4958, "rewards/accuracies": 0.875, "rewards/chosen": 1.1767578125, "rewards/margins": 23.375, "rewards/rejected": -22.1953125, "step": 4766 }, { "epoch": 0.9007510983041239, "grad_norm": 1.5462161073659626, "learning_rate": 1.2691318437559852e-07, "logits/chosen": 2.44482421875, "logits/rejected": 1.9761962890625, "logps/chosen": -922.0, "logps/rejected": -860.0, "loss": 0.4946, "rewards/accuracies": 0.875, "rewards/chosen": 1.3349609375, "rewards/margins": 6.1171875, "rewards/rejected": -4.79296875, "step": 4767 }, { "epoch": 0.9009400538523312, "grad_norm": 2.7118018964321875, "learning_rate": 1.2681216957187463e-07, "logits/chosen": 2.8671875, "logits/rejected": 2.2421875, "logps/chosen": -826.0, "logps/rejected": -1131.5, "loss": 0.5634, "rewards/accuracies": 0.75, "rewards/chosen": 0.9189453125, "rewards/margins": 5.7421875, "rewards/rejected": -4.81640625, "step": 4768 }, { "epoch": 0.9011290094005385, "grad_norm": 2.601360992917891, "learning_rate": 1.2671133887575918e-07, "logits/chosen": 2.572265625, "logits/rejected": 1.6796875, "logps/chosen": -841.5, "logps/rejected": -1159.5, "loss": 0.5788, "rewards/accuracies": 0.8125, "rewards/chosen": 0.265625, "rewards/margins": 6.56640625, "rewards/rejected": -6.29296875, "step": 4769 }, { "epoch": 0.9013179649487458, "grad_norm": 2.645543878689862, "learning_rate": 1.266106923311187e-07, "logits/chosen": 3.65625, "logits/rejected": 3.103515625, "logps/chosen": -775.0, "logps/rejected": -1751.0, "loss": 0.6111, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0068359375, "rewards/margins": 9.9453125, "rewards/rejected": -9.96875, "step": 4770 }, { "epoch": 0.9015069204969531, "grad_norm": 1.845809455751533, "learning_rate": 1.2651022998173935e-07, "logits/chosen": 3.060546875, "logits/rejected": 2.478515625, "logps/chosen": -609.5, "logps/rejected": -535.5, "loss": 0.6517, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5048828125, "rewards/margins": 4.69140625, "rewards/rejected": -4.18359375, "step": 4771 }, { "epoch": 0.9016958760451603, "grad_norm": 3.0698117586666727, "learning_rate": 1.2640995187132721e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.47265625, "logps/chosen": -1145.0, "logps/rejected": -967.0, "loss": 0.4981, "rewards/accuracies": 0.8125, "rewards/chosen": 0.70458984375, "rewards/margins": 5.6875, "rewards/rejected": -4.984375, "step": 4772 }, { "epoch": 0.9018848315933676, "grad_norm": 2.034285438936615, "learning_rate": 1.2630985804350818e-07, "logits/chosen": 2.708984375, "logits/rejected": 2.23046875, "logps/chosen": -801.0, "logps/rejected": -923.0, "loss": 0.7048, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3515625, "rewards/margins": 3.8974609375, "rewards/rejected": -3.5390625, "step": 4773 }, { "epoch": 0.9020737871415749, "grad_norm": 3.47132964343278, "learning_rate": 1.2620994854182812e-07, "logits/chosen": 2.38671875, "logits/rejected": 1.9560546875, "logps/chosen": -700.0, "logps/rejected": -795.0, "loss": 0.5145, "rewards/accuracies": 0.84375, "rewards/chosen": 0.521484375, "rewards/margins": 5.7109375, "rewards/rejected": -5.19140625, "step": 4774 }, { "epoch": 0.9022627426897822, "grad_norm": 1.3188456640491353, "learning_rate": 1.261102234097524e-07, "logits/chosen": 2.6708984375, "logits/rejected": 2.0712890625, "logps/chosen": -689.5, "logps/rejected": -781.5, "loss": 0.4562, "rewards/accuracies": 0.90625, "rewards/chosen": 1.06640625, "rewards/margins": 5.921875, "rewards/rejected": -4.84375, "step": 4775 }, { "epoch": 0.9024516982379895, "grad_norm": 2.200524568391816, "learning_rate": 1.2601068269066658e-07, "logits/chosen": 3.05078125, "logits/rejected": 2.578125, "logps/chosen": -836.5, "logps/rejected": -774.0, "loss": 0.5253, "rewards/accuracies": 0.84375, "rewards/chosen": 0.73779296875, "rewards/margins": 5.09765625, "rewards/rejected": -4.359375, "step": 4776 }, { "epoch": 0.9026406537861968, "grad_norm": 1.4401985432463185, "learning_rate": 1.2591132642787572e-07, "logits/chosen": 2.179931640625, "logits/rejected": 2.24591064453125, "logps/chosen": -563.0, "logps/rejected": -1860.5, "loss": 0.5896, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2303466796875, "rewards/margins": 9.53125, "rewards/rejected": -9.3046875, "step": 4777 }, { "epoch": 0.902829609334404, "grad_norm": 3.4803054554431636, "learning_rate": 1.258121546646047e-07, "logits/chosen": 3.58203125, "logits/rejected": 3.1796875, "logps/chosen": -984.0, "logps/rejected": -764.0, "loss": 0.5583, "rewards/accuracies": 0.90625, "rewards/chosen": 0.44384765625, "rewards/margins": 4.1796875, "rewards/rejected": -3.73046875, "step": 4778 }, { "epoch": 0.9030185648826113, "grad_norm": 4.015298939149794, "learning_rate": 1.2571316744399816e-07, "logits/chosen": 1.884765625, "logits/rejected": 1.6953125, "logps/chosen": -634.75, "logps/rejected": -984.0, "loss": 0.5824, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13818359375, "rewards/margins": 5.56640625, "rewards/rejected": -5.4296875, "step": 4779 }, { "epoch": 0.9032075204308186, "grad_norm": 2.22845597627876, "learning_rate": 1.256143648091203e-07, "logits/chosen": 3.765625, "logits/rejected": 3.16796875, "logps/chosen": -636.5, "logps/rejected": -533.5, "loss": 0.6816, "rewards/accuracies": 0.9375, "rewards/chosen": 0.034423828125, "rewards/margins": 4.029296875, "rewards/rejected": -3.990234375, "step": 4780 }, { "epoch": 0.903396475979026, "grad_norm": 1.4535233771455611, "learning_rate": 1.2551574680295526e-07, "logits/chosen": 3.27734375, "logits/rejected": 2.634765625, "logps/chosen": -737.0, "logps/rejected": -1117.0, "loss": 0.5439, "rewards/accuracies": 0.875, "rewards/chosen": 0.75537109375, "rewards/margins": 5.9453125, "rewards/rejected": -5.1875, "step": 4781 }, { "epoch": 0.9035854315272333, "grad_norm": 2.9504574469358316, "learning_rate": 1.2541731346840683e-07, "logits/chosen": 2.75390625, "logits/rejected": 2.548828125, "logps/chosen": -535.0, "logps/rejected": -1124.0, "loss": 0.6633, "rewards/accuracies": 0.8125, "rewards/chosen": 0.244842529296875, "rewards/margins": 8.015625, "rewards/rejected": -7.7578125, "step": 4782 }, { "epoch": 0.9037743870754406, "grad_norm": 2.8098821201144233, "learning_rate": 1.2531906484829818e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.54296875, "logps/chosen": -743.0, "logps/rejected": -603.0, "loss": 0.5893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0087890625, "rewards/margins": 3.697265625, "rewards/rejected": -3.703125, "step": 4783 }, { "epoch": 0.9039633426236477, "grad_norm": 2.9047184845077245, "learning_rate": 1.252210009853725e-07, "logits/chosen": 3.4765625, "logits/rejected": 3.3515625, "logps/chosen": -784.0, "logps/rejected": -1144.0, "loss": 0.6743, "rewards/accuracies": 0.75, "rewards/chosen": 0.4482421875, "rewards/margins": 3.4169921875, "rewards/rejected": -2.96875, "step": 4784 }, { "epoch": 0.904152298171855, "grad_norm": 1.7236327542694394, "learning_rate": 1.2512312192229219e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.8828125, "logps/chosen": -634.5, "logps/rejected": -707.0, "loss": 0.6625, "rewards/accuracies": 0.78125, "rewards/chosen": -0.027099609375, "rewards/margins": 3.626953125, "rewards/rejected": -3.6484375, "step": 4785 }, { "epoch": 0.9043412537200624, "grad_norm": 2.869443960449034, "learning_rate": 1.2502542770163973e-07, "logits/chosen": 3.66796875, "logits/rejected": 2.7373046875, "logps/chosen": -837.5, "logps/rejected": -818.0, "loss": 0.7297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0250244140625, "rewards/margins": 3.1220703125, "rewards/rejected": -3.14453125, "step": 4786 }, { "epoch": 0.9045302092682697, "grad_norm": 2.9973119020582684, "learning_rate": 1.2492791836591668e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.8984375, "logps/chosen": -971.5, "logps/rejected": -1004.5, "loss": 0.489, "rewards/accuracies": 0.84375, "rewards/chosen": 1.43603515625, "rewards/margins": 6.75390625, "rewards/rejected": -5.3125, "step": 4787 }, { "epoch": 0.904719164816477, "grad_norm": 1.8408536773922481, "learning_rate": 1.2483059395754464e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.728515625, "logps/chosen": -745.5, "logps/rejected": -621.0, "loss": 0.5205, "rewards/accuracies": 0.6875, "rewards/chosen": 0.60333251953125, "rewards/margins": 4.765625, "rewards/rejected": -4.158203125, "step": 4788 }, { "epoch": 0.9049081203646842, "grad_norm": 2.5720371586499273, "learning_rate": 1.2473345451886445e-07, "logits/chosen": 2.931640625, "logits/rejected": 2.4970703125, "logps/chosen": -1009.0, "logps/rejected": -2470.0, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": 1.08837890625, "rewards/margins": 10.7890625, "rewards/rejected": -9.6953125, "step": 4789 }, { "epoch": 0.9050970759128915, "grad_norm": 2.0428421817605353, "learning_rate": 1.2463650009213648e-07, "logits/chosen": 2.564453125, "logits/rejected": 2.1875, "logps/chosen": -875.0, "logps/rejected": -834.0, "loss": 0.4386, "rewards/accuracies": 0.90625, "rewards/chosen": 0.7626953125, "rewards/margins": 6.8125, "rewards/rejected": -6.05078125, "step": 4790 }, { "epoch": 0.9052860314610988, "grad_norm": 1.9313277492423289, "learning_rate": 1.2453973071954086e-07, "logits/chosen": 2.1162109375, "logits/rejected": 1.3232421875, "logps/chosen": -788.0, "logps/rejected": -735.5, "loss": 0.432, "rewards/accuracies": 0.90625, "rewards/chosen": 1.46484375, "rewards/margins": 6.296875, "rewards/rejected": -4.83203125, "step": 4791 }, { "epoch": 0.9054749870093061, "grad_norm": 1.3272844243242847, "learning_rate": 1.244431464431769e-07, "logits/chosen": 2.6796875, "logits/rejected": 1.93359375, "logps/chosen": -649.0, "logps/rejected": -455.25, "loss": 0.6264, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1790771484375, "rewards/margins": 3.626953125, "rewards/rejected": -3.451171875, "step": 4792 }, { "epoch": 0.9056639425575134, "grad_norm": 3.3441023802662886, "learning_rate": 1.2434674730506355e-07, "logits/chosen": 2.28515625, "logits/rejected": 1.478515625, "logps/chosen": -911.0, "logps/rejected": -860.0, "loss": 0.606, "rewards/accuracies": 0.6875, "rewards/chosen": -0.081787109375, "rewards/margins": 3.37890625, "rewards/rejected": -3.45703125, "step": 4793 }, { "epoch": 0.9058528981057207, "grad_norm": 2.4920786928602077, "learning_rate": 1.242505333471393e-07, "logits/chosen": 3.11328125, "logits/rejected": 3.55078125, "logps/chosen": -706.5, "logps/rejected": -830.0, "loss": 0.6244, "rewards/accuracies": 0.8125, "rewards/chosen": 0.193603515625, "rewards/margins": 4.28515625, "rewards/rejected": -4.08203125, "step": 4794 }, { "epoch": 0.9060418536539279, "grad_norm": 2.0315542674694944, "learning_rate": 1.2415450461126185e-07, "logits/chosen": 3.5625, "logits/rejected": 3.05078125, "logps/chosen": -567.0, "logps/rejected": -966.5, "loss": 0.5608, "rewards/accuracies": 0.8125, "rewards/chosen": 1.455078125, "rewards/margins": 6.25, "rewards/rejected": -4.80078125, "step": 4795 }, { "epoch": 0.9062308092021352, "grad_norm": 2.3226786684948233, "learning_rate": 1.2405866113920852e-07, "logits/chosen": 2.376953125, "logits/rejected": 2.30078125, "logps/chosen": -1063.0, "logps/rejected": -985.0, "loss": 0.4838, "rewards/accuracies": 0.90625, "rewards/chosen": 1.064453125, "rewards/margins": 6.015625, "rewards/rejected": -4.9453125, "step": 4796 }, { "epoch": 0.9064197647503425, "grad_norm": 3.0499810150274627, "learning_rate": 1.2396300297267596e-07, "logits/chosen": 3.2578125, "logits/rejected": 3.12890625, "logps/chosen": -640.0, "logps/rejected": -1303.0, "loss": 0.5502, "rewards/accuracies": 0.90625, "rewards/chosen": 0.985595703125, "rewards/margins": 7.4140625, "rewards/rejected": -6.4375, "step": 4797 }, { "epoch": 0.9066087202985498, "grad_norm": 2.4081626377007543, "learning_rate": 1.2386753015328016e-07, "logits/chosen": 3.27734375, "logits/rejected": 3.01171875, "logps/chosen": -644.0, "logps/rejected": -661.0, "loss": 0.6875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16015625, "rewards/margins": 3.279296875, "rewards/rejected": -3.115234375, "step": 4798 }, { "epoch": 0.9067976758467571, "grad_norm": 3.1497874426057497, "learning_rate": 1.2377224272255651e-07, "logits/chosen": 3.283203125, "logits/rejected": 2.7177734375, "logps/chosen": -764.5, "logps/rejected": -607.5, "loss": 0.5648, "rewards/accuracies": 0.875, "rewards/chosen": 0.523193359375, "rewards/margins": 4.62109375, "rewards/rejected": -4.095703125, "step": 4799 }, { "epoch": 0.9069866313949644, "grad_norm": 4.952790061257629, "learning_rate": 1.2367714072195978e-07, "logits/chosen": 2.58984375, "logits/rejected": 2.0283203125, "logps/chosen": -456.5, "logps/rejected": -589.0, "loss": 0.5986, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2333984375, "rewards/margins": 4.6240234375, "rewards/rejected": -4.39453125, "step": 4800 }, { "epoch": 0.9071755869431716, "grad_norm": 1.9195518856926657, "learning_rate": 1.2358222419286394e-07, "logits/chosen": 3.59765625, "logits/rejected": 3.078125, "logps/chosen": -707.5, "logps/rejected": -636.25, "loss": 0.5814, "rewards/accuracies": 0.84375, "rewards/chosen": 0.89013671875, "rewards/margins": 4.60546875, "rewards/rejected": -3.712890625, "step": 4801 }, { "epoch": 0.9073645424913789, "grad_norm": 3.3857360137714, "learning_rate": 1.2348749317656252e-07, "logits/chosen": 3.58984375, "logits/rejected": 3.59765625, "logps/chosen": -854.0, "logps/rejected": -1168.0, "loss": 0.6606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.126373291015625, "rewards/margins": 4.3359375, "rewards/rejected": -4.20703125, "step": 4802 }, { "epoch": 0.9075534980395862, "grad_norm": 2.3166952483680947, "learning_rate": 1.2339294771426812e-07, "logits/chosen": 2.66015625, "logits/rejected": 2.578125, "logps/chosen": -986.5, "logps/rejected": -800.0, "loss": 0.4762, "rewards/accuracies": 0.84375, "rewards/chosen": 1.7099609375, "rewards/margins": 5.328125, "rewards/rejected": -3.62109375, "step": 4803 }, { "epoch": 0.9077424535877935, "grad_norm": 3.1979944659661452, "learning_rate": 1.2329858784711268e-07, "logits/chosen": 3.33984375, "logits/rejected": 3.1875, "logps/chosen": -938.5, "logps/rejected": -1037.5, "loss": 0.6533, "rewards/accuracies": 0.75, "rewards/chosen": 1.0546875, "rewards/margins": 5.8486328125, "rewards/rejected": -4.80078125, "step": 4804 }, { "epoch": 0.9079314091360008, "grad_norm": 2.213936032770801, "learning_rate": 1.2320441361614747e-07, "logits/chosen": 2.5205078125, "logits/rejected": 2.3642578125, "logps/chosen": -618.0, "logps/rejected": -674.0, "loss": 0.6518, "rewards/accuracies": 0.75, "rewards/chosen": 0.1669921875, "rewards/margins": 3.54296875, "rewards/rejected": -3.375, "step": 4805 }, { "epoch": 0.9081203646842081, "grad_norm": 2.267608462022396, "learning_rate": 1.2311042506234274e-07, "logits/chosen": 3.421875, "logits/rejected": 3.59765625, "logps/chosen": -869.0, "logps/rejected": -952.0, "loss": 0.54, "rewards/accuracies": 0.8125, "rewards/chosen": 1.376953125, "rewards/margins": 5.921875, "rewards/rejected": -4.54296875, "step": 4806 }, { "epoch": 0.9083093202324153, "grad_norm": 5.03744031451435, "learning_rate": 1.230166222265884e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.37255859375, "logps/chosen": -771.0, "logps/rejected": -776.0, "loss": 0.5103, "rewards/accuracies": 0.84375, "rewards/chosen": 1.063720703125, "rewards/margins": 5.171875, "rewards/rejected": -4.109375, "step": 4807 }, { "epoch": 0.9084982757806226, "grad_norm": 2.672416478093698, "learning_rate": 1.2292300514969313e-07, "logits/chosen": 2.146484375, "logits/rejected": 2.123046875, "logps/chosen": -694.0, "logps/rejected": -1195.0, "loss": 0.4656, "rewards/accuracies": 0.84375, "rewards/chosen": 1.05126953125, "rewards/margins": 6.5703125, "rewards/rejected": -5.5, "step": 4808 }, { "epoch": 0.9086872313288299, "grad_norm": 2.4200487655933327, "learning_rate": 1.2282957387238505e-07, "logits/chosen": 2.9736328125, "logits/rejected": 2.5126953125, "logps/chosen": -1010.0, "logps/rejected": -1524.0, "loss": 0.6314, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2001953125, "rewards/margins": 5.517578125, "rewards/rejected": -5.30859375, "step": 4809 }, { "epoch": 0.9088761868770372, "grad_norm": 3.0938347715705423, "learning_rate": 1.227363284353114e-07, "logits/chosen": 3.109375, "logits/rejected": 2.85546875, "logps/chosen": -770.0, "logps/rejected": -2396.0, "loss": 0.4748, "rewards/accuracies": 0.875, "rewards/chosen": 0.92041015625, "rewards/margins": 12.46875, "rewards/rejected": -11.546875, "step": 4810 }, { "epoch": 0.9090651424252445, "grad_norm": 1.9447126809840887, "learning_rate": 1.226432688790385e-07, "logits/chosen": 1.81640625, "logits/rejected": 1.171875, "logps/chosen": -943.0, "logps/rejected": -1030.0, "loss": 0.4939, "rewards/accuracies": 0.875, "rewards/chosen": 0.7626953125, "rewards/margins": 6.92578125, "rewards/rejected": -6.1796875, "step": 4811 }, { "epoch": 0.9092540979734517, "grad_norm": 2.5042950500724013, "learning_rate": 1.2255039524405198e-07, "logits/chosen": 3.0625, "logits/rejected": 2.71484375, "logps/chosen": -936.0, "logps/rejected": -1032.0, "loss": 0.6252, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0419921875, "rewards/margins": 4.53125, "rewards/rejected": -3.48046875, "step": 4812 }, { "epoch": 0.909443053521659, "grad_norm": 3.094832275435928, "learning_rate": 1.2245770757075623e-07, "logits/chosen": 2.77294921875, "logits/rejected": 3.167236328125, "logps/chosen": -675.0, "logps/rejected": -1800.5, "loss": 0.6582, "rewards/accuracies": 0.8125, "rewards/chosen": 0.370849609375, "rewards/margins": 6.451171875, "rewards/rejected": -6.08984375, "step": 4813 }, { "epoch": 0.9096320090698663, "grad_norm": 2.3318237025455923, "learning_rate": 1.223652058994752e-07, "logits/chosen": 2.8984375, "logits/rejected": 2.5546875, "logps/chosen": -632.0, "logps/rejected": -700.0, "loss": 0.4753, "rewards/accuracies": 0.96875, "rewards/chosen": 1.724609375, "rewards/margins": 5.25, "rewards/rejected": -3.51953125, "step": 4814 }, { "epoch": 0.9098209646180736, "grad_norm": 2.3425761301201775, "learning_rate": 1.2227289027045158e-07, "logits/chosen": 3.63671875, "logits/rejected": 3.7109375, "logps/chosen": -571.0, "logps/rejected": -658.0, "loss": 0.6673, "rewards/accuracies": 0.75, "rewards/chosen": -0.3779296875, "rewards/margins": 3.145263671875, "rewards/rejected": -3.525390625, "step": 4815 }, { "epoch": 0.9100099201662809, "grad_norm": 4.020218197102278, "learning_rate": 1.2218076072384715e-07, "logits/chosen": 3.5703125, "logits/rejected": 3.193359375, "logps/chosen": -675.0, "logps/rejected": -1485.0, "loss": 0.6408, "rewards/accuracies": 0.75, "rewards/chosen": -0.30859375, "rewards/margins": 5.5546875, "rewards/rejected": -5.859375, "step": 4816 }, { "epoch": 0.9101988757144882, "grad_norm": 1.4669917091103766, "learning_rate": 1.2208881729974304e-07, "logits/chosen": 3.22265625, "logits/rejected": 2.9375, "logps/chosen": -934.0, "logps/rejected": -939.0, "loss": 0.6154, "rewards/accuracies": 0.6875, "rewards/chosen": 0.984375, "rewards/margins": 4.7421875, "rewards/rejected": -3.759765625, "step": 4817 }, { "epoch": 0.9103878312626954, "grad_norm": 2.581343535343636, "learning_rate": 1.2199706003813894e-07, "logits/chosen": 3.6953125, "logits/rejected": 3.0390625, "logps/chosen": -616.0, "logps/rejected": -528.5, "loss": 0.6108, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3701171875, "rewards/margins": 3.5703125, "rewards/rejected": -3.19921875, "step": 4818 }, { "epoch": 0.9105767868109027, "grad_norm": 3.2594189701875456, "learning_rate": 1.2190548897895388e-07, "logits/chosen": 3.2890625, "logits/rejected": 2.888671875, "logps/chosen": -657.0, "logps/rejected": -994.0, "loss": 0.4944, "rewards/accuracies": 0.9375, "rewards/chosen": 1.09130859375, "rewards/margins": 5.51953125, "rewards/rejected": -4.421875, "step": 4819 }, { "epoch": 0.91076574235911, "grad_norm": 2.2793048057544953, "learning_rate": 1.2181410416202588e-07, "logits/chosen": 2.2421875, "logits/rejected": 2.578125, "logps/chosen": -471.5, "logps/rejected": -600.0, "loss": 0.7576, "rewards/accuracies": 0.59375, "rewards/chosen": 0.28564453125, "rewards/margins": 2.765625, "rewards/rejected": -2.478515625, "step": 4820 }, { "epoch": 0.9109546979073173, "grad_norm": 3.697726589214466, "learning_rate": 1.2172290562711173e-07, "logits/chosen": 2.501953125, "logits/rejected": 2.044921875, "logps/chosen": -1051.0, "logps/rejected": -940.0, "loss": 0.602, "rewards/accuracies": 0.75, "rewards/chosen": 1.369140625, "rewards/margins": 4.25, "rewards/rejected": -2.884765625, "step": 4821 }, { "epoch": 0.9111436534555246, "grad_norm": 2.036219287763326, "learning_rate": 1.2163189341388735e-07, "logits/chosen": 3.14453125, "logits/rejected": 3.033203125, "logps/chosen": -721.0, "logps/rejected": -819.5, "loss": 0.4349, "rewards/accuracies": 0.875, "rewards/chosen": 0.818359375, "rewards/margins": 5.6328125, "rewards/rejected": -4.8125, "step": 4822 }, { "epoch": 0.9113326090037319, "grad_norm": 3.134590738009073, "learning_rate": 1.2154106756194746e-07, "logits/chosen": 2.384765625, "logits/rejected": 2.56640625, "logps/chosen": -820.5, "logps/rejected": -1813.0, "loss": 0.5254, "rewards/accuracies": 0.78125, "rewards/chosen": 1.78961181640625, "rewards/margins": 8.181640625, "rewards/rejected": -6.390625, "step": 4823 }, { "epoch": 0.9115215645519391, "grad_norm": 6.161825953715562, "learning_rate": 1.21450428110806e-07, "logits/chosen": 3.177734375, "logits/rejected": 2.5703125, "logps/chosen": -718.0, "logps/rejected": -848.0, "loss": 0.4267, "rewards/accuracies": 0.90625, "rewards/chosen": 0.671875, "rewards/margins": 6.0546875, "rewards/rejected": -5.390625, "step": 4824 }, { "epoch": 0.9117105201001464, "grad_norm": 2.752345748354141, "learning_rate": 1.2135997509989542e-07, "logits/chosen": 3.0341796875, "logits/rejected": 2.8681640625, "logps/chosen": -527.5, "logps/rejected": -1044.5, "loss": 0.5554, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0263671875, "rewards/margins": 5.76953125, "rewards/rejected": -4.748046875, "step": 4825 }, { "epoch": 0.9118994756483537, "grad_norm": 4.944810075057613, "learning_rate": 1.2126970856856732e-07, "logits/chosen": 2.546875, "logits/rejected": 2.289306640625, "logps/chosen": -766.5, "logps/rejected": -979.0, "loss": 0.5671, "rewards/accuracies": 0.75, "rewards/chosen": 0.43603515625, "rewards/margins": 4.61328125, "rewards/rejected": -4.18359375, "step": 4826 }, { "epoch": 0.912088431196561, "grad_norm": 2.826970455986422, "learning_rate": 1.2117962855609204e-07, "logits/chosen": 3.33203125, "logits/rejected": 3.34375, "logps/chosen": -584.5, "logps/rejected": -668.0, "loss": 0.5378, "rewards/accuracies": 0.84375, "rewards/chosen": 0.770751953125, "rewards/margins": 5.27734375, "rewards/rejected": -4.50390625, "step": 4827 }, { "epoch": 0.9122773867447683, "grad_norm": 3.3958024310780055, "learning_rate": 1.210897351016589e-07, "logits/chosen": 2.76953125, "logits/rejected": 2.78125, "logps/chosen": -806.0, "logps/rejected": -767.0, "loss": 0.6044, "rewards/accuracies": 0.78125, "rewards/chosen": 0.63916015625, "rewards/margins": 4.55078125, "rewards/rejected": -3.9140625, "step": 4828 }, { "epoch": 0.9124663422929756, "grad_norm": 3.413184414383007, "learning_rate": 1.2100002824437597e-07, "logits/chosen": 3.15625, "logits/rejected": 3.400390625, "logps/chosen": -760.0, "logps/rejected": -918.0, "loss": 0.6903, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8134765625, "rewards/margins": 4.060546875, "rewards/rejected": -3.24609375, "step": 4829 }, { "epoch": 0.9126552978411828, "grad_norm": 4.019346168884538, "learning_rate": 1.209105080232702e-07, "logits/chosen": 4.23828125, "logits/rejected": 3.8828125, "logps/chosen": -751.5, "logps/rejected": -824.0, "loss": 0.4501, "rewards/accuracies": 0.875, "rewards/chosen": 1.365234375, "rewards/margins": 6.70703125, "rewards/rejected": -5.3359375, "step": 4830 }, { "epoch": 0.9128442533893901, "grad_norm": 1.7596213967787004, "learning_rate": 1.2082117447728723e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.8125, "logps/chosen": -654.0, "logps/rejected": -1032.0, "loss": 0.5682, "rewards/accuracies": 0.8125, "rewards/chosen": 0.148193359375, "rewards/margins": 5.8828125, "rewards/rejected": -5.7265625, "step": 4831 }, { "epoch": 0.9130332089375974, "grad_norm": 1.8242691373071716, "learning_rate": 1.2073202764529158e-07, "logits/chosen": 3.34765625, "logits/rejected": 2.525390625, "logps/chosen": -980.0, "logps/rejected": -958.5, "loss": 0.6126, "rewards/accuracies": 0.71875, "rewards/chosen": 1.56201171875, "rewards/margins": 4.802734375, "rewards/rejected": -3.23828125, "step": 4832 }, { "epoch": 0.9132221644858047, "grad_norm": 2.860115697017251, "learning_rate": 1.2064306756606667e-07, "logits/chosen": 3.001953125, "logits/rejected": 2.6875, "logps/chosen": -923.0, "logps/rejected": -935.0, "loss": 0.5953, "rewards/accuracies": 0.71875, "rewards/chosen": 1.4072265625, "rewards/margins": 4.234375, "rewards/rejected": -2.828125, "step": 4833 }, { "epoch": 0.913411120034012, "grad_norm": 3.1996462227761926, "learning_rate": 1.2055429427831434e-07, "logits/chosen": 2.1953125, "logits/rejected": 2.0068359375, "logps/chosen": -671.5, "logps/rejected": -626.0, "loss": 0.5301, "rewards/accuracies": 0.8125, "rewards/chosen": 0.60546875, "rewards/margins": 4.3828125, "rewards/rejected": -3.76953125, "step": 4834 }, { "epoch": 0.9136000755822192, "grad_norm": 4.7397127772370995, "learning_rate": 1.2046570782065545e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.912109375, "logps/chosen": -751.0, "logps/rejected": -1586.0, "loss": 0.5534, "rewards/accuracies": 0.8125, "rewards/chosen": 1.23046875, "rewards/margins": 7.80078125, "rewards/rejected": -6.5625, "step": 4835 }, { "epoch": 0.9137890311304265, "grad_norm": 2.2803802018407846, "learning_rate": 1.2037730823162942e-07, "logits/chosen": 2.072265625, "logits/rejected": 2.3046875, "logps/chosen": -785.0, "logps/rejected": -1034.0, "loss": 0.4809, "rewards/accuracies": 0.75, "rewards/chosen": 1.09478759765625, "rewards/margins": 6.52734375, "rewards/rejected": -5.44140625, "step": 4836 }, { "epoch": 0.9139779866786338, "grad_norm": 2.6988115310748, "learning_rate": 1.2028909554969447e-07, "logits/chosen": 3.375, "logits/rejected": 3.22265625, "logps/chosen": -959.0, "logps/rejected": -741.0, "loss": 0.6381, "rewards/accuracies": 0.71875, "rewards/chosen": 0.807861328125, "rewards/margins": 3.8671875, "rewards/rejected": -3.05859375, "step": 4837 }, { "epoch": 0.9141669422268411, "grad_norm": 1.6486254230567907, "learning_rate": 1.202010698132276e-07, "logits/chosen": 2.748046875, "logits/rejected": 2.841796875, "logps/chosen": -1559.5, "logps/rejected": -1399.0, "loss": 0.5296, "rewards/accuracies": 0.875, "rewards/chosen": 0.27783203125, "rewards/margins": 5.33984375, "rewards/rejected": -5.078125, "step": 4838 }, { "epoch": 0.9143558977750484, "grad_norm": 4.030780291209703, "learning_rate": 1.2011323106052408e-07, "logits/chosen": 2.77734375, "logits/rejected": 2.005859375, "logps/chosen": -1201.0, "logps/rejected": -910.0, "loss": 0.6268, "rewards/accuracies": 0.75, "rewards/chosen": -0.37890625, "rewards/margins": 3.734375, "rewards/rejected": -4.109375, "step": 4839 }, { "epoch": 0.9145448533232557, "grad_norm": 4.329145989151086, "learning_rate": 1.2002557932979837e-07, "logits/chosen": 2.91796875, "logits/rejected": 3.30859375, "logps/chosen": -701.0, "logps/rejected": -836.0, "loss": 0.5468, "rewards/accuracies": 0.78125, "rewards/chosen": 0.95263671875, "rewards/margins": 6.19140625, "rewards/rejected": -5.234375, "step": 4840 }, { "epoch": 0.9147338088714629, "grad_norm": 1.9123452375518293, "learning_rate": 1.1993811465918318e-07, "logits/chosen": 3.0283203125, "logits/rejected": 2.2890625, "logps/chosen": -834.0, "logps/rejected": -755.0, "loss": 0.4669, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7239990234375, "rewards/margins": 6.2109375, "rewards/rejected": -5.4921875, "step": 4841 }, { "epoch": 0.9149227644196702, "grad_norm": 3.705680959430712, "learning_rate": 1.1985083708672998e-07, "logits/chosen": 3.44921875, "logits/rejected": 3.3046875, "logps/chosen": -869.0, "logps/rejected": -844.0, "loss": 0.5671, "rewards/accuracies": 0.875, "rewards/chosen": 0.49609375, "rewards/margins": 5.390625, "rewards/rejected": -4.890625, "step": 4842 }, { "epoch": 0.9151117199678775, "grad_norm": 1.7761261207139412, "learning_rate": 1.1976374665040877e-07, "logits/chosen": 2.64453125, "logits/rejected": 3.0390625, "logps/chosen": -682.0, "logps/rejected": -839.0, "loss": 0.792, "rewards/accuracies": 0.75, "rewards/chosen": -0.5712890625, "rewards/margins": 3.326171875, "rewards/rejected": -3.890625, "step": 4843 }, { "epoch": 0.9153006755160848, "grad_norm": 4.427407396269798, "learning_rate": 1.1967684338810825e-07, "logits/chosen": 2.990234375, "logits/rejected": 2.423828125, "logps/chosen": -841.5, "logps/rejected": -721.0, "loss": 0.5501, "rewards/accuracies": 0.84375, "rewards/chosen": 0.915679931640625, "rewards/margins": 5.69140625, "rewards/rejected": -4.7734375, "step": 4844 }, { "epoch": 0.9154896310642922, "grad_norm": 4.7106058214967605, "learning_rate": 1.1959012733763568e-07, "logits/chosen": 3.201171875, "logits/rejected": 3.2890625, "logps/chosen": -759.5, "logps/rejected": -1908.0, "loss": 0.5844, "rewards/accuracies": 0.875, "rewards/chosen": 1.08056640625, "rewards/margins": 9.046875, "rewards/rejected": -7.96484375, "step": 4845 }, { "epoch": 0.9156785866124995, "grad_norm": 3.386686011829864, "learning_rate": 1.1950359853671677e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.60546875, "logps/chosen": -588.0, "logps/rejected": -1530.0, "loss": 0.6, "rewards/accuracies": 0.90625, "rewards/chosen": 0.580078125, "rewards/margins": 9.95703125, "rewards/rejected": -9.37890625, "step": 4846 }, { "epoch": 0.9158675421607066, "grad_norm": 2.626400568504765, "learning_rate": 1.194172570229958e-07, "logits/chosen": 2.744140625, "logits/rejected": 2.3125, "logps/chosen": -866.0, "logps/rejected": -813.5, "loss": 0.6631, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08935546875, "rewards/margins": 3.6640625, "rewards/rejected": -3.7578125, "step": 4847 }, { "epoch": 0.916056497708914, "grad_norm": 3.5077587577321583, "learning_rate": 1.1933110283403557e-07, "logits/chosen": 3.26953125, "logits/rejected": 2.439453125, "logps/chosen": -729.0, "logps/rejected": -664.5, "loss": 0.5004, "rewards/accuracies": 0.875, "rewards/chosen": 0.950439453125, "rewards/margins": 4.94140625, "rewards/rejected": -3.99609375, "step": 4848 }, { "epoch": 0.9162454532571213, "grad_norm": 2.0779034548463566, "learning_rate": 1.1924513600731754e-07, "logits/chosen": 2.607421875, "logits/rejected": 2.30078125, "logps/chosen": -898.75, "logps/rejected": -1118.5, "loss": 0.5867, "rewards/accuracies": 0.78125, "rewards/chosen": 0.568603515625, "rewards/margins": 5.05078125, "rewards/rejected": -4.48828125, "step": 4849 }, { "epoch": 0.9164344088053286, "grad_norm": 3.167135945487613, "learning_rate": 1.1915935658024131e-07, "logits/chosen": 2.8359375, "logits/rejected": 2.5712890625, "logps/chosen": -833.0, "logps/rejected": -954.0, "loss": 0.6626, "rewards/accuracies": 0.84375, "rewards/chosen": -0.59130859375, "rewards/margins": 4.765625, "rewards/rejected": -5.359375, "step": 4850 }, { "epoch": 0.9166233643535359, "grad_norm": 3.422334286720759, "learning_rate": 1.1907376459012536e-07, "logits/chosen": 1.60546875, "logits/rejected": 1.48046875, "logps/chosen": -519.5, "logps/rejected": -664.5, "loss": 0.5576, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6458740234375, "rewards/margins": 4.646484375, "rewards/rejected": -4.00390625, "step": 4851 }, { "epoch": 0.9168123199017432, "grad_norm": 2.171251775359293, "learning_rate": 1.189883600742064e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.490234375, "logps/chosen": -698.0, "logps/rejected": -496.5, "loss": 0.5859, "rewards/accuracies": 0.75, "rewards/chosen": 0.4832763671875, "rewards/margins": 3.9453125, "rewards/rejected": -3.45703125, "step": 4852 }, { "epoch": 0.9170012754499504, "grad_norm": 3.0442929711654583, "learning_rate": 1.1890314306963957e-07, "logits/chosen": 2.833984375, "logits/rejected": 2.55859375, "logps/chosen": -984.0, "logps/rejected": -1845.0, "loss": 0.5527, "rewards/accuracies": 0.78125, "rewards/chosen": 0.74407958984375, "rewards/margins": 10.1484375, "rewards/rejected": -9.421875, "step": 4853 }, { "epoch": 0.9171902309981577, "grad_norm": 2.21008700848514, "learning_rate": 1.1881811361349854e-07, "logits/chosen": 3.63671875, "logits/rejected": 3.390625, "logps/chosen": -642.0, "logps/rejected": -941.0, "loss": 0.5028, "rewards/accuracies": 0.84375, "rewards/chosen": 1.09765625, "rewards/margins": 5.7265625, "rewards/rejected": -4.62109375, "step": 4854 }, { "epoch": 0.917379186546365, "grad_norm": 2.52462172072348, "learning_rate": 1.1873327174277515e-07, "logits/chosen": 3.58984375, "logits/rejected": 3.32421875, "logps/chosen": -539.0, "logps/rejected": -627.0, "loss": 0.6783, "rewards/accuracies": 0.8125, "rewards/chosen": 0.59228515625, "rewards/margins": 3.470703125, "rewards/rejected": -2.880859375, "step": 4855 }, { "epoch": 0.9175681420945723, "grad_norm": 1.312337134774398, "learning_rate": 1.1864861749438004e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.587890625, "logps/chosen": -772.0, "logps/rejected": -1174.0, "loss": 0.472, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4931640625, "rewards/margins": 6.015625, "rewards/rejected": -4.5390625, "step": 4856 }, { "epoch": 0.9177570976427796, "grad_norm": 2.9436282040724135, "learning_rate": 1.1856415090514179e-07, "logits/chosen": 3.046875, "logits/rejected": 2.91796875, "logps/chosen": -548.5, "logps/rejected": -998.0, "loss": 0.6102, "rewards/accuracies": 0.78125, "rewards/chosen": 0.162109375, "rewards/margins": 5.98046875, "rewards/rejected": -5.828125, "step": 4857 }, { "epoch": 0.9179460531909868, "grad_norm": 8.286255311058138, "learning_rate": 1.1847987201180768e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.880859375, "logps/chosen": -829.0, "logps/rejected": -2614.0, "loss": 0.7113, "rewards/accuracies": 0.78125, "rewards/chosen": 0.720703125, "rewards/margins": -0.873046875, "rewards/rejected": 1.583984375, "step": 4858 }, { "epoch": 0.9181350087391941, "grad_norm": 1.783822020026328, "learning_rate": 1.1839578085104309e-07, "logits/chosen": 2.951171875, "logits/rejected": 2.6318359375, "logps/chosen": -577.5, "logps/rejected": -640.0, "loss": 0.6444, "rewards/accuracies": 0.71875, "rewards/chosen": -0.80810546875, "rewards/margins": 4.109375, "rewards/rejected": -4.921875, "step": 4859 }, { "epoch": 0.9183239642874014, "grad_norm": 2.288796467152435, "learning_rate": 1.1831187745943187e-07, "logits/chosen": 3.08984375, "logits/rejected": 3.197265625, "logps/chosen": -464.5, "logps/rejected": -673.5, "loss": 0.6261, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6904296875, "rewards/margins": 4.390625, "rewards/rejected": -3.703125, "step": 4860 }, { "epoch": 0.9185129198356087, "grad_norm": 1.9161842174501853, "learning_rate": 1.1822816187347622e-07, "logits/chosen": 3.14453125, "logits/rejected": 2.666015625, "logps/chosen": -917.0, "logps/rejected": -1037.0, "loss": 0.5352, "rewards/accuracies": 0.8125, "rewards/chosen": 0.779296875, "rewards/margins": 7.0546875, "rewards/rejected": -6.2734375, "step": 4861 }, { "epoch": 0.918701875383816, "grad_norm": 4.709622462282029, "learning_rate": 1.181446341295964e-07, "logits/chosen": 3.796875, "logits/rejected": 3.41015625, "logps/chosen": -653.5, "logps/rejected": -633.0, "loss": 0.7105, "rewards/accuracies": 0.75, "rewards/chosen": 0.013671875, "rewards/margins": 3.787109375, "rewards/rejected": -3.7734375, "step": 4862 }, { "epoch": 0.9188908309320233, "grad_norm": 4.9722675238972185, "learning_rate": 1.1806129426413118e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.236328125, "logps/chosen": -678.5, "logps/rejected": -782.5, "loss": 0.5955, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4404296875, "rewards/margins": 5.16015625, "rewards/rejected": -4.7265625, "step": 4863 }, { "epoch": 0.9190797864802305, "grad_norm": 3.3428111855710823, "learning_rate": 1.1797814231333758e-07, "logits/chosen": 2.79296875, "logits/rejected": 1.90234375, "logps/chosen": -821.0, "logps/rejected": -1112.0, "loss": 0.5836, "rewards/accuracies": 0.84375, "rewards/chosen": 0.42578125, "rewards/margins": 5.046875, "rewards/rejected": -4.61328125, "step": 4864 }, { "epoch": 0.9192687420284378, "grad_norm": 1.1348590396107678, "learning_rate": 1.1789517831339068e-07, "logits/chosen": 2.185546875, "logits/rejected": 1.81640625, "logps/chosen": -1056.0, "logps/rejected": -1158.0, "loss": 0.5435, "rewards/accuracies": 0.84375, "rewards/chosen": 1.21337890625, "rewards/margins": 6.08984375, "rewards/rejected": -4.875, "step": 4865 }, { "epoch": 0.9194576975766451, "grad_norm": 5.090985088909997, "learning_rate": 1.1781240230038405e-07, "logits/chosen": 2.98046875, "logits/rejected": 2.638671875, "logps/chosen": -587.5, "logps/rejected": -1182.0, "loss": 0.6364, "rewards/accuracies": 0.78125, "rewards/chosen": 0.00048828125, "rewards/margins": 4.05859375, "rewards/rejected": -4.0625, "step": 4866 }, { "epoch": 0.9196466531248524, "grad_norm": 2.918601338286215, "learning_rate": 1.177298143103292e-07, "logits/chosen": 3.0595703125, "logits/rejected": 2.22998046875, "logps/chosen": -830.5, "logps/rejected": -797.0, "loss": 0.505, "rewards/accuracies": 0.9375, "rewards/chosen": 1.006103515625, "rewards/margins": 4.81640625, "rewards/rejected": -3.8125, "step": 4867 }, { "epoch": 0.9198356086730597, "grad_norm": 2.751828419533254, "learning_rate": 1.1764741437915614e-07, "logits/chosen": 3.01171875, "logits/rejected": 3.32421875, "logps/chosen": -713.0, "logps/rejected": -1670.0, "loss": 0.5228, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3017578125, "rewards/margins": 7.8515625, "rewards/rejected": -6.53515625, "step": 4868 }, { "epoch": 0.920024564221267, "grad_norm": 3.9886878125582785, "learning_rate": 1.1756520254271276e-07, "logits/chosen": 3.0703125, "logits/rejected": 2.904296875, "logps/chosen": -1055.0, "logps/rejected": -1849.0, "loss": 0.4858, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9349365234375, "rewards/margins": 7.9296875, "rewards/rejected": -5.98828125, "step": 4869 }, { "epoch": 0.9202135197694742, "grad_norm": 2.0259232056183767, "learning_rate": 1.1748317883676542e-07, "logits/chosen": 3.8125, "logits/rejected": 3.5546875, "logps/chosen": -968.0, "logps/rejected": -1563.5, "loss": 0.6521, "rewards/accuracies": 0.78125, "rewards/chosen": 0.267578125, "rewards/margins": 8.34375, "rewards/rejected": -8.05859375, "step": 4870 }, { "epoch": 0.9204024753176815, "grad_norm": 2.192672701277718, "learning_rate": 1.1740134329699837e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.77734375, "logps/chosen": -772.0, "logps/rejected": -1168.0, "loss": 0.5467, "rewards/accuracies": 0.90625, "rewards/chosen": 0.59619140625, "rewards/margins": 6.45703125, "rewards/rejected": -5.87109375, "step": 4871 }, { "epoch": 0.9205914308658888, "grad_norm": 2.603225458664138, "learning_rate": 1.1731969595901408e-07, "logits/chosen": 2.673828125, "logits/rejected": 2.677734375, "logps/chosen": -534.5, "logps/rejected": -444.0, "loss": 0.7771, "rewards/accuracies": 0.6875, "rewards/chosen": -0.052490234375, "rewards/margins": 2.169921875, "rewards/rejected": -2.2265625, "step": 4872 }, { "epoch": 0.9207803864140961, "grad_norm": 2.6770536034404793, "learning_rate": 1.1723823685833331e-07, "logits/chosen": 2.953125, "logits/rejected": 3.23046875, "logps/chosen": -991.0, "logps/rejected": -1011.0, "loss": 0.6825, "rewards/accuracies": 0.75, "rewards/chosen": 0.9765625, "rewards/margins": 3.17578125, "rewards/rejected": -2.1982421875, "step": 4873 }, { "epoch": 0.9209693419623034, "grad_norm": 2.1268578029815792, "learning_rate": 1.1715696603039471e-07, "logits/chosen": 2.4140625, "logits/rejected": 1.9658203125, "logps/chosen": -530.5, "logps/rejected": -563.5, "loss": 0.5495, "rewards/accuracies": 0.78125, "rewards/chosen": 1.584228515625, "rewards/margins": 4.59375, "rewards/rejected": -3.00390625, "step": 4874 }, { "epoch": 0.9211582975105107, "grad_norm": 1.8496485293756404, "learning_rate": 1.1707588351055517e-07, "logits/chosen": 2.208984375, "logits/rejected": 1.779296875, "logps/chosen": -684.0, "logps/rejected": -852.0, "loss": 0.5415, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4283447265625, "rewards/margins": 4.90625, "rewards/rejected": -4.4765625, "step": 4875 }, { "epoch": 0.9213472530587179, "grad_norm": 2.498228507173052, "learning_rate": 1.1699498933408943e-07, "logits/chosen": 3.33984375, "logits/rejected": 2.908203125, "logps/chosen": -791.5, "logps/rejected": -1477.0, "loss": 0.6147, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08984375, "rewards/margins": 6.857421875, "rewards/rejected": -6.76171875, "step": 4876 }, { "epoch": 0.9215362086069252, "grad_norm": 2.1592843094697454, "learning_rate": 1.1691428353619068e-07, "logits/chosen": 3.123046875, "logits/rejected": 2.9150390625, "logps/chosen": -756.5, "logps/rejected": -1622.0, "loss": 0.4798, "rewards/accuracies": 0.90625, "rewards/chosen": 1.18115234375, "rewards/margins": 9.859375, "rewards/rejected": -8.6953125, "step": 4877 }, { "epoch": 0.9217251641551325, "grad_norm": 1.6772638086613498, "learning_rate": 1.1683376615196973e-07, "logits/chosen": 2.451171875, "logits/rejected": 2.2685546875, "logps/chosen": -757.0, "logps/rejected": -950.0, "loss": 0.5326, "rewards/accuracies": 0.84375, "rewards/chosen": 0.91015625, "rewards/margins": 6.78515625, "rewards/rejected": -5.8828125, "step": 4878 }, { "epoch": 0.9219141197033398, "grad_norm": 3.2464014672921913, "learning_rate": 1.1675343721645572e-07, "logits/chosen": 1.46044921875, "logits/rejected": 0.9794921875, "logps/chosen": -1146.0, "logps/rejected": -993.0, "loss": 0.5661, "rewards/accuracies": 0.78125, "rewards/chosen": 1.197265625, "rewards/margins": 5.15625, "rewards/rejected": -3.9609375, "step": 4879 }, { "epoch": 0.9221030752515471, "grad_norm": 2.289390749512114, "learning_rate": 1.1667329676459567e-07, "logits/chosen": 2.798828125, "logits/rejected": 2.48046875, "logps/chosen": -673.5, "logps/rejected": -706.0, "loss": 0.5797, "rewards/accuracies": 0.84375, "rewards/chosen": 0.75372314453125, "rewards/margins": 4.4765625, "rewards/rejected": -3.7265625, "step": 4880 }, { "epoch": 0.9222920307997543, "grad_norm": 3.506452096985404, "learning_rate": 1.1659334483125465e-07, "logits/chosen": 3.61328125, "logits/rejected": 3.416015625, "logps/chosen": -908.5, "logps/rejected": -1708.0, "loss": 0.5217, "rewards/accuracies": 0.8125, "rewards/chosen": 1.30865478515625, "rewards/margins": 7.4296875, "rewards/rejected": -6.12109375, "step": 4881 }, { "epoch": 0.9224809863479616, "grad_norm": 5.947321153139809, "learning_rate": 1.1651358145121574e-07, "logits/chosen": 2.783203125, "logits/rejected": 2.2578125, "logps/chosen": -1016.0, "logps/rejected": -862.5, "loss": 0.5199, "rewards/accuracies": 0.875, "rewards/chosen": 1.01171875, "rewards/margins": 4.9375, "rewards/rejected": -3.93359375, "step": 4882 }, { "epoch": 0.9226699418961689, "grad_norm": 2.554952891055375, "learning_rate": 1.1643400665917985e-07, "logits/chosen": 2.99609375, "logits/rejected": 2.1328125, "logps/chosen": -706.0, "logps/rejected": -669.0, "loss": 0.5683, "rewards/accuracies": 0.75, "rewards/chosen": 1.13720703125, "rewards/margins": 4.478515625, "rewards/rejected": -3.337890625, "step": 4883 }, { "epoch": 0.9228588974443762, "grad_norm": 2.715834470238004, "learning_rate": 1.1635462048976605e-07, "logits/chosen": 3.220703125, "logits/rejected": 3.482421875, "logps/chosen": -626.0, "logps/rejected": -630.0, "loss": 0.6194, "rewards/accuracies": 0.875, "rewards/chosen": 0.5224609375, "rewards/margins": 4.13671875, "rewards/rejected": -3.60546875, "step": 4884 }, { "epoch": 0.9230478529925835, "grad_norm": 1.5731949078946224, "learning_rate": 1.1627542297751117e-07, "logits/chosen": 2.36328125, "logits/rejected": 1.4970703125, "logps/chosen": -855.0, "logps/rejected": -14224.0, "loss": 0.522, "rewards/accuracies": 0.84375, "rewards/chosen": 1.16015625, "rewards/margins": -167.30859375, "rewards/rejected": 168.07421875, "step": 4885 }, { "epoch": 0.9232368085407908, "grad_norm": 2.0745074763695195, "learning_rate": 1.1619641415687009e-07, "logits/chosen": 2.623046875, "logits/rejected": 2.607421875, "logps/chosen": -1180.0, "logps/rejected": -1090.0, "loss": 0.5128, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2724609375, "rewards/margins": 5.5703125, "rewards/rejected": -4.30078125, "step": 4886 }, { "epoch": 0.923425764088998, "grad_norm": 6.413633582383314, "learning_rate": 1.1611759406221554e-07, "logits/chosen": 3.490234375, "logits/rejected": 2.943359375, "logps/chosen": -1068.0, "logps/rejected": -1060.0, "loss": 0.591, "rewards/accuracies": 0.75, "rewards/chosen": 0.8466796875, "rewards/margins": 9.6640625, "rewards/rejected": -8.8203125, "step": 4887 }, { "epoch": 0.9236147196372053, "grad_norm": 2.8556443203519444, "learning_rate": 1.160389627278381e-07, "logits/chosen": 2.92578125, "logits/rejected": 2.5732421875, "logps/chosen": -691.5, "logps/rejected": -1051.5, "loss": 0.4604, "rewards/accuracies": 0.84375, "rewards/chosen": 0.962890625, "rewards/margins": 6.4296875, "rewards/rejected": -5.4609375, "step": 4888 }, { "epoch": 0.9238036751854126, "grad_norm": 1.4341968528988933, "learning_rate": 1.1596052018794636e-07, "logits/chosen": 2.525390625, "logits/rejected": 2.40234375, "logps/chosen": -933.0, "logps/rejected": -1489.0, "loss": 0.5776, "rewards/accuracies": 0.875, "rewards/chosen": 0.574462890625, "rewards/margins": 5.57421875, "rewards/rejected": -5.0078125, "step": 4889 }, { "epoch": 0.9239926307336199, "grad_norm": 2.2704049851074486, "learning_rate": 1.1588226647666666e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.111328125, "logps/chosen": -574.5, "logps/rejected": -574.0, "loss": 0.3818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8690185546875, "rewards/margins": 6.40625, "rewards/rejected": -5.546875, "step": 4890 }, { "epoch": 0.9241815862818272, "grad_norm": 1.93260282370903, "learning_rate": 1.1580420162804323e-07, "logits/chosen": 2.056640625, "logits/rejected": 1.615234375, "logps/chosen": -563.0, "logps/rejected": -1175.0, "loss": 0.4916, "rewards/accuracies": 0.875, "rewards/chosen": 0.77294921875, "rewards/margins": 5.53515625, "rewards/rejected": -4.76953125, "step": 4891 }, { "epoch": 0.9243705418300345, "grad_norm": 1.2738435126721213, "learning_rate": 1.1572632567603815e-07, "logits/chosen": 2.955078125, "logits/rejected": 2.79296875, "logps/chosen": -435.5, "logps/rejected": -526.0, "loss": 0.6287, "rewards/accuracies": 0.65625, "rewards/chosen": 1.02734375, "rewards/margins": 3.49609375, "rewards/rejected": -2.466796875, "step": 4892 }, { "epoch": 0.9245594973782417, "grad_norm": 3.213962810062692, "learning_rate": 1.156486386545312e-07, "logits/chosen": 2.78515625, "logits/rejected": 2.62890625, "logps/chosen": -567.5, "logps/rejected": -767.0, "loss": 0.5731, "rewards/accuracies": 0.78125, "rewards/chosen": 0.9736328125, "rewards/margins": 4.6171875, "rewards/rejected": -3.6484375, "step": 4893 }, { "epoch": 0.924748452926449, "grad_norm": 1.876135084404137, "learning_rate": 1.1557114059732028e-07, "logits/chosen": 2.681640625, "logits/rejected": 2.0419921875, "logps/chosen": -1118.0, "logps/rejected": -847.0, "loss": 0.5257, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8740234375, "rewards/margins": 5.6875, "rewards/rejected": -3.828125, "step": 4894 }, { "epoch": 0.9249374084746563, "grad_norm": 2.457842066418347, "learning_rate": 1.1549383153812074e-07, "logits/chosen": 3.04296875, "logits/rejected": 2.453125, "logps/chosen": -705.25, "logps/rejected": -1087.0, "loss": 0.6316, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9794921875, "rewards/margins": 5.2626953125, "rewards/rejected": -4.294921875, "step": 4895 }, { "epoch": 0.9251263640228636, "grad_norm": 2.0984918113683486, "learning_rate": 1.1541671151056584e-07, "logits/chosen": 3.80078125, "logits/rejected": 3.048828125, "logps/chosen": -518.0, "logps/rejected": -1005.5, "loss": 0.6473, "rewards/accuracies": 0.8125, "rewards/chosen": 1.020263671875, "rewards/margins": 6.541015625, "rewards/rejected": -5.5234375, "step": 4896 }, { "epoch": 0.9253153195710709, "grad_norm": 3.083902786641484, "learning_rate": 1.1533978054820664e-07, "logits/chosen": 2.62109375, "logits/rejected": 2.384765625, "logps/chosen": -690.5, "logps/rejected": -644.0, "loss": 0.6438, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4345703125, "rewards/margins": 3.9296875, "rewards/rejected": -3.48828125, "step": 4897 }, { "epoch": 0.9255042751192782, "grad_norm": 2.799851121498014, "learning_rate": 1.1526303868451191e-07, "logits/chosen": 3.74609375, "logits/rejected": 3.38671875, "logps/chosen": -533.0, "logps/rejected": -790.0, "loss": 0.6554, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4970703125, "rewards/margins": 3.4921875, "rewards/rejected": -2.9921875, "step": 4898 }, { "epoch": 0.9256932306674854, "grad_norm": 2.830194109105625, "learning_rate": 1.1518648595286807e-07, "logits/chosen": 2.8359375, "logits/rejected": 3.27734375, "logps/chosen": -390.5, "logps/rejected": -596.5, "loss": 0.7789, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4095458984375, "rewards/margins": 3.48046875, "rewards/rejected": -3.8837890625, "step": 4899 }, { "epoch": 0.9258821862156927, "grad_norm": 3.6135025563256633, "learning_rate": 1.1511012238657939e-07, "logits/chosen": 2.51953125, "logits/rejected": 2.55078125, "logps/chosen": -666.0, "logps/rejected": -1137.0, "loss": 0.525, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6376953125, "rewards/margins": 5.32421875, "rewards/rejected": -4.68359375, "step": 4900 }, { "epoch": 0.9260711417639, "grad_norm": 1.5542480322162247, "learning_rate": 1.1503394801886788e-07, "logits/chosen": 2.849609375, "logits/rejected": 2.73046875, "logps/chosen": -602.0, "logps/rejected": -916.5, "loss": 0.568, "rewards/accuracies": 0.84375, "rewards/chosen": 0.919921875, "rewards/margins": 5.609375, "rewards/rejected": -4.68359375, "step": 4901 }, { "epoch": 0.9262600973121073, "grad_norm": 1.8003148709690078, "learning_rate": 1.14957962882873e-07, "logits/chosen": 2.734375, "logits/rejected": 2.912109375, "logps/chosen": -863.75, "logps/rejected": -5351.5, "loss": 0.6368, "rewards/accuracies": 0.71875, "rewards/chosen": 0.806640625, "rewards/margins": 2.53125, "rewards/rejected": -1.72265625, "step": 4902 }, { "epoch": 0.9264490528603146, "grad_norm": 4.366430980434129, "learning_rate": 1.1488216701165218e-07, "logits/chosen": 2.546875, "logits/rejected": 2.0078125, "logps/chosen": -1397.0, "logps/rejected": -1165.0, "loss": 0.4624, "rewards/accuracies": 0.875, "rewards/chosen": 2.25732421875, "rewards/margins": 7.125, "rewards/rejected": -4.8671875, "step": 4903 }, { "epoch": 0.9266380084085218, "grad_norm": 2.5693928558736223, "learning_rate": 1.1480656043818024e-07, "logits/chosen": 4.0078125, "logits/rejected": 3.34765625, "logps/chosen": -511.5, "logps/rejected": -494.5, "loss": 0.6391, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1318359375, "rewards/margins": 3.3203125, "rewards/rejected": -2.181640625, "step": 4904 }, { "epoch": 0.9268269639567291, "grad_norm": 4.220645382229953, "learning_rate": 1.1473114319534991e-07, "logits/chosen": 2.84765625, "logits/rejected": 2.453125, "logps/chosen": -606.5, "logps/rejected": -741.0, "loss": 0.6321, "rewards/accuracies": 0.71875, "rewards/chosen": 0.8050537109375, "rewards/margins": 4.826171875, "rewards/rejected": -4.029296875, "step": 4905 }, { "epoch": 0.9270159195049364, "grad_norm": 2.1027345011744765, "learning_rate": 1.146559153159712e-07, "logits/chosen": 3.59375, "logits/rejected": 3.7578125, "logps/chosen": -672.0, "logps/rejected": -1530.0, "loss": 0.4915, "rewards/accuracies": 0.875, "rewards/chosen": 1.853515625, "rewards/margins": 8.2578125, "rewards/rejected": -6.390625, "step": 4906 }, { "epoch": 0.9272048750531438, "grad_norm": 2.54839407198746, "learning_rate": 1.1458087683277223e-07, "logits/chosen": 2.599609375, "logits/rejected": 2.94140625, "logps/chosen": -904.5, "logps/rejected": -1976.0, "loss": 0.5623, "rewards/accuracies": 0.8125, "rewards/chosen": 0.72265625, "rewards/margins": 5.6640625, "rewards/rejected": -4.94140625, "step": 4907 }, { "epoch": 0.927393830601351, "grad_norm": 2.209022406826111, "learning_rate": 1.1450602777839824e-07, "logits/chosen": 2.46875, "logits/rejected": 2.619140625, "logps/chosen": -985.5, "logps/rejected": -791.0, "loss": 0.6082, "rewards/accuracies": 0.78125, "rewards/chosen": 0.43560791015625, "rewards/margins": 4.86328125, "rewards/rejected": -4.43359375, "step": 4908 }, { "epoch": 0.9275827861495584, "grad_norm": 4.6605810292569085, "learning_rate": 1.1443136818541232e-07, "logits/chosen": 3.57421875, "logits/rejected": 3.41796875, "logps/chosen": -675.0, "logps/rejected": -627.5, "loss": 0.5895, "rewards/accuracies": 0.8125, "rewards/chosen": 0.55908203125, "rewards/margins": 3.8671875, "rewards/rejected": -3.3046875, "step": 4909 }, { "epoch": 0.9277717416977656, "grad_norm": 3.031305580385621, "learning_rate": 1.1435689808629516e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.375, "logps/chosen": -928.0, "logps/rejected": -946.0, "loss": 0.6626, "rewards/accuracies": 0.78125, "rewards/chosen": 0.90087890625, "rewards/margins": 4.697265625, "rewards/rejected": -3.802734375, "step": 4910 }, { "epoch": 0.9279606972459729, "grad_norm": 2.8160901085331065, "learning_rate": 1.1428261751344482e-07, "logits/chosen": 3.44140625, "logits/rejected": 3.546875, "logps/chosen": -674.0, "logps/rejected": -1050.0, "loss": 0.5807, "rewards/accuracies": 0.75, "rewards/chosen": 0.801513671875, "rewards/margins": 5.9296875, "rewards/rejected": -5.125, "step": 4911 }, { "epoch": 0.9281496527941802, "grad_norm": 2.596894044037005, "learning_rate": 1.1420852649917706e-07, "logits/chosen": 3.314453125, "logits/rejected": 3.193359375, "logps/chosen": -431.0, "logps/rejected": -624.0, "loss": 0.6285, "rewards/accuracies": 0.8125, "rewards/chosen": 0.347900390625, "rewards/margins": 3.990234375, "rewards/rejected": -3.634765625, "step": 4912 }, { "epoch": 0.9283386083423875, "grad_norm": 2.7684575603428883, "learning_rate": 1.1413462507572521e-07, "logits/chosen": 1.94140625, "logits/rejected": 1.5478515625, "logps/chosen": -1053.0, "logps/rejected": -1074.0, "loss": 0.57, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0947265625, "rewards/margins": 5.8203125, "rewards/rejected": -4.7265625, "step": 4913 }, { "epoch": 0.9285275638905948, "grad_norm": 3.575929767939942, "learning_rate": 1.1406091327523982e-07, "logits/chosen": 2.888671875, "logits/rejected": 2.4892578125, "logps/chosen": -906.0, "logps/rejected": -1154.0, "loss": 0.5083, "rewards/accuracies": 0.84375, "rewards/chosen": 1.232421875, "rewards/margins": 9.66796875, "rewards/rejected": -8.435546875, "step": 4914 }, { "epoch": 0.9287165194388021, "grad_norm": 2.5299062801052914, "learning_rate": 1.1398739112978942e-07, "logits/chosen": 2.443359375, "logits/rejected": 2.82421875, "logps/chosen": -826.0, "logps/rejected": -1722.0, "loss": 0.7771, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0478515625, "rewards/margins": 4.8681640625, "rewards/rejected": -4.90625, "step": 4915 }, { "epoch": 0.9289054749870093, "grad_norm": 3.1590095075974207, "learning_rate": 1.1391405867135965e-07, "logits/chosen": 2.8935546875, "logits/rejected": 2.908203125, "logps/chosen": -932.5, "logps/rejected": -1138.0, "loss": 0.4219, "rewards/accuracies": 0.875, "rewards/chosen": 2.0693359375, "rewards/margins": 8.1875, "rewards/rejected": -6.125, "step": 4916 }, { "epoch": 0.9290944305352166, "grad_norm": 1.8395719250484706, "learning_rate": 1.1384091593185376e-07, "logits/chosen": 2.7890625, "logits/rejected": 3.140625, "logps/chosen": -761.0, "logps/rejected": -832.0, "loss": 0.5582, "rewards/accuracies": 0.84375, "rewards/chosen": 0.850341796875, "rewards/margins": 4.765625, "rewards/rejected": -3.923828125, "step": 4917 }, { "epoch": 0.9292833860834239, "grad_norm": 2.063513264883694, "learning_rate": 1.1376796294309248e-07, "logits/chosen": 2.3447265625, "logits/rejected": 2.10400390625, "logps/chosen": -955.0, "logps/rejected": -744.0, "loss": 0.5116, "rewards/accuracies": 0.90625, "rewards/chosen": 0.447265625, "rewards/margins": 4.88671875, "rewards/rejected": -4.44140625, "step": 4918 }, { "epoch": 0.9294723416316312, "grad_norm": 4.222386354657014, "learning_rate": 1.1369519973681393e-07, "logits/chosen": 2.8671875, "logits/rejected": 2.958984375, "logps/chosen": -1034.5, "logps/rejected": -2506.0, "loss": 0.5443, "rewards/accuracies": 0.84375, "rewards/chosen": 1.00537109375, "rewards/margins": 11.81640625, "rewards/rejected": -10.8046875, "step": 4919 }, { "epoch": 0.9296612971798385, "grad_norm": 4.239846551657849, "learning_rate": 1.1362262634467363e-07, "logits/chosen": 3.3671875, "logits/rejected": 3.173828125, "logps/chosen": -626.0, "logps/rejected": -1117.0, "loss": 0.631, "rewards/accuracies": 0.875, "rewards/chosen": 0.7724609375, "rewards/margins": 4.8125, "rewards/rejected": -4.04296875, "step": 4920 }, { "epoch": 0.9298502527280458, "grad_norm": 2.7481269839471536, "learning_rate": 1.1355024279824475e-07, "logits/chosen": 2.55859375, "logits/rejected": 3.203125, "logps/chosen": -1043.0, "logps/rejected": -1100.0, "loss": 0.5843, "rewards/accuracies": 0.78125, "rewards/chosen": 1.4765625, "rewards/margins": 6.00390625, "rewards/rejected": -4.5390625, "step": 4921 }, { "epoch": 0.930039208276253, "grad_norm": 2.348389085000032, "learning_rate": 1.1347804912901762e-07, "logits/chosen": 3.171875, "logits/rejected": 2.595703125, "logps/chosen": -761.0, "logps/rejected": -799.0, "loss": 0.621, "rewards/accuracies": 0.71875, "rewards/chosen": 1.374755859375, "rewards/margins": 4.58203125, "rewards/rejected": -3.2109375, "step": 4922 }, { "epoch": 0.9302281638244603, "grad_norm": 4.507006561053775, "learning_rate": 1.1340604536840007e-07, "logits/chosen": 3.3203125, "logits/rejected": 2.8828125, "logps/chosen": -601.5, "logps/rejected": -856.0, "loss": 0.6421, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65887451171875, "rewards/margins": 3.37109375, "rewards/rejected": -2.7109375, "step": 4923 }, { "epoch": 0.9304171193726676, "grad_norm": 2.6169584159833232, "learning_rate": 1.1333423154771728e-07, "logits/chosen": 2.9921875, "logits/rejected": 2.20703125, "logps/chosen": -680.0, "logps/rejected": -928.0, "loss": 0.5181, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6201171875, "rewards/margins": 5.859375, "rewards/rejected": -5.234375, "step": 4924 }, { "epoch": 0.9306060749208749, "grad_norm": 2.262343290156398, "learning_rate": 1.132626076982118e-07, "logits/chosen": 2.3046875, "logits/rejected": 2.265625, "logps/chosen": -971.0, "logps/rejected": -2030.0, "loss": 0.4724, "rewards/accuracies": 0.90625, "rewards/chosen": 1.287841796875, "rewards/margins": 8.7421875, "rewards/rejected": -7.4375, "step": 4925 }, { "epoch": 0.9307950304690822, "grad_norm": 4.522811242012508, "learning_rate": 1.131911738510436e-07, "logits/chosen": 2.953125, "logits/rejected": 3.0078125, "logps/chosen": -528.5, "logps/rejected": -578.0, "loss": 0.6107, "rewards/accuracies": 0.9375, "rewards/chosen": 0.89306640625, "rewards/margins": 3.76171875, "rewards/rejected": -2.865234375, "step": 4926 }, { "epoch": 0.9309839860172894, "grad_norm": 2.4040695567561587, "learning_rate": 1.131199300372898e-07, "logits/chosen": 2.984375, "logits/rejected": 2.76171875, "logps/chosen": -846.5, "logps/rejected": -1060.0, "loss": 0.4212, "rewards/accuracies": 0.90625, "rewards/chosen": 1.2724609375, "rewards/margins": 6.96875, "rewards/rejected": -5.6953125, "step": 4927 }, { "epoch": 0.9311729415654967, "grad_norm": 4.299531569459713, "learning_rate": 1.1304887628794515e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.6953125, "logps/chosen": -774.0, "logps/rejected": -1045.5, "loss": 0.5872, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8955078125, "rewards/margins": 5.50390625, "rewards/rejected": -4.59375, "step": 4928 }, { "epoch": 0.931361897113704, "grad_norm": 4.318328448849262, "learning_rate": 1.129780126339214e-07, "logits/chosen": 1.689453125, "logits/rejected": 1.4033203125, "logps/chosen": -651.0, "logps/rejected": -1352.0, "loss": 0.5797, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0406494140625, "rewards/margins": 6.0625, "rewards/rejected": -6.0234375, "step": 4929 }, { "epoch": 0.9315508526619113, "grad_norm": 2.4057936925907923, "learning_rate": 1.1290733910604782e-07, "logits/chosen": 3.44921875, "logits/rejected": 3.69140625, "logps/chosen": -611.5, "logps/rejected": -858.0, "loss": 0.7134, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0849609375, "rewards/margins": 4.47314453125, "rewards/rejected": -4.5439453125, "step": 4930 }, { "epoch": 0.9317398082101186, "grad_norm": 2.9487643095920473, "learning_rate": 1.1283685573507086e-07, "logits/chosen": 1.86181640625, "logits/rejected": 1.27301025390625, "logps/chosen": -685.0, "logps/rejected": -623.5, "loss": 0.6703, "rewards/accuracies": 0.875, "rewards/chosen": 0.555419921875, "rewards/margins": 4.34765625, "rewards/rejected": -3.791015625, "step": 4931 }, { "epoch": 0.9319287637583259, "grad_norm": 3.1139408857510786, "learning_rate": 1.1276656255165426e-07, "logits/chosen": 2.53125, "logits/rejected": 2.5546875, "logps/chosen": -575.5, "logps/rejected": -788.0, "loss": 0.6124, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0283203125, "rewards/margins": 4.04296875, "rewards/rejected": -3.009765625, "step": 4932 }, { "epoch": 0.9321177193065331, "grad_norm": 4.014418162030528, "learning_rate": 1.1269645958637905e-07, "logits/chosen": 3.7734375, "logits/rejected": 3.20703125, "logps/chosen": -679.0, "logps/rejected": -612.5, "loss": 0.569, "rewards/accuracies": 0.84375, "rewards/chosen": 0.53326416015625, "rewards/margins": 4.98046875, "rewards/rejected": -4.44140625, "step": 4933 }, { "epoch": 0.9323066748547404, "grad_norm": 2.563885220901964, "learning_rate": 1.1262654686974348e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.408203125, "logps/chosen": -659.0, "logps/rejected": -1079.0, "loss": 0.6094, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3092041015625, "rewards/margins": 5.0390625, "rewards/rejected": -4.736328125, "step": 4934 }, { "epoch": 0.9324956304029477, "grad_norm": 3.7870378442547232, "learning_rate": 1.1255682443216291e-07, "logits/chosen": 3.138671875, "logits/rejected": 2.455078125, "logps/chosen": -996.0, "logps/rejected": -1012.0, "loss": 0.4821, "rewards/accuracies": 0.875, "rewards/chosen": 0.7958984375, "rewards/margins": 6.8828125, "rewards/rejected": -6.0859375, "step": 4935 }, { "epoch": 0.932684585951155, "grad_norm": 3.3352910431692457, "learning_rate": 1.124872923039703e-07, "logits/chosen": 1.939453125, "logits/rejected": 1.77734375, "logps/chosen": -980.5, "logps/rejected": -1051.0, "loss": 0.5246, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8916015625, "rewards/margins": 5.9765625, "rewards/rejected": -4.078125, "step": 4936 }, { "epoch": 0.9328735414993623, "grad_norm": 1.6212862210781547, "learning_rate": 1.1241795051541534e-07, "logits/chosen": 3.140625, "logits/rejected": 2.58984375, "logps/chosen": -512.0, "logps/rejected": -835.0, "loss": 0.5538, "rewards/accuracies": 0.875, "rewards/chosen": 0.86767578125, "rewards/margins": 5.15625, "rewards/rejected": -4.2890625, "step": 4937 }, { "epoch": 0.9330624970475696, "grad_norm": 4.246798677191237, "learning_rate": 1.1234879909666525e-07, "logits/chosen": 2.921875, "logits/rejected": 2.84375, "logps/chosen": -604.0, "logps/rejected": -743.0, "loss": 0.6487, "rewards/accuracies": 0.78125, "rewards/chosen": 0.26708984375, "rewards/margins": 3.26953125, "rewards/rejected": -3.00390625, "step": 4938 }, { "epoch": 0.9332514525957768, "grad_norm": 4.157832469567301, "learning_rate": 1.1227983807780427e-07, "logits/chosen": 3.5625, "logits/rejected": 3.064453125, "logps/chosen": -973.5, "logps/rejected": -1052.5, "loss": 0.5926, "rewards/accuracies": 0.6875, "rewards/chosen": 0.52294921875, "rewards/margins": 6.25, "rewards/rejected": -5.740234375, "step": 4939 }, { "epoch": 0.9334404081439841, "grad_norm": 3.7241500718364544, "learning_rate": 1.1221106748883382e-07, "logits/chosen": 3.14453125, "logits/rejected": 2.353515625, "logps/chosen": -572.0, "logps/rejected": -688.0, "loss": 0.5916, "rewards/accuracies": 0.875, "rewards/chosen": 0.18017578125, "rewards/margins": 4.4453125, "rewards/rejected": -4.2578125, "step": 4940 }, { "epoch": 0.9336293636921914, "grad_norm": 1.468061598959788, "learning_rate": 1.1214248735967258e-07, "logits/chosen": 2.646484375, "logits/rejected": 2.53515625, "logps/chosen": -1035.5, "logps/rejected": -1145.0, "loss": 0.5288, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8408203125, "rewards/margins": 6.994140625, "rewards/rejected": -6.140625, "step": 4941 }, { "epoch": 0.9338183192403987, "grad_norm": 2.0281725385480787, "learning_rate": 1.1207409772015613e-07, "logits/chosen": 2.0546875, "logits/rejected": 1.48828125, "logps/chosen": -763.5, "logps/rejected": -9529.0, "loss": 0.6449, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14794921875, "rewards/margins": -31.6474609375, "rewards/rejected": 31.60546875, "step": 4942 }, { "epoch": 0.934007274788606, "grad_norm": 2.2044475422805623, "learning_rate": 1.1200589860003759e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.826171875, "logps/chosen": -1371.5, "logps/rejected": -968.5, "loss": 0.6138, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0244140625, "rewards/margins": 4.013671875, "rewards/rejected": -3.9921875, "step": 4943 }, { "epoch": 0.9341962303368133, "grad_norm": 2.4054468509595126, "learning_rate": 1.1193789002898678e-07, "logits/chosen": 3.3125, "logits/rejected": 2.525390625, "logps/chosen": -875.5, "logps/rejected": -1161.0, "loss": 0.5736, "rewards/accuracies": 0.71875, "rewards/chosen": 1.646484375, "rewards/margins": 9.62890625, "rewards/rejected": -7.970703125, "step": 4944 }, { "epoch": 0.9343851858850205, "grad_norm": 2.7344032408600776, "learning_rate": 1.1187007203659082e-07, "logits/chosen": 2.482421875, "logits/rejected": 2.00390625, "logps/chosen": -880.5, "logps/rejected": -1735.0, "loss": 0.4695, "rewards/accuracies": 0.875, "rewards/chosen": 1.30859375, "rewards/margins": 11.2265625, "rewards/rejected": -9.921875, "step": 4945 }, { "epoch": 0.9345741414332278, "grad_norm": 1.9560089279602246, "learning_rate": 1.1180244465235387e-07, "logits/chosen": 2.50390625, "logits/rejected": 2.330078125, "logps/chosen": -664.5, "logps/rejected": -769.0, "loss": 0.5187, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6650390625, "rewards/margins": 4.578125, "rewards/rejected": -3.91796875, "step": 4946 }, { "epoch": 0.9347630969814351, "grad_norm": 2.066364517179942, "learning_rate": 1.1173500790569723e-07, "logits/chosen": 2.595703125, "logits/rejected": 2.037109375, "logps/chosen": -889.0, "logps/rejected": -834.0, "loss": 0.4893, "rewards/accuracies": 0.8125, "rewards/chosen": 1.59033203125, "rewards/margins": 6.36328125, "rewards/rejected": -4.76953125, "step": 4947 }, { "epoch": 0.9349520525296424, "grad_norm": 2.955195339082404, "learning_rate": 1.1166776182595911e-07, "logits/chosen": 2.46484375, "logits/rejected": 2.123046875, "logps/chosen": -723.0, "logps/rejected": -644.0, "loss": 0.5492, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7587890625, "rewards/margins": 4.5390625, "rewards/rejected": -3.7734375, "step": 4948 }, { "epoch": 0.9351410080778497, "grad_norm": 2.9055077029647567, "learning_rate": 1.1160070644239504e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.591796875, "logps/chosen": -703.0, "logps/rejected": -981.0, "loss": 0.5663, "rewards/accuracies": 0.875, "rewards/chosen": 0.06378173828125, "rewards/margins": 4.28515625, "rewards/rejected": -4.2109375, "step": 4949 }, { "epoch": 0.9353299636260569, "grad_norm": 2.789526174116556, "learning_rate": 1.1153384178417732e-07, "logits/chosen": 3.16015625, "logits/rejected": 2.552734375, "logps/chosen": -1259.0, "logps/rejected": -880.0, "loss": 0.5947, "rewards/accuracies": 0.78125, "rewards/chosen": -0.172119140625, "rewards/margins": 4.3359375, "rewards/rejected": -4.5078125, "step": 4950 }, { "epoch": 0.9355189191742642, "grad_norm": 2.8899393647794014, "learning_rate": 1.1146716788039536e-07, "logits/chosen": 2.419921875, "logits/rejected": 2.95703125, "logps/chosen": -586.0, "logps/rejected": -16140.0, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 0.62890625, "rewards/margins": -156.9921875, "rewards/rejected": 158.2421875, "step": 4951 }, { "epoch": 0.9357078747224715, "grad_norm": 2.495200306402023, "learning_rate": 1.1140068476005571e-07, "logits/chosen": 2.626953125, "logits/rejected": 2.650390625, "logps/chosen": -911.0, "logps/rejected": -1593.0, "loss": 0.6779, "rewards/accuracies": 0.75, "rewards/chosen": -0.0675048828125, "rewards/margins": 14.0390625, "rewards/rejected": -14.0703125, "step": 4952 }, { "epoch": 0.9358968302706788, "grad_norm": 4.0084664096547, "learning_rate": 1.1133439245208164e-07, "logits/chosen": 3.927734375, "logits/rejected": 3.162109375, "logps/chosen": -734.5, "logps/rejected": -880.0, "loss": 0.6227, "rewards/accuracies": 0.8125, "rewards/chosen": 1.08056640625, "rewards/margins": 4.6015625, "rewards/rejected": -3.515625, "step": 4953 }, { "epoch": 0.9360857858188861, "grad_norm": 2.428232581447526, "learning_rate": 1.1126829098531372e-07, "logits/chosen": 3.0830078125, "logits/rejected": 2.58984375, "logps/chosen": -972.0, "logps/rejected": -841.0, "loss": 0.6254, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7109375, "rewards/margins": 4.44140625, "rewards/rejected": -3.7265625, "step": 4954 }, { "epoch": 0.9362747413670934, "grad_norm": 3.299914236528765, "learning_rate": 1.1120238038850934e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.57421875, "logps/chosen": -494.5, "logps/rejected": -769.5, "loss": 0.616, "rewards/accuracies": 0.84375, "rewards/chosen": 0.41455078125, "rewards/margins": 4.38671875, "rewards/rejected": -3.97265625, "step": 4955 }, { "epoch": 0.9364636969153006, "grad_norm": 4.427554948544236, "learning_rate": 1.111366606903428e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.06640625, "logps/chosen": -972.0, "logps/rejected": -914.0, "loss": 0.5307, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7294921875, "rewards/margins": 5.029296875, "rewards/rejected": -4.3046875, "step": 4956 }, { "epoch": 0.9366526524635079, "grad_norm": 2.7057946323214592, "learning_rate": 1.1107113191940548e-07, "logits/chosen": 3.3203125, "logits/rejected": 3.328125, "logps/chosen": -603.25, "logps/rejected": -1454.5, "loss": 0.5998, "rewards/accuracies": 0.875, "rewards/chosen": 0.3466796875, "rewards/margins": 5.84375, "rewards/rejected": -5.5078125, "step": 4957 }, { "epoch": 0.9368416080117152, "grad_norm": 1.9083199996141176, "learning_rate": 1.1100579410420554e-07, "logits/chosen": 3.453125, "logits/rejected": 3.408203125, "logps/chosen": -697.0, "logps/rejected": -1697.0, "loss": 0.7455, "rewards/accuracies": 0.75, "rewards/chosen": -0.5836181640625, "rewards/margins": 17.5625, "rewards/rejected": -18.09765625, "step": 4958 }, { "epoch": 0.9370305635599225, "grad_norm": 1.7602712705784727, "learning_rate": 1.1094064727316829e-07, "logits/chosen": 3.81640625, "logits/rejected": 3.55859375, "logps/chosen": -649.5, "logps/rejected": -1046.0, "loss": 0.5918, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6875, "rewards/margins": 6.75, "rewards/rejected": -6.05078125, "step": 4959 }, { "epoch": 0.9372195191081298, "grad_norm": 1.4982882203584778, "learning_rate": 1.1087569145463575e-07, "logits/chosen": 3.07421875, "logits/rejected": 2.48046875, "logps/chosen": -919.0, "logps/rejected": -884.0, "loss": 0.5549, "rewards/accuracies": 0.84375, "rewards/chosen": 1.6005859375, "rewards/margins": 6.560546875, "rewards/rejected": -4.962890625, "step": 4960 }, { "epoch": 0.9374084746563371, "grad_norm": 1.870542096077649, "learning_rate": 1.1081092667686687e-07, "logits/chosen": 2.865234375, "logits/rejected": 2.638671875, "logps/chosen": -930.0, "logps/rejected": -1172.0, "loss": 0.5464, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3076171875, "rewards/margins": 5.3046875, "rewards/rejected": -4.99609375, "step": 4961 }, { "epoch": 0.9375974302045443, "grad_norm": 3.4718808387615194, "learning_rate": 1.1074635296803761e-07, "logits/chosen": 3.109375, "logits/rejected": 2.8359375, "logps/chosen": -813.0, "logps/rejected": -1715.0, "loss": 0.6066, "rewards/accuracies": 0.75, "rewards/chosen": 0.421142578125, "rewards/margins": 9.1728515625, "rewards/rejected": -8.7568359375, "step": 4962 }, { "epoch": 0.9377863857527516, "grad_norm": 5.715067148696247, "learning_rate": 1.1068197035624059e-07, "logits/chosen": 3.41015625, "logits/rejected": 3.21875, "logps/chosen": -785.0, "logps/rejected": -1434.0, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": 0.0439453125, "rewards/margins": 6.0419921875, "rewards/rejected": -5.9921875, "step": 4963 }, { "epoch": 0.9379753413009589, "grad_norm": 2.8176278243447612, "learning_rate": 1.1061777886948556e-07, "logits/chosen": 2.083984375, "logits/rejected": 2.3134765625, "logps/chosen": -1070.0, "logps/rejected": -1897.5, "loss": 0.4793, "rewards/accuracies": 0.8125, "rewards/chosen": 1.01318359375, "rewards/margins": 10.5703125, "rewards/rejected": -9.546875, "step": 4964 }, { "epoch": 0.9381642968491662, "grad_norm": 2.76266721258132, "learning_rate": 1.1055377853569895e-07, "logits/chosen": 3.0546875, "logits/rejected": 3.0859375, "logps/chosen": -1184.0, "logps/rejected": -1257.0, "loss": 0.4838, "rewards/accuracies": 0.90625, "rewards/chosen": 0.859375, "rewards/margins": 6.515625, "rewards/rejected": -5.65234375, "step": 4965 }, { "epoch": 0.9383532523973735, "grad_norm": 4.675882858282455, "learning_rate": 1.1048996938272407e-07, "logits/chosen": 2.8046875, "logits/rejected": 2.958984375, "logps/chosen": -758.0, "logps/rejected": -1184.0, "loss": 0.6168, "rewards/accuracies": 0.71875, "rewards/chosen": 1.62744140625, "rewards/margins": 8.7109375, "rewards/rejected": -7.0869140625, "step": 4966 }, { "epoch": 0.9385422079455809, "grad_norm": 2.08292521955038, "learning_rate": 1.1042635143832101e-07, "logits/chosen": 3.265625, "logits/rejected": 3.0859375, "logps/chosen": -1031.0, "logps/rejected": -1101.0, "loss": 0.5849, "rewards/accuracies": 0.71875, "rewards/chosen": 1.72314453125, "rewards/margins": 5.59765625, "rewards/rejected": -3.875, "step": 4967 }, { "epoch": 0.938731163493788, "grad_norm": 2.153348518099182, "learning_rate": 1.1036292473016679e-07, "logits/chosen": 2.91015625, "logits/rejected": 2.49609375, "logps/chosen": -729.0, "logps/rejected": -852.0, "loss": 0.6435, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0029296875, "rewards/margins": 3.921875, "rewards/rejected": -3.921875, "step": 4968 }, { "epoch": 0.9389201190419953, "grad_norm": 2.6157138283496675, "learning_rate": 1.1029968928585502e-07, "logits/chosen": 3.244140625, "logits/rejected": 2.98828125, "logps/chosen": -933.5, "logps/rejected": -910.0, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": 1.3251953125, "rewards/margins": 5.61328125, "rewards/rejected": -4.28515625, "step": 4969 }, { "epoch": 0.9391090745902027, "grad_norm": 2.2862640623083066, "learning_rate": 1.1023664513289642e-07, "logits/chosen": 3.61328125, "logits/rejected": 3.388671875, "logps/chosen": -688.5, "logps/rejected": -761.0, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": 0.0478515625, "rewards/margins": 3.43359375, "rewards/rejected": -3.390625, "step": 4970 }, { "epoch": 0.93929803013841, "grad_norm": 1.6570673729033385, "learning_rate": 1.1017379229871823e-07, "logits/chosen": 2.326171875, "logits/rejected": 1.9580078125, "logps/chosen": -787.5, "logps/rejected": -1090.0, "loss": 0.5167, "rewards/accuracies": 0.84375, "rewards/chosen": 1.0234375, "rewards/margins": 5.654296875, "rewards/rejected": -4.625, "step": 4971 }, { "epoch": 0.9394869856866173, "grad_norm": 2.26622994049917, "learning_rate": 1.1011113081066455e-07, "logits/chosen": 2.828125, "logits/rejected": 1.97265625, "logps/chosen": -641.0, "logps/rejected": -702.0, "loss": 0.5471, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3394775390625, "rewards/margins": 5.5625, "rewards/rejected": -5.2265625, "step": 4972 }, { "epoch": 0.9396759412348245, "grad_norm": 2.42737567907895, "learning_rate": 1.1004866069599619e-07, "logits/chosen": 2.5625, "logits/rejected": 2.646484375, "logps/chosen": -795.0, "logps/rejected": -1091.0, "loss": 0.6112, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3916015625, "rewards/margins": 5.41015625, "rewards/rejected": -5.01953125, "step": 4973 }, { "epoch": 0.9398648967830318, "grad_norm": 1.2311475070168931, "learning_rate": 1.099863819818907e-07, "logits/chosen": 2.05859375, "logits/rejected": 2.10986328125, "logps/chosen": -915.0, "logps/rejected": -1464.0, "loss": 0.508, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2685546875, "rewards/margins": 7.640625, "rewards/rejected": -6.37890625, "step": 4974 }, { "epoch": 0.9400538523312391, "grad_norm": 2.402419187307311, "learning_rate": 1.0992429469544256e-07, "logits/chosen": 3.1171875, "logits/rejected": 3.1171875, "logps/chosen": -613.5, "logps/rejected": -1448.0, "loss": 0.5422, "rewards/accuracies": 0.875, "rewards/chosen": 0.611572265625, "rewards/margins": 9.25390625, "rewards/rejected": -8.65625, "step": 4975 }, { "epoch": 0.9402428078794464, "grad_norm": 1.8303124710648926, "learning_rate": 1.0986239886366261e-07, "logits/chosen": 3.087890625, "logits/rejected": 2.958984375, "logps/chosen": -699.0, "logps/rejected": -810.0, "loss": 0.5111, "rewards/accuracies": 0.78125, "rewards/chosen": 1.50732421875, "rewards/margins": 6.2265625, "rewards/rejected": -4.71875, "step": 4976 }, { "epoch": 0.9404317634276537, "grad_norm": 2.6627678274301063, "learning_rate": 1.0980069451347872e-07, "logits/chosen": 2.677734375, "logits/rejected": 2.177734375, "logps/chosen": -440.0, "logps/rejected": -475.5, "loss": 0.7851, "rewards/accuracies": 0.78125, "rewards/chosen": -0.835693359375, "rewards/margins": 3.078125, "rewards/rejected": -3.9140625, "step": 4977 }, { "epoch": 0.940620718975861, "grad_norm": 2.6457327445728676, "learning_rate": 1.097391816717353e-07, "logits/chosen": 2.65234375, "logits/rejected": 2.712890625, "logps/chosen": -729.0, "logps/rejected": -745.0, "loss": 0.5294, "rewards/accuracies": 0.78125, "rewards/chosen": 0.94677734375, "rewards/margins": 4.26953125, "rewards/rejected": -3.328125, "step": 4978 }, { "epoch": 0.9408096745240682, "grad_norm": 1.9922553460487398, "learning_rate": 1.0967786036519344e-07, "logits/chosen": 4.04296875, "logits/rejected": 3.359375, "logps/chosen": -1202.0, "logps/rejected": -1280.0, "loss": 0.4453, "rewards/accuracies": 0.875, "rewards/chosen": 2.078125, "rewards/margins": 8.1875, "rewards/rejected": -6.10546875, "step": 4979 }, { "epoch": 0.9409986300722755, "grad_norm": 2.1662361640501535, "learning_rate": 1.0961673062053102e-07, "logits/chosen": 3.2421875, "logits/rejected": 2.494140625, "logps/chosen": -806.0, "logps/rejected": -954.0, "loss": 0.4657, "rewards/accuracies": 0.8125, "rewards/chosen": 1.330078125, "rewards/margins": 6.09765625, "rewards/rejected": -4.763671875, "step": 4980 }, { "epoch": 0.9411875856204828, "grad_norm": 4.326306277259374, "learning_rate": 1.095557924643424e-07, "logits/chosen": 3.3125, "logits/rejected": 3.0, "logps/chosen": -609.0, "logps/rejected": -688.5, "loss": 0.5819, "rewards/accuracies": 0.84375, "rewards/chosen": 0.602783203125, "rewards/margins": 5.50390625, "rewards/rejected": -4.8984375, "step": 4981 }, { "epoch": 0.9413765411686901, "grad_norm": 1.897386211182688, "learning_rate": 1.0949504592313874e-07, "logits/chosen": 3.140625, "logits/rejected": 2.677734375, "logps/chosen": -965.0, "logps/rejected": -1102.0, "loss": 0.4343, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1806640625, "rewards/margins": 7.3671875, "rewards/rejected": -6.19921875, "step": 4982 }, { "epoch": 0.9415654967168974, "grad_norm": 2.340326326406505, "learning_rate": 1.0943449102334778e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.22265625, "logps/chosen": -711.5, "logps/rejected": -928.0, "loss": 0.5189, "rewards/accuracies": 0.96875, "rewards/chosen": 0.24609375, "rewards/margins": 6.36328125, "rewards/rejected": -6.109375, "step": 4983 }, { "epoch": 0.9417544522651047, "grad_norm": 2.078620936838183, "learning_rate": 1.0937412779131384e-07, "logits/chosen": 2.51171875, "logits/rejected": 2.123046875, "logps/chosen": -1252.5, "logps/rejected": -1540.0, "loss": 0.5062, "rewards/accuracies": 0.75, "rewards/chosen": 2.153076171875, "rewards/margins": 8.2109375, "rewards/rejected": -6.03125, "step": 4984 }, { "epoch": 0.9419434078133119, "grad_norm": 4.54638422025523, "learning_rate": 1.0931395625329806e-07, "logits/chosen": 3.40625, "logits/rejected": 3.21875, "logps/chosen": -643.0, "logps/rejected": -661.0, "loss": 0.5452, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1854248046875, "rewards/margins": 5.1171875, "rewards/rejected": -4.9296875, "step": 4985 }, { "epoch": 0.9421323633615192, "grad_norm": 1.6309951745517568, "learning_rate": 1.0925397643547785e-07, "logits/chosen": 3.2109375, "logits/rejected": 3.1396484375, "logps/chosen": -729.0, "logps/rejected": -743.0, "loss": 0.6311, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5361328125, "rewards/margins": 3.943359375, "rewards/rejected": -3.3984375, "step": 4986 }, { "epoch": 0.9423213189097265, "grad_norm": 2.4618175248361998, "learning_rate": 1.0919418836394748e-07, "logits/chosen": 4.0390625, "logits/rejected": 4.015625, "logps/chosen": -763.0, "logps/rejected": -887.5, "loss": 0.6147, "rewards/accuracies": 0.875, "rewards/chosen": 0.9169921875, "rewards/margins": 5.5078125, "rewards/rejected": -4.58984375, "step": 4987 }, { "epoch": 0.9425102744579338, "grad_norm": 1.9609794894466295, "learning_rate": 1.0913459206471769e-07, "logits/chosen": 2.953125, "logits/rejected": 2.6240234375, "logps/chosen": -535.5, "logps/rejected": -688.0, "loss": 0.5753, "rewards/accuracies": 0.8125, "rewards/chosen": 0.365234375, "rewards/margins": 4.47998046875, "rewards/rejected": -4.1171875, "step": 4988 }, { "epoch": 0.9426992300061411, "grad_norm": 2.8329762479353477, "learning_rate": 1.0907518756371586e-07, "logits/chosen": 3.390625, "logits/rejected": 3.2265625, "logps/chosen": -641.0, "logps/rejected": -883.0, "loss": 0.5217, "rewards/accuracies": 0.78125, "rewards/chosen": 1.025146484375, "rewards/margins": 5.2734375, "rewards/rejected": -4.25390625, "step": 4989 }, { "epoch": 0.9428881855543484, "grad_norm": 1.7090304757570176, "learning_rate": 1.0901597488678584e-07, "logits/chosen": 3.685546875, "logits/rejected": 3.26953125, "logps/chosen": -569.0, "logps/rejected": -829.0, "loss": 0.5103, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1298828125, "rewards/margins": 6.1484375, "rewards/rejected": -5.015625, "step": 4990 }, { "epoch": 0.9430771411025556, "grad_norm": 2.718108979652982, "learning_rate": 1.0895695405968808e-07, "logits/chosen": 3.0390625, "logits/rejected": 3.01953125, "logps/chosen": -648.5, "logps/rejected": -608.0, "loss": 0.6947, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17388916015625, "rewards/margins": 2.849609375, "rewards/rejected": -3.0234375, "step": 4991 }, { "epoch": 0.9432660966507629, "grad_norm": 2.367344430837618, "learning_rate": 1.0889812510809957e-07, "logits/chosen": 3.22265625, "logits/rejected": 2.3515625, "logps/chosen": -932.0, "logps/rejected": -744.5, "loss": 0.5595, "rewards/accuracies": 0.84375, "rewards/chosen": 0.10498046875, "rewards/margins": 4.82421875, "rewards/rejected": -4.71875, "step": 4992 }, { "epoch": 0.9434550521989702, "grad_norm": 2.732270522224137, "learning_rate": 1.0883948805761383e-07, "logits/chosen": 3.28515625, "logits/rejected": 3.22265625, "logps/chosen": -633.5, "logps/rejected": -1023.0, "loss": 0.7485, "rewards/accuracies": 0.75, "rewards/chosen": -0.41064453125, "rewards/margins": 2.130859375, "rewards/rejected": -2.544921875, "step": 4993 }, { "epoch": 0.9436440077471775, "grad_norm": 1.9034386704552895, "learning_rate": 1.0878104293374078e-07, "logits/chosen": 2.71484375, "logits/rejected": 3.19140625, "logps/chosen": -1072.0, "logps/rejected": -1784.0, "loss": 0.5791, "rewards/accuracies": 0.71875, "rewards/chosen": 0.332275390625, "rewards/margins": 7.90234375, "rewards/rejected": -7.58984375, "step": 4994 }, { "epoch": 0.9438329632953848, "grad_norm": 1.5999276174586243, "learning_rate": 1.0872278976190696e-07, "logits/chosen": 2.9130859375, "logits/rejected": 2.1689453125, "logps/chosen": -802.0, "logps/rejected": -11767.5, "loss": 0.5759, "rewards/accuracies": 0.875, "rewards/chosen": 1.064453125, "rewards/margins": -165.703125, "rewards/rejected": 166.33203125, "step": 4995 }, { "epoch": 0.944021918843592, "grad_norm": 3.2569424387600976, "learning_rate": 1.0866472856745546e-07, "logits/chosen": 3.318359375, "logits/rejected": 3.0859375, "logps/chosen": -760.0, "logps/rejected": -965.0, "loss": 0.6783, "rewards/accuracies": 0.71875, "rewards/chosen": 1.153564453125, "rewards/margins": 4.40234375, "rewards/rejected": -3.25, "step": 4996 }, { "epoch": 0.9442108743917993, "grad_norm": 3.7331288168701127, "learning_rate": 1.0860685937564572e-07, "logits/chosen": 2.513671875, "logits/rejected": 2.869140625, "logps/chosen": -745.0, "logps/rejected": -1792.5, "loss": 0.6059, "rewards/accuracies": 0.75, "rewards/chosen": 0.94140625, "rewards/margins": 5.58203125, "rewards/rejected": -4.64453125, "step": 4997 }, { "epoch": 0.9443998299400066, "grad_norm": 1.6498457407628413, "learning_rate": 1.0854918221165371e-07, "logits/chosen": 3.3046875, "logits/rejected": 2.8671875, "logps/chosen": -717.0, "logps/rejected": -797.5, "loss": 0.673, "rewards/accuracies": 0.65625, "rewards/chosen": 0.486328125, "rewards/margins": 3.546875, "rewards/rejected": -3.0576171875, "step": 4998 }, { "epoch": 0.9445887854882139, "grad_norm": 2.441166361175493, "learning_rate": 1.0849169710057184e-07, "logits/chosen": 3.208984375, "logits/rejected": 2.564453125, "logps/chosen": -684.0, "logps/rejected": -764.5, "loss": 0.5621, "rewards/accuracies": 0.8125, "rewards/chosen": 0.197265625, "rewards/margins": 4.890625, "rewards/rejected": -4.6953125, "step": 4999 }, { "epoch": 0.9447777410364212, "grad_norm": 4.538495307698657, "learning_rate": 1.0843440406740888e-07, "logits/chosen": 3.3203125, "logits/rejected": 2.9609375, "logps/chosen": -703.5, "logps/rejected": -606.5, "loss": 0.6105, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8248291015625, "rewards/margins": 4.73046875, "rewards/rejected": -3.90234375, "step": 5000 }, { "epoch": 0.9449666965846285, "grad_norm": 2.1080775149850925, "learning_rate": 1.083773031370903e-07, "logits/chosen": 2.7734375, "logits/rejected": 2.72265625, "logps/chosen": -611.5, "logps/rejected": -977.0, "loss": 0.8023, "rewards/accuracies": 0.75, "rewards/chosen": -0.2412109375, "rewards/margins": 3.98046875, "rewards/rejected": -4.2236328125, "step": 5001 }, { "epoch": 0.9451556521328357, "grad_norm": 3.2100592135283526, "learning_rate": 1.0832039433445769e-07, "logits/chosen": 1.9921875, "logits/rejected": 1.7978515625, "logps/chosen": -548.5, "logps/rejected": -14330.0, "loss": 0.5625, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2081298828125, "rewards/margins": 6.91015625, "rewards/rejected": -6.6953125, "step": 5002 }, { "epoch": 0.945344607681043, "grad_norm": 2.6804590495084715, "learning_rate": 1.0826367768426928e-07, "logits/chosen": 3.83984375, "logits/rejected": 2.974609375, "logps/chosen": -590.0, "logps/rejected": -542.0, "loss": 0.6124, "rewards/accuracies": 0.71875, "rewards/chosen": 0.98974609375, "rewards/margins": 4.515625, "rewards/rejected": -3.52734375, "step": 5003 }, { "epoch": 0.9455335632292503, "grad_norm": 3.2017355737503586, "learning_rate": 1.0820715321119954e-07, "logits/chosen": 2.884765625, "logits/rejected": 3.30859375, "logps/chosen": -758.0, "logps/rejected": -829.0, "loss": 0.5532, "rewards/accuracies": 0.84375, "rewards/chosen": 1.3115234375, "rewards/margins": 5.26953125, "rewards/rejected": -3.95703125, "step": 5004 }, { "epoch": 0.9457225187774576, "grad_norm": 2.139325272938338, "learning_rate": 1.0815082093983939e-07, "logits/chosen": 3.3828125, "logits/rejected": 2.615234375, "logps/chosen": -1259.5, "logps/rejected": -1194.0, "loss": 0.4656, "rewards/accuracies": 0.9375, "rewards/chosen": 2.08984375, "rewards/margins": 6.80859375, "rewards/rejected": -4.70703125, "step": 5005 }, { "epoch": 0.9459114743256649, "grad_norm": 2.2709049154484955, "learning_rate": 1.0809468089469629e-07, "logits/chosen": 3.94921875, "logits/rejected": 3.53125, "logps/chosen": -723.0, "logps/rejected": -782.5, "loss": 0.637, "rewards/accuracies": 0.875, "rewards/chosen": 0.51953125, "rewards/margins": 4.18359375, "rewards/rejected": -3.65234375, "step": 5006 }, { "epoch": 0.9461004298738722, "grad_norm": 2.53713967184887, "learning_rate": 1.0803873310019381e-07, "logits/chosen": 1.8388671875, "logits/rejected": 1.348388671875, "logps/chosen": -1351.0, "logps/rejected": -1046.0, "loss": 0.4754, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0888671875, "rewards/margins": 5.13671875, "rewards/rejected": -5.22265625, "step": 5007 }, { "epoch": 0.9462893854220794, "grad_norm": 2.2358982108556638, "learning_rate": 1.0798297758067199e-07, "logits/chosen": 3.119140625, "logits/rejected": 2.875, "logps/chosen": -513.5, "logps/rejected": -928.0, "loss": 0.545, "rewards/accuracies": 0.875, "rewards/chosen": 0.5361328125, "rewards/margins": 6.7421875, "rewards/rejected": -6.1875, "step": 5008 }, { "epoch": 0.9464783409702867, "grad_norm": 2.786495271905603, "learning_rate": 1.0792741436038732e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.322265625, "logps/chosen": -805.5, "logps/rejected": -631.5, "loss": 0.6253, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8836669921875, "rewards/margins": 4.1484375, "rewards/rejected": -3.26171875, "step": 5009 }, { "epoch": 0.946667296518494, "grad_norm": 2.4346193891141086, "learning_rate": 1.0787204346351251e-07, "logits/chosen": 2.44921875, "logits/rejected": 2.0078125, "logps/chosen": -601.0, "logps/rejected": -1065.0, "loss": 0.6297, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2646484375, "rewards/margins": 7.796875, "rewards/rejected": -7.515625, "step": 5010 }, { "epoch": 0.9468562520667013, "grad_norm": 3.1096768483230526, "learning_rate": 1.0781686491413666e-07, "logits/chosen": 2.3232421875, "logits/rejected": 2.347900390625, "logps/chosen": -808.0, "logps/rejected": -1132.0, "loss": 0.6434, "rewards/accuracies": 0.78125, "rewards/chosen": 0.75, "rewards/margins": 6.359375, "rewards/rejected": -5.6015625, "step": 5011 }, { "epoch": 0.9470452076149086, "grad_norm": 4.172437403752011, "learning_rate": 1.0776187873626509e-07, "logits/chosen": 2.91015625, "logits/rejected": 3.15234375, "logps/chosen": -439.5, "logps/rejected": -803.0, "loss": 0.64, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1736297607421875, "rewards/margins": 6.13671875, "rewards/rejected": -6.328125, "step": 5012 }, { "epoch": 0.9472341631631159, "grad_norm": 2.055375076045177, "learning_rate": 1.0770708495381956e-07, "logits/chosen": 3.1953125, "logits/rejected": 3.30859375, "logps/chosen": -525.0, "logps/rejected": -1587.5, "loss": 0.6955, "rewards/accuracies": 0.84375, "rewards/chosen": 0.235107421875, "rewards/margins": 6.421875, "rewards/rejected": -6.1953125, "step": 5013 }, { "epoch": 0.9474231187113231, "grad_norm": 2.7319226587699807, "learning_rate": 1.0765248359063805e-07, "logits/chosen": 2.740234375, "logits/rejected": 2.99609375, "logps/chosen": -838.0, "logps/rejected": -1711.5, "loss": 0.6228, "rewards/accuracies": 0.75, "rewards/chosen": 1.166015625, "rewards/margins": 7.48828125, "rewards/rejected": -6.33203125, "step": 5014 }, { "epoch": 0.9476120742595304, "grad_norm": 2.9525635032873185, "learning_rate": 1.0759807467047487e-07, "logits/chosen": 3.166015625, "logits/rejected": 3.01171875, "logps/chosen": -682.5, "logps/rejected": -764.0, "loss": 0.5285, "rewards/accuracies": 0.84375, "rewards/chosen": 1.33203125, "rewards/margins": 5.3515625, "rewards/rejected": -4.01171875, "step": 5015 }, { "epoch": 0.9478010298077377, "grad_norm": 3.1442619299603596, "learning_rate": 1.0754385821700058e-07, "logits/chosen": 2.771484375, "logits/rejected": 3.09375, "logps/chosen": -572.5, "logps/rejected": -958.0, "loss": 0.6451, "rewards/accuracies": 0.75, "rewards/chosen": 0.28125, "rewards/margins": 4.2421875, "rewards/rejected": -3.953125, "step": 5016 }, { "epoch": 0.947989985355945, "grad_norm": 2.9734231775649955, "learning_rate": 1.07489834253802e-07, "logits/chosen": 3.80859375, "logits/rejected": 3.9609375, "logps/chosen": -582.5, "logps/rejected": -645.0, "loss": 0.6641, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5321044921875, "rewards/margins": 3.568359375, "rewards/rejected": -3.0322265625, "step": 5017 }, { "epoch": 0.9481789409041523, "grad_norm": 2.4578077160634773, "learning_rate": 1.0743600280438222e-07, "logits/chosen": 2.37158203125, "logits/rejected": 1.95947265625, "logps/chosen": -659.5, "logps/rejected": -782.0, "loss": 0.4704, "rewards/accuracies": 0.875, "rewards/chosen": 0.9951171875, "rewards/margins": 5.84375, "rewards/rejected": -4.83984375, "step": 5018 }, { "epoch": 0.9483678964523596, "grad_norm": 2.73981500331787, "learning_rate": 1.0738236389216066e-07, "logits/chosen": 3.01953125, "logits/rejected": 2.81640625, "logps/chosen": -562.5, "logps/rejected": -781.0, "loss": 0.5685, "rewards/accuracies": 0.75, "rewards/chosen": 0.7230224609375, "rewards/margins": 4.390625, "rewards/rejected": -3.67578125, "step": 5019 }, { "epoch": 0.9485568520005668, "grad_norm": 2.4238753483975772, "learning_rate": 1.073289175404728e-07, "logits/chosen": 2.1044921875, "logits/rejected": 2.07421875, "logps/chosen": -675.0, "logps/rejected": -786.0, "loss": 0.5048, "rewards/accuracies": 0.90625, "rewards/chosen": 0.528564453125, "rewards/margins": 6.2109375, "rewards/rejected": -5.67578125, "step": 5020 }, { "epoch": 0.9487458075487741, "grad_norm": 2.5259516234097465, "learning_rate": 1.0727566377257045e-07, "logits/chosen": 1.94921875, "logits/rejected": 2.0703125, "logps/chosen": -503.5, "logps/rejected": -850.5, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": 0.580078125, "rewards/margins": 5.421875, "rewards/rejected": -4.84375, "step": 5021 }, { "epoch": 0.9489347630969814, "grad_norm": 4.798135847931553, "learning_rate": 1.072226026116217e-07, "logits/chosen": 2.515625, "logits/rejected": 2.89453125, "logps/chosen": -906.0, "logps/rejected": -771.0, "loss": 0.4975, "rewards/accuracies": 0.875, "rewards/chosen": 0.37451171875, "rewards/margins": 5.16796875, "rewards/rejected": -4.80078125, "step": 5022 }, { "epoch": 0.9491237186451887, "grad_norm": 3.6197069580866947, "learning_rate": 1.0716973408071067e-07, "logits/chosen": 2.931640625, "logits/rejected": 2.791015625, "logps/chosen": -1003.0, "logps/rejected": -1704.0, "loss": 0.4728, "rewards/accuracies": 0.875, "rewards/chosen": 0.689453125, "rewards/margins": 7.59375, "rewards/rejected": -6.88671875, "step": 5023 }, { "epoch": 0.949312674193396, "grad_norm": 3.292514320035349, "learning_rate": 1.0711705820283787e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.0966796875, "logps/chosen": -1145.0, "logps/rejected": -983.0, "loss": 0.5241, "rewards/accuracies": 0.8125, "rewards/chosen": 0.793701171875, "rewards/margins": 5.6171875, "rewards/rejected": -4.81640625, "step": 5024 }, { "epoch": 0.9495016297416032, "grad_norm": 2.8109895794534685, "learning_rate": 1.070645750009199e-07, "logits/chosen": 1.958984375, "logits/rejected": 1.3394775390625, "logps/chosen": -563.25, "logps/rejected": -553.0, "loss": 0.5534, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6630859375, "rewards/margins": 4.47265625, "rewards/rejected": -3.80859375, "step": 5025 }, { "epoch": 0.9496905852898105, "grad_norm": 2.223822611110257, "learning_rate": 1.0701228449778946e-07, "logits/chosen": 3.2890625, "logits/rejected": 2.83984375, "logps/chosen": -551.0, "logps/rejected": -862.5, "loss": 0.5747, "rewards/accuracies": 0.8125, "rewards/chosen": 0.537841796875, "rewards/margins": 6.6015625, "rewards/rejected": -6.0625, "step": 5026 }, { "epoch": 0.9498795408380178, "grad_norm": 2.08600467720253, "learning_rate": 1.069601867161957e-07, "logits/chosen": 3.69140625, "logits/rejected": 3.375, "logps/chosen": -515.0, "logps/rejected": -667.5, "loss": 0.6143, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2587890625, "rewards/margins": 4.37109375, "rewards/rejected": -4.125, "step": 5027 }, { "epoch": 0.9500684963862251, "grad_norm": 2.8500980335349624, "learning_rate": 1.0690828167880342e-07, "logits/chosen": 3.2265625, "logits/rejected": 2.59130859375, "logps/chosen": -635.0, "logps/rejected": -759.0, "loss": 0.4896, "rewards/accuracies": 0.9375, "rewards/chosen": 0.689208984375, "rewards/margins": 5.1484375, "rewards/rejected": -4.45703125, "step": 5028 }, { "epoch": 0.9502574519344325, "grad_norm": 2.6945872078412743, "learning_rate": 1.0685656940819415e-07, "logits/chosen": 2.626953125, "logits/rejected": 2.6328125, "logps/chosen": -757.0, "logps/rejected": -842.0, "loss": 0.6143, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4732666015625, "rewards/margins": 4.82421875, "rewards/rejected": -4.35546875, "step": 5029 }, { "epoch": 0.9504464074826398, "grad_norm": 2.045055778404522, "learning_rate": 1.0680504992686521e-07, "logits/chosen": 3.14453125, "logits/rejected": 2.791015625, "logps/chosen": -975.0, "logps/rejected": -744.0, "loss": 0.5522, "rewards/accuracies": 0.8125, "rewards/chosen": 1.244873046875, "rewards/margins": 5.58984375, "rewards/rejected": -4.34765625, "step": 5030 }, { "epoch": 0.950635363030847, "grad_norm": 2.920542245716153, "learning_rate": 1.0675372325723003e-07, "logits/chosen": 3.64453125, "logits/rejected": 3.556640625, "logps/chosen": -839.0, "logps/rejected": -932.0, "loss": 0.7086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43017578125, "rewards/margins": 4.50390625, "rewards/rejected": -4.06640625, "step": 5031 }, { "epoch": 0.9508243185790543, "grad_norm": 6.05991926309632, "learning_rate": 1.0670258942161836e-07, "logits/chosen": 2.72265625, "logits/rejected": 2.578125, "logps/chosen": -664.0, "logps/rejected": -5803.5, "loss": 0.6261, "rewards/accuracies": 0.75, "rewards/chosen": 0.20703125, "rewards/margins": -16.830078125, "rewards/rejected": 16.95703125, "step": 5032 }, { "epoch": 0.9510132741272616, "grad_norm": 2.9597707182213036, "learning_rate": 1.0665164844227583e-07, "logits/chosen": 2.642578125, "logits/rejected": 2.4560546875, "logps/chosen": -688.0, "logps/rejected": -766.0, "loss": 0.5865, "rewards/accuracies": 0.78125, "rewards/chosen": 0.302001953125, "rewards/margins": 4.921875, "rewards/rejected": -4.6328125, "step": 5033 }, { "epoch": 0.9512022296754689, "grad_norm": 4.56079298788761, "learning_rate": 1.0660090034136439e-07, "logits/chosen": 3.0625, "logits/rejected": 2.173828125, "logps/chosen": -765.0, "logps/rejected": -645.5, "loss": 0.4586, "rewards/accuracies": 0.96875, "rewards/chosen": 0.62158203125, "rewards/margins": 6.2265625, "rewards/rejected": -5.60546875, "step": 5034 }, { "epoch": 0.9513911852236762, "grad_norm": 1.443516798420587, "learning_rate": 1.065503451409619e-07, "logits/chosen": 2.21875, "logits/rejected": 1.9949951171875, "logps/chosen": -958.0, "logps/rejected": -890.0, "loss": 0.5863, "rewards/accuracies": 0.78125, "rewards/chosen": 1.54296875, "rewards/margins": 6.1484375, "rewards/rejected": -4.59765625, "step": 5035 }, { "epoch": 0.9515801407718835, "grad_norm": 3.092279303271009, "learning_rate": 1.064999828630624e-07, "logits/chosen": 2.693359375, "logits/rejected": 2.228515625, "logps/chosen": -652.5, "logps/rejected": -542.0, "loss": 0.7705, "rewards/accuracies": 0.59375, "rewards/chosen": -0.52685546875, "rewards/margins": 1.7958984375, "rewards/rejected": -2.322265625, "step": 5036 }, { "epoch": 0.9517690963200907, "grad_norm": 2.1053127448548428, "learning_rate": 1.06449813529576e-07, "logits/chosen": 3.0625, "logits/rejected": 2.14453125, "logps/chosen": -765.5, "logps/rejected": -839.0, "loss": 0.5763, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0830078125, "rewards/margins": 4.28125, "rewards/rejected": -4.19921875, "step": 5037 }, { "epoch": 0.951958051868298, "grad_norm": 3.175606631216071, "learning_rate": 1.0639983716232871e-07, "logits/chosen": 2.7890625, "logits/rejected": 3.162109375, "logps/chosen": -881.0, "logps/rejected": -1056.0, "loss": 0.5809, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2568359375, "rewards/margins": 4.9765625, "rewards/rejected": -3.72265625, "step": 5038 }, { "epoch": 0.9521470074165053, "grad_norm": 2.7485849292045406, "learning_rate": 1.0635005378306289e-07, "logits/chosen": 3.611328125, "logits/rejected": 3.0234375, "logps/chosen": -869.0, "logps/rejected": -886.0, "loss": 0.5982, "rewards/accuracies": 0.8125, "rewards/chosen": 0.625, "rewards/margins": 5.0546875, "rewards/rejected": -4.4375, "step": 5039 }, { "epoch": 0.9523359629647126, "grad_norm": 2.9112490251682166, "learning_rate": 1.0630046341343663e-07, "logits/chosen": 3.046875, "logits/rejected": 2.9375, "logps/chosen": -855.5, "logps/rejected": -1260.0, "loss": 0.7427, "rewards/accuracies": 0.75, "rewards/chosen": -0.17822265625, "rewards/margins": 8.869140625, "rewards/rejected": -9.044921875, "step": 5040 }, { "epoch": 0.9525249185129199, "grad_norm": 2.4824522731446823, "learning_rate": 1.0625106607502429e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.7890625, "logps/chosen": -728.0, "logps/rejected": -1421.0, "loss": 0.48, "rewards/accuracies": 0.875, "rewards/chosen": 0.43359375, "rewards/margins": 6.84375, "rewards/rejected": -6.40625, "step": 5041 }, { "epoch": 0.9527138740611272, "grad_norm": 1.226537174227158, "learning_rate": 1.0620186178931612e-07, "logits/chosen": 3.35546875, "logits/rejected": 2.5078125, "logps/chosen": -694.25, "logps/rejected": -682.5, "loss": 0.6206, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22705078125, "rewards/margins": 4.853515625, "rewards/rejected": -4.625, "step": 5042 }, { "epoch": 0.9529028296093344, "grad_norm": 2.081960100092094, "learning_rate": 1.061528505777184e-07, "logits/chosen": 3.22265625, "logits/rejected": 2.708984375, "logps/chosen": -745.5, "logps/rejected": -784.5, "loss": 0.5717, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4609375, "rewards/margins": 4.9453125, "rewards/rejected": -3.48828125, "step": 5043 }, { "epoch": 0.9530917851575417, "grad_norm": 1.6844849590963342, "learning_rate": 1.0610403246155337e-07, "logits/chosen": 3.515625, "logits/rejected": 3.06640625, "logps/chosen": -598.5, "logps/rejected": -617.0, "loss": 0.4701, "rewards/accuracies": 0.875, "rewards/chosen": 1.041015625, "rewards/margins": 5.75, "rewards/rejected": -4.7109375, "step": 5044 }, { "epoch": 0.953280740705749, "grad_norm": 2.4783667819818747, "learning_rate": 1.0605540746205945e-07, "logits/chosen": 2.63671875, "logits/rejected": 2.3740234375, "logps/chosen": -1171.0, "logps/rejected": -1542.0, "loss": 0.4463, "rewards/accuracies": 0.875, "rewards/chosen": 1.6845703125, "rewards/margins": 9.8828125, "rewards/rejected": -8.20703125, "step": 5045 }, { "epoch": 0.9534696962539563, "grad_norm": 3.5817268568272325, "learning_rate": 1.060069756003908e-07, "logits/chosen": 3.86328125, "logits/rejected": 3.46875, "logps/chosen": -1123.75, "logps/rejected": -14327.0, "loss": 0.5957, "rewards/accuracies": 0.8125, "rewards/chosen": 1.169921875, "rewards/margins": -171.2421875, "rewards/rejected": 171.82421875, "step": 5046 }, { "epoch": 0.9536586518021636, "grad_norm": 1.9946807679280987, "learning_rate": 1.0595873689761774e-07, "logits/chosen": 3.02734375, "logits/rejected": 2.439453125, "logps/chosen": -801.0, "logps/rejected": -732.0, "loss": 0.5966, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6533203125, "rewards/margins": 5.80859375, "rewards/rejected": -5.15234375, "step": 5047 }, { "epoch": 0.9538476073503708, "grad_norm": 2.521339003397087, "learning_rate": 1.0591069137472652e-07, "logits/chosen": 3.23828125, "logits/rejected": 3.10546875, "logps/chosen": -741.5, "logps/rejected": -779.0, "loss": 0.6646, "rewards/accuracies": 0.8125, "rewards/chosen": 0.212158203125, "rewards/margins": 4.34375, "rewards/rejected": -4.13671875, "step": 5048 }, { "epoch": 0.9540365628985781, "grad_norm": 2.173346325216758, "learning_rate": 1.0586283905261917e-07, "logits/chosen": 2.37890625, "logits/rejected": 2.5234375, "logps/chosen": -607.0, "logps/rejected": -517.0, "loss": 0.7068, "rewards/accuracies": 0.75, "rewards/chosen": -0.02911376953125, "rewards/margins": 2.94140625, "rewards/rejected": -2.96875, "step": 5049 }, { "epoch": 0.9542255184467854, "grad_norm": 1.9204673262237717, "learning_rate": 1.0581517995211403e-07, "logits/chosen": 2.76171875, "logits/rejected": 2.0, "logps/chosen": -1675.0, "logps/rejected": -584.0, "loss": 0.4943, "rewards/accuracies": 0.84375, "rewards/chosen": -0.45703125, "rewards/margins": 4.37060546875, "rewards/rejected": -4.8359375, "step": 5050 }, { "epoch": 0.9544144739949927, "grad_norm": 4.388500355176729, "learning_rate": 1.0576771409394495e-07, "logits/chosen": 3.03515625, "logits/rejected": 2.96875, "logps/chosen": -796.5, "logps/rejected": -782.0, "loss": 0.6067, "rewards/accuracies": 0.8125, "rewards/chosen": -0.94970703125, "rewards/margins": 4.662109375, "rewards/rejected": -5.6171875, "step": 5051 }, { "epoch": 0.9546034295432, "grad_norm": 2.0590432776878185, "learning_rate": 1.0572044149876206e-07, "logits/chosen": 3.33203125, "logits/rejected": 3.3046875, "logps/chosen": -594.0, "logps/rejected": -995.0, "loss": 0.4902, "rewards/accuracies": 0.90625, "rewards/chosen": 0.92431640625, "rewards/margins": 7.984375, "rewards/rejected": -7.0390625, "step": 5052 }, { "epoch": 0.9547923850914073, "grad_norm": 1.9857438995021401, "learning_rate": 1.0567336218713119e-07, "logits/chosen": 2.96875, "logits/rejected": 2.431640625, "logps/chosen": -846.0, "logps/rejected": -774.0, "loss": 0.479, "rewards/accuracies": 0.78125, "rewards/chosen": 0.603271484375, "rewards/margins": 5.8359375, "rewards/rejected": -5.234375, "step": 5053 }, { "epoch": 0.9549813406396145, "grad_norm": 2.9108597707704855, "learning_rate": 1.0562647617953418e-07, "logits/chosen": 2.515625, "logits/rejected": 2.494140625, "logps/chosen": -1153.0, "logps/rejected": -1822.0, "loss": 0.6009, "rewards/accuracies": 0.84375, "rewards/chosen": 0.85546875, "rewards/margins": 4.51025390625, "rewards/rejected": -3.666015625, "step": 5054 }, { "epoch": 0.9551702961878218, "grad_norm": 1.976027550712373, "learning_rate": 1.0557978349636887e-07, "logits/chosen": 3.12890625, "logits/rejected": 3.111328125, "logps/chosen": -644.5, "logps/rejected": -882.0, "loss": 0.4822, "rewards/accuracies": 0.78125, "rewards/chosen": 0.96875, "rewards/margins": 5.8125, "rewards/rejected": -4.8515625, "step": 5055 }, { "epoch": 0.9553592517360291, "grad_norm": 2.1349546605065117, "learning_rate": 1.0553328415794876e-07, "logits/chosen": 3.779296875, "logits/rejected": 3.287109375, "logps/chosen": -640.0, "logps/rejected": -690.5, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.65185546875, "rewards/margins": 8.02734375, "rewards/rejected": -7.34375, "step": 5056 }, { "epoch": 0.9555482072842364, "grad_norm": 4.671711782318451, "learning_rate": 1.0548697818450336e-07, "logits/chosen": 2.875, "logits/rejected": 2.6435546875, "logps/chosen": -954.0, "logps/rejected": -1549.0, "loss": 0.5331, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31964111328125, "rewards/margins": 6.8359375, "rewards/rejected": -6.515625, "step": 5057 }, { "epoch": 0.9557371628324437, "grad_norm": 1.653260970541865, "learning_rate": 1.0544086559617816e-07, "logits/chosen": 3.34375, "logits/rejected": 2.96484375, "logps/chosen": -1383.0, "logps/rejected": -2104.0, "loss": 0.4995, "rewards/accuracies": 0.8125, "rewards/chosen": 2.369140625, "rewards/margins": 11.3515625, "rewards/rejected": -8.9921875, "step": 5058 }, { "epoch": 0.955926118380651, "grad_norm": 2.9283921296718534, "learning_rate": 1.0539494641303426e-07, "logits/chosen": 2.982421875, "logits/rejected": 2.81640625, "logps/chosen": -881.5, "logps/rejected": -833.0, "loss": 0.5354, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4429931640625, "rewards/margins": 6.2265625, "rewards/rejected": -5.7734375, "step": 5059 }, { "epoch": 0.9561150739288582, "grad_norm": 3.6187476596187063, "learning_rate": 1.0534922065504895e-07, "logits/chosen": 3.083984375, "logits/rejected": 2.78125, "logps/chosen": -629.0, "logps/rejected": -644.0, "loss": 0.5597, "rewards/accuracies": 0.875, "rewards/chosen": 0.6865234375, "rewards/margins": 5.05078125, "rewards/rejected": -4.359375, "step": 5060 }, { "epoch": 0.9563040294770655, "grad_norm": 3.9019435841369687, "learning_rate": 1.0530368834211503e-07, "logits/chosen": 2.9375, "logits/rejected": 3.0703125, "logps/chosen": -669.5, "logps/rejected": -1367.5, "loss": 0.5194, "rewards/accuracies": 0.90625, "rewards/chosen": 1.05078125, "rewards/margins": 5.96875, "rewards/rejected": -4.91796875, "step": 5061 }, { "epoch": 0.9564929850252728, "grad_norm": 2.7450538529586823, "learning_rate": 1.0525834949404143e-07, "logits/chosen": 2.875, "logits/rejected": 2.95703125, "logps/chosen": -825.0, "logps/rejected": -1248.0, "loss": 0.3663, "rewards/accuracies": 0.96875, "rewards/chosen": 0.9130859375, "rewards/margins": 9.75, "rewards/rejected": -8.8203125, "step": 5062 }, { "epoch": 0.9566819405734801, "grad_norm": 2.6241589044647067, "learning_rate": 1.0521320413055269e-07, "logits/chosen": 3.13671875, "logits/rejected": 2.716796875, "logps/chosen": -524.0, "logps/rejected": -548.0, "loss": 0.6408, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09619140625, "rewards/margins": 4.30078125, "rewards/rejected": -4.39453125, "step": 5063 }, { "epoch": 0.9568708961216874, "grad_norm": 2.5142763762210274, "learning_rate": 1.0516825227128933e-07, "logits/chosen": 2.287109375, "logits/rejected": 1.78125, "logps/chosen": -606.0, "logps/rejected": -738.0, "loss": 0.5395, "rewards/accuracies": 0.875, "rewards/chosen": 0.4632568359375, "rewards/margins": 4.95703125, "rewards/rejected": -4.484375, "step": 5064 }, { "epoch": 0.9570598516698947, "grad_norm": 1.706896515304459, "learning_rate": 1.0512349393580765e-07, "logits/chosen": 3.26171875, "logits/rejected": 2.97265625, "logps/chosen": -1029.0, "logps/rejected": -1185.0, "loss": 0.5487, "rewards/accuracies": 0.8125, "rewards/chosen": 1.462890625, "rewards/margins": 6.6953125, "rewards/rejected": -5.21484375, "step": 5065 }, { "epoch": 0.9572488072181019, "grad_norm": 5.1227700668841125, "learning_rate": 1.0507892914357967e-07, "logits/chosen": 3.37109375, "logits/rejected": 2.62890625, "logps/chosen": -962.0, "logps/rejected": -897.5, "loss": 0.6641, "rewards/accuracies": 0.75, "rewards/chosen": 1.147705078125, "rewards/margins": 4.73046875, "rewards/rejected": -3.576171875, "step": 5066 }, { "epoch": 0.9574377627663092, "grad_norm": 2.3402560638576095, "learning_rate": 1.050345579139933e-07, "logits/chosen": 3.080078125, "logits/rejected": 3.025390625, "logps/chosen": -1147.0, "logps/rejected": -948.0, "loss": 0.6223, "rewards/accuracies": 0.84375, "rewards/chosen": 0.586669921875, "rewards/margins": 4.9140625, "rewards/rejected": -4.32421875, "step": 5067 }, { "epoch": 0.9576267183145165, "grad_norm": 1.9003560942960436, "learning_rate": 1.0499038026635228e-07, "logits/chosen": 2.771484375, "logits/rejected": 3.041015625, "logps/chosen": -691.0, "logps/rejected": -667.0, "loss": 0.6845, "rewards/accuracies": 0.78125, "rewards/chosen": 0.38330078125, "rewards/margins": 4.01171875, "rewards/rejected": -3.63671875, "step": 5068 }, { "epoch": 0.9578156738627238, "grad_norm": 3.6219115250934415, "learning_rate": 1.0494639621987599e-07, "logits/chosen": 3.296875, "logits/rejected": 2.431640625, "logps/chosen": -973.0, "logps/rejected": -849.5, "loss": 0.4747, "rewards/accuracies": 0.84375, "rewards/chosen": 1.2044677734375, "rewards/margins": 6.49609375, "rewards/rejected": -5.2890625, "step": 5069 }, { "epoch": 0.9580046294109311, "grad_norm": 3.8619644624848233, "learning_rate": 1.0490260579369964e-07, "logits/chosen": 3.4453125, "logits/rejected": 2.94140625, "logps/chosen": -996.0, "logps/rejected": -1124.0, "loss": 0.4602, "rewards/accuracies": 0.84375, "rewards/chosen": 0.708740234375, "rewards/margins": 8.203125, "rewards/rejected": -7.4921875, "step": 5070 }, { "epoch": 0.9581935849591383, "grad_norm": 6.976966527666543, "learning_rate": 1.0485900900687437e-07, "logits/chosen": 3.296875, "logits/rejected": 2.818359375, "logps/chosen": -654.0, "logps/rejected": -800.0, "loss": 0.5819, "rewards/accuracies": 0.8125, "rewards/chosen": 0.516357421875, "rewards/margins": 4.17578125, "rewards/rejected": -3.66015625, "step": 5071 }, { "epoch": 0.9583825405073456, "grad_norm": 2.1272256487884786, "learning_rate": 1.0481560587836675e-07, "logits/chosen": 2.89453125, "logits/rejected": 2.83984375, "logps/chosen": -963.0, "logps/rejected": -2125.0, "loss": 0.5968, "rewards/accuracies": 0.71875, "rewards/chosen": 0.677734375, "rewards/margins": 5.197265625, "rewards/rejected": -4.5078125, "step": 5072 }, { "epoch": 0.9585714960555529, "grad_norm": 3.4276765572914725, "learning_rate": 1.0477239642705941e-07, "logits/chosen": 2.751953125, "logits/rejected": 2.4443359375, "logps/chosen": -852.5, "logps/rejected": -1279.5, "loss": 0.5756, "rewards/accuracies": 0.875, "rewards/chosen": 1.0433349609375, "rewards/margins": 5.75, "rewards/rejected": -4.71484375, "step": 5073 }, { "epoch": 0.9587604516037602, "grad_norm": 1.4684404556926174, "learning_rate": 1.0472938067175058e-07, "logits/chosen": 3.8203125, "logits/rejected": 3.453125, "logps/chosen": -737.0, "logps/rejected": -1498.0, "loss": 0.6048, "rewards/accuracies": 0.84375, "rewards/chosen": 1.802734375, "rewards/margins": 9.125, "rewards/rejected": -7.328125, "step": 5074 }, { "epoch": 0.9589494071519675, "grad_norm": 2.2858024974606206, "learning_rate": 1.0468655863115416e-07, "logits/chosen": 2.482421875, "logits/rejected": 2.052734375, "logps/chosen": -687.0, "logps/rejected": -966.0, "loss": 0.5234, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21099853515625, "rewards/margins": 6.77734375, "rewards/rejected": -6.55078125, "step": 5075 }, { "epoch": 0.9591383627001748, "grad_norm": 4.775172725189114, "learning_rate": 1.0464393032389995e-07, "logits/chosen": 3.609375, "logits/rejected": 3.28125, "logps/chosen": -711.25, "logps/rejected": -991.0, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": 0.39892578125, "rewards/margins": 5.7265625, "rewards/rejected": -5.32421875, "step": 5076 }, { "epoch": 0.959327318248382, "grad_norm": 2.617086609437149, "learning_rate": 1.0460149576853334e-07, "logits/chosen": 2.583984375, "logits/rejected": 1.9736328125, "logps/chosen": -912.0, "logps/rejected": -701.0, "loss": 0.5568, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2734375, "rewards/margins": 5.4375, "rewards/rejected": -5.171875, "step": 5077 }, { "epoch": 0.9595162737965893, "grad_norm": 2.1979327111086633, "learning_rate": 1.0455925498351544e-07, "logits/chosen": 2.716796875, "logits/rejected": 2.462890625, "logps/chosen": -921.0, "logps/rejected": -1370.0, "loss": 0.522, "rewards/accuracies": 0.84375, "rewards/chosen": 0.615966796875, "rewards/margins": 7.109375, "rewards/rejected": -6.5, "step": 5078 }, { "epoch": 0.9597052293447966, "grad_norm": 3.6319614844485786, "learning_rate": 1.045172079872231e-07, "logits/chosen": 3.3046875, "logits/rejected": 3.251953125, "logps/chosen": -609.5, "logps/rejected": -1302.0, "loss": 0.6378, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0751953125, "rewards/margins": 5.41796875, "rewards/rejected": -5.34765625, "step": 5079 }, { "epoch": 0.9598941848930039, "grad_norm": 2.5306074066371593, "learning_rate": 1.044753547979488e-07, "logits/chosen": 2.57421875, "logits/rejected": 2.626953125, "logps/chosen": -898.0, "logps/rejected": -9082.0, "loss": 0.5608, "rewards/accuracies": 0.84375, "rewards/chosen": 1.5830078125, "rewards/margins": -81.71875, "rewards/rejected": 82.927734375, "step": 5080 }, { "epoch": 0.9600831404412112, "grad_norm": 2.426767391763187, "learning_rate": 1.0443369543390083e-07, "logits/chosen": 2.794921875, "logits/rejected": 2.72265625, "logps/chosen": -897.0, "logps/rejected": -1339.0, "loss": 0.7042, "rewards/accuracies": 0.71875, "rewards/chosen": 0.9580078125, "rewards/margins": 5.859375, "rewards/rejected": -4.904296875, "step": 5081 }, { "epoch": 0.9602720959894185, "grad_norm": 2.3866719204090066, "learning_rate": 1.0439222991320293e-07, "logits/chosen": 2.6328125, "logits/rejected": 2.767578125, "logps/chosen": -678.5, "logps/rejected": -2159.0, "loss": 0.5884, "rewards/accuracies": 0.78125, "rewards/chosen": 1.21728515625, "rewards/margins": 5.5703125, "rewards/rejected": -4.35546875, "step": 5082 }, { "epoch": 0.9604610515376257, "grad_norm": 2.6148725028508903, "learning_rate": 1.0435095825389482e-07, "logits/chosen": 2.732421875, "logits/rejected": 2.734375, "logps/chosen": -865.0, "logps/rejected": -1099.5, "loss": 0.5861, "rewards/accuracies": 0.78125, "rewards/chosen": 0.47509765625, "rewards/margins": 4.95703125, "rewards/rejected": -4.48046875, "step": 5083 }, { "epoch": 0.960650007085833, "grad_norm": 3.218441544861547, "learning_rate": 1.0430988047393156e-07, "logits/chosen": 3.44921875, "logits/rejected": 3.6328125, "logps/chosen": -635.5, "logps/rejected": -918.0, "loss": 0.7338, "rewards/accuracies": 0.65625, "rewards/chosen": 0.052734375, "rewards/margins": 4.61279296875, "rewards/rejected": -4.5703125, "step": 5084 }, { "epoch": 0.9608389626340403, "grad_norm": 3.356986466997689, "learning_rate": 1.0426899659118411e-07, "logits/chosen": 3.033203125, "logits/rejected": 2.57421875, "logps/chosen": -805.5, "logps/rejected": -775.5, "loss": 0.5996, "rewards/accuracies": 0.78125, "rewards/chosen": 0.206787109375, "rewards/margins": 4.421875, "rewards/rejected": -4.2265625, "step": 5085 }, { "epoch": 0.9610279181822476, "grad_norm": 2.0862818317939773, "learning_rate": 1.0422830662343892e-07, "logits/chosen": 3.1875, "logits/rejected": 3.13671875, "logps/chosen": -832.0, "logps/rejected": -1005.0, "loss": 0.4525, "rewards/accuracies": 0.875, "rewards/chosen": 1.05029296875, "rewards/margins": 6.03125, "rewards/rejected": -4.98046875, "step": 5086 }, { "epoch": 0.961216873730455, "grad_norm": 2.5251684110890427, "learning_rate": 1.0418781058839815e-07, "logits/chosen": 3.5859375, "logits/rejected": 3.076171875, "logps/chosen": -495.5, "logps/rejected": -608.0, "loss": 0.5579, "rewards/accuracies": 0.875, "rewards/chosen": 0.90576171875, "rewards/margins": 5.6328125, "rewards/rejected": -4.71484375, "step": 5087 }, { "epoch": 0.9614058292786622, "grad_norm": 4.315544784660427, "learning_rate": 1.0414750850367961e-07, "logits/chosen": 2.587890625, "logits/rejected": 2.205078125, "logps/chosen": -735.5, "logps/rejected": -988.0, "loss": 0.5855, "rewards/accuracies": 0.8125, "rewards/chosen": 0.85693359375, "rewards/margins": 7.361328125, "rewards/rejected": -6.494140625, "step": 5088 }, { "epoch": 0.9615947848268694, "grad_norm": 2.768542494172005, "learning_rate": 1.0410740038681663e-07, "logits/chosen": 3.40625, "logits/rejected": 3.607421875, "logps/chosen": -744.5, "logps/rejected": -1821.0, "loss": 0.593, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15087890625, "rewards/margins": 7.158203125, "rewards/rejected": -7.30859375, "step": 5089 }, { "epoch": 0.9617837403750767, "grad_norm": 1.9957869354909852, "learning_rate": 1.0406748625525832e-07, "logits/chosen": 2.94921875, "logits/rejected": 2.498046875, "logps/chosen": -711.0, "logps/rejected": -698.0, "loss": 0.5403, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6240234375, "rewards/margins": 5.328125, "rewards/rejected": -4.70703125, "step": 5090 }, { "epoch": 0.961972695923284, "grad_norm": 1.9912771530193734, "learning_rate": 1.0402776612636925e-07, "logits/chosen": 3.2109375, "logits/rejected": 3.484375, "logps/chosen": -767.0, "logps/rejected": -764.0, "loss": 0.5704, "rewards/accuracies": 0.875, "rewards/chosen": 1.3310546875, "rewards/margins": 4.56640625, "rewards/rejected": -3.234375, "step": 5091 }, { "epoch": 0.9621616514714914, "grad_norm": 3.057829574284063, "learning_rate": 1.0398824001742957e-07, "logits/chosen": 4.015625, "logits/rejected": 3.76171875, "logps/chosen": -683.25, "logps/rejected": -934.0, "loss": 0.5689, "rewards/accuracies": 0.78125, "rewards/chosen": 1.1044921875, "rewards/margins": 5.0078125, "rewards/rejected": -3.89404296875, "step": 5092 }, { "epoch": 0.9623506070196987, "grad_norm": 3.86675493841935, "learning_rate": 1.0394890794563521e-07, "logits/chosen": 3.3359375, "logits/rejected": 2.91015625, "logps/chosen": -916.0, "logps/rejected": -973.0, "loss": 0.6061, "rewards/accuracies": 0.78125, "rewards/chosen": 1.259765625, "rewards/margins": 5.48046875, "rewards/rejected": -4.23046875, "step": 5093 }, { "epoch": 0.9625395625679058, "grad_norm": 4.335438967929989, "learning_rate": 1.0390976992809746e-07, "logits/chosen": 2.6787109375, "logits/rejected": 2.693359375, "logps/chosen": -703.0, "logps/rejected": -765.0, "loss": 0.5441, "rewards/accuracies": 0.875, "rewards/chosen": 0.6953125, "rewards/margins": 6.015625, "rewards/rejected": -5.3203125, "step": 5094 }, { "epoch": 0.9627285181161132, "grad_norm": 2.953739533731251, "learning_rate": 1.038708259818434e-07, "logits/chosen": 3.53515625, "logits/rejected": 3.45703125, "logps/chosen": -887.0, "logps/rejected": -939.0, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3349609375, "rewards/margins": 5.1796875, "rewards/rejected": -3.84765625, "step": 5095 }, { "epoch": 0.9629174736643205, "grad_norm": 2.632537381532563, "learning_rate": 1.0383207612381542e-07, "logits/chosen": 3.59765625, "logits/rejected": 2.560546875, "logps/chosen": -720.0, "logps/rejected": -14067.0, "loss": 0.63, "rewards/accuracies": 0.84375, "rewards/chosen": 0.33056640625, "rewards/margins": -164.265625, "rewards/rejected": 164.5234375, "step": 5096 }, { "epoch": 0.9631064292125278, "grad_norm": 2.167065555474867, "learning_rate": 1.0379352037087176e-07, "logits/chosen": 2.986328125, "logits/rejected": 2.931640625, "logps/chosen": -986.0, "logps/rejected": -872.0, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": 1.052734375, "rewards/margins": 5.36328125, "rewards/rejected": -4.328125, "step": 5097 }, { "epoch": 0.9632953847607351, "grad_norm": 5.351099737781791, "learning_rate": 1.0375515873978604e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.3125, "logps/chosen": -786.0, "logps/rejected": -591.0, "loss": 0.6381, "rewards/accuracies": 0.75, "rewards/chosen": 0.1630859375, "rewards/margins": 3.64990234375, "rewards/rejected": -3.484375, "step": 5098 }, { "epoch": 0.9634843403089424, "grad_norm": 2.850603937804687, "learning_rate": 1.0371699124724744e-07, "logits/chosen": 3.26171875, "logits/rejected": 3.3359375, "logps/chosen": -1133.0, "logps/rejected": -1237.5, "loss": 0.5702, "rewards/accuracies": 0.75, "rewards/chosen": 1.625, "rewards/margins": 6.078125, "rewards/rejected": -4.453125, "step": 5099 }, { "epoch": 0.9636732958571496, "grad_norm": 2.258108301454123, "learning_rate": 1.0367901790986067e-07, "logits/chosen": 3.48046875, "logits/rejected": 3.255859375, "logps/chosen": -1201.0, "logps/rejected": -1228.0, "loss": 0.5234, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4652099609375, "rewards/margins": 6.52734375, "rewards/rejected": -5.06640625, "step": 5100 }, { "epoch": 0.9638622514053569, "grad_norm": 3.0899487360095432, "learning_rate": 1.0364123874414608e-07, "logits/chosen": 3.11328125, "logits/rejected": 2.771484375, "logps/chosen": -670.0, "logps/rejected": -769.0, "loss": 0.6558, "rewards/accuracies": 0.875, "rewards/chosen": 0.2159423828125, "rewards/margins": 4.1328125, "rewards/rejected": -3.91796875, "step": 5101 }, { "epoch": 0.9640512069535642, "grad_norm": 2.495649582385823, "learning_rate": 1.0360365376653949e-07, "logits/chosen": 3.46484375, "logits/rejected": 3.00390625, "logps/chosen": -886.0, "logps/rejected": -962.0, "loss": 0.6156, "rewards/accuracies": 0.8125, "rewards/chosen": 0.76708984375, "rewards/margins": 4.55859375, "rewards/rejected": -3.78125, "step": 5102 }, { "epoch": 0.9642401625017715, "grad_norm": 2.776720886141362, "learning_rate": 1.0356626299339209e-07, "logits/chosen": 2.4765625, "logits/rejected": 2.1962890625, "logps/chosen": -490.5, "logps/rejected": -743.0, "loss": 0.7073, "rewards/accuracies": 0.6875, "rewards/chosen": 0.84521484375, "rewards/margins": 2.9931640625, "rewards/rejected": -2.150390625, "step": 5103 }, { "epoch": 0.9644291180499788, "grad_norm": 3.277872577759987, "learning_rate": 1.0352906644097082e-07, "logits/chosen": 3.55859375, "logits/rejected": 3.22265625, "logps/chosen": -571.25, "logps/rejected": -1222.5, "loss": 0.5832, "rewards/accuracies": 0.75, "rewards/chosen": 1.042724609375, "rewards/margins": 6.7890625, "rewards/rejected": -5.7421875, "step": 5104 }, { "epoch": 0.9646180735981861, "grad_norm": 4.909392131401984, "learning_rate": 1.0349206412545804e-07, "logits/chosen": 3.34375, "logits/rejected": 3.2109375, "logps/chosen": -834.0, "logps/rejected": -1023.0, "loss": 0.6092, "rewards/accuracies": 0.84375, "rewards/chosen": 0.762939453125, "rewards/margins": 5.724609375, "rewards/rejected": -4.9609375, "step": 5105 }, { "epoch": 0.9648070291463933, "grad_norm": 2.5494469329520624, "learning_rate": 1.0345525606295155e-07, "logits/chosen": 3.03125, "logits/rejected": 3.06640625, "logps/chosen": -708.5, "logps/rejected": -944.0, "loss": 0.6116, "rewards/accuracies": 0.78125, "rewards/chosen": 0.94140625, "rewards/margins": 4.86328125, "rewards/rejected": -3.9296875, "step": 5106 }, { "epoch": 0.9649959846946006, "grad_norm": 3.044810256529342, "learning_rate": 1.0341864226946461e-07, "logits/chosen": 3.79296875, "logits/rejected": 3.43359375, "logps/chosen": -1037.0, "logps/rejected": -923.5, "loss": 0.7622, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2520751953125, "rewards/margins": 3.5908203125, "rewards/rejected": -3.341796875, "step": 5107 }, { "epoch": 0.9651849402428079, "grad_norm": 2.715742563091864, "learning_rate": 1.0338222276092608e-07, "logits/chosen": 3.97265625, "logits/rejected": 3.33203125, "logps/chosen": -671.0, "logps/rejected": -890.0, "loss": 0.5351, "rewards/accuracies": 0.875, "rewards/chosen": 0.88330078125, "rewards/margins": 6.9375, "rewards/rejected": -6.0390625, "step": 5108 }, { "epoch": 0.9653738957910152, "grad_norm": 2.302083004953508, "learning_rate": 1.033459975531803e-07, "logits/chosen": 3.25, "logits/rejected": 2.53125, "logps/chosen": -896.0, "logps/rejected": -715.0, "loss": 0.5204, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5234375, "rewards/margins": 6.0859375, "rewards/rejected": -4.5703125, "step": 5109 }, { "epoch": 0.9655628513392225, "grad_norm": 2.406001798021908, "learning_rate": 1.0330996666198695e-07, "logits/chosen": 2.705078125, "logits/rejected": 2.439453125, "logps/chosen": -1357.0, "logps/rejected": -917.0, "loss": 0.6872, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6552734375, "rewards/margins": 1.26171875, "rewards/rejected": -2.912109375, "step": 5110 }, { "epoch": 0.9657518068874298, "grad_norm": 2.8829598918972223, "learning_rate": 1.0327413010302133e-07, "logits/chosen": 2.98046875, "logits/rejected": 3.560546875, "logps/chosen": -1039.0, "logps/rejected": -1765.0, "loss": 0.4997, "rewards/accuracies": 0.78125, "rewards/chosen": 1.447265625, "rewards/margins": 7.9609375, "rewards/rejected": -6.515625, "step": 5111 }, { "epoch": 0.965940762435637, "grad_norm": 3.72589932636174, "learning_rate": 1.0323848789187398e-07, "logits/chosen": 3.322265625, "logits/rejected": 2.900390625, "logps/chosen": -672.5, "logps/rejected": -639.5, "loss": 0.6709, "rewards/accuracies": 0.6875, "rewards/chosen": 0.65869140625, "rewards/margins": 3.87109375, "rewards/rejected": -3.21484375, "step": 5112 }, { "epoch": 0.9661297179838443, "grad_norm": 2.160805185659282, "learning_rate": 1.0320304004405113e-07, "logits/chosen": 4.015625, "logits/rejected": 3.95703125, "logps/chosen": -497.5, "logps/rejected": -650.0, "loss": 0.673, "rewards/accuracies": 0.78125, "rewards/chosen": 1.010498046875, "rewards/margins": 3.580078125, "rewards/rejected": -2.569091796875, "step": 5113 }, { "epoch": 0.9663186735320516, "grad_norm": 2.5017660061456093, "learning_rate": 1.0316778657497427e-07, "logits/chosen": 4.17578125, "logits/rejected": 4.02734375, "logps/chosen": -852.0, "logps/rejected": -1224.0, "loss": 0.4778, "rewards/accuracies": 0.84375, "rewards/chosen": 0.68035888671875, "rewards/margins": 6.734375, "rewards/rejected": -6.05078125, "step": 5114 }, { "epoch": 0.9665076290802589, "grad_norm": 2.7782814128611815, "learning_rate": 1.0313272749998048e-07, "logits/chosen": 3.06640625, "logits/rejected": 2.51953125, "logps/chosen": -440.25, "logps/rejected": -552.5, "loss": 0.5712, "rewards/accuracies": 0.8125, "rewards/chosen": 0.814453125, "rewards/margins": 4.42578125, "rewards/rejected": -3.62109375, "step": 5115 }, { "epoch": 0.9666965846284662, "grad_norm": 3.8333340027054037, "learning_rate": 1.030978628343222e-07, "logits/chosen": 2.33203125, "logits/rejected": 2.443359375, "logps/chosen": -916.5, "logps/rejected": -1160.0, "loss": 0.646, "rewards/accuracies": 0.75, "rewards/chosen": 0.8916015625, "rewards/margins": 5.15234375, "rewards/rejected": -4.2578125, "step": 5116 }, { "epoch": 0.9668855401766734, "grad_norm": 1.723714935121296, "learning_rate": 1.0306319259316715e-07, "logits/chosen": 2.3134765625, "logits/rejected": 2.0205078125, "logps/chosen": -813.0, "logps/rejected": -767.0, "loss": 0.4593, "rewards/accuracies": 0.84375, "rewards/chosen": 1.513671875, "rewards/margins": 5.44140625, "rewards/rejected": -3.9375, "step": 5117 }, { "epoch": 0.9670744957248807, "grad_norm": 2.2284786704240496, "learning_rate": 1.0302871679159873e-07, "logits/chosen": 3.20703125, "logits/rejected": 3.13671875, "logps/chosen": -635.25, "logps/rejected": -674.5, "loss": 0.7024, "rewards/accuracies": 0.65625, "rewards/chosen": 0.4658203125, "rewards/margins": 3.859375, "rewards/rejected": -3.40234375, "step": 5118 }, { "epoch": 0.967263451273088, "grad_norm": 2.3300975474054737, "learning_rate": 1.0299443544461554e-07, "logits/chosen": 2.287109375, "logits/rejected": 1.849609375, "logps/chosen": -924.0, "logps/rejected": -935.5, "loss": 0.596, "rewards/accuracies": 0.78125, "rewards/chosen": 1.064453125, "rewards/margins": 5.205078125, "rewards/rejected": -4.1279296875, "step": 5119 }, { "epoch": 0.9674524068212953, "grad_norm": 1.8978930858608343, "learning_rate": 1.0296034856713171e-07, "logits/chosen": 2.30078125, "logits/rejected": 2.703125, "logps/chosen": -767.0, "logps/rejected": -1965.0, "loss": 0.5768, "rewards/accuracies": 0.78125, "rewards/chosen": 0.994140625, "rewards/margins": 9.90625, "rewards/rejected": -8.890625, "step": 5120 }, { "epoch": 0.9676413623695026, "grad_norm": 3.186927996384807, "learning_rate": 1.0292645617397668e-07, "logits/chosen": 3.0234375, "logits/rejected": 2.779296875, "logps/chosen": -848.5, "logps/rejected": -954.0, "loss": 0.553, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9219970703125, "rewards/margins": 5.59375, "rewards/rejected": -4.671875, "step": 5121 }, { "epoch": 0.9678303179177099, "grad_norm": 2.6570854768575223, "learning_rate": 1.0289275827989534e-07, "logits/chosen": 3.390625, "logits/rejected": 3.05859375, "logps/chosen": -916.0, "logps/rejected": -1554.0, "loss": 0.6353, "rewards/accuracies": 0.8125, "rewards/chosen": 0.138671875, "rewards/margins": 5.6171875, "rewards/rejected": -5.4609375, "step": 5122 }, { "epoch": 0.9680192734659171, "grad_norm": 2.016857958197376, "learning_rate": 1.0285925489954792e-07, "logits/chosen": 2.64453125, "logits/rejected": 2.40234375, "logps/chosen": -893.0, "logps/rejected": -917.0, "loss": 0.5023, "rewards/accuracies": 0.84375, "rewards/chosen": 1.30615234375, "rewards/margins": 5.9609375, "rewards/rejected": -4.6484375, "step": 5123 }, { "epoch": 0.9682082290141244, "grad_norm": 2.7914794871873125, "learning_rate": 1.0282594604751001e-07, "logits/chosen": 3.60546875, "logits/rejected": 3.87109375, "logps/chosen": -867.5, "logps/rejected": -1112.0, "loss": 0.6592, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8623046875, "rewards/margins": 5.35546875, "rewards/rejected": -4.4921875, "step": 5124 }, { "epoch": 0.9683971845623317, "grad_norm": 2.0335136610851916, "learning_rate": 1.0279283173827272e-07, "logits/chosen": 2.9375, "logits/rejected": 2.5859375, "logps/chosen": -750.5, "logps/rejected": -745.5, "loss": 0.5131, "rewards/accuracies": 0.78125, "rewards/chosen": 0.81829833984375, "rewards/margins": 4.7421875, "rewards/rejected": -3.91796875, "step": 5125 }, { "epoch": 0.968586140110539, "grad_norm": 3.7099901803294895, "learning_rate": 1.0275991198624225e-07, "logits/chosen": 2.587890625, "logits/rejected": 2.376953125, "logps/chosen": -879.0, "logps/rejected": -930.0, "loss": 0.4854, "rewards/accuracies": 0.90625, "rewards/chosen": 1.218017578125, "rewards/margins": 5.5546875, "rewards/rejected": -4.328125, "step": 5126 }, { "epoch": 0.9687750956587463, "grad_norm": 1.7165262027812676, "learning_rate": 1.027271868057405e-07, "logits/chosen": 3.25, "logits/rejected": 3.1015625, "logps/chosen": -571.5, "logps/rejected": -571.5, "loss": 0.6588, "rewards/accuracies": 0.65625, "rewards/chosen": 0.411376953125, "rewards/margins": 3.4296875, "rewards/rejected": -3.0234375, "step": 5127 }, { "epoch": 0.9689640512069536, "grad_norm": 1.9181485675008618, "learning_rate": 1.026946562110044e-07, "logits/chosen": 2.013671875, "logits/rejected": 2.142578125, "logps/chosen": -737.25, "logps/rejected": -1079.5, "loss": 0.5815, "rewards/accuracies": 0.84375, "rewards/chosen": 0.671875, "rewards/margins": 3.95703125, "rewards/rejected": -3.296875, "step": 5128 }, { "epoch": 0.9691530067551608, "grad_norm": 3.2207385581811105, "learning_rate": 1.0266232021618642e-07, "logits/chosen": 3.49609375, "logits/rejected": 2.916015625, "logps/chosen": -713.5, "logps/rejected": -678.5, "loss": 0.6388, "rewards/accuracies": 0.84375, "rewards/chosen": 0.528076171875, "rewards/margins": 3.486328125, "rewards/rejected": -2.94921875, "step": 5129 }, { "epoch": 0.9693419623033681, "grad_norm": 2.1374048205434084, "learning_rate": 1.0263017883535432e-07, "logits/chosen": 1.8701171875, "logits/rejected": 1.576416015625, "logps/chosen": -839.0, "logps/rejected": -788.5, "loss": 0.5599, "rewards/accuracies": 0.84375, "rewards/chosen": 1.4306640625, "rewards/margins": 4.58203125, "rewards/rejected": -3.16015625, "step": 5130 }, { "epoch": 0.9695309178515754, "grad_norm": 8.742494385076544, "learning_rate": 1.0259823208249111e-07, "logits/chosen": 2.6171875, "logits/rejected": 2.259765625, "logps/chosen": -1439.0, "logps/rejected": -1082.0, "loss": 0.548, "rewards/accuracies": 0.78125, "rewards/chosen": 0.5390625, "rewards/margins": 4.5, "rewards/rejected": -3.9501953125, "step": 5131 }, { "epoch": 0.9697198733997827, "grad_norm": 2.605196017426381, "learning_rate": 1.0256647997149533e-07, "logits/chosen": 3.26953125, "logits/rejected": 2.7197265625, "logps/chosen": -1652.0, "logps/rejected": -1068.0, "loss": 0.593, "rewards/accuracies": 0.75, "rewards/chosen": -3.0986328125, "rewards/margins": 2.17578125, "rewards/rejected": -5.2734375, "step": 5132 }, { "epoch": 0.96990882894799, "grad_norm": 1.9401646167943678, "learning_rate": 1.0253492251618061e-07, "logits/chosen": 2.3984375, "logits/rejected": 2.275390625, "logps/chosen": -1141.0, "logps/rejected": -1248.0, "loss": 0.449, "rewards/accuracies": 0.84375, "rewards/chosen": 1.59326171875, "rewards/margins": 6.40625, "rewards/rejected": -4.8203125, "step": 5133 }, { "epoch": 0.9700977844961973, "grad_norm": 1.9349447759925014, "learning_rate": 1.0250355973027609e-07, "logits/chosen": 2.126953125, "logits/rejected": 1.9921875, "logps/chosen": -676.0, "logps/rejected": -841.5, "loss": 0.6272, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7744140625, "rewards/margins": 4.21484375, "rewards/rejected": -3.4375, "step": 5134 }, { "epoch": 0.9702867400444045, "grad_norm": 1.5596341313653712, "learning_rate": 1.024723916274261e-07, "logits/chosen": 3.54296875, "logits/rejected": 3.203125, "logps/chosen": -641.25, "logps/rejected": -1202.5, "loss": 0.6126, "rewards/accuracies": 0.75, "rewards/chosen": 1.14208984375, "rewards/margins": 7.7578125, "rewards/rejected": -6.61328125, "step": 5135 }, { "epoch": 0.9704756955926118, "grad_norm": 2.3593254415189286, "learning_rate": 1.0244141822119023e-07, "logits/chosen": 3.658203125, "logits/rejected": 3.830078125, "logps/chosen": -861.0, "logps/rejected": -1904.0, "loss": 0.5198, "rewards/accuracies": 0.90625, "rewards/chosen": 1.616943359375, "rewards/margins": 12.28125, "rewards/rejected": -10.68359375, "step": 5136 }, { "epoch": 0.9706646511408191, "grad_norm": 2.1086310527227883, "learning_rate": 1.0241063952504353e-07, "logits/chosen": 3.53515625, "logits/rejected": 3.31640625, "logps/chosen": -1005.0, "logps/rejected": -1243.0, "loss": 0.5234, "rewards/accuracies": 0.84375, "rewards/chosen": 1.181396484375, "rewards/margins": 5.765625, "rewards/rejected": -4.5859375, "step": 5137 }, { "epoch": 0.9708536066890264, "grad_norm": 3.136949294708087, "learning_rate": 1.0238005555237625e-07, "logits/chosen": 3.1484375, "logits/rejected": 3.224609375, "logps/chosen": -860.0, "logps/rejected": -778.0, "loss": 0.5773, "rewards/accuracies": 0.78125, "rewards/chosen": 0.234375, "rewards/margins": 4.45703125, "rewards/rejected": -4.2265625, "step": 5138 }, { "epoch": 0.9710425622372337, "grad_norm": 2.3855289805711655, "learning_rate": 1.0234966631649389e-07, "logits/chosen": 3.41796875, "logits/rejected": 2.73828125, "logps/chosen": -775.0, "logps/rejected": -1684.0, "loss": 0.4047, "rewards/accuracies": 0.875, "rewards/chosen": 1.140625, "rewards/margins": 9.0625, "rewards/rejected": -7.90234375, "step": 5139 }, { "epoch": 0.9712315177854409, "grad_norm": 3.441532993791215, "learning_rate": 1.0231947183061723e-07, "logits/chosen": 3.86328125, "logits/rejected": 3.7890625, "logps/chosen": -688.5, "logps/rejected": -888.5, "loss": 0.628, "rewards/accuracies": 0.8125, "rewards/chosen": 1.78125, "rewards/margins": 4.791015625, "rewards/rejected": -3.009765625, "step": 5140 }, { "epoch": 0.9714204733336482, "grad_norm": 1.6571246222943399, "learning_rate": 1.0228947210788255e-07, "logits/chosen": 3.34375, "logits/rejected": 3.4296875, "logps/chosen": -859.5, "logps/rejected": -1054.5, "loss": 0.628, "rewards/accuracies": 0.6875, "rewards/chosen": 1.04229736328125, "rewards/margins": 6.11328125, "rewards/rejected": -5.0625, "step": 5141 }, { "epoch": 0.9716094288818555, "grad_norm": 3.1163901334101443, "learning_rate": 1.0225966716134093e-07, "logits/chosen": 3.91015625, "logits/rejected": 3.27734375, "logps/chosen": -767.0, "logps/rejected": -965.0, "loss": 0.583, "rewards/accuracies": 0.75, "rewards/chosen": 1.1396484375, "rewards/margins": 5.4453125, "rewards/rejected": -4.3046875, "step": 5142 }, { "epoch": 0.9717983844300628, "grad_norm": 2.048189393568859, "learning_rate": 1.0223005700395924e-07, "logits/chosen": 2.90234375, "logits/rejected": 2.89453125, "logps/chosen": -698.0, "logps/rejected": -1491.0, "loss": 0.7012, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6181640625, "rewards/margins": 5.3408203125, "rewards/rejected": -4.703125, "step": 5143 }, { "epoch": 0.9719873399782701, "grad_norm": 2.518030437240999, "learning_rate": 1.0220064164861926e-07, "logits/chosen": 3.53125, "logits/rejected": 3.06640625, "logps/chosen": -550.0, "logps/rejected": -551.0, "loss": 0.7193, "rewards/accuracies": 0.71875, "rewards/chosen": 0.560546875, "rewards/margins": 2.681640625, "rewards/rejected": -2.125, "step": 5144 }, { "epoch": 0.9721762955264774, "grad_norm": 2.1823658070357905, "learning_rate": 1.0217142110811807e-07, "logits/chosen": 2.25, "logits/rejected": 1.9189453125, "logps/chosen": -1094.0, "logps/rejected": -1187.0, "loss": 0.5478, "rewards/accuracies": 0.875, "rewards/chosen": 1.21484375, "rewards/margins": 5.30078125, "rewards/rejected": -4.08203125, "step": 5145 }, { "epoch": 0.9723652510746846, "grad_norm": 1.4126716318930848, "learning_rate": 1.0214239539516815e-07, "logits/chosen": 2.89453125, "logits/rejected": 1.8359375, "logps/chosen": -557.0, "logps/rejected": -876.0, "loss": 0.4772, "rewards/accuracies": 0.875, "rewards/chosen": 0.6678466796875, "rewards/margins": 5.859375, "rewards/rejected": -5.19140625, "step": 5146 }, { "epoch": 0.9725542066228919, "grad_norm": 1.7795651153676542, "learning_rate": 1.0211356452239703e-07, "logits/chosen": 3.193359375, "logits/rejected": 2.740234375, "logps/chosen": -974.5, "logps/rejected": -1056.5, "loss": 0.6108, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4468994140625, "rewards/margins": 5.12109375, "rewards/rejected": -3.6669921875, "step": 5147 }, { "epoch": 0.9727431621710992, "grad_norm": 2.2477313637898577, "learning_rate": 1.0208492850234768e-07, "logits/chosen": 3.314453125, "logits/rejected": 2.9169921875, "logps/chosen": -712.5, "logps/rejected": -798.0, "loss": 0.5893, "rewards/accuracies": 0.78125, "rewards/chosen": 0.971923828125, "rewards/margins": 6.9296875, "rewards/rejected": -5.9609375, "step": 5148 }, { "epoch": 0.9729321177193065, "grad_norm": 2.245070069089445, "learning_rate": 1.0205648734747805e-07, "logits/chosen": 2.7734375, "logits/rejected": 3.091796875, "logps/chosen": -500.5, "logps/rejected": -702.5, "loss": 0.6046, "rewards/accuracies": 0.875, "rewards/chosen": 0.5888671875, "rewards/margins": 3.802734375, "rewards/rejected": -3.2265625, "step": 5149 }, { "epoch": 0.9731210732675138, "grad_norm": 2.178473385472871, "learning_rate": 1.0202824107016148e-07, "logits/chosen": 3.3671875, "logits/rejected": 3.04296875, "logps/chosen": -1106.0, "logps/rejected": -1125.0, "loss": 0.4582, "rewards/accuracies": 0.84375, "rewards/chosen": 1.44091796875, "rewards/margins": 6.65625, "rewards/rejected": -5.21875, "step": 5150 }, { "epoch": 0.9733100288157212, "grad_norm": 2.2335202529837046, "learning_rate": 1.0200018968268661e-07, "logits/chosen": 2.7109375, "logits/rejected": 2.40234375, "logps/chosen": -628.5, "logps/rejected": -798.0, "loss": 0.5668, "rewards/accuracies": 0.875, "rewards/chosen": 0.2109375, "rewards/margins": 5.8671875, "rewards/rejected": -5.640625, "step": 5151 }, { "epoch": 0.9734989843639283, "grad_norm": 1.9001093020661157, "learning_rate": 1.01972333197257e-07, "logits/chosen": 2.748046875, "logits/rejected": 2.86328125, "logps/chosen": -590.5, "logps/rejected": -1637.0, "loss": 0.6534, "rewards/accuracies": 0.78125, "rewards/chosen": 0.4949951171875, "rewards/margins": 6.0986328125, "rewards/rejected": -5.5966796875, "step": 5152 }, { "epoch": 0.9736879399121356, "grad_norm": 2.909747683541471, "learning_rate": 1.0194467162599177e-07, "logits/chosen": 3.3515625, "logits/rejected": 2.865234375, "logps/chosen": -632.0, "logps/rejected": -617.0, "loss": 0.497, "rewards/accuracies": 0.78125, "rewards/chosen": 0.341064453125, "rewards/margins": 4.5078125, "rewards/rejected": -4.171875, "step": 5153 }, { "epoch": 0.973876895460343, "grad_norm": 1.951763119915374, "learning_rate": 1.0191720498092496e-07, "logits/chosen": 2.91796875, "logits/rejected": 2.759765625, "logps/chosen": -650.0, "logps/rejected": -729.5, "loss": 0.4964, "rewards/accuracies": 0.84375, "rewards/chosen": 1.150390625, "rewards/margins": 7.15625, "rewards/rejected": -6.0, "step": 5154 }, { "epoch": 0.9740658510085503, "grad_norm": 3.0237121124667015, "learning_rate": 1.0188993327400591e-07, "logits/chosen": 3.044921875, "logits/rejected": 2.908203125, "logps/chosen": -832.0, "logps/rejected": -785.0, "loss": 0.6029, "rewards/accuracies": 0.84375, "rewards/chosen": 0.7666015625, "rewards/margins": 5.5390625, "rewards/rejected": -4.77734375, "step": 5155 }, { "epoch": 0.9742548065567576, "grad_norm": 2.1647055903210255, "learning_rate": 1.0186285651709925e-07, "logits/chosen": 4.12890625, "logits/rejected": 3.82421875, "logps/chosen": -594.5, "logps/rejected": -1301.5, "loss": 0.4535, "rewards/accuracies": 0.90625, "rewards/chosen": 1.044921875, "rewards/margins": 10.21875, "rewards/rejected": -9.17578125, "step": 5156 }, { "epoch": 0.9744437621049649, "grad_norm": 2.8473793253132205, "learning_rate": 1.018359747219846e-07, "logits/chosen": 1.47509765625, "logits/rejected": 1.474609375, "logps/chosen": -626.5, "logps/rejected": -731.0, "loss": 0.5941, "rewards/accuracies": 0.8125, "rewards/chosen": 0.608642578125, "rewards/margins": 3.80859375, "rewards/rejected": -3.203125, "step": 5157 }, { "epoch": 0.974632717653172, "grad_norm": 2.062577027810678, "learning_rate": 1.0180928790035696e-07, "logits/chosen": 3.5546875, "logits/rejected": 3.609375, "logps/chosen": -912.0, "logps/rejected": -1071.0, "loss": 0.5171, "rewards/accuracies": 0.84375, "rewards/chosen": 1.77978515625, "rewards/margins": 7.21875, "rewards/rejected": -5.4375, "step": 5158 }, { "epoch": 0.9748216732013794, "grad_norm": 2.054305359316829, "learning_rate": 1.0178279606382633e-07, "logits/chosen": 3.1328125, "logits/rejected": 3.33984375, "logps/chosen": -777.5, "logps/rejected": -1315.5, "loss": 0.6643, "rewards/accuracies": 0.65625, "rewards/chosen": 1.5048828125, "rewards/margins": 4.4879150390625, "rewards/rejected": -2.96875, "step": 5159 }, { "epoch": 0.9750106287495867, "grad_norm": 2.519722162552017, "learning_rate": 1.0175649922391807e-07, "logits/chosen": 2.02734375, "logits/rejected": 1.900390625, "logps/chosen": -778.0, "logps/rejected": -841.0, "loss": 0.6371, "rewards/accuracies": 0.78125, "rewards/chosen": 0.439300537109375, "rewards/margins": 4.19921875, "rewards/rejected": -3.7578125, "step": 5160 }, { "epoch": 0.975199584297794, "grad_norm": 1.9727034843987898, "learning_rate": 1.0173039739207254e-07, "logits/chosen": 3.58984375, "logits/rejected": 2.890625, "logps/chosen": -654.0, "logps/rejected": -797.0, "loss": 0.6092, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0517578125, "rewards/margins": 4.125, "rewards/rejected": -4.08203125, "step": 5161 }, { "epoch": 0.9753885398460013, "grad_norm": 2.488810390966414, "learning_rate": 1.0170449057964533e-07, "logits/chosen": 3.2578125, "logits/rejected": 2.68359375, "logps/chosen": -1118.0, "logps/rejected": -1003.0, "loss": 0.537, "rewards/accuracies": 0.875, "rewards/chosen": 1.66015625, "rewards/margins": 5.828125, "rewards/rejected": -4.16796875, "step": 5162 }, { "epoch": 0.9755774953942085, "grad_norm": 4.015169492580454, "learning_rate": 1.0167877879790712e-07, "logits/chosen": 3.8046875, "logits/rejected": 3.53515625, "logps/chosen": -705.0, "logps/rejected": -1363.0, "loss": 0.544, "rewards/accuracies": 0.75, "rewards/chosen": 1.2978515625, "rewards/margins": 5.05859375, "rewards/rejected": -3.765625, "step": 5163 }, { "epoch": 0.9757664509424158, "grad_norm": 2.5219096765741216, "learning_rate": 1.0165326205804395e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.0107421875, "logps/chosen": -1201.0, "logps/rejected": -1674.0, "loss": 0.5092, "rewards/accuracies": 0.8125, "rewards/chosen": 1.49853515625, "rewards/margins": 7.9609375, "rewards/rejected": -6.4609375, "step": 5164 }, { "epoch": 0.9759554064906231, "grad_norm": 2.7426236971465934, "learning_rate": 1.0162794037115677e-07, "logits/chosen": 3.328125, "logits/rejected": 3.171875, "logps/chosen": -876.5, "logps/rejected": -1118.5, "loss": 0.565, "rewards/accuracies": 0.84375, "rewards/chosen": 1.094970703125, "rewards/margins": 5.0234375, "rewards/rejected": -3.92578125, "step": 5165 }, { "epoch": 0.9761443620388304, "grad_norm": 1.8430236338371684, "learning_rate": 1.0160281374826178e-07, "logits/chosen": 3.49609375, "logits/rejected": 3.2890625, "logps/chosen": -875.0, "logps/rejected": -947.0, "loss": 0.6361, "rewards/accuracies": 0.78125, "rewards/chosen": 0.454345703125, "rewards/margins": 4.96484375, "rewards/rejected": -4.5, "step": 5166 }, { "epoch": 0.9763333175870377, "grad_norm": 2.4430069452397065, "learning_rate": 1.0157788220029026e-07, "logits/chosen": 2.736328125, "logits/rejected": 2.748046875, "logps/chosen": -925.5, "logps/rejected": -1450.0, "loss": 0.5194, "rewards/accuracies": 0.84375, "rewards/chosen": 1.35009765625, "rewards/margins": 9.4609375, "rewards/rejected": -8.0859375, "step": 5167 }, { "epoch": 0.976522273135245, "grad_norm": 2.618274136095409, "learning_rate": 1.0155314573808874e-07, "logits/chosen": 2.7265625, "logits/rejected": 2.5263671875, "logps/chosen": -849.0, "logps/rejected": -1734.0, "loss": 0.6119, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3408203125, "rewards/margins": 10.8359375, "rewards/rejected": -10.44140625, "step": 5168 }, { "epoch": 0.9767112286834522, "grad_norm": 2.4373697067687026, "learning_rate": 1.0152860437241878e-07, "logits/chosen": 2.86328125, "logits/rejected": 2.91015625, "logps/chosen": -546.0, "logps/rejected": -1324.5, "loss": 0.7773, "rewards/accuracies": 0.78125, "rewards/chosen": -0.138671875, "rewards/margins": 4.462890625, "rewards/rejected": -4.599609375, "step": 5169 }, { "epoch": 0.9769001842316595, "grad_norm": 4.770593057459593, "learning_rate": 1.0150425811395708e-07, "logits/chosen": 2.400390625, "logits/rejected": 2.380859375, "logps/chosen": -1020.0, "logps/rejected": -801.0, "loss": 0.558, "rewards/accuracies": 0.8125, "rewards/chosen": 0.74951171875, "rewards/margins": 4.640625, "rewards/rejected": -3.888671875, "step": 5170 }, { "epoch": 0.9770891397798668, "grad_norm": 2.6974746711768236, "learning_rate": 1.0148010697329543e-07, "logits/chosen": 2.78125, "logits/rejected": 2.69140625, "logps/chosen": -531.5, "logps/rejected": -650.0, "loss": 0.5174, "rewards/accuracies": 0.875, "rewards/chosen": 1.02880859375, "rewards/margins": 4.8125, "rewards/rejected": -3.77734375, "step": 5171 }, { "epoch": 0.9772780953280741, "grad_norm": 2.0165633050558633, "learning_rate": 1.014561509609408e-07, "logits/chosen": 2.87890625, "logits/rejected": 3.0546875, "logps/chosen": -1033.0, "logps/rejected": -1058.0, "loss": 0.5644, "rewards/accuracies": 0.71875, "rewards/chosen": 1.2900390625, "rewards/margins": 6.26953125, "rewards/rejected": -4.98828125, "step": 5172 }, { "epoch": 0.9774670508762814, "grad_norm": 2.640236986229096, "learning_rate": 1.0143239008731519e-07, "logits/chosen": 3.4296875, "logits/rejected": 3.3828125, "logps/chosen": -772.5, "logps/rejected": -2235.0, "loss": 0.5615, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3818359375, "rewards/margins": 11.4765625, "rewards/rejected": -11.0859375, "step": 5173 }, { "epoch": 0.9776560064244887, "grad_norm": 1.7685274987373296, "learning_rate": 1.0140882436275589e-07, "logits/chosen": 2.8125, "logits/rejected": 2.51171875, "logps/chosen": -609.75, "logps/rejected": -616.5, "loss": 0.5368, "rewards/accuracies": 0.84375, "rewards/chosen": 1.259765625, "rewards/margins": 4.90625, "rewards/rejected": -3.640625, "step": 5174 }, { "epoch": 0.9778449619726959, "grad_norm": 3.8519070056181377, "learning_rate": 1.01385453797515e-07, "logits/chosen": 3.8203125, "logits/rejected": 3.109375, "logps/chosen": -1115.5, "logps/rejected": -1015.0, "loss": 0.5429, "rewards/accuracies": 0.8125, "rewards/chosen": 1.62109375, "rewards/margins": 5.50390625, "rewards/rejected": -3.87890625, "step": 5175 }, { "epoch": 0.9780339175209032, "grad_norm": 3.317522046381357, "learning_rate": 1.0136227840175998e-07, "logits/chosen": 2.197265625, "logits/rejected": 2.08203125, "logps/chosen": -1326.0, "logps/rejected": -1235.0, "loss": 0.5237, "rewards/accuracies": 0.78125, "rewards/chosen": 1.6787109375, "rewards/margins": 6.171875, "rewards/rejected": -4.5, "step": 5176 }, { "epoch": 0.9782228730691105, "grad_norm": 6.7142705351561185, "learning_rate": 1.013392981855732e-07, "logits/chosen": 2.8828125, "logits/rejected": 2.6328125, "logps/chosen": -827.0, "logps/rejected": -854.0, "loss": 0.5563, "rewards/accuracies": 0.8125, "rewards/chosen": 1.279052734375, "rewards/margins": 4.51953125, "rewards/rejected": -3.240234375, "step": 5177 }, { "epoch": 0.9784118286173178, "grad_norm": 1.9106930509291697, "learning_rate": 1.013165131589522e-07, "logits/chosen": 2.42578125, "logits/rejected": 2.154296875, "logps/chosen": -942.5, "logps/rejected": -733.0, "loss": 0.5653, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04150390625, "rewards/margins": 3.9765625, "rewards/rejected": -3.93359375, "step": 5178 }, { "epoch": 0.9786007841655251, "grad_norm": 2.057714176877308, "learning_rate": 1.0129392333180969e-07, "logits/chosen": 2.1015625, "logits/rejected": 2.494140625, "logps/chosen": -800.0, "logps/rejected": -706.0, "loss": 0.4983, "rewards/accuracies": 0.84375, "rewards/chosen": 0.9400634765625, "rewards/margins": 5.4609375, "rewards/rejected": -4.515625, "step": 5179 }, { "epoch": 0.9787897397137324, "grad_norm": 6.651029596901416, "learning_rate": 1.0127152871397318e-07, "logits/chosen": 2.8359375, "logits/rejected": 2.4951171875, "logps/chosen": -876.0, "logps/rejected": -1183.0, "loss": 0.6089, "rewards/accuracies": 0.78125, "rewards/chosen": 0.29931640625, "rewards/margins": 4.9921875, "rewards/rejected": -4.6953125, "step": 5180 }, { "epoch": 0.9789786952619396, "grad_norm": 1.4282771266379568, "learning_rate": 1.012493293151856e-07, "logits/chosen": 2.83984375, "logits/rejected": 2.53515625, "logps/chosen": -726.5, "logps/rejected": -808.0, "loss": 0.4072, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1376953125, "rewards/margins": 6.875, "rewards/rejected": -5.75, "step": 5181 }, { "epoch": 0.9791676508101469, "grad_norm": 1.6134455814588489, "learning_rate": 1.0122732514510468e-07, "logits/chosen": 3.421875, "logits/rejected": 3.306640625, "logps/chosen": -674.0, "logps/rejected": -1065.0, "loss": 0.606, "rewards/accuracies": 0.84375, "rewards/chosen": 0.643798828125, "rewards/margins": 4.1796875, "rewards/rejected": -3.53515625, "step": 5182 }, { "epoch": 0.9793566063583542, "grad_norm": 3.7060710002555983, "learning_rate": 1.012055162133034e-07, "logits/chosen": 2.640625, "logits/rejected": 2.431640625, "logps/chosen": -656.0, "logps/rejected": -722.0, "loss": 0.7061, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8544921875, "rewards/margins": 4.1484375, "rewards/rejected": -5.0, "step": 5183 }, { "epoch": 0.9795455619065615, "grad_norm": 2.7072194908008727, "learning_rate": 1.0118390252926963e-07, "logits/chosen": 3.8359375, "logits/rejected": 4.171875, "logps/chosen": -615.5, "logps/rejected": -1106.0, "loss": 0.5382, "rewards/accuracies": 0.875, "rewards/chosen": 0.439453125, "rewards/margins": 5.6171875, "rewards/rejected": -5.18359375, "step": 5184 }, { "epoch": 0.9797345174547688, "grad_norm": 1.730208417894046, "learning_rate": 1.0116248410240648e-07, "logits/chosen": 1.9384765625, "logits/rejected": 1.42236328125, "logps/chosen": -838.0, "logps/rejected": -812.0, "loss": 0.5447, "rewards/accuracies": 0.8125, "rewards/chosen": 0.7998046875, "rewards/margins": 4.97265625, "rewards/rejected": -4.169921875, "step": 5185 }, { "epoch": 0.979923473002976, "grad_norm": 3.3450761020223108, "learning_rate": 1.0114126094203196e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.59765625, "logps/chosen": -780.0, "logps/rejected": -797.0, "loss": 0.5943, "rewards/accuracies": 0.8125, "rewards/chosen": 0.53466796875, "rewards/margins": 4.6640625, "rewards/rejected": -4.12109375, "step": 5186 }, { "epoch": 0.9801124285511833, "grad_norm": 3.8442600124318385, "learning_rate": 1.0112023305737924e-07, "logits/chosen": 3.171875, "logits/rejected": 2.52734375, "logps/chosen": -760.5, "logps/rejected": -770.5, "loss": 0.4352, "rewards/accuracies": 0.875, "rewards/chosen": 0.99365234375, "rewards/margins": 6.53125, "rewards/rejected": -5.53125, "step": 5187 }, { "epoch": 0.9803013840993906, "grad_norm": 2.3222509181119584, "learning_rate": 1.0109940045759649e-07, "logits/chosen": 2.826171875, "logits/rejected": 2.720703125, "logps/chosen": -672.0, "logps/rejected": -742.0, "loss": 0.5408, "rewards/accuracies": 0.75, "rewards/chosen": 1.0087890625, "rewards/margins": 5.5390625, "rewards/rejected": -4.52734375, "step": 5188 }, { "epoch": 0.9804903396475979, "grad_norm": 2.664574731834868, "learning_rate": 1.0107876315174687e-07, "logits/chosen": 2.03515625, "logits/rejected": 1.8271484375, "logps/chosen": -983.5, "logps/rejected": -1232.0, "loss": 0.5424, "rewards/accuracies": 0.8125, "rewards/chosen": 0.646484375, "rewards/margins": 6.32421875, "rewards/rejected": -5.66015625, "step": 5189 }, { "epoch": 0.9806792951958052, "grad_norm": 3.786056890301982, "learning_rate": 1.0105832114880863e-07, "logits/chosen": 2.880859375, "logits/rejected": 2.3271484375, "logps/chosen": -502.25, "logps/rejected": -1164.0, "loss": 0.5724, "rewards/accuracies": 0.875, "rewards/chosen": 0.24658203125, "rewards/margins": 6.33203125, "rewards/rejected": -6.078125, "step": 5190 }, { "epoch": 0.9808682507440125, "grad_norm": 2.3463288852654087, "learning_rate": 1.0103807445767507e-07, "logits/chosen": 2.78125, "logits/rejected": 2.54296875, "logps/chosen": -1030.5, "logps/rejected": -1026.0, "loss": 0.5171, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40234375, "rewards/margins": 5.84765625, "rewards/rejected": -5.4453125, "step": 5191 }, { "epoch": 0.9810572062922197, "grad_norm": 2.699681026980905, "learning_rate": 1.0101802308715457e-07, "logits/chosen": 3.306640625, "logits/rejected": 2.97265625, "logps/chosen": -878.5, "logps/rejected": -1416.0, "loss": 0.562, "rewards/accuracies": 0.75, "rewards/chosen": 0.498046875, "rewards/margins": 12.04296875, "rewards/rejected": -11.515625, "step": 5192 }, { "epoch": 0.981246161840427, "grad_norm": 2.503182062148536, "learning_rate": 1.0099816704597034e-07, "logits/chosen": 3.28515625, "logits/rejected": 2.69140625, "logps/chosen": -924.5, "logps/rejected": -871.0, "loss": 0.5879, "rewards/accuracies": 0.78125, "rewards/chosen": 0.232421875, "rewards/margins": 5.0706787109375, "rewards/rejected": -4.841796875, "step": 5193 }, { "epoch": 0.9814351173886343, "grad_norm": 1.9480002492764452, "learning_rate": 1.0097850634276084e-07, "logits/chosen": 3.02099609375, "logits/rejected": 2.8115234375, "logps/chosen": -960.5, "logps/rejected": -1913.0, "loss": 0.6746, "rewards/accuracies": 0.75, "rewards/chosen": 0.711669921875, "rewards/margins": 6.65625, "rewards/rejected": -5.9453125, "step": 5194 }, { "epoch": 0.9816240729368416, "grad_norm": 2.6794607349406614, "learning_rate": 1.0095904098607942e-07, "logits/chosen": 3.1484375, "logits/rejected": 2.4375, "logps/chosen": -680.5, "logps/rejected": -626.0, "loss": 0.5096, "rewards/accuracies": 0.875, "rewards/chosen": 0.23193359375, "rewards/margins": 5.69921875, "rewards/rejected": -5.46875, "step": 5195 }, { "epoch": 0.9818130284850489, "grad_norm": 4.387945343220969, "learning_rate": 1.0093977098439438e-07, "logits/chosen": 3.09765625, "logits/rejected": 2.75, "logps/chosen": -726.0, "logps/rejected": -976.0, "loss": 0.4252, "rewards/accuracies": 0.9375, "rewards/chosen": 1.26904296875, "rewards/margins": 6.671875, "rewards/rejected": -5.40234375, "step": 5196 }, { "epoch": 0.9820019840332562, "grad_norm": 3.2324426826586325, "learning_rate": 1.0092069634608927e-07, "logits/chosen": 2.921875, "logits/rejected": 2.6953125, "logps/chosen": -918.5, "logps/rejected": -1046.5, "loss": 0.5103, "rewards/accuracies": 0.90625, "rewards/chosen": 1.0782470703125, "rewards/margins": 5.6484375, "rewards/rejected": -4.5625, "step": 5197 }, { "epoch": 0.9821909395814634, "grad_norm": 6.623038498160773, "learning_rate": 1.0090181707946245e-07, "logits/chosen": 3.10546875, "logits/rejected": 2.27734375, "logps/chosen": -704.0, "logps/rejected": -852.0, "loss": 0.4499, "rewards/accuracies": 0.96875, "rewards/chosen": 1.11944580078125, "rewards/margins": 7.0234375, "rewards/rejected": -5.890625, "step": 5198 }, { "epoch": 0.9823798951296707, "grad_norm": 2.466539556002083, "learning_rate": 1.0088313319272723e-07, "logits/chosen": 2.697265625, "logits/rejected": 2.38720703125, "logps/chosen": -899.0, "logps/rejected": -999.0, "loss": 0.4601, "rewards/accuracies": 0.875, "rewards/chosen": 0.705078125, "rewards/margins": 6.0859375, "rewards/rejected": -5.3828125, "step": 5199 }, { "epoch": 0.982568850677878, "grad_norm": 1.900220413947915, "learning_rate": 1.008646446940122e-07, "logits/chosen": 2.84375, "logits/rejected": 2.3515625, "logps/chosen": -775.0, "logps/rejected": -826.0, "loss": 0.5111, "rewards/accuracies": 0.9375, "rewards/chosen": 0.333984375, "rewards/margins": 5.2890625, "rewards/rejected": -4.9609375, "step": 5200 }, { "epoch": 0.9827578062260853, "grad_norm": 2.313777165681553, "learning_rate": 1.0084635159136062e-07, "logits/chosen": 3.00390625, "logits/rejected": 2.322265625, "logps/chosen": -757.0, "logps/rejected": -912.0, "loss": 0.4367, "rewards/accuracies": 0.9375, "rewards/chosen": 1.625, "rewards/margins": 7.2578125, "rewards/rejected": -5.625, "step": 5201 }, { "epoch": 0.9829467617742926, "grad_norm": 2.289708057402796, "learning_rate": 1.0082825389273097e-07, "logits/chosen": 3.052734375, "logits/rejected": 2.7666015625, "logps/chosen": -433.5, "logps/rejected": -914.0, "loss": 0.5663, "rewards/accuracies": 0.78125, "rewards/chosen": 0.8564453125, "rewards/margins": 5.9609375, "rewards/rejected": -5.1015625, "step": 5202 }, { "epoch": 0.9831357173224999, "grad_norm": 2.1876090903730105, "learning_rate": 1.0081035160599664e-07, "logits/chosen": 3.8671875, "logits/rejected": 3.62890625, "logps/chosen": -1230.0, "logps/rejected": -991.0, "loss": 0.5602, "rewards/accuracies": 0.75, "rewards/chosen": 1.4775390625, "rewards/margins": 5.3515625, "rewards/rejected": -3.876953125, "step": 5203 }, { "epoch": 0.9833246728707071, "grad_norm": 1.9513220870866996, "learning_rate": 1.0079264473894597e-07, "logits/chosen": 3.486328125, "logits/rejected": 3.4609375, "logps/chosen": -797.0, "logps/rejected": -1181.0, "loss": 0.6122, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6650390625, "rewards/margins": 5.6328125, "rewards/rejected": -4.95703125, "step": 5204 }, { "epoch": 0.9835136284189144, "grad_norm": 4.641126081850189, "learning_rate": 1.0077513329928238e-07, "logits/chosen": 2.2216796875, "logits/rejected": 1.88824462890625, "logps/chosen": -569.0, "logps/rejected": -907.0, "loss": 0.5431, "rewards/accuracies": 0.8125, "rewards/chosen": 0.6142578125, "rewards/margins": 5.15625, "rewards/rejected": -4.5546875, "step": 5205 }, { "epoch": 0.9837025839671217, "grad_norm": 3.4749708067311453, "learning_rate": 1.0075781729462414e-07, "logits/chosen": 2.853515625, "logits/rejected": 2.123046875, "logps/chosen": -630.0, "logps/rejected": -687.0, "loss": 0.5509, "rewards/accuracies": 0.90625, "rewards/chosen": 0.277008056640625, "rewards/margins": 4.10546875, "rewards/rejected": -3.83203125, "step": 5206 }, { "epoch": 0.983891539515329, "grad_norm": 4.311441080682518, "learning_rate": 1.0074069673250464e-07, "logits/chosen": 2.19921875, "logits/rejected": 1.990234375, "logps/chosen": -640.0, "logps/rejected": -1163.0, "loss": 0.6196, "rewards/accuracies": 0.84375, "rewards/chosen": 0.505126953125, "rewards/margins": 3.84765625, "rewards/rejected": -3.34765625, "step": 5207 }, { "epoch": 0.9840804950635363, "grad_norm": 2.8719207222695124, "learning_rate": 1.0072377162037214e-07, "logits/chosen": 3.53125, "logits/rejected": 3.44140625, "logps/chosen": -799.0, "logps/rejected": -928.0, "loss": 0.5907, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4619140625, "rewards/margins": 7.22265625, "rewards/rejected": -6.7734375, "step": 5208 }, { "epoch": 0.9842694506117435, "grad_norm": 5.295463860428135, "learning_rate": 1.0070704196558984e-07, "logits/chosen": 2.60546875, "logits/rejected": 2.26953125, "logps/chosen": -662.5, "logps/rejected": -705.0, "loss": 0.6919, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07275390625, "rewards/margins": 3.32421875, "rewards/rejected": -3.248046875, "step": 5209 }, { "epoch": 0.9844584061599508, "grad_norm": 1.9092979118453623, "learning_rate": 1.0069050777543607e-07, "logits/chosen": 3.5078125, "logits/rejected": 2.708984375, "logps/chosen": -788.5, "logps/rejected": -685.0, "loss": 0.4861, "rewards/accuracies": 0.84375, "rewards/chosen": 0.51806640625, "rewards/margins": 5.40625, "rewards/rejected": -4.8828125, "step": 5210 }, { "epoch": 0.9846473617081581, "grad_norm": 2.1203722981605186, "learning_rate": 1.0067416905710394e-07, "logits/chosen": 2.294921875, "logits/rejected": 2.072265625, "logps/chosen": -597.0, "logps/rejected": -746.0, "loss": 0.6002, "rewards/accuracies": 0.875, "rewards/chosen": 0.4736328125, "rewards/margins": 4.0625, "rewards/rejected": -3.59375, "step": 5211 }, { "epoch": 0.9848363172563654, "grad_norm": 1.652886148309073, "learning_rate": 1.0065802581770164e-07, "logits/chosen": 3.72265625, "logits/rejected": 2.875, "logps/chosen": -916.0, "logps/rejected": -767.0, "loss": 0.5987, "rewards/accuracies": 0.78125, "rewards/chosen": 0.79296875, "rewards/margins": 4.68359375, "rewards/rejected": -3.89453125, "step": 5212 }, { "epoch": 0.9850252728045727, "grad_norm": 2.1908389697010158, "learning_rate": 1.0064207806425228e-07, "logits/chosen": 2.330078125, "logits/rejected": 2.703125, "logps/chosen": -830.5, "logps/rejected": -1180.0, "loss": 0.5064, "rewards/accuracies": 0.875, "rewards/chosen": 0.9228515625, "rewards/margins": 6.4609375, "rewards/rejected": -5.52734375, "step": 5213 }, { "epoch": 0.98521422835278, "grad_norm": 2.3458614531002637, "learning_rate": 1.0062632580369388e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.931640625, "logps/chosen": -592.0, "logps/rejected": -598.0, "loss": 0.5809, "rewards/accuracies": 0.75, "rewards/chosen": 1.030029296875, "rewards/margins": 4.6953125, "rewards/rejected": -3.671875, "step": 5214 }, { "epoch": 0.9854031839009872, "grad_norm": 2.200448033575494, "learning_rate": 1.0061076904287948e-07, "logits/chosen": 3.0390625, "logits/rejected": 2.25, "logps/chosen": -685.5, "logps/rejected": -780.0, "loss": 0.5152, "rewards/accuracies": 0.84375, "rewards/chosen": 0.8232421875, "rewards/margins": 6.171875, "rewards/rejected": -5.3515625, "step": 5215 }, { "epoch": 0.9855921394491945, "grad_norm": 4.396639612841511, "learning_rate": 1.0059540778857706e-07, "logits/chosen": 3.056640625, "logits/rejected": 3.40625, "logps/chosen": -839.0, "logps/rejected": -771.5, "loss": 0.5148, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0003662109375, "rewards/margins": 6.22265625, "rewards/rejected": -6.21484375, "step": 5216 }, { "epoch": 0.9857810949974019, "grad_norm": 2.25486604428313, "learning_rate": 1.0058024204746947e-07, "logits/chosen": 2.779296875, "logits/rejected": 2.01171875, "logps/chosen": -1032.0, "logps/rejected": -961.0, "loss": 0.6249, "rewards/accuracies": 0.8125, "rewards/chosen": 0.123046875, "rewards/margins": 4.71875, "rewards/rejected": -4.609375, "step": 5217 }, { "epoch": 0.9859700505456092, "grad_norm": 2.443826490893481, "learning_rate": 1.0056527182615464e-07, "logits/chosen": 2.3515625, "logits/rejected": 1.8994140625, "logps/chosen": -812.0, "logps/rejected": -810.0, "loss": 0.5467, "rewards/accuracies": 0.8125, "rewards/chosen": 0.81689453125, "rewards/margins": 5.4453125, "rewards/rejected": -4.6171875, "step": 5218 }, { "epoch": 0.9861590060938165, "grad_norm": 2.566359067533974, "learning_rate": 1.0055049713114526e-07, "logits/chosen": 2.884765625, "logits/rejected": 2.1923828125, "logps/chosen": -622.5, "logps/rejected": -730.0, "loss": 0.6388, "rewards/accuracies": 0.71875, "rewards/chosen": 0.40185546875, "rewards/margins": 3.685546875, "rewards/rejected": -3.2880859375, "step": 5219 }, { "epoch": 0.9863479616420238, "grad_norm": 1.3148099477167794, "learning_rate": 1.0053591796886908e-07, "logits/chosen": 2.62890625, "logits/rejected": 2.05859375, "logps/chosen": -900.0, "logps/rejected": -1024.0, "loss": 0.5645, "rewards/accuracies": 0.75, "rewards/chosen": 0.5341796875, "rewards/margins": 5.5546875, "rewards/rejected": -5.015625, "step": 5220 }, { "epoch": 0.986536917190231, "grad_norm": 2.3643214381747044, "learning_rate": 1.0052153434566883e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.796875, "logps/chosen": -768.5, "logps/rejected": -889.0, "loss": 0.5512, "rewards/accuracies": 0.8125, "rewards/chosen": 0.857177734375, "rewards/margins": 6.4765625, "rewards/rejected": -5.625, "step": 5221 }, { "epoch": 0.9867258727384383, "grad_norm": 2.0513664278504717, "learning_rate": 1.0050734626780201e-07, "logits/chosen": 3.01953125, "logits/rejected": 2.705078125, "logps/chosen": -795.5, "logps/rejected": -1283.5, "loss": 0.6271, "rewards/accuracies": 0.71875, "rewards/chosen": 0.328125, "rewards/margins": 5.041015625, "rewards/rejected": -4.720703125, "step": 5222 }, { "epoch": 0.9869148282866456, "grad_norm": 2.2844859488971534, "learning_rate": 1.0049335374144111e-07, "logits/chosen": 2.2275390625, "logits/rejected": 2.251953125, "logps/chosen": -771.0, "logps/rejected": -1083.0, "loss": 0.5325, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0306396484375, "rewards/margins": 7.20703125, "rewards/rejected": -7.2421875, "step": 5223 }, { "epoch": 0.9871037838348529, "grad_norm": 2.597342170494659, "learning_rate": 1.0047955677267371e-07, "logits/chosen": 2.7421875, "logits/rejected": 2.298828125, "logps/chosen": -827.0, "logps/rejected": -910.0, "loss": 0.5222, "rewards/accuracies": 0.78125, "rewards/chosen": 1.0908203125, "rewards/margins": 5.26171875, "rewards/rejected": -4.15625, "step": 5224 }, { "epoch": 0.9872927393830602, "grad_norm": 5.262053747854233, "learning_rate": 1.00465955367502e-07, "logits/chosen": 3.37890625, "logits/rejected": 3.078125, "logps/chosen": -861.5, "logps/rejected": -834.0, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": 1.01171875, "rewards/margins": 5.1533203125, "rewards/rejected": -4.142578125, "step": 5225 }, { "epoch": 0.9874816949312675, "grad_norm": 2.7580152384844046, "learning_rate": 1.0045254953184344e-07, "logits/chosen": 3.859375, "logits/rejected": 3.32421875, "logps/chosen": -639.5, "logps/rejected": -830.0, "loss": 0.7143, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3450927734375, "rewards/margins": 3.51953125, "rewards/rejected": -3.86328125, "step": 5226 }, { "epoch": 0.9876706504794747, "grad_norm": 2.8420503332791713, "learning_rate": 1.004393392715301e-07, "logits/chosen": 3.091796875, "logits/rejected": 2.4140625, "logps/chosen": -840.5, "logps/rejected": -861.0, "loss": 0.6247, "rewards/accuracies": 0.75, "rewards/chosen": 0.963134765625, "rewards/margins": 5.28125, "rewards/rejected": -4.3125, "step": 5227 }, { "epoch": 0.987859606027682, "grad_norm": 3.156438946119483, "learning_rate": 1.0042632459230912e-07, "logits/chosen": 2.7890625, "logits/rejected": 2.578125, "logps/chosen": -955.5, "logps/rejected": -862.0, "loss": 0.6157, "rewards/accuracies": 0.875, "rewards/chosen": 0.130859375, "rewards/margins": 5.65625, "rewards/rejected": -5.5234375, "step": 5228 }, { "epoch": 0.9880485615758893, "grad_norm": 2.2797522723440182, "learning_rate": 1.0041350549984254e-07, "logits/chosen": 3.5625, "logits/rejected": 2.953125, "logps/chosen": -572.5, "logps/rejected": -14898.0, "loss": 0.7396, "rewards/accuracies": 0.71875, "rewards/chosen": -0.051025390625, "rewards/margins": -170.20703125, "rewards/rejected": 170.5703125, "step": 5229 }, { "epoch": 0.9882375171240966, "grad_norm": 2.8376203427102036, "learning_rate": 1.0040088199970733e-07, "logits/chosen": 3.87109375, "logits/rejected": 3.76953125, "logps/chosen": -1090.0, "logps/rejected": -1702.0, "loss": 0.4614, "rewards/accuracies": 0.78125, "rewards/chosen": 1.9072265625, "rewards/margins": 10.0703125, "rewards/rejected": -8.15625, "step": 5230 }, { "epoch": 0.9884264726723039, "grad_norm": 2.7558639548148633, "learning_rate": 1.0038845409739528e-07, "logits/chosen": 3.5625, "logits/rejected": 2.828125, "logps/chosen": -555.5, "logps/rejected": -627.0, "loss": 0.6244, "rewards/accuracies": 0.875, "rewards/chosen": 0.29345703125, "rewards/margins": 5.3671875, "rewards/rejected": -5.07421875, "step": 5231 }, { "epoch": 0.9886154282205111, "grad_norm": 2.508130328851801, "learning_rate": 1.0037622179831318e-07, "logits/chosen": 3.6015625, "logits/rejected": 3.2890625, "logps/chosen": -826.0, "logps/rejected": -813.0, "loss": 0.5018, "rewards/accuracies": 0.8125, "rewards/chosen": 0.669921875, "rewards/margins": 5.26953125, "rewards/rejected": -4.59765625, "step": 5232 }, { "epoch": 0.9888043837687184, "grad_norm": 2.6590626659765424, "learning_rate": 1.0036418510778269e-07, "logits/chosen": 3.125, "logits/rejected": 2.326171875, "logps/chosen": -811.0, "logps/rejected": -1360.0, "loss": 0.5743, "rewards/accuracies": 0.75, "rewards/chosen": 0.7666015625, "rewards/margins": 9.19921875, "rewards/rejected": -8.45703125, "step": 5233 }, { "epoch": 0.9889933393169257, "grad_norm": 3.057462377184918, "learning_rate": 1.0035234403104032e-07, "logits/chosen": 2.921875, "logits/rejected": 2.62109375, "logps/chosen": -1040.0, "logps/rejected": -1237.0, "loss": 0.6166, "rewards/accuracies": 0.78125, "rewards/chosen": 0.605712890625, "rewards/margins": 5.82421875, "rewards/rejected": -5.23046875, "step": 5234 }, { "epoch": 0.989182294865133, "grad_norm": 2.657109345976435, "learning_rate": 1.0034069857323754e-07, "logits/chosen": 2.4677734375, "logits/rejected": 2.3472900390625, "logps/chosen": -925.0, "logps/rejected": -1650.0, "loss": 0.4266, "rewards/accuracies": 0.9375, "rewards/chosen": 1.101806640625, "rewards/margins": 11.0234375, "rewards/rejected": -9.9609375, "step": 5235 }, { "epoch": 0.9893712504133403, "grad_norm": 3.6117922369293103, "learning_rate": 1.0032924873944076e-07, "logits/chosen": 2.8671875, "logits/rejected": 2.04296875, "logps/chosen": -647.25, "logps/rejected": -493.0, "loss": 0.4817, "rewards/accuracies": 0.90625, "rewards/chosen": 0.377685546875, "rewards/margins": 5.48046875, "rewards/rejected": -5.09375, "step": 5236 }, { "epoch": 0.9895602059615476, "grad_norm": 2.6782771590606975, "learning_rate": 1.0031799453463113e-07, "logits/chosen": 3.41796875, "logits/rejected": 3.12109375, "logps/chosen": -857.0, "logps/rejected": -1570.5, "loss": 0.5532, "rewards/accuracies": 0.71875, "rewards/chosen": 0.41668701171875, "rewards/margins": 14.87890625, "rewards/rejected": -14.51953125, "step": 5237 }, { "epoch": 0.9897491615097548, "grad_norm": 3.1387948734786533, "learning_rate": 1.0030693596370484e-07, "logits/chosen": 2.2705078125, "logits/rejected": 2.23687744140625, "logps/chosen": -865.0, "logps/rejected": -1541.0, "loss": 0.5249, "rewards/accuracies": 0.8125, "rewards/chosen": 0.48828125, "rewards/margins": 7.6171875, "rewards/rejected": -7.1328125, "step": 5238 }, { "epoch": 0.9899381170579621, "grad_norm": 2.2700404231905646, "learning_rate": 1.0029607303147291e-07, "logits/chosen": 3.068359375, "logits/rejected": 2.7265625, "logps/chosen": -914.0, "logps/rejected": -781.0, "loss": 0.541, "rewards/accuracies": 0.78125, "rewards/chosen": 1.3408203125, "rewards/margins": 5.6875, "rewards/rejected": -4.345703125, "step": 5239 }, { "epoch": 0.9901270726061694, "grad_norm": 2.4462315528459397, "learning_rate": 1.0028540574266128e-07, "logits/chosen": 2.37890625, "logits/rejected": 1.91796875, "logps/chosen": -842.5, "logps/rejected": -835.0, "loss": 0.5979, "rewards/accuracies": 0.75, "rewards/chosen": -0.076019287109375, "rewards/margins": 5.033203125, "rewards/rejected": -5.111328125, "step": 5240 }, { "epoch": 0.9903160281543767, "grad_norm": 3.7151412242532165, "learning_rate": 1.0027493410191066e-07, "logits/chosen": 2.55078125, "logits/rejected": 1.771240234375, "logps/chosen": -746.0, "logps/rejected": -766.5, "loss": 0.576, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1298828125, "rewards/margins": 4.48046875, "rewards/rejected": -4.34765625, "step": 5241 }, { "epoch": 0.990504983702584, "grad_norm": 2.3453540908147987, "learning_rate": 1.0026465811377681e-07, "logits/chosen": 3.0546875, "logits/rejected": 2.83984375, "logps/chosen": -504.0, "logps/rejected": -1353.5, "loss": 0.4249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9384765625, "rewards/margins": 8.1953125, "rewards/rejected": -7.26953125, "step": 5242 }, { "epoch": 0.9906939392507913, "grad_norm": 3.7605764782408633, "learning_rate": 1.0025457778273028e-07, "logits/chosen": 2.115234375, "logits/rejected": 1.5703125, "logps/chosen": -864.5, "logps/rejected": -934.5, "loss": 0.5193, "rewards/accuracies": 0.84375, "rewards/chosen": 0.65234375, "rewards/margins": 6.294921875, "rewards/rejected": -5.62890625, "step": 5243 }, { "epoch": 0.9908828947989985, "grad_norm": 2.7853761689177117, "learning_rate": 1.0024469311315647e-07, "logits/chosen": 2.451171875, "logits/rejected": 2.544921875, "logps/chosen": -639.0, "logps/rejected": -626.0, "loss": 0.6805, "rewards/accuracies": 0.78125, "rewards/chosen": -0.208984375, "rewards/margins": 3.125, "rewards/rejected": -3.328125, "step": 5244 }, { "epoch": 0.9910718503472058, "grad_norm": 3.94599081554322, "learning_rate": 1.0023500410935575e-07, "logits/chosen": 2.92578125, "logits/rejected": 3.1953125, "logps/chosen": -928.0, "logps/rejected": -781.0, "loss": 0.6259, "rewards/accuracies": 0.84375, "rewards/chosen": 0.2529296875, "rewards/margins": 4.6640625, "rewards/rejected": -4.41015625, "step": 5245 }, { "epoch": 0.9912608058954131, "grad_norm": 2.5761708497519655, "learning_rate": 1.0022551077554324e-07, "logits/chosen": 2.5693359375, "logits/rejected": 2.666015625, "logps/chosen": -662.0, "logps/rejected": -1123.0, "loss": 0.5955, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1669921875, "rewards/margins": 6.66015625, "rewards/rejected": -6.828125, "step": 5246 }, { "epoch": 0.9914497614436204, "grad_norm": 3.67240641979133, "learning_rate": 1.0021621311584913e-07, "logits/chosen": 2.98828125, "logits/rejected": 2.6484375, "logps/chosen": -537.5, "logps/rejected": -588.0, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": 0.111572265625, "rewards/margins": 5.71484375, "rewards/rejected": -5.59765625, "step": 5247 }, { "epoch": 0.9916387169918277, "grad_norm": 1.7848289257324486, "learning_rate": 1.0020711113431826e-07, "logits/chosen": 2.49609375, "logits/rejected": 2.349609375, "logps/chosen": -1274.0, "logps/rejected": -1102.0, "loss": 0.5934, "rewards/accuracies": 0.6875, "rewards/chosen": 1.794921875, "rewards/margins": 6.322265625, "rewards/rejected": -4.517578125, "step": 5248 }, { "epoch": 0.991827672540035, "grad_norm": 3.0137704746368104, "learning_rate": 1.0019820483491048e-07, "logits/chosen": 3.58203125, "logits/rejected": 3.6875, "logps/chosen": -759.0, "logps/rejected": -1989.0, "loss": 0.504, "rewards/accuracies": 0.78125, "rewards/chosen": 0.7490234375, "rewards/margins": 9.63671875, "rewards/rejected": -8.8828125, "step": 5249 }, { "epoch": 0.9920166280882422, "grad_norm": 3.982976733120911, "learning_rate": 1.0018949422150047e-07, "logits/chosen": 2.93359375, "logits/rejected": 2.81640625, "logps/chosen": -1123.0, "logps/rejected": -2016.0, "loss": 0.5073, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6396484375, "rewards/margins": 9.2578125, "rewards/rejected": -8.62109375, "step": 5250 }, { "epoch": 0.9922055836364495, "grad_norm": 2.7443889592460957, "learning_rate": 1.0018097929787778e-07, "logits/chosen": 3.27734375, "logits/rejected": 3.2021484375, "logps/chosen": -585.5, "logps/rejected": -850.5, "loss": 0.5306, "rewards/accuracies": 0.8125, "rewards/chosen": 1.2060546875, "rewards/margins": 5.0234375, "rewards/rejected": -3.8125, "step": 5251 }, { "epoch": 0.9923945391846568, "grad_norm": 3.1009388034946874, "learning_rate": 1.0017266006774678e-07, "logits/chosen": 3.263671875, "logits/rejected": 3.353515625, "logps/chosen": -1001.0, "logps/rejected": -1097.5, "loss": 0.4741, "rewards/accuracies": 0.84375, "rewards/chosen": 1.345703125, "rewards/margins": 5.8984375, "rewards/rejected": -4.548828125, "step": 5252 }, { "epoch": 0.9925834947328641, "grad_norm": 3.9513678594291175, "learning_rate": 1.0016453653472683e-07, "logits/chosen": 3.181640625, "logits/rejected": 2.89453125, "logps/chosen": -701.5, "logps/rejected": -670.0, "loss": 0.5832, "rewards/accuracies": 0.84375, "rewards/chosen": 0.95263671875, "rewards/margins": 4.69140625, "rewards/rejected": -3.7421875, "step": 5253 }, { "epoch": 0.9927724502810714, "grad_norm": 3.251318599989131, "learning_rate": 1.00156608702352e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.4453125, "logps/chosen": -614.0, "logps/rejected": -1003.0, "loss": 0.5774, "rewards/accuracies": 0.8125, "rewards/chosen": 0.359130859375, "rewards/margins": 5.0, "rewards/rejected": -4.6484375, "step": 5254 }, { "epoch": 0.9929614058292786, "grad_norm": 2.254168080682373, "learning_rate": 1.0014887657407125e-07, "logits/chosen": 2.888671875, "logits/rejected": 2.646484375, "logps/chosen": -730.0, "logps/rejected": -1041.0, "loss": 0.5759, "rewards/accuracies": 0.875, "rewards/chosen": 0.55078125, "rewards/margins": 4.91796875, "rewards/rejected": -4.36328125, "step": 5255 }, { "epoch": 0.9931503613774859, "grad_norm": 2.8923240966529056, "learning_rate": 1.001413401532486e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.60546875, "logps/chosen": -792.0, "logps/rejected": -973.0, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": -0.05712890625, "rewards/margins": 5.0625, "rewards/rejected": -5.125, "step": 5256 }, { "epoch": 0.9933393169256932, "grad_norm": 3.1861741381463426, "learning_rate": 1.0013399944316255e-07, "logits/chosen": 3.126953125, "logits/rejected": 2.4140625, "logps/chosen": -657.5, "logps/rejected": -654.0, "loss": 0.5433, "rewards/accuracies": 0.90625, "rewards/chosen": 0.5030517578125, "rewards/margins": 5.171875, "rewards/rejected": -4.6640625, "step": 5257 }, { "epoch": 0.9935282724739005, "grad_norm": 4.370430588544789, "learning_rate": 1.0012685444700687e-07, "logits/chosen": 2.8359375, "logits/rejected": 2.7294921875, "logps/chosen": -1129.0, "logps/rejected": -1542.5, "loss": 0.481, "rewards/accuracies": 0.84375, "rewards/chosen": 1.295166015625, "rewards/margins": 10.4609375, "rewards/rejected": -9.12890625, "step": 5258 }, { "epoch": 0.9937172280221078, "grad_norm": 2.5683022852259465, "learning_rate": 1.0011990516788987e-07, "logits/chosen": 2.55078125, "logits/rejected": 2.57421875, "logps/chosen": -750.0, "logps/rejected": -875.0, "loss": 0.6573, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05517578125, "rewards/margins": 4.025390625, "rewards/rejected": -4.078125, "step": 5259 }, { "epoch": 0.9939061835703151, "grad_norm": 2.4154074958378926, "learning_rate": 1.0011315160883482e-07, "logits/chosen": 2.93359375, "logits/rejected": 2.1796875, "logps/chosen": -700.0, "logps/rejected": -603.0, "loss": 0.7218, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08642578125, "rewards/margins": 2.63671875, "rewards/rejected": -2.7255859375, "step": 5260 }, { "epoch": 0.9940951391185223, "grad_norm": 2.1910099521409796, "learning_rate": 1.0010659377277994e-07, "logits/chosen": 3.056640625, "logits/rejected": 2.865234375, "logps/chosen": -1105.0, "logps/rejected": -1845.0, "loss": 0.5111, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5322265625, "rewards/margins": 7.01171875, "rewards/rejected": -5.49609375, "step": 5261 }, { "epoch": 0.9942840946667296, "grad_norm": 3.263874567317165, "learning_rate": 1.0010023166257812e-07, "logits/chosen": 3.5390625, "logits/rejected": 3.26953125, "logps/chosen": -1392.5, "logps/rejected": -844.0, "loss": 0.7344, "rewards/accuracies": 0.625, "rewards/chosen": -2.89453125, "rewards/margins": 0.90625, "rewards/rejected": -3.796875, "step": 5262 }, { "epoch": 0.9944730502149369, "grad_norm": 1.695133236036031, "learning_rate": 1.0009406528099727e-07, "logits/chosen": 3.55859375, "logits/rejected": 3.28515625, "logps/chosen": -858.0, "logps/rejected": -1060.0, "loss": 0.489, "rewards/accuracies": 0.78125, "rewards/chosen": 1.63671875, "rewards/margins": 6.64453125, "rewards/rejected": -4.99609375, "step": 5263 }, { "epoch": 0.9946620057631442, "grad_norm": 3.016769217988424, "learning_rate": 1.0008809463072004e-07, "logits/chosen": 2.9609375, "logits/rejected": 2.673828125, "logps/chosen": -740.0, "logps/rejected": -904.0, "loss": 0.512, "rewards/accuracies": 0.875, "rewards/chosen": 0.73779296875, "rewards/margins": 5.734375, "rewards/rejected": -4.98828125, "step": 5264 }, { "epoch": 0.9948509613113515, "grad_norm": 2.1682558279224873, "learning_rate": 1.0008231971434394e-07, "logits/chosen": 2.490234375, "logits/rejected": 2.259765625, "logps/chosen": -948.0, "logps/rejected": -1147.0, "loss": 0.4983, "rewards/accuracies": 0.90625, "rewards/chosen": 1.083984375, "rewards/margins": 7.43359375, "rewards/rejected": -6.359375, "step": 5265 }, { "epoch": 0.9950399168595588, "grad_norm": 2.458798386476254, "learning_rate": 1.0007674053438136e-07, "logits/chosen": 2.30859375, "logits/rejected": 2.369140625, "logps/chosen": -963.0, "logps/rejected": -1768.0, "loss": 0.5587, "rewards/accuracies": 0.875, "rewards/chosen": 1.21331787109375, "rewards/margins": 10.1171875, "rewards/rejected": -8.890625, "step": 5266 }, { "epoch": 0.995228872407766, "grad_norm": 3.2918041743610766, "learning_rate": 1.0007135709325947e-07, "logits/chosen": 2.59375, "logits/rejected": 1.962890625, "logps/chosen": -1108.0, "logps/rejected": -912.0, "loss": 0.5017, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4200439453125, "rewards/margins": 4.80078125, "rewards/rejected": -4.37109375, "step": 5267 }, { "epoch": 0.9954178279559733, "grad_norm": 3.07847301505179, "learning_rate": 1.0006616939332045e-07, "logits/chosen": 1.66796875, "logits/rejected": 1.791015625, "logps/chosen": -605.5, "logps/rejected": -690.0, "loss": 0.6383, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4805908203125, "rewards/margins": 4.57421875, "rewards/rejected": -5.046875, "step": 5268 }, { "epoch": 0.9956067835041806, "grad_norm": 3.427625043649102, "learning_rate": 1.0006117743682109e-07, "logits/chosen": 3.1171875, "logits/rejected": 2.9453125, "logps/chosen": -991.0, "logps/rejected": -1184.0, "loss": 0.5757, "rewards/accuracies": 0.8125, "rewards/chosen": 0.86083984375, "rewards/margins": 8.74609375, "rewards/rejected": -7.87890625, "step": 5269 }, { "epoch": 0.9957957390523879, "grad_norm": 1.9876076624762682, "learning_rate": 1.0005638122593319e-07, "logits/chosen": 3.05859375, "logits/rejected": 2.80078125, "logps/chosen": -782.0, "logps/rejected": -1029.0, "loss": 0.408, "rewards/accuracies": 0.90625, "rewards/chosen": 1.35546875, "rewards/margins": 7.0625, "rewards/rejected": -5.7109375, "step": 5270 }, { "epoch": 0.9959846946005952, "grad_norm": 2.5146396052645046, "learning_rate": 1.0005178076274335e-07, "logits/chosen": 2.306640625, "logits/rejected": 2.357421875, "logps/chosen": -1141.0, "logps/rejected": -2172.0, "loss": 0.6145, "rewards/accuracies": 0.8125, "rewards/chosen": 0.8916015625, "rewards/margins": 9.65234375, "rewards/rejected": -8.765625, "step": 5271 }, { "epoch": 0.9961736501488025, "grad_norm": 2.4884888090717325, "learning_rate": 1.0004737604925295e-07, "logits/chosen": 3.119140625, "logits/rejected": 2.4375, "logps/chosen": -741.0, "logps/rejected": -604.0, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": 0.8759765625, "rewards/margins": 4.9609375, "rewards/rejected": -4.0859375, "step": 5272 }, { "epoch": 0.9963626056970097, "grad_norm": 2.4853390933492125, "learning_rate": 1.0004316708737827e-07, "logits/chosen": 3.765625, "logits/rejected": 3.4921875, "logps/chosen": -927.0, "logps/rejected": -1251.0, "loss": 0.3561, "rewards/accuracies": 1.0, "rewards/chosen": 1.3134765625, "rewards/margins": 8.9296875, "rewards/rejected": -7.6171875, "step": 5273 }, { "epoch": 0.996551561245217, "grad_norm": 1.4502105095421773, "learning_rate": 1.0003915387895048e-07, "logits/chosen": 3.6953125, "logits/rejected": 3.53515625, "logps/chosen": -910.0, "logps/rejected": -694.5, "loss": 0.6468, "rewards/accuracies": 0.75, "rewards/chosen": 0.5205078125, "rewards/margins": 4.34375, "rewards/rejected": -3.8203125, "step": 5274 }, { "epoch": 0.9967405167934243, "grad_norm": 2.2020216125853227, "learning_rate": 1.0003533642571546e-07, "logits/chosen": 3.046875, "logits/rejected": 2.4765625, "logps/chosen": -491.0, "logps/rejected": -576.5, "loss": 0.562, "rewards/accuracies": 0.84375, "rewards/chosen": 0.38189697265625, "rewards/margins": 4.5859375, "rewards/rejected": -4.203125, "step": 5275 }, { "epoch": 0.9969294723416317, "grad_norm": 2.200613464569596, "learning_rate": 1.0003171472933403e-07, "logits/chosen": 3.4453125, "logits/rejected": 3.2890625, "logps/chosen": -898.5, "logps/rejected": -964.0, "loss": 0.6117, "rewards/accuracies": 0.84375, "rewards/chosen": 0.5, "rewards/margins": 5.0703125, "rewards/rejected": -4.56640625, "step": 5276 }, { "epoch": 0.997118427889839, "grad_norm": 6.180498896206653, "learning_rate": 1.0002828879138174e-07, "logits/chosen": 2.884765625, "logits/rejected": 3.00390625, "logps/chosen": -578.25, "logps/rejected": -733.5, "loss": 0.6343, "rewards/accuracies": 0.78125, "rewards/chosen": 0.452880859375, "rewards/margins": 5.0859375, "rewards/rejected": -4.638671875, "step": 5277 }, { "epoch": 0.9973073834380461, "grad_norm": 2.095468900260074, "learning_rate": 1.0002505861334909e-07, "logits/chosen": 2.033203125, "logits/rejected": 1.6376953125, "logps/chosen": -726.0, "logps/rejected": -717.5, "loss": 0.5736, "rewards/accuracies": 0.9375, "rewards/chosen": 0.671142578125, "rewards/margins": 5.51953125, "rewards/rejected": -4.84765625, "step": 5278 }, { "epoch": 0.9974963389862535, "grad_norm": 2.5863394586787076, "learning_rate": 1.0002202419664137e-07, "logits/chosen": 3.328125, "logits/rejected": 3.33203125, "logps/chosen": -878.5, "logps/rejected": -1733.5, "loss": 0.4759, "rewards/accuracies": 0.90625, "rewards/chosen": 1.117431640625, "rewards/margins": 8.13671875, "rewards/rejected": -7.0234375, "step": 5279 }, { "epoch": 0.9976852945344608, "grad_norm": 1.6586072712556867, "learning_rate": 1.0001918554257872e-07, "logits/chosen": 3.19140625, "logits/rejected": 2.640625, "logps/chosen": -1140.0, "logps/rejected": -1060.0, "loss": 0.4555, "rewards/accuracies": 0.875, "rewards/chosen": 1.8388671875, "rewards/margins": 6.6640625, "rewards/rejected": -4.8359375, "step": 5280 }, { "epoch": 0.9978742500826681, "grad_norm": 2.5853027685222045, "learning_rate": 1.0001654265239607e-07, "logits/chosen": 3.38671875, "logits/rejected": 2.595703125, "logps/chosen": -879.0, "logps/rejected": -770.0, "loss": 0.5387, "rewards/accuracies": 0.875, "rewards/chosen": 0.52294921875, "rewards/margins": 4.921875, "rewards/rejected": -4.3984375, "step": 5281 }, { "epoch": 0.9980632056308754, "grad_norm": 5.031667518082234, "learning_rate": 1.0001409552724316e-07, "logits/chosen": 2.88671875, "logits/rejected": 2.1796875, "logps/chosen": -915.0, "logps/rejected": -1568.0, "loss": 0.5541, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9354248046875, "rewards/margins": 5.92578125, "rewards/rejected": -4.98046875, "step": 5282 }, { "epoch": 0.9982521611790827, "grad_norm": 2.1340937436452925, "learning_rate": 1.0001184416818473e-07, "logits/chosen": 2.96875, "logits/rejected": 2.25390625, "logps/chosen": -716.5, "logps/rejected": -747.5, "loss": 0.5388, "rewards/accuracies": 0.71875, "rewards/chosen": 0.69091796875, "rewards/margins": 5.0625, "rewards/rejected": -4.37109375, "step": 5283 }, { "epoch": 0.9984411167272899, "grad_norm": 2.6969272399565667, "learning_rate": 1.000097885762001e-07, "logits/chosen": 3.55078125, "logits/rejected": 3.1953125, "logps/chosen": -818.5, "logps/rejected": -863.0, "loss": 0.5205, "rewards/accuracies": 0.875, "rewards/chosen": 1.0712890625, "rewards/margins": 5.8359375, "rewards/rejected": -4.75390625, "step": 5284 }, { "epoch": 0.9986300722754972, "grad_norm": 3.3543954609726194, "learning_rate": 1.0000792875218361e-07, "logits/chosen": 3.08203125, "logits/rejected": 2.689453125, "logps/chosen": -711.0, "logps/rejected": -889.0, "loss": 0.455, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3681640625, "rewards/margins": 6.6015625, "rewards/rejected": -5.234375, "step": 5285 }, { "epoch": 0.9988190278237045, "grad_norm": 3.3112240940536255, "learning_rate": 1.000062646969444e-07, "logits/chosen": 3.3828125, "logits/rejected": 3.44921875, "logps/chosen": -1341.0, "logps/rejected": -1165.0, "loss": 0.5595, "rewards/accuracies": 0.8125, "rewards/chosen": 1.966796875, "rewards/margins": 6.0625, "rewards/rejected": -4.08203125, "step": 5286 }, { "epoch": 0.9990079833719118, "grad_norm": 1.94810978698751, "learning_rate": 1.000047964112064e-07, "logits/chosen": 2.8125, "logits/rejected": 2.341796875, "logps/chosen": -770.0, "logps/rejected": -1231.0, "loss": 0.5648, "rewards/accuracies": 0.8125, "rewards/chosen": 0.72119140625, "rewards/margins": 5.53515625, "rewards/rejected": -4.80859375, "step": 5287 }, { "epoch": 0.9991969389201191, "grad_norm": 2.9080130603200427, "learning_rate": 1.0000352389560841e-07, "logits/chosen": 2.25, "logits/rejected": 1.908203125, "logps/chosen": -827.0, "logps/rejected": -1117.0, "loss": 0.5229, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5078125, "rewards/margins": 6.01953125, "rewards/rejected": -4.52734375, "step": 5288 }, { "epoch": 0.9993858944683264, "grad_norm": 2.4674031841024218, "learning_rate": 1.0000244715070397e-07, "logits/chosen": 2.5078125, "logits/rejected": 2.0126953125, "logps/chosen": -624.0, "logps/rejected": -1030.0, "loss": 0.5695, "rewards/accuracies": 0.78125, "rewards/chosen": 0.78564453125, "rewards/margins": 5.75, "rewards/rejected": -4.96875, "step": 5289 }, { "epoch": 0.9995748500165336, "grad_norm": 2.5547764217205238, "learning_rate": 1.0000156617696157e-07, "logits/chosen": 2.65625, "logits/rejected": 2.23046875, "logps/chosen": -805.5, "logps/rejected": -657.5, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": 0.4384765625, "rewards/margins": 4.0625, "rewards/rejected": -3.62890625, "step": 5290 }, { "epoch": 0.9997638055647409, "grad_norm": 4.642179456286903, "learning_rate": 1.0000088097476444e-07, "logits/chosen": 3.18359375, "logits/rejected": 3.0, "logps/chosen": -861.0, "logps/rejected": -731.0, "loss": 0.595, "rewards/accuracies": 0.75, "rewards/chosen": 1.2060546875, "rewards/margins": 4.48876953125, "rewards/rejected": -3.27850341796875, "step": 5291 }, { "epoch": 0.9999527611129482, "grad_norm": 3.2738414122437365, "learning_rate": 1.0000039154441074e-07, "logits/chosen": 2.12890625, "logits/rejected": 1.8671875, "logps/chosen": -1010.0, "logps/rejected": -1141.0, "loss": 0.5014, "rewards/accuracies": 0.78125, "rewards/chosen": 1.86083984375, "rewards/margins": 5.48828125, "rewards/rejected": -3.634765625, "step": 5292 }, { "epoch": 1.0, "grad_norm": 4.422447571135364, "learning_rate": 1.0000009788611332e-07, "logits/chosen": 3.34375, "logits/rejected": 1.828125, "logps/chosen": -130.0, "logps/rejected": -139.0, "loss": 0.7219, "rewards/accuracies": 0.5, "rewards/chosen": -3.03125, "rewards/margins": 0.5, "rewards/rejected": -3.53125, "step": 5293 }, { "epoch": 1.0, "step": 5293, "total_flos": 0.0, "train_loss": 0.6372502187452401, "train_runtime": 27875.4595, "train_samples_per_second": 6.075, "train_steps_per_second": 0.19 } ], "logging_steps": 1, "max_steps": 5293, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2647, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }