{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9798657718120807,
"eval_steps": 0,
"global_step": 222,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013422818791946308,
"grad_norm": 316.97791395060926,
"learning_rate": 4.347826086956521e-08,
"logits/chosen": 0.86328125,
"logits/rejected": 1.09375,
"logps/chosen": -127.5,
"logps/rejected": -150.0,
"loss": 0.6914,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.026845637583892617,
"grad_norm": 320.21238167334917,
"learning_rate": 8.695652173913042e-08,
"logits/chosen": 1.515625,
"logits/rejected": 2.109375,
"logps/chosen": -115.5,
"logps/rejected": -178.0,
"loss": 0.6914,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.040268456375838924,
"grad_norm": 296.99175855416775,
"learning_rate": 1.3043478260869563e-07,
"logits/chosen": 1.2265625,
"logits/rejected": 0.8046875,
"logps/chosen": -180.0,
"logps/rejected": -130.0,
"loss": 0.71,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.125,
"rewards/margins": -0.125,
"rewards/rejected": 0.0,
"step": 3
},
{
"epoch": 0.053691275167785234,
"grad_norm": 307.2302419398515,
"learning_rate": 1.7391304347826085e-07,
"logits/chosen": 0.81640625,
"logits/rejected": 1.171875,
"logps/chosen": -137.0,
"logps/rejected": -193.0,
"loss": 0.7095,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.125,
"rewards/margins": -0.25,
"rewards/rejected": 0.125,
"step": 4
},
{
"epoch": 0.06711409395973154,
"grad_norm": 281.7519292418279,
"learning_rate": 2.1739130434782607e-07,
"logits/chosen": -0.146484375,
"logits/rejected": -0.07275390625,
"logps/chosen": -138.0,
"logps/rejected": -138.0,
"loss": 0.7065,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.25,
"rewards/margins": 0.0625,
"rewards/rejected": 0.1875,
"step": 5
},
{
"epoch": 0.08053691275167785,
"grad_norm": 334.9778297964166,
"learning_rate": 2.6086956521739126e-07,
"logits/chosen": 0.44140625,
"logits/rejected": 0.51953125,
"logps/chosen": -122.0,
"logps/rejected": -109.0,
"loss": 0.7487,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.25,
"rewards/margins": 0.03125,
"rewards/rejected": 0.21875,
"step": 6
},
{
"epoch": 0.09395973154362416,
"grad_norm": 310.7570200892832,
"learning_rate": 3.043478260869565e-07,
"logits/chosen": 0.1708984375,
"logits/rejected": 0.6953125,
"logps/chosen": -115.0,
"logps/rejected": -178.0,
"loss": 0.7487,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.046875,
"rewards/margins": 0.015625,
"rewards/rejected": -0.0625,
"step": 7
},
{
"epoch": 0.10738255033557047,
"grad_norm": 262.65343185083736,
"learning_rate": 3.478260869565217e-07,
"logits/chosen": 1.3203125,
"logits/rejected": 0.9296875,
"logps/chosen": -80.0,
"logps/rejected": -88.5,
"loss": 0.5881,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.28125,
"rewards/margins": 0.3125,
"rewards/rejected": -0.03125,
"step": 8
},
{
"epoch": 0.12080536912751678,
"grad_norm": 360.8428876216846,
"learning_rate": 3.9130434782608694e-07,
"logits/chosen": 1.359375,
"logits/rejected": 1.09375,
"logps/chosen": -125.0,
"logps/rejected": -127.0,
"loss": 0.8363,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.125,
"rewards/margins": -0.28125,
"rewards/rejected": 0.15625,
"step": 9
},
{
"epoch": 0.1342281879194631,
"grad_norm": 255.9449987869854,
"learning_rate": 4.3478260869565214e-07,
"logits/chosen": 1.1953125,
"logits/rejected": 1.203125,
"logps/chosen": -154.0,
"logps/rejected": -121.0,
"loss": 0.571,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3125,
"rewards/margins": 0.5,
"rewards/rejected": -0.1875,
"step": 10
},
{
"epoch": 0.1476510067114094,
"grad_norm": 346.20438885823984,
"learning_rate": 4.782608695652174e-07,
"logits/chosen": 1.0,
"logits/rejected": 0.7890625,
"logps/chosen": -214.0,
"logps/rejected": -219.0,
"loss": 0.7852,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.078125,
"rewards/margins": -0.359375,
"rewards/rejected": 0.4375,
"step": 11
},
{
"epoch": 0.1610738255033557,
"grad_norm": 269.1956711192441,
"learning_rate": 5.217391304347825e-07,
"logits/chosen": 0.32421875,
"logits/rejected": 0.09716796875,
"logps/chosen": -167.0,
"logps/rejected": -137.0,
"loss": 0.637,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.375,
"rewards/margins": 0.625,
"rewards/rejected": -0.25,
"step": 12
},
{
"epoch": 0.174496644295302,
"grad_norm": 282.7338298830031,
"learning_rate": 5.652173913043477e-07,
"logits/chosen": 1.359375,
"logits/rejected": 1.015625,
"logps/chosen": -172.0,
"logps/rejected": -130.0,
"loss": 0.6496,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.5625,
"rewards/margins": 0.4375,
"rewards/rejected": 0.125,
"step": 13
},
{
"epoch": 0.18791946308724833,
"grad_norm": 309.4997861737965,
"learning_rate": 6.08695652173913e-07,
"logits/chosen": 0.04052734375,
"logits/rejected": 0.00390625,
"logps/chosen": -140.0,
"logps/rejected": -129.0,
"loss": 0.7821,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0625,
"rewards/margins": -0.15625,
"rewards/rejected": 0.21875,
"step": 14
},
{
"epoch": 0.20134228187919462,
"grad_norm": 312.46545912340383,
"learning_rate": 6.521739130434782e-07,
"logits/chosen": 0.9453125,
"logits/rejected": 0.96484375,
"logps/chosen": -121.0,
"logps/rejected": -120.0,
"loss": 0.7118,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.25,
"rewards/margins": 0.0625,
"rewards/rejected": -0.3125,
"step": 15
},
{
"epoch": 0.21476510067114093,
"grad_norm": 303.0405196118898,
"learning_rate": 6.956521739130434e-07,
"logits/chosen": -0.03515625,
"logits/rejected": 1.171875,
"logps/chosen": -101.0,
"logps/rejected": -97.5,
"loss": 0.6458,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.03125,
"rewards/margins": 0.03125,
"rewards/rejected": 0.0,
"step": 16
},
{
"epoch": 0.22818791946308725,
"grad_norm": 298.356070519565,
"learning_rate": 7.391304347826086e-07,
"logits/chosen": 0.466796875,
"logits/rejected": 0.84375,
"logps/chosen": -100.5,
"logps/rejected": -136.0,
"loss": 0.6341,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.125,
"rewards/margins": 0.125,
"rewards/rejected": 0.0,
"step": 17
},
{
"epoch": 0.24161073825503357,
"grad_norm": 282.77957696018876,
"learning_rate": 7.826086956521739e-07,
"logits/chosen": 0.8046875,
"logits/rejected": 0.625,
"logps/chosen": -99.5,
"logps/rejected": -136.0,
"loss": 0.7319,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.25,
"rewards/margins": -0.1875,
"rewards/rejected": -0.0625,
"step": 18
},
{
"epoch": 0.2550335570469799,
"grad_norm": 258.05534960333887,
"learning_rate": 8.260869565217391e-07,
"logits/chosen": 1.59375,
"logits/rejected": 1.5703125,
"logps/chosen": -156.0,
"logps/rejected": -133.0,
"loss": 0.5654,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.125,
"rewards/margins": 0.3125,
"rewards/rejected": -0.4375,
"step": 19
},
{
"epoch": 0.2684563758389262,
"grad_norm": 333.8352033041929,
"learning_rate": 8.695652173913043e-07,
"logits/chosen": 1.046875,
"logits/rejected": 1.3125,
"logps/chosen": -129.0,
"logps/rejected": -136.0,
"loss": 0.7298,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.125,
"rewards/margins": 0.1875,
"rewards/rejected": -0.0625,
"step": 20
},
{
"epoch": 0.28187919463087246,
"grad_norm": 305.6684446174624,
"learning_rate": 9.130434782608695e-07,
"logits/chosen": 0.6328125,
"logits/rejected": 0.9375,
"logps/chosen": -117.0,
"logps/rejected": -180.0,
"loss": 0.7222,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.125,
"rewards/margins": -0.125,
"rewards/rejected": 0.25,
"step": 21
},
{
"epoch": 0.2953020134228188,
"grad_norm": 260.1289506856149,
"learning_rate": 9.565217391304349e-07,
"logits/chosen": 0.70703125,
"logits/rejected": 1.1328125,
"logps/chosen": -35.0,
"logps/rejected": -46.25,
"loss": 0.6589,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.05859375,
"rewards/margins": 0.16796875,
"rewards/rejected": -0.109375,
"step": 22
},
{
"epoch": 0.3087248322147651,
"grad_norm": 278.34995613754035,
"learning_rate": 1e-06,
"logits/chosen": 0.75390625,
"logits/rejected": 0.73046875,
"logps/chosen": -156.0,
"logps/rejected": -152.0,
"loss": 0.6354,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.40625,
"rewards/margins": 0.84375,
"rewards/rejected": -0.4375,
"step": 23
},
{
"epoch": 0.3221476510067114,
"grad_norm": 233.06948826427978,
"learning_rate": 9.999376947588285e-07,
"logits/chosen": -0.04638671875,
"logits/rejected": 0.1689453125,
"logps/chosen": -92.0,
"logps/rejected": -136.0,
"loss": 0.5417,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0,
"rewards/margins": 0.4375,
"rewards/rejected": -0.4375,
"step": 24
},
{
"epoch": 0.33557046979865773,
"grad_norm": 278.50973158832136,
"learning_rate": 9.99750794563087e-07,
"logits/chosen": 0.8046875,
"logits/rejected": 1.421875,
"logps/chosen": -127.5,
"logps/rejected": -162.0,
"loss": 0.6742,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0625,
"rewards/margins": 0.1875,
"rewards/rejected": -0.125,
"step": 25
},
{
"epoch": 0.348993288590604,
"grad_norm": 269.19679796957723,
"learning_rate": 9.994393459922216e-07,
"logits/chosen": 0.578125,
"logits/rejected": 0.37890625,
"logps/chosen": -95.0,
"logps/rejected": -110.0,
"loss": 0.5146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1875,
"rewards/margins": 0.875,
"rewards/rejected": -0.6875,
"step": 26
},
{
"epoch": 0.3624161073825503,
"grad_norm": 196.4517902957254,
"learning_rate": 9.990034266657467e-07,
"logits/chosen": 1.15625,
"logits/rejected": 1.09375,
"logps/chosen": -141.0,
"logps/rejected": -149.0,
"loss": 0.4653,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09375,
"rewards/margins": 0.5625,
"rewards/rejected": -0.65625,
"step": 27
},
{
"epoch": 0.37583892617449666,
"grad_norm": 210.82144225997905,
"learning_rate": 9.984431452238966e-07,
"logits/chosen": 0.4453125,
"logits/rejected": 0.453125,
"logps/chosen": -112.0,
"logps/rejected": -136.0,
"loss": 0.4992,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.125,
"rewards/margins": 0.1875,
"rewards/rejected": -0.3125,
"step": 28
},
{
"epoch": 0.38926174496644295,
"grad_norm": 229.4387268433427,
"learning_rate": 9.97758641300553e-07,
"logits/chosen": 0.0400390625,
"logits/rejected": 0.234375,
"logps/chosen": -77.0,
"logps/rejected": -85.0,
"loss": 0.4958,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15625,
"rewards/margins": 0.25,
"rewards/rejected": -0.40625,
"step": 29
},
{
"epoch": 0.40268456375838924,
"grad_norm": 224.2234125086321,
"learning_rate": 9.96950085488444e-07,
"logits/chosen": 0.0966796875,
"logits/rejected": 0.12158203125,
"logps/chosen": -233.0,
"logps/rejected": -192.0,
"loss": 0.4652,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.5,
"rewards/margins": 0.625,
"rewards/rejected": -0.125,
"step": 30
},
{
"epoch": 0.4161073825503356,
"grad_norm": 218.79146408518088,
"learning_rate": 9.960176792966288e-07,
"logits/chosen": 0.376953125,
"logits/rejected": 0.76953125,
"logps/chosen": -151.0,
"logps/rejected": -186.0,
"loss": 0.4307,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.78125,
"rewards/margins": 1.96875,
"rewards/rejected": -1.1875,
"step": 31
},
{
"epoch": 0.42953020134228187,
"grad_norm": 234.76035113582137,
"learning_rate": 9.949616551002785e-07,
"logits/chosen": 1.484375,
"logits/rejected": 2.125,
"logps/chosen": -115.5,
"logps/rejected": -128.0,
"loss": 0.5723,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1875,
"rewards/margins": 0.6875,
"rewards/rejected": -0.875,
"step": 32
},
{
"epoch": 0.4429530201342282,
"grad_norm": 265.78924153069073,
"learning_rate": 9.937822760827619e-07,
"logits/chosen": 1.34375,
"logits/rejected": 1.875,
"logps/chosen": -96.0,
"logps/rejected": -131.0,
"loss": 0.5628,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09375,
"rewards/margins": 0.125,
"rewards/rejected": -0.21875,
"step": 33
},
{
"epoch": 0.4563758389261745,
"grad_norm": 188.15510343729397,
"learning_rate": 9.924798361700554e-07,
"logits/chosen": 1.390625,
"logits/rejected": 1.40625,
"logps/chosen": -160.0,
"logps/rejected": -155.0,
"loss": 0.4393,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3125,
"rewards/margins": 0.1875,
"rewards/rejected": -0.5,
"step": 34
},
{
"epoch": 0.4697986577181208,
"grad_norm": 184.26566167062134,
"learning_rate": 9.910546599574902e-07,
"logits/chosen": 0.953125,
"logits/rejected": 0.95703125,
"logps/chosen": -85.0,
"logps/rejected": -122.5,
"loss": 0.4133,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.171875,
"rewards/margins": 0.453125,
"rewards/rejected": -0.28125,
"step": 35
},
{
"epoch": 0.48322147651006714,
"grad_norm": 175.4002282124261,
"learning_rate": 9.895071026288573e-07,
"logits/chosen": 0.00640869140625,
"logits/rejected": 0.4453125,
"logps/chosen": -152.0,
"logps/rejected": -196.0,
"loss": 0.2941,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6875,
"rewards/margins": 2.4375,
"rewards/rejected": -1.75,
"step": 36
},
{
"epoch": 0.4966442953020134,
"grad_norm": 224.60918154752224,
"learning_rate": 9.878375498678867e-07,
"logits/chosen": -0.1337890625,
"logits/rejected": 0.443359375,
"logps/chosen": -166.0,
"logps/rejected": -216.0,
"loss": 0.3926,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.5,
"rewards/margins": 1.375,
"rewards/rejected": -0.875,
"step": 37
},
{
"epoch": 0.5100671140939598,
"grad_norm": 246.2589461213588,
"learning_rate": 9.860464177621284e-07,
"logits/chosen": 0.578125,
"logits/rejected": 0.3203125,
"logps/chosen": -126.0,
"logps/rejected": -109.5,
"loss": 0.443,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.09375,
"rewards/margins": 1.375,
"rewards/rejected": -1.28125,
"step": 38
},
{
"epoch": 0.5234899328859061,
"grad_norm": 166.84341628834525,
"learning_rate": 9.841341526992535e-07,
"logits/chosen": 0.236328125,
"logits/rejected": 0.8203125,
"logps/chosen": -106.0,
"logps/rejected": -133.0,
"loss": 0.3483,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3125,
"rewards/margins": 0.5625,
"rewards/rejected": -0.875,
"step": 39
},
{
"epoch": 0.5369127516778524,
"grad_norm": 200.39914415959888,
"learning_rate": 9.821012312558059e-07,
"logits/chosen": 0.96875,
"logits/rejected": 0.59765625,
"logps/chosen": -120.0,
"logps/rejected": -106.0,
"loss": 0.4129,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.1875,
"rewards/margins": 0.34375,
"rewards/rejected": -0.15625,
"step": 40
},
{
"epoch": 0.5503355704697986,
"grad_norm": 152.05567773094523,
"learning_rate": 9.799481600784286e-07,
"logits/chosen": 1.6875,
"logits/rejected": 2.09375,
"logps/chosen": -128.0,
"logps/rejected": -258.0,
"loss": 0.2243,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.375,
"rewards/margins": 3.0625,
"rewards/rejected": -2.6875,
"step": 41
},
{
"epoch": 0.5637583892617449,
"grad_norm": 199.2294955589925,
"learning_rate": 9.776754757575973e-07,
"logits/chosen": 0.9609375,
"logits/rejected": 1.1796875,
"logps/chosen": -150.0,
"logps/rejected": -166.0,
"loss": 0.3205,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.140625,
"rewards/margins": 0.734375,
"rewards/rejected": -0.875,
"step": 42
},
{
"epoch": 0.5771812080536913,
"grad_norm": 157.50000350180815,
"learning_rate": 9.752837446938914e-07,
"logits/chosen": 0.6640625,
"logits/rejected": 0.57421875,
"logps/chosen": -108.0,
"logps/rejected": -132.0,
"loss": 0.3333,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.125,
"rewards/margins": 0.8125,
"rewards/rejected": -0.9375,
"step": 43
},
{
"epoch": 0.5906040268456376,
"grad_norm": 196.50929153843916,
"learning_rate": 9.727735629568335e-07,
"logits/chosen": 0.33203125,
"logits/rejected": 0.396484375,
"logps/chosen": -146.0,
"logps/rejected": -141.0,
"loss": 0.45,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.8125,
"rewards/margins": 1.25,
"rewards/rejected": -0.4375,
"step": 44
},
{
"epoch": 0.6040268456375839,
"grad_norm": 137.27969044382448,
"learning_rate": 9.701455561363377e-07,
"logits/chosen": 0.0810546875,
"logits/rejected": 0.1806640625,
"logps/chosen": -132.0,
"logps/rejected": -156.0,
"loss": 0.247,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5625,
"rewards/margins": 2.6875,
"rewards/rejected": -1.125,
"step": 45
},
{
"epoch": 0.6174496644295302,
"grad_norm": 133.9292244204905,
"learning_rate": 9.67400379186799e-07,
"logits/chosen": -0.921875,
"logits/rejected": -0.79296875,
"logps/chosen": -74.0,
"logps/rejected": -82.0,
"loss": 0.1977,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.515625,
"rewards/margins": 2.28125,
"rewards/rejected": -0.765625,
"step": 46
},
{
"epoch": 0.6308724832214765,
"grad_norm": 189.9282156190453,
"learning_rate": 9.645387162638652e-07,
"logits/chosen": 0.6953125,
"logits/rejected": 0.8359375,
"logps/chosen": -178.0,
"logps/rejected": -160.0,
"loss": 0.2958,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.625,
"rewards/margins": 1.84375,
"rewards/rejected": -1.21875,
"step": 47
},
{
"epoch": 0.6442953020134228,
"grad_norm": 145.8536278840751,
"learning_rate": 9.615612805539303e-07,
"logits/chosen": 1.15625,
"logits/rejected": 1.8125,
"logps/chosen": -82.5,
"logps/rejected": -121.0,
"loss": 0.2335,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5859375,
"rewards/margins": 1.6640625,
"rewards/rejected": -1.078125,
"step": 48
},
{
"epoch": 0.6577181208053692,
"grad_norm": 168.2368559847673,
"learning_rate": 9.584688140963944e-07,
"logits/chosen": 0.029296875,
"logits/rejected": 0.6640625,
"logps/chosen": -152.0,
"logps/rejected": -176.0,
"loss": 0.2925,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6875,
"rewards/margins": 5.0,
"rewards/rejected": -2.3125,
"step": 49
},
{
"epoch": 0.6711409395973155,
"grad_norm": 90.26685617771182,
"learning_rate": 9.552620875987312e-07,
"logits/chosen": 0.7109375,
"logits/rejected": 1.0859375,
"logps/chosen": -155.0,
"logps/rejected": -189.0,
"loss": 0.1759,
"rewards/accuracies": 0.5,
"rewards/chosen": 2.8125,
"rewards/margins": 2.875,
"rewards/rejected": -0.0625,
"step": 50
},
{
"epoch": 0.6845637583892618,
"grad_norm": 161.68678516400212,
"learning_rate": 9.519419002444118e-07,
"logits/chosen": 0.765625,
"logits/rejected": 1.5,
"logps/chosen": -105.0,
"logps/rejected": -186.0,
"loss": 0.2773,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0625,
"rewards/margins": 3.25,
"rewards/rejected": -1.1875,
"step": 51
},
{
"epoch": 0.697986577181208,
"grad_norm": 116.30317809994341,
"learning_rate": 9.485090794937317e-07,
"logits/chosen": 0.369140625,
"logits/rejected": 1.015625,
"logps/chosen": -147.0,
"logps/rejected": -158.0,
"loss": 0.1838,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4375,
"rewards/margins": 4.875,
"rewards/rejected": -1.4375,
"step": 52
},
{
"epoch": 0.7114093959731543,
"grad_norm": 115.5453240211095,
"learning_rate": 9.4496448087759e-07,
"logits/chosen": 1.0859375,
"logits/rejected": 1.4296875,
"logps/chosen": -158.0,
"logps/rejected": -206.0,
"loss": 0.1532,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3125,
"rewards/margins": 4.4375,
"rewards/rejected": -4.125,
"step": 53
},
{
"epoch": 0.7248322147651006,
"grad_norm": 151.00137937073285,
"learning_rate": 9.413089877842735e-07,
"logits/chosen": 0.8984375,
"logits/rejected": 0.67578125,
"logps/chosen": -156.0,
"logps/rejected": -166.0,
"loss": 0.2604,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5,
"rewards/margins": 1.9375,
"rewards/rejected": -2.4375,
"step": 54
},
{
"epoch": 0.738255033557047,
"grad_norm": 121.8761212766047,
"learning_rate": 9.375435112392969e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.1171875,
"logps/chosen": -123.0,
"logps/rejected": -236.0,
"loss": 0.1429,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.25,
"rewards/margins": 7.5,
"rewards/rejected": -6.25,
"step": 55
},
{
"epoch": 0.7516778523489933,
"grad_norm": 189.03161073826737,
"learning_rate": 9.336689896783572e-07,
"logits/chosen": -0.6328125,
"logits/rejected": -0.7109375,
"logps/chosen": -197.0,
"logps/rejected": -187.0,
"loss": 0.2165,
"rewards/accuracies": 0.25,
"rewards/chosen": 2.25,
"rewards/margins": 2.71875,
"rewards/rejected": -0.46875,
"step": 56
},
{
"epoch": 0.7651006711409396,
"grad_norm": 107.72885924741587,
"learning_rate": 9.29686388713456e-07,
"logits/chosen": -0.244140625,
"logits/rejected": 0.26171875,
"logps/chosen": -116.0,
"logps/rejected": -168.0,
"loss": 0.1597,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.5,
"rewards/margins": 3.125,
"rewards/rejected": -1.625,
"step": 57
},
{
"epoch": 0.7785234899328859,
"grad_norm": 80.02513282586197,
"learning_rate": 9.255967008922473e-07,
"logits/chosen": 0.025390625,
"logits/rejected": -0.0322265625,
"logps/chosen": -260.0,
"logps/rejected": -272.0,
"loss": 0.1172,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.5,
"rewards/margins": 11.875,
"rewards/rejected": -5.375,
"step": 58
},
{
"epoch": 0.7919463087248322,
"grad_norm": 61.82855219508814,
"learning_rate": 9.214009454506752e-07,
"logits/chosen": 0.359375,
"logits/rejected": 0.55078125,
"logps/chosen": -134.0,
"logps/rejected": -162.0,
"loss": 0.0776,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4375,
"rewards/margins": 5.875,
"rewards/rejected": -2.4375,
"step": 59
},
{
"epoch": 0.8053691275167785,
"grad_norm": 87.5310679148203,
"learning_rate": 9.171001680589587e-07,
"logits/chosen": 1.515625,
"logits/rejected": 1.359375,
"logps/chosen": -144.0,
"logps/rejected": -170.0,
"loss": 0.086,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8125,
"rewards/margins": 6.0625,
"rewards/rejected": -5.25,
"step": 60
},
{
"epoch": 0.8187919463087249,
"grad_norm": 94.98485933507177,
"learning_rate": 9.126954405609882e-07,
"logits/chosen": -1.328125,
"logits/rejected": -1.6640625,
"logps/chosen": -115.0,
"logps/rejected": -89.0,
"loss": 0.1292,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.59375,
"rewards/margins": 4.6875,
"rewards/rejected": -5.3125,
"step": 61
},
{
"epoch": 0.8322147651006712,
"grad_norm": 84.08521076849105,
"learning_rate": 9.081878607071995e-07,
"logits/chosen": 0.8828125,
"logits/rejected": 1.6171875,
"logps/chosen": -87.0,
"logps/rejected": -130.0,
"loss": 0.108,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3125,
"rewards/margins": 3.0625,
"rewards/rejected": -2.75,
"step": 62
},
{
"epoch": 0.8456375838926175,
"grad_norm": 71.02125606408353,
"learning_rate": 9.035785518809926e-07,
"logits/chosen": 0.154296875,
"logits/rejected": 0.578125,
"logps/chosen": -118.0,
"logps/rejected": -149.0,
"loss": 0.078,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21875,
"rewards/margins": 2.40625,
"rewards/rejected": -2.1875,
"step": 63
},
{
"epoch": 0.8590604026845637,
"grad_norm": 94.51081771355827,
"learning_rate": 8.988686628187596e-07,
"logits/chosen": -0.37890625,
"logits/rejected": 0.072265625,
"logps/chosen": -98.0,
"logps/rejected": -118.5,
"loss": 0.0861,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.5,
"rewards/margins": 3.09375,
"rewards/rejected": -2.59375,
"step": 64
},
{
"epoch": 0.87248322147651,
"grad_norm": 166.0543842640925,
"learning_rate": 8.940593673235961e-07,
"logits/chosen": 1.1796875,
"logits/rejected": 1.28125,
"logps/chosen": -61.0,
"logps/rejected": -74.0,
"loss": 0.2046,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.90625,
"rewards/margins": 2.15625,
"rewards/rejected": -3.0625,
"step": 65
},
{
"epoch": 0.8859060402684564,
"grad_norm": 55.221935410851465,
"learning_rate": 8.891518639727649e-07,
"logits/chosen": 0.2421875,
"logits/rejected": 0.1103515625,
"logps/chosen": -237.0,
"logps/rejected": -234.0,
"loss": 0.0529,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.5625,
"rewards/margins": 7.0,
"rewards/rejected": -2.4375,
"step": 66
},
{
"epoch": 0.8993288590604027,
"grad_norm": 136.84606095636403,
"learning_rate": 8.841473758189852e-07,
"logits/chosen": 1.078125,
"logits/rejected": 1.3828125,
"logps/chosen": -96.0,
"logps/rejected": -127.5,
"loss": 0.0831,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.125,
"rewards/margins": 5.09375,
"rewards/rejected": -2.96875,
"step": 67
},
{
"epoch": 0.912751677852349,
"grad_norm": 72.8783809894428,
"learning_rate": 8.790471500856227e-07,
"logits/chosen": -0.30078125,
"logits/rejected": 0.2578125,
"logps/chosen": -106.0,
"logps/rejected": -152.0,
"loss": 0.0813,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.625,
"rewards/margins": 6.0625,
"rewards/rejected": -2.4375,
"step": 68
},
{
"epoch": 0.9261744966442953,
"grad_norm": 149.40438009877843,
"learning_rate": 8.738524578558546e-07,
"logits/chosen": 0.30859375,
"logits/rejected": 0.8359375,
"logps/chosen": -118.5,
"logps/rejected": -167.0,
"loss": 0.1764,
"rewards/accuracies": 0.75,
"rewards/chosen": 2.875,
"rewards/margins": 9.8125,
"rewards/rejected": -6.9375,
"step": 69
},
{
"epoch": 0.9395973154362416,
"grad_norm": 110.9947650787701,
"learning_rate": 8.685645937558894e-07,
"logits/chosen": 1.5,
"logits/rejected": 2.1875,
"logps/chosen": -200.0,
"logps/rejected": -210.0,
"loss": 0.1196,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.25,
"rewards/margins": 5.375,
"rewards/rejected": -4.125,
"step": 70
},
{
"epoch": 0.9530201342281879,
"grad_norm": 67.29974977847421,
"learning_rate": 8.631848756323197e-07,
"logits/chosen": -0.119140625,
"logits/rejected": 0.61328125,
"logps/chosen": -83.0,
"logps/rejected": -134.0,
"loss": 0.0889,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.34375,
"rewards/margins": 4.09375,
"rewards/rejected": -4.4375,
"step": 71
},
{
"epoch": 0.9664429530201343,
"grad_norm": 67.29089440200656,
"learning_rate": 8.577146442236856e-07,
"logits/chosen": 0.12109375,
"logits/rejected": 0.20703125,
"logps/chosen": -99.0,
"logps/rejected": -122.0,
"loss": 0.0882,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.71875,
"rewards/margins": 3.59375,
"rewards/rejected": -1.875,
"step": 72
},
{
"epoch": 0.9798657718120806,
"grad_norm": 30.621291076198204,
"learning_rate": 8.521552628263361e-07,
"logits/chosen": 1.1015625,
"logits/rejected": 1.0625,
"logps/chosen": -121.0,
"logps/rejected": -166.0,
"loss": 0.039,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5,
"rewards/margins": 4.0,
"rewards/rejected": -2.5,
"step": 73
},
{
"epoch": 0.9932885906040269,
"grad_norm": 188.921461152021,
"learning_rate": 8.465081169546658e-07,
"logits/chosen": 0.921875,
"logits/rejected": 0.9609375,
"logps/chosen": -87.5,
"logps/rejected": -100.5,
"loss": 0.0981,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8125,
"rewards/margins": 4.59375,
"rewards/rejected": -2.78125,
"step": 74
},
{
"epoch": 1.0067114093959733,
"grad_norm": 16.342940458429116,
"learning_rate": 8.407746139958168e-07,
"logits/chosen": 0.71484375,
"logits/rejected": 1.2109375,
"logps/chosen": -98.0,
"logps/rejected": -92.0,
"loss": 0.0157,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.03125,
"rewards/margins": 4.8125,
"rewards/rejected": -3.78125,
"step": 75
},
{
"epoch": 1.0201342281879195,
"grad_norm": 7.695753089891094,
"learning_rate": 8.349561828589275e-07,
"logits/chosen": 0.7421875,
"logits/rejected": 0.96484375,
"logps/chosen": -114.5,
"logps/rejected": -143.0,
"loss": 0.0086,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.03125,
"rewards/margins": 7.40625,
"rewards/rejected": -5.375,
"step": 76
},
{
"epoch": 1.0335570469798658,
"grad_norm": 3.97920779662753,
"learning_rate": 8.290542736190188e-07,
"logits/chosen": 0.044921875,
"logits/rejected": -0.158203125,
"logps/chosen": -89.0,
"logps/rejected": -124.0,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.53125,
"rewards/margins": 10.125,
"rewards/rejected": -4.5625,
"step": 77
},
{
"epoch": 1.0469798657718121,
"grad_norm": 5.178156118093246,
"learning_rate": 8.230703571556048e-07,
"logits/chosen": 0.578125,
"logits/rejected": -0.1552734375,
"logps/chosen": -136.0,
"logps/rejected": -185.0,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0625,
"rewards/margins": 8.875,
"rewards/rejected": -6.8125,
"step": 78
},
{
"epoch": 1.0604026845637584,
"grad_norm": 5.363603356053278,
"learning_rate": 8.170059247861193e-07,
"logits/chosen": 0.44140625,
"logits/rejected": 0.53515625,
"logps/chosen": -108.0,
"logps/rejected": -127.5,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.125,
"rewards/margins": 8.5,
"rewards/rejected": -5.375,
"step": 79
},
{
"epoch": 1.0738255033557047,
"grad_norm": 1.1742657635295901,
"learning_rate": 8.108624878942476e-07,
"logits/chosen": 1.1875,
"logits/rejected": 1.4765625,
"logps/chosen": -81.5,
"logps/rejected": -118.5,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0625,
"rewards/margins": 8.875,
"rewards/rejected": -6.8125,
"step": 80
},
{
"epoch": 1.087248322147651,
"grad_norm": 2.2188166194810006,
"learning_rate": 8.046415775532584e-07,
"logits/chosen": -0.21875,
"logits/rejected": -0.111328125,
"logps/chosen": -105.0,
"logps/rejected": -140.0,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.515625,
"rewards/margins": 10.25,
"rewards/rejected": -6.6875,
"step": 81
},
{
"epoch": 1.1006711409395973,
"grad_norm": 18.55303933040324,
"learning_rate": 7.98344744144428e-07,
"logits/chosen": 0.318359375,
"logits/rejected": -0.24609375,
"logps/chosen": -156.0,
"logps/rejected": -145.0,
"loss": 0.0161,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.125,
"rewards/margins": 9.8125,
"rewards/rejected": -4.6875,
"step": 82
},
{
"epoch": 1.1140939597315436,
"grad_norm": 2.5907801972535665,
"learning_rate": 7.919735569706532e-07,
"logits/chosen": -0.443359375,
"logits/rejected": 0.02734375,
"logps/chosen": -93.5,
"logps/rejected": -132.0,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.375,
"rewards/margins": 21.125,
"rewards/rejected": -11.8125,
"step": 83
},
{
"epoch": 1.1275167785234899,
"grad_norm": 1.474856541528767,
"learning_rate": 7.855296038653473e-07,
"logits/chosen": 0.3515625,
"logits/rejected": 0.703125,
"logps/chosen": -111.0,
"logps/rejected": -143.0,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0,
"rewards/margins": 11.0,
"rewards/rejected": -7.03125,
"step": 84
},
{
"epoch": 1.1409395973154361,
"grad_norm": 1.252650574794432,
"learning_rate": 7.7901449079672e-07,
"logits/chosen": 0.37890625,
"logits/rejected": 0.71875,
"logps/chosen": -101.0,
"logps/rejected": -126.0,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.75,
"rewards/margins": 9.25,
"rewards/rejected": -7.53125,
"step": 85
},
{
"epoch": 1.1543624161073827,
"grad_norm": 4.532807290474358,
"learning_rate": 7.724298414675352e-07,
"logits/chosen": 0.90234375,
"logits/rejected": 1.6328125,
"logps/chosen": -154.0,
"logps/rejected": -164.0,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.1875,
"rewards/margins": 9.75,
"rewards/rejected": -5.5625,
"step": 86
},
{
"epoch": 1.167785234899329,
"grad_norm": 12.257533233326445,
"learning_rate": 7.657772969104507e-07,
"logits/chosen": -0.025390625,
"logits/rejected": 0.126953125,
"logps/chosen": -241.0,
"logps/rejected": -282.0,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.125,
"rewards/margins": 16.25,
"rewards/rejected": -13.125,
"step": 87
},
{
"epoch": 1.1812080536912752,
"grad_norm": 3.2594066581738734,
"learning_rate": 7.590585150790387e-07,
"logits/chosen": -0.06689453125,
"logits/rejected": 0.0859375,
"logps/chosen": -128.0,
"logps/rejected": -126.5,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.96875,
"rewards/margins": 11.125,
"rewards/rejected": -8.125,
"step": 88
},
{
"epoch": 1.1946308724832215,
"grad_norm": 1.689678600897324,
"learning_rate": 7.522751704345887e-07,
"logits/chosen": 1.1328125,
"logits/rejected": 0.359375,
"logps/chosen": -117.0,
"logps/rejected": -112.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6875,
"rewards/margins": 11.8125,
"rewards/rejected": -7.15625,
"step": 89
},
{
"epoch": 1.2080536912751678,
"grad_norm": 12.90463794364264,
"learning_rate": 7.454289535287967e-07,
"logits/chosen": 0.032958984375,
"logits/rejected": 0.7265625,
"logps/chosen": -130.0,
"logps/rejected": -206.0,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 11.5,
"rewards/rejected": -6.5625,
"step": 90
},
{
"epoch": 1.221476510067114,
"grad_norm": 0.7368569495398003,
"learning_rate": 7.385215705824448e-07,
"logits/chosen": 1.0078125,
"logits/rejected": 1.3359375,
"logps/chosen": -103.0,
"logps/rejected": -175.0,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.78125,
"rewards/margins": 8.625,
"rewards/rejected": -7.875,
"step": 91
},
{
"epoch": 1.2348993288590604,
"grad_norm": 10.607931466543848,
"learning_rate": 7.315547430601738e-07,
"logits/chosen": 1.09375,
"logits/rejected": 1.7734375,
"logps/chosen": -147.0,
"logps/rejected": -205.0,
"loss": 0.007,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5,
"rewards/margins": 6.625,
"rewards/rejected": -5.125,
"step": 92
},
{
"epoch": 1.2483221476510067,
"grad_norm": 4.33218010786588,
"learning_rate": 7.245302072414601e-07,
"logits/chosen": 0.44921875,
"logits/rejected": 0.7734375,
"logps/chosen": -192.0,
"logps/rejected": -202.0,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.21875,
"rewards/margins": 13.125,
"rewards/rejected": -9.9375,
"step": 93
},
{
"epoch": 1.261744966442953,
"grad_norm": 2.9637993733005548,
"learning_rate": 7.174497137878965e-07,
"logits/chosen": 0.5703125,
"logits/rejected": 0.431640625,
"logps/chosen": -160.0,
"logps/rejected": -161.0,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1875,
"rewards/margins": 8.625,
"rewards/rejected": -8.8125,
"step": 94
},
{
"epoch": 1.2751677852348993,
"grad_norm": 19.555003982399544,
"learning_rate": 7.103150273068921e-07,
"logits/chosen": -1.0,
"logits/rejected": -0.3046875,
"logps/chosen": -92.5,
"logps/rejected": -170.0,
"loss": 0.0109,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.53125,
"rewards/margins": 11.1875,
"rewards/rejected": -11.75,
"step": 95
},
{
"epoch": 1.2885906040268456,
"grad_norm": 5.018942211058499,
"learning_rate": 7.031279259118946e-07,
"logits/chosen": -1.0078125,
"logits/rejected": -1.40625,
"logps/chosen": -128.0,
"logps/rejected": -155.0,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.375,
"rewards/margins": 18.0,
"rewards/rejected": -19.5,
"step": 96
},
{
"epoch": 1.302013422818792,
"grad_norm": 13.668642092116187,
"learning_rate": 6.958902007792465e-07,
"logits/chosen": 1.4609375,
"logits/rejected": 1.015625,
"logps/chosen": -140.0,
"logps/rejected": -156.0,
"loss": 0.0087,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6875,
"rewards/margins": 9.5,
"rewards/rejected": -6.8125,
"step": 97
},
{
"epoch": 1.3154362416107381,
"grad_norm": 7.957898712794316,
"learning_rate": 6.886036557017881e-07,
"logits/chosen": 0.50390625,
"logits/rejected": 1.0,
"logps/chosen": -142.0,
"logps/rejected": -182.0,
"loss": 0.0065,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6875,
"rewards/margins": 9.375,
"rewards/rejected": -7.6875,
"step": 98
},
{
"epoch": 1.3288590604026846,
"grad_norm": 35.374516908422066,
"learning_rate": 6.812701066393123e-07,
"logits/chosen": -0.64453125,
"logits/rejected": 0.169921875,
"logps/chosen": -109.5,
"logps/rejected": -128.0,
"loss": 0.03,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.375,
"rewards/margins": 12.25,
"rewards/rejected": -9.875,
"step": 99
},
{
"epoch": 1.342281879194631,
"grad_norm": 8.489290840205843,
"learning_rate": 6.738913812659912e-07,
"logits/chosen": -0.1982421875,
"logits/rejected": -0.29296875,
"logps/chosen": -157.0,
"logps/rejected": -186.0,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.484375,
"rewards/margins": 9.5625,
"rewards/rejected": -7.09375,
"step": 100
},
{
"epoch": 1.3557046979865772,
"grad_norm": 1.273570981721026,
"learning_rate": 6.664693185148806e-07,
"logits/chosen": -0.9375,
"logits/rejected": -0.6171875,
"logps/chosen": -71.0,
"logps/rejected": -99.0,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.328125,
"rewards/margins": 11.1875,
"rewards/rejected": -14.5,
"step": 101
},
{
"epoch": 1.3691275167785235,
"grad_norm": 12.691638946077195,
"learning_rate": 6.590057681196191e-07,
"logits/chosen": -0.59765625,
"logits/rejected": -0.0966796875,
"logps/chosen": -148.0,
"logps/rejected": -167.0,
"loss": 0.0087,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9375,
"rewards/margins": 13.8125,
"rewards/rejected": -9.875,
"step": 102
},
{
"epoch": 1.3825503355704698,
"grad_norm": 1.194548914112796,
"learning_rate": 6.515025901534363e-07,
"logits/chosen": -0.6953125,
"logits/rejected": -0.671875,
"logps/chosen": -89.0,
"logps/rejected": -136.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.46875,
"rewards/margins": 15.375,
"rewards/rejected": -10.875,
"step": 103
},
{
"epoch": 1.395973154362416,
"grad_norm": 4.105125773983479,
"learning_rate": 6.439616545655833e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.078125,
"logps/chosen": -100.5,
"logps/rejected": -170.0,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3125,
"rewards/margins": 18.75,
"rewards/rejected": -16.5,
"step": 104
},
{
"epoch": 1.4093959731543624,
"grad_norm": 4.804881345080172,
"learning_rate": 6.363848407153017e-07,
"logits/chosen": 0.5234375,
"logits/rejected": 1.0546875,
"logps/chosen": -115.0,
"logps/rejected": -118.0,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5625,
"rewards/margins": 9.625,
"rewards/rejected": -6.0625,
"step": 105
},
{
"epoch": 1.4228187919463087,
"grad_norm": 0.9972861709658724,
"learning_rate": 6.287740369034485e-07,
"logits/chosen": -0.095703125,
"logits/rejected": 0.15625,
"logps/chosen": -117.5,
"logps/rejected": -174.0,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5,
"rewards/margins": 19.25,
"rewards/rejected": -15.625,
"step": 106
},
{
"epoch": 1.436241610738255,
"grad_norm": 7.783849954903146,
"learning_rate": 6.211311399018916e-07,
"logits/chosen": -0.8515625,
"logits/rejected": -0.51953125,
"logps/chosen": -142.0,
"logps/rejected": -186.0,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9375,
"rewards/margins": 15.25,
"rewards/rejected": -10.375,
"step": 107
},
{
"epoch": 1.4496644295302015,
"grad_norm": 0.2393104816166644,
"learning_rate": 6.13458054480795e-07,
"logits/chosen": -0.3828125,
"logits/rejected": -0.267578125,
"logps/chosen": -160.0,
"logps/rejected": -184.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.5625,
"rewards/margins": 15.5,
"rewards/rejected": -10.9375,
"step": 108
},
{
"epoch": 1.4630872483221475,
"grad_norm": 0.84206416329658,
"learning_rate": 6.057566929339095e-07,
"logits/chosen": -0.17578125,
"logits/rejected": -0.24609375,
"logps/chosen": -106.5,
"logps/rejected": -150.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.59375,
"rewards/margins": 9.25,
"rewards/rejected": -7.625,
"step": 109
},
{
"epoch": 1.476510067114094,
"grad_norm": 1.2327719606392833,
"learning_rate": 5.980289746019891e-07,
"logits/chosen": -0.98046875,
"logits/rejected": -1.2109375,
"logps/chosen": -100.0,
"logps/rejected": -113.0,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.0625,
"rewards/margins": 20.375,
"rewards/rejected": -9.375,
"step": 110
},
{
"epoch": 1.4899328859060403,
"grad_norm": 0.14191543825131345,
"learning_rate": 5.902768253944511e-07,
"logits/chosen": -0.380859375,
"logits/rejected": 0.0791015625,
"logps/chosen": -144.0,
"logps/rejected": -154.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.125,
"rewards/margins": 9.625,
"rewards/rejected": -4.53125,
"step": 111
},
{
"epoch": 1.5033557046979866,
"grad_norm": 2.434514405857578,
"learning_rate": 5.825021773093996e-07,
"logits/chosen": -0.734375,
"logits/rejected": 0.03515625,
"logps/chosen": -80.5,
"logps/rejected": -246.0,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5625,
"rewards/margins": 16.5,
"rewards/rejected": -12.875,
"step": 112
},
{
"epoch": 1.516778523489933,
"grad_norm": 0.7697614239216926,
"learning_rate": 5.747069679521305e-07,
"logits/chosen": 1.453125,
"logits/rejected": 1.296875,
"logps/chosen": -130.0,
"logps/rejected": -166.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.53125,
"rewards/margins": 11.5,
"rewards/rejected": -8.9375,
"step": 113
},
{
"epoch": 1.5302013422818792,
"grad_norm": 15.870268453862796,
"learning_rate": 5.668931400522395e-07,
"logits/chosen": -0.03515625,
"logits/rejected": -0.65625,
"logps/chosen": -148.0,
"logps/rejected": -158.0,
"loss": 0.0084,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.8125,
"rewards/margins": 23.625,
"rewards/rejected": -15.875,
"step": 114
},
{
"epoch": 1.5436241610738255,
"grad_norm": 0.7970654918372324,
"learning_rate": 5.59062640979454e-07,
"logits/chosen": -0.35546875,
"logits/rejected": 0.099609375,
"logps/chosen": -138.0,
"logps/rejected": -167.0,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0625,
"rewards/margins": 13.1875,
"rewards/rejected": -11.125,
"step": 115
},
{
"epoch": 1.5570469798657718,
"grad_norm": 1.3955582466387715,
"learning_rate": 5.512174222583066e-07,
"logits/chosen": 0.58203125,
"logits/rejected": -0.02392578125,
"logps/chosen": -235.0,
"logps/rejected": -127.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.5,
"rewards/margins": 14.875,
"rewards/rejected": -9.375,
"step": 116
},
{
"epoch": 1.570469798657718,
"grad_norm": 1.3089153115059224,
"learning_rate": 5.433594390817755e-07,
"logits/chosen": 0.765625,
"logits/rejected": 1.578125,
"logps/chosen": -156.0,
"logps/rejected": -206.0,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5,
"rewards/margins": 12.625,
"rewards/rejected": -9.125,
"step": 117
},
{
"epoch": 1.5838926174496644,
"grad_norm": 5.1209862474975605,
"learning_rate": 5.354906498240079e-07,
"logits/chosen": 1.4609375,
"logits/rejected": 1.0234375,
"logps/chosen": -98.5,
"logps/rejected": -98.0,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.125,
"rewards/margins": 9.0,
"rewards/rejected": -6.90625,
"step": 118
},
{
"epoch": 1.5973154362416109,
"grad_norm": 2.0143010331470426,
"learning_rate": 5.27613015552254e-07,
"logits/chosen": -0.2451171875,
"logits/rejected": -0.361328125,
"logps/chosen": -111.0,
"logps/rejected": -110.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.03125,
"rewards/margins": 11.75,
"rewards/rejected": -7.6875,
"step": 119
},
{
"epoch": 1.610738255033557,
"grad_norm": 0.8804392446661593,
"learning_rate": 5.197284995381264e-07,
"logits/chosen": 0.0245361328125,
"logits/rejected": 0.43359375,
"logps/chosen": -176.0,
"logps/rejected": -210.0,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.0,
"rewards/margins": 19.625,
"rewards/rejected": -12.625,
"step": 120
},
{
"epoch": 1.6241610738255035,
"grad_norm": 1.117425903818996,
"learning_rate": 5.118390667683119e-07,
"logits/chosen": -0.3515625,
"logits/rejected": -0.17578125,
"logps/chosen": -146.0,
"logps/rejected": -192.0,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.90625,
"rewards/margins": 23.125,
"rewards/rejected": -17.25,
"step": 121
},
{
"epoch": 1.6375838926174495,
"grad_norm": 1.1187984471032872,
"learning_rate": 5.039466834548567e-07,
"logits/chosen": -0.87109375,
"logits/rejected": -0.5625,
"logps/chosen": -105.0,
"logps/rejected": -149.0,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 13.25,
"rewards/rejected": -7.875,
"step": 122
},
{
"epoch": 1.651006711409396,
"grad_norm": 0.33656915007850763,
"learning_rate": 4.960533165451435e-07,
"logits/chosen": -0.25,
"logits/rejected": -0.0830078125,
"logps/chosen": -128.0,
"logps/rejected": -144.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.9375,
"rewards/margins": 12.3125,
"rewards/rejected": -8.375,
"step": 123
},
{
"epoch": 1.6644295302013423,
"grad_norm": 0.5334984974014568,
"learning_rate": 4.881609332316881e-07,
"logits/chosen": 0.369140625,
"logits/rejected": 0.54296875,
"logps/chosen": -152.0,
"logps/rejected": -182.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.71875,
"rewards/margins": 13.375,
"rewards/rejected": -10.625,
"step": 124
},
{
"epoch": 1.6778523489932886,
"grad_norm": 7.835269891896578,
"learning_rate": 4.802715004618737e-07,
"logits/chosen": -0.2265625,
"logits/rejected": 0.44921875,
"logps/chosen": -72.5,
"logps/rejected": -150.0,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7109375,
"rewards/margins": 11.25,
"rewards/rejected": -9.5,
"step": 125
},
{
"epoch": 1.691275167785235,
"grad_norm": 3.783250644515536,
"learning_rate": 4.7238698444774593e-07,
"logits/chosen": -1.46875,
"logits/rejected": -1.0,
"logps/chosen": -62.75,
"logps/rejected": -104.0,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.546875,
"rewards/margins": 16.0,
"rewards/rejected": -13.375,
"step": 126
},
{
"epoch": 1.7046979865771812,
"grad_norm": 4.79724903560345,
"learning_rate": 4.6450935017599195e-07,
"logits/chosen": -0.29296875,
"logits/rejected": 0.078125,
"logps/chosen": -110.0,
"logps/rejected": -182.0,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.28125,
"rewards/margins": 12.5,
"rewards/rejected": -10.25,
"step": 127
},
{
"epoch": 1.7181208053691275,
"grad_norm": 8.223743623974363,
"learning_rate": 4.5664056091822465e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.48046875,
"logps/chosen": -89.0,
"logps/rejected": -112.0,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.265625,
"rewards/margins": 18.0,
"rewards/rejected": -14.75,
"step": 128
},
{
"epoch": 1.7315436241610738,
"grad_norm": 5.21471157795929,
"learning_rate": 4.4878257774169345e-07,
"logits/chosen": 1.1015625,
"logits/rejected": 1.859375,
"logps/chosen": -161.0,
"logps/rejected": -268.0,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1875,
"rewards/margins": 9.0625,
"rewards/rejected": -8.875,
"step": 129
},
{
"epoch": 1.7449664429530203,
"grad_norm": 0.6772448197318001,
"learning_rate": 4.4093735902054603e-07,
"logits/chosen": 0.1962890625,
"logits/rejected": 0.85546875,
"logps/chosen": -120.0,
"logps/rejected": -168.0,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0,
"rewards/margins": 11.6875,
"rewards/rejected": -9.6875,
"step": 130
},
{
"epoch": 1.7583892617449663,
"grad_norm": 2.3798718473775407,
"learning_rate": 4.331068599477605e-07,
"logits/chosen": -0.306640625,
"logits/rejected": -0.431640625,
"logps/chosen": -84.5,
"logps/rejected": -115.0,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.203125,
"rewards/margins": 10.9375,
"rewards/rejected": -9.75,
"step": 131
},
{
"epoch": 1.7718120805369129,
"grad_norm": 0.19696843032685835,
"learning_rate": 4.252930320478695e-07,
"logits/chosen": 0.55078125,
"logits/rejected": 1.09375,
"logps/chosen": -106.0,
"logps/rejected": -200.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.9375,
"rewards/margins": 14.4375,
"rewards/rejected": -11.5,
"step": 132
},
{
"epoch": 1.785234899328859,
"grad_norm": 0.6762022924163246,
"learning_rate": 4.1749782269060043e-07,
"logits/chosen": -0.11865234375,
"logits/rejected": 0.1240234375,
"logps/chosen": -84.0,
"logps/rejected": -126.0,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.515625,
"rewards/margins": 11.6875,
"rewards/rejected": -8.1875,
"step": 133
},
{
"epoch": 1.7986577181208054,
"grad_norm": 2.143182334362571,
"learning_rate": 4.09723174605549e-07,
"logits/chosen": 0.82421875,
"logits/rejected": 1.0625,
"logps/chosen": -111.0,
"logps/rejected": -221.0,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.65625,
"rewards/margins": 16.0,
"rewards/rejected": -13.3125,
"step": 134
},
{
"epoch": 1.8120805369127517,
"grad_norm": 28.90404860779291,
"learning_rate": 4.01971025398011e-07,
"logits/chosen": -0.9296875,
"logits/rejected": -0.423828125,
"logps/chosen": -144.0,
"logps/rejected": -194.0,
"loss": 0.0199,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.875,
"rewards/margins": 20.5,
"rewards/rejected": -12.625,
"step": 135
},
{
"epoch": 1.825503355704698,
"grad_norm": 3.275655892122567,
"learning_rate": 3.942433070660905e-07,
"logits/chosen": -1.0703125,
"logits/rejected": -0.38671875,
"logps/chosen": -96.0,
"logps/rejected": -145.0,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.78125,
"rewards/margins": 14.0,
"rewards/rejected": -14.75,
"step": 136
},
{
"epoch": 1.8389261744966443,
"grad_norm": 0.4614476645335935,
"learning_rate": 3.865419455192048e-07,
"logits/chosen": 0.0400390625,
"logits/rejected": 0.228515625,
"logps/chosen": -140.0,
"logps/rejected": -174.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6875,
"rewards/margins": 15.1875,
"rewards/rejected": -9.5,
"step": 137
},
{
"epoch": 1.8523489932885906,
"grad_norm": 0.6025210241714362,
"learning_rate": 3.788688600981085e-07,
"logits/chosen": -0.26953125,
"logits/rejected": -0.47265625,
"logps/chosen": -155.0,
"logps/rejected": -124.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.40625,
"rewards/margins": 11.0,
"rewards/rejected": -8.5625,
"step": 138
},
{
"epoch": 1.8657718120805369,
"grad_norm": 40.4801745827274,
"learning_rate": 3.7122596309655174e-07,
"logits/chosen": -0.546875,
"logits/rejected": -0.6015625,
"logps/chosen": -114.5,
"logps/rejected": -170.0,
"loss": 0.0242,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.625,
"rewards/margins": 14.25,
"rewards/rejected": -11.625,
"step": 139
},
{
"epoch": 1.8791946308724832,
"grad_norm": 0.36856314373131915,
"learning_rate": 3.6361515928469845e-07,
"logits/chosen": 0.380859375,
"logits/rejected": -0.16796875,
"logps/chosen": -110.5,
"logps/rejected": -111.5,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.375,
"rewards/margins": 11.875,
"rewards/rejected": -11.5,
"step": 140
},
{
"epoch": 1.8926174496644297,
"grad_norm": 37.29857910882844,
"learning_rate": 3.560383454344168e-07,
"logits/chosen": -0.265625,
"logits/rejected": 0.15234375,
"logps/chosen": -121.0,
"logps/rejected": -128.0,
"loss": 0.0203,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.75,
"rewards/margins": 9.125,
"rewards/rejected": -6.40625,
"step": 141
},
{
"epoch": 1.9060402684563758,
"grad_norm": 0.6134596336148415,
"learning_rate": 3.484974098465636e-07,
"logits/chosen": -1.1171875,
"logits/rejected": -0.9921875,
"logps/chosen": -120.0,
"logps/rejected": -158.0,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4375,
"rewards/margins": 11.6875,
"rewards/rejected": -12.125,
"step": 142
},
{
"epoch": 1.9194630872483223,
"grad_norm": 3.6457141240616147,
"learning_rate": 3.409942318803809e-07,
"logits/chosen": 0.24609375,
"logits/rejected": 0.5625,
"logps/chosen": -115.5,
"logps/rejected": -158.0,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.1875,
"rewards/margins": 11.9375,
"rewards/rejected": -8.75,
"step": 143
},
{
"epoch": 1.9328859060402683,
"grad_norm": 0.7732701048562175,
"learning_rate": 3.335306814851195e-07,
"logits/chosen": -0.158203125,
"logits/rejected": 0.125,
"logps/chosen": -96.0,
"logps/rejected": -134.0,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.78125,
"rewards/margins": 12.5625,
"rewards/rejected": -8.75,
"step": 144
},
{
"epoch": 1.9463087248322148,
"grad_norm": 0.32377376860458673,
"learning_rate": 3.261086187340088e-07,
"logits/chosen": 0.34765625,
"logits/rejected": 0.466796875,
"logps/chosen": -214.0,
"logps/rejected": -243.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.5,
"rewards/margins": 18.25,
"rewards/rejected": -11.75,
"step": 145
},
{
"epoch": 1.959731543624161,
"grad_norm": 0.33931684180319055,
"learning_rate": 3.187298933606878e-07,
"logits/chosen": 0.24609375,
"logits/rejected": 0.166015625,
"logps/chosen": -158.0,
"logps/rejected": -147.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8125,
"rewards/margins": 13.5625,
"rewards/rejected": -12.75,
"step": 146
},
{
"epoch": 1.9731543624161074,
"grad_norm": 0.036679139127851305,
"learning_rate": 3.1139634429821195e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -0.142578125,
"logps/chosen": -75.0,
"logps/rejected": -146.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.4375,
"rewards/margins": 17.0,
"rewards/rejected": -10.625,
"step": 147
},
{
"epoch": 1.9865771812080537,
"grad_norm": 3.0122112521033113,
"learning_rate": 3.041097992207534e-07,
"logits/chosen": -0.36328125,
"logits/rejected": 0.21875,
"logps/chosen": -140.0,
"logps/rejected": -164.0,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0625,
"rewards/margins": 14.125,
"rewards/rejected": -11.0,
"step": 148
},
{
"epoch": 2.0,
"grad_norm": 0.06581830858065343,
"learning_rate": 2.9687207408810555e-07,
"logits/chosen": 0.310546875,
"logits/rejected": 0.765625,
"logps/chosen": -153.0,
"logps/rejected": -168.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.78125,
"rewards/margins": 11.75,
"rewards/rejected": -12.5625,
"step": 149
},
{
"epoch": 2.0134228187919465,
"grad_norm": 0.1894473762562904,
"learning_rate": 2.8968497269310797e-07,
"logits/chosen": -0.70703125,
"logits/rejected": -1.015625,
"logps/chosen": -156.0,
"logps/rejected": -142.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.65625,
"rewards/margins": 16.0,
"rewards/rejected": -11.3125,
"step": 150
},
{
"epoch": 2.0268456375838926,
"grad_norm": 0.3166445418481177,
"learning_rate": 2.8255028621210354e-07,
"logits/chosen": -1.15625,
"logits/rejected": -0.8671875,
"logps/chosen": -67.0,
"logps/rejected": -118.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.65625,
"rewards/margins": 15.5,
"rewards/rejected": -12.875,
"step": 151
},
{
"epoch": 2.040268456375839,
"grad_norm": 0.10060281063304236,
"learning_rate": 2.7546979275853987e-07,
"logits/chosen": -1.046875,
"logits/rejected": -0.921875,
"logps/chosen": -118.0,
"logps/rejected": -242.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.875,
"rewards/margins": 35.5,
"rewards/rejected": -29.5,
"step": 152
},
{
"epoch": 2.053691275167785,
"grad_norm": 0.029261474444326915,
"learning_rate": 2.684452569398261e-07,
"logits/chosen": -0.09765625,
"logits/rejected": -0.0078125,
"logps/chosen": -120.0,
"logps/rejected": -127.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.96875,
"rewards/margins": 12.625,
"rewards/rejected": -8.6875,
"step": 153
},
{
"epoch": 2.0671140939597317,
"grad_norm": 0.07377090871392322,
"learning_rate": 2.614784294175554e-07,
"logits/chosen": -0.6484375,
"logits/rejected": 0.17578125,
"logps/chosen": -76.0,
"logps/rejected": -189.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6875,
"rewards/margins": 22.125,
"rewards/rejected": -21.5,
"step": 154
},
{
"epoch": 2.0805369127516777,
"grad_norm": 0.13600847657907822,
"learning_rate": 2.545710464712032e-07,
"logits/chosen": -0.1796875,
"logits/rejected": 0.08203125,
"logps/chosen": -95.0,
"logps/rejected": -154.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.96875,
"rewards/margins": 15.5625,
"rewards/rejected": -11.5625,
"step": 155
},
{
"epoch": 2.0939597315436242,
"grad_norm": 0.06348564739132218,
"learning_rate": 2.477248295654113e-07,
"logits/chosen": -0.458984375,
"logits/rejected": 0.2890625,
"logps/chosen": -128.0,
"logps/rejected": -183.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0,
"rewards/margins": 14.9375,
"rewards/rejected": -13.9375,
"step": 156
},
{
"epoch": 2.1073825503355703,
"grad_norm": 0.09396441188908901,
"learning_rate": 2.409414849209612e-07,
"logits/chosen": -0.1884765625,
"logits/rejected": 1.6171875,
"logps/chosen": -81.0,
"logps/rejected": -188.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3125,
"rewards/margins": 12.25,
"rewards/rejected": -9.9375,
"step": 157
},
{
"epoch": 2.120805369127517,
"grad_norm": 0.4686669453110811,
"learning_rate": 2.3422270308954933e-07,
"logits/chosen": -1.21875,
"logits/rejected": -0.392578125,
"logps/chosen": -53.0,
"logps/rejected": -74.5,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.40625,
"rewards/margins": 15.0,
"rewards/rejected": -14.5,
"step": 158
},
{
"epoch": 2.134228187919463,
"grad_norm": 0.27303818497026094,
"learning_rate": 2.275701585324649e-07,
"logits/chosen": -0.54296875,
"logits/rejected": -0.76171875,
"logps/chosen": -123.0,
"logps/rejected": -133.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.546875,
"rewards/margins": 17.75,
"rewards/rejected": -15.25,
"step": 159
},
{
"epoch": 2.1476510067114094,
"grad_norm": 0.2517214530806255,
"learning_rate": 2.2098550920327995e-07,
"logits/chosen": 0.03125,
"logits/rejected": 0.31640625,
"logps/chosen": -87.0,
"logps/rejected": -146.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.75,
"rewards/margins": 14.3125,
"rewards/rejected": -13.5625,
"step": 160
},
{
"epoch": 2.1610738255033555,
"grad_norm": 0.03752622378950569,
"learning_rate": 2.144703961346526e-07,
"logits/chosen": 0.609375,
"logits/rejected": 0.55078125,
"logps/chosen": -174.0,
"logps/rejected": -186.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.25,
"rewards/margins": 13.4375,
"rewards/rejected": -9.1875,
"step": 161
},
{
"epoch": 2.174496644295302,
"grad_norm": 0.05988803532422006,
"learning_rate": 2.080264430293468e-07,
"logits/chosen": 0.30078125,
"logits/rejected": 0.234375,
"logps/chosen": -152.0,
"logps/rejected": -168.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.125,
"rewards/margins": 16.25,
"rewards/rejected": -10.1875,
"step": 162
},
{
"epoch": 2.1879194630872485,
"grad_norm": 0.20922924966030673,
"learning_rate": 2.0165525585557203e-07,
"logits/chosen": -0.322265625,
"logits/rejected": 0.140625,
"logps/chosen": -95.5,
"logps/rejected": -165.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.328125,
"rewards/margins": 13.4375,
"rewards/rejected": -12.125,
"step": 163
},
{
"epoch": 2.2013422818791946,
"grad_norm": 0.08964239202695976,
"learning_rate": 1.953584224467418e-07,
"logits/chosen": -0.9609375,
"logits/rejected": -0.7578125,
"logps/chosen": -164.0,
"logps/rejected": -207.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6875,
"rewards/margins": 14.75,
"rewards/rejected": -11.0625,
"step": 164
},
{
"epoch": 2.214765100671141,
"grad_norm": 0.1394991195345149,
"learning_rate": 1.8913751210575247e-07,
"logits/chosen": -0.765625,
"logits/rejected": -0.38671875,
"logps/chosen": -146.0,
"logps/rejected": -174.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.78125,
"rewards/margins": 15.8125,
"rewards/rejected": -14.0,
"step": 165
},
{
"epoch": 2.228187919463087,
"grad_norm": 0.06224467844986688,
"learning_rate": 1.8299407521388065e-07,
"logits/chosen": -1.171875,
"logits/rejected": -0.4609375,
"logps/chosen": -80.0,
"logps/rejected": -172.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4375,
"rewards/margins": 20.0,
"rewards/rejected": -19.5,
"step": 166
},
{
"epoch": 2.2416107382550337,
"grad_norm": 0.017121698135529603,
"learning_rate": 1.7692964284439506e-07,
"logits/chosen": -0.703125,
"logits/rejected": -0.69921875,
"logps/chosen": -122.5,
"logps/rejected": -194.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.59375,
"rewards/margins": 22.25,
"rewards/rejected": -19.75,
"step": 167
},
{
"epoch": 2.2550335570469797,
"grad_norm": 0.14929743942170393,
"learning_rate": 1.709457263809812e-07,
"logits/chosen": 0.12451171875,
"logits/rejected": 0.8828125,
"logps/chosen": -96.0,
"logps/rejected": -178.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.65625,
"rewards/margins": 14.9375,
"rewards/rejected": -13.3125,
"step": 168
},
{
"epoch": 2.2684563758389262,
"grad_norm": 0.11892827820734325,
"learning_rate": 1.6504381714107252e-07,
"logits/chosen": 0.578125,
"logits/rejected": 0.69921875,
"logps/chosen": -135.0,
"logps/rejected": -136.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.5,
"rewards/margins": 11.125,
"rewards/rejected": -6.625,
"step": 169
},
{
"epoch": 2.2818791946308723,
"grad_norm": 0.03271406609147028,
"learning_rate": 1.5922538600418317e-07,
"logits/chosen": 0.984375,
"logits/rejected": 0.6015625,
"logps/chosen": -112.5,
"logps/rejected": -133.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.90625,
"rewards/margins": 15.125,
"rewards/rejected": -11.1875,
"step": 170
},
{
"epoch": 2.295302013422819,
"grad_norm": 0.4554889059510281,
"learning_rate": 1.534918830453341e-07,
"logits/chosen": -0.40625,
"logits/rejected": 0.369140625,
"logps/chosen": -111.5,
"logps/rejected": -220.0,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.34375,
"rewards/margins": 22.25,
"rewards/rejected": -16.0,
"step": 171
},
{
"epoch": 2.3087248322147653,
"grad_norm": 0.006581780362424091,
"learning_rate": 1.4784473717366387e-07,
"logits/chosen": -1.3828125,
"logits/rejected": 0.337890625,
"logps/chosen": -109.0,
"logps/rejected": -268.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1875,
"rewards/margins": 23.0,
"rewards/rejected": -23.25,
"step": 172
},
{
"epoch": 2.3221476510067114,
"grad_norm": 0.012619009500990765,
"learning_rate": 1.422853557763144e-07,
"logits/chosen": -1.2109375,
"logits/rejected": -0.375,
"logps/chosen": -66.0,
"logps/rejected": -122.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.390625,
"rewards/margins": 14.5,
"rewards/rejected": -14.125,
"step": 173
},
{
"epoch": 2.335570469798658,
"grad_norm": 0.01796264226604086,
"learning_rate": 1.3681512436768046e-07,
"logits/chosen": -1.578125,
"logits/rejected": -0.97265625,
"logps/chosen": -63.0,
"logps/rejected": -124.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.40625,
"rewards/margins": 20.25,
"rewards/rejected": -18.0,
"step": 174
},
{
"epoch": 2.348993288590604,
"grad_norm": 0.07568692984212666,
"learning_rate": 1.3143540624411058e-07,
"logits/chosen": -0.59765625,
"logits/rejected": -0.1630859375,
"logps/chosen": -96.0,
"logps/rejected": -174.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4375,
"rewards/margins": 16.0,
"rewards/rejected": -13.5,
"step": 175
},
{
"epoch": 2.3624161073825505,
"grad_norm": 0.06708516383348302,
"learning_rate": 1.2614754214414548e-07,
"logits/chosen": 0.012939453125,
"logits/rejected": 0.421875,
"logps/chosen": -146.0,
"logps/rejected": -208.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9375,
"rewards/margins": 16.875,
"rewards/rejected": -10.9375,
"step": 176
},
{
"epoch": 2.3758389261744965,
"grad_norm": 0.10761380684324387,
"learning_rate": 1.2095284991437733e-07,
"logits/chosen": -0.384765625,
"logits/rejected": 0.033203125,
"logps/chosen": -119.5,
"logps/rejected": -139.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5,
"rewards/margins": 18.5,
"rewards/rejected": -14.9375,
"step": 177
},
{
"epoch": 2.389261744966443,
"grad_norm": 0.028119404041323803,
"learning_rate": 1.1585262418101466e-07,
"logits/chosen": -0.7734375,
"logits/rejected": -0.8125,
"logps/chosen": -107.0,
"logps/rejected": -192.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8125,
"rewards/margins": 23.75,
"rewards/rejected": -20.0,
"step": 178
},
{
"epoch": 2.402684563758389,
"grad_norm": 0.06427772073604729,
"learning_rate": 1.1084813602723514e-07,
"logits/chosen": 0.162109375,
"logits/rejected": 0.267578125,
"logps/chosen": -127.0,
"logps/rejected": -149.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.125,
"rewards/margins": 10.8125,
"rewards/rejected": -10.9375,
"step": 179
},
{
"epoch": 2.4161073825503356,
"grad_norm": 0.031365944388578905,
"learning_rate": 1.0594063267640385e-07,
"logits/chosen": -0.8046875,
"logits/rejected": -0.67578125,
"logps/chosen": -128.0,
"logps/rejected": -214.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.1875,
"rewards/margins": 20.75,
"rewards/rejected": -13.625,
"step": 180
},
{
"epoch": 2.4295302013422817,
"grad_norm": 0.052700817810413206,
"learning_rate": 1.0113133718124034e-07,
"logits/chosen": -0.453125,
"logits/rejected": -0.0556640625,
"logps/chosen": -122.5,
"logps/rejected": -156.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.875,
"rewards/margins": 10.1875,
"rewards/rejected": -8.3125,
"step": 181
},
{
"epoch": 2.442953020134228,
"grad_norm": 0.042796528316408955,
"learning_rate": 9.642144811900737e-08,
"logits/chosen": -0.63671875,
"logits/rejected": 0.1494140625,
"logps/chosen": -124.5,
"logps/rejected": -189.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0,
"rewards/margins": 17.0,
"rewards/rejected": -12.0625,
"step": 182
},
{
"epoch": 2.4563758389261743,
"grad_norm": 0.05890047207159928,
"learning_rate": 9.181213929280046e-08,
"logits/chosen": -1.0390625,
"logits/rejected": -0.5390625,
"logps/chosen": -132.0,
"logps/rejected": -132.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.84375,
"rewards/margins": 10.625,
"rewards/rejected": -11.4375,
"step": 183
},
{
"epoch": 2.469798657718121,
"grad_norm": 0.08060615031100707,
"learning_rate": 8.730455943901199e-08,
"logits/chosen": 0.37109375,
"logits/rejected": 0.8515625,
"logps/chosen": -96.0,
"logps/rejected": -164.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3125,
"rewards/margins": 13.0,
"rewards/rejected": -10.6875,
"step": 184
},
{
"epoch": 2.4832214765100673,
"grad_norm": 0.5557104575302182,
"learning_rate": 8.289983194104127e-08,
"logits/chosen": -1.2109375,
"logits/rejected": -0.8046875,
"logps/chosen": -114.5,
"logps/rejected": -144.0,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.1875,
"rewards/margins": 12.0,
"rewards/rejected": -16.25,
"step": 185
},
{
"epoch": 2.4966442953020134,
"grad_norm": 0.05401238664527387,
"learning_rate": 7.85990545493247e-08,
"logits/chosen": -0.4921875,
"logits/rejected": -0.03515625,
"logps/chosen": -102.0,
"logps/rejected": -202.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.5,
"rewards/margins": 30.75,
"rewards/rejected": -21.375,
"step": 186
},
{
"epoch": 2.51006711409396,
"grad_norm": 0.04674015967818173,
"learning_rate": 7.440329910775272e-08,
"logits/chosen": -0.53125,
"logits/rejected": -0.353515625,
"logps/chosen": -77.0,
"logps/rejected": -140.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8125,
"rewards/margins": 19.25,
"rewards/rejected": -13.5,
"step": 187
},
{
"epoch": 2.523489932885906,
"grad_norm": 0.03178944530558772,
"learning_rate": 7.0313611286544e-08,
"logits/chosen": -0.259765625,
"logits/rejected": -0.67578125,
"logps/chosen": -153.0,
"logps/rejected": -137.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1875,
"rewards/margins": 14.0625,
"rewards/rejected": -13.875,
"step": 188
},
{
"epoch": 2.5369127516778525,
"grad_norm": 0.0227179843800343,
"learning_rate": 6.633101032164273e-08,
"logits/chosen": -1.0703125,
"logits/rejected": -0.98046875,
"logps/chosen": -90.5,
"logps/rejected": -144.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.015625,
"rewards/margins": 19.75,
"rewards/rejected": -17.75,
"step": 189
},
{
"epoch": 2.5503355704697985,
"grad_norm": 0.03328439296626924,
"learning_rate": 6.24564887607032e-08,
"logits/chosen": -0.421875,
"logits/rejected": -0.42578125,
"logps/chosen": -100.5,
"logps/rejected": -124.5,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.4375,
"rewards/margins": 25.25,
"rewards/rejected": -14.8125,
"step": 190
},
{
"epoch": 2.563758389261745,
"grad_norm": 0.047336260336703316,
"learning_rate": 5.869101221572653e-08,
"logits/chosen": -0.97265625,
"logits/rejected": -0.8125,
"logps/chosen": -246.0,
"logps/rejected": -352.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.125,
"rewards/margins": 20.125,
"rewards/rejected": -15.0,
"step": 191
},
{
"epoch": 2.577181208053691,
"grad_norm": 0.1370090812674146,
"learning_rate": 5.503551912240989e-08,
"logits/chosen": 0.14453125,
"logits/rejected": 0.3125,
"logps/chosen": -212.0,
"logps/rejected": -200.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.25,
"rewards/margins": 23.5,
"rewards/rejected": -13.25,
"step": 192
},
{
"epoch": 2.5906040268456376,
"grad_norm": 0.027265765751800227,
"learning_rate": 5.1490920506268246e-08,
"logits/chosen": -0.5625,
"logits/rejected": -0.0283203125,
"logps/chosen": -116.5,
"logps/rejected": -177.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.375,
"rewards/margins": 18.5,
"rewards/rejected": -13.1875,
"step": 193
},
{
"epoch": 2.604026845637584,
"grad_norm": 0.15268686054307481,
"learning_rate": 4.805809975558828e-08,
"logits/chosen": -1.484375,
"logits/rejected": -0.81640625,
"logps/chosen": -67.5,
"logps/rejected": -125.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.375,
"rewards/margins": 17.75,
"rewards/rejected": -20.0,
"step": 194
},
{
"epoch": 2.61744966442953,
"grad_norm": 0.0271020117795592,
"learning_rate": 4.4737912401268894e-08,
"logits/chosen": -0.345703125,
"logits/rejected": -0.5546875,
"logps/chosen": -173.0,
"logps/rejected": -204.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.8125,
"rewards/margins": 17.0,
"rewards/rejected": -8.1875,
"step": 195
},
{
"epoch": 2.6308724832214763,
"grad_norm": 0.02768965257686107,
"learning_rate": 4.15311859036056e-08,
"logits/chosen": -0.51953125,
"logits/rejected": 0.11181640625,
"logps/chosen": -110.0,
"logps/rejected": -150.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.25,
"rewards/margins": 16.25,
"rewards/rejected": -15.0,
"step": 196
},
{
"epoch": 2.6442953020134228,
"grad_norm": 0.030583666420774456,
"learning_rate": 3.843871944606969e-08,
"logits/chosen": -1.046875,
"logits/rejected": -0.55859375,
"logps/chosen": -104.0,
"logps/rejected": -134.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.84375,
"rewards/margins": 20.0,
"rewards/rejected": -15.125,
"step": 197
},
{
"epoch": 2.6577181208053693,
"grad_norm": 0.01983459256439879,
"learning_rate": 3.546128373613472e-08,
"logits/chosen": -0.08984375,
"logits/rejected": -0.01171875,
"logps/chosen": -148.0,
"logps/rejected": -152.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.25,
"rewards/margins": 10.875,
"rewards/rejected": -9.625,
"step": 198
},
{
"epoch": 2.6711409395973154,
"grad_norm": 0.06308533652339048,
"learning_rate": 3.2599620813200835e-08,
"logits/chosen": -1.75,
"logits/rejected": -1.375,
"logps/chosen": -126.0,
"logps/rejected": -159.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.6875,
"rewards/margins": 21.25,
"rewards/rejected": -14.5625,
"step": 199
},
{
"epoch": 2.684563758389262,
"grad_norm": 0.02445058345278849,
"learning_rate": 2.985444386366226e-08,
"logits/chosen": -0.265625,
"logits/rejected": -0.3203125,
"logps/chosen": -109.5,
"logps/rejected": -155.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.40625,
"rewards/margins": 21.0,
"rewards/rejected": -14.5625,
"step": 200
},
{
"epoch": 2.697986577181208,
"grad_norm": 0.023961536504966064,
"learning_rate": 2.7226437043166518e-08,
"logits/chosen": -0.2080078125,
"logits/rejected": 0.09375,
"logps/chosen": -87.5,
"logps/rejected": -140.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.09375,
"rewards/margins": 15.125,
"rewards/rejected": -13.0,
"step": 201
},
{
"epoch": 2.7114093959731544,
"grad_norm": 0.16636675194104117,
"learning_rate": 2.47162553061086e-08,
"logits/chosen": 0.0,
"logits/rejected": -0.10546875,
"logps/chosen": -170.0,
"logps/rejected": -181.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.15625,
"rewards/margins": 11.6875,
"rewards/rejected": -8.5,
"step": 202
},
{
"epoch": 2.7248322147651005,
"grad_norm": 0.1382165261046544,
"learning_rate": 2.232452424240261e-08,
"logits/chosen": -1.671875,
"logits/rejected": -1.203125,
"logps/chosen": -88.5,
"logps/rejected": -142.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5,
"rewards/margins": 17.25,
"rewards/rejected": -16.625,
"step": 203
},
{
"epoch": 2.738255033557047,
"grad_norm": 0.15799222178098044,
"learning_rate": 2.0051839921571444e-08,
"logits/chosen": -0.625,
"logits/rejected": -0.2333984375,
"logps/chosen": -114.0,
"logps/rejected": -154.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5,
"rewards/margins": 10.9375,
"rewards/rejected": -10.4375,
"step": 204
},
{
"epoch": 2.751677852348993,
"grad_norm": 0.2806855827386244,
"learning_rate": 1.789876874419416e-08,
"logits/chosen": -0.06640625,
"logits/rejected": 0.107421875,
"logps/chosen": -121.5,
"logps/rejected": -115.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8125,
"rewards/margins": 8.75,
"rewards/rejected": -10.625,
"step": 205
},
{
"epoch": 2.7651006711409396,
"grad_norm": 0.01678193593288143,
"learning_rate": 1.5865847300746415e-08,
"logits/chosen": -0.046875,
"logits/rejected": 0.0546875,
"logps/chosen": -109.0,
"logps/rejected": -146.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.125,
"rewards/margins": 20.875,
"rewards/rejected": -18.75,
"step": 206
},
{
"epoch": 2.778523489932886,
"grad_norm": 0.08731970382784059,
"learning_rate": 1.395358223787152e-08,
"logits/chosen": -1.0625,
"logits/rejected": -0.859375,
"logps/chosen": -160.0,
"logps/rejected": -209.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.90625,
"rewards/margins": 19.625,
"rewards/rejected": -14.6875,
"step": 207
},
{
"epoch": 2.791946308724832,
"grad_norm": 0.05918517232146335,
"learning_rate": 1.21624501321132e-08,
"logits/chosen": -0.4921875,
"logits/rejected": 0.0,
"logps/chosen": -85.5,
"logps/rejected": -103.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.421875,
"rewards/margins": 9.8125,
"rewards/rejected": -8.375,
"step": 208
},
{
"epoch": 2.8053691275167782,
"grad_norm": 0.08699969705520388,
"learning_rate": 1.0492897371142728e-08,
"logits/chosen": -0.83984375,
"logits/rejected": -0.75390625,
"logps/chosen": -137.0,
"logps/rejected": -168.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03125,
"rewards/margins": 13.6875,
"rewards/rejected": -13.6875,
"step": 209
},
{
"epoch": 2.8187919463087248,
"grad_norm": 0.07365183308008798,
"learning_rate": 8.945340042509797e-09,
"logits/chosen": -0.47265625,
"logits/rejected": -0.26953125,
"logps/chosen": -88.0,
"logps/rejected": -115.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8125,
"rewards/margins": 8.9375,
"rewards/rejected": -6.125,
"step": 210
},
{
"epoch": 2.8322147651006713,
"grad_norm": 0.038963738208368105,
"learning_rate": 7.520163829944803e-09,
"logits/chosen": -0.029296875,
"logits/rejected": 0.28515625,
"logps/chosen": -148.0,
"logps/rejected": -180.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.15625,
"rewards/margins": 20.75,
"rewards/rejected": -16.625,
"step": 211
},
{
"epoch": 2.8456375838926173,
"grad_norm": 0.05147316584374017,
"learning_rate": 6.217723917238127e-09,
"logits/chosen": -1.2109375,
"logits/rejected": -0.93359375,
"logps/chosen": -49.75,
"logps/rejected": -85.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4140625,
"rewards/margins": 11.1875,
"rewards/rejected": -9.8125,
"step": 212
},
{
"epoch": 2.859060402684564,
"grad_norm": 0.023022328378360397,
"learning_rate": 5.038344899721436e-09,
"logits/chosen": -0.055419921875,
"logits/rejected": 0.1142578125,
"logps/chosen": -122.0,
"logps/rejected": -165.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.84375,
"rewards/margins": 14.125,
"rewards/rejected": -12.3125,
"step": 213
},
{
"epoch": 2.87248322147651,
"grad_norm": 0.06406190550280132,
"learning_rate": 3.982320703371067e-09,
"logits/chosen": -2.03125,
"logits/rejected": -1.640625,
"logps/chosen": -73.5,
"logps/rejected": -113.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.15625,
"rewards/margins": 17.0,
"rewards/rejected": -14.875,
"step": 214
},
{
"epoch": 2.8859060402684564,
"grad_norm": 0.08226436699918951,
"learning_rate": 3.0499145115561177e-09,
"logits/chosen": -0.2890625,
"logits/rejected": -0.2734375,
"logps/chosen": -127.0,
"logps/rejected": -150.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.125,
"rewards/margins": 13.5,
"rewards/rejected": -11.375,
"step": 215
},
{
"epoch": 2.899328859060403,
"grad_norm": 0.06701429938679973,
"learning_rate": 2.2413586994470825e-09,
"logits/chosen": -1.2265625,
"logits/rejected": -1.4140625,
"logps/chosen": -116.0,
"logps/rejected": -126.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.40625,
"rewards/margins": 21.75,
"rewards/rejected": -14.375,
"step": 216
},
{
"epoch": 2.912751677852349,
"grad_norm": 0.15959257076282526,
"learning_rate": 1.5568547761034001e-09,
"logits/chosen": 0.078125,
"logits/rejected": 0.1376953125,
"logps/chosen": -162.0,
"logps/rejected": -162.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.75,
"rewards/margins": 10.1875,
"rewards/rejected": -9.4375,
"step": 217
},
{
"epoch": 2.926174496644295,
"grad_norm": 0.06860454706791937,
"learning_rate": 9.965733342532923e-10,
"logits/chosen": -0.68359375,
"logits/rejected": -0.453125,
"logps/chosen": -127.0,
"logps/rejected": -209.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.6875,
"rewards/margins": 28.0,
"rewards/rejected": -16.25,
"step": 218
},
{
"epoch": 2.9395973154362416,
"grad_norm": 0.8332909450330214,
"learning_rate": 5.606540077782162e-10,
"logits/chosen": 0.58984375,
"logits/rejected": 0.318359375,
"logps/chosen": -136.0,
"logps/rejected": -165.0,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.625,
"rewards/margins": 9.8125,
"rewards/rejected": -10.4375,
"step": 219
},
{
"epoch": 2.953020134228188,
"grad_norm": 0.035230135449407174,
"learning_rate": 2.4920543691309137e-10,
"logits/chosen": 1.0,
"logits/rejected": 0.7109375,
"logps/chosen": -102.0,
"logps/rejected": -145.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.71875,
"rewards/margins": 14.9375,
"rewards/rejected": -12.1875,
"step": 220
},
{
"epoch": 2.966442953020134,
"grad_norm": 0.155120664961761,
"learning_rate": 6.230524117134539e-11,
"logits/chosen": 0.080078125,
"logits/rejected": 0.6015625,
"logps/chosen": -122.0,
"logps/rejected": -198.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5625,
"rewards/margins": 14.0625,
"rewards/rejected": -11.5,
"step": 221
},
{
"epoch": 2.9798657718120807,
"grad_norm": 0.09475005748408788,
"learning_rate": 0.0,
"logits/chosen": 0.16015625,
"logits/rejected": 0.345703125,
"logps/chosen": -121.0,
"logps/rejected": -154.0,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.125,
"rewards/margins": 15.625,
"rewards/rejected": -12.5,
"step": 222
},
{
"epoch": 2.9798657718120807,
"step": 222,
"total_flos": 0.0,
"train_loss": 0.1353412357209354,
"train_runtime": 2791.2284,
"train_samples_per_second": 1.913,
"train_steps_per_second": 0.08
}
],
"logging_steps": 1,
"max_steps": 222,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}