{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002136752136752137,
"grad_norm": 32.17777061214746,
"learning_rate": 1.4893617021276595e-08,
"logits/chosen": -0.279296875,
"logits/rejected": -0.35546875,
"logps/chosen": -0.99609375,
"logps/rejected": -0.890625,
"loss": 1.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.004273504273504274,
"grad_norm": 35.04133277530198,
"learning_rate": 2.978723404255319e-08,
"logits/chosen": -0.41796875,
"logits/rejected": -0.283203125,
"logps/chosen": -1.6796875,
"logps/rejected": -1.9453125,
"loss": 1.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.00641025641025641,
"grad_norm": 45.030473466856385,
"learning_rate": 4.468085106382978e-08,
"logits/chosen": -0.419921875,
"logits/rejected": -0.41015625,
"logps/chosen": -0.86328125,
"logps/rejected": -0.8203125,
"loss": 0.9984,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.00042724609375,
"rewards/margins": -0.0001220703125,
"rewards/rejected": -0.00030517578125,
"step": 3
},
{
"epoch": 0.008547008547008548,
"grad_norm": 20.44290602161885,
"learning_rate": 5.957446808510638e-08,
"logits/chosen": -0.208984375,
"logits/rejected": -0.23046875,
"logps/chosen": -2.0625,
"logps/rejected": -1.7109375,
"loss": 0.9998,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00323486328125,
"rewards/margins": -0.00140380859375,
"rewards/rejected": -0.0018310546875,
"step": 4
},
{
"epoch": 0.010683760683760684,
"grad_norm": 25.98006866123092,
"learning_rate": 7.446808510638298e-08,
"logits/chosen": -0.439453125,
"logits/rejected": -0.421875,
"logps/chosen": -0.7109375,
"logps/rejected": -0.6796875,
"loss": 0.9991,
"rewards/accuracies": 0.25,
"rewards/chosen": 3.0517578125e-05,
"rewards/margins": -0.000213623046875,
"rewards/rejected": 0.000244140625,
"step": 5
},
{
"epoch": 0.01282051282051282,
"grad_norm": 26.860512892988023,
"learning_rate": 8.936170212765956e-08,
"logits/chosen": -0.0947265625,
"logits/rejected": -0.177734375,
"logps/chosen": -0.9140625,
"logps/rejected": -0.625,
"loss": 0.9963,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00128173828125,
"rewards/margins": -0.0008544921875,
"rewards/rejected": -0.00042724609375,
"step": 6
},
{
"epoch": 0.014957264957264958,
"grad_norm": 32.66934693408433,
"learning_rate": 1.0425531914893615e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.4765625,
"logps/chosen": -2.046875,
"logps/rejected": -2.734375,
"loss": 0.9972,
"rewards/accuracies": 0.0625,
"rewards/chosen": -0.00439453125,
"rewards/margins": -0.00103759765625,
"rewards/rejected": -0.00335693359375,
"step": 7
},
{
"epoch": 0.017094017094017096,
"grad_norm": 80.33188302131406,
"learning_rate": 1.1914893617021276e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.47265625,
"logps/chosen": -1.078125,
"logps/rejected": -0.6484375,
"loss": 1.0045,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.00201416015625,
"rewards/margins": 0.00262451171875,
"rewards/rejected": -0.0006103515625,
"step": 8
},
{
"epoch": 0.019230769230769232,
"grad_norm": 31.740360264135372,
"learning_rate": 1.3404255319148934e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.515625,
"logps/chosen": -1.0859375,
"logps/rejected": -1.2109375,
"loss": 0.9946,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00146484375,
"rewards/margins": -0.0001220703125,
"rewards/rejected": -0.0013427734375,
"step": 9
},
{
"epoch": 0.021367521367521368,
"grad_norm": 33.44232905394117,
"learning_rate": 1.4893617021276595e-07,
"logits/chosen": -0.458984375,
"logits/rejected": -0.390625,
"logps/chosen": -0.59375,
"logps/rejected": -0.5703125,
"loss": 0.9956,
"rewards/accuracies": 0.125,
"rewards/chosen": -0.000762939453125,
"rewards/margins": -0.00079345703125,
"rewards/rejected": 3.0517578125e-05,
"step": 10
},
{
"epoch": 0.023504273504273504,
"grad_norm": 29.571097837737284,
"learning_rate": 1.6382978723404256e-07,
"logits/chosen": -0.310546875,
"logits/rejected": -0.341796875,
"logps/chosen": -0.8046875,
"logps/rejected": -0.89453125,
"loss": 1.0005,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00048828125,
"rewards/margins": 0.0009765625,
"rewards/rejected": -0.00146484375,
"step": 11
},
{
"epoch": 0.02564102564102564,
"grad_norm": 36.03559141159498,
"learning_rate": 1.7872340425531912e-07,
"logits/chosen": -0.39453125,
"logits/rejected": -0.380859375,
"logps/chosen": -0.88671875,
"logps/rejected": -1.484375,
"loss": 0.9994,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.00079345703125,
"rewards/margins": 0.00079345703125,
"rewards/rejected": 0.0,
"step": 12
},
{
"epoch": 0.027777777777777776,
"grad_norm": 24.278252772621258,
"learning_rate": 1.9361702127659575e-07,
"logits/chosen": -0.49609375,
"logits/rejected": -0.40234375,
"logps/chosen": -0.58203125,
"logps/rejected": -0.4296875,
"loss": 0.9996,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.00048828125,
"rewards/margins": 3.0517578125e-05,
"rewards/rejected": -0.000518798828125,
"step": 13
},
{
"epoch": 0.029914529914529916,
"grad_norm": 53.95191319044397,
"learning_rate": 2.085106382978723e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.6015625,
"logps/chosen": -0.60546875,
"logps/rejected": -0.55078125,
"loss": 1.0034,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.000732421875,
"rewards/margins": -0.00054931640625,
"rewards/rejected": -0.00018310546875,
"step": 14
},
{
"epoch": 0.03205128205128205,
"grad_norm": 28.363357101937858,
"learning_rate": 2.2340425531914894e-07,
"logits/chosen": -0.369140625,
"logits/rejected": -0.380859375,
"logps/chosen": -0.8125,
"logps/rejected": -1.6875,
"loss": 1.0035,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.002044677734375,
"rewards/margins": 0.002166748046875,
"rewards/rejected": -0.00421142578125,
"step": 15
},
{
"epoch": 0.03418803418803419,
"grad_norm": 60.351935351269326,
"learning_rate": 2.3829787234042553e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.4453125,
"logps/chosen": -0.578125,
"logps/rejected": -0.6640625,
"loss": 0.998,
"rewards/accuracies": 0.5625,
"rewards/chosen": -6.103515625e-05,
"rewards/margins": 0.001434326171875,
"rewards/rejected": -0.001495361328125,
"step": 16
},
{
"epoch": 0.03632478632478633,
"grad_norm": 73.3125350448486,
"learning_rate": 2.5319148936170213e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.55078125,
"logps/chosen": -0.640625,
"logps/rejected": -0.67578125,
"loss": 0.9965,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00067138671875,
"rewards/margins": 0.0013427734375,
"rewards/rejected": -0.00201416015625,
"step": 17
},
{
"epoch": 0.038461538461538464,
"grad_norm": 39.12184643853537,
"learning_rate": 2.680851063829787e-07,
"logits/chosen": -0.271484375,
"logits/rejected": -0.294921875,
"logps/chosen": -1.34375,
"logps/rejected": -1.296875,
"loss": 1.002,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.010009765625,
"rewards/margins": 0.0025634765625,
"rewards/rejected": 0.0074462890625,
"step": 18
},
{
"epoch": 0.0405982905982906,
"grad_norm": 37.45050759373028,
"learning_rate": 2.829787234042553e-07,
"logits/chosen": -0.6640625,
"logits/rejected": -0.5234375,
"logps/chosen": -0.82421875,
"logps/rejected": -0.69921875,
"loss": 0.996,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00079345703125,
"rewards/margins": 0.00042724609375,
"rewards/rejected": -0.001220703125,
"step": 19
},
{
"epoch": 0.042735042735042736,
"grad_norm": 29.741015994855005,
"learning_rate": 2.978723404255319e-07,
"logits/chosen": -0.484375,
"logits/rejected": -0.38671875,
"logps/chosen": -1.21875,
"logps/rejected": -0.734375,
"loss": 1.0015,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.002685546875,
"rewards/margins": 0.00238037109375,
"rewards/rejected": 0.00030517578125,
"step": 20
},
{
"epoch": 0.04487179487179487,
"grad_norm": 35.69376933733865,
"learning_rate": 3.1276595744680846e-07,
"logits/chosen": -0.451171875,
"logits/rejected": -0.431640625,
"logps/chosen": -1.3828125,
"logps/rejected": -1.328125,
"loss": 0.9961,
"rewards/accuracies": 0.1875,
"rewards/chosen": 0.00091552734375,
"rewards/margins": 0.00225830078125,
"rewards/rejected": -0.0013427734375,
"step": 21
},
{
"epoch": 0.04700854700854701,
"grad_norm": 42.08445297151301,
"learning_rate": 3.276595744680851e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.51953125,
"logps/chosen": -1.0625,
"logps/rejected": -1.5546875,
"loss": 0.9996,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00054931640625,
"rewards/margins": -0.0015869140625,
"rewards/rejected": 0.00103759765625,
"step": 22
},
{
"epoch": 0.049145299145299144,
"grad_norm": 53.496163216600635,
"learning_rate": 3.425531914893617e-07,
"logits/chosen": -0.408203125,
"logits/rejected": -0.26953125,
"logps/chosen": -0.796875,
"logps/rejected": -0.6953125,
"loss": 1.0005,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.00347900390625,
"rewards/margins": -0.00250244140625,
"rewards/rejected": -0.0009765625,
"step": 23
},
{
"epoch": 0.05128205128205128,
"grad_norm": 63.90357017444423,
"learning_rate": 3.5744680851063824e-07,
"logits/chosen": -0.376953125,
"logits/rejected": -0.380859375,
"logps/chosen": -0.59375,
"logps/rejected": -0.578125,
"loss": 0.9982,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0001220703125,
"rewards/margins": 0.0008544921875,
"rewards/rejected": -0.000732421875,
"step": 24
},
{
"epoch": 0.053418803418803416,
"grad_norm": 17.141245281554458,
"learning_rate": 3.7234042553191484e-07,
"logits/chosen": -0.5,
"logits/rejected": -0.390625,
"logps/chosen": -0.67578125,
"logps/rejected": -0.796875,
"loss": 0.9971,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.00244140625,
"rewards/margins": -0.00164794921875,
"rewards/rejected": -0.00079345703125,
"step": 25
},
{
"epoch": 0.05555555555555555,
"grad_norm": 41.45668288946838,
"learning_rate": 3.872340425531915e-07,
"logits/chosen": -0.1650390625,
"logits/rejected": -0.1806640625,
"logps/chosen": -0.58984375,
"logps/rejected": -0.6171875,
"loss": 0.9935,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.000335693359375,
"rewards/margins": 0.0003509521484375,
"rewards/rejected": -1.52587890625e-05,
"step": 26
},
{
"epoch": 0.057692307692307696,
"grad_norm": 36.88109178289957,
"learning_rate": 4.021276595744681e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.373046875,
"logps/chosen": -1.9140625,
"logps/rejected": -2.34375,
"loss": 0.9972,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.001953125,
"rewards/margins": -0.0030517578125,
"rewards/rejected": 0.0010986328125,
"step": 27
},
{
"epoch": 0.05982905982905983,
"grad_norm": 22.545905854772307,
"learning_rate": 4.170212765957446e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.578125,
"logps/chosen": -0.85546875,
"logps/rejected": -0.8671875,
"loss": 0.9972,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.00213623046875,
"rewards/margins": -0.00115966796875,
"rewards/rejected": -0.0009765625,
"step": 28
},
{
"epoch": 0.06196581196581197,
"grad_norm": 43.896588688689796,
"learning_rate": 4.319148936170213e-07,
"logits/chosen": -0.37890625,
"logits/rejected": -0.3515625,
"logps/chosen": -0.671875,
"logps/rejected": -0.6171875,
"loss": 1.0031,
"rewards/accuracies": 0.3125,
"rewards/chosen": 0.0029296875,
"rewards/margins": 0.0040283203125,
"rewards/rejected": -0.0010986328125,
"step": 29
},
{
"epoch": 0.0641025641025641,
"grad_norm": 40.25049255062445,
"learning_rate": 4.468085106382979e-07,
"logits/chosen": -0.7421875,
"logits/rejected": -0.62109375,
"logps/chosen": -0.796875,
"logps/rejected": -1.8359375,
"loss": 0.9939,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00146484375,
"rewards/margins": -0.00030517578125,
"rewards/rejected": -0.00115966796875,
"step": 30
},
{
"epoch": 0.06623931623931624,
"grad_norm": 52.5845011579458,
"learning_rate": 4.617021276595744e-07,
"logits/chosen": -0.2021484375,
"logits/rejected": -0.228515625,
"logps/chosen": -0.671875,
"logps/rejected": -0.89453125,
"loss": 1.0049,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.003692626953125,
"rewards/margins": 0.00396728515625,
"rewards/rejected": -0.007659912109375,
"step": 31
},
{
"epoch": 0.06837606837606838,
"grad_norm": 49.712522346335234,
"learning_rate": 4.7659574468085105e-07,
"logits/chosen": -0.34765625,
"logits/rejected": -0.40234375,
"logps/chosen": -0.53125,
"logps/rejected": -0.609375,
"loss": 1.0063,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0009765625,
"rewards/margins": 0.0025634765625,
"rewards/rejected": -0.0035400390625,
"step": 32
},
{
"epoch": 0.07051282051282051,
"grad_norm": 38.46690827922013,
"learning_rate": 4.914893617021277e-07,
"logits/chosen": -0.365234375,
"logits/rejected": -0.322265625,
"logps/chosen": -0.5234375,
"logps/rejected": -0.50390625,
"loss": 0.9963,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.000701904296875,
"rewards/margins": 0.0001220703125,
"rewards/rejected": -0.000823974609375,
"step": 33
},
{
"epoch": 0.07264957264957266,
"grad_norm": 92.23984823750516,
"learning_rate": 5.063829787234043e-07,
"logits/chosen": -0.294921875,
"logits/rejected": -0.1875,
"logps/chosen": -0.5546875,
"logps/rejected": -0.453125,
"loss": 1.0079,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0018463134765625,
"rewards/margins": 0.0005645751953125,
"rewards/rejected": -0.002410888671875,
"step": 34
},
{
"epoch": 0.07478632478632478,
"grad_norm": 22.203815257535165,
"learning_rate": 5.212765957446808e-07,
"logits/chosen": -0.4921875,
"logits/rejected": -0.546875,
"logps/chosen": -0.75,
"logps/rejected": -0.6953125,
"loss": 0.9875,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.000701904296875,
"rewards/margins": 0.004669189453125,
"rewards/rejected": -0.00537109375,
"step": 35
},
{
"epoch": 0.07692307692307693,
"grad_norm": 64.42185662501916,
"learning_rate": 5.361702127659574e-07,
"logits/chosen": -0.34375,
"logits/rejected": -0.236328125,
"logps/chosen": -1.1953125,
"logps/rejected": -2.15625,
"loss": 0.9974,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.006561279296875,
"rewards/margins": 0.0133056640625,
"rewards/rejected": -0.0198974609375,
"step": 36
},
{
"epoch": 0.07905982905982906,
"grad_norm": 57.75280760864894,
"learning_rate": 5.51063829787234e-07,
"logits/chosen": -0.34765625,
"logits/rejected": -0.298828125,
"logps/chosen": -1.1953125,
"logps/rejected": -0.75390625,
"loss": 0.9971,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.005584716796875,
"rewards/margins": -0.002410888671875,
"rewards/rejected": -0.003173828125,
"step": 37
},
{
"epoch": 0.0811965811965812,
"grad_norm": 50.74349404882503,
"learning_rate": 5.659574468085106e-07,
"logits/chosen": -0.416015625,
"logits/rejected": -0.3203125,
"logps/chosen": -1.0,
"logps/rejected": -0.765625,
"loss": 0.9863,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.000762939453125,
"rewards/margins": -0.0001220703125,
"rewards/rejected": -0.000640869140625,
"step": 38
},
{
"epoch": 0.08333333333333333,
"grad_norm": 65.63535736055714,
"learning_rate": 5.808510638297872e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.52734375,
"logps/chosen": -0.7734375,
"logps/rejected": -0.65234375,
"loss": 1.0226,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.005859375,
"rewards/margins": -0.00067138671875,
"rewards/rejected": -0.00518798828125,
"step": 39
},
{
"epoch": 0.08547008547008547,
"grad_norm": 117.44282894336358,
"learning_rate": 5.957446808510638e-07,
"logits/chosen": -0.359375,
"logits/rejected": -0.37109375,
"logps/chosen": -1.0,
"logps/rejected": -0.5859375,
"loss": 1.022,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.00732421875,
"rewards/margins": -0.00384521484375,
"rewards/rejected": -0.00347900390625,
"step": 40
},
{
"epoch": 0.0876068376068376,
"grad_norm": 41.851193269641925,
"learning_rate": 6.106382978723404e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.38671875,
"logps/chosen": -0.73828125,
"logps/rejected": -0.73828125,
"loss": 0.9847,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.006866455078125,
"rewards/margins": 0.0005645751953125,
"rewards/rejected": -0.0074462890625,
"step": 41
},
{
"epoch": 0.08974358974358974,
"grad_norm": 28.730097925755693,
"learning_rate": 6.255319148936169e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.52734375,
"logps/chosen": -0.6328125,
"logps/rejected": -0.5703125,
"loss": 0.976,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.00238037109375,
"rewards/margins": 0.006927490234375,
"rewards/rejected": -0.004547119140625,
"step": 42
},
{
"epoch": 0.09188034188034189,
"grad_norm": 66.6883676221711,
"learning_rate": 6.404255319148935e-07,
"logits/chosen": -0.0274658203125,
"logits/rejected": -0.09765625,
"logps/chosen": -2.734375,
"logps/rejected": -1.9453125,
"loss": 0.9933,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.00201416015625,
"rewards/margins": -0.0035400390625,
"rewards/rejected": 0.005584716796875,
"step": 43
},
{
"epoch": 0.09401709401709402,
"grad_norm": 28.116383690151462,
"learning_rate": 6.553191489361702e-07,
"logits/chosen": -0.75,
"logits/rejected": -0.63671875,
"logps/chosen": -1.6875,
"logps/rejected": -1.4296875,
"loss": 0.9578,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.029052734375,
"rewards/margins": 0.017333984375,
"rewards/rejected": 0.01171875,
"step": 44
},
{
"epoch": 0.09615384615384616,
"grad_norm": 24.400107414943964,
"learning_rate": 6.702127659574469e-07,
"logits/chosen": -0.431640625,
"logits/rejected": -0.375,
"logps/chosen": -0.71484375,
"logps/rejected": -1.28125,
"loss": 0.9868,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.00921630859375,
"rewards/margins": 0.01202392578125,
"rewards/rejected": -0.021240234375,
"step": 45
},
{
"epoch": 0.09829059829059829,
"grad_norm": 28.812916665898577,
"learning_rate": 6.851063829787234e-07,
"logits/chosen": -0.431640625,
"logits/rejected": -0.42578125,
"logps/chosen": -0.8359375,
"logps/rejected": -0.81640625,
"loss": 0.9963,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.01171875,
"rewards/margins": 0.003814697265625,
"rewards/rejected": 0.0079345703125,
"step": 46
},
{
"epoch": 0.10042735042735043,
"grad_norm": 12.920574377710937,
"learning_rate": 7e-07,
"logits/chosen": -0.421875,
"logits/rejected": -0.294921875,
"logps/chosen": -0.71875,
"logps/rejected": -0.7421875,
"loss": 0.9788,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.003753662109375,
"rewards/margins": 0.00128173828125,
"rewards/rejected": -0.005035400390625,
"step": 47
},
{
"epoch": 0.10256410256410256,
"grad_norm": 25.071111837784507,
"learning_rate": 6.999902552301362e-07,
"logits/chosen": -0.359375,
"logits/rejected": -0.365234375,
"logps/chosen": -1.2109375,
"logps/rejected": -1.5625,
"loss": 0.9839,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.01171875,
"rewards/margins": 0.0101318359375,
"rewards/rejected": 0.0015869140625,
"step": 48
},
{
"epoch": 0.1047008547008547,
"grad_norm": 16.266651913214233,
"learning_rate": 6.999610214631767e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.5546875,
"logps/chosen": -0.9140625,
"logps/rejected": -0.765625,
"loss": 0.9821,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00323486328125,
"rewards/margins": 0.00689697265625,
"rewards/rejected": -0.0101318359375,
"step": 49
},
{
"epoch": 0.10683760683760683,
"grad_norm": 33.0881655418136,
"learning_rate": 6.999123003269862e-07,
"logits/chosen": -0.2119140625,
"logits/rejected": -0.1875,
"logps/chosen": -1.03125,
"logps/rejected": -1.1953125,
"loss": 0.9573,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00762939453125,
"rewards/margins": 0.009765625,
"rewards/rejected": -0.002105712890625,
"step": 50
},
{
"epoch": 0.10897435897435898,
"grad_norm": 147.71801090104262,
"learning_rate": 6.998440945345717e-07,
"logits/chosen": -0.48828125,
"logits/rejected": -0.5390625,
"logps/chosen": -1.125,
"logps/rejected": -1.4296875,
"loss": 1.0513,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0341796875,
"rewards/margins": 0.06103515625,
"rewards/rejected": -0.02685546875,
"step": 51
},
{
"epoch": 0.1111111111111111,
"grad_norm": 60.17292781809932,
"learning_rate": 6.99756407883932e-07,
"logits/chosen": -0.322265625,
"logits/rejected": -0.43359375,
"logps/chosen": -0.73046875,
"logps/rejected": -1.078125,
"loss": 1.0376,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.01129150390625,
"rewards/margins": 0.015625,
"rewards/rejected": -0.00439453125,
"step": 52
},
{
"epoch": 0.11324786324786325,
"grad_norm": 36.38997121230995,
"learning_rate": 6.996492452578456e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.64453125,
"logps/chosen": -0.62109375,
"logps/rejected": -0.58203125,
"loss": 1.0084,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00213623046875,
"rewards/margins": 0.003662109375,
"rewards/rejected": -0.00579833984375,
"step": 53
},
{
"epoch": 0.11538461538461539,
"grad_norm": 22.671705161068385,
"learning_rate": 6.995226126235988e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.4765625,
"logps/chosen": -0.68359375,
"logps/rejected": -0.71484375,
"loss": 0.9937,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0096435546875,
"rewards/margins": 0.00054931640625,
"rewards/rejected": 0.00909423828125,
"step": 54
},
{
"epoch": 0.11752136752136752,
"grad_norm": 61.48411418434429,
"learning_rate": 6.993765170326537e-07,
"logits/chosen": -0.380859375,
"logits/rejected": -0.3828125,
"logps/chosen": -0.8671875,
"logps/rejected": -0.6875,
"loss": 1.0205,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.007232666015625,
"rewards/margins": -0.00274658203125,
"rewards/rejected": -0.004486083984375,
"step": 55
},
{
"epoch": 0.11965811965811966,
"grad_norm": 154.53160691093754,
"learning_rate": 6.992109666202556e-07,
"logits/chosen": -0.375,
"logits/rejected": -0.322265625,
"logps/chosen": -0.9609375,
"logps/rejected": -0.890625,
"loss": 1.026,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.01220703125,
"rewards/margins": 0.00152587890625,
"rewards/rejected": -0.0137939453125,
"step": 56
},
{
"epoch": 0.12179487179487179,
"grad_norm": 29.076806117840395,
"learning_rate": 6.990259706049799e-07,
"logits/chosen": -0.30078125,
"logits/rejected": -0.2451171875,
"logps/chosen": -2.265625,
"logps/rejected": -1.6640625,
"loss": 0.9692,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03466796875,
"rewards/margins": 0.035400390625,
"rewards/rejected": -0.06982421875,
"step": 57
},
{
"epoch": 0.12393162393162394,
"grad_norm": 70.26911941495058,
"learning_rate": 6.988215392882183e-07,
"logits/chosen": -0.328125,
"logits/rejected": -0.30859375,
"logps/chosen": -0.671875,
"logps/rejected": -0.6640625,
"loss": 0.9807,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.005706787109375,
"rewards/margins": -0.0015869140625,
"rewards/rejected": -0.004119873046875,
"step": 58
},
{
"epoch": 0.12606837606837606,
"grad_norm": 20.11374097217621,
"learning_rate": 6.985976840536061e-07,
"logits/chosen": -0.69140625,
"logits/rejected": -0.58984375,
"logps/chosen": -0.490234375,
"logps/rejected": -0.44921875,
"loss": 0.9724,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.007080078125,
"rewards/margins": -0.0045166015625,
"rewards/rejected": -0.0025634765625,
"step": 59
},
{
"epoch": 0.1282051282051282,
"grad_norm": 111.39066614610299,
"learning_rate": 6.983544173663875e-07,
"logits/chosen": -0.224609375,
"logits/rejected": -0.2216796875,
"logps/chosen": -0.90234375,
"logps/rejected": -1.1015625,
"loss": 1.0992,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0238037109375,
"rewards/margins": 0.03466796875,
"rewards/rejected": -0.058349609375,
"step": 60
},
{
"epoch": 0.13034188034188035,
"grad_norm": 25.562904676632012,
"learning_rate": 6.980917527727217e-07,
"logits/chosen": -0.3984375,
"logits/rejected": -0.34765625,
"logps/chosen": -1.0390625,
"logps/rejected": -1.140625,
"loss": 0.984,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.028076171875,
"rewards/margins": 0.01080322265625,
"rewards/rejected": -0.0390625,
"step": 61
},
{
"epoch": 0.13247863247863248,
"grad_norm": 48.932976513817344,
"learning_rate": 6.978097048989288e-07,
"logits/chosen": -0.7578125,
"logits/rejected": -0.6875,
"logps/chosen": -1.046875,
"logps/rejected": -1.203125,
"loss": 1.0165,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.00921630859375,
"rewards/margins": 0.00946044921875,
"rewards/rejected": -0.0186767578125,
"step": 62
},
{
"epoch": 0.1346153846153846,
"grad_norm": 20.033557361846213,
"learning_rate": 6.975082894506753e-07,
"logits/chosen": -0.380859375,
"logits/rejected": -0.3359375,
"logps/chosen": -0.78125,
"logps/rejected": -0.66796875,
"loss": 1.0021,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.01055908203125,
"rewards/margins": -0.00372314453125,
"rewards/rejected": -0.0068359375,
"step": 63
},
{
"epoch": 0.13675213675213677,
"grad_norm": 19.905545916166517,
"learning_rate": 6.971875232120994e-07,
"logits/chosen": -0.49609375,
"logits/rejected": -0.388671875,
"logps/chosen": -0.62890625,
"logps/rejected": -0.5703125,
"loss": 0.9924,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.009033203125,
"rewards/margins": -0.0011138916015625,
"rewards/rejected": -0.0079345703125,
"step": 64
},
{
"epoch": 0.1388888888888889,
"grad_norm": 26.422482495942226,
"learning_rate": 6.968474240448763e-07,
"logits/chosen": -0.435546875,
"logits/rejected": -0.4296875,
"logps/chosen": -0.94140625,
"logps/rejected": -1.0703125,
"loss": 0.98,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.015625,
"rewards/margins": 0.00750732421875,
"rewards/rejected": -0.023193359375,
"step": 65
},
{
"epoch": 0.14102564102564102,
"grad_norm": 68.78295228867675,
"learning_rate": 6.964880108872238e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.515625,
"logps/chosen": -0.53515625,
"logps/rejected": -0.546875,
"loss": 1.0587,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.00238037109375,
"rewards/margins": 0.004364013671875,
"rewards/rejected": -0.006744384765625,
"step": 66
},
{
"epoch": 0.14316239316239315,
"grad_norm": 19.507340401891387,
"learning_rate": 6.961093037528475e-07,
"logits/chosen": -0.546875,
"logits/rejected": -0.546875,
"logps/chosen": -0.734375,
"logps/rejected": -1.4140625,
"loss": 0.9944,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.01116943359375,
"rewards/margins": 0.0048828125,
"rewards/rejected": -0.01611328125,
"step": 67
},
{
"epoch": 0.1452991452991453,
"grad_norm": 31.79134213963872,
"learning_rate": 6.957113237298269e-07,
"logits/chosen": -0.373046875,
"logits/rejected": -0.306640625,
"logps/chosen": -2.09375,
"logps/rejected": -3.15625,
"loss": 0.976,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.056640625,
"rewards/margins": 0.0146484375,
"rewards/rejected": -0.0712890625,
"step": 68
},
{
"epoch": 0.14743589743589744,
"grad_norm": 49.21677800952418,
"learning_rate": 6.952940929794406e-07,
"logits/chosen": -0.365234375,
"logits/rejected": -0.296875,
"logps/chosen": -1.046875,
"logps/rejected": -0.99609375,
"loss": 1.0469,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.026123046875,
"rewards/margins": 0.0133056640625,
"rewards/rejected": -0.03955078125,
"step": 69
},
{
"epoch": 0.14957264957264957,
"grad_norm": 29.943057230035965,
"learning_rate": 6.948576347349319e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.470703125,
"logps/chosen": -0.53515625,
"logps/rejected": -0.494140625,
"loss": 0.965,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.000335693359375,
"rewards/margins": 0.006011962890625,
"rewards/rejected": -0.00567626953125,
"step": 70
},
{
"epoch": 0.1517094017094017,
"grad_norm": 15.86774723874277,
"learning_rate": 6.944019733002163e-07,
"logits/chosen": -0.625,
"logits/rejected": -0.5390625,
"logps/chosen": -0.75,
"logps/rejected": -0.80078125,
"loss": 0.9816,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.013427734375,
"rewards/margins": 0.003936767578125,
"rewards/rejected": 0.00946044921875,
"step": 71
},
{
"epoch": 0.15384615384615385,
"grad_norm": 27.561572561189063,
"learning_rate": 6.939271340485266e-07,
"logits/chosen": -0.5703125,
"logits/rejected": -0.64453125,
"logps/chosen": -0.5625,
"logps/rejected": -0.6171875,
"loss": 0.9576,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0047607421875,
"rewards/margins": 0.00244140625,
"rewards/rejected": 0.0023193359375,
"step": 72
},
{
"epoch": 0.15598290598290598,
"grad_norm": 139.7151826401799,
"learning_rate": 6.934331434210018e-07,
"logits/chosen": -0.54296875,
"logits/rejected": -0.494140625,
"logps/chosen": -0.55859375,
"logps/rejected": -1.171875,
"loss": 1.1119,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.005950927734375,
"rewards/margins": -0.048828125,
"rewards/rejected": 0.0546875,
"step": 73
},
{
"epoch": 0.1581196581196581,
"grad_norm": 26.428371794410545,
"learning_rate": 6.929200289252127e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.59375,
"logps/chosen": -1.625,
"logps/rejected": -1.96875,
"loss": 0.9962,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.037109375,
"rewards/margins": -0.00714111328125,
"rewards/rejected": 0.04443359375,
"step": 74
},
{
"epoch": 0.16025641025641027,
"grad_norm": 71.85447334397465,
"learning_rate": 6.923878191336319e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.427734375,
"logps/chosen": -0.80859375,
"logps/rejected": -1.2109375,
"loss": 1.0996,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.000457763671875,
"rewards/margins": -0.05029296875,
"rewards/rejected": 0.05078125,
"step": 75
},
{
"epoch": 0.1623931623931624,
"grad_norm": 38.858782907246336,
"learning_rate": 6.918365436820421e-07,
"logits/chosen": -0.29296875,
"logits/rejected": -0.2431640625,
"logps/chosen": -0.71875,
"logps/rejected": -0.87109375,
"loss": 1.0227,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0166015625,
"rewards/margins": -0.003326416015625,
"rewards/rejected": -0.0133056640625,
"step": 76
},
{
"epoch": 0.16452991452991453,
"grad_norm": 10.583889406197391,
"learning_rate": 6.912662332678855e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.49609375,
"logps/chosen": -0.81640625,
"logps/rejected": -0.86328125,
"loss": 0.9626,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.01220703125,
"rewards/margins": 0.01300048828125,
"rewards/rejected": -0.0252685546875,
"step": 77
},
{
"epoch": 0.16666666666666666,
"grad_norm": 44.75517764964221,
"learning_rate": 6.906769196485548e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.65625,
"logps/chosen": -1.21875,
"logps/rejected": -1.703125,
"loss": 0.9673,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.07861328125,
"rewards/margins": 0.057373046875,
"rewards/rejected": 0.021240234375,
"step": 78
},
{
"epoch": 0.16880341880341881,
"grad_norm": 35.27372933220996,
"learning_rate": 6.90068635639625e-07,
"logits/chosen": -0.484375,
"logits/rejected": -0.373046875,
"logps/chosen": -1.5625,
"logps/rejected": -1.046875,
"loss": 0.988,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0152587890625,
"rewards/margins": 0.021240234375,
"rewards/rejected": -0.006011962890625,
"step": 79
},
{
"epoch": 0.17094017094017094,
"grad_norm": 61.73040358186559,
"learning_rate": 6.894414151130255e-07,
"logits/chosen": -0.2451171875,
"logits/rejected": -0.19140625,
"logps/chosen": -0.76953125,
"logps/rejected": -1.125,
"loss": 0.9987,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00714111328125,
"rewards/margins": 0.0213623046875,
"rewards/rejected": -0.028564453125,
"step": 80
},
{
"epoch": 0.17307692307692307,
"grad_norm": 30.95188040932688,
"learning_rate": 6.887952929951547e-07,
"logits/chosen": -0.478515625,
"logits/rejected": -0.37890625,
"logps/chosen": -0.7890625,
"logps/rejected": -0.88671875,
"loss": 1.0059,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.007720947265625,
"rewards/margins": 0.00811767578125,
"rewards/rejected": -0.015869140625,
"step": 81
},
{
"epoch": 0.1752136752136752,
"grad_norm": 57.26839307731332,
"learning_rate": 6.881303052649344e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.56640625,
"logps/chosen": -0.5234375,
"logps/rejected": -0.56640625,
"loss": 1.0756,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00421142578125,
"rewards/margins": 0.00244140625,
"rewards/rejected": 0.00177001953125,
"step": 82
},
{
"epoch": 0.17735042735042736,
"grad_norm": 44.65238337963925,
"learning_rate": 6.87446488951807e-07,
"logits/chosen": -0.43359375,
"logits/rejected": -0.44140625,
"logps/chosen": -0.65234375,
"logps/rejected": -1.4296875,
"loss": 0.9998,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0040283203125,
"rewards/margins": 0.0008544921875,
"rewards/rejected": -0.0048828125,
"step": 83
},
{
"epoch": 0.1794871794871795,
"grad_norm": 31.057443291067262,
"learning_rate": 6.867438821336729e-07,
"logits/chosen": -0.421875,
"logits/rejected": -0.423828125,
"logps/chosen": -0.671875,
"logps/rejected": -1.59375,
"loss": 0.9746,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00225830078125,
"rewards/margins": 0.0751953125,
"rewards/rejected": -0.0771484375,
"step": 84
},
{
"epoch": 0.18162393162393162,
"grad_norm": 92.40532740478336,
"learning_rate": 6.860225239347707e-07,
"logits/chosen": -0.353515625,
"logits/rejected": -0.26171875,
"logps/chosen": -1.140625,
"logps/rejected": -0.96875,
"loss": 1.014,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.037109375,
"rewards/margins": 0.05029296875,
"rewards/rejected": -0.01336669921875,
"step": 85
},
{
"epoch": 0.18376068376068377,
"grad_norm": 17.92936798357233,
"learning_rate": 6.852824545234985e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.470703125,
"logps/chosen": -0.5078125,
"logps/rejected": -0.52734375,
"loss": 0.9742,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.000732421875,
"rewards/margins": 0.004791259765625,
"rewards/rejected": -0.004058837890625,
"step": 86
},
{
"epoch": 0.1858974358974359,
"grad_norm": 20.80379260553499,
"learning_rate": 6.845237151101767e-07,
"logits/chosen": -0.228515625,
"logits/rejected": 0.023681640625,
"logps/chosen": -0.65234375,
"logps/rejected": -0.72265625,
"loss": 0.9644,
"rewards/accuracies": 0.6875,
"rewards/chosen": 3.0517578125e-05,
"rewards/margins": 0.0167236328125,
"rewards/rejected": -0.0167236328125,
"step": 87
},
{
"epoch": 0.18803418803418803,
"grad_norm": 72.09254799159736,
"learning_rate": 6.837463479447537e-07,
"logits/chosen": -0.31640625,
"logits/rejected": -0.26953125,
"logps/chosen": -1.1171875,
"logps/rejected": -2.8125,
"loss": 0.9951,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00970458984375,
"rewards/margins": 0.11376953125,
"rewards/rejected": -0.12353515625,
"step": 88
},
{
"epoch": 0.19017094017094016,
"grad_norm": 20.176800812028358,
"learning_rate": 6.829503963144531e-07,
"logits/chosen": -0.4140625,
"logits/rejected": -0.30859375,
"logps/chosen": -0.85546875,
"logps/rejected": -0.72265625,
"loss": 1.0018,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0087890625,
"rewards/margins": -0.00335693359375,
"rewards/rejected": -0.00543212890625,
"step": 89
},
{
"epoch": 0.19230769230769232,
"grad_norm": 57.87970181056306,
"learning_rate": 6.821359045413631e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.44921875,
"logps/chosen": -0.86328125,
"logps/rejected": -0.86328125,
"loss": 1.0153,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.00054931640625,
"rewards/margins": 0.0010986328125,
"rewards/rejected": -0.00054931640625,
"step": 90
},
{
"epoch": 0.19444444444444445,
"grad_norm": 49.16480618396826,
"learning_rate": 6.813029179799691e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.451171875,
"logps/chosen": -0.62890625,
"logps/rejected": -1.3828125,
"loss": 1.0355,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.001312255859375,
"rewards/margins": 0.02392578125,
"rewards/rejected": -0.022705078125,
"step": 91
},
{
"epoch": 0.19658119658119658,
"grad_norm": 33.2077502398859,
"learning_rate": 6.804514830146271e-07,
"logits/chosen": -0.61328125,
"logits/rejected": -0.56640625,
"logps/chosen": -0.671875,
"logps/rejected": -0.71875,
"loss": 0.9918,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.00201416015625,
"rewards/margins": 0.01336669921875,
"rewards/rejected": -0.0113525390625,
"step": 92
},
{
"epoch": 0.1987179487179487,
"grad_norm": 33.15696851492135,
"learning_rate": 6.795816470569815e-07,
"logits/chosen": -0.46484375,
"logits/rejected": -0.40234375,
"logps/chosen": -0.6875,
"logps/rejected": -1.25,
"loss": 0.988,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0029296875,
"rewards/margins": 0.0888671875,
"rewards/rejected": -0.0859375,
"step": 93
},
{
"epoch": 0.20085470085470086,
"grad_norm": 33.760745328662495,
"learning_rate": 6.786934585433253e-07,
"logits/chosen": -0.44140625,
"logits/rejected": -0.43359375,
"logps/chosen": -0.6484375,
"logps/rejected": -0.6328125,
"loss": 0.9528,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.004486083984375,
"rewards/margins": 0.0040283203125,
"rewards/rejected": -0.008544921875,
"step": 94
},
{
"epoch": 0.202991452991453,
"grad_norm": 16.210619092434975,
"learning_rate": 6.777869669319021e-07,
"logits/chosen": -0.34765625,
"logits/rejected": -0.341796875,
"logps/chosen": -0.5078125,
"logps/rejected": -0.57421875,
"loss": 0.9913,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.007598876953125,
"rewards/margins": 0.004486083984375,
"rewards/rejected": 0.00311279296875,
"step": 95
},
{
"epoch": 0.20512820512820512,
"grad_norm": 28.57830685419194,
"learning_rate": 6.768622227001528e-07,
"logits/chosen": -0.69140625,
"logits/rejected": -0.453125,
"logps/chosen": -0.7734375,
"logps/rejected": -0.671875,
"loss": 0.9589,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00457763671875,
"rewards/margins": 0.01177978515625,
"rewards/rejected": -0.016357421875,
"step": 96
},
{
"epoch": 0.20726495726495728,
"grad_norm": 54.39294051177383,
"learning_rate": 6.759192773419042e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.54296875,
"logps/chosen": -1.046875,
"logps/rejected": -1.265625,
"loss": 1.0544,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00372314453125,
"rewards/margins": -0.00537109375,
"rewards/rejected": 0.00164794921875,
"step": 97
},
{
"epoch": 0.2094017094017094,
"grad_norm": 59.836687557700564,
"learning_rate": 6.749581833645022e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.5390625,
"logps/chosen": -1.6171875,
"logps/rejected": -0.65625,
"loss": 1.0487,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.019287109375,
"rewards/margins": -0.0028076171875,
"rewards/rejected": -0.016357421875,
"step": 98
},
{
"epoch": 0.21153846153846154,
"grad_norm": 37.8780536986917,
"learning_rate": 6.739789942858876e-07,
"logits/chosen": -0.365234375,
"logits/rejected": -0.375,
"logps/chosen": -0.89453125,
"logps/rejected": -0.76171875,
"loss": 0.9814,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0108642578125,
"rewards/margins": 0.002716064453125,
"rewards/rejected": 0.00811767578125,
"step": 99
},
{
"epoch": 0.21367521367521367,
"grad_norm": 29.168248507387297,
"learning_rate": 6.729817646316158e-07,
"logits/chosen": -0.380859375,
"logits/rejected": -0.408203125,
"logps/chosen": -2.015625,
"logps/rejected": -2.125,
"loss": 0.9626,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0263671875,
"rewards/margins": 0.016845703125,
"rewards/rejected": 0.0096435546875,
"step": 100
},
{
"epoch": 0.21367521367521367,
"eval_logits/chosen": -0.50390625,
"eval_logits/rejected": -0.48828125,
"eval_logps/chosen": -1.0078125,
"eval_logps/rejected": -1.0390625,
"eval_loss": 1.00387442111969,
"eval_rewards/accuracies": 0.6290322542190552,
"eval_rewards/chosen": 0.0093994140625,
"eval_rewards/margins": 0.031982421875,
"eval_rewards/rejected": -0.0225830078125,
"eval_runtime": 105.2748,
"eval_samples_per_second": 18.627,
"eval_steps_per_second": 0.589,
"step": 100
},
{
"epoch": 0.21581196581196582,
"grad_norm": 40.180093476954596,
"learning_rate": 6.719665499318211e-07,
"logits/chosen": -0.62890625,
"logits/rejected": -0.53515625,
"logps/chosen": -0.57421875,
"logps/rejected": -0.98046875,
"loss": 0.9673,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0015869140625,
"rewards/margins": 0.0302734375,
"rewards/rejected": -0.02880859375,
"step": 101
},
{
"epoch": 0.21794871794871795,
"grad_norm": 20.943412477220253,
"learning_rate": 6.709334067181241e-07,
"logits/chosen": -0.462890625,
"logits/rejected": -0.39453125,
"logps/chosen": -1.1875,
"logps/rejected": -1.359375,
"loss": 0.9623,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.003814697265625,
"rewards/margins": 0.03271484375,
"rewards/rejected": -0.029052734375,
"step": 102
},
{
"epoch": 0.22008547008547008,
"grad_norm": 31.399197614010006,
"learning_rate": 6.69882392520484e-07,
"logits/chosen": -0.365234375,
"logits/rejected": -0.44140625,
"logps/chosen": -1.4140625,
"logps/rejected": -0.71484375,
"loss": 1.0045,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0361328125,
"rewards/margins": -0.029541015625,
"rewards/rejected": -0.00665283203125,
"step": 103
},
{
"epoch": 0.2222222222222222,
"grad_norm": 26.799960542003504,
"learning_rate": 6.688135658639948e-07,
"logits/chosen": -0.400390625,
"logits/rejected": -0.40625,
"logps/chosen": -0.69140625,
"logps/rejected": -0.640625,
"loss": 0.9724,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00494384765625,
"rewards/margins": -0.00189208984375,
"rewards/rejected": -0.0030517578125,
"step": 104
},
{
"epoch": 0.22435897435897437,
"grad_norm": 25.35218941711545,
"learning_rate": 6.677269862656269e-07,
"logits/chosen": -0.39453125,
"logits/rejected": -0.341796875,
"logps/chosen": -0.875,
"logps/rejected": -1.0390625,
"loss": 0.9951,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.019287109375,
"rewards/margins": -0.0010833740234375,
"rewards/rejected": -0.0181884765625,
"step": 105
},
{
"epoch": 0.2264957264957265,
"grad_norm": 26.17479347283922,
"learning_rate": 6.666227142309125e-07,
"logits/chosen": -0.5234375,
"logits/rejected": -0.46484375,
"logps/chosen": -0.640625,
"logps/rejected": -1.59375,
"loss": 0.9584,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.008056640625,
"rewards/margins": 0.005126953125,
"rewards/rejected": 0.0029296875,
"step": 106
},
{
"epoch": 0.22863247863247863,
"grad_norm": 28.773921806862447,
"learning_rate": 6.655008112505764e-07,
"logits/chosen": -0.37109375,
"logits/rejected": -0.341796875,
"logps/chosen": -0.5,
"logps/rejected": -0.97265625,
"loss": 0.9639,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.006195068359375,
"rewards/margins": 0.055419921875,
"rewards/rejected": -0.061279296875,
"step": 107
},
{
"epoch": 0.23076923076923078,
"grad_norm": 11.104751430129355,
"learning_rate": 6.643613397971118e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.5234375,
"logps/chosen": -0.6796875,
"logps/rejected": -0.72265625,
"loss": 0.9629,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.01544189453125,
"rewards/margins": 0.00067138671875,
"rewards/rejected": 0.0147705078125,
"step": 108
},
{
"epoch": 0.2329059829059829,
"grad_norm": 39.285256444884,
"learning_rate": 6.632043633213024e-07,
"logits/chosen": -0.5703125,
"logits/rejected": -0.55078125,
"logps/chosen": -0.68359375,
"logps/rejected": -0.70703125,
"loss": 1.0209,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0003662109375,
"rewards/margins": 0.01263427734375,
"rewards/rejected": -0.01300048828125,
"step": 109
},
{
"epoch": 0.23504273504273504,
"grad_norm": 14.705470908694043,
"learning_rate": 6.620299462486878e-07,
"logits/chosen": -0.45703125,
"logits/rejected": -0.328125,
"logps/chosen": -0.703125,
"logps/rejected": -0.640625,
"loss": 0.9722,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0147705078125,
"rewards/margins": 0.0093994140625,
"rewards/rejected": 0.00537109375,
"step": 110
},
{
"epoch": 0.23717948717948717,
"grad_norm": 45.43856755416852,
"learning_rate": 6.608381539759773e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.58984375,
"logps/chosen": -1.4609375,
"logps/rejected": -2.671875,
"loss": 0.9984,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0208740234375,
"rewards/margins": 0.0400390625,
"rewards/rejected": -0.060546875,
"step": 111
},
{
"epoch": 0.23931623931623933,
"grad_norm": 40.396012997379934,
"learning_rate": 6.596290528674075e-07,
"logits/chosen": -0.47265625,
"logits/rejected": -0.4921875,
"logps/chosen": -1.75,
"logps/rejected": -0.828125,
"loss": 1.0299,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.034423828125,
"rewards/margins": -0.025146484375,
"rewards/rejected": -0.00933837890625,
"step": 112
},
{
"epoch": 0.24145299145299146,
"grad_norm": 47.741295163046054,
"learning_rate": 6.584027102510475e-07,
"logits/chosen": -0.28515625,
"logits/rejected": -0.275390625,
"logps/chosen": -0.8671875,
"logps/rejected": -0.79296875,
"loss": 1.0162,
"rewards/accuracies": 0.3125,
"rewards/chosen": 0.00927734375,
"rewards/margins": -0.00146484375,
"rewards/rejected": 0.0107421875,
"step": 113
},
{
"epoch": 0.24358974358974358,
"grad_norm": 18.775881941823393,
"learning_rate": 6.57159194415049e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.42578125,
"logps/chosen": -0.5078125,
"logps/rejected": -0.5078125,
"loss": 0.9446,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.00067138671875,
"rewards/margins": 0.007568359375,
"rewards/rejected": -0.00689697265625,
"step": 114
},
{
"epoch": 0.24572649572649571,
"grad_norm": 74.2807011818759,
"learning_rate": 6.558985746038441e-07,
"logits/chosen": -0.3984375,
"logits/rejected": -0.42578125,
"logps/chosen": -1.375,
"logps/rejected": -2.8125,
"loss": 1.047,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.002960205078125,
"rewards/margins": -0.0458984375,
"rewards/rejected": 0.048828125,
"step": 115
},
{
"epoch": 0.24786324786324787,
"grad_norm": 48.801715754922114,
"learning_rate": 6.546209210142898e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.359375,
"logps/chosen": -0.80859375,
"logps/rejected": -1.515625,
"loss": 1.0457,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.03515625,
"rewards/margins": -0.0283203125,
"rewards/rejected": -0.00689697265625,
"step": 116
},
{
"epoch": 0.25,
"grad_norm": 80.28448840086726,
"learning_rate": 6.533263047917585e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.54296875,
"logps/chosen": -1.1484375,
"logps/rejected": -1.46875,
"loss": 1.0875,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.00018310546875,
"rewards/margins": -0.019775390625,
"rewards/rejected": 0.02001953125,
"step": 117
},
{
"epoch": 0.25213675213675213,
"grad_norm": 108.99518172041073,
"learning_rate": 6.520147980261769e-07,
"logits/chosen": -0.30859375,
"logits/rejected": -0.2216796875,
"logps/chosen": -1.75,
"logps/rejected": -0.875,
"loss": 1.1354,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.08447265625,
"rewards/margins": -0.09326171875,
"rewards/rejected": 0.0093994140625,
"step": 118
},
{
"epoch": 0.25427350427350426,
"grad_norm": 18.814589732473536,
"learning_rate": 6.506864737480113e-07,
"logits/chosen": -0.478515625,
"logits/rejected": -0.40625,
"logps/chosen": -0.66796875,
"logps/rejected": -0.6015625,
"loss": 0.948,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.00238037109375,
"rewards/margins": 0.01226806640625,
"rewards/rejected": -0.0098876953125,
"step": 119
},
{
"epoch": 0.2564102564102564,
"grad_norm": 27.390756219900865,
"learning_rate": 6.493414059242011e-07,
"logits/chosen": -0.6328125,
"logits/rejected": -0.625,
"logps/chosen": -0.546875,
"logps/rejected": -0.82421875,
"loss": 0.9696,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.00482177734375,
"rewards/margins": 0.0208740234375,
"rewards/rejected": -0.025634765625,
"step": 120
},
{
"epoch": 0.25854700854700857,
"grad_norm": 23.34685170798232,
"learning_rate": 6.479796694540399e-07,
"logits/chosen": -0.6015625,
"logits/rejected": -0.53125,
"logps/chosen": -0.859375,
"logps/rejected": -0.90625,
"loss": 0.9795,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01123046875,
"rewards/margins": 0.00103759765625,
"rewards/rejected": -0.0123291015625,
"step": 121
},
{
"epoch": 0.2606837606837607,
"grad_norm": 40.69213923113079,
"learning_rate": 6.46601340165005e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.5625,
"logps/chosen": -0.76171875,
"logps/rejected": -0.87890625,
"loss": 0.9565,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.002777099609375,
"rewards/margins": 0.002105712890625,
"rewards/rejected": -0.0048828125,
"step": 122
},
{
"epoch": 0.26282051282051283,
"grad_norm": 74.88918479593436,
"learning_rate": 6.452064948085348e-07,
"logits/chosen": -0.45703125,
"logits/rejected": -0.546875,
"logps/chosen": -0.8046875,
"logps/rejected": -1.765625,
"loss": 1.0458,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00103759765625,
"rewards/margins": -0.040771484375,
"rewards/rejected": 0.03955078125,
"step": 123
},
{
"epoch": 0.26495726495726496,
"grad_norm": 22.294441111678267,
"learning_rate": 6.43795211055755e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.392578125,
"logps/chosen": -0.6171875,
"logps/rejected": -0.73828125,
"loss": 1.0101,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.00872802734375,
"rewards/margins": 0.0096435546875,
"rewards/rejected": -0.018310546875,
"step": 124
},
{
"epoch": 0.2670940170940171,
"grad_norm": 29.375252043352734,
"learning_rate": 6.423675674931533e-07,
"logits/chosen": -0.703125,
"logits/rejected": -0.7109375,
"logps/chosen": -0.5625,
"logps/rejected": -0.90234375,
"loss": 1.0037,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0098876953125,
"rewards/margins": -0.020751953125,
"rewards/rejected": 0.01080322265625,
"step": 125
},
{
"epoch": 0.2692307692307692,
"grad_norm": 37.173942240659194,
"learning_rate": 6.409236436182041e-07,
"logits/chosen": -0.423828125,
"logits/rejected": -0.384765625,
"logps/chosen": -0.7109375,
"logps/rejected": -0.77734375,
"loss": 0.9626,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.01416015625,
"rewards/margins": 0.033447265625,
"rewards/rejected": -0.047607421875,
"step": 126
},
{
"epoch": 0.27136752136752135,
"grad_norm": 29.85823779609622,
"learning_rate": 6.394635198349408e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.458984375,
"logps/chosen": -0.953125,
"logps/rejected": -2.265625,
"loss": 0.9742,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0296630859375,
"rewards/margins": 0.0361328125,
"rewards/rejected": -0.06591796875,
"step": 127
},
{
"epoch": 0.27350427350427353,
"grad_norm": 65.30857432466394,
"learning_rate": 6.37987277449479e-07,
"logits/chosen": -0.3515625,
"logits/rejected": -0.369140625,
"logps/chosen": -0.82421875,
"logps/rejected": -1.4296875,
"loss": 0.9835,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.025634765625,
"rewards/margins": 0.0205078125,
"rewards/rejected": -0.0458984375,
"step": 128
},
{
"epoch": 0.27564102564102566,
"grad_norm": 149.41739915395954,
"learning_rate": 6.364949986654889e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.51953125,
"logps/chosen": -0.8515625,
"logps/rejected": -1.1484375,
"loss": 1.0493,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00823974609375,
"rewards/margins": 0.0205078125,
"rewards/rejected": -0.028564453125,
"step": 129
},
{
"epoch": 0.2777777777777778,
"grad_norm": 37.22138619167702,
"learning_rate": 6.349867665796183e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.462890625,
"logps/chosen": -1.0,
"logps/rejected": -1.953125,
"loss": 0.9562,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.01171875,
"rewards/margins": 0.04345703125,
"rewards/rejected": -0.055419921875,
"step": 130
},
{
"epoch": 0.2799145299145299,
"grad_norm": 28.846364966596266,
"learning_rate": 6.334626651768649e-07,
"logits/chosen": -0.287109375,
"logits/rejected": -0.302734375,
"logps/chosen": -0.6015625,
"logps/rejected": -0.60546875,
"loss": 0.9603,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.004425048828125,
"rewards/margins": 0.019775390625,
"rewards/rejected": -0.024169921875,
"step": 131
},
{
"epoch": 0.28205128205128205,
"grad_norm": 33.885232819627916,
"learning_rate": 6.319227793258992e-07,
"logits/chosen": -0.458984375,
"logits/rejected": -0.4375,
"logps/chosen": -0.53125,
"logps/rejected": -0.5234375,
"loss": 0.968,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.003753662109375,
"rewards/margins": 0.0150146484375,
"rewards/rejected": -0.018798828125,
"step": 132
},
{
"epoch": 0.2841880341880342,
"grad_norm": 25.422079466708336,
"learning_rate": 6.3036719477434e-07,
"logits/chosen": -0.30078125,
"logits/rejected": -0.2421875,
"logps/chosen": -0.890625,
"logps/rejected": -0.7109375,
"loss": 0.9771,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.00311279296875,
"rewards/margins": 0.008544921875,
"rewards/rejected": -0.0115966796875,
"step": 133
},
{
"epoch": 0.2863247863247863,
"grad_norm": 18.812454516360713,
"learning_rate": 6.287959981439785e-07,
"logits/chosen": -0.462890625,
"logits/rejected": -0.443359375,
"logps/chosen": -0.90625,
"logps/rejected": -1.1171875,
"loss": 0.9211,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0072021484375,
"rewards/margins": 0.05712890625,
"rewards/rejected": -0.064453125,
"step": 134
},
{
"epoch": 0.28846153846153844,
"grad_norm": 14.372906903551076,
"learning_rate": 6.272092769259549e-07,
"logits/chosen": -0.65625,
"logits/rejected": -0.5546875,
"logps/chosen": -0.65625,
"logps/rejected": -0.65625,
"loss": 0.9391,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0064697265625,
"rewards/margins": 0.006744384765625,
"rewards/rejected": -0.01318359375,
"step": 135
},
{
"epoch": 0.2905982905982906,
"grad_norm": 18.32033757466638,
"learning_rate": 6.256071194758872e-07,
"logits/chosen": -0.62109375,
"logits/rejected": -0.625,
"logps/chosen": -1.65625,
"logps/rejected": -2.734375,
"loss": 0.9296,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0137939453125,
"rewards/margins": 0.03173828125,
"rewards/rejected": -0.01806640625,
"step": 136
},
{
"epoch": 0.29273504273504275,
"grad_norm": 31.82635270525967,
"learning_rate": 6.239896150089505e-07,
"logits/chosen": -0.6328125,
"logits/rejected": -0.5703125,
"logps/chosen": -0.71875,
"logps/rejected": -1.7890625,
"loss": 0.989,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.017333984375,
"rewards/margins": 0.04443359375,
"rewards/rejected": -0.0615234375,
"step": 137
},
{
"epoch": 0.2948717948717949,
"grad_norm": 21.238442566798064,
"learning_rate": 6.223568535949091e-07,
"logits/chosen": -0.4296875,
"logits/rejected": -0.4140625,
"logps/chosen": -0.875,
"logps/rejected": -0.99609375,
"loss": 0.9547,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0115966796875,
"rewards/margins": 0.038330078125,
"rewards/rejected": -0.0267333984375,
"step": 138
},
{
"epoch": 0.297008547008547,
"grad_norm": 51.91085456898058,
"learning_rate": 6.207089261531013e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.58984375,
"logps/chosen": -1.9609375,
"logps/rejected": -1.546875,
"loss": 1.0664,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.034912109375,
"rewards/margins": 0.037109375,
"rewards/rejected": -0.00213623046875,
"step": 139
},
{
"epoch": 0.29914529914529914,
"grad_norm": 41.597766022453776,
"learning_rate": 6.19045924447377e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.4609375,
"logps/chosen": -1.2421875,
"logps/rejected": -1.921875,
"loss": 0.968,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.037353515625,
"rewards/margins": -0.00286865234375,
"rewards/rejected": -0.03466796875,
"step": 140
},
{
"epoch": 0.30128205128205127,
"grad_norm": 46.81449342159252,
"learning_rate": 6.173679410809868e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.484375,
"logps/chosen": -1.0546875,
"logps/rejected": -0.58984375,
"loss": 1.0081,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.0732421875,
"rewards/margins": 0.08203125,
"rewards/rejected": -0.00982666015625,
"step": 141
},
{
"epoch": 0.3034188034188034,
"grad_norm": 67.6599719874742,
"learning_rate": 6.156750694914267e-07,
"logits/chosen": -0.220703125,
"logits/rejected": -0.27734375,
"logps/chosen": -0.70703125,
"logps/rejected": -0.7734375,
"loss": 1.103,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.022705078125,
"rewards/margins": -0.00018310546875,
"rewards/rejected": -0.0224609375,
"step": 142
},
{
"epoch": 0.3055555555555556,
"grad_norm": 63.19976246483412,
"learning_rate": 6.139674039452337e-07,
"logits/chosen": -0.341796875,
"logits/rejected": -0.330078125,
"logps/chosen": -0.765625,
"logps/rejected": -0.88671875,
"loss": 1.0105,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00946044921875,
"rewards/margins": 0.0125732421875,
"rewards/rejected": -0.02197265625,
"step": 143
},
{
"epoch": 0.3076923076923077,
"grad_norm": 45.56384975545583,
"learning_rate": 6.12245039532738e-07,
"logits/chosen": -0.439453125,
"logits/rejected": -0.455078125,
"logps/chosen": -0.515625,
"logps/rejected": -0.61328125,
"loss": 1.0502,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0072021484375,
"rewards/margins": 0.008056640625,
"rewards/rejected": -0.0008544921875,
"step": 144
},
{
"epoch": 0.30982905982905984,
"grad_norm": 10.977460072453223,
"learning_rate": 6.105080721627672e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.6875,
"logps/chosen": -0.6953125,
"logps/rejected": -0.71875,
"loss": 0.9111,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01806640625,
"rewards/margins": 0.046630859375,
"rewards/rejected": -0.064453125,
"step": 145
},
{
"epoch": 0.31196581196581197,
"grad_norm": 36.093300799361764,
"learning_rate": 6.087565985573058e-07,
"logits/chosen": -0.427734375,
"logits/rejected": -0.375,
"logps/chosen": -1.28125,
"logps/rejected": -2.5625,
"loss": 0.9296,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.009765625,
"rewards/margins": 0.10986328125,
"rewards/rejected": -0.10009765625,
"step": 146
},
{
"epoch": 0.3141025641025641,
"grad_norm": 24.555454331128033,
"learning_rate": 6.069907162461091e-07,
"logits/chosen": -0.384765625,
"logits/rejected": -0.3203125,
"logps/chosen": -0.88671875,
"logps/rejected": -1.703125,
"loss": 0.925,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.011474609375,
"rewards/margins": 0.01226806640625,
"rewards/rejected": -0.02392578125,
"step": 147
},
{
"epoch": 0.3162393162393162,
"grad_norm": 17.962639768725776,
"learning_rate": 6.052105235612728e-07,
"logits/chosen": -0.5546875,
"logits/rejected": -0.482421875,
"logps/chosen": -0.6640625,
"logps/rejected": -0.76171875,
"loss": 0.9261,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01116943359375,
"rewards/margins": 0.034912109375,
"rewards/rejected": -0.04638671875,
"step": 148
},
{
"epoch": 0.31837606837606836,
"grad_norm": 88.52202221948862,
"learning_rate": 6.03416119631757e-07,
"logits/chosen": -0.62890625,
"logits/rejected": -0.55078125,
"logps/chosen": -0.671875,
"logps/rejected": -0.7578125,
"loss": 1.1044,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0087890625,
"rewards/margins": 0.04638671875,
"rewards/rejected": -0.05517578125,
"step": 149
},
{
"epoch": 0.32051282051282054,
"grad_norm": 72.8710811659705,
"learning_rate": 6.016076043778666e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.515625,
"logps/chosen": -1.03125,
"logps/rejected": -0.8046875,
"loss": 1.0563,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.000885009765625,
"rewards/margins": 0.030029296875,
"rewards/rejected": -0.029052734375,
"step": 150
},
{
"epoch": 0.32264957264957267,
"grad_norm": 41.317922231176695,
"learning_rate": 5.99785078505687e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.484375,
"logps/chosen": -0.84375,
"logps/rejected": -0.89453125,
"loss": 0.9644,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01446533203125,
"rewards/margins": 0.031494140625,
"rewards/rejected": -0.0458984375,
"step": 151
},
{
"epoch": 0.3247863247863248,
"grad_norm": 91.94873476738564,
"learning_rate": 5.979486435014762e-07,
"logits/chosen": -0.74609375,
"logits/rejected": -0.66015625,
"logps/chosen": -0.6484375,
"logps/rejected": -0.76953125,
"loss": 1.0979,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.03857421875,
"rewards/margins": 0.024658203125,
"rewards/rejected": -0.0634765625,
"step": 152
},
{
"epoch": 0.3269230769230769,
"grad_norm": 25.837369633853438,
"learning_rate": 5.960984016260143e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.416015625,
"logps/chosen": -0.890625,
"logps/rejected": -1.453125,
"loss": 0.9852,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.019775390625,
"rewards/margins": 0.01904296875,
"rewards/rejected": -0.038818359375,
"step": 153
},
{
"epoch": 0.32905982905982906,
"grad_norm": 39.18517654429065,
"learning_rate": 5.942344559089085e-07,
"logits/chosen": -0.34765625,
"logits/rejected": -0.34375,
"logps/chosen": -1.3671875,
"logps/rejected": -1.0,
"loss": 1.0094,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.040771484375,
"rewards/margins": -0.005279541015625,
"rewards/rejected": -0.03564453125,
"step": 154
},
{
"epoch": 0.3311965811965812,
"grad_norm": 35.50412006815021,
"learning_rate": 5.923569101428565e-07,
"logits/chosen": -0.341796875,
"logits/rejected": -0.275390625,
"logps/chosen": -1.0703125,
"logps/rejected": -1.09375,
"loss": 0.9484,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0291748046875,
"rewards/margins": 0.040283203125,
"rewards/rejected": -0.0693359375,
"step": 155
},
{
"epoch": 0.3333333333333333,
"grad_norm": 22.773657057689636,
"learning_rate": 5.904658688778659e-07,
"logits/chosen": -0.4921875,
"logits/rejected": -0.5546875,
"logps/chosen": -0.6640625,
"logps/rejected": -0.765625,
"loss": 0.9808,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0208740234375,
"rewards/margins": 0.0145263671875,
"rewards/rejected": -0.035400390625,
"step": 156
},
{
"epoch": 0.33547008547008544,
"grad_norm": 52.9050755178356,
"learning_rate": 5.885614374154336e-07,
"logits/chosen": -0.396484375,
"logits/rejected": -0.30078125,
"logps/chosen": -0.4453125,
"logps/rejected": -0.45703125,
"loss": 0.9656,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00335693359375,
"rewards/margins": 0.01165771484375,
"rewards/rejected": -0.0150146484375,
"step": 157
},
{
"epoch": 0.33760683760683763,
"grad_norm": 86.68634256496178,
"learning_rate": 5.866437218026815e-07,
"logits/chosen": -0.54296875,
"logits/rejected": -0.4921875,
"logps/chosen": -0.5625,
"logps/rejected": -0.69921875,
"loss": 1.061,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.004150390625,
"rewards/margins": 0.0101318359375,
"rewards/rejected": -0.0142822265625,
"step": 158
},
{
"epoch": 0.33974358974358976,
"grad_norm": 40.533651979323515,
"learning_rate": 5.847128288264513e-07,
"logits/chosen": -0.435546875,
"logits/rejected": -0.396484375,
"logps/chosen": -0.828125,
"logps/rejected": -0.9921875,
"loss": 0.9945,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.01055908203125,
"rewards/margins": 0.01904296875,
"rewards/rejected": -0.0296630859375,
"step": 159
},
{
"epoch": 0.3418803418803419,
"grad_norm": 45.12294203204509,
"learning_rate": 5.827688660073584e-07,
"logits/chosen": -0.46875,
"logits/rejected": -0.46875,
"logps/chosen": -0.65234375,
"logps/rejected": -0.609375,
"loss": 1.024,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.002410888671875,
"rewards/margins": 0.005523681640625,
"rewards/rejected": -0.0079345703125,
"step": 160
},
{
"epoch": 0.344017094017094,
"grad_norm": 26.553957066037817,
"learning_rate": 5.808119415938044e-07,
"logits/chosen": -0.64453125,
"logits/rejected": -0.5625,
"logps/chosen": -0.87890625,
"logps/rejected": -1.1484375,
"loss": 0.962,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.02978515625,
"rewards/margins": 0.0712890625,
"rewards/rejected": -0.10107421875,
"step": 161
},
{
"epoch": 0.34615384615384615,
"grad_norm": 28.26727251383747,
"learning_rate": 5.788421645559498e-07,
"logits/chosen": -0.1748046875,
"logits/rejected": -0.2314453125,
"logps/chosen": -0.69140625,
"logps/rejected": -0.7109375,
"loss": 0.9536,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.011474609375,
"rewards/margins": 0.0181884765625,
"rewards/rejected": -0.0296630859375,
"step": 162
},
{
"epoch": 0.3482905982905983,
"grad_norm": 34.150127616766795,
"learning_rate": 5.768596445796454e-07,
"logits/chosen": -0.28125,
"logits/rejected": -0.244140625,
"logps/chosen": -1.9609375,
"logps/rejected": -0.73046875,
"loss": 0.9164,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.0654296875,
"rewards/margins": 0.1259765625,
"rewards/rejected": -0.060302734375,
"step": 163
},
{
"epoch": 0.3504273504273504,
"grad_norm": 55.99080770473473,
"learning_rate": 5.748644920603248e-07,
"logits/chosen": -0.53515625,
"logits/rejected": -0.515625,
"logps/chosen": -0.93359375,
"logps/rejected": -1.5625,
"loss": 0.9708,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0037841796875,
"rewards/margins": 0.06640625,
"rewards/rejected": -0.06298828125,
"step": 164
},
{
"epoch": 0.3525641025641026,
"grad_norm": 88.95253575474342,
"learning_rate": 5.728568180968577e-07,
"logits/chosen": -0.734375,
"logits/rejected": -0.6953125,
"logps/chosen": -1.0078125,
"logps/rejected": -1.2421875,
"loss": 1.0817,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0113525390625,
"rewards/margins": 0.026611328125,
"rewards/rejected": -0.0380859375,
"step": 165
},
{
"epoch": 0.3547008547008547,
"grad_norm": 42.9833424237718,
"learning_rate": 5.708367344853625e-07,
"logits/chosen": -0.27734375,
"logits/rejected": -0.21875,
"logps/chosen": -1.671875,
"logps/rejected": -1.640625,
"loss": 0.9656,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.01336669921875,
"rewards/margins": -0.01092529296875,
"rewards/rejected": -0.00244140625,
"step": 166
},
{
"epoch": 0.35683760683760685,
"grad_norm": 28.891155958443296,
"learning_rate": 5.688043537129817e-07,
"logits/chosen": -0.3828125,
"logits/rejected": -0.44140625,
"logps/chosen": -1.65625,
"logps/rejected": -1.6796875,
"loss": 0.9359,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0634765625,
"rewards/margins": 0.01953125,
"rewards/rejected": 0.04345703125,
"step": 167
},
{
"epoch": 0.358974358974359,
"grad_norm": 41.10465613204891,
"learning_rate": 5.667597889516172e-07,
"logits/chosen": -0.369140625,
"logits/rejected": -0.20703125,
"logps/chosen": -1.6640625,
"logps/rejected": -1.1015625,
"loss": 1.0015,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0108642578125,
"rewards/margins": 0.06005859375,
"rewards/rejected": -0.07080078125,
"step": 168
},
{
"epoch": 0.3611111111111111,
"grad_norm": 83.41369039535908,
"learning_rate": 5.647031540516297e-07,
"logits/chosen": -0.404296875,
"logits/rejected": -0.45703125,
"logps/chosen": -0.80859375,
"logps/rejected": -0.69921875,
"loss": 1.1058,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0162353515625,
"rewards/margins": 0.0155029296875,
"rewards/rejected": -0.03173828125,
"step": 169
},
{
"epoch": 0.36324786324786323,
"grad_norm": 26.2199862579392,
"learning_rate": 5.626345635354979e-07,
"logits/chosen": -0.6015625,
"logits/rejected": -0.578125,
"logps/chosen": -0.87890625,
"logps/rejected": -1.125,
"loss": 0.9514,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00225830078125,
"rewards/margins": 0.008056640625,
"rewards/rejected": -0.01031494140625,
"step": 170
},
{
"epoch": 0.36538461538461536,
"grad_norm": 205.59104124832635,
"learning_rate": 5.605541325914418e-07,
"logits/chosen": -0.462890625,
"logits/rejected": -0.462890625,
"logps/chosen": -0.6875,
"logps/rejected": -0.6015625,
"loss": 1.0734,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.005462646484375,
"rewards/margins": 0.0250244140625,
"rewards/rejected": -0.030517578125,
"step": 171
},
{
"epoch": 0.36752136752136755,
"grad_norm": 38.5807267512022,
"learning_rate": 5.584619770670089e-07,
"logits/chosen": -0.2373046875,
"logits/rejected": -0.41015625,
"logps/chosen": -1.78125,
"logps/rejected": -1.703125,
"loss": 0.9011,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.022705078125,
"rewards/margins": -0.004119873046875,
"rewards/rejected": -0.0185546875,
"step": 172
},
{
"epoch": 0.3696581196581197,
"grad_norm": 56.16265649435291,
"learning_rate": 5.563582134626227e-07,
"logits/chosen": -0.5546875,
"logits/rejected": -0.52734375,
"logps/chosen": -0.87109375,
"logps/rejected": -1.828125,
"loss": 0.9905,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.009765625,
"rewards/margins": 0.048828125,
"rewards/rejected": -0.038818359375,
"step": 173
},
{
"epoch": 0.3717948717948718,
"grad_norm": 33.662597431065436,
"learning_rate": 5.542429589250953e-07,
"logits/chosen": -0.46484375,
"logits/rejected": -0.4375,
"logps/chosen": -1.9375,
"logps/rejected": -2.125,
"loss": 0.9218,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.01318359375,
"rewards/margins": 0.041015625,
"rewards/rejected": -0.02783203125,
"step": 174
},
{
"epoch": 0.37393162393162394,
"grad_norm": 39.14271257757384,
"learning_rate": 5.52116331241105e-07,
"logits/chosen": -0.46875,
"logits/rejected": -0.37890625,
"logps/chosen": -0.9609375,
"logps/rejected": -1.34375,
"loss": 0.9592,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.01220703125,
"rewards/margins": -0.03564453125,
"rewards/rejected": 0.0235595703125,
"step": 175
},
{
"epoch": 0.37606837606837606,
"grad_norm": 19.69124876410317,
"learning_rate": 5.499784488306366e-07,
"logits/chosen": -0.330078125,
"logits/rejected": -0.265625,
"logps/chosen": -1.34375,
"logps/rejected": -0.73828125,
"loss": 0.9491,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0093994140625,
"rewards/margins": 0.047119140625,
"rewards/rejected": -0.056640625,
"step": 176
},
{
"epoch": 0.3782051282051282,
"grad_norm": 19.592787918798177,
"learning_rate": 5.47829430740388e-07,
"logits/chosen": -0.47265625,
"logits/rejected": -0.41015625,
"logps/chosen": -0.8671875,
"logps/rejected": -1.375,
"loss": 0.9664,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.000213623046875,
"rewards/margins": 0.04931640625,
"rewards/rejected": -0.049072265625,
"step": 177
},
{
"epoch": 0.3803418803418803,
"grad_norm": 41.26617381984967,
"learning_rate": 5.456693966371404e-07,
"logits/chosen": -0.4921875,
"logits/rejected": -0.5625,
"logps/chosen": -0.9375,
"logps/rejected": -1.3203125,
"loss": 0.9364,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0040283203125,
"rewards/margins": 0.04296875,
"rewards/rejected": -0.046875,
"step": 178
},
{
"epoch": 0.38247863247863245,
"grad_norm": 47.31641000734864,
"learning_rate": 5.43498466801095e-07,
"logits/chosen": -0.392578125,
"logits/rejected": -0.337890625,
"logps/chosen": -0.6484375,
"logps/rejected": -1.0625,
"loss": 0.9885,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.0037841796875,
"rewards/margins": 0.055419921875,
"rewards/rejected": -0.0517578125,
"step": 179
},
{
"epoch": 0.38461538461538464,
"grad_norm": 31.34178974992674,
"learning_rate": 5.413167621191755e-07,
"logits/chosen": -0.318359375,
"logits/rejected": -0.306640625,
"logps/chosen": -1.2421875,
"logps/rejected": -1.328125,
"loss": 0.9866,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03173828125,
"rewards/margins": 0.0196533203125,
"rewards/rejected": -0.05126953125,
"step": 180
},
{
"epoch": 0.38675213675213677,
"grad_norm": 38.69009356294788,
"learning_rate": 5.391244040782964e-07,
"logits/chosen": -0.71875,
"logits/rejected": -0.7578125,
"logps/chosen": -2.34375,
"logps/rejected": -2.4375,
"loss": 0.9636,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0634765625,
"rewards/margins": 0.0201416015625,
"rewards/rejected": -0.08349609375,
"step": 181
},
{
"epoch": 0.3888888888888889,
"grad_norm": 66.88208286581656,
"learning_rate": 5.369215147585981e-07,
"logits/chosen": -0.76171875,
"logits/rejected": -0.609375,
"logps/chosen": -1.3125,
"logps/rejected": -0.84375,
"loss": 1.0153,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0220947265625,
"rewards/margins": -0.00927734375,
"rewards/rejected": -0.0128173828125,
"step": 182
},
{
"epoch": 0.391025641025641,
"grad_norm": 19.918239619320428,
"learning_rate": 5.347082168266491e-07,
"logits/chosen": -0.44140625,
"logits/rejected": -0.466796875,
"logps/chosen": -0.6015625,
"logps/rejected": -1.640625,
"loss": 0.9355,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.003173828125,
"rewards/margins": 0.017333984375,
"rewards/rejected": -0.01409912109375,
"step": 183
},
{
"epoch": 0.39316239316239315,
"grad_norm": 54.06880890834597,
"learning_rate": 5.324846335286148e-07,
"logits/chosen": -0.3984375,
"logits/rejected": -0.50390625,
"logps/chosen": -0.91015625,
"logps/rejected": -0.79296875,
"loss": 0.9766,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.04443359375,
"rewards/margins": 0.0167236328125,
"rewards/rejected": -0.0615234375,
"step": 184
},
{
"epoch": 0.3952991452991453,
"grad_norm": 60.553261993821486,
"learning_rate": 5.302508886833953e-07,
"logits/chosen": -0.2314453125,
"logits/rejected": -0.19921875,
"logps/chosen": -0.7265625,
"logps/rejected": -0.78515625,
"loss": 0.9797,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.001129150390625,
"rewards/margins": 0.060546875,
"rewards/rejected": -0.0615234375,
"step": 185
},
{
"epoch": 0.3974358974358974,
"grad_norm": 46.18394100489199,
"learning_rate": 5.280071066757304e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.48046875,
"logps/chosen": -1.171875,
"logps/rejected": -1.359375,
"loss": 0.9438,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01043701171875,
"rewards/margins": 0.01708984375,
"rewards/rejected": -0.0274658203125,
"step": 186
},
{
"epoch": 0.3995726495726496,
"grad_norm": 26.559128159512724,
"learning_rate": 5.25753412449273e-07,
"logits/chosen": -0.337890625,
"logits/rejected": -0.271484375,
"logps/chosen": -0.69921875,
"logps/rejected": -0.546875,
"loss": 0.9596,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.01123046875,
"rewards/margins": 0.023193359375,
"rewards/rejected": -0.011962890625,
"step": 187
},
{
"epoch": 0.4017094017094017,
"grad_norm": 77.25370785550327,
"learning_rate": 5.234899314996325e-07,
"logits/chosen": -0.4296875,
"logits/rejected": -0.369140625,
"logps/chosen": -0.671875,
"logps/rejected": -0.7734375,
"loss": 1.052,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.002410888671875,
"rewards/margins": 0.035400390625,
"rewards/rejected": -0.037841796875,
"step": 188
},
{
"epoch": 0.40384615384615385,
"grad_norm": 79.5046519078694,
"learning_rate": 5.212167898673855e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.51171875,
"logps/chosen": -1.0078125,
"logps/rejected": -0.92578125,
"loss": 1.0046,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0224609375,
"rewards/margins": 0.030029296875,
"rewards/rejected": -0.052490234375,
"step": 189
},
{
"epoch": 0.405982905982906,
"grad_norm": 61.583682807858615,
"learning_rate": 5.189341141310579e-07,
"logits/chosen": -0.6640625,
"logits/rejected": -0.5703125,
"logps/chosen": -1.6015625,
"logps/rejected": -1.53125,
"loss": 1.006,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.013916015625,
"rewards/margins": 0.00494384765625,
"rewards/rejected": 0.0089111328125,
"step": 190
},
{
"epoch": 0.4081196581196581,
"grad_norm": 15.327322287722419,
"learning_rate": 5.166420314000771e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.52734375,
"logps/chosen": -0.890625,
"logps/rejected": -0.72265625,
"loss": 0.9421,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.025390625,
"rewards/margins": 0.003662109375,
"rewards/rejected": -0.0289306640625,
"step": 191
},
{
"epoch": 0.41025641025641024,
"grad_norm": 54.74166945437457,
"learning_rate": 5.143406693076928e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.453125,
"logps/chosen": -0.96875,
"logps/rejected": -1.21875,
"loss": 0.9989,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.013916015625,
"rewards/margins": 0.0242919921875,
"rewards/rejected": -0.038330078125,
"step": 192
},
{
"epoch": 0.41239316239316237,
"grad_norm": 62.61311550998391,
"learning_rate": 5.120301560038705e-07,
"logits/chosen": -0.384765625,
"logits/rejected": -0.38671875,
"logps/chosen": -0.73828125,
"logps/rejected": -0.71875,
"loss": 1.0123,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.001953125,
"rewards/margins": 0.0201416015625,
"rewards/rejected": -0.0220947265625,
"step": 193
},
{
"epoch": 0.41452991452991456,
"grad_norm": 41.1464773305956,
"learning_rate": 5.097106201481553e-07,
"logits/chosen": -0.765625,
"logits/rejected": -0.69921875,
"logps/chosen": -1.0625,
"logps/rejected": -0.9140625,
"loss": 0.964,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.00640869140625,
"rewards/margins": 0.0130615234375,
"rewards/rejected": -0.006561279296875,
"step": 194
},
{
"epoch": 0.4166666666666667,
"grad_norm": 18.43312573869149,
"learning_rate": 5.073821909025078e-07,
"logits/chosen": -0.77734375,
"logits/rejected": -0.71484375,
"logps/chosen": -0.6875,
"logps/rejected": -1.078125,
"loss": 0.9498,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.014404296875,
"rewards/margins": 0.050537109375,
"rewards/rejected": -0.06494140625,
"step": 195
},
{
"epoch": 0.4188034188034188,
"grad_norm": 71.4495599627426,
"learning_rate": 5.050449979241119e-07,
"logits/chosen": -0.494140625,
"logits/rejected": -0.4921875,
"logps/chosen": -0.71875,
"logps/rejected": -1.2265625,
"loss": 1.0096,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00286865234375,
"rewards/margins": 0.03662109375,
"rewards/rejected": -0.03955078125,
"step": 196
},
{
"epoch": 0.42094017094017094,
"grad_norm": 19.446033419100402,
"learning_rate": 5.026991713581543e-07,
"logits/chosen": -0.27734375,
"logits/rejected": -0.33984375,
"logps/chosen": -0.6171875,
"logps/rejected": -0.72265625,
"loss": 0.9258,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.015625,
"rewards/margins": 0.03076171875,
"rewards/rejected": -0.046142578125,
"step": 197
},
{
"epoch": 0.4230769230769231,
"grad_norm": 24.647809279268525,
"learning_rate": 5.003448418305781e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.546875,
"logps/chosen": -0.91796875,
"logps/rejected": -1.6875,
"loss": 0.8944,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.056396484375,
"rewards/margins": 0.0712890625,
"rewards/rejected": -0.1279296875,
"step": 198
},
{
"epoch": 0.4252136752136752,
"grad_norm": 29.9361578160495,
"learning_rate": 4.979821404408084e-07,
"logits/chosen": -0.55078125,
"logits/rejected": -0.52734375,
"logps/chosen": -0.6328125,
"logps/rejected": -0.9140625,
"loss": 0.9269,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0133056640625,
"rewards/margins": 0.0537109375,
"rewards/rejected": -0.06689453125,
"step": 199
},
{
"epoch": 0.42735042735042733,
"grad_norm": 29.028111116310793,
"learning_rate": 4.956111987544529e-07,
"logits/chosen": -0.39453125,
"logits/rejected": -0.5,
"logps/chosen": -1.8515625,
"logps/rejected": -1.890625,
"loss": 0.9635,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.02392578125,
"rewards/margins": 0.05078125,
"rewards/rejected": -0.0751953125,
"step": 200
},
{
"epoch": 0.42735042735042733,
"eval_logits/chosen": -0.56640625,
"eval_logits/rejected": -0.5625,
"eval_logps/chosen": -1.03125,
"eval_logps/rejected": -1.09375,
"eval_loss": 0.9727697968482971,
"eval_rewards/accuracies": 0.6774193644523621,
"eval_rewards/chosen": -0.0037994384765625,
"eval_rewards/margins": 0.046142578125,
"eval_rewards/rejected": -0.050048828125,
"eval_runtime": 104.9199,
"eval_samples_per_second": 18.69,
"eval_steps_per_second": 0.591,
"step": 200
},
{
"epoch": 0.42948717948717946,
"grad_norm": 19.81681568724776,
"learning_rate": 4.932321487959748e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.494140625,
"logps/chosen": -0.63671875,
"logps/rejected": -0.671875,
"loss": 0.9302,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.010498046875,
"rewards/margins": 0.0150146484375,
"rewards/rejected": -0.004486083984375,
"step": 201
},
{
"epoch": 0.43162393162393164,
"grad_norm": 63.3680908382062,
"learning_rate": 4.908451230413419e-07,
"logits/chosen": -0.71484375,
"logits/rejected": -0.65625,
"logps/chosen": -0.59765625,
"logps/rejected": -0.57421875,
"loss": 1.0619,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.004058837890625,
"rewards/margins": 0.0101318359375,
"rewards/rejected": -0.006072998046875,
"step": 202
},
{
"epoch": 0.4337606837606838,
"grad_norm": 27.94217013723975,
"learning_rate": 4.884502544106492e-07,
"logits/chosen": -0.46875,
"logits/rejected": -0.5625,
"logps/chosen": -0.796875,
"logps/rejected": -0.734375,
"loss": 0.9215,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01318359375,
"rewards/margins": 0.005615234375,
"rewards/rejected": -0.018798828125,
"step": 203
},
{
"epoch": 0.4358974358974359,
"grad_norm": 40.54930795227308,
"learning_rate": 4.860476762607174e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.45703125,
"logps/chosen": -0.5,
"logps/rejected": -0.51171875,
"loss": 0.9994,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.003936767578125,
"rewards/margins": 0.014404296875,
"rewards/rejected": -0.01043701171875,
"step": 204
},
{
"epoch": 0.43803418803418803,
"grad_norm": 39.57095333313773,
"learning_rate": 4.836375223776678e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.53125,
"logps/chosen": -1.71875,
"logps/rejected": -1.6171875,
"loss": 0.9875,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0859375,
"rewards/margins": 0.000701904296875,
"rewards/rejected": 0.0849609375,
"step": 205
},
{
"epoch": 0.44017094017094016,
"grad_norm": 18.511716887685402,
"learning_rate": 4.812199269694711e-07,
"logits/chosen": -0.62109375,
"logits/rejected": -0.57421875,
"logps/chosen": -0.51953125,
"logps/rejected": -0.53515625,
"loss": 0.9124,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0010986328125,
"rewards/margins": 0.006103515625,
"rewards/rejected": -0.0072021484375,
"step": 206
},
{
"epoch": 0.4423076923076923,
"grad_norm": 57.69428013444241,
"learning_rate": 4.787950246584753e-07,
"logits/chosen": -0.76171875,
"logits/rejected": -0.76171875,
"logps/chosen": -0.4921875,
"logps/rejected": -0.52734375,
"loss": 0.9723,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00738525390625,
"rewards/margins": 0.01373291015625,
"rewards/rejected": -0.0211181640625,
"step": 207
},
{
"epoch": 0.4444444444444444,
"grad_norm": 75.86709550039322,
"learning_rate": 4.7636295047390865e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.56640625,
"logps/chosen": -2.84375,
"logps/rejected": -2.0,
"loss": 1.0264,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.1962890625,
"rewards/margins": 0.177734375,
"rewards/rejected": 0.0189208984375,
"step": 208
},
{
"epoch": 0.4465811965811966,
"grad_norm": 24.333855810540772,
"learning_rate": 4.7392383984436104e-07,
"logits/chosen": -0.38671875,
"logits/rejected": -0.357421875,
"logps/chosen": -0.765625,
"logps/rejected": -1.6875,
"loss": 0.8757,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.001495361328125,
"rewards/margins": 0.11474609375,
"rewards/rejected": -0.11328125,
"step": 209
},
{
"epoch": 0.44871794871794873,
"grad_norm": 44.84857230941575,
"learning_rate": 4.7147782859024246e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.65234375,
"logps/chosen": -1.5625,
"logps/rejected": -1.484375,
"loss": 1.0098,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01019287109375,
"rewards/margins": 0.00518798828125,
"rewards/rejected": 0.00506591796875,
"step": 210
},
{
"epoch": 0.45085470085470086,
"grad_norm": 50.30855337720552,
"learning_rate": 4.6902505291622014e-07,
"logits/chosen": -0.7265625,
"logits/rejected": -0.73828125,
"logps/chosen": -0.71484375,
"logps/rejected": -0.8046875,
"loss": 0.9634,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02978515625,
"rewards/margins": 0.014404296875,
"rewards/rejected": -0.04443359375,
"step": 211
},
{
"epoch": 0.452991452991453,
"grad_norm": 55.94011111026385,
"learning_rate": 4.66565649403634e-07,
"logits/chosen": -0.49609375,
"logits/rejected": -0.5,
"logps/chosen": -1.3125,
"logps/rejected": -1.78125,
"loss": 0.9763,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.00958251953125,
"rewards/margins": 0.0419921875,
"rewards/rejected": -0.032470703125,
"step": 212
},
{
"epoch": 0.4551282051282051,
"grad_norm": 84.25142664538966,
"learning_rate": 4.6409975500289086e-07,
"logits/chosen": -0.484375,
"logits/rejected": -0.46875,
"logps/chosen": -0.66796875,
"logps/rejected": -0.703125,
"loss": 1.0564,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00836181640625,
"rewards/margins": 0.0206298828125,
"rewards/rejected": -0.0289306640625,
"step": 213
},
{
"epoch": 0.45726495726495725,
"grad_norm": 20.75895587246269,
"learning_rate": 4.6162750702583916e-07,
"logits/chosen": -0.40625,
"logits/rejected": -0.40625,
"logps/chosen": -0.62890625,
"logps/rejected": -0.63671875,
"loss": 0.9518,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.0025634765625,
"rewards/margins": 0.03271484375,
"rewards/rejected": -0.030029296875,
"step": 214
},
{
"epoch": 0.4594017094017094,
"grad_norm": 41.23220742312626,
"learning_rate": 4.591490431381221e-07,
"logits/chosen": -0.466796875,
"logits/rejected": -0.50390625,
"logps/chosen": -0.609375,
"logps/rejected": -0.75390625,
"loss": 1.0527,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.016845703125,
"rewards/margins": 0.0010986328125,
"rewards/rejected": -0.01806640625,
"step": 215
},
{
"epoch": 0.46153846153846156,
"grad_norm": 44.974264192003716,
"learning_rate": 4.5666450135151236e-07,
"logits/chosen": -0.77734375,
"logits/rejected": -0.8359375,
"logps/chosen": -0.73828125,
"logps/rejected": -1.375,
"loss": 0.9866,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0224609375,
"rewards/margins": 0.051025390625,
"rewards/rejected": -0.07373046875,
"step": 216
},
{
"epoch": 0.4636752136752137,
"grad_norm": 29.27367889944009,
"learning_rate": 4.541740200162266e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.58984375,
"logps/chosen": -2.125,
"logps/rejected": -1.359375,
"loss": 0.939,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0771484375,
"rewards/margins": 0.036865234375,
"rewards/rejected": -0.11376953125,
"step": 217
},
{
"epoch": 0.4658119658119658,
"grad_norm": 31.584259547296156,
"learning_rate": 4.5167773781322175e-07,
"logits/chosen": -0.6015625,
"logits/rejected": -0.431640625,
"logps/chosen": -2.109375,
"logps/rejected": -2.765625,
"loss": 0.9585,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0279541015625,
"rewards/margins": 0.08349609375,
"rewards/rejected": -0.111328125,
"step": 218
},
{
"epoch": 0.46794871794871795,
"grad_norm": 42.48284320276936,
"learning_rate": 4.4917579374647265e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.51953125,
"logps/chosen": -0.671875,
"logps/rejected": -0.68359375,
"loss": 0.9618,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0155029296875,
"rewards/margins": 0.0089111328125,
"rewards/rejected": -0.0244140625,
"step": 219
},
{
"epoch": 0.4700854700854701,
"grad_norm": 18.23118783681457,
"learning_rate": 4.466683271352315e-07,
"logits/chosen": -0.49609375,
"logits/rejected": -0.345703125,
"logps/chosen": -0.734375,
"logps/rejected": -0.79296875,
"loss": 0.9417,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0174560546875,
"rewards/margins": 0.019287109375,
"rewards/rejected": -0.036865234375,
"step": 220
},
{
"epoch": 0.4722222222222222,
"grad_norm": 26.2463503471584,
"learning_rate": 4.4415547760627006e-07,
"logits/chosen": -0.271484375,
"logits/rejected": -0.330078125,
"logps/chosen": -1.0078125,
"logps/rejected": -1.25,
"loss": 0.9237,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03955078125,
"rewards/margins": 0.052001953125,
"rewards/rejected": -0.09130859375,
"step": 221
},
{
"epoch": 0.47435897435897434,
"grad_norm": 34.08188430294409,
"learning_rate": 4.416373850861047e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.5078125,
"logps/chosen": -1.7578125,
"logps/rejected": -1.3359375,
"loss": 1.0026,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0211181640625,
"rewards/margins": -0.0009765625,
"rewards/rejected": -0.0201416015625,
"step": 222
},
{
"epoch": 0.47649572649572647,
"grad_norm": 21.569342340756666,
"learning_rate": 4.391141897932045e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.640625,
"logps/chosen": -0.56640625,
"logps/rejected": -0.486328125,
"loss": 0.9335,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.01531982421875,
"rewards/margins": 0.0142822265625,
"rewards/rejected": -0.029541015625,
"step": 223
},
{
"epoch": 0.47863247863247865,
"grad_norm": 30.256607772494906,
"learning_rate": 4.3658603223018377e-07,
"logits/chosen": -0.40234375,
"logits/rejected": -0.40625,
"logps/chosen": -0.99609375,
"logps/rejected": -1.1640625,
"loss": 0.9175,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0196533203125,
"rewards/margins": 0.08984375,
"rewards/rejected": -0.109375,
"step": 224
},
{
"epoch": 0.4807692307692308,
"grad_norm": 102.05207305719217,
"learning_rate": 4.340530531759773e-07,
"logits/chosen": -0.625,
"logits/rejected": -0.65234375,
"logps/chosen": -0.7578125,
"logps/rejected": -1.4140625,
"loss": 1.035,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0263671875,
"rewards/margins": 0.012939453125,
"rewards/rejected": -0.039306640625,
"step": 225
},
{
"epoch": 0.4829059829059829,
"grad_norm": 30.438856795785753,
"learning_rate": 4.3151539367800197e-07,
"logits/chosen": -0.75,
"logits/rejected": -0.6953125,
"logps/chosen": -1.4296875,
"logps/rejected": -1.234375,
"loss": 0.9395,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0167236328125,
"rewards/margins": -0.0120849609375,
"rewards/rejected": -0.00469970703125,
"step": 226
},
{
"epoch": 0.48504273504273504,
"grad_norm": 64.65057839238729,
"learning_rate": 4.289731950443024e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.51171875,
"logps/chosen": -0.7421875,
"logps/rejected": -0.6171875,
"loss": 1.0064,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00738525390625,
"rewards/margins": 0.00927734375,
"rewards/rejected": -0.0166015625,
"step": 227
},
{
"epoch": 0.48717948717948717,
"grad_norm": 22.221149752600365,
"learning_rate": 4.2642659883568226e-07,
"logits/chosen": -0.66015625,
"logits/rejected": -0.60546875,
"logps/chosen": -0.87109375,
"logps/rejected": -0.93359375,
"loss": 0.8986,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.024658203125,
"rewards/rejected": -0.09521484375,
"step": 228
},
{
"epoch": 0.4893162393162393,
"grad_norm": 34.72558335941602,
"learning_rate": 4.2387574685782143e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.49609375,
"logps/chosen": -0.61328125,
"logps/rejected": -1.5859375,
"loss": 0.9515,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00750732421875,
"rewards/margins": 0.05322265625,
"rewards/rejected": -0.060546875,
"step": 229
},
{
"epoch": 0.49145299145299143,
"grad_norm": 40.26736969888996,
"learning_rate": 4.213207811533797e-07,
"logits/chosen": -0.453125,
"logits/rejected": -0.392578125,
"logps/chosen": -0.96875,
"logps/rejected": -1.6328125,
"loss": 0.9952,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06640625,
"rewards/margins": 0.06396484375,
"rewards/rejected": -0.1298828125,
"step": 230
},
{
"epoch": 0.4935897435897436,
"grad_norm": 26.92761732861108,
"learning_rate": 4.1876184399408744e-07,
"logits/chosen": -0.462890625,
"logits/rejected": -0.451171875,
"logps/chosen": -1.1328125,
"logps/rejected": -1.203125,
"loss": 0.9368,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.030029296875,
"rewards/margins": 0.04736328125,
"rewards/rejected": -0.07763671875,
"step": 231
},
{
"epoch": 0.49572649572649574,
"grad_norm": 17.793046500243925,
"learning_rate": 4.161990778728231e-07,
"logits/chosen": -0.2392578125,
"logits/rejected": -0.197265625,
"logps/chosen": -0.7421875,
"logps/rejected": -1.09375,
"loss": 0.948,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.058837890625,
"rewards/margins": 0.0166015625,
"rewards/rejected": -0.07568359375,
"step": 232
},
{
"epoch": 0.49786324786324787,
"grad_norm": 20.994788691900062,
"learning_rate": 4.136326254956784e-07,
"logits/chosen": -0.3671875,
"logits/rejected": -0.390625,
"logps/chosen": -1.3046875,
"logps/rejected": -0.8828125,
"loss": 0.9448,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00347900390625,
"rewards/margins": 0.003326416015625,
"rewards/rejected": -0.006866455078125,
"step": 233
},
{
"epoch": 0.5,
"grad_norm": 33.14361065612054,
"learning_rate": 4.110626297740122e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.68359375,
"logps/chosen": -1.1640625,
"logps/rejected": -0.88671875,
"loss": 0.9075,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.03857421875,
"rewards/margins": -0.00201416015625,
"rewards/rejected": -0.03662109375,
"step": 234
},
{
"epoch": 0.5021367521367521,
"grad_norm": 23.710110812392536,
"learning_rate": 4.0848923381649195e-07,
"logits/chosen": -0.62109375,
"logits/rejected": -0.625,
"logps/chosen": -0.671875,
"logps/rejected": -0.921875,
"loss": 0.9398,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0179443359375,
"rewards/margins": 0.005706787109375,
"rewards/rejected": -0.023681640625,
"step": 235
},
{
"epoch": 0.5042735042735043,
"grad_norm": 20.519140914016205,
"learning_rate": 4.059125809211259e-07,
"logits/chosen": -0.72265625,
"logits/rejected": -0.69140625,
"logps/chosen": -0.98046875,
"logps/rejected": -0.8515625,
"loss": 0.9618,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.022216796875,
"rewards/margins": 0.028076171875,
"rewards/rejected": -0.05029296875,
"step": 236
},
{
"epoch": 0.5064102564102564,
"grad_norm": 55.432096028551676,
"learning_rate": 4.033328145672822e-07,
"logits/chosen": -0.39453125,
"logits/rejected": -0.51171875,
"logps/chosen": -1.3828125,
"logps/rejected": -1.640625,
"loss": 1.0188,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.00909423828125,
"rewards/margins": 0.0157470703125,
"rewards/rejected": -0.00665283203125,
"step": 237
},
{
"epoch": 0.5085470085470085,
"grad_norm": 46.20803113821289,
"learning_rate": 4.007500784077006e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.490234375,
"logps/chosen": -0.90625,
"logps/rejected": -0.8203125,
"loss": 0.9741,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00634765625,
"rewards/margins": 0.0164794921875,
"rewards/rejected": -0.0228271484375,
"step": 238
},
{
"epoch": 0.5106837606837606,
"grad_norm": 54.70162005395297,
"learning_rate": 3.9816451626049247e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.490234375,
"logps/chosen": -1.1171875,
"logps/rejected": -0.8359375,
"loss": 0.9551,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0322265625,
"rewards/margins": 0.024658203125,
"rewards/rejected": -0.056884765625,
"step": 239
},
{
"epoch": 0.5128205128205128,
"grad_norm": 21.189791976104605,
"learning_rate": 3.9557627210113264e-07,
"logits/chosen": -0.68359375,
"logits/rejected": -0.58203125,
"logps/chosen": -1.25,
"logps/rejected": -1.328125,
"loss": 0.9316,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01458740234375,
"rewards/margins": 0.01361083984375,
"rewards/rejected": 0.0009765625,
"step": 240
},
{
"epoch": 0.5149572649572649,
"grad_norm": 30.828528043387475,
"learning_rate": 3.92985490054442e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.50390625,
"logps/chosen": -1.1640625,
"logps/rejected": -1.5,
"loss": 0.9681,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0245361328125,
"rewards/margins": 0.042724609375,
"rewards/rejected": -0.0673828125,
"step": 241
},
{
"epoch": 0.5170940170940171,
"grad_norm": 41.70036042913,
"learning_rate": 3.903923143865625e-07,
"logits/chosen": -0.5703125,
"logits/rejected": -0.60546875,
"logps/chosen": -1.28125,
"logps/rejected": -2.078125,
"loss": 0.9426,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.033447265625,
"rewards/margins": 0.0458984375,
"rewards/rejected": -0.07958984375,
"step": 242
},
{
"epoch": 0.5192307692307693,
"grad_norm": 73.93778223271919,
"learning_rate": 3.8779688949692316e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.51171875,
"logps/chosen": -2.859375,
"logps/rejected": -1.8125,
"loss": 1.0366,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.02685546875,
"rewards/margins": 0.00848388671875,
"rewards/rejected": -0.035400390625,
"step": 243
},
{
"epoch": 0.5213675213675214,
"grad_norm": 35.776829076433316,
"learning_rate": 3.851993599101998e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.66796875,
"logps/chosen": -0.6875,
"logps/rejected": -0.64453125,
"loss": 0.9495,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04052734375,
"rewards/margins": 0.0096435546875,
"rewards/rejected": -0.05029296875,
"step": 244
},
{
"epoch": 0.5235042735042735,
"grad_norm": 27.238620302576226,
"learning_rate": 3.825998702682668e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.65625,
"logps/chosen": -0.70703125,
"logps/rejected": -0.671875,
"loss": 0.9001,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.00372314453125,
"rewards/margins": 0.0289306640625,
"rewards/rejected": -0.025146484375,
"step": 245
},
{
"epoch": 0.5256410256410257,
"grad_norm": 20.872195276246313,
"learning_rate": 3.799985653221433e-07,
"logits/chosen": -0.494140625,
"logits/rejected": -0.546875,
"logps/chosen": -0.734375,
"logps/rejected": -1.03125,
"loss": 0.9192,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.037109375,
"rewards/margins": 0.004852294921875,
"rewards/rejected": -0.0419921875,
"step": 246
},
{
"epoch": 0.5277777777777778,
"grad_norm": 37.24895950193556,
"learning_rate": 3.773955899239325e-07,
"logits/chosen": -0.63671875,
"logits/rejected": -0.62109375,
"logps/chosen": -0.67578125,
"logps/rejected": -0.75,
"loss": 0.9956,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0145263671875,
"rewards/margins": 0.013427734375,
"rewards/rejected": -0.02783203125,
"step": 247
},
{
"epoch": 0.5299145299145299,
"grad_norm": 25.848328585587293,
"learning_rate": 3.747910890187553e-07,
"logits/chosen": -0.3828125,
"logits/rejected": -0.38671875,
"logps/chosen": -0.70703125,
"logps/rejected": -0.6484375,
"loss": 0.9127,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.0157470703125,
"rewards/margins": 0.0322265625,
"rewards/rejected": -0.0166015625,
"step": 248
},
{
"epoch": 0.532051282051282,
"grad_norm": 34.06646508786053,
"learning_rate": 3.7218520763667986e-07,
"logits/chosen": -0.734375,
"logits/rejected": -0.75390625,
"logps/chosen": -1.5703125,
"logps/rejected": -1.3515625,
"loss": 0.9003,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01263427734375,
"rewards/margins": -0.0029296875,
"rewards/rejected": -0.00970458984375,
"step": 249
},
{
"epoch": 0.5341880341880342,
"grad_norm": 41.28995974148372,
"learning_rate": 3.695780908846459e-07,
"logits/chosen": -0.796875,
"logits/rejected": -0.765625,
"logps/chosen": -0.87109375,
"logps/rejected": -0.76953125,
"loss": 0.9581,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.004302978515625,
"rewards/margins": 0.013671875,
"rewards/rejected": -0.01806640625,
"step": 250
},
{
"epoch": 0.5363247863247863,
"grad_norm": 11.821036836394699,
"learning_rate": 3.669698839383829e-07,
"logits/chosen": -0.46484375,
"logits/rejected": -0.48046875,
"logps/chosen": -0.466796875,
"logps/rejected": -0.47265625,
"loss": 0.9258,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0107421875,
"rewards/margins": 0.0260009765625,
"rewards/rejected": -0.036865234375,
"step": 251
},
{
"epoch": 0.5384615384615384,
"grad_norm": 35.07428065878141,
"learning_rate": 3.6436073203432805e-07,
"logits/chosen": -0.416015625,
"logits/rejected": -0.396484375,
"logps/chosen": -0.9375,
"logps/rejected": -1.75,
"loss": 0.959,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.008544921875,
"rewards/margins": 0.0277099609375,
"rewards/rejected": -0.036376953125,
"step": 252
},
{
"epoch": 0.5405982905982906,
"grad_norm": 23.55218496365697,
"learning_rate": 3.6175078046153744e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.59765625,
"logps/chosen": -0.765625,
"logps/rejected": -1.59375,
"loss": 0.9366,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.045166015625,
"rewards/margins": -0.0059814453125,
"rewards/rejected": -0.0390625,
"step": 253
},
{
"epoch": 0.5427350427350427,
"grad_norm": 20.862502117987717,
"learning_rate": 3.591401745535965e-07,
"logits/chosen": -0.63671875,
"logits/rejected": -0.6171875,
"logps/chosen": -0.49609375,
"logps/rejected": -0.5078125,
"loss": 0.921,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.021484375,
"rewards/margins": 0.014404296875,
"rewards/rejected": -0.035888671875,
"step": 254
},
{
"epoch": 0.5448717948717948,
"grad_norm": 29.660987179522632,
"learning_rate": 3.5652905968052676e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.58203125,
"logps/chosen": -0.74609375,
"logps/rejected": -0.7578125,
"loss": 0.942,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.01129150390625,
"rewards/rejected": -0.08203125,
"step": 255
},
{
"epoch": 0.5470085470085471,
"grad_norm": 18.529268471979883,
"learning_rate": 3.5391758124069124e-07,
"logits/chosen": -0.40234375,
"logits/rejected": -0.48046875,
"logps/chosen": -1.09375,
"logps/rejected": -1.328125,
"loss": 0.9069,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.003662109375,
"rewards/margins": 0.0174560546875,
"rewards/rejected": -0.02099609375,
"step": 256
},
{
"epoch": 0.5491452991452992,
"grad_norm": 109.6700121132754,
"learning_rate": 3.5130588465269785e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.5234375,
"logps/chosen": -0.546875,
"logps/rejected": -0.51171875,
"loss": 1.048,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021240234375,
"rewards/margins": 0.0185546875,
"rewards/rejected": -0.039794921875,
"step": 257
},
{
"epoch": 0.5512820512820513,
"grad_norm": 19.426233176236774,
"learning_rate": 3.486941153473021e-07,
"logits/chosen": -0.75,
"logits/rejected": -0.8046875,
"logps/chosen": -2.03125,
"logps/rejected": -2.453125,
"loss": 0.9047,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1357421875,
"rewards/margins": 0.07421875,
"rewards/rejected": -0.2099609375,
"step": 258
},
{
"epoch": 0.5534188034188035,
"grad_norm": 46.82508695281614,
"learning_rate": 3.460824187593088e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.474609375,
"logps/chosen": -1.328125,
"logps/rejected": -2.46875,
"loss": 1.02,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029052734375,
"rewards/margins": -0.00341796875,
"rewards/rejected": -0.0257568359375,
"step": 259
},
{
"epoch": 0.5555555555555556,
"grad_norm": 93.85201459491968,
"learning_rate": 3.4347094031947326e-07,
"logits/chosen": -0.609375,
"logits/rejected": -0.59375,
"logps/chosen": -0.796875,
"logps/rejected": -1.1328125,
"loss": 0.9739,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.05224609375,
"rewards/margins": 0.02880859375,
"rewards/rejected": -0.0810546875,
"step": 260
},
{
"epoch": 0.5576923076923077,
"grad_norm": 25.035527126644347,
"learning_rate": 3.408598254464035e-07,
"logits/chosen": -0.44921875,
"logits/rejected": -0.5078125,
"logps/chosen": -0.8359375,
"logps/rejected": -1.25,
"loss": 0.9421,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.029052734375,
"rewards/margins": -0.005859375,
"rewards/rejected": -0.023193359375,
"step": 261
},
{
"epoch": 0.5598290598290598,
"grad_norm": 25.292262701656554,
"learning_rate": 3.382492195384625e-07,
"logits/chosen": -0.392578125,
"logits/rejected": -0.392578125,
"logps/chosen": -0.828125,
"logps/rejected": -0.7421875,
"loss": 0.9186,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.024169921875,
"rewards/margins": 0.01904296875,
"rewards/rejected": -0.043212890625,
"step": 262
},
{
"epoch": 0.561965811965812,
"grad_norm": 77.95609584102277,
"learning_rate": 3.3563926796567187e-07,
"logits/chosen": -0.625,
"logits/rejected": -0.6953125,
"logps/chosen": -0.76953125,
"logps/rejected": -0.8046875,
"loss": 1.082,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021728515625,
"rewards/margins": 0.044189453125,
"rewards/rejected": -0.06591796875,
"step": 263
},
{
"epoch": 0.5641025641025641,
"grad_norm": 31.432425326232472,
"learning_rate": 3.33030116061617e-07,
"logits/chosen": -0.64453125,
"logits/rejected": -0.6484375,
"logps/chosen": -1.5390625,
"logps/rejected": -1.2734375,
"loss": 0.8667,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.01904296875,
"rewards/margins": 0.06982421875,
"rewards/rejected": -0.0888671875,
"step": 264
},
{
"epoch": 0.5662393162393162,
"grad_norm": 47.53840395860865,
"learning_rate": 3.3042190911535425e-07,
"logits/chosen": -0.4609375,
"logits/rejected": -0.4453125,
"logps/chosen": -1.390625,
"logps/rejected": -2.359375,
"loss": 0.9517,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.00433349609375,
"rewards/margins": -0.0281982421875,
"rewards/rejected": 0.0322265625,
"step": 265
},
{
"epoch": 0.5683760683760684,
"grad_norm": 41.666834938138656,
"learning_rate": 3.278147923633201e-07,
"logits/chosen": -0.1318359375,
"logits/rejected": -0.0517578125,
"logps/chosen": -0.828125,
"logps/rejected": -1.015625,
"loss": 0.9088,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04833984375,
"rewards/margins": 0.06494140625,
"rewards/rejected": -0.11328125,
"step": 266
},
{
"epoch": 0.5705128205128205,
"grad_norm": 25.14772301172709,
"learning_rate": 3.2520891098124484e-07,
"logits/chosen": -0.400390625,
"logits/rejected": -0.306640625,
"logps/chosen": -1.328125,
"logps/rejected": -1.609375,
"loss": 0.8958,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05517578125,
"rewards/margins": 0.0751953125,
"rewards/rejected": -0.130859375,
"step": 267
},
{
"epoch": 0.5726495726495726,
"grad_norm": 29.876794568739818,
"learning_rate": 3.2260441007606763e-07,
"logits/chosen": -0.1708984375,
"logits/rejected": -0.19140625,
"logps/chosen": -1.0234375,
"logps/rejected": -2.265625,
"loss": 0.8914,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0859375,
"rewards/margins": 0.056396484375,
"rewards/rejected": -0.142578125,
"step": 268
},
{
"epoch": 0.5747863247863247,
"grad_norm": 28.482704912447183,
"learning_rate": 3.2000143467785667e-07,
"logits/chosen": -0.66796875,
"logits/rejected": -0.703125,
"logps/chosen": -2.0625,
"logps/rejected": -1.7890625,
"loss": 0.881,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.048583984375,
"rewards/margins": 0.1025390625,
"rewards/rejected": -0.05419921875,
"step": 269
},
{
"epoch": 0.5769230769230769,
"grad_norm": 30.390512025141742,
"learning_rate": 3.174001297317332e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.416015625,
"logps/chosen": -1.34375,
"logps/rejected": -1.3828125,
"loss": 0.968,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.051513671875,
"rewards/margins": 0.0169677734375,
"rewards/rejected": -0.068359375,
"step": 270
},
{
"epoch": 0.5790598290598291,
"grad_norm": 50.43026193511625,
"learning_rate": 3.1480064008980024e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.53515625,
"logps/chosen": -0.703125,
"logps/rejected": -1.7265625,
"loss": 0.9518,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.003082275390625,
"rewards/margins": 0.025146484375,
"rewards/rejected": -0.028076171875,
"step": 271
},
{
"epoch": 0.5811965811965812,
"grad_norm": 31.084216885562007,
"learning_rate": 3.122031105030768e-07,
"logits/chosen": -0.55078125,
"logits/rejected": -0.44140625,
"logps/chosen": -0.953125,
"logps/rejected": -1.0078125,
"loss": 0.9025,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.00531005859375,
"rewards/margins": 0.029541015625,
"rewards/rejected": -0.0242919921875,
"step": 272
},
{
"epoch": 0.5833333333333334,
"grad_norm": 21.587874916679418,
"learning_rate": 3.0960768561343756e-07,
"logits/chosen": -0.6640625,
"logits/rejected": -0.5703125,
"logps/chosen": -1.4453125,
"logps/rejected": -1.328125,
"loss": 0.9187,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.072265625,
"rewards/margins": 0.028076171875,
"rewards/rejected": -0.1005859375,
"step": 273
},
{
"epoch": 0.5854700854700855,
"grad_norm": 35.6378668836564,
"learning_rate": 3.07014509945558e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.515625,
"logps/chosen": -0.921875,
"logps/rejected": -0.84375,
"loss": 0.9381,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.053955078125,
"rewards/margins": 0.033935546875,
"rewards/rejected": -0.087890625,
"step": 274
},
{
"epoch": 0.5876068376068376,
"grad_norm": 73.21173382326778,
"learning_rate": 3.0442372789886744e-07,
"logits/chosen": -0.68359375,
"logits/rejected": -0.640625,
"logps/chosen": -0.84375,
"logps/rejected": -1.1796875,
"loss": 1.0093,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.00958251953125,
"rewards/margins": 0.060791015625,
"rewards/rejected": -0.0703125,
"step": 275
},
{
"epoch": 0.5897435897435898,
"grad_norm": 25.70974817197716,
"learning_rate": 3.0183548373950755e-07,
"logits/chosen": -0.6796875,
"logits/rejected": -0.6640625,
"logps/chosen": -2.203125,
"logps/rejected": -2.515625,
"loss": 0.9109,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.044189453125,
"rewards/margins": 0.1103515625,
"rewards/rejected": -0.154296875,
"step": 276
},
{
"epoch": 0.5918803418803419,
"grad_norm": 46.216925593680635,
"learning_rate": 2.992499215922993e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.54296875,
"logps/chosen": -1.0859375,
"logps/rejected": -1.078125,
"loss": 0.9515,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.007568359375,
"rewards/margins": 0.044189453125,
"rewards/rejected": -0.0517578125,
"step": 277
},
{
"epoch": 0.594017094017094,
"grad_norm": 37.597203962581744,
"learning_rate": 2.966671854327177e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.458984375,
"logps/chosen": -0.7421875,
"logps/rejected": -0.82421875,
"loss": 0.9862,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.01470947265625,
"rewards/margins": 0.03125,
"rewards/rejected": -0.0458984375,
"step": 278
},
{
"epoch": 0.5961538461538461,
"grad_norm": 27.881961513246008,
"learning_rate": 2.9408741907887424e-07,
"logits/chosen": -0.54296875,
"logits/rejected": -0.52734375,
"logps/chosen": -0.90625,
"logps/rejected": -0.74609375,
"loss": 0.8914,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0211181640625,
"rewards/margins": 0.02392578125,
"rewards/rejected": -0.045166015625,
"step": 279
},
{
"epoch": 0.5982905982905983,
"grad_norm": 43.73080236556192,
"learning_rate": 2.91510766183508e-07,
"logits/chosen": -0.7890625,
"logits/rejected": -0.7734375,
"logps/chosen": -1.0625,
"logps/rejected": -1.03125,
"loss": 0.9796,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029296875,
"rewards/margins": 0.047607421875,
"rewards/rejected": -0.07666015625,
"step": 280
},
{
"epoch": 0.6004273504273504,
"grad_norm": 46.70933914401127,
"learning_rate": 2.889373702259879e-07,
"logits/chosen": -0.8046875,
"logits/rejected": -0.77734375,
"logps/chosen": -0.63671875,
"logps/rejected": -0.62890625,
"loss": 0.9463,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.020263671875,
"rewards/margins": 0.019775390625,
"rewards/rejected": -0.0400390625,
"step": 281
},
{
"epoch": 0.6025641025641025,
"grad_norm": 30.57420853354831,
"learning_rate": 2.863673745043216e-07,
"logits/chosen": -0.9140625,
"logits/rejected": -0.79296875,
"logps/chosen": -1.6796875,
"logps/rejected": -1.078125,
"loss": 0.9337,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.027099609375,
"rewards/margins": 0.05419921875,
"rewards/rejected": -0.08154296875,
"step": 282
},
{
"epoch": 0.6047008547008547,
"grad_norm": 43.53013754695041,
"learning_rate": 2.838009221271769e-07,
"logits/chosen": -0.388671875,
"logits/rejected": -0.37109375,
"logps/chosen": -0.9296875,
"logps/rejected": -0.84765625,
"loss": 0.9728,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.04296875,
"rewards/margins": 0.0150146484375,
"rewards/rejected": -0.057861328125,
"step": 283
},
{
"epoch": 0.6068376068376068,
"grad_norm": 19.88438908623489,
"learning_rate": 2.812381560059126e-07,
"logits/chosen": -0.4296875,
"logits/rejected": -0.48828125,
"logps/chosen": -0.8984375,
"logps/rejected": -0.953125,
"loss": 0.8746,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0390625,
"rewards/margins": 0.03564453125,
"rewards/rejected": -0.07470703125,
"step": 284
},
{
"epoch": 0.6089743589743589,
"grad_norm": 23.40336854183335,
"learning_rate": 2.786792188466203e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.478515625,
"logps/chosen": -1.0703125,
"logps/rejected": -1.1640625,
"loss": 0.9618,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04736328125,
"rewards/margins": -0.001007080078125,
"rewards/rejected": -0.046142578125,
"step": 285
},
{
"epoch": 0.6111111111111112,
"grad_norm": 72.08451890329711,
"learning_rate": 2.7612425314217865e-07,
"logits/chosen": -0.14453125,
"logits/rejected": -0.12451171875,
"logps/chosen": -1.0546875,
"logps/rejected": -1.109375,
"loss": 0.9712,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.027099609375,
"rewards/margins": 0.0859375,
"rewards/rejected": -0.11328125,
"step": 286
},
{
"epoch": 0.6132478632478633,
"grad_norm": 29.875118737426025,
"learning_rate": 2.7357340116431776e-07,
"logits/chosen": -0.302734375,
"logits/rejected": -0.19140625,
"logps/chosen": -0.59375,
"logps/rejected": -0.671875,
"loss": 0.837,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0032196044921875,
"rewards/margins": 0.037841796875,
"rewards/rejected": -0.041015625,
"step": 287
},
{
"epoch": 0.6153846153846154,
"grad_norm": 32.72926634266307,
"learning_rate": 2.7102680495569755e-07,
"logits/chosen": -0.423828125,
"logits/rejected": -0.5078125,
"logps/chosen": -0.8359375,
"logps/rejected": -1.25,
"loss": 0.9363,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04541015625,
"rewards/margins": 0.0869140625,
"rewards/rejected": -0.1318359375,
"step": 288
},
{
"epoch": 0.6175213675213675,
"grad_norm": 72.79449105495546,
"learning_rate": 2.6848460632199805e-07,
"logits/chosen": -0.55859375,
"logits/rejected": -0.546875,
"logps/chosen": -1.390625,
"logps/rejected": -2.28125,
"loss": 1.0001,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0673828125,
"rewards/margins": -0.03173828125,
"rewards/rejected": -0.035400390625,
"step": 289
},
{
"epoch": 0.6196581196581197,
"grad_norm": 205.0964309554123,
"learning_rate": 2.6594694682402267e-07,
"logits/chosen": -0.5,
"logits/rejected": -0.50390625,
"logps/chosen": -0.640625,
"logps/rejected": -0.63671875,
"loss": 1.5129,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03369140625,
"rewards/margins": 0.01531982421875,
"rewards/rejected": -0.048828125,
"step": 290
},
{
"epoch": 0.6217948717948718,
"grad_norm": 19.915515429772448,
"learning_rate": 2.6341396776981614e-07,
"logits/chosen": -0.59765625,
"logits/rejected": -0.5078125,
"logps/chosen": -1.78125,
"logps/rejected": -1.484375,
"loss": 0.9001,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0201416015625,
"rewards/margins": 0.0291748046875,
"rewards/rejected": -0.049072265625,
"step": 291
},
{
"epoch": 0.6239316239316239,
"grad_norm": 28.296895506212497,
"learning_rate": 2.6088581020679536e-07,
"logits/chosen": -0.1181640625,
"logits/rejected": -0.1533203125,
"logps/chosen": -0.8515625,
"logps/rejected": -0.91796875,
"loss": 0.9131,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.02294921875,
"rewards/margins": 0.048583984375,
"rewards/rejected": -0.0712890625,
"step": 292
},
{
"epoch": 0.6260683760683761,
"grad_norm": 33.11228677517524,
"learning_rate": 2.583626149138954e-07,
"logits/chosen": -0.6328125,
"logits/rejected": -0.55859375,
"logps/chosen": -1.359375,
"logps/rejected": -0.9453125,
"loss": 0.8931,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.02978515625,
"rewards/margins": 0.054443359375,
"rewards/rejected": -0.08447265625,
"step": 293
},
{
"epoch": 0.6282051282051282,
"grad_norm": 29.244535425237153,
"learning_rate": 2.5584452239373e-07,
"logits/chosen": -0.671875,
"logits/rejected": -0.71484375,
"logps/chosen": -1.59375,
"logps/rejected": -2.953125,
"loss": 0.9468,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0145263671875,
"rewards/margins": 0.060302734375,
"rewards/rejected": -0.07470703125,
"step": 294
},
{
"epoch": 0.6303418803418803,
"grad_norm": 24.469679488213703,
"learning_rate": 2.5333167286476864e-07,
"logits/chosen": -0.50390625,
"logits/rejected": -0.609375,
"logps/chosen": -0.734375,
"logps/rejected": -1.4453125,
"loss": 0.9009,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03564453125,
"rewards/margins": 0.029541015625,
"rewards/rejected": -0.06494140625,
"step": 295
},
{
"epoch": 0.6324786324786325,
"grad_norm": 43.066148630199685,
"learning_rate": 2.5082420625352737e-07,
"logits/chosen": -0.390625,
"logits/rejected": -0.44140625,
"logps/chosen": -1.8984375,
"logps/rejected": -2.078125,
"loss": 0.9454,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.057861328125,
"rewards/margins": 0.08642578125,
"rewards/rejected": -0.14453125,
"step": 296
},
{
"epoch": 0.6346153846153846,
"grad_norm": 80.27061560705891,
"learning_rate": 2.4832226218677827e-07,
"logits/chosen": -0.58984375,
"logits/rejected": -0.51953125,
"logps/chosen": -2.125,
"logps/rejected": -2.21875,
"loss": 0.9612,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1201171875,
"rewards/margins": 0.0277099609375,
"rewards/rejected": -0.1474609375,
"step": 297
},
{
"epoch": 0.6367521367521367,
"grad_norm": 47.26237956889532,
"learning_rate": 2.458259799837735e-07,
"logits/chosen": -0.341796875,
"logits/rejected": -0.41015625,
"logps/chosen": -2.015625,
"logps/rejected": -1.9765625,
"loss": 0.9471,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06298828125,
"rewards/margins": 0.03466796875,
"rewards/rejected": -0.09765625,
"step": 298
},
{
"epoch": 0.6388888888888888,
"grad_norm": 28.351687048174814,
"learning_rate": 2.4333549864848766e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.44921875,
"logps/chosen": -0.71875,
"logps/rejected": -0.8515625,
"loss": 0.9233,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03662109375,
"rewards/margins": 0.033447265625,
"rewards/rejected": -0.06982421875,
"step": 299
},
{
"epoch": 0.6410256410256411,
"grad_norm": 23.496220784871824,
"learning_rate": 2.408509568618779e-07,
"logits/chosen": -0.70703125,
"logits/rejected": -0.671875,
"logps/chosen": -1.140625,
"logps/rejected": -1.3359375,
"loss": 0.8739,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.083984375,
"rewards/margins": 0.00604248046875,
"rewards/rejected": -0.09033203125,
"step": 300
},
{
"epoch": 0.6410256410256411,
"eval_logits/chosen": -0.58203125,
"eval_logits/rejected": -0.59375,
"eval_logps/chosen": -1.1171875,
"eval_logps/rejected": -1.1953125,
"eval_loss": 0.9189149141311646,
"eval_rewards/accuracies": 0.7096773982048035,
"eval_rewards/chosen": -0.0439453125,
"eval_rewards/margins": 0.05810546875,
"eval_rewards/rejected": -0.10205078125,
"eval_runtime": 102.2621,
"eval_samples_per_second": 19.176,
"eval_steps_per_second": 0.606,
"step": 300
},
{
"epoch": 0.6431623931623932,
"grad_norm": 47.08783752673241,
"learning_rate": 2.3837249297416086e-07,
"logits/chosen": -0.671875,
"logits/rejected": -0.703125,
"logps/chosen": -0.6328125,
"logps/rejected": -0.69140625,
"loss": 0.944,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.01507568359375,
"rewards/margins": 0.027099609375,
"rewards/rejected": -0.042236328125,
"step": 301
},
{
"epoch": 0.6452991452991453,
"grad_norm": 61.449734896636464,
"learning_rate": 2.3590024499710916e-07,
"logits/chosen": -0.671875,
"logits/rejected": -0.75390625,
"logps/chosen": -0.95703125,
"logps/rejected": -1.0546875,
"loss": 0.9629,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.08935546875,
"rewards/margins": 0.00823974609375,
"rewards/rejected": -0.09765625,
"step": 302
},
{
"epoch": 0.6474358974358975,
"grad_norm": 52.862250928964876,
"learning_rate": 2.3343435059636606e-07,
"logits/chosen": -0.7578125,
"logits/rejected": -0.70703125,
"logps/chosen": -2.171875,
"logps/rejected": -1.171875,
"loss": 1.0004,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.201171875,
"rewards/margins": -0.0400390625,
"rewards/rejected": -0.162109375,
"step": 303
},
{
"epoch": 0.6495726495726496,
"grad_norm": 28.629721734286697,
"learning_rate": 2.3097494708377977e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.58203125,
"logps/chosen": -1.078125,
"logps/rejected": -1.2265625,
"loss": 0.8655,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0400390625,
"rewards/margins": 0.0380859375,
"rewards/rejected": -0.078125,
"step": 304
},
{
"epoch": 0.6517094017094017,
"grad_norm": 33.418704353017965,
"learning_rate": 2.285221714097575e-07,
"logits/chosen": -0.42578125,
"logits/rejected": -0.4375,
"logps/chosen": -1.96875,
"logps/rejected": -1.4375,
"loss": 0.8831,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0079345703125,
"rewards/margins": 0.10791015625,
"rewards/rejected": -0.1162109375,
"step": 305
},
{
"epoch": 0.6538461538461539,
"grad_norm": 129.10426681364987,
"learning_rate": 2.2607616015563896e-07,
"logits/chosen": -0.48046875,
"logits/rejected": -0.46484375,
"logps/chosen": -1.25,
"logps/rejected": -0.91015625,
"loss": 1.0697,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0830078125,
"rewards/margins": -0.0052490234375,
"rewards/rejected": -0.078125,
"step": 306
},
{
"epoch": 0.655982905982906,
"grad_norm": 36.45007975348447,
"learning_rate": 2.2363704952609142e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.42578125,
"logps/chosen": -2.3125,
"logps/rejected": -1.390625,
"loss": 0.9263,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.046630859375,
"rewards/margins": 0.0126953125,
"rewards/rejected": -0.0595703125,
"step": 307
},
{
"epoch": 0.6581196581196581,
"grad_norm": 80.4627542411564,
"learning_rate": 2.2120497534152476e-07,
"logits/chosen": -0.3125,
"logits/rejected": -0.32421875,
"logps/chosen": -1.6015625,
"logps/rejected": -1.8125,
"loss": 0.9742,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.03271484375,
"rewards/margins": 0.0322265625,
"rewards/rejected": 0.0006103515625,
"step": 308
},
{
"epoch": 0.6602564102564102,
"grad_norm": 19.48595987582158,
"learning_rate": 2.1878007303052892e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.69140625,
"logps/chosen": -0.9375,
"logps/rejected": -1.484375,
"loss": 0.8988,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.00213623046875,
"rewards/margins": 0.06787109375,
"rewards/rejected": -0.0703125,
"step": 309
},
{
"epoch": 0.6623931623931624,
"grad_norm": 38.239536478841806,
"learning_rate": 2.1636247762233223e-07,
"logits/chosen": -0.765625,
"logits/rejected": -0.7890625,
"logps/chosen": -0.8515625,
"logps/rejected": -0.9453125,
"loss": 0.9138,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0634765625,
"rewards/margins": -0.0001220703125,
"rewards/rejected": -0.0634765625,
"step": 310
},
{
"epoch": 0.6645299145299145,
"grad_norm": 22.654040122814195,
"learning_rate": 2.1395232373928256e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.54296875,
"logps/chosen": -0.8984375,
"logps/rejected": -0.88671875,
"loss": 0.9166,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05322265625,
"rewards/margins": 0.01904296875,
"rewards/rejected": -0.072265625,
"step": 311
},
{
"epoch": 0.6666666666666666,
"grad_norm": 22.814581881293506,
"learning_rate": 2.1154974558935087e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.53125,
"logps/chosen": -1.34375,
"logps/rejected": -1.7734375,
"loss": 0.8981,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.0419921875,
"rewards/margins": 0.042724609375,
"rewards/rejected": -0.00128173828125,
"step": 312
},
{
"epoch": 0.6688034188034188,
"grad_norm": 27.638929228638546,
"learning_rate": 2.091548769586581e-07,
"logits/chosen": -0.6171875,
"logits/rejected": -0.57421875,
"logps/chosen": -0.828125,
"logps/rejected": -0.703125,
"loss": 0.9036,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.039306640625,
"rewards/margins": 0.02685546875,
"rewards/rejected": -0.06640625,
"step": 313
},
{
"epoch": 0.6709401709401709,
"grad_norm": 25.374046730125965,
"learning_rate": 2.0676785120402512e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.494140625,
"logps/chosen": -0.58203125,
"logps/rejected": -0.54296875,
"loss": 0.943,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.055419921875,
"rewards/margins": 0.01275634765625,
"rewards/rejected": -0.068359375,
"step": 314
},
{
"epoch": 0.6730769230769231,
"grad_norm": 51.9089203355178,
"learning_rate": 2.043888012455471e-07,
"logits/chosen": -0.6484375,
"logits/rejected": -0.65625,
"logps/chosen": -1.921875,
"logps/rejected": -0.796875,
"loss": 0.9812,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.099609375,
"rewards/margins": -0.044677734375,
"rewards/rejected": -0.05517578125,
"step": 315
},
{
"epoch": 0.6752136752136753,
"grad_norm": 29.258971220803748,
"learning_rate": 2.0201785955919153e-07,
"logits/chosen": -0.57421875,
"logits/rejected": -0.51953125,
"logps/chosen": -0.6640625,
"logps/rejected": -0.6640625,
"loss": 0.8953,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05224609375,
"rewards/margins": 0.011474609375,
"rewards/rejected": -0.06396484375,
"step": 316
},
{
"epoch": 0.6773504273504274,
"grad_norm": 47.087186810023475,
"learning_rate": 1.9965515816942188e-07,
"logits/chosen": 0.0111083984375,
"logits/rejected": -0.103515625,
"logps/chosen": -0.6875,
"logps/rejected": -0.6875,
"loss": 0.9901,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.020263671875,
"rewards/margins": 0.04296875,
"rewards/rejected": -0.06298828125,
"step": 317
},
{
"epoch": 0.6794871794871795,
"grad_norm": 28.150562895031253,
"learning_rate": 1.9730082864184569e-07,
"logits/chosen": -0.7109375,
"logits/rejected": -0.75390625,
"logps/chosen": -0.921875,
"logps/rejected": -0.8671875,
"loss": 0.8884,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00335693359375,
"rewards/margins": 0.052734375,
"rewards/rejected": -0.055908203125,
"step": 318
},
{
"epoch": 0.6816239316239316,
"grad_norm": 37.95822554361127,
"learning_rate": 1.9495500207588803e-07,
"logits/chosen": -0.51953125,
"logits/rejected": -0.466796875,
"logps/chosen": -0.734375,
"logps/rejected": -0.8203125,
"loss": 0.9323,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0458984375,
"rewards/margins": 0.039794921875,
"rewards/rejected": -0.08544921875,
"step": 319
},
{
"epoch": 0.6837606837606838,
"grad_norm": 17.346530667365943,
"learning_rate": 1.9261780909749204e-07,
"logits/chosen": -0.515625,
"logits/rejected": -0.61328125,
"logps/chosen": -0.73828125,
"logps/rejected": -0.7578125,
"loss": 0.8615,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03759765625,
"rewards/margins": 0.05859375,
"rewards/rejected": -0.095703125,
"step": 320
},
{
"epoch": 0.6858974358974359,
"grad_norm": 26.013086502242853,
"learning_rate": 1.9028937985184488e-07,
"logits/chosen": -0.271484375,
"logits/rejected": -0.44140625,
"logps/chosen": -1.1875,
"logps/rejected": -1.15625,
"loss": 0.8679,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.05908203125,
"rewards/margins": 0.037841796875,
"rewards/rejected": -0.09716796875,
"step": 321
},
{
"epoch": 0.688034188034188,
"grad_norm": 23.10477919916546,
"learning_rate": 1.8796984399612961e-07,
"logits/chosen": -0.65234375,
"logits/rejected": -0.62890625,
"logps/chosen": -0.6484375,
"logps/rejected": -0.73046875,
"loss": 0.9015,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.041015625,
"rewards/margins": 0.045166015625,
"rewards/rejected": -0.0859375,
"step": 322
},
{
"epoch": 0.6901709401709402,
"grad_norm": 40.346721902061745,
"learning_rate": 1.8565933069230723e-07,
"logits/chosen": -0.455078125,
"logits/rejected": -0.44921875,
"logps/chosen": -0.72265625,
"logps/rejected": -1.3203125,
"loss": 0.8928,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.078125,
"rewards/margins": 0.07373046875,
"rewards/rejected": -0.15234375,
"step": 323
},
{
"epoch": 0.6923076923076923,
"grad_norm": 17.96847921628251,
"learning_rate": 1.8335796859992293e-07,
"logits/chosen": -0.470703125,
"logits/rejected": -0.4140625,
"logps/chosen": -1.0546875,
"logps/rejected": -0.99609375,
"loss": 0.8604,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.080078125,
"rewards/margins": 0.0283203125,
"rewards/rejected": -0.10791015625,
"step": 324
},
{
"epoch": 0.6944444444444444,
"grad_norm": 28.95665753195518,
"learning_rate": 1.8106588586894203e-07,
"logits/chosen": -0.7421875,
"logits/rejected": -0.76171875,
"logps/chosen": -0.79296875,
"logps/rejected": -0.8828125,
"loss": 0.899,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.050048828125,
"rewards/margins": 0.0361328125,
"rewards/rejected": -0.0859375,
"step": 325
},
{
"epoch": 0.6965811965811965,
"grad_norm": 42.415256817420016,
"learning_rate": 1.7878321013261467e-07,
"logits/chosen": -0.59375,
"logits/rejected": -0.6171875,
"logps/chosen": -0.96875,
"logps/rejected": -1.15625,
"loss": 0.9105,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.05224609375,
"rewards/margins": 0.04052734375,
"rewards/rejected": -0.0927734375,
"step": 326
},
{
"epoch": 0.6987179487179487,
"grad_norm": 24.916024571168084,
"learning_rate": 1.765100685003675e-07,
"logits/chosen": -0.263671875,
"logits/rejected": -0.37109375,
"logps/chosen": -1.765625,
"logps/rejected": -0.9296875,
"loss": 0.8912,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.039306640625,
"rewards/margins": 0.042236328125,
"rewards/rejected": -0.08154296875,
"step": 327
},
{
"epoch": 0.7008547008547008,
"grad_norm": 38.23555158814467,
"learning_rate": 1.7424658755072684e-07,
"logits/chosen": -0.69140625,
"logits/rejected": -0.6171875,
"logps/chosen": -1.7421875,
"logps/rejected": -0.83984375,
"loss": 0.9182,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02490234375,
"rewards/margins": 0.059326171875,
"rewards/rejected": -0.083984375,
"step": 328
},
{
"epoch": 0.7029914529914529,
"grad_norm": 71.67585045413601,
"learning_rate": 1.7199289332426963e-07,
"logits/chosen": -0.60546875,
"logits/rejected": -0.6171875,
"logps/chosen": -1.0546875,
"logps/rejected": -0.69140625,
"loss": 0.9418,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11328125,
"rewards/margins": -0.042724609375,
"rewards/rejected": -0.0703125,
"step": 329
},
{
"epoch": 0.7051282051282052,
"grad_norm": 31.48093318308997,
"learning_rate": 1.697491113166047e-07,
"logits/chosen": -0.7421875,
"logits/rejected": -0.7265625,
"logps/chosen": -0.765625,
"logps/rejected": -1.2421875,
"loss": 0.9808,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0279541015625,
"rewards/margins": 0.080078125,
"rewards/rejected": -0.10791015625,
"step": 330
},
{
"epoch": 0.7072649572649573,
"grad_norm": 36.822454161010484,
"learning_rate": 1.6751536647138525e-07,
"logits/chosen": -0.5078125,
"logits/rejected": -0.498046875,
"logps/chosen": -1.21875,
"logps/rejected": -1.3359375,
"loss": 0.8826,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06298828125,
"rewards/margins": 0.0595703125,
"rewards/rejected": -0.12255859375,
"step": 331
},
{
"epoch": 0.7094017094017094,
"grad_norm": 21.571313736776336,
"learning_rate": 1.652917831733509e-07,
"logits/chosen": -0.5625,
"logits/rejected": -0.53125,
"logps/chosen": -0.7578125,
"logps/rejected": -0.90234375,
"loss": 0.9144,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.044189453125,
"rewards/margins": 0.0361328125,
"rewards/rejected": -0.08056640625,
"step": 332
},
{
"epoch": 0.7115384615384616,
"grad_norm": 32.84234185812603,
"learning_rate": 1.6307848524140175e-07,
"logits/chosen": -0.5390625,
"logits/rejected": -0.494140625,
"logps/chosen": -1.015625,
"logps/rejected": -1.0390625,
"loss": 0.9741,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0830078125,
"rewards/margins": 0.0166015625,
"rewards/rejected": -0.099609375,
"step": 333
},
{
"epoch": 0.7136752136752137,
"grad_norm": 29.13502053907057,
"learning_rate": 1.6087559592170356e-07,
"logits/chosen": -0.498046875,
"logits/rejected": -0.515625,
"logps/chosen": -1.25,
"logps/rejected": -1.3203125,
"loss": 0.8452,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1015625,
"rewards/margins": 0.06640625,
"rewards/rejected": -0.16796875,
"step": 334
},
{
"epoch": 0.7158119658119658,
"grad_norm": 37.18731356468967,
"learning_rate": 1.5868323788082462e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.546875,
"logps/chosen": -0.86328125,
"logps/rejected": -0.71875,
"loss": 0.9375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.046875,
"rewards/margins": 0.01953125,
"rewards/rejected": -0.06640625,
"step": 335
},
{
"epoch": 0.717948717948718,
"grad_norm": 43.083953890628244,
"learning_rate": 1.5650153319890508e-07,
"logits/chosen": -0.63671875,
"logits/rejected": -0.546875,
"logps/chosen": -0.86328125,
"logps/rejected": -1.3984375,
"loss": 0.8783,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06591796875,
"rewards/margins": 0.07080078125,
"rewards/rejected": -0.13671875,
"step": 336
},
{
"epoch": 0.7200854700854701,
"grad_norm": 39.50465162692857,
"learning_rate": 1.543306033628597e-07,
"logits/chosen": -0.5859375,
"logits/rejected": -0.546875,
"logps/chosen": -0.8828125,
"logps/rejected": -1.2265625,
"loss": 0.9142,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.062255859375,
"rewards/margins": 0.07861328125,
"rewards/rejected": -0.140625,
"step": 337
},
{
"epoch": 0.7222222222222222,
"grad_norm": 45.92938655648082,
"learning_rate": 1.5217056925961196e-07,
"logits/chosen": -0.58203125,
"logits/rejected": -0.59375,
"logps/chosen": -1.4765625,
"logps/rejected": -0.7734375,
"loss": 0.9757,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0306396484375,
"rewards/margins": 0.0390625,
"rewards/rejected": -0.06982421875,
"step": 338
},
{
"epoch": 0.7243589743589743,
"grad_norm": 31.162218682245538,
"learning_rate": 1.5002155116936342e-07,
"logits/chosen": -0.6640625,
"logits/rejected": -0.71875,
"logps/chosen": -0.9140625,
"logps/rejected": -1.03125,
"loss": 0.9158,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.072265625,
"rewards/margins": 0.0693359375,
"rewards/rejected": -0.1416015625,
"step": 339
},
{
"epoch": 0.7264957264957265,
"grad_norm": 53.49760741911357,
"learning_rate": 1.4788366875889506e-07,
"logits/chosen": -0.5703125,
"logits/rejected": -0.52734375,
"logps/chosen": -0.953125,
"logps/rejected": -0.8671875,
"loss": 0.944,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08203125,
"rewards/margins": -0.0052490234375,
"rewards/rejected": -0.07666015625,
"step": 340
},
{
"epoch": 0.7286324786324786,
"grad_norm": 26.13264228294242,
"learning_rate": 1.4575704107490483e-07,
"logits/chosen": -0.640625,
"logits/rejected": -0.63671875,
"logps/chosen": -2.171875,
"logps/rejected": -1.96875,
"loss": 0.8617,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.060302734375,
"rewards/margins": 0.0751953125,
"rewards/rejected": -0.1357421875,
"step": 341
},
{
"epoch": 0.7307692307692307,
"grad_norm": 42.99292575995844,
"learning_rate": 1.4364178653737737e-07,
"logits/chosen": -0.6015625,
"logits/rejected": -0.59375,
"logps/chosen": -1.796875,
"logps/rejected": -0.8515625,
"loss": 0.9573,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.10400390625,
"rewards/margins": -0.0225830078125,
"rewards/rejected": -0.08154296875,
"step": 342
},
{
"epoch": 0.7329059829059829,
"grad_norm": 51.89913814066636,
"learning_rate": 1.4153802293299097e-07,
"logits/chosen": -0.3125,
"logits/rejected": -0.345703125,
"logps/chosen": -1.4609375,
"logps/rejected": -2.234375,
"loss": 0.8804,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03173828125,
"rewards/margins": 0.07958984375,
"rewards/rejected": -0.111328125,
"step": 343
},
{
"epoch": 0.7350427350427351,
"grad_norm": 65.90958403692584,
"learning_rate": 1.3944586740855812e-07,
"logits/chosen": -0.68359375,
"logits/rejected": -0.66796875,
"logps/chosen": -0.98828125,
"logps/rejected": -2.015625,
"loss": 0.9264,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.083984375,
"rewards/margins": 0.0908203125,
"rewards/rejected": -0.1748046875,
"step": 344
},
{
"epoch": 0.7371794871794872,
"grad_norm": 27.869444404639868,
"learning_rate": 1.373654364645021e-07,
"logits/chosen": -0.65234375,
"logits/rejected": -0.703125,
"logps/chosen": -0.84765625,
"logps/rejected": -2.0625,
"loss": 0.8766,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.037353515625,
"rewards/margins": 0.08935546875,
"rewards/rejected": -0.126953125,
"step": 345
},
{
"epoch": 0.7393162393162394,
"grad_norm": 28.689619149528802,
"learning_rate": 1.3529684594837035e-07,
"logits/chosen": -0.6328125,
"logits/rejected": -0.62109375,
"logps/chosen": -1.2421875,
"logps/rejected": -1.7890625,
"loss": 0.8604,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0517578125,
"rewards/margins": 0.0400390625,
"rewards/rejected": -0.09228515625,
"step": 346
},
{
"epoch": 0.7414529914529915,
"grad_norm": 32.71728165861424,
"learning_rate": 1.3324021104838275e-07,
"logits/chosen": -0.56640625,
"logits/rejected": -0.58984375,
"logps/chosen": -1.8828125,
"logps/rejected": -3.171875,
"loss": 0.8964,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.049560546875,
"rewards/margins": 0.00396728515625,
"rewards/rejected": -0.053466796875,
"step": 347
},
{
"epoch": 0.7435897435897436,
"grad_norm": 34.24589371209163,
"learning_rate": 1.3119564628701822e-07,
"logits/chosen": -0.337890625,
"logits/rejected": -0.41015625,
"logps/chosen": -1.34375,
"logps/rejected": -1.6875,
"loss": 0.8834,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.023193359375,
"rewards/margins": 0.0859375,
"rewards/rejected": -0.10888671875,
"step": 348
},
{
"epoch": 0.7457264957264957,
"grad_norm": 13.972227869423218,
"learning_rate": 1.2916326551463748e-07,
"logits/chosen": -0.8359375,
"logits/rejected": -0.8046875,
"logps/chosen": -0.83984375,
"logps/rejected": -0.765625,
"loss": 0.8831,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.05224609375,
"rewards/margins": 0.042236328125,
"rewards/rejected": -0.09423828125,
"step": 349
},
{
"epoch": 0.7478632478632479,
"grad_norm": 39.05059309519189,
"learning_rate": 1.2714318190314227e-07,
"logits/chosen": -0.57421875,
"logits/rejected": -0.7109375,
"logps/chosen": -0.8828125,
"logps/rejected": -1.5234375,
"loss": 0.9318,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.049072265625,
"rewards/margins": -0.0107421875,
"rewards/rejected": -0.0380859375,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 30.774060476068694,
"learning_rate": 1.251355079396752e-07,
"logits/chosen": -0.625,
"logits/rejected": -0.65625,
"logps/chosen": -1.8828125,
"logps/rejected": -0.8515625,
"loss": 0.8772,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0673828125,
"rewards/margins": 0.05517578125,
"rewards/rejected": -0.1220703125,
"step": 351
},
{
"epoch": 0.7521367521367521,
"grad_norm": 27.8606967937537,
"learning_rate": 1.2314035542035478e-07,
"logits/chosen": -0.65234375,
"logits/rejected": -0.546875,
"logps/chosen": -1.265625,
"logps/rejected": -0.85546875,
"loss": 0.8914,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08837890625,
"rewards/margins": -0.00115966796875,
"rewards/rejected": -0.0869140625,
"step": 352
},
{
"epoch": 0.7542735042735043,
"grad_norm": 23.469754541397073,
"learning_rate": 1.2115783544405026e-07,
"logits/chosen": -0.45703125,
"logits/rejected": -0.38671875,
"logps/chosen": -0.8984375,
"logps/rejected": -1.65625,
"loss": 0.8776,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03564453125,
"rewards/margins": 0.1083984375,
"rewards/rejected": -0.1435546875,
"step": 353
},
{
"epoch": 0.7564102564102564,
"grad_norm": 61.11898109311696,
"learning_rate": 1.1918805840619561e-07,
"logits/chosen": -0.412109375,
"logits/rejected": -0.54296875,
"logps/chosen": -0.6875,
"logps/rejected": -2.21875,
"loss": 0.983,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0537109375,
"rewards/margins": 0.11328125,
"rewards/rejected": -0.1669921875,
"step": 354
},
{
"epoch": 0.7585470085470085,
"grad_norm": 34.686715289245434,
"learning_rate": 1.1723113399264162e-07,
"logits/chosen": -0.53125,
"logits/rejected": -0.45703125,
"logps/chosen": -0.8046875,
"logps/rejected": -0.90625,
"loss": 0.8938,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0595703125,
"rewards/margins": 0.078125,
"rewards/rejected": -0.1376953125,
"step": 355
},
{
"epoch": 0.7606837606837606,
"grad_norm": 28.581977784197978,
"learning_rate": 1.1528717117354865e-07,
"logits/chosen": -0.57421875,
"logits/rejected": -0.625,
"logps/chosen": -1.5078125,
"logps/rejected": -2.46875,
"loss": 0.8634,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0537109375,
"rewards/margins": 0.0556640625,
"rewards/rejected": -0.109375,
"step": 356
},
{
"epoch": 0.7628205128205128,
"grad_norm": 35.61439045102826,
"learning_rate": 1.1335627819731852e-07,
"logits/chosen": -0.4375,
"logits/rejected": -0.51953125,
"logps/chosen": -0.97265625,
"logps/rejected": -1.0234375,
"loss": 0.8699,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10693359375,
"rewards/margins": 0.0096435546875,
"rewards/rejected": -0.11669921875,
"step": 357
},
{
"epoch": 0.7649572649572649,
"grad_norm": 94.57763593516974,
"learning_rate": 1.114385625845664e-07,
"logits/chosen": -0.578125,
"logits/rejected": -0.58984375,
"logps/chosen": -0.76171875,
"logps/rejected": -0.921875,
"loss": 1.0251,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0771484375,
"rewards/margins": 0.0712890625,
"rewards/rejected": -0.1484375,
"step": 358
},
{
"epoch": 0.7670940170940171,
"grad_norm": 41.86614792413909,
"learning_rate": 1.0953413112213418e-07,
"logits/chosen": -0.296875,
"logits/rejected": -0.259765625,
"logps/chosen": -0.890625,
"logps/rejected": -1.71875,
"loss": 0.9625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0458984375,
"rewards/margins": -0.00421142578125,
"rewards/rejected": -0.041748046875,
"step": 359
},
{
"epoch": 0.7692307692307693,
"grad_norm": 65.35514474408973,
"learning_rate": 1.0764308985714354e-07,
"logits/chosen": -0.52734375,
"logits/rejected": -0.5078125,
"logps/chosen": -1.0390625,
"logps/rejected": -0.69140625,
"loss": 1.0248,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.017333984375,
"rewards/margins": 0.072265625,
"rewards/rejected": -0.08935546875,
"step": 360
},
{
"epoch": 0.7713675213675214,
"grad_norm": 21.30177814317387,
"learning_rate": 1.0576554409109134e-07,
"logits/chosen": -0.51171875,
"logits/rejected": -0.5078125,
"logps/chosen": -0.796875,
"logps/rejected": -0.71875,
"loss": 0.9224,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.047119140625,
"rewards/margins": 0.017333984375,
"rewards/rejected": -0.064453125,
"step": 361
},
{
"epoch": 0.7735042735042735,
"grad_norm": 20.90088638719811,
"learning_rate": 1.039015983739857e-07,
"logits/chosen": -0.47265625,
"logits/rejected": -0.46875,
"logps/chosen": -0.75,
"logps/rejected": -1.4609375,
"loss": 0.9038,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06591796875,
"rewards/margins": 0.054443359375,
"rewards/rejected": -0.12060546875,
"step": 362
},
{
"epoch": 0.7756410256410257,
"grad_norm": 23.552172762978927,
"learning_rate": 1.0205135649852387e-07,
"logits/chosen": -0.37109375,
"logits/rejected": -0.5234375,
"logps/chosen": -0.90234375,
"logps/rejected": -1.28125,
"loss": 0.8752,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.04638671875,
"rewards/margins": 0.0576171875,
"rewards/rejected": -0.103515625,
"step": 363
},
{
"epoch": 0.7777777777777778,
"grad_norm": 43.97955125787066,
"learning_rate": 1.0021492149431308e-07,
"logits/chosen": -0.55078125,
"logits/rejected": -0.62890625,
"logps/chosen": -1.2890625,
"logps/rejected": -1.65625,
"loss": 0.8794,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.016357421875,
"rewards/margins": 0.1025390625,
"rewards/rejected": -0.11865234375,
"step": 364
},
{
"epoch": 0.7799145299145299,
"grad_norm": 17.611224627975545,
"learning_rate": 9.839239562213343e-08,
"logits/chosen": -0.26953125,
"logits/rejected": -0.322265625,
"logps/chosen": -0.9765625,
"logps/rejected": -1.171875,
"loss": 0.8906,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.044677734375,
"rewards/margins": 0.04443359375,
"rewards/rejected": -0.0888671875,
"step": 365
},
{
"epoch": 0.782051282051282,
"grad_norm": 34.066165086726,
"learning_rate": 9.658388036824293e-08,
"logits/chosen": -0.60546875,
"logits/rejected": -0.671875,
"logps/chosen": -0.70703125,
"logps/rejected": -1.3203125,
"loss": 0.8866,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.050048828125,
"rewards/margins": 0.052734375,
"rewards/rejected": -0.10302734375,
"step": 366
},
{
"epoch": 0.7841880341880342,
"grad_norm": 21.579122943736486,
"learning_rate": 9.478947643872724e-08,
"logits/chosen": -0.5625,
"logits/rejected": -0.5625,
"logps/chosen": -1.8203125,
"logps/rejected": -1.4921875,
"loss": 0.8434,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0888671875,
"rewards/margins": 0.007568359375,
"rewards/rejected": -0.0966796875,
"step": 367
},
{
"epoch": 0.7863247863247863,
"grad_norm": 27.05462584053435,
"learning_rate": 9.300928375389093e-08,
"logits/chosen": -0.50390625,
"logits/rejected": -0.44921875,
"logps/chosen": -1.953125,
"logps/rejected": -1.0390625,
"loss": 0.8708,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.09375,
"rewards/margins": 0.01104736328125,
"rewards/rejected": -0.10498046875,
"step": 368
},
{
"epoch": 0.7884615384615384,
"grad_norm": 29.745532951341836,
"learning_rate": 9.124340144269419e-08,
"logits/chosen": -0.55859375,
"logits/rejected": -0.56640625,
"logps/chosen": -1.078125,
"logps/rejected": -1.4453125,
"loss": 0.8735,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.05908203125,
"rewards/margins": 0.0732421875,
"rewards/rejected": -0.1328125,
"step": 369
},
{
"epoch": 0.7905982905982906,
"grad_norm": 36.32061374606974,
"learning_rate": 8.949192783723277e-08,
"logits/chosen": -0.796875,
"logits/rejected": -0.7265625,
"logps/chosen": -0.7265625,
"logps/rejected": -0.7734375,
"loss": 0.9109,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0732421875,
"rewards/margins": 0.02685546875,
"rewards/rejected": -0.10009765625,
"step": 370
},
{
"epoch": 0.7927350427350427,
"grad_norm": 35.858803281032266,
"learning_rate": 8.775496046726193e-08,
"logits/chosen": -0.5703125,
"logits/rejected": -0.58203125,
"logps/chosen": -1.125,
"logps/rejected": -1.0234375,
"loss": 0.8995,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.06005859375,
"rewards/margins": 0.0146484375,
"rewards/rejected": -0.07470703125,
"step": 371
},
{
"epoch": 0.7948717948717948,
"grad_norm": 101.49292897549103,
"learning_rate": 8.603259605476635e-08,
"logits/chosen": -0.66015625,
"logits/rejected": -0.640625,
"logps/chosen": -0.9453125,
"logps/rejected": -1.2265625,
"loss": 0.9249,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.056640625,
"rewards/margins": 0.003662109375,
"rewards/rejected": -0.06005859375,
"step": 372
},
{
"epoch": 0.7970085470085471,
"grad_norm": 18.221625405676257,
"learning_rate": 8.43249305085733e-08,
"logits/chosen": -0.640625,
"logits/rejected": -0.61328125,
"logps/chosen": -0.796875,
"logps/rejected": -0.76953125,
"loss": 0.853,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.04736328125,
"rewards/margins": 0.055908203125,
"rewards/rejected": -0.103515625,
"step": 373
},
{
"epoch": 0.7991452991452992,
"grad_norm": 27.286873106336177,
"learning_rate": 8.263205891901301e-08,
"logits/chosen": -0.55078125,
"logits/rejected": -0.58984375,
"logps/chosen": -1.0703125,
"logps/rejected": -1.515625,
"loss": 0.8521,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.033935546875,
"rewards/margins": 0.0888671875,
"rewards/rejected": -0.12255859375,
"step": 374
},
{
"epoch": 0.8012820512820513,
"grad_norm": 28.665743004887997,
"learning_rate": 8.095407555262294e-08,
"logits/chosen": -0.4375,
"logits/rejected": -0.416015625,
"logps/chosen": -1.0390625,
"logps/rejected": -1.21875,
"loss": 0.885,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.05322265625,
"rewards/margins": 0.040771484375,
"rewards/rejected": -0.09423828125,
"step": 375
},
{
"epoch": 0.8034188034188035,
"grad_norm": 34.61683957680185,
"learning_rate": 7.929107384689855e-08,
"logits/chosen": -0.64453125,
"logits/rejected": -0.640625,
"logps/chosen": -0.81640625,
"logps/rejected": -0.82421875,
"loss": 0.8433,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.056884765625,
"rewards/margins": 0.0213623046875,
"rewards/rejected": -0.078125,
"step": 376
},
{
"epoch": 0.8055555555555556,
"grad_norm": 27.203509280102132,
"learning_rate": 7.764314640509094e-08,
"logits/chosen": -0.8203125,
"logits/rejected": -0.7890625,
"logps/chosen": -2.46875,
"logps/rejected": -1.703125,
"loss": 0.932,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0361328125,
"rewards/margins": 0.041748046875,
"rewards/rejected": -0.07763671875,
"step": 377
},
{
"epoch": 0.8076923076923077,
"grad_norm": 69.76164123890895,
"learning_rate": 7.601038499104956e-08,
"logits/chosen": -0.5234375,
"logits/rejected": -0.494140625,
"logps/chosen": -1.609375,
"logps/rejected": -1.890625,
"loss": 0.9048,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.023681640625,
"rewards/margins": 0.06396484375,
"rewards/rejected": -0.08740234375,
"step": 378
},
{
"epoch": 0.8098290598290598,
"grad_norm": 16.95114226496351,
"learning_rate": 7.439288052411272e-08,
"logits/chosen": -0.53125,
"logits/rejected": -0.6171875,
"logps/chosen": -1.0078125,
"logps/rejected": -0.859375,
"loss": 0.8966,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.022216796875,
"rewards/margins": 0.059326171875,
"rewards/rejected": -0.08154296875,
"step": 379
},
{
"epoch": 0.811965811965812,
"grad_norm": 22.739756048836032,
"learning_rate": 7.279072307404507e-08,
"logits/chosen": -0.63671875,
"logits/rejected": -0.6484375,
"logps/chosen": -0.90625,
"logps/rejected": -1.7265625,
"loss": 0.8435,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.06494140625,
"rewards/margins": 0.040771484375,
"rewards/rejected": -0.10595703125,
"step": 380
},
{
"epoch": 0.8141025641025641,
"grad_norm": 26.571323472554763,
"learning_rate": 7.120400185602155e-08,
"logits/chosen": -0.267578125,
"logits/rejected": -0.220703125,
"logps/chosen": -0.7890625,
"logps/rejected": -1.359375,
"loss": 0.9124,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.062255859375,
"rewards/margins": 0.00299072265625,
"rewards/rejected": -0.0654296875,
"step": 381
},
{
"epoch": 0.8162393162393162,
"grad_norm": 36.79009357491132,
"learning_rate": 6.963280522565996e-08,
"logits/chosen": -0.421875,
"logits/rejected": -0.376953125,
"logps/chosen": -1.1640625,
"logps/rejected": -1.3046875,
"loss": 0.863,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0703125,
"rewards/margins": 0.05419921875,
"rewards/rejected": -0.12451171875,
"step": 382
},
{
"epoch": 0.8183760683760684,
"grad_norm": 29.76450789194752,
"learning_rate": 6.807722067410082e-08,
"logits/chosen": -0.1787109375,
"logits/rejected": -0.27734375,
"logps/chosen": -0.7265625,
"logps/rejected": -0.78125,
"loss": 0.9323,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0361328125,
"rewards/margins": 0.057373046875,
"rewards/rejected": -0.09375,
"step": 383
},
{
"epoch": 0.8205128205128205,
"grad_norm": 35.13396843518851,
"learning_rate": 6.653733482313519e-08,
"logits/chosen": -0.44140625,
"logits/rejected": -0.421875,
"logps/chosen": -0.703125,
"logps/rejected": -1.3984375,
"loss": 0.8912,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0556640625,
"rewards/margins": 0.095703125,
"rewards/rejected": -0.15234375,
"step": 384
},
{
"epoch": 0.8226495726495726,
"grad_norm": 53.4634091517358,
"learning_rate": 6.501323342038164e-08,
"logits/chosen": -0.466796875,
"logits/rejected": -0.51171875,
"logps/chosen": -0.84765625,
"logps/rejected": -1.09375,
"loss": 0.9086,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06640625,
"rewards/margins": 0.0771484375,
"rewards/rejected": -0.1435546875,
"step": 385
},
{
"epoch": 0.8247863247863247,
"grad_norm": 60.84615207783601,
"learning_rate": 6.350500133451102e-08,
"logits/chosen": -0.53125,
"logits/rejected": -0.671875,
"logps/chosen": -1.953125,
"logps/rejected": -1.5703125,
"loss": 0.8564,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.033447265625,
"rewards/margins": 0.13671875,
"rewards/rejected": -0.169921875,
"step": 386
},
{
"epoch": 0.8269230769230769,
"grad_norm": 17.914476410603207,
"learning_rate": 6.201272255052099e-08,
"logits/chosen": -0.0986328125,
"logits/rejected": -0.10888671875,
"logps/chosen": -0.73828125,
"logps/rejected": -0.83984375,
"loss": 0.8545,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.03173828125,
"rewards/margins": 0.055419921875,
"rewards/rejected": -0.08740234375,
"step": 387
},
{
"epoch": 0.8290598290598291,
"grad_norm": 21.49907230153767,
"learning_rate": 6.05364801650592e-08,
"logits/chosen": -0.74609375,
"logits/rejected": -0.6875,
"logps/chosen": -0.76171875,
"logps/rejected": -1.28125,
"loss": 0.8809,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07666015625,
"rewards/margins": 0.020751953125,
"rewards/rejected": -0.09716796875,
"step": 388
},
{
"epoch": 0.8311965811965812,
"grad_norm": 61.47517825652869,
"learning_rate": 5.907635638179577e-08,
"logits/chosen": -0.66796875,
"logits/rejected": -0.6328125,
"logps/chosen": -0.71875,
"logps/rejected": -0.75,
"loss": 0.9121,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.07666015625,
"rewards/margins": 0.039794921875,
"rewards/rejected": -0.1162109375,
"step": 389
},
{
"epoch": 0.8333333333333334,
"grad_norm": 72.42429142086857,
"learning_rate": 5.763243250684664e-08,
"logits/chosen": -0.69140625,
"logits/rejected": -0.6875,
"logps/chosen": -1.4375,
"logps/rejected": -2.484375,
"loss": 1.0898,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0439453125,
"rewards/margins": 0.2470703125,
"rewards/rejected": -0.291015625,
"step": 390
},
{
"epoch": 0.8354700854700855,
"grad_norm": 29.995287539704513,
"learning_rate": 5.6204788944245117e-08,
"logits/chosen": -0.703125,
"logits/rejected": -0.70703125,
"logps/chosen": -1.03125,
"logps/rejected": -2.1875,
"loss": 0.8693,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0478515625,
"rewards/margins": 0.14453125,
"rewards/rejected": -0.1923828125,
"step": 391
},
{
"epoch": 0.8376068376068376,
"grad_norm": 18.326694038182005,
"learning_rate": 5.479350519146523e-08,
"logits/chosen": -0.46875,
"logits/rejected": -0.56640625,
"logps/chosen": -0.78125,
"logps/rejected": -1.1015625,
"loss": 0.7939,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0576171875,
"rewards/margins": 0.080078125,
"rewards/rejected": -0.138671875,
"step": 392
},
{
"epoch": 0.8397435897435898,
"grad_norm": 43.4344545311177,
"learning_rate": 5.3398659834995075e-08,
"logits/chosen": -0.56640625,
"logits/rejected": -0.6328125,
"logps/chosen": -0.82421875,
"logps/rejected": -0.8984375,
"loss": 0.9371,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.080078125,
"rewards/margins": 0.03173828125,
"rewards/rejected": -0.11181640625,
"step": 393
},
{
"epoch": 0.8418803418803419,
"grad_norm": 54.049826393890534,
"learning_rate": 5.202033054596012e-08,
"logits/chosen": -0.796875,
"logits/rejected": -0.703125,
"logps/chosen": -1.3671875,
"logps/rejected": -1.5,
"loss": 0.892,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0220947265625,
"rewards/margins": 0.076171875,
"rewards/rejected": -0.0986328125,
"step": 394
},
{
"epoch": 0.844017094017094,
"grad_norm": 20.13102225768583,
"learning_rate": 5.0658594075799e-08,
"logits/chosen": -0.6796875,
"logits/rejected": -0.671875,
"logps/chosen": -1.765625,
"logps/rejected": -2.875,
"loss": 0.8788,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0277099609375,
"rewards/margins": 0.06982421875,
"rewards/rejected": -0.09765625,
"step": 395
},
{
"epoch": 0.8461538461538461,
"grad_norm": 28.927848143732405,
"learning_rate": 4.931352625198872e-08,
"logits/chosen": -0.75390625,
"logits/rejected": -0.7109375,
"logps/chosen": -0.859375,
"logps/rejected": -0.7109375,
"loss": 0.8744,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.046630859375,
"rewards/margins": 0.04052734375,
"rewards/rejected": -0.0869140625,
"step": 396
},
{
"epoch": 0.8482905982905983,
"grad_norm": 63.12214875940479,
"learning_rate": 4.7985201973823056e-08,
"logits/chosen": -0.61328125,
"logits/rejected": -0.6171875,
"logps/chosen": -0.671875,
"logps/rejected": -0.9609375,
"loss": 0.9304,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.076171875,
"rewards/margins": 0.0400390625,
"rewards/rejected": -0.1162109375,
"step": 397
},
{
"epoch": 0.8504273504273504,
"grad_norm": 44.643489074895264,
"learning_rate": 4.6673695208241485e-08,
"logits/chosen": -0.47265625,
"logits/rejected": -0.5546875,
"logps/chosen": -0.9140625,
"logps/rejected": -1.0546875,
"loss": 0.887,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.05322265625,
"rewards/margins": 0.0244140625,
"rewards/rejected": -0.07763671875,
"step": 398
},
{
"epoch": 0.8525641025641025,
"grad_norm": 39.04412527761531,
"learning_rate": 4.53790789857102e-08,
"logits/chosen": -0.76171875,
"logits/rejected": -0.76171875,
"logps/chosen": -1.1796875,
"logps/rejected": -1.625,
"loss": 0.9543,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0966796875,
"rewards/margins": 0.0263671875,
"rewards/rejected": -0.123046875,
"step": 399
},
{
"epoch": 0.8547008547008547,
"grad_norm": 27.42452165554953,
"learning_rate": 4.41014253961559e-08,
"logits/chosen": -0.515625,
"logits/rejected": -0.53125,
"logps/chosen": -1.078125,
"logps/rejected": -1.4453125,
"loss": 0.8307,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0810546875,
"rewards/margins": 0.083984375,
"rewards/rejected": -0.1650390625,
"step": 400
},
{
"epoch": 0.8547008547008547,
"eval_logits/chosen": -0.59765625,
"eval_logits/rejected": -0.609375,
"eval_logps/chosen": -1.1640625,
"eval_logps/rejected": -1.265625,
"eval_loss": 0.8950571417808533,
"eval_rewards/accuracies": 0.7338709831237793,
"eval_rewards/chosen": -0.068359375,
"eval_rewards/margins": 0.06982421875,
"eval_rewards/rejected": -0.138671875,
"eval_runtime": 101.1209,
"eval_samples_per_second": 19.393,
"eval_steps_per_second": 0.613,
"step": 400
},
{
"epoch": 0.8568376068376068,
"grad_norm": 33.032240773717284,
"learning_rate": 4.2840805584951014e-08,
"logits/chosen": -0.48828125,
"logits/rejected": -0.48046875,
"logps/chosen": -0.796875,
"logps/rejected": -0.8359375,
"loss": 0.8789,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.07421875,
"rewards/margins": 0.04248046875,
"rewards/rejected": -0.1162109375,
"step": 401
},
{
"epoch": 0.8589743589743589,
"grad_norm": 31.63013982511431,
"learning_rate": 4.159728974895238e-08,
"logits/chosen": -0.7578125,
"logits/rejected": -0.76953125,
"logps/chosen": -0.8828125,
"logps/rejected": -0.71875,
"loss": 0.8907,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0908203125,
"rewards/margins": 0.02392578125,
"rewards/rejected": -0.115234375,
"step": 402
},
{
"epoch": 0.8611111111111112,
"grad_norm": 48.908225956606366,
"learning_rate": 4.037094713259238e-08,
"logits/chosen": -0.640625,
"logits/rejected": -0.625,
"logps/chosen": -0.65234375,
"logps/rejected": -1.640625,
"loss": 0.9325,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0341796875,
"rewards/margins": -0.00103759765625,
"rewards/rejected": -0.032958984375,
"step": 403
},
{
"epoch": 0.8632478632478633,
"grad_norm": 56.81805342384158,
"learning_rate": 3.91618460240227e-08,
"logits/chosen": -0.53125,
"logits/rejected": -0.38671875,
"logps/chosen": -0.9140625,
"logps/rejected": -1.046875,
"loss": 0.9548,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.080078125,
"rewards/margins": 0.059814453125,
"rewards/rejected": -0.140625,
"step": 404
},
{
"epoch": 0.8653846153846154,
"grad_norm": 36.45563015025386,
"learning_rate": 3.797005375131227e-08,
"logits/chosen": -0.447265625,
"logits/rejected": -0.33984375,
"logps/chosen": -0.7734375,
"logps/rejected": -0.8203125,
"loss": 0.9068,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0703125,
"rewards/margins": 0.009521484375,
"rewards/rejected": -0.080078125,
"step": 405
},
{
"epoch": 0.8675213675213675,
"grad_norm": 29.66434209928639,
"learning_rate": 3.6795636678697766e-08,
"logits/chosen": -0.5859375,
"logits/rejected": -0.68359375,
"logps/chosen": -1.4609375,
"logps/rejected": -1.5703125,
"loss": 0.8785,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.111328125,
"rewards/margins": 0.036865234375,
"rewards/rejected": -0.1484375,
"step": 406
},
{
"epoch": 0.8696581196581197,
"grad_norm": 50.48136026330252,
"learning_rate": 3.563866020288821e-08,
"logits/chosen": -0.625,
"logits/rejected": -0.578125,
"logps/chosen": -2.46875,
"logps/rejected": -2.40625,
"loss": 0.9175,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.076171875,
"rewards/margins": 0.046875,
"rewards/rejected": -0.12353515625,
"step": 407
},
{
"epoch": 0.8717948717948718,
"grad_norm": 31.90288630862614,
"learning_rate": 3.449918874942371e-08,
"logits/chosen": -0.455078125,
"logits/rejected": -0.443359375,
"logps/chosen": -1.9296875,
"logps/rejected": -1.859375,
"loss": 0.9159,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0732421875,
"rewards/margins": 0.002899169921875,
"rewards/rejected": -0.076171875,
"step": 408
},
{
"epoch": 0.8739316239316239,
"grad_norm": 20.338754487967687,
"learning_rate": 3.337728576908747e-08,
"logits/chosen": -0.546875,
"logits/rejected": -0.44140625,
"logps/chosen": -0.6640625,
"logps/rejected": -0.68359375,
"loss": 0.8912,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.056396484375,
"rewards/margins": 0.0205078125,
"rewards/rejected": -0.0771484375,
"step": 409
},
{
"epoch": 0.8760683760683761,
"grad_norm": 30.746459120515787,
"learning_rate": 3.2273013734373e-08,
"logits/chosen": -0.337890625,
"logits/rejected": -0.462890625,
"logps/chosen": -1.0703125,
"logps/rejected": -1.1171875,
"loss": 0.8478,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.044921875,
"rewards/margins": 0.026123046875,
"rewards/rejected": -0.0712890625,
"step": 410
},
{
"epoch": 0.8782051282051282,
"grad_norm": 21.508129319749067,
"learning_rate": 3.11864341360052e-08,
"logits/chosen": -0.5625,
"logits/rejected": -0.5625,
"logps/chosen": -1.03125,
"logps/rejected": -1.1015625,
"loss": 0.8339,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.119140625,
"rewards/margins": 0.06689453125,
"rewards/rejected": -0.1865234375,
"step": 411
},
{
"epoch": 0.8803418803418803,
"grad_norm": 76.39894216110301,
"learning_rate": 3.0117607479516015e-08,
"logits/chosen": -0.486328125,
"logits/rejected": -0.458984375,
"logps/chosen": -0.69140625,
"logps/rejected": -0.82421875,
"loss": 0.9626,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.045654296875,
"rewards/margins": 0.0556640625,
"rewards/rejected": -0.1015625,
"step": 412
},
{
"epoch": 0.8824786324786325,
"grad_norm": 35.131873970074196,
"learning_rate": 2.9066593281875916e-08,
"logits/chosen": -0.81640625,
"logits/rejected": -0.7578125,
"logps/chosen": -0.98046875,
"logps/rejected": -1.25,
"loss": 0.8566,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.056396484375,
"rewards/margins": 0.025634765625,
"rewards/rejected": -0.08203125,
"step": 413
},
{
"epoch": 0.8846153846153846,
"grad_norm": 30.913338192487096,
"learning_rate": 2.8033450068178878e-08,
"logits/chosen": -0.25,
"logits/rejected": -0.1845703125,
"logps/chosen": -1.234375,
"logps/rejected": -1.375,
"loss": 0.8087,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11328125,
"rewards/margins": 0.062255859375,
"rewards/rejected": -0.17578125,
"step": 414
},
{
"epoch": 0.8867521367521367,
"grad_norm": 50.75341001418908,
"learning_rate": 2.7018235368384134e-08,
"logits/chosen": -0.375,
"logits/rejected": -0.466796875,
"logps/chosen": -1.859375,
"logps/rejected": -1.9609375,
"loss": 0.9288,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.09375,
"rewards/margins": 0.111328125,
"rewards/rejected": -0.205078125,
"step": 415
},
{
"epoch": 0.8888888888888888,
"grad_norm": 39.95487638253479,
"learning_rate": 2.6021005714112375e-08,
"logits/chosen": -0.734375,
"logits/rejected": -0.7734375,
"logps/chosen": -0.84765625,
"logps/rejected": -1.171875,
"loss": 0.8656,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.06005859375,
"rewards/margins": 0.060546875,
"rewards/rejected": -0.12060546875,
"step": 416
},
{
"epoch": 0.8910256410256411,
"grad_norm": 19.38742446704256,
"learning_rate": 2.5041816635497703e-08,
"logits/chosen": -0.55078125,
"logits/rejected": -0.5078125,
"logps/chosen": -1.7109375,
"logps/rejected": -1.1875,
"loss": 0.8466,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0289306640625,
"rewards/margins": 0.078125,
"rewards/rejected": -0.04931640625,
"step": 417
},
{
"epoch": 0.8931623931623932,
"grad_norm": 24.217067390108564,
"learning_rate": 2.408072265809576e-08,
"logits/chosen": -0.5859375,
"logits/rejected": -0.546875,
"logps/chosen": -0.7109375,
"logps/rejected": -0.9375,
"loss": 0.8562,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.060546875,
"rewards/margins": 0.050537109375,
"rewards/rejected": -0.111328125,
"step": 418
},
{
"epoch": 0.8952991452991453,
"grad_norm": 36.092412288856494,
"learning_rate": 2.313777729984726e-08,
"logits/chosen": -0.423828125,
"logits/rejected": -0.302734375,
"logps/chosen": -0.75,
"logps/rejected": -1.5234375,
"loss": 0.9023,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.10693359375,
"rewards/margins": 0.1015625,
"rewards/rejected": -0.2080078125,
"step": 419
},
{
"epoch": 0.8974358974358975,
"grad_norm": 27.603997759182036,
"learning_rate": 2.221303306809788e-08,
"logits/chosen": -0.546875,
"logits/rejected": -0.55859375,
"logps/chosen": -0.6640625,
"logps/rejected": -0.734375,
"loss": 0.892,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0732421875,
"rewards/margins": 0.0302734375,
"rewards/rejected": -0.10400390625,
"step": 420
},
{
"epoch": 0.8995726495726496,
"grad_norm": 26.452313301173593,
"learning_rate": 2.1306541456674736e-08,
"logits/chosen": -0.5703125,
"logits/rejected": -0.5625,
"logps/chosen": -0.8515625,
"logps/rejected": -0.9765625,
"loss": 0.877,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0225830078125,
"rewards/margins": 0.054931640625,
"rewards/rejected": -0.07763671875,
"step": 421
},
{
"epoch": 0.9017094017094017,
"grad_norm": 27.150619563751665,
"learning_rate": 2.0418352943018497e-08,
"logits/chosen": -0.49609375,
"logits/rejected": -0.53515625,
"logps/chosen": -0.94921875,
"logps/rejected": -0.953125,
"loss": 0.877,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1357421875,
"rewards/margins": 0.017333984375,
"rewards/rejected": -0.1533203125,
"step": 422
},
{
"epoch": 0.9038461538461539,
"grad_norm": 25.600194103146695,
"learning_rate": 1.9548516985372982e-08,
"logits/chosen": -0.515625,
"logits/rejected": -0.47265625,
"logps/chosen": -0.66796875,
"logps/rejected": -1.046875,
"loss": 0.8555,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.057861328125,
"rewards/margins": 0.0595703125,
"rewards/rejected": -0.1171875,
"step": 423
},
{
"epoch": 0.905982905982906,
"grad_norm": 62.612070797475276,
"learning_rate": 1.869708202003093e-08,
"logits/chosen": -0.8046875,
"logits/rejected": -0.77734375,
"logps/chosen": -1.0703125,
"logps/rejected": -1.296875,
"loss": 0.943,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.055908203125,
"rewards/margins": 0.11328125,
"rewards/rejected": -0.169921875,
"step": 424
},
{
"epoch": 0.9081196581196581,
"grad_norm": 27.367161811022896,
"learning_rate": 1.7864095458636836e-08,
"logits/chosen": -0.55078125,
"logits/rejected": -0.48046875,
"logps/chosen": -0.90625,
"logps/rejected": -0.7890625,
"loss": 0.8156,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0751953125,
"rewards/margins": 0.03564453125,
"rewards/rejected": -0.1103515625,
"step": 425
},
{
"epoch": 0.9102564102564102,
"grad_norm": 29.270727979912408,
"learning_rate": 1.7049603685546986e-08,
"logits/chosen": -0.828125,
"logits/rejected": -0.765625,
"logps/chosen": -0.7734375,
"logps/rejected": -0.7734375,
"loss": 0.8855,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.080078125,
"rewards/margins": 0.028076171875,
"rewards/rejected": -0.10791015625,
"step": 426
},
{
"epoch": 0.9123931623931624,
"grad_norm": 26.620286341772662,
"learning_rate": 1.6253652055246357e-08,
"logits/chosen": -0.62109375,
"logits/rejected": -0.59375,
"logps/chosen": -1.1171875,
"logps/rejected": -1.125,
"loss": 0.8418,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.049560546875,
"rewards/margins": 0.04931640625,
"rewards/rejected": -0.0986328125,
"step": 427
},
{
"epoch": 0.9145299145299145,
"grad_norm": 31.429203131736777,
"learning_rate": 1.5476284889823315e-08,
"logits/chosen": -0.455078125,
"logits/rejected": -0.55859375,
"logps/chosen": -1.0546875,
"logps/rejected": -2.609375,
"loss": 0.8818,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03515625,
"rewards/margins": 0.125,
"rewards/rejected": -0.16015625,
"step": 428
},
{
"epoch": 0.9166666666666666,
"grad_norm": 25.327941068879277,
"learning_rate": 1.4717545476501487e-08,
"logits/chosen": -0.484375,
"logits/rejected": -0.58203125,
"logps/chosen": -0.8203125,
"logps/rejected": -0.86328125,
"loss": 0.8188,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0830078125,
"rewards/margins": 0.056396484375,
"rewards/rejected": -0.1396484375,
"step": 429
},
{
"epoch": 0.9188034188034188,
"grad_norm": 36.762939345250295,
"learning_rate": 1.3977476065229216e-08,
"logits/chosen": -0.70703125,
"logits/rejected": -0.66015625,
"logps/chosen": -0.8046875,
"logps/rejected": -1.171875,
"loss": 0.8722,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07177734375,
"rewards/margins": -3.0517578125e-05,
"rewards/rejected": -0.07177734375,
"step": 430
},
{
"epoch": 0.9209401709401709,
"grad_norm": 47.33246317581902,
"learning_rate": 1.3256117866327116e-08,
"logits/chosen": -0.6015625,
"logits/rejected": -0.6328125,
"logps/chosen": -1.78125,
"logps/rejected": -1.125,
"loss": 0.9324,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.107421875,
"rewards/margins": 0.041015625,
"rewards/rejected": -0.1484375,
"step": 431
},
{
"epoch": 0.9230769230769231,
"grad_norm": 37.81514750899869,
"learning_rate": 1.2553511048193044e-08,
"logits/chosen": -0.65625,
"logits/rejected": -0.5625,
"logps/chosen": -1.015625,
"logps/rejected": -1.1484375,
"loss": 0.884,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.037353515625,
"rewards/rejected": -0.10791015625,
"step": 432
},
{
"epoch": 0.9252136752136753,
"grad_norm": 66.29958051188179,
"learning_rate": 1.1869694735065606e-08,
"logits/chosen": -0.66015625,
"logits/rejected": -0.61328125,
"logps/chosen": -0.765625,
"logps/rejected": -0.7734375,
"loss": 0.9717,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0634765625,
"rewards/margins": 0.025390625,
"rewards/rejected": -0.0888671875,
"step": 433
},
{
"epoch": 0.9273504273504274,
"grad_norm": 113.35297326249436,
"learning_rate": 1.1204707004845316e-08,
"logits/chosen": -0.5546875,
"logits/rejected": -0.64453125,
"logps/chosen": -1.0078125,
"logps/rejected": -0.9609375,
"loss": 0.879,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1279296875,
"rewards/margins": 0.0322265625,
"rewards/rejected": -0.16015625,
"step": 434
},
{
"epoch": 0.9294871794871795,
"grad_norm": 21.10807579120109,
"learning_rate": 1.0558584886974482e-08,
"logits/chosen": -0.421875,
"logits/rejected": -0.37890625,
"logps/chosen": -0.98046875,
"logps/rejected": -0.8515625,
"loss": 0.8398,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0162353515625,
"rewards/margins": 0.072265625,
"rewards/rejected": -0.08837890625,
"step": 435
},
{
"epoch": 0.9316239316239316,
"grad_norm": 25.73470244065982,
"learning_rate": 9.931364360375016e-09,
"logits/chosen": -0.5546875,
"logits/rejected": -0.57421875,
"logps/chosen": -1.46875,
"logps/rejected": -2.0,
"loss": 0.8243,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.07080078125,
"rewards/margins": 0.04296875,
"rewards/rejected": -0.11376953125,
"step": 436
},
{
"epoch": 0.9337606837606838,
"grad_norm": 38.22779506686182,
"learning_rate": 9.323080351445167e-09,
"logits/chosen": -0.49609375,
"logits/rejected": -0.65234375,
"logps/chosen": -1.4609375,
"logps/rejected": -1.65625,
"loss": 0.8678,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05517578125,
"rewards/margins": 0.09326171875,
"rewards/rejected": -0.1484375,
"step": 437
},
{
"epoch": 0.9358974358974359,
"grad_norm": 31.173946501797236,
"learning_rate": 8.733766732114484e-09,
"logits/chosen": -0.78515625,
"logits/rejected": -0.7578125,
"logps/chosen": -0.9921875,
"logps/rejected": -1.09375,
"loss": 0.8744,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.126953125,
"rewards/margins": 0.019775390625,
"rewards/rejected": -0.1474609375,
"step": 438
},
{
"epoch": 0.938034188034188,
"grad_norm": 42.95244579752985,
"learning_rate": 8.163456317957856e-09,
"logits/chosen": -0.3203125,
"logits/rejected": -0.326171875,
"logps/chosen": -0.63671875,
"logps/rejected": -0.703125,
"loss": 0.8615,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06103515625,
"rewards/margins": 0.048828125,
"rewards/rejected": -0.10986328125,
"step": 439
},
{
"epoch": 0.9401709401709402,
"grad_norm": 27.187695525499535,
"learning_rate": 7.612180866367995e-09,
"logits/chosen": -0.43359375,
"logits/rejected": -0.45703125,
"logps/chosen": -0.7734375,
"logps/rejected": -0.796875,
"loss": 0.8594,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0849609375,
"rewards/margins": 0.019775390625,
"rewards/rejected": -0.1044921875,
"step": 440
},
{
"epoch": 0.9423076923076923,
"grad_norm": 40.38489726441983,
"learning_rate": 7.079971074787322e-09,
"logits/chosen": -0.47265625,
"logits/rejected": -0.427734375,
"logps/chosen": -1.40625,
"logps/rejected": -0.859375,
"loss": 0.9238,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.032958984375,
"rewards/margins": 0.050048828125,
"rewards/rejected": -0.0830078125,
"step": 441
},
{
"epoch": 0.9444444444444444,
"grad_norm": 90.14195771853652,
"learning_rate": 6.5668565789983036e-09,
"logits/chosen": -0.66015625,
"logits/rejected": -0.62109375,
"logps/chosen": -3.09375,
"logps/rejected": -1.1875,
"loss": 0.929,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0169677734375,
"rewards/margins": 0.09912109375,
"rewards/rejected": -0.08251953125,
"step": 442
},
{
"epoch": 0.9465811965811965,
"grad_norm": 36.27940783870119,
"learning_rate": 6.072865951473316e-09,
"logits/chosen": -0.6484375,
"logits/rejected": -0.65625,
"logps/chosen": -0.91015625,
"logps/rejected": -0.8671875,
"loss": 0.927,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0908203125,
"rewards/margins": 0.01611328125,
"rewards/rejected": -0.10693359375,
"step": 443
},
{
"epoch": 0.9487179487179487,
"grad_norm": 39.72454819271685,
"learning_rate": 5.59802669978377e-09,
"logits/chosen": -0.5390625,
"logits/rejected": -0.609375,
"logps/chosen": -2.234375,
"logps/rejected": -1.7109375,
"loss": 0.8793,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11083984375,
"rewards/margins": -0.0004425048828125,
"rewards/rejected": -0.1103515625,
"step": 444
},
{
"epoch": 0.9508547008547008,
"grad_norm": 34.77865009821792,
"learning_rate": 5.142365265068022e-09,
"logits/chosen": -0.58984375,
"logits/rejected": -0.6015625,
"logps/chosen": -0.83984375,
"logps/rejected": -0.7890625,
"loss": 0.9043,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.087890625,
"rewards/margins": 0.01446533203125,
"rewards/rejected": -0.1025390625,
"step": 445
},
{
"epoch": 0.9529914529914529,
"grad_norm": 29.280675486054747,
"learning_rate": 4.705907020559363e-09,
"logits/chosen": -0.515625,
"logits/rejected": -0.625,
"logps/chosen": -0.8515625,
"logps/rejected": -1.484375,
"loss": 0.8662,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.050537109375,
"rewards/margins": 0.0206298828125,
"rewards/rejected": -0.0712890625,
"step": 446
},
{
"epoch": 0.9551282051282052,
"grad_norm": 19.018888833187862,
"learning_rate": 4.288676270172959e-09,
"logits/chosen": -0.62109375,
"logits/rejected": -0.6015625,
"logps/chosen": -0.7421875,
"logps/rejected": -1.03125,
"loss": 0.8485,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0595703125,
"rewards/margins": 0.06591796875,
"rewards/rejected": -0.1259765625,
"step": 447
},
{
"epoch": 0.9572649572649573,
"grad_norm": 44.77098877511915,
"learning_rate": 3.890696247152425e-09,
"logits/chosen": -0.57421875,
"logits/rejected": -0.6484375,
"logps/chosen": -1.8984375,
"logps/rejected": -1.6015625,
"loss": 0.8834,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0703125,
"rewards/margins": 0.0279541015625,
"rewards/rejected": -0.09814453125,
"step": 448
},
{
"epoch": 0.9594017094017094,
"grad_norm": 37.310355068897586,
"learning_rate": 3.5119891127762592e-09,
"logits/chosen": -0.53515625,
"logits/rejected": -0.458984375,
"logps/chosen": -0.97265625,
"logps/rejected": -1.0,
"loss": 0.9282,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10107421875,
"rewards/margins": 0.0152587890625,
"rewards/rejected": -0.1162109375,
"step": 449
},
{
"epoch": 0.9615384615384616,
"grad_norm": 33.412721023215504,
"learning_rate": 3.1525759551237485e-09,
"logits/chosen": -0.6484375,
"logits/rejected": -0.6171875,
"logps/chosen": -0.66796875,
"logps/rejected": -0.66015625,
"loss": 0.8937,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.076171875,
"rewards/margins": 0.02001953125,
"rewards/rejected": -0.0966796875,
"step": 450
},
{
"epoch": 0.9636752136752137,
"grad_norm": 25.449096875471138,
"learning_rate": 2.8124767879005752e-09,
"logits/chosen": -0.71484375,
"logits/rejected": -0.765625,
"logps/chosen": -0.87109375,
"logps/rejected": -1.4765625,
"loss": 0.8815,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0771484375,
"rewards/margins": 0.031005859375,
"rewards/rejected": -0.10791015625,
"step": 451
},
{
"epoch": 0.9658119658119658,
"grad_norm": 26.9546143729673,
"learning_rate": 2.491710549324644e-09,
"logits/chosen": -0.4296875,
"logits/rejected": -0.470703125,
"logps/chosen": -0.984375,
"logps/rejected": -1.78125,
"loss": 0.8539,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06005859375,
"rewards/margins": 0.046142578125,
"rewards/rejected": -0.10595703125,
"step": 452
},
{
"epoch": 0.967948717948718,
"grad_norm": 21.60271449407226,
"learning_rate": 2.190295101071171e-09,
"logits/chosen": -0.5234375,
"logits/rejected": -0.5078125,
"logps/chosen": -0.7578125,
"logps/rejected": -0.9921875,
"loss": 0.8593,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0810546875,
"rewards/margins": 0.037109375,
"rewards/rejected": -0.1181640625,
"step": 453
},
{
"epoch": 0.9700854700854701,
"grad_norm": 21.38516131392487,
"learning_rate": 1.9082472272783146e-09,
"logits/chosen": -0.40234375,
"logits/rejected": -0.4921875,
"logps/chosen": -0.7890625,
"logps/rejected": -0.80078125,
"loss": 0.8829,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06640625,
"rewards/margins": 0.018310546875,
"rewards/rejected": -0.0849609375,
"step": 454
},
{
"epoch": 0.9722222222222222,
"grad_norm": 81.83247110230516,
"learning_rate": 1.6455826336124857e-09,
"logits/chosen": -0.7578125,
"logits/rejected": -0.671875,
"logps/chosen": -1.453125,
"logps/rejected": -1.59375,
"loss": 0.9213,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.064453125,
"rewards/margins": 0.0260009765625,
"rewards/rejected": -0.09033203125,
"step": 455
},
{
"epoch": 0.9743589743589743,
"grad_norm": 60.332814313074266,
"learning_rate": 1.4023159463938173e-09,
"logits/chosen": -0.6015625,
"logits/rejected": -0.6484375,
"logps/chosen": -0.828125,
"logps/rejected": -0.91796875,
"loss": 0.9134,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06494140625,
"rewards/margins": 0.036376953125,
"rewards/rejected": -0.1015625,
"step": 456
},
{
"epoch": 0.9764957264957265,
"grad_norm": 49.45626820858851,
"learning_rate": 1.178460711781587e-09,
"logits/chosen": -0.65625,
"logits/rejected": -0.703125,
"logps/chosen": -0.85546875,
"logps/rejected": -1.1484375,
"loss": 0.869,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.052490234375,
"rewards/margins": 0.07421875,
"rewards/rejected": -0.126953125,
"step": 457
},
{
"epoch": 0.9786324786324786,
"grad_norm": 42.320706874764625,
"learning_rate": 9.74029395020065e-10,
"logits/chosen": -0.58984375,
"logits/rejected": -0.69921875,
"logps/chosen": -0.8671875,
"logps/rejected": -1.2265625,
"loss": 0.9081,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0625,
"rewards/margins": 0.061767578125,
"rewards/rejected": -0.12451171875,
"step": 458
},
{
"epoch": 0.9807692307692307,
"grad_norm": 35.68338029015088,
"learning_rate": 7.890333797442805e-10,
"logits/chosen": -0.57421875,
"logits/rejected": -0.61328125,
"logps/chosen": -0.7578125,
"logps/rejected": -0.86328125,
"loss": 0.8683,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0771484375,
"rewards/margins": 0.051513671875,
"rewards/rejected": -0.12890625,
"step": 459
},
{
"epoch": 0.9829059829059829,
"grad_norm": 40.17334625901446,
"learning_rate": 6.234829673462505e-10,
"logits/chosen": -0.5546875,
"logits/rejected": -0.490234375,
"logps/chosen": -1.171875,
"logps/rejected": -1.578125,
"loss": 0.8818,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.09228515625,
"rewards/margins": 0.04931640625,
"rewards/rejected": -0.1416015625,
"step": 460
},
{
"epoch": 0.9850427350427351,
"grad_norm": 46.29329122227152,
"learning_rate": 4.773873764012059e-10,
"logits/chosen": -0.6953125,
"logits/rejected": -0.80859375,
"logps/chosen": -0.80078125,
"logps/rejected": -1.078125,
"loss": 0.9723,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07421875,
"rewards/margins": 0.0106201171875,
"rewards/rejected": -0.0849609375,
"step": 461
},
{
"epoch": 0.9871794871794872,
"grad_norm": 38.67355799271119,
"learning_rate": 3.507547421543966e-10,
"logits/chosen": -0.251953125,
"logits/rejected": -0.333984375,
"logps/chosen": -1.09375,
"logps/rejected": -0.79296875,
"loss": 0.9374,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.09326171875,
"rewards/margins": 0.021484375,
"rewards/rejected": -0.11474609375,
"step": 462
},
{
"epoch": 0.9893162393162394,
"grad_norm": 25.110448028534133,
"learning_rate": 2.435921160678922e-10,
"logits/chosen": -0.74609375,
"logits/rejected": -0.765625,
"logps/chosen": -1.078125,
"logps/rejected": -1.09375,
"loss": 0.8717,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0751953125,
"rewards/margins": 0.0458984375,
"rewards/rejected": -0.12109375,
"step": 463
},
{
"epoch": 0.9914529914529915,
"grad_norm": 33.231328715847084,
"learning_rate": 1.559054654281966e-10,
"logits/chosen": -0.490234375,
"logits/rejected": -0.47265625,
"logps/chosen": -1.3125,
"logps/rejected": -1.53125,
"loss": 0.9052,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.1337890625,
"rewards/margins": 0.0252685546875,
"rewards/rejected": -0.1591796875,
"step": 464
},
{
"epoch": 0.9935897435897436,
"grad_norm": 21.045630521161826,
"learning_rate": 8.769967301381909e-11,
"logits/chosen": -0.66015625,
"logits/rejected": -0.66015625,
"logps/chosen": -0.9296875,
"logps/rejected": -1.046875,
"loss": 0.8311,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.10498046875,
"rewards/margins": 0.0595703125,
"rewards/rejected": -0.1650390625,
"step": 465
},
{
"epoch": 0.9957264957264957,
"grad_norm": 18.01335853415681,
"learning_rate": 3.8978536823230934e-11,
"logits/chosen": -0.2734375,
"logits/rejected": -0.333984375,
"logps/chosen": -1.234375,
"logps/rejected": -1.8671875,
"loss": 0.8609,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.04052734375,
"rewards/margins": 0.1318359375,
"rewards/rejected": -0.171875,
"step": 466
},
{
"epoch": 0.9978632478632479,
"grad_norm": 48.25462426916188,
"learning_rate": 9.744769863712088e-12,
"logits/chosen": -0.609375,
"logits/rejected": -0.5625,
"logps/chosen": -0.8203125,
"logps/rejected": -0.9140625,
"loss": 0.8513,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.08984375,
"rewards/margins": 0.059326171875,
"rewards/rejected": -0.1494140625,
"step": 467
},
{
"epoch": 1.0,
"grad_norm": 31.93733299063659,
"learning_rate": 0.0,
"logits/chosen": -0.6875,
"logits/rejected": -0.72265625,
"logps/chosen": -0.80859375,
"logps/rejected": -1.1875,
"loss": 0.887,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1064453125,
"rewards/margins": 0.0274658203125,
"rewards/rejected": -0.1337890625,
"step": 468
},
{
"epoch": 1.0,
"step": 468,
"total_flos": 0.0,
"train_loss": 0.9510387192424546,
"train_runtime": 8444.0036,
"train_samples_per_second": 7.091,
"train_steps_per_second": 0.055
}
],
"logging_steps": 1,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}