dpo-checkpoint-66 / trainer_state.json
yilingwang's picture
Add files using upload-large-folder tool
1b6ac1b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 66,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030303030303030304,
"grad_norm": 3.359375,
"kl_divergence": 0.0,
"learning_rate": 2e-05,
"log_prob_preferred": -40.3125,
"log_prob_rejected": -38.6875,
"loss": 0.69140625,
"policy_diff": -1.625,
"ref_diff": -1.625,
"reward_margin": 0.0,
"step": 1,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.06060606060606061,
"grad_norm": 2.28125,
"kl_divergence": -0.00021457672119140625,
"learning_rate": 1.96969696969697e-05,
"log_prob_preferred": -38.5,
"log_prob_rejected": -37.5625,
"loss": 0.6796875,
"policy_diff": -0.9375,
"ref_diff": -1.0625,
"reward_margin": 0.0250244140625,
"step": 2,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.09090909090909091,
"grad_norm": 1.921875,
"kl_divergence": -0.000499725341796875,
"learning_rate": 1.9393939393939395e-05,
"log_prob_preferred": -37.3125,
"log_prob_rejected": -36.5625,
"loss": 0.6796875,
"policy_diff": -0.75,
"ref_diff": -0.875,
"reward_margin": 0.0250244140625,
"step": 3,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.84375,
"kl_divergence": -0.0011119842529296875,
"learning_rate": 1.9090909090909094e-05,
"log_prob_preferred": -37.375,
"log_prob_rejected": -36.3125,
"loss": 0.6796875,
"policy_diff": -1.0625,
"ref_diff": -1.1875,
"reward_margin": 0.0250244140625,
"step": 4,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.15151515151515152,
"grad_norm": 2.546875,
"kl_divergence": -0.0003497600555419922,
"learning_rate": 1.8787878787878792e-05,
"log_prob_preferred": -35.875,
"log_prob_rejected": -34.65625,
"loss": 0.65625,
"policy_diff": -1.21875,
"ref_diff": -1.59375,
"reward_margin": 0.07513427734375,
"step": 5,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.71875,
"kl_divergence": -0.00025653839111328125,
"learning_rate": 1.8484848484848487e-05,
"log_prob_preferred": -38.375,
"log_prob_rejected": -37.5,
"loss": 0.66796875,
"policy_diff": -0.875,
"ref_diff": -1.125,
"reward_margin": 0.050048828125,
"step": 6,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.21212121212121213,
"grad_norm": 2.03125,
"kl_divergence": -6.866455078125e-05,
"learning_rate": 1.8181818181818182e-05,
"log_prob_preferred": -38.625,
"log_prob_rejected": -37.3125,
"loss": 0.66796875,
"policy_diff": -1.3125,
"ref_diff": -1.5625,
"reward_margin": 0.050048828125,
"step": 7,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.6015625,
"kl_divergence": 0.00064849853515625,
"learning_rate": 1.787878787878788e-05,
"log_prob_preferred": -38.5625,
"log_prob_rejected": -37.75,
"loss": 0.662109375,
"policy_diff": -0.8125,
"ref_diff": -1.125,
"reward_margin": 0.06256103515625,
"step": 8,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.2727272727272727,
"grad_norm": 1.734375,
"kl_divergence": 0.0003542900085449219,
"learning_rate": 1.7575757575757576e-05,
"log_prob_preferred": -39.5625,
"log_prob_rejected": -38.75,
"loss": 0.662109375,
"policy_diff": -0.8125,
"ref_diff": -1.125,
"reward_margin": 0.06256103515625,
"step": 9,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.30303030303030304,
"grad_norm": 1.3515625,
"kl_divergence": -7.82012939453125e-05,
"learning_rate": 1.7272727272727274e-05,
"log_prob_preferred": -36.9375,
"log_prob_rejected": -36.375,
"loss": 0.662109375,
"policy_diff": -0.5625,
"ref_diff": -0.875,
"reward_margin": 0.06256103515625,
"step": 10,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.71875,
"kl_divergence": 0.0005383491516113281,
"learning_rate": 1.6969696969696972e-05,
"log_prob_preferred": -35.6875,
"log_prob_rejected": -34.75,
"loss": 0.662109375,
"policy_diff": -0.9375,
"ref_diff": -1.25,
"reward_margin": 0.06256103515625,
"step": 11,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.6328125,
"kl_divergence": -6.29425048828125e-05,
"learning_rate": 1.6666666666666667e-05,
"log_prob_preferred": -38.3125,
"log_prob_rejected": -37.375,
"loss": 0.662109375,
"policy_diff": -0.9375,
"ref_diff": -1.25,
"reward_margin": 0.06256103515625,
"step": 12,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.3939393939393939,
"grad_norm": 1.7109375,
"kl_divergence": 0.0003426074981689453,
"learning_rate": 1.6363636363636366e-05,
"log_prob_preferred": -40.1875,
"log_prob_rejected": -39.375,
"loss": 0.64453125,
"policy_diff": -0.8125,
"ref_diff": -1.3125,
"reward_margin": 0.10015869140625,
"step": 13,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.42424242424242425,
"grad_norm": 1.6328125,
"kl_divergence": 0.0017499923706054688,
"learning_rate": 1.606060606060606e-05,
"log_prob_preferred": -35.125,
"log_prob_rejected": -34.5,
"loss": 0.6591796875,
"policy_diff": -0.625,
"ref_diff": -0.96875,
"reward_margin": 0.0687255859375,
"step": 14,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.6484375,
"kl_divergence": 0.0004515647888183594,
"learning_rate": 1.575757575757576e-05,
"log_prob_preferred": -38.6875,
"log_prob_rejected": -37.875,
"loss": 0.662109375,
"policy_diff": -0.8125,
"ref_diff": -1.125,
"reward_margin": 0.0626220703125,
"step": 15,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.48484848484848486,
"grad_norm": 1.5546875,
"kl_divergence": 0.0008220672607421875,
"learning_rate": 1.5454545454545454e-05,
"log_prob_preferred": -36.75,
"log_prob_rejected": -36.0625,
"loss": 0.64453125,
"policy_diff": -0.6875,
"ref_diff": -1.1875,
"reward_margin": 0.10015869140625,
"step": 16,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.5151515151515151,
"grad_norm": 1.6953125,
"kl_divergence": 0.0004444122314453125,
"learning_rate": 1.5151515151515153e-05,
"log_prob_preferred": -35.125,
"log_prob_rejected": -34.25,
"loss": 0.6416015625,
"policy_diff": -0.875,
"ref_diff": -1.40625,
"reward_margin": 0.1063232421875,
"step": 17,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.5454545454545454,
"grad_norm": 1.7421875,
"kl_divergence": 0.000736236572265625,
"learning_rate": 1.484848484848485e-05,
"log_prob_preferred": -38.1875,
"log_prob_rejected": -37.5625,
"loss": 0.650390625,
"policy_diff": -0.625,
"ref_diff": -1.0625,
"reward_margin": 0.087646484375,
"step": 18,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.5757575757575758,
"grad_norm": 1.578125,
"kl_divergence": 0.0011148452758789062,
"learning_rate": 1.4545454545454546e-05,
"log_prob_preferred": -39.5625,
"log_prob_rejected": -38.9375,
"loss": 0.662109375,
"policy_diff": -0.625,
"ref_diff": -0.9375,
"reward_margin": 0.06256103515625,
"step": 19,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.5,
"kl_divergence": -0.00011205673217773438,
"learning_rate": 1.4242424242424245e-05,
"log_prob_preferred": -38.9375,
"log_prob_rejected": -38.3125,
"loss": 0.638671875,
"policy_diff": -0.625,
"ref_diff": -1.1875,
"reward_margin": 0.112548828125,
"step": 20,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.6363636363636364,
"grad_norm": 1.671875,
"kl_divergence": 0.0005855560302734375,
"learning_rate": 1.3939393939393942e-05,
"log_prob_preferred": -40.0625,
"log_prob_rejected": -39.125,
"loss": 0.62109375,
"policy_diff": -0.9375,
"ref_diff": -1.6875,
"reward_margin": 0.150146484375,
"step": 21,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.75,
"kl_divergence": 0.0017135143280029297,
"learning_rate": 1.3636363636363637e-05,
"log_prob_preferred": -37.875,
"log_prob_rejected": -37.1875,
"loss": 0.638671875,
"policy_diff": -0.6875,
"ref_diff": -1.25,
"reward_margin": 0.1126708984375,
"step": 22,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.696969696969697,
"grad_norm": 1.2265625,
"kl_divergence": 0.0012230873107910156,
"learning_rate": 1.3333333333333333e-05,
"log_prob_preferred": -37.4375,
"log_prob_rejected": -36.875,
"loss": 0.650390625,
"policy_diff": -0.5625,
"ref_diff": -1.0,
"reward_margin": 0.087646484375,
"step": 23,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.1796875,
"kl_divergence": 0.00089263916015625,
"learning_rate": 1.3030303030303032e-05,
"log_prob_preferred": -37.0625,
"log_prob_rejected": -36.59375,
"loss": 0.6474609375,
"policy_diff": -0.46875,
"ref_diff": -0.9375,
"reward_margin": 0.093902587890625,
"step": 24,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.7575757575757576,
"grad_norm": 1.1953125,
"kl_divergence": 0.0015249252319335938,
"learning_rate": 1.2727272727272728e-05,
"log_prob_preferred": -38.0625,
"log_prob_rejected": -37.5,
"loss": 0.64453125,
"policy_diff": -0.5625,
"ref_diff": -1.0625,
"reward_margin": 0.1002197265625,
"step": 25,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.7878787878787878,
"grad_norm": 1.4921875,
"kl_divergence": -0.0013937950134277344,
"learning_rate": 1.2424242424242425e-05,
"log_prob_preferred": -38.3125,
"log_prob_rejected": -37.75,
"loss": 0.650390625,
"policy_diff": -0.5625,
"ref_diff": -1.0,
"reward_margin": 0.087646484375,
"step": 26,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.8181818181818182,
"grad_norm": 1.1328125,
"kl_divergence": 0.000850677490234375,
"learning_rate": 1.2121212121212122e-05,
"log_prob_preferred": -37.375,
"log_prob_rejected": -37.0625,
"loss": 0.626953125,
"policy_diff": -0.3125,
"ref_diff": -1.0,
"reward_margin": 0.1376953125,
"step": 27,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.8484848484848485,
"grad_norm": 1.6171875,
"kl_divergence": 0.000988006591796875,
"learning_rate": 1.181818181818182e-05,
"log_prob_preferred": -36.0,
"log_prob_rejected": -35.3125,
"loss": 0.6142578125,
"policy_diff": -0.6875,
"ref_diff": -1.53125,
"reward_margin": 0.16888427734375,
"step": 28,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.8787878787878788,
"grad_norm": 1.59375,
"kl_divergence": 0.0015506744384765625,
"learning_rate": 1.1515151515151517e-05,
"log_prob_preferred": -39.0,
"log_prob_rejected": -38.3125,
"loss": 0.638671875,
"policy_diff": -0.6875,
"ref_diff": -1.25,
"reward_margin": 0.1126708984375,
"step": 29,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.65625,
"kl_divergence": -0.0009489059448242188,
"learning_rate": 1.1212121212121212e-05,
"log_prob_preferred": -41.0,
"log_prob_rejected": -40.125,
"loss": 0.650390625,
"policy_diff": -0.875,
"ref_diff": -1.3125,
"reward_margin": 0.08758544921875,
"step": 30,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.9393939393939394,
"grad_norm": 1.828125,
"kl_divergence": 0.001901388168334961,
"learning_rate": 1.0909090909090909e-05,
"log_prob_preferred": -35.0625,
"log_prob_rejected": -34.1875,
"loss": 0.6181640625,
"policy_diff": -0.875,
"ref_diff": -1.65625,
"reward_margin": 0.15625,
"step": 31,
"valid_samples_per_batch": 4.0
},
{
"epoch": 0.9696969696969697,
"grad_norm": 1.2734375,
"kl_divergence": 0.0008959770202636719,
"learning_rate": 1.0606060606060606e-05,
"log_prob_preferred": -37.25,
"log_prob_rejected": -36.6875,
"loss": 0.650390625,
"policy_diff": -0.5625,
"ref_diff": -1.0,
"reward_margin": 0.08758544921875,
"step": 32,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.0,
"grad_norm": 1.3828125,
"kl_divergence": 0.0013256072998046875,
"learning_rate": 1.0303030303030304e-05,
"log_prob_preferred": -37.1875,
"log_prob_rejected": -36.6875,
"loss": 0.626953125,
"policy_diff": -0.5,
"ref_diff": -1.1875,
"reward_margin": 0.13763427734375,
"step": 33,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.0303030303030303,
"grad_norm": 1.0234375,
"kl_divergence": 0.0015974044799804688,
"learning_rate": 1e-05,
"log_prob_preferred": -35.75,
"log_prob_rejected": -35.25,
"loss": 0.650390625,
"policy_diff": -0.5,
"ref_diff": -0.9375,
"reward_margin": 0.087646484375,
"step": 34,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.953125,
"kl_divergence": 0.0008296966552734375,
"learning_rate": 9.696969696969698e-06,
"log_prob_preferred": -37.1875,
"log_prob_rejected": -36.8125,
"loss": 0.64453125,
"policy_diff": -0.375,
"ref_diff": -0.875,
"reward_margin": 0.10015869140625,
"step": 35,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.1015625,
"kl_divergence": -0.0008130073547363281,
"learning_rate": 9.393939393939396e-06,
"log_prob_preferred": -37.375,
"log_prob_rejected": -36.875,
"loss": 0.6416015625,
"policy_diff": -0.5,
"ref_diff": -1.03125,
"reward_margin": 0.10626220703125,
"step": 36,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.121212121212121,
"grad_norm": 1.3984375,
"kl_divergence": 0.0008101463317871094,
"learning_rate": 9.090909090909091e-06,
"log_prob_preferred": -38.75,
"log_prob_rejected": -38.125,
"loss": 0.609375,
"policy_diff": -0.625,
"ref_diff": -1.5,
"reward_margin": 0.17529296875,
"step": 37,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.1515151515151516,
"grad_norm": 1.3515625,
"kl_divergence": 0.0025653839111328125,
"learning_rate": 8.787878787878788e-06,
"log_prob_preferred": -37.5,
"log_prob_rejected": -36.9375,
"loss": 0.650390625,
"policy_diff": -0.5625,
"ref_diff": -1.0,
"reward_margin": 0.087646484375,
"step": 38,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.25,
"kl_divergence": -0.000659942626953125,
"learning_rate": 8.484848484848486e-06,
"log_prob_preferred": -38.5,
"log_prob_rejected": -38.0,
"loss": 0.62109375,
"policy_diff": -0.5,
"ref_diff": -1.25,
"reward_margin": 0.150146484375,
"step": 39,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.2121212121212122,
"grad_norm": 1.328125,
"kl_divergence": -7.82012939453125e-05,
"learning_rate": 8.181818181818183e-06,
"log_prob_preferred": -39.125,
"log_prob_rejected": -38.5,
"loss": 0.650390625,
"policy_diff": -0.625,
"ref_diff": -1.0625,
"reward_margin": 0.08758544921875,
"step": 40,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.2424242424242424,
"grad_norm": 1.2578125,
"kl_divergence": 0.0023040771484375,
"learning_rate": 7.87878787878788e-06,
"log_prob_preferred": -35.1875,
"log_prob_rejected": -34.5625,
"loss": 0.626953125,
"policy_diff": -0.625,
"ref_diff": -1.3125,
"reward_margin": 0.13763427734375,
"step": 41,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.2727272727272727,
"grad_norm": 1.4453125,
"kl_divergence": 0.0010166168212890625,
"learning_rate": 7.5757575757575764e-06,
"log_prob_preferred": -37.25,
"log_prob_rejected": -36.625,
"loss": 0.6171875,
"policy_diff": -0.625,
"ref_diff": -1.4375,
"reward_margin": 0.16259765625,
"step": 42,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.303030303030303,
"grad_norm": 1.625,
"kl_divergence": 0.0015864372253417969,
"learning_rate": 7.272727272727273e-06,
"log_prob_preferred": -39.0,
"log_prob_rejected": -38.375,
"loss": 0.615234375,
"policy_diff": -0.625,
"ref_diff": -1.4375,
"reward_margin": 0.16265869140625,
"step": 43,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.97265625,
"kl_divergence": -0.000118255615234375,
"learning_rate": 6.969696969696971e-06,
"log_prob_preferred": -34.625,
"log_prob_rejected": -34.21875,
"loss": 0.6337890625,
"policy_diff": -0.40625,
"ref_diff": -1.03125,
"reward_margin": 0.12530517578125,
"step": 44,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.8203125,
"kl_divergence": 0.0010471343994140625,
"learning_rate": 6.666666666666667e-06,
"log_prob_preferred": -38.75,
"log_prob_rejected": -37.75,
"loss": 0.56640625,
"policy_diff": -1.0,
"ref_diff": -2.375,
"reward_margin": 0.275146484375,
"step": 45,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.9453125,
"kl_divergence": 0.0012845993041992188,
"learning_rate": 6.363636363636364e-06,
"log_prob_preferred": -38.6875,
"log_prob_rejected": -38.3125,
"loss": 0.650390625,
"policy_diff": -0.375,
"ref_diff": -0.8125,
"reward_margin": 0.08758544921875,
"step": 46,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.4242424242424243,
"grad_norm": 0.8984375,
"kl_divergence": -0.0010428428649902344,
"learning_rate": 6.060606060606061e-06,
"log_prob_preferred": -37.8125,
"log_prob_rejected": -37.5,
"loss": 0.64453125,
"policy_diff": -0.3125,
"ref_diff": -0.8125,
"reward_margin": 0.10015869140625,
"step": 47,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.4545454545454546,
"grad_norm": 1.5,
"kl_divergence": 0.0023941993713378906,
"learning_rate": 5.7575757575757586e-06,
"log_prob_preferred": -39.375,
"log_prob_rejected": -38.625,
"loss": 0.6171875,
"policy_diff": -0.75,
"ref_diff": -1.5625,
"reward_margin": 0.16253662109375,
"step": 48,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.8203125,
"kl_divergence": 1.52587890625e-05,
"learning_rate": 5.4545454545454545e-06,
"log_prob_preferred": -34.0,
"log_prob_rejected": -33.6875,
"loss": 0.650390625,
"policy_diff": -0.3125,
"ref_diff": -0.75,
"reward_margin": 0.087646484375,
"step": 49,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.5151515151515151,
"grad_norm": 1.5,
"kl_divergence": 0.0006866455078125,
"learning_rate": 5.151515151515152e-06,
"log_prob_preferred": -38.125,
"log_prob_rejected": -37.375,
"loss": 0.609375,
"policy_diff": -0.75,
"ref_diff": -1.625,
"reward_margin": 0.1751708984375,
"step": 50,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.5454545454545454,
"grad_norm": 1.140625,
"kl_divergence": 5.7220458984375e-05,
"learning_rate": 4.848484848484849e-06,
"log_prob_preferred": -38.6875,
"log_prob_rejected": -38.125,
"loss": 0.638671875,
"policy_diff": -0.5625,
"ref_diff": -1.125,
"reward_margin": 0.11260986328125,
"step": 51,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.99609375,
"kl_divergence": 0.0010547637939453125,
"learning_rate": 4.5454545454545455e-06,
"log_prob_preferred": -37.5625,
"log_prob_rejected": -37.125,
"loss": 0.662109375,
"policy_diff": -0.4375,
"ref_diff": -0.75,
"reward_margin": 0.06256103515625,
"step": 52,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.606060606060606,
"grad_norm": 1.0078125,
"kl_divergence": 0.0012416839599609375,
"learning_rate": 4.242424242424243e-06,
"log_prob_preferred": -39.1875,
"log_prob_rejected": -38.6875,
"loss": 0.65625,
"policy_diff": -0.5,
"ref_diff": -0.875,
"reward_margin": 0.0750732421875,
"step": 53,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.6363636363636362,
"grad_norm": 1.125,
"kl_divergence": 0.0001773834228515625,
"learning_rate": 3.93939393939394e-06,
"log_prob_preferred": -34.9375,
"log_prob_rejected": -34.5,
"loss": 0.62109375,
"policy_diff": -0.4375,
"ref_diff": -1.1875,
"reward_margin": 0.15020751953125,
"step": 54,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.3046875,
"kl_divergence": 0.0010638236999511719,
"learning_rate": 3.6363636363636366e-06,
"log_prob_preferred": -34.21875,
"log_prob_rejected": -33.6875,
"loss": 0.615234375,
"policy_diff": -0.53125,
"ref_diff": -1.34375,
"reward_margin": 0.16259765625,
"step": 55,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.696969696969697,
"grad_norm": 1.71875,
"kl_divergence": 0.0024175643920898438,
"learning_rate": 3.3333333333333333e-06,
"log_prob_preferred": -37.625,
"log_prob_rejected": -36.9375,
"loss": 0.609375,
"policy_diff": -0.6875,
"ref_diff": -1.5625,
"reward_margin": 0.1751708984375,
"step": 56,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.7272727272727273,
"grad_norm": 1.25,
"kl_divergence": 0.0025844573974609375,
"learning_rate": 3.0303030303030305e-06,
"log_prob_preferred": -37.8125,
"log_prob_rejected": -37.25,
"loss": 0.6044921875,
"policy_diff": -0.5625,
"ref_diff": -1.5,
"reward_margin": 0.1878662109375,
"step": 57,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.7575757575757576,
"grad_norm": 1.03125,
"kl_divergence": -0.000789642333984375,
"learning_rate": 2.7272727272727272e-06,
"log_prob_preferred": -35.59375,
"log_prob_rejected": -35.28125,
"loss": 0.6533203125,
"policy_diff": -0.3125,
"ref_diff": -0.71875,
"reward_margin": 0.08135986328125,
"step": 58,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.7878787878787878,
"grad_norm": 1.109375,
"kl_divergence": 0.00030517578125,
"learning_rate": 2.4242424242424244e-06,
"log_prob_preferred": -37.6875,
"log_prob_rejected": -37.1875,
"loss": 0.662109375,
"policy_diff": -0.5,
"ref_diff": -0.8125,
"reward_margin": 0.06256103515625,
"step": 59,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.4453125,
"kl_divergence": 0.0015001296997070312,
"learning_rate": 2.1212121212121216e-06,
"log_prob_preferred": -39.1875,
"log_prob_rejected": -38.5625,
"loss": 0.626953125,
"policy_diff": -0.625,
"ref_diff": -1.3125,
"reward_margin": 0.13775634765625,
"step": 60,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.8484848484848486,
"grad_norm": 0.921875,
"kl_divergence": 0.00011348724365234375,
"learning_rate": 1.8181818181818183e-06,
"log_prob_preferred": -38.625,
"log_prob_rejected": -38.3125,
"loss": 0.650390625,
"policy_diff": -0.3125,
"ref_diff": -0.75,
"reward_margin": 0.08758544921875,
"step": 61,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.878787878787879,
"grad_norm": 1.4609375,
"kl_divergence": 0.002395153045654297,
"learning_rate": 1.5151515151515152e-06,
"log_prob_preferred": -36.8125,
"log_prob_rejected": -36.125,
"loss": 0.59375,
"policy_diff": -0.6875,
"ref_diff": -1.75,
"reward_margin": 0.2125244140625,
"step": 62,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.9090909090909092,
"grad_norm": 1.21875,
"kl_divergence": -0.0005307197570800781,
"learning_rate": 1.2121212121212122e-06,
"log_prob_preferred": -38.25,
"log_prob_rejected": -37.8125,
"loss": 0.64453125,
"policy_diff": -0.4375,
"ref_diff": -0.9375,
"reward_margin": 0.10015869140625,
"step": 63,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.9393939393939394,
"grad_norm": 1.2421875,
"kl_divergence": 0.0015139579772949219,
"learning_rate": 9.090909090909091e-07,
"log_prob_preferred": -37.75,
"log_prob_rejected": -37.3125,
"loss": 0.62109375,
"policy_diff": -0.4375,
"ref_diff": -1.1875,
"reward_margin": 0.1502685546875,
"step": 64,
"valid_samples_per_batch": 4.0
},
{
"epoch": 1.9696969696969697,
"grad_norm": 1.2578125,
"kl_divergence": 0.0006976127624511719,
"learning_rate": 6.060606060606061e-07,
"log_prob_preferred": -37.6875,
"log_prob_rejected": -37.1875,
"loss": 0.62109375,
"policy_diff": -0.5,
"ref_diff": -1.25,
"reward_margin": 0.150146484375,
"step": 65,
"valid_samples_per_batch": 4.0
},
{
"epoch": 2.0,
"grad_norm": 1.25,
"kl_divergence": 0.000301361083984375,
"learning_rate": 3.0303030303030305e-07,
"log_prob_preferred": -38.0625,
"log_prob_rejected": -37.5625,
"loss": 0.615234375,
"policy_diff": -0.5,
"ref_diff": -1.3125,
"reward_margin": 0.1627197265625,
"step": 66,
"valid_samples_per_batch": 4.0
}
],
"logging_steps": 1.0,
"max_steps": 66,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}