Qwen2-0.5B-math-grpo / trainer_state.json
stpete2's picture
Upload folder using huggingface_hub
8db58f0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 482.90625,
"epoch": 0.008,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 1,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 457.34375,
"epoch": 0.016,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 2,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 447.6875,
"epoch": 0.024,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 4.615384615384616e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 3,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 474.9375,
"epoch": 0.032,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 4,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 452.34375,
"epoch": 0.04,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.692307692307694e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 5,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 464.34375,
"epoch": 0.048,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 6,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 450.59375,
"epoch": 0.056,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.076923076923077e-05,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0,
"step": 7,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 450.8125,
"epoch": 0.064,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.7793766260147095,
"kl": 0.0,
"learning_rate": 1.230769230769231e-05,
"loss": -0.0027,
"reward": 2.03125,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.03125,
"second_item": 0.0,
"step": 8,
"total_sum": 0.0
},
{
"clip_ratio": 0.0,
"completion_length": 454.09375,
"epoch": 0.072,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.027297493070364,
"kl": 0.00109100341796875,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.0,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": -0.00028061866760253906,
"step": 9,
"total_sum": -0.00028082728385925293
},
{
"clip_ratio": 0.0,
"completion_length": 449.75,
"epoch": 0.08,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.08137404173612595,
"kl": 0.0069732666015625,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"second_item": 0.0002465248107910156,
"step": 10,
"total_sum": 0.0002470235340297222
},
{
"clip_ratio": 0.0,
"completion_length": 455.125,
"epoch": 0.088,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 2.0901278008428459e-07,
"grad_norm": 1.5568859577178955,
"kl": 0.01507568359375,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.0111,
"reward": 2.1796875,
"reward_std": 0.359375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.0625,
"rewards/tag_count_reward": 0.1171875,
"second_item": 0.00919342041015625,
"step": 11,
"total_sum": 0.009175417013466358
},
{
"clip_ratio": 0.0,
"completion_length": 456.46875,
"epoch": 0.096,
"first_item": 1.4901161193847656e-08,
"first_item_div_second_item": 1.5119791379746302e-07,
"grad_norm": 1.7241485118865967,
"kl": 0.03033447265625,
"learning_rate": 1.8461538461538465e-05,
"loss": -0.0271,
"reward": 2.71875,
"reward_std": 0.5814496129751205,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.40625,
"rewards/tag_count_reward": 0.3125,
"second_item": 0.17236328125,
"step": 12,
"total_sum": 0.1725619211792946
},
{
"clip_ratio": 0.0,
"completion_length": 476.84375,
"epoch": 0.104,
"first_item": 0.0,
"first_item_div_second_item": -3.1597506488735924e-08,
"grad_norm": 1.110132098197937,
"kl": 0.0904541015625,
"learning_rate": 2e-05,
"loss": 0.0434,
"reward": 3.640625,
"reward_std": 0.5629279315471649,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.828125,
"second_item": 4.12060546875,
"step": 13,
"total_sum": 4.109678082168102
},
{
"clip_ratio": 0.0,
"completion_length": 288.4375,
"epoch": 0.112,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -4.94644289303585e-10,
"grad_norm": 1.3746771812438965,
"kl": 0.3095703125,
"learning_rate": 1.9996066263830533e-05,
"loss": -0.0027,
"reward": 3.8046875,
"reward_std": 0.390625,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.8984375,
"second_item": 7.5,
"step": 14,
"total_sum": 7.5020482540130615
},
{
"clip_ratio": 0.0,
"completion_length": 281.40625,
"epoch": 0.12,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 2.6501071453094482,
"kl": 0.50390625,
"learning_rate": 1.998426815017817e-05,
"loss": 0.0337,
"reward": 2.6640625,
"reward_std": 0.39768657833337784,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.125,
"rewards/tag_count_reward": 0.5390625,
"second_item": 0.326171875,
"step": 15,
"total_sum": 0.326133593916893
},
{
"clip_ratio": 0.0,
"completion_length": 262.5625,
"epoch": 0.128,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 6.176646102876358e-10,
"grad_norm": 2.351057529449463,
"kl": 0.4150390625,
"learning_rate": 1.9964614941176194e-05,
"loss": 0.0875,
"reward": 3.65625,
"reward_std": 0.46278105676174164,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.75,
"rewards/tag_count_reward": 0.90625,
"second_item": 24.765625,
"step": 16,
"total_sum": 24.740907192230225
},
{
"clip_ratio": 0.0,
"completion_length": 277.71875,
"epoch": 0.136,
"first_item": -1.1175870895385742e-08,
"first_item_div_second_item": -5.519724165409711e-09,
"grad_norm": 2.0735116004943848,
"kl": 0.4453125,
"learning_rate": 1.9937122098932428e-05,
"loss": -0.0944,
"reward": 3.4375,
"reward_std": 0.47706207633018494,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.71875,
"rewards/tag_count_reward": 0.71875,
"second_item": 520.337890625,
"step": 17,
"total_sum": 519.6693426072598
},
{
"clip_ratio": 0.0,
"completion_length": 251.53125,
"epoch": 0.144,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 2.5027475357055664,
"kl": 0.4169921875,
"learning_rate": 1.9901811253364458e-05,
"loss": -0.0146,
"reward": 3.359375,
"reward_std": 0.7189894616603851,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.5625,
"rewards/tag_count_reward": 0.796875,
"second_item": 49.82568359375,
"step": 18,
"total_sum": 49.88163825124502
},
{
"clip_ratio": 0.0,
"completion_length": 329.65625,
"epoch": 0.152,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -1.003864996328273e-08,
"grad_norm": 0.5646486282348633,
"kl": 0.2529296875,
"learning_rate": 1.985871018518236e-05,
"loss": 0.0372,
"reward": 3.9375,
"reward_std": 0.125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.96875,
"second_item": 32.685546875,
"step": 19,
"total_sum": 32.599996507167816
},
{
"clip_ratio": 0.0,
"completion_length": 376.875,
"epoch": 0.16,
"first_item": -1.1175870895385742e-08,
"first_item_div_second_item": -2.719362519503432e-08,
"grad_norm": 1.3619046211242676,
"kl": 0.251953125,
"learning_rate": 1.9807852804032306e-05,
"loss": 0.0708,
"reward": 3.625,
"reward_std": 0.23103028535842896,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.78125,
"second_item": 0.3994140625,
"step": 20,
"total_sum": 0.39887452125549316
},
{
"clip_ratio": 0.0,
"completion_length": 247.5625,
"epoch": 0.168,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.596679419784371e-10,
"grad_norm": 1.2550877332687378,
"kl": 0.66796875,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.0587,
"reward": 3.828125,
"reward_std": 0.25129537284374237,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.90625,
"rewards/tag_count_reward": 0.921875,
"second_item": 3.654296875,
"step": 21,
"total_sum": 3.6529918909072876
},
{
"clip_ratio": 0.0,
"completion_length": 308.1875,
"epoch": 0.176,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -9.58464183656221e-09,
"grad_norm": 0.3708275556564331,
"kl": 0.37890625,
"learning_rate": 1.9683035221222617e-05,
"loss": -0.0318,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 0.7138671875,
"step": 22,
"total_sum": 0.7121883630752563
},
{
"clip_ratio": 0.0,
"completion_length": 291.78125,
"epoch": 0.184,
"first_item": 0.0,
"first_item_div_second_item": -2.3482853970924673e-09,
"grad_norm": 2.5349650382995605,
"kl": 0.404296875,
"learning_rate": 1.9609173219450998e-05,
"loss": -0.1219,
"reward": 3.28125,
"reward_std": 0.8468633890151978,
"rewards/accuracy_reward": 1.875,
"rewards/format_reward": 0.625,
"rewards/tag_count_reward": 0.78125,
"second_item": 0.94140625,
"step": 23,
"total_sum": 0.9402182698249817
},
{
"clip_ratio": 0.0,
"completion_length": 374.9375,
"epoch": 0.192,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.2565770149230957,
"kl": 0.29052734375,
"learning_rate": 1.9527751227228964e-05,
"loss": -0.0221,
"reward": 3.875,
"reward_std": 0.25,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.90625,
"rewards/tag_count_reward": 0.96875,
"second_item": 1.0078125,
"step": 24,
"total_sum": 1.0071582198143005
},
{
"clip_ratio": 0.0,
"completion_length": 402.375,
"epoch": 0.2,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.1214624643325806,
"kl": 0.2421875,
"learning_rate": 1.9438833303083677e-05,
"loss": -0.0381,
"reward": 3.796875,
"reward_std": 0.3270031735301018,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 0.90625,
"rewards/tag_count_reward": 0.953125,
"second_item": 0.697265625,
"step": 25,
"total_sum": 0.6968253254890442
},
{
"clip_ratio": 0.0,
"completion_length": 398.96875,
"epoch": 0.208,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -4.7211539989644865e-09,
"grad_norm": 0.7691929340362549,
"kl": 0.236328125,
"learning_rate": 1.9342489402945997e-05,
"loss": 0.0169,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 3.00390625,
"step": 26,
"total_sum": 3.0006871819496155
},
{
"clip_ratio": 0.0,
"completion_length": 433.25,
"epoch": 0.216,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.2706140875816345,
"kl": 0.255859375,
"learning_rate": 1.9238795325112867e-05,
"loss": 0.0097,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 0.94140625,
"step": 27,
"total_sum": 0.9400412738323212
},
{
"clip_ratio": 0.0,
"completion_length": 435.53125,
"epoch": 0.224,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -4.054737298490541e-10,
"grad_norm": 0.9731921553611755,
"kl": 0.2578125,
"learning_rate": 1.912783265061319e-05,
"loss": 0.0155,
"reward": 3.921875,
"reward_std": 0.15625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.953125,
"second_item": 7.875,
"step": 28,
"total_sum": 7.877335548400879
},
{
"clip_ratio": 0.0,
"completion_length": 428.84375,
"epoch": 0.232,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1715633124113083,
"kl": 0.22021484375,
"learning_rate": 1.900968867902419e-05,
"loss": 0.0082,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 1841.671875,
"step": 29,
"total_sum": 1842.0478942394257
},
{
"clip_ratio": 0.0,
"completion_length": 428.03125,
"epoch": 0.24,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.2258675992488861,
"kl": 0.2578125,
"learning_rate": 1.8884456359788725e-05,
"loss": 0.0096,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 3.359375,
"step": 30,
"total_sum": 3.3617489337921143
},
{
"clip_ratio": 0.0,
"completion_length": 480.5,
"epoch": 0.248,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.25862640142440796,
"kl": 0.25048828125,
"learning_rate": 1.8752234219087538e-05,
"loss": 0.0099,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 2.5546875,
"step": 31,
"total_sum": 2.5553407073020935
},
{
"clip_ratio": 0.0,
"completion_length": 492.90625,
"epoch": 0.256,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.20847512781620026,
"kl": 0.2421875,
"learning_rate": 1.8613126282324092e-05,
"loss": 0.0097,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 1961.7421875,
"step": 32,
"total_sum": 1962.195048570633
},
{
"clip_ratio": 0.0,
"completion_length": 437.21875,
"epoch": 0.264,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.7570972442626953,
"kl": 0.24609375,
"learning_rate": 1.8467241992282842e-05,
"loss": 0.0151,
"reward": 3.734375,
"reward_std": 0.03125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.859375,
"second_item": 1.5546875,
"step": 33,
"total_sum": 1.557328462600708
},
{
"clip_ratio": 0.0,
"completion_length": 489.53125,
"epoch": 0.272,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.18539515137672424,
"kl": 0.23681640625,
"learning_rate": 1.8314696123025456e-05,
"loss": 0.0094,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 23.8515625,
"step": 34,
"total_sum": 23.815812468528748
},
{
"clip_ratio": 0.0,
"completion_length": 454.8125,
"epoch": 0.28,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.19201111793518066,
"kl": 0.26904296875,
"learning_rate": 1.8155608689592604e-05,
"loss": 0.0103,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 4.2890625,
"step": 35,
"total_sum": 4.293882369995117
},
{
"clip_ratio": 0.0,
"completion_length": 456.375,
"epoch": 0.288,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.2121107578277588,
"kl": 0.2412109375,
"learning_rate": 1.7990104853582494e-05,
"loss": 0.0091,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 3.546875,
"step": 36,
"total_sum": 3.547041416168213
},
{
"clip_ratio": 0.0,
"completion_length": 442.3125,
"epoch": 0.296,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1804787516593933,
"kl": 0.265625,
"learning_rate": 1.78183148246803e-05,
"loss": 0.0097,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 4.0,
"step": 37,
"total_sum": 4.0047523975372314
},
{
"clip_ratio": 0.0,
"completion_length": 484.96875,
"epoch": 0.304,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6737444400787354,
"kl": 0.20166015625,
"learning_rate": 1.7640373758216075e-05,
"loss": 0.008,
"reward": 3.9609375,
"reward_std": 0.078125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.9921875,
"second_item": 1274.4375,
"step": 38,
"total_sum": 1276.6473398208618
},
{
"clip_ratio": 0.0,
"completion_length": 463.21875,
"epoch": 0.312,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.15978366136550903,
"kl": 0.216796875,
"learning_rate": 1.7456421648831658e-05,
"loss": 0.0085,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 3.328125,
"step": 39,
"total_sum": 3.32777202129364
},
{
"clip_ratio": 0.0,
"completion_length": 471.96875,
"epoch": 0.32,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1662157028913498,
"kl": 0.22607421875,
"learning_rate": 1.7266603220340273e-05,
"loss": 0.0089,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 5.1328125,
"step": 40,
"total_sum": 5.137336730957031
},
{
"clip_ratio": 0.0,
"completion_length": 494.59375,
"epoch": 0.328,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.12864062190055847,
"kl": 0.19140625,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.0077,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 469.46875,
"step": 41,
"total_sum": 469.15666449069977
},
{
"clip_ratio": 0.0,
"completion_length": 502.875,
"epoch": 0.336,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.19264847040176392,
"kl": 0.2001953125,
"learning_rate": 1.686996926034902e-05,
"loss": 0.008,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 3.4140625,
"step": 42,
"total_sum": 3.4142632484436035
},
{
"clip_ratio": 0.0,
"completion_length": 500.0,
"epoch": 0.344,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1367393285036087,
"kl": 0.189453125,
"learning_rate": 1.6663465779520042e-05,
"loss": 0.0076,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 11.0625,
"step": 43,
"total_sum": 11.049258708953857
},
{
"clip_ratio": 0.0,
"completion_length": 491.125,
"epoch": 0.352,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.7059330940246582,
"kl": 0.205078125,
"learning_rate": 1.645171983542088e-05,
"loss": 0.008,
"reward": 3.9609375,
"reward_std": 0.078125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.9921875,
"second_item": 4.78125,
"step": 44,
"total_sum": 4.784100770950317
},
{
"clip_ratio": 0.0,
"completion_length": 496.4375,
"epoch": 0.36,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.4085657596588135,
"kl": 0.22607421875,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.0092,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 5.5625,
"step": 45,
"total_sum": 5.5558922290802
},
{
"clip_ratio": 0.0,
"completion_length": 512.0,
"epoch": 0.368,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -1.4191576683063792e-09,
"grad_norm": 0.6515297889709473,
"kl": 0.1728515625,
"learning_rate": 1.601317091298406e-05,
"loss": 0.0069,
"reward": 3.984375,
"reward_std": 0.03125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.984375,
"second_item": 2.3203125,
"step": 46,
"total_sum": 2.324346423149109
},
{
"clip_ratio": 0.0,
"completion_length": 506.46875,
"epoch": 0.376,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.619065523147583,
"kl": 0.1796875,
"learning_rate": 1.578671296179806e-05,
"loss": 0.0072,
"reward": 3.9609375,
"reward_std": 0.078125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.9921875,
"second_item": 3.7265625,
"step": 47,
"total_sum": 3.726863741874695
},
{
"clip_ratio": 0.0,
"completion_length": 504.4375,
"epoch": 0.384,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.14806345105171204,
"kl": 0.17919921875,
"learning_rate": 1.5555702330196024e-05,
"loss": 0.0071,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 1.927734375,
"step": 48,
"total_sum": 1.9234183728694916
},
{
"clip_ratio": 0.0,
"completion_length": 482.03125,
"epoch": 0.392,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.8865867257118225,
"kl": 0.24267578125,
"learning_rate": 1.5320320765153367e-05,
"loss": 0.0082,
"reward": 3.9921875,
"reward_std": 0.015625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.9921875,
"second_item": 4.16015625,
"step": 49,
"total_sum": 4.161918342113495
},
{
"clip_ratio": 0.0,
"completion_length": 500.75,
"epoch": 0.4,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.5075924396514893,
"kl": 0.26171875,
"learning_rate": 1.5080753452465296e-05,
"loss": 0.0104,
"reward": 3.9765625,
"reward_std": 0.029919598251581192,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.9765625,
"second_item": 7.5234375,
"step": 50,
"total_sum": 7.529886245727539
},
{
"clip_ratio": 0.0,
"completion_length": 489.8125,
"epoch": 0.408,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6599332094192505,
"kl": 0.22119140625,
"learning_rate": 1.4837188871052399e-05,
"loss": 0.0086,
"reward": 3.453125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.71875,
"rewards/tag_count_reward": 0.734375,
"second_item": 14.84375,
"step": 51,
"total_sum": 14.823784828186035
},
{
"clip_ratio": 0.0,
"completion_length": 512.0,
"epoch": 0.416,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6535671353340149,
"kl": 0.2216796875,
"learning_rate": 1.4589818644675378e-05,
"loss": 0.0089,
"reward": 3.9609375,
"reward_std": 0.078125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.9921875,
"second_item": 295.5,
"step": 52,
"total_sum": 295.7428283691406
},
{
"clip_ratio": 0.0,
"completion_length": 483.625,
"epoch": 0.424,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.7440216541290283,
"kl": 0.2587890625,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.0249,
"reward": 3.6953125,
"reward_std": 0.49763840436935425,
"rewards/accuracy_reward": 1.875,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.8828125,
"second_item": 5.953125,
"step": 53,
"total_sum": 5.957712173461914
},
{
"clip_ratio": 0.0,
"completion_length": 498.09375,
"epoch": 0.432,
"first_item": -7.450580596923828e-09,
"first_item_div_second_item": -1.3303867507961851e-09,
"grad_norm": 1.2703138589859009,
"kl": 0.23193359375,
"learning_rate": 1.4084442569359964e-05,
"loss": 0.009,
"reward": 3.8984375,
"reward_std": 0.18319407105445862,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.9296875,
"second_item": 5.65625,
"step": 54,
"total_sum": 5.652552604675293
},
{
"clip_ratio": 0.0,
"completion_length": 503.1875,
"epoch": 0.44,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.0315853357315063,
"kl": 0.21875,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.0085,
"reward": 3.6484375,
"reward_std": 0.13530339300632477,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.8359375,
"second_item": 188.8125,
"step": 55,
"total_sum": 188.6527681350708
},
{
"clip_ratio": 0.0,
"completion_length": 506.71875,
"epoch": 0.448,
"first_item": -7.450580596923828e-09,
"first_item_div_second_item": -3.2356165308247496e-09,
"grad_norm": 1.0939035415649414,
"kl": 0.19970703125,
"learning_rate": 1.3566215326623131e-05,
"loss": 0.008,
"reward": 3.65625,
"reward_std": 0.1875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.84375,
"second_item": 7.0078125,
"step": 56,
"total_sum": 7.023218750953674
},
{
"clip_ratio": 0.0,
"completion_length": 471.0625,
"epoch": 0.456,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.18012821674346924,
"kl": 0.22998046875,
"learning_rate": 1.3302790619551673e-05,
"loss": 0.0084,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 968.546875,
"step": 57,
"total_sum": 967.2251669764519
},
{
"clip_ratio": 0.0,
"completion_length": 475.3125,
"epoch": 0.464,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.5792978405952454,
"kl": 0.20947265625,
"learning_rate": 1.3036767451096148e-05,
"loss": 0.0072,
"reward": 3.9921875,
"reward_std": 0.015625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.9921875,
"second_item": 2.349609375,
"step": 58,
"total_sum": 2.353084683418274
},
{
"clip_ratio": 0.0,
"completion_length": 489.71875,
"epoch": 0.472,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.21192185580730438,
"kl": 0.2109375,
"learning_rate": 1.2768355114248493e-05,
"loss": 0.0082,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 0.9443359375,
"step": 59,
"total_sum": 0.9458350092172623
},
{
"clip_ratio": 0.0,
"completion_length": 466.21875,
"epoch": 0.48,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.8656077014624187e-10,
"grad_norm": 1.5874462127685547,
"kl": 0.2705078125,
"learning_rate": 1.249776478167227e-05,
"loss": 0.024,
"reward": 3.921875,
"reward_std": 0.15625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.953125,
"second_item": 6.939453125,
"step": 60,
"total_sum": 6.952052086591721
},
{
"clip_ratio": 0.0,
"completion_length": 448.15625,
"epoch": 0.488,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.785263566306959e-10,
"grad_norm": 1.4860674142837524,
"kl": 0.2607421875,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.0567,
"reward": 3.5,
"reward_std": 0.4321783781051636,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.71875,
"rewards/tag_count_reward": 0.78125,
"second_item": 6.982421875,
"step": 61,
"total_sum": 6.976648718118668
},
{
"clip_ratio": 0.0,
"completion_length": 467.5,
"epoch": 0.496,
"first_item": -1.1175870895385742e-08,
"first_item_div_second_item": -1.1127608281986097e-08,
"grad_norm": 1.8530943393707275,
"kl": 0.24609375,
"learning_rate": 1.1950903220161286e-05,
"loss": 0.0063,
"reward": 3.0703125,
"reward_std": 0.9480443596839905,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.53125,
"rewards/tag_count_reward": 0.5390625,
"second_item": 97.16796875,
"step": 62,
"total_sum": 97.3824203312397
},
{
"clip_ratio": 0.0,
"completion_length": 499.1875,
"epoch": 0.504,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.9478371143341064,
"kl": 0.2265625,
"learning_rate": 1.1675062233047365e-05,
"loss": -0.0152,
"reward": 2.8984375,
"reward_std": 1.0290476083755493,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.4375,
"rewards/tag_count_reward": 0.4609375,
"second_item": 1.05810546875,
"step": 63,
"total_sum": 1.0574220344424248
},
{
"clip_ratio": 0.0,
"completion_length": 419.28125,
"epoch": 0.512,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 9.489272508979978e-09,
"grad_norm": 1.7507895231246948,
"kl": 0.28515625,
"learning_rate": 1.1397903395354996e-05,
"loss": 0.1077,
"reward": 2.859375,
"reward_std": 0.9210017323493958,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.40625,
"rewards/tag_count_reward": 0.453125,
"second_item": 0.5751953125,
"step": 64,
"total_sum": 0.5762820988893509
},
{
"clip_ratio": 0.0,
"completion_length": 497.3125,
"epoch": 0.52,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.414428949356079,
"kl": 0.2158203125,
"learning_rate": 1.1119644761033079e-05,
"loss": -0.0004,
"reward": 3.1875,
"reward_std": 0.672369509935379,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.5625,
"rewards/tag_count_reward": 0.625,
"second_item": 8.25,
"step": 65,
"total_sum": 8.253949701786041
},
{
"clip_ratio": 0.0,
"completion_length": 469.46875,
"epoch": 0.528,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.327781857508907e-09,
"grad_norm": 1.1460624933242798,
"kl": 0.203125,
"learning_rate": 1.0840505249292477e-05,
"loss": 0.0146,
"reward": 3.59375,
"reward_std": 0.3125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.78125,
"second_item": 0.5107421875,
"step": 66,
"total_sum": 0.5107531398534775
},
{
"clip_ratio": 0.0,
"completion_length": 462.28125,
"epoch": 0.536,
"first_item": 1.862645149230957e-09,
"first_item_div_second_item": 3.386626928305935e-10,
"grad_norm": 1.0261918306350708,
"kl": 0.24609375,
"learning_rate": 1.0560704472371919e-05,
"loss": 0.0097,
"reward": 3.6640625,
"reward_std": 0.32916322350502014,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.9140625,
"second_item": 3.064453125,
"step": 67,
"total_sum": 3.065736472606659
},
{
"clip_ratio": 0.0,
"completion_length": 391.71875,
"epoch": 0.544,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 1.91500768306081e-09,
"grad_norm": 0.9414896965026855,
"kl": 0.3193359375,
"learning_rate": 1.028046256275869e-05,
"loss": -0.0681,
"reward": 3.8203125,
"reward_std": 0.359375,
"rewards/accuracy_reward": 1.9375,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.9453125,
"second_item": 1.119140625,
"step": 68,
"total_sum": 1.1211046129465103
},
{
"clip_ratio": 0.0,
"completion_length": 455.90625,
"epoch": 0.552,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.47802987694740295,
"kl": 0.2197265625,
"learning_rate": 1e-05,
"loss": 0.0065,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 198.6875,
"step": 69,
"total_sum": 198.5185830593109
},
{
"clip_ratio": 0.0,
"completion_length": 490.125,
"epoch": 0.56,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.9080296754837036,
"kl": 0.1943359375,
"learning_rate": 9.719537437241311e-06,
"loss": 0.0241,
"reward": 3.7109375,
"reward_std": 0.140625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.8671875,
"second_item": 0.9921875,
"step": 70,
"total_sum": 0.9923984408378601
},
{
"clip_ratio": 0.0,
"completion_length": 495.40625,
"epoch": 0.568,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.3803475797176361,
"kl": 0.232421875,
"learning_rate": 9.439295527628083e-06,
"loss": 0.007,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 2.4609375,
"step": 71,
"total_sum": 2.456995338201523
},
{
"clip_ratio": 0.0,
"completion_length": 492.46875,
"epoch": 0.576,
"first_item": -7.450580596923828e-09,
"first_item_div_second_item": -1.4839002149245586e-08,
"grad_norm": 0.6860982179641724,
"kl": 0.17626953125,
"learning_rate": 9.159494750707527e-06,
"loss": 0.0202,
"reward": 3.90625,
"reward_std": 0.1875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.96875,
"second_item": 0.5029296875,
"step": 72,
"total_sum": 0.5038877725601196
},
{
"clip_ratio": 0.0,
"completion_length": 474.375,
"epoch": 0.584,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -3.24378736733566e-09,
"grad_norm": 0.4427696764469147,
"kl": 0.2021484375,
"learning_rate": 8.880355238966923e-06,
"loss": 0.0064,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 1.1640625,
"step": 73,
"total_sum": 1.1623322367668152
},
{
"clip_ratio": 0.0,
"completion_length": 489.96875,
"epoch": 0.592,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -1.1413266842281213e-12,
"grad_norm": 0.7066324353218079,
"kl": 0.193359375,
"learning_rate": 8.602096604645009e-06,
"loss": -0.0353,
"reward": 3.875,
"reward_std": 0.25,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.9375,
"second_item": 1640.25,
"step": 74,
"total_sum": 1641.355055809021
},
{
"clip_ratio": 0.0,
"completion_length": 473.40625,
"epoch": 0.6,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.8320920467376709,
"kl": 0.1767578125,
"learning_rate": 8.324937766952638e-06,
"loss": -0.0102,
"reward": 3.8984375,
"reward_std": 0.203125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.9609375,
"second_item": 1.6796875,
"step": 75,
"total_sum": 1.6792444586753845
},
{
"clip_ratio": 0.0,
"completion_length": 496.9375,
"epoch": 0.608,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6126235127449036,
"kl": 0.185546875,
"learning_rate": 8.04909677983872e-06,
"loss": 0.0068,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 3.89453125,
"step": 76,
"total_sum": 3.888884425163269
},
{
"clip_ratio": 0.0,
"completion_length": 495.8125,
"epoch": 0.616,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.16169646382331848,
"kl": 0.19287109375,
"learning_rate": 7.774790660436857e-06,
"loss": 0.0065,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 1.677734375,
"step": 77,
"total_sum": 1.6808350086212158
},
{
"clip_ratio": 0.0,
"completion_length": 512.0,
"epoch": 0.624,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.162797212600708,
"kl": 0.15478515625,
"learning_rate": 7.50223521832773e-06,
"loss": 0.0062,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 2.986328125,
"step": 78,
"total_sum": 2.9852358400821686
},
{
"clip_ratio": 0.0,
"completion_length": 507.125,
"epoch": 0.632,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.3044331669807434,
"kl": 0.17431640625,
"learning_rate": 7.2316448857515076e-06,
"loss": 0.007,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 58.4609375,
"step": 79,
"total_sum": 58.46868443489075
},
{
"clip_ratio": 0.0,
"completion_length": 511.25,
"epoch": 0.64,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.14911580085754395,
"kl": 0.162109375,
"learning_rate": 6.963232548903853e-06,
"loss": 0.0065,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 2.8671875,
"step": 80,
"total_sum": 2.866468667984009
},
{
"clip_ratio": 0.0,
"completion_length": 493.125,
"epoch": 0.648,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1717568188905716,
"kl": 0.189453125,
"learning_rate": 6.697209380448333e-06,
"loss": 0.0072,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 2.794921875,
"step": 81,
"total_sum": 2.800520181655884
},
{
"clip_ratio": 0.0,
"completion_length": 512.0,
"epoch": 0.656,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.5472955703735352,
"kl": 0.158203125,
"learning_rate": 6.43378467337687e-06,
"loss": 0.0063,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 2.62109375,
"step": 82,
"total_sum": 2.6212058067321777
},
{
"clip_ratio": 0.0,
"completion_length": 498.875,
"epoch": 0.664,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1360127478837967,
"kl": 0.1875,
"learning_rate": 6.173165676349103e-06,
"loss": 0.0073,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 0.833984375,
"step": 83,
"total_sum": 0.8321126252412796
},
{
"clip_ratio": 0.0,
"completion_length": 498.21875,
"epoch": 0.672,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.11324969679117203,
"kl": 0.1767578125,
"learning_rate": 5.91555743064004e-06,
"loss": 0.0069,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 197.015625,
"step": 84,
"total_sum": 196.66522407531738
},
{
"clip_ratio": 0.0,
"completion_length": 467.5,
"epoch": 0.68,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6697484254837036,
"kl": 0.2392578125,
"learning_rate": 5.66116260882442e-06,
"loss": 0.0087,
"reward": 3.703125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.859375,
"second_item": 5.73828125,
"step": 85,
"total_sum": 5.750060975551605
},
{
"clip_ratio": 0.0,
"completion_length": 491.5625,
"epoch": 0.688,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.449559550394315e-09,
"grad_norm": 0.5044450163841248,
"kl": 0.19677734375,
"learning_rate": 5.410181355324622e-06,
"loss": 0.0078,
"reward": 3.9375,
"reward_std": 0.125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.96875,
"second_item": 7.623046875,
"step": 86,
"total_sum": 7.606956303119659
},
{
"clip_ratio": 0.0,
"completion_length": 486.25,
"epoch": 0.696,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.6098404722769945e-09,
"grad_norm": 0.9169361591339111,
"kl": 0.248046875,
"learning_rate": 5.1628111289476025e-06,
"loss": 0.0075,
"reward": 3.90625,
"reward_std": 0.1875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.96875,
"second_item": 1.58984375,
"step": 87,
"total_sum": 1.5913574695587158
},
{
"clip_ratio": 0.0,
"completion_length": 499.90625,
"epoch": 0.704,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.22143904864788055,
"kl": 0.19091796875,
"learning_rate": 4.919246547534709e-06,
"loss": 0.0074,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 3.71875,
"step": 88,
"total_sum": 3.719162940979004
},
{
"clip_ratio": 0.0,
"completion_length": 488.4375,
"epoch": 0.712,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 3.4304799935373085e-09,
"grad_norm": 0.9220072031021118,
"kl": 0.20654296875,
"learning_rate": 4.679679234846636e-06,
"loss": 0.0081,
"reward": 3.890625,
"reward_std": 0.21875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.953125,
"second_item": 0.890625,
"step": 89,
"total_sum": 0.8904827535152435
},
{
"clip_ratio": 0.0,
"completion_length": 495.96875,
"epoch": 0.72,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.8823760747909546,
"kl": 0.19677734375,
"learning_rate": 4.444297669803981e-06,
"loss": 0.0077,
"reward": 3.90625,
"reward_std": 0.1875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.96875,
"second_item": 2.328125,
"step": 90,
"total_sum": 2.3348000049591064
},
{
"clip_ratio": 0.0,
"completion_length": 510.09375,
"epoch": 0.728,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.256110887982257e-13,
"grad_norm": 0.6629019975662231,
"kl": 0.20947265625,
"learning_rate": 4.213287038201943e-06,
"loss": 0.0084,
"reward": 3.9375,
"reward_std": 0.125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.96875,
"second_item": 8860.0,
"step": 91,
"total_sum": 8861.269836425781
},
{
"clip_ratio": 0.0,
"completion_length": 447.03125,
"epoch": 0.736,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.7865133881568909,
"kl": 0.2041015625,
"learning_rate": 3.986829087015941e-06,
"loss": -0.0353,
"reward": 3.890625,
"reward_std": 0.21875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.953125,
"second_item": 582.5,
"step": 92,
"total_sum": 580.777759552002
},
{
"clip_ratio": 0.0,
"completion_length": 474.84375,
"epoch": 0.744,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.15393787622451782,
"kl": 0.21435546875,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.0078,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 7328.33984375,
"step": 93,
"total_sum": 7324.671706229448
},
{
"clip_ratio": 0.0,
"completion_length": 484.90625,
"epoch": 0.752,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 4.219793080262041e-09,
"grad_norm": 0.9448766708374023,
"kl": 0.208984375,
"learning_rate": 3.5482801645791266e-06,
"loss": 0.0147,
"reward": 3.90625,
"reward_std": 0.1875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.96875,
"second_item": 2368.44140625,
"step": 94,
"total_sum": 2369.8369680941105
},
{
"clip_ratio": 0.0,
"completion_length": 480.90625,
"epoch": 0.76,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 3.096342609553708e-09,
"grad_norm": 0.8535664081573486,
"kl": 0.20947265625,
"learning_rate": 3.3365342204799613e-06,
"loss": 0.0206,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 659456.6015625,
"step": 95,
"total_sum": 659550.1629930139
},
{
"clip_ratio": 0.0,
"completion_length": 501.0625,
"epoch": 0.768,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 4.14640545620197e-09,
"grad_norm": 0.6877720355987549,
"kl": 0.2080078125,
"learning_rate": 3.1300307396509833e-06,
"loss": 0.0083,
"reward": 3.703125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.84375,
"rewards/tag_count_reward": 0.859375,
"second_item": 0.6611328125,
"step": 96,
"total_sum": 0.6603447943925858
},
{
"clip_ratio": 0.0,
"completion_length": 457.1875,
"epoch": 0.776,
"first_item": 0.0,
"first_item_div_second_item": 5.7796523313950656e-09,
"grad_norm": 0.9136969447135925,
"kl": 0.2568359375,
"learning_rate": 2.9289321881345257e-06,
"loss": -0.0502,
"reward": 3.609375,
"reward_std": 0.34375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.78125,
"rewards/tag_count_reward": 0.828125,
"second_item": 10176.322265625,
"step": 97,
"total_sum": 10176.559444218874
},
{
"clip_ratio": 0.0,
"completion_length": 435.90625,
"epoch": 0.784,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.5218420028686523,
"kl": 0.2265625,
"learning_rate": 2.7333967796597317e-06,
"loss": -0.0079,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 768.396484375,
"step": 98,
"total_sum": 768.6198460459709
},
{
"clip_ratio": 0.0,
"completion_length": 488.8125,
"epoch": 0.792,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.8593007326126099,
"kl": 0.20556640625,
"learning_rate": 2.5435783511683444e-06,
"loss": 0.0052,
"reward": 3.890625,
"reward_std": 0.21875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.9375,
"rewards/tag_count_reward": 0.953125,
"second_item": 1.68359375,
"step": 99,
"total_sum": 1.6826132237911224
},
{
"clip_ratio": 0.0,
"completion_length": 487.03125,
"epoch": 0.8,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.15400159358978271,
"kl": 0.19921875,
"learning_rate": 2.3596262417839256e-06,
"loss": 0.0078,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 16.537109375,
"step": 100,
"total_sum": 16.578362345695496
},
{
"clip_ratio": 0.0,
"completion_length": 465.90625,
"epoch": 0.808,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.3664933443069458,
"kl": 0.23388671875,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.0081,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 415.0,
"step": 101,
"total_sum": 414.8429412841797
},
{
"clip_ratio": 0.0,
"completion_length": 496.78125,
"epoch": 0.816,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -6.535596900185869e-11,
"grad_norm": 0.6789385080337524,
"kl": 0.197265625,
"learning_rate": 2.009895146417512e-06,
"loss": 0.0079,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 28.755859375,
"step": 102,
"total_sum": 28.723628729581833
},
{
"clip_ratio": 0.0,
"completion_length": 486.125,
"epoch": 0.824,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.5424814224243164,
"kl": 0.20654296875,
"learning_rate": 1.8443913104073984e-06,
"loss": -0.0212,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 22.67578125,
"step": 103,
"total_sum": 22.712478399276733
},
{
"clip_ratio": 0.0,
"completion_length": 489.625,
"epoch": 0.832,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.13198262453079224,
"kl": 0.185546875,
"learning_rate": 1.6853038769745466e-06,
"loss": 0.0072,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 877.1875,
"step": 104,
"total_sum": 877.5139471292496
},
{
"clip_ratio": 0.0,
"completion_length": 463.0625,
"epoch": 0.84,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.5897831916809082,
"kl": 0.197265625,
"learning_rate": 1.5327580077171589e-06,
"loss": 0.0164,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 1.0,
"second_item": 14.84375,
"step": 105,
"total_sum": 14.816515803337097
},
{
"clip_ratio": 0.0,
"completion_length": 489.21875,
"epoch": 0.848,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 2.6089651584625244,
"kl": 0.2626953125,
"learning_rate": 1.3868737176759105e-06,
"loss": 0.0101,
"reward": 3.984375,
"reward_std": 0.03125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.984375,
"second_item": 1.2578125,
"step": 106,
"total_sum": 1.2589216232299805
},
{
"clip_ratio": 0.0,
"completion_length": 489.1875,
"epoch": 0.856,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1342599242925644,
"kl": 0.2333984375,
"learning_rate": 1.2477657809124632e-06,
"loss": 0.0073,
"reward": 3.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.875,
"second_item": 160.75,
"step": 107,
"total_sum": 160.8754653930664
},
{
"clip_ratio": 0.0,
"completion_length": 466.8125,
"epoch": 0.864,
"first_item": 0.0,
"first_item_div_second_item": -6.710942669269451e-09,
"grad_norm": 0.9345314502716064,
"kl": 0.5322265625,
"learning_rate": 1.1155436402112785e-06,
"loss": 0.0367,
"reward": 3.671875,
"reward_std": 0.15625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.8125,
"rewards/tag_count_reward": 0.859375,
"second_item": 368.27734375,
"step": 108,
"total_sum": 368.1199948787689
},
{
"clip_ratio": 0.0,
"completion_length": 494.875,
"epoch": 0.872,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.30068421363830566,
"kl": 0.197265625,
"learning_rate": 9.903113209758098e-07,
"loss": 0.0078,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 114180.875,
"step": 109,
"total_sum": 114187.73381710052
},
{
"clip_ratio": 0.0,
"completion_length": 492.6875,
"epoch": 0.88,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 5.960454940811157e-09,
"grad_norm": 0.4554659426212311,
"kl": 0.283203125,
"learning_rate": 8.721673493868111e-07,
"loss": 0.0068,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 0.5322265625,
"step": 110,
"total_sum": 0.5327285826206207
},
{
"clip_ratio": 0.0,
"completion_length": 469.46875,
"epoch": 0.888,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.6664077043533325,
"kl": 0.2080078125,
"learning_rate": 7.612046748871327e-07,
"loss": 0.0077,
"reward": 3.9375,
"reward_std": 0.125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.96875,
"second_item": 4569.0,
"step": 111,
"total_sum": 4570.285980224609
},
{
"clip_ratio": 0.0,
"completion_length": 452.15625,
"epoch": 0.896,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.8959035278080874e-13,
"grad_norm": 1.0423874855041504,
"kl": 0.2353515625,
"learning_rate": 6.57510597054003e-07,
"loss": 0.0296,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.953125,
"second_item": 6716.0,
"step": 112,
"total_sum": 6715.581207275391
},
{
"clip_ratio": 0.0,
"completion_length": 482.59375,
"epoch": 0.904,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -3.5061522751149548e-09,
"grad_norm": 0.7979276776313782,
"kl": 0.2001953125,
"learning_rate": 5.611666969163243e-07,
"loss": 0.0078,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 58.03125,
"step": 113,
"total_sum": 58.15105998516083
},
{
"clip_ratio": 0.0,
"completion_length": 469.90625,
"epoch": 0.912,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.1514263153076172,
"kl": 0.19140625,
"learning_rate": 4.7224877277103673e-07,
"loss": 0.0073,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 27.671875,
"step": 114,
"total_sum": 27.719636023044586
},
{
"clip_ratio": 0.0,
"completion_length": 479.65625,
"epoch": 0.92,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.14016734063625336,
"kl": 0.1875,
"learning_rate": 3.908267805490051e-07,
"loss": 0.0069,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 0.673828125,
"step": 115,
"total_sum": 0.6723696887493134
},
{
"clip_ratio": 0.0,
"completion_length": 485.0625,
"epoch": 0.928,
"first_item": 3.725290298461914e-09,
"first_item_div_second_item": 5.211327772165794e-09,
"grad_norm": 0.6825590133666992,
"kl": 0.201171875,
"learning_rate": 3.1696477877738664e-07,
"loss": 0.0078,
"reward": 3.953125,
"reward_std": 0.09375,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.984375,
"second_item": 1.001953125,
"step": 116,
"total_sum": 1.0027059614658356
},
{
"clip_ratio": 0.0,
"completion_length": 493.65625,
"epoch": 0.936,
"first_item": -7.450580596923828e-09,
"first_item_div_second_item": -9.584692015580239e-09,
"grad_norm": 0.9466831088066101,
"kl": 0.19580078125,
"learning_rate": 2.507208781817638e-07,
"loss": 0.0075,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.96875,
"second_item": 37120.1943359375,
"step": 117,
"total_sum": 37120.29603135586
},
{
"clip_ratio": 0.0,
"completion_length": 445.03125,
"epoch": 0.944,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.138331375280828e-11,
"grad_norm": 1.1685991287231445,
"kl": 0.19140625,
"learning_rate": 1.921471959676957e-07,
"loss": -0.0058,
"reward": 3.671875,
"reward_std": 0.15625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.796875,
"second_item": 39.875,
"step": 118,
"total_sum": 39.85870552062988
},
{
"clip_ratio": 0.0,
"completion_length": 492.3125,
"epoch": 0.952,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.6848937152955265e-10,
"grad_norm": 0.7177730202674866,
"kl": 0.212890625,
"learning_rate": 1.4128981481764115e-07,
"loss": 0.0081,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.96875,
"second_item": 7.388671875,
"step": 119,
"total_sum": 7.404956877231598
},
{
"clip_ratio": 0.0,
"completion_length": 496.59375,
"epoch": 0.96,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.11760753393173218,
"kl": 0.1806640625,
"learning_rate": 9.818874663554356e-08,
"loss": 0.007,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 12336.0,
"step": 120,
"total_sum": 12335.790069580078
},
{
"clip_ratio": 0.0,
"completion_length": 455.5625,
"epoch": 0.968,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -2.6056657086224828e-09,
"grad_norm": 0.907755970954895,
"kl": 0.1962890625,
"learning_rate": 6.287790106757396e-08,
"loss": 0.0235,
"reward": 3.96875,
"reward_std": 0.0625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.96875,
"second_item": 0.9609375,
"step": 121,
"total_sum": 0.9597683101892471
},
{
"clip_ratio": 0.0,
"completion_length": 456.375,
"epoch": 0.976,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -8.315380130309733e-14,
"grad_norm": 0.9736545085906982,
"kl": 0.20703125,
"learning_rate": 3.538505882380916e-08,
"loss": 0.014,
"reward": 3.9765625,
"reward_std": 0.046875,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 0.9765625,
"second_item": 22400.3203125,
"step": 122,
"total_sum": 22400.575908362865
},
{
"clip_ratio": 0.0,
"completion_length": 475.5,
"epoch": 0.984,
"first_item": -3.725290298461914e-09,
"first_item_div_second_item": -5.449559550394315e-09,
"grad_norm": 0.6621652245521545,
"kl": 0.181640625,
"learning_rate": 1.5731849821833955e-08,
"loss": 0.0144,
"reward": 3.7421875,
"reward_std": 0.015625,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.875,
"rewards/tag_count_reward": 0.8671875,
"second_item": 0.5244140625,
"step": 123,
"total_sum": 0.5251940339803696
},
{
"clip_ratio": 0.0,
"completion_length": 491.0,
"epoch": 0.992,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 0.13678430020809174,
"kl": 0.19140625,
"learning_rate": 3.933736169471347e-09,
"loss": 0.0075,
"reward": 4.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 1.0,
"rewards/tag_count_reward": 1.0,
"second_item": 2128.0,
"step": 124,
"total_sum": 2126.9109802246094
},
{
"clip_ratio": 0.0,
"completion_length": 447.5625,
"epoch": 1.0,
"first_item": 0.0,
"first_item_div_second_item": 0.0,
"grad_norm": 1.1001026630401611,
"kl": 0.23681640625,
"learning_rate": 0.0,
"loss": 0.0292,
"reward": 3.9375,
"reward_std": 0.125,
"rewards/accuracy_reward": 2.0,
"rewards/format_reward": 0.96875,
"rewards/tag_count_reward": 0.96875,
"second_item": 3.6640625,
"step": 125,
"total_sum": 3.653566360473633
},
{
"epoch": 1.0,
"step": 125,
"total_flos": 0.0,
"train_loss": 0.006549221595370909,
"train_runtime": 9806.6961,
"train_samples_per_second": 0.102,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 125,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}