| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 125, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.90625, |
| "epoch": 0.008, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.5384615384615387e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 1, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.34375, |
| "epoch": 0.016, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 3.0769230769230774e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 2, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.6875, |
| "epoch": 0.024, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 3, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.9375, |
| "epoch": 0.032, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 6.153846153846155e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 4, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.34375, |
| "epoch": 0.04, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 7.692307692307694e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 5, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.34375, |
| "epoch": 0.048, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 6, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.59375, |
| "epoch": 0.056, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 1.076923076923077e-05, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0, |
| "step": 7, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 450.8125, |
| "epoch": 0.064, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.7793766260147095, |
| "kl": 0.0, |
| "learning_rate": 1.230769230769231e-05, |
| "loss": -0.0027, |
| "reward": 2.03125, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.03125, |
| "second_item": 0.0, |
| "step": 8, |
| "total_sum": 0.0 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.09375, |
| "epoch": 0.072, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.027297493070364, |
| "kl": 0.00109100341796875, |
| "learning_rate": 1.3846153846153847e-05, |
| "loss": 0.0, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": -0.00028061866760253906, |
| "step": 9, |
| "total_sum": -0.00028082728385925293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 449.75, |
| "epoch": 0.08, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.08137404173612595, |
| "kl": 0.0069732666015625, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 0.0003, |
| "reward": 2.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0, |
| "rewards/tag_count_reward": 0.0, |
| "second_item": 0.0002465248107910156, |
| "step": 10, |
| "total_sum": 0.0002470235340297222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.125, |
| "epoch": 0.088, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 2.0901278008428459e-07, |
| "grad_norm": 1.5568859577178955, |
| "kl": 0.01507568359375, |
| "learning_rate": 1.6923076923076924e-05, |
| "loss": 0.0111, |
| "reward": 2.1796875, |
| "reward_std": 0.359375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.0625, |
| "rewards/tag_count_reward": 0.1171875, |
| "second_item": 0.00919342041015625, |
| "step": 11, |
| "total_sum": 0.009175417013466358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.46875, |
| "epoch": 0.096, |
| "first_item": 1.4901161193847656e-08, |
| "first_item_div_second_item": 1.5119791379746302e-07, |
| "grad_norm": 1.7241485118865967, |
| "kl": 0.03033447265625, |
| "learning_rate": 1.8461538461538465e-05, |
| "loss": -0.0271, |
| "reward": 2.71875, |
| "reward_std": 0.5814496129751205, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.40625, |
| "rewards/tag_count_reward": 0.3125, |
| "second_item": 0.17236328125, |
| "step": 12, |
| "total_sum": 0.1725619211792946 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.84375, |
| "epoch": 0.104, |
| "first_item": 0.0, |
| "first_item_div_second_item": -3.1597506488735924e-08, |
| "grad_norm": 1.110132098197937, |
| "kl": 0.0904541015625, |
| "learning_rate": 2e-05, |
| "loss": 0.0434, |
| "reward": 3.640625, |
| "reward_std": 0.5629279315471649, |
| "rewards/accuracy_reward": 1.9375, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.828125, |
| "second_item": 4.12060546875, |
| "step": 13, |
| "total_sum": 4.109678082168102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 288.4375, |
| "epoch": 0.112, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -4.94644289303585e-10, |
| "grad_norm": 1.3746771812438965, |
| "kl": 0.3095703125, |
| "learning_rate": 1.9996066263830533e-05, |
| "loss": -0.0027, |
| "reward": 3.8046875, |
| "reward_std": 0.390625, |
| "rewards/accuracy_reward": 1.9375, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.8984375, |
| "second_item": 7.5, |
| "step": 14, |
| "total_sum": 7.5020482540130615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 281.40625, |
| "epoch": 0.12, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 2.6501071453094482, |
| "kl": 0.50390625, |
| "learning_rate": 1.998426815017817e-05, |
| "loss": 0.0337, |
| "reward": 2.6640625, |
| "reward_std": 0.39768657833337784, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.125, |
| "rewards/tag_count_reward": 0.5390625, |
| "second_item": 0.326171875, |
| "step": 15, |
| "total_sum": 0.326133593916893 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.5625, |
| "epoch": 0.128, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 6.176646102876358e-10, |
| "grad_norm": 2.351057529449463, |
| "kl": 0.4150390625, |
| "learning_rate": 1.9964614941176194e-05, |
| "loss": 0.0875, |
| "reward": 3.65625, |
| "reward_std": 0.46278105676174164, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.75, |
| "rewards/tag_count_reward": 0.90625, |
| "second_item": 24.765625, |
| "step": 16, |
| "total_sum": 24.740907192230225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.71875, |
| "epoch": 0.136, |
| "first_item": -1.1175870895385742e-08, |
| "first_item_div_second_item": -5.519724165409711e-09, |
| "grad_norm": 2.0735116004943848, |
| "kl": 0.4453125, |
| "learning_rate": 1.9937122098932428e-05, |
| "loss": -0.0944, |
| "reward": 3.4375, |
| "reward_std": 0.47706207633018494, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.71875, |
| "rewards/tag_count_reward": 0.71875, |
| "second_item": 520.337890625, |
| "step": 17, |
| "total_sum": 519.6693426072598 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.53125, |
| "epoch": 0.144, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 2.5027475357055664, |
| "kl": 0.4169921875, |
| "learning_rate": 1.9901811253364458e-05, |
| "loss": -0.0146, |
| "reward": 3.359375, |
| "reward_std": 0.7189894616603851, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.5625, |
| "rewards/tag_count_reward": 0.796875, |
| "second_item": 49.82568359375, |
| "step": 18, |
| "total_sum": 49.88163825124502 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.65625, |
| "epoch": 0.152, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -1.003864996328273e-08, |
| "grad_norm": 0.5646486282348633, |
| "kl": 0.2529296875, |
| "learning_rate": 1.985871018518236e-05, |
| "loss": 0.0372, |
| "reward": 3.9375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 32.685546875, |
| "step": 19, |
| "total_sum": 32.599996507167816 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 376.875, |
| "epoch": 0.16, |
| "first_item": -1.1175870895385742e-08, |
| "first_item_div_second_item": -2.719362519503432e-08, |
| "grad_norm": 1.3619046211242676, |
| "kl": 0.251953125, |
| "learning_rate": 1.9807852804032306e-05, |
| "loss": 0.0708, |
| "reward": 3.625, |
| "reward_std": 0.23103028535842896, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.84375, |
| "rewards/tag_count_reward": 0.78125, |
| "second_item": 0.3994140625, |
| "step": 20, |
| "total_sum": 0.39887452125549316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.5625, |
| "epoch": 0.168, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.596679419784371e-10, |
| "grad_norm": 1.2550877332687378, |
| "kl": 0.66796875, |
| "learning_rate": 1.9749279121818235e-05, |
| "loss": 0.0587, |
| "reward": 3.828125, |
| "reward_std": 0.25129537284374237, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.90625, |
| "rewards/tag_count_reward": 0.921875, |
| "second_item": 3.654296875, |
| "step": 21, |
| "total_sum": 3.6529918909072876 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.1875, |
| "epoch": 0.176, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -9.58464183656221e-09, |
| "grad_norm": 0.3708275556564331, |
| "kl": 0.37890625, |
| "learning_rate": 1.9683035221222617e-05, |
| "loss": -0.0318, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 0.7138671875, |
| "step": 22, |
| "total_sum": 0.7121883630752563 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.78125, |
| "epoch": 0.184, |
| "first_item": 0.0, |
| "first_item_div_second_item": -2.3482853970924673e-09, |
| "grad_norm": 2.5349650382995605, |
| "kl": 0.404296875, |
| "learning_rate": 1.9609173219450998e-05, |
| "loss": -0.1219, |
| "reward": 3.28125, |
| "reward_std": 0.8468633890151978, |
| "rewards/accuracy_reward": 1.875, |
| "rewards/format_reward": 0.625, |
| "rewards/tag_count_reward": 0.78125, |
| "second_item": 0.94140625, |
| "step": 23, |
| "total_sum": 0.9402182698249817 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 374.9375, |
| "epoch": 0.192, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.2565770149230957, |
| "kl": 0.29052734375, |
| "learning_rate": 1.9527751227228964e-05, |
| "loss": -0.0221, |
| "reward": 3.875, |
| "reward_std": 0.25, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.90625, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 1.0078125, |
| "step": 24, |
| "total_sum": 1.0071582198143005 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.375, |
| "epoch": 0.2, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.1214624643325806, |
| "kl": 0.2421875, |
| "learning_rate": 1.9438833303083677e-05, |
| "loss": -0.0381, |
| "reward": 3.796875, |
| "reward_std": 0.3270031735301018, |
| "rewards/accuracy_reward": 1.9375, |
| "rewards/format_reward": 0.90625, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 0.697265625, |
| "step": 25, |
| "total_sum": 0.6968253254890442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.96875, |
| "epoch": 0.208, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -4.7211539989644865e-09, |
| "grad_norm": 0.7691929340362549, |
| "kl": 0.236328125, |
| "learning_rate": 1.9342489402945997e-05, |
| "loss": 0.0169, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.00390625, |
| "step": 26, |
| "total_sum": 3.0006871819496155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 433.25, |
| "epoch": 0.216, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.2706140875816345, |
| "kl": 0.255859375, |
| "learning_rate": 1.9238795325112867e-05, |
| "loss": 0.0097, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 0.94140625, |
| "step": 27, |
| "total_sum": 0.9400412738323212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.53125, |
| "epoch": 0.224, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -4.054737298490541e-10, |
| "grad_norm": 0.9731921553611755, |
| "kl": 0.2578125, |
| "learning_rate": 1.912783265061319e-05, |
| "loss": 0.0155, |
| "reward": 3.921875, |
| "reward_std": 0.15625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 7.875, |
| "step": 28, |
| "total_sum": 7.877335548400879 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.84375, |
| "epoch": 0.232, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1715633124113083, |
| "kl": 0.22021484375, |
| "learning_rate": 1.900968867902419e-05, |
| "loss": 0.0082, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 1841.671875, |
| "step": 29, |
| "total_sum": 1842.0478942394257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.03125, |
| "epoch": 0.24, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.2258675992488861, |
| "kl": 0.2578125, |
| "learning_rate": 1.8884456359788725e-05, |
| "loss": 0.0096, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.359375, |
| "step": 30, |
| "total_sum": 3.3617489337921143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.5, |
| "epoch": 0.248, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.25862640142440796, |
| "kl": 0.25048828125, |
| "learning_rate": 1.8752234219087538e-05, |
| "loss": 0.0099, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 2.5546875, |
| "step": 31, |
| "total_sum": 2.5553407073020935 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.90625, |
| "epoch": 0.256, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.20847512781620026, |
| "kl": 0.2421875, |
| "learning_rate": 1.8613126282324092e-05, |
| "loss": 0.0097, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 1961.7421875, |
| "step": 32, |
| "total_sum": 1962.195048570633 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.21875, |
| "epoch": 0.264, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.7570972442626953, |
| "kl": 0.24609375, |
| "learning_rate": 1.8467241992282842e-05, |
| "loss": 0.0151, |
| "reward": 3.734375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.859375, |
| "second_item": 1.5546875, |
| "step": 33, |
| "total_sum": 1.557328462600708 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.53125, |
| "epoch": 0.272, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.18539515137672424, |
| "kl": 0.23681640625, |
| "learning_rate": 1.8314696123025456e-05, |
| "loss": 0.0094, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 23.8515625, |
| "step": 34, |
| "total_sum": 23.815812468528748 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.8125, |
| "epoch": 0.28, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.19201111793518066, |
| "kl": 0.26904296875, |
| "learning_rate": 1.8155608689592604e-05, |
| "loss": 0.0103, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 4.2890625, |
| "step": 35, |
| "total_sum": 4.293882369995117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.375, |
| "epoch": 0.288, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.2121107578277588, |
| "kl": 0.2412109375, |
| "learning_rate": 1.7990104853582494e-05, |
| "loss": 0.0091, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.546875, |
| "step": 36, |
| "total_sum": 3.547041416168213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 442.3125, |
| "epoch": 0.296, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1804787516593933, |
| "kl": 0.265625, |
| "learning_rate": 1.78183148246803e-05, |
| "loss": 0.0097, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 4.0, |
| "step": 37, |
| "total_sum": 4.0047523975372314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.96875, |
| "epoch": 0.304, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6737444400787354, |
| "kl": 0.20166015625, |
| "learning_rate": 1.7640373758216075e-05, |
| "loss": 0.008, |
| "reward": 3.9609375, |
| "reward_std": 0.078125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 1274.4375, |
| "step": 38, |
| "total_sum": 1276.6473398208618 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.21875, |
| "epoch": 0.312, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.15978366136550903, |
| "kl": 0.216796875, |
| "learning_rate": 1.7456421648831658e-05, |
| "loss": 0.0085, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.328125, |
| "step": 39, |
| "total_sum": 3.32777202129364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.96875, |
| "epoch": 0.32, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1662157028913498, |
| "kl": 0.22607421875, |
| "learning_rate": 1.7266603220340273e-05, |
| "loss": 0.0089, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 5.1328125, |
| "step": 40, |
| "total_sum": 5.137336730957031 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.59375, |
| "epoch": 0.328, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.12864062190055847, |
| "kl": 0.19140625, |
| "learning_rate": 1.7071067811865477e-05, |
| "loss": 0.0077, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 469.46875, |
| "step": 41, |
| "total_sum": 469.15666449069977 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.875, |
| "epoch": 0.336, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.19264847040176392, |
| "kl": 0.2001953125, |
| "learning_rate": 1.686996926034902e-05, |
| "loss": 0.008, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.4140625, |
| "step": 42, |
| "total_sum": 3.4142632484436035 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.0, |
| "epoch": 0.344, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1367393285036087, |
| "kl": 0.189453125, |
| "learning_rate": 1.6663465779520042e-05, |
| "loss": 0.0076, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 11.0625, |
| "step": 43, |
| "total_sum": 11.049258708953857 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.125, |
| "epoch": 0.352, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.7059330940246582, |
| "kl": 0.205078125, |
| "learning_rate": 1.645171983542088e-05, |
| "loss": 0.008, |
| "reward": 3.9609375, |
| "reward_std": 0.078125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 4.78125, |
| "step": 44, |
| "total_sum": 4.784100770950317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.4375, |
| "epoch": 0.36, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.4085657596588135, |
| "kl": 0.22607421875, |
| "learning_rate": 1.6234898018587336e-05, |
| "loss": 0.0092, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 5.5625, |
| "step": 45, |
| "total_sum": 5.5558922290802 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.0, |
| "epoch": 0.368, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -1.4191576683063792e-09, |
| "grad_norm": 0.6515297889709473, |
| "kl": 0.1728515625, |
| "learning_rate": 1.601317091298406e-05, |
| "loss": 0.0069, |
| "reward": 3.984375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 2.3203125, |
| "step": 46, |
| "total_sum": 2.324346423149109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.46875, |
| "epoch": 0.376, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.619065523147583, |
| "kl": 0.1796875, |
| "learning_rate": 1.578671296179806e-05, |
| "loss": 0.0072, |
| "reward": 3.9609375, |
| "reward_std": 0.078125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 3.7265625, |
| "step": 47, |
| "total_sum": 3.726863741874695 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.4375, |
| "epoch": 0.384, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.14806345105171204, |
| "kl": 0.17919921875, |
| "learning_rate": 1.5555702330196024e-05, |
| "loss": 0.0071, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 1.927734375, |
| "step": 48, |
| "total_sum": 1.9234183728694916 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.03125, |
| "epoch": 0.392, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.8865867257118225, |
| "kl": 0.24267578125, |
| "learning_rate": 1.5320320765153367e-05, |
| "loss": 0.0082, |
| "reward": 3.9921875, |
| "reward_std": 0.015625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 4.16015625, |
| "step": 49, |
| "total_sum": 4.161918342113495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.75, |
| "epoch": 0.4, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.5075924396514893, |
| "kl": 0.26171875, |
| "learning_rate": 1.5080753452465296e-05, |
| "loss": 0.0104, |
| "reward": 3.9765625, |
| "reward_std": 0.029919598251581192, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.9765625, |
| "second_item": 7.5234375, |
| "step": 50, |
| "total_sum": 7.529886245727539 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.8125, |
| "epoch": 0.408, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6599332094192505, |
| "kl": 0.22119140625, |
| "learning_rate": 1.4837188871052399e-05, |
| "loss": 0.0086, |
| "reward": 3.453125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.71875, |
| "rewards/tag_count_reward": 0.734375, |
| "second_item": 14.84375, |
| "step": 51, |
| "total_sum": 14.823784828186035 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.0, |
| "epoch": 0.416, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6535671353340149, |
| "kl": 0.2216796875, |
| "learning_rate": 1.4589818644675378e-05, |
| "loss": 0.0089, |
| "reward": 3.9609375, |
| "reward_std": 0.078125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 295.5, |
| "step": 52, |
| "total_sum": 295.7428283691406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.625, |
| "epoch": 0.424, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.7440216541290283, |
| "kl": 0.2587890625, |
| "learning_rate": 1.4338837391175582e-05, |
| "loss": 0.0249, |
| "reward": 3.6953125, |
| "reward_std": 0.49763840436935425, |
| "rewards/accuracy_reward": 1.875, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.8828125, |
| "second_item": 5.953125, |
| "step": 53, |
| "total_sum": 5.957712173461914 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.09375, |
| "epoch": 0.432, |
| "first_item": -7.450580596923828e-09, |
| "first_item_div_second_item": -1.3303867507961851e-09, |
| "grad_norm": 1.2703138589859009, |
| "kl": 0.23193359375, |
| "learning_rate": 1.4084442569359964e-05, |
| "loss": 0.009, |
| "reward": 3.8984375, |
| "reward_std": 0.18319407105445862, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.9296875, |
| "second_item": 5.65625, |
| "step": 54, |
| "total_sum": 5.652552604675293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.1875, |
| "epoch": 0.44, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.0315853357315063, |
| "kl": 0.21875, |
| "learning_rate": 1.3826834323650899e-05, |
| "loss": 0.0085, |
| "reward": 3.6484375, |
| "reward_std": 0.13530339300632477, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.8125, |
| "rewards/tag_count_reward": 0.8359375, |
| "second_item": 188.8125, |
| "step": 55, |
| "total_sum": 188.6527681350708 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.71875, |
| "epoch": 0.448, |
| "first_item": -7.450580596923828e-09, |
| "first_item_div_second_item": -3.2356165308247496e-09, |
| "grad_norm": 1.0939035415649414, |
| "kl": 0.19970703125, |
| "learning_rate": 1.3566215326623131e-05, |
| "loss": 0.008, |
| "reward": 3.65625, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.8125, |
| "rewards/tag_count_reward": 0.84375, |
| "second_item": 7.0078125, |
| "step": 56, |
| "total_sum": 7.023218750953674 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.0625, |
| "epoch": 0.456, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.18012821674346924, |
| "kl": 0.22998046875, |
| "learning_rate": 1.3302790619551673e-05, |
| "loss": 0.0084, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 968.546875, |
| "step": 57, |
| "total_sum": 967.2251669764519 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 475.3125, |
| "epoch": 0.464, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.5792978405952454, |
| "kl": 0.20947265625, |
| "learning_rate": 1.3036767451096148e-05, |
| "loss": 0.0072, |
| "reward": 3.9921875, |
| "reward_std": 0.015625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.9921875, |
| "second_item": 2.349609375, |
| "step": 58, |
| "total_sum": 2.353084683418274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.71875, |
| "epoch": 0.472, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.21192185580730438, |
| "kl": 0.2109375, |
| "learning_rate": 1.2768355114248493e-05, |
| "loss": 0.0082, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 0.9443359375, |
| "step": 59, |
| "total_sum": 0.9458350092172623 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.21875, |
| "epoch": 0.48, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.8656077014624187e-10, |
| "grad_norm": 1.5874462127685547, |
| "kl": 0.2705078125, |
| "learning_rate": 1.249776478167227e-05, |
| "loss": 0.024, |
| "reward": 3.921875, |
| "reward_std": 0.15625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 6.939453125, |
| "step": 60, |
| "total_sum": 6.952052086591721 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.15625, |
| "epoch": 0.488, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.785263566306959e-10, |
| "grad_norm": 1.4860674142837524, |
| "kl": 0.2607421875, |
| "learning_rate": 1.2225209339563144e-05, |
| "loss": 0.0567, |
| "reward": 3.5, |
| "reward_std": 0.4321783781051636, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.71875, |
| "rewards/tag_count_reward": 0.78125, |
| "second_item": 6.982421875, |
| "step": 61, |
| "total_sum": 6.976648718118668 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.5, |
| "epoch": 0.496, |
| "first_item": -1.1175870895385742e-08, |
| "first_item_div_second_item": -1.1127608281986097e-08, |
| "grad_norm": 1.8530943393707275, |
| "kl": 0.24609375, |
| "learning_rate": 1.1950903220161286e-05, |
| "loss": 0.0063, |
| "reward": 3.0703125, |
| "reward_std": 0.9480443596839905, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.53125, |
| "rewards/tag_count_reward": 0.5390625, |
| "second_item": 97.16796875, |
| "step": 62, |
| "total_sum": 97.3824203312397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.1875, |
| "epoch": 0.504, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.9478371143341064, |
| "kl": 0.2265625, |
| "learning_rate": 1.1675062233047365e-05, |
| "loss": -0.0152, |
| "reward": 2.8984375, |
| "reward_std": 1.0290476083755493, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.4375, |
| "rewards/tag_count_reward": 0.4609375, |
| "second_item": 1.05810546875, |
| "step": 63, |
| "total_sum": 1.0574220344424248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 419.28125, |
| "epoch": 0.512, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 9.489272508979978e-09, |
| "grad_norm": 1.7507895231246948, |
| "kl": 0.28515625, |
| "learning_rate": 1.1397903395354996e-05, |
| "loss": 0.1077, |
| "reward": 2.859375, |
| "reward_std": 0.9210017323493958, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.40625, |
| "rewards/tag_count_reward": 0.453125, |
| "second_item": 0.5751953125, |
| "step": 64, |
| "total_sum": 0.5762820988893509 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.3125, |
| "epoch": 0.52, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.414428949356079, |
| "kl": 0.2158203125, |
| "learning_rate": 1.1119644761033079e-05, |
| "loss": -0.0004, |
| "reward": 3.1875, |
| "reward_std": 0.672369509935379, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.5625, |
| "rewards/tag_count_reward": 0.625, |
| "second_item": 8.25, |
| "step": 65, |
| "total_sum": 8.253949701786041 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.46875, |
| "epoch": 0.528, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.327781857508907e-09, |
| "grad_norm": 1.1460624933242798, |
| "kl": 0.203125, |
| "learning_rate": 1.0840505249292477e-05, |
| "loss": 0.0146, |
| "reward": 3.59375, |
| "reward_std": 0.3125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.8125, |
| "rewards/tag_count_reward": 0.78125, |
| "second_item": 0.5107421875, |
| "step": 66, |
| "total_sum": 0.5107531398534775 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.28125, |
| "epoch": 0.536, |
| "first_item": 1.862645149230957e-09, |
| "first_item_div_second_item": 3.386626928305935e-10, |
| "grad_norm": 1.0261918306350708, |
| "kl": 0.24609375, |
| "learning_rate": 1.0560704472371919e-05, |
| "loss": 0.0097, |
| "reward": 3.6640625, |
| "reward_std": 0.32916322350502014, |
| "rewards/accuracy_reward": 1.9375, |
| "rewards/format_reward": 0.8125, |
| "rewards/tag_count_reward": 0.9140625, |
| "second_item": 3.064453125, |
| "step": 67, |
| "total_sum": 3.065736472606659 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.71875, |
| "epoch": 0.544, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 1.91500768306081e-09, |
| "grad_norm": 0.9414896965026855, |
| "kl": 0.3193359375, |
| "learning_rate": 1.028046256275869e-05, |
| "loss": -0.0681, |
| "reward": 3.8203125, |
| "reward_std": 0.359375, |
| "rewards/accuracy_reward": 1.9375, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.9453125, |
| "second_item": 1.119140625, |
| "step": 68, |
| "total_sum": 1.1211046129465103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.90625, |
| "epoch": 0.552, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.47802987694740295, |
| "kl": 0.2197265625, |
| "learning_rate": 1e-05, |
| "loss": 0.0065, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 198.6875, |
| "step": 69, |
| "total_sum": 198.5185830593109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.125, |
| "epoch": 0.56, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.9080296754837036, |
| "kl": 0.1943359375, |
| "learning_rate": 9.719537437241311e-06, |
| "loss": 0.0241, |
| "reward": 3.7109375, |
| "reward_std": 0.140625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.84375, |
| "rewards/tag_count_reward": 0.8671875, |
| "second_item": 0.9921875, |
| "step": 70, |
| "total_sum": 0.9923984408378601 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.40625, |
| "epoch": 0.568, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.3803475797176361, |
| "kl": 0.232421875, |
| "learning_rate": 9.439295527628083e-06, |
| "loss": 0.007, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 2.4609375, |
| "step": 71, |
| "total_sum": 2.456995338201523 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.46875, |
| "epoch": 0.576, |
| "first_item": -7.450580596923828e-09, |
| "first_item_div_second_item": -1.4839002149245586e-08, |
| "grad_norm": 0.6860982179641724, |
| "kl": 0.17626953125, |
| "learning_rate": 9.159494750707527e-06, |
| "loss": 0.0202, |
| "reward": 3.90625, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 0.5029296875, |
| "step": 72, |
| "total_sum": 0.5038877725601196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.375, |
| "epoch": 0.584, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -3.24378736733566e-09, |
| "grad_norm": 0.4427696764469147, |
| "kl": 0.2021484375, |
| "learning_rate": 8.880355238966923e-06, |
| "loss": 0.0064, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 1.1640625, |
| "step": 73, |
| "total_sum": 1.1623322367668152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.96875, |
| "epoch": 0.592, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -1.1413266842281213e-12, |
| "grad_norm": 0.7066324353218079, |
| "kl": 0.193359375, |
| "learning_rate": 8.602096604645009e-06, |
| "loss": -0.0353, |
| "reward": 3.875, |
| "reward_std": 0.25, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.9375, |
| "second_item": 1640.25, |
| "step": 74, |
| "total_sum": 1641.355055809021 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.40625, |
| "epoch": 0.6, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.8320920467376709, |
| "kl": 0.1767578125, |
| "learning_rate": 8.324937766952638e-06, |
| "loss": -0.0102, |
| "reward": 3.8984375, |
| "reward_std": 0.203125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.9609375, |
| "second_item": 1.6796875, |
| "step": 75, |
| "total_sum": 1.6792444586753845 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.9375, |
| "epoch": 0.608, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6126235127449036, |
| "kl": 0.185546875, |
| "learning_rate": 8.04909677983872e-06, |
| "loss": 0.0068, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.89453125, |
| "step": 76, |
| "total_sum": 3.888884425163269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.8125, |
| "epoch": 0.616, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.16169646382331848, |
| "kl": 0.19287109375, |
| "learning_rate": 7.774790660436857e-06, |
| "loss": 0.0065, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 1.677734375, |
| "step": 77, |
| "total_sum": 1.6808350086212158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.0, |
| "epoch": 0.624, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.162797212600708, |
| "kl": 0.15478515625, |
| "learning_rate": 7.50223521832773e-06, |
| "loss": 0.0062, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 2.986328125, |
| "step": 78, |
| "total_sum": 2.9852358400821686 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.125, |
| "epoch": 0.632, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.3044331669807434, |
| "kl": 0.17431640625, |
| "learning_rate": 7.2316448857515076e-06, |
| "loss": 0.007, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 58.4609375, |
| "step": 79, |
| "total_sum": 58.46868443489075 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.25, |
| "epoch": 0.64, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.14911580085754395, |
| "kl": 0.162109375, |
| "learning_rate": 6.963232548903853e-06, |
| "loss": 0.0065, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 2.8671875, |
| "step": 80, |
| "total_sum": 2.866468667984009 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.125, |
| "epoch": 0.648, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1717568188905716, |
| "kl": 0.189453125, |
| "learning_rate": 6.697209380448333e-06, |
| "loss": 0.0072, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 2.794921875, |
| "step": 81, |
| "total_sum": 2.800520181655884 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.0, |
| "epoch": 0.656, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.5472955703735352, |
| "kl": 0.158203125, |
| "learning_rate": 6.43378467337687e-06, |
| "loss": 0.0063, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 2.62109375, |
| "step": 82, |
| "total_sum": 2.6212058067321777 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.875, |
| "epoch": 0.664, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1360127478837967, |
| "kl": 0.1875, |
| "learning_rate": 6.173165676349103e-06, |
| "loss": 0.0073, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 0.833984375, |
| "step": 83, |
| "total_sum": 0.8321126252412796 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.21875, |
| "epoch": 0.672, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.11324969679117203, |
| "kl": 0.1767578125, |
| "learning_rate": 5.91555743064004e-06, |
| "loss": 0.0069, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 197.015625, |
| "step": 84, |
| "total_sum": 196.66522407531738 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.5, |
| "epoch": 0.68, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6697484254837036, |
| "kl": 0.2392578125, |
| "learning_rate": 5.66116260882442e-06, |
| "loss": 0.0087, |
| "reward": 3.703125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.84375, |
| "rewards/tag_count_reward": 0.859375, |
| "second_item": 5.73828125, |
| "step": 85, |
| "total_sum": 5.750060975551605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.5625, |
| "epoch": 0.688, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.449559550394315e-09, |
| "grad_norm": 0.5044450163841248, |
| "kl": 0.19677734375, |
| "learning_rate": 5.410181355324622e-06, |
| "loss": 0.0078, |
| "reward": 3.9375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 7.623046875, |
| "step": 86, |
| "total_sum": 7.606956303119659 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.25, |
| "epoch": 0.696, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.6098404722769945e-09, |
| "grad_norm": 0.9169361591339111, |
| "kl": 0.248046875, |
| "learning_rate": 5.1628111289476025e-06, |
| "loss": 0.0075, |
| "reward": 3.90625, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 1.58984375, |
| "step": 87, |
| "total_sum": 1.5913574695587158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.90625, |
| "epoch": 0.704, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.22143904864788055, |
| "kl": 0.19091796875, |
| "learning_rate": 4.919246547534709e-06, |
| "loss": 0.0074, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 3.71875, |
| "step": 88, |
| "total_sum": 3.719162940979004 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.4375, |
| "epoch": 0.712, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 3.4304799935373085e-09, |
| "grad_norm": 0.9220072031021118, |
| "kl": 0.20654296875, |
| "learning_rate": 4.679679234846636e-06, |
| "loss": 0.0081, |
| "reward": 3.890625, |
| "reward_std": 0.21875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 0.890625, |
| "step": 89, |
| "total_sum": 0.8904827535152435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.96875, |
| "epoch": 0.72, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.8823760747909546, |
| "kl": 0.19677734375, |
| "learning_rate": 4.444297669803981e-06, |
| "loss": 0.0077, |
| "reward": 3.90625, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 2.328125, |
| "step": 90, |
| "total_sum": 2.3348000049591064 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.09375, |
| "epoch": 0.728, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.256110887982257e-13, |
| "grad_norm": 0.6629019975662231, |
| "kl": 0.20947265625, |
| "learning_rate": 4.213287038201943e-06, |
| "loss": 0.0084, |
| "reward": 3.9375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 8860.0, |
| "step": 91, |
| "total_sum": 8861.269836425781 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.03125, |
| "epoch": 0.736, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.7865133881568909, |
| "kl": 0.2041015625, |
| "learning_rate": 3.986829087015941e-06, |
| "loss": -0.0353, |
| "reward": 3.890625, |
| "reward_std": 0.21875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 582.5, |
| "step": 92, |
| "total_sum": 580.777759552002 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.84375, |
| "epoch": 0.744, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.15393787622451782, |
| "kl": 0.21435546875, |
| "learning_rate": 3.7651019814126656e-06, |
| "loss": 0.0078, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 7328.33984375, |
| "step": 93, |
| "total_sum": 7324.671706229448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 484.90625, |
| "epoch": 0.752, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 4.219793080262041e-09, |
| "grad_norm": 0.9448766708374023, |
| "kl": 0.208984375, |
| "learning_rate": 3.5482801645791266e-06, |
| "loss": 0.0147, |
| "reward": 3.90625, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 2368.44140625, |
| "step": 94, |
| "total_sum": 2369.8369680941105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.90625, |
| "epoch": 0.76, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 3.096342609553708e-09, |
| "grad_norm": 0.8535664081573486, |
| "kl": 0.20947265625, |
| "learning_rate": 3.3365342204799613e-06, |
| "loss": 0.0206, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 659456.6015625, |
| "step": 95, |
| "total_sum": 659550.1629930139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.0625, |
| "epoch": 0.768, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 4.14640545620197e-09, |
| "grad_norm": 0.6877720355987549, |
| "kl": 0.2080078125, |
| "learning_rate": 3.1300307396509833e-06, |
| "loss": 0.0083, |
| "reward": 3.703125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.84375, |
| "rewards/tag_count_reward": 0.859375, |
| "second_item": 0.6611328125, |
| "step": 96, |
| "total_sum": 0.6603447943925858 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 457.1875, |
| "epoch": 0.776, |
| "first_item": 0.0, |
| "first_item_div_second_item": 5.7796523313950656e-09, |
| "grad_norm": 0.9136969447135925, |
| "kl": 0.2568359375, |
| "learning_rate": 2.9289321881345257e-06, |
| "loss": -0.0502, |
| "reward": 3.609375, |
| "reward_std": 0.34375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.78125, |
| "rewards/tag_count_reward": 0.828125, |
| "second_item": 10176.322265625, |
| "step": 97, |
| "total_sum": 10176.559444218874 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.90625, |
| "epoch": 0.784, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.5218420028686523, |
| "kl": 0.2265625, |
| "learning_rate": 2.7333967796597317e-06, |
| "loss": -0.0079, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 768.396484375, |
| "step": 98, |
| "total_sum": 768.6198460459709 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.8125, |
| "epoch": 0.792, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.8593007326126099, |
| "kl": 0.20556640625, |
| "learning_rate": 2.5435783511683444e-06, |
| "loss": 0.0052, |
| "reward": 3.890625, |
| "reward_std": 0.21875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 1.68359375, |
| "step": 99, |
| "total_sum": 1.6826132237911224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.03125, |
| "epoch": 0.8, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.15400159358978271, |
| "kl": 0.19921875, |
| "learning_rate": 2.3596262417839256e-06, |
| "loss": 0.0078, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 16.537109375, |
| "step": 100, |
| "total_sum": 16.578362345695496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.90625, |
| "epoch": 0.808, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.3664933443069458, |
| "kl": 0.23388671875, |
| "learning_rate": 2.1816851753197023e-06, |
| "loss": 0.0081, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 415.0, |
| "step": 101, |
| "total_sum": 414.8429412841797 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.78125, |
| "epoch": 0.816, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -6.535596900185869e-11, |
| "grad_norm": 0.6789385080337524, |
| "kl": 0.197265625, |
| "learning_rate": 2.009895146417512e-06, |
| "loss": 0.0079, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 28.755859375, |
| "step": 102, |
| "total_sum": 28.723628729581833 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.125, |
| "epoch": 0.824, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.5424814224243164, |
| "kl": 0.20654296875, |
| "learning_rate": 1.8443913104073984e-06, |
| "loss": -0.0212, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 22.67578125, |
| "step": 103, |
| "total_sum": 22.712478399276733 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.625, |
| "epoch": 0.832, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.13198262453079224, |
| "kl": 0.185546875, |
| "learning_rate": 1.6853038769745466e-06, |
| "loss": 0.0072, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 877.1875, |
| "step": 104, |
| "total_sum": 877.5139471292496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.0625, |
| "epoch": 0.84, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.5897831916809082, |
| "kl": 0.197265625, |
| "learning_rate": 1.5327580077171589e-06, |
| "loss": 0.0164, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 14.84375, |
| "step": 105, |
| "total_sum": 14.816515803337097 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.21875, |
| "epoch": 0.848, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 2.6089651584625244, |
| "kl": 0.2626953125, |
| "learning_rate": 1.3868737176759105e-06, |
| "loss": 0.0101, |
| "reward": 3.984375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 1.2578125, |
| "step": 106, |
| "total_sum": 1.2589216232299805 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.1875, |
| "epoch": 0.856, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1342599242925644, |
| "kl": 0.2333984375, |
| "learning_rate": 1.2477657809124632e-06, |
| "loss": 0.0073, |
| "reward": 3.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.875, |
| "second_item": 160.75, |
| "step": 107, |
| "total_sum": 160.8754653930664 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.8125, |
| "epoch": 0.864, |
| "first_item": 0.0, |
| "first_item_div_second_item": -6.710942669269451e-09, |
| "grad_norm": 0.9345314502716064, |
| "kl": 0.5322265625, |
| "learning_rate": 1.1155436402112785e-06, |
| "loss": 0.0367, |
| "reward": 3.671875, |
| "reward_std": 0.15625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.8125, |
| "rewards/tag_count_reward": 0.859375, |
| "second_item": 368.27734375, |
| "step": 108, |
| "total_sum": 368.1199948787689 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.875, |
| "epoch": 0.872, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.30068421363830566, |
| "kl": 0.197265625, |
| "learning_rate": 9.903113209758098e-07, |
| "loss": 0.0078, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 114180.875, |
| "step": 109, |
| "total_sum": 114187.73381710052 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.6875, |
| "epoch": 0.88, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 5.960454940811157e-09, |
| "grad_norm": 0.4554659426212311, |
| "kl": 0.283203125, |
| "learning_rate": 8.721673493868111e-07, |
| "loss": 0.0068, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 0.5322265625, |
| "step": 110, |
| "total_sum": 0.5327285826206207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.46875, |
| "epoch": 0.888, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.6664077043533325, |
| "kl": 0.2080078125, |
| "learning_rate": 7.612046748871327e-07, |
| "loss": 0.0077, |
| "reward": 3.9375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 4569.0, |
| "step": 111, |
| "total_sum": 4570.285980224609 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.15625, |
| "epoch": 0.896, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.8959035278080874e-13, |
| "grad_norm": 1.0423874855041504, |
| "kl": 0.2353515625, |
| "learning_rate": 6.57510597054003e-07, |
| "loss": 0.0296, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.953125, |
| "second_item": 6716.0, |
| "step": 112, |
| "total_sum": 6715.581207275391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.59375, |
| "epoch": 0.904, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -3.5061522751149548e-09, |
| "grad_norm": 0.7979276776313782, |
| "kl": 0.2001953125, |
| "learning_rate": 5.611666969163243e-07, |
| "loss": 0.0078, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 58.03125, |
| "step": 113, |
| "total_sum": 58.15105998516083 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.90625, |
| "epoch": 0.912, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.1514263153076172, |
| "kl": 0.19140625, |
| "learning_rate": 4.7224877277103673e-07, |
| "loss": 0.0073, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 27.671875, |
| "step": 114, |
| "total_sum": 27.719636023044586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 479.65625, |
| "epoch": 0.92, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.14016734063625336, |
| "kl": 0.1875, |
| "learning_rate": 3.908267805490051e-07, |
| "loss": 0.0069, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 0.673828125, |
| "step": 115, |
| "total_sum": 0.6723696887493134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.0625, |
| "epoch": 0.928, |
| "first_item": 3.725290298461914e-09, |
| "first_item_div_second_item": 5.211327772165794e-09, |
| "grad_norm": 0.6825590133666992, |
| "kl": 0.201171875, |
| "learning_rate": 3.1696477877738664e-07, |
| "loss": 0.0078, |
| "reward": 3.953125, |
| "reward_std": 0.09375, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.984375, |
| "second_item": 1.001953125, |
| "step": 116, |
| "total_sum": 1.0027059614658356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.65625, |
| "epoch": 0.936, |
| "first_item": -7.450580596923828e-09, |
| "first_item_div_second_item": -9.584692015580239e-09, |
| "grad_norm": 0.9466831088066101, |
| "kl": 0.19580078125, |
| "learning_rate": 2.507208781817638e-07, |
| "loss": 0.0075, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 37120.1943359375, |
| "step": 117, |
| "total_sum": 37120.29603135586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.03125, |
| "epoch": 0.944, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.138331375280828e-11, |
| "grad_norm": 1.1685991287231445, |
| "kl": 0.19140625, |
| "learning_rate": 1.921471959676957e-07, |
| "loss": -0.0058, |
| "reward": 3.671875, |
| "reward_std": 0.15625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.796875, |
| "second_item": 39.875, |
| "step": 118, |
| "total_sum": 39.85870552062988 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.3125, |
| "epoch": 0.952, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.6848937152955265e-10, |
| "grad_norm": 0.7177730202674866, |
| "kl": 0.212890625, |
| "learning_rate": 1.4128981481764115e-07, |
| "loss": 0.0081, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 7.388671875, |
| "step": 119, |
| "total_sum": 7.404956877231598 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.59375, |
| "epoch": 0.96, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.11760753393173218, |
| "kl": 0.1806640625, |
| "learning_rate": 9.818874663554356e-08, |
| "loss": 0.007, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 12336.0, |
| "step": 120, |
| "total_sum": 12335.790069580078 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.5625, |
| "epoch": 0.968, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -2.6056657086224828e-09, |
| "grad_norm": 0.907755970954895, |
| "kl": 0.1962890625, |
| "learning_rate": 6.287790106757396e-08, |
| "loss": 0.0235, |
| "reward": 3.96875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 0.9609375, |
| "step": 121, |
| "total_sum": 0.9597683101892471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.375, |
| "epoch": 0.976, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -8.315380130309733e-14, |
| "grad_norm": 0.9736545085906982, |
| "kl": 0.20703125, |
| "learning_rate": 3.538505882380916e-08, |
| "loss": 0.014, |
| "reward": 3.9765625, |
| "reward_std": 0.046875, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 0.9765625, |
| "second_item": 22400.3203125, |
| "step": 122, |
| "total_sum": 22400.575908362865 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 475.5, |
| "epoch": 0.984, |
| "first_item": -3.725290298461914e-09, |
| "first_item_div_second_item": -5.449559550394315e-09, |
| "grad_norm": 0.6621652245521545, |
| "kl": 0.181640625, |
| "learning_rate": 1.5731849821833955e-08, |
| "loss": 0.0144, |
| "reward": 3.7421875, |
| "reward_std": 0.015625, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.875, |
| "rewards/tag_count_reward": 0.8671875, |
| "second_item": 0.5244140625, |
| "step": 123, |
| "total_sum": 0.5251940339803696 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.0, |
| "epoch": 0.992, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 0.13678430020809174, |
| "kl": 0.19140625, |
| "learning_rate": 3.933736169471347e-09, |
| "loss": 0.0075, |
| "reward": 4.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 1.0, |
| "rewards/tag_count_reward": 1.0, |
| "second_item": 2128.0, |
| "step": 124, |
| "total_sum": 2126.9109802246094 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.5625, |
| "epoch": 1.0, |
| "first_item": 0.0, |
| "first_item_div_second_item": 0.0, |
| "grad_norm": 1.1001026630401611, |
| "kl": 0.23681640625, |
| "learning_rate": 0.0, |
| "loss": 0.0292, |
| "reward": 3.9375, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 2.0, |
| "rewards/format_reward": 0.96875, |
| "rewards/tag_count_reward": 0.96875, |
| "second_item": 3.6640625, |
| "step": 125, |
| "total_sum": 3.653566360473633 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 125, |
| "total_flos": 0.0, |
| "train_loss": 0.006549221595370909, |
| "train_runtime": 9806.6961, |
| "train_samples_per_second": 0.102, |
| "train_steps_per_second": 0.013 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 125, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|