{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999146539216524, "eval_steps": 500, "global_step": 2929, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 820.84765625, "completions/mean_terminated_length": 803.837646484375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.0003413843133907997, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5386123465695015, "kl": 0.0003418922424316406, "learning_rate": 0.0, "loss": -0.013, "num_tokens": 504514.0, "reward": 0.96484375, "reward_std": 0.5946651697158813, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.4670529365539551, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.4549144506454468, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 879.552734375, "completions/mean_terminated_length": 870.3523559570312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.0006827686267815994, "frac_reward_zero_std": 0.15625, "grad_norm": 0.6597088379715952, "kl": 0.0003294944763183594, "learning_rate": 6.825938566552902e-08, "loss": -0.0149, "num_tokens": 1033709.0, "reward": 0.55078125, "reward_std": 0.5261605978012085, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.17578125, "rewards/format_reward/std": 0.3810062110424042, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.4405558109283447, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 830.02734375, "completions/mean_terminated_length": 827.6438598632812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.001024152940172399, "frac_reward_zero_std": 0.0, "grad_norm": 0.5653287725508552, "kl": 0.00035953521728515625, "learning_rate": 1.3651877133105803e-07, "loss": 0.0046, "num_tokens": 1546187.0, "reward": 0.90087890625, "reward_std": 0.6402367353439331, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.291015625, "rewards/format_reward/std": 0.45467492938041687, "rewards/tag_count_reward/mean": 0.54931640625, "rewards/tag_count_reward/std": 0.4474700689315796, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 878.095703125, "completions/mean_terminated_length": 864.2233276367188, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.0013655372535631989, "frac_reward_zero_std": 0.09375, "grad_norm": 0.6340820161136902, "kl": 0.0003681182861328125, "learning_rate": 2.0477815699658704e-07, "loss": 0.0213, "num_tokens": 2081804.0, "reward": 0.783203125, "reward_std": 0.6095758080482483, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.23046875, "rewards/format_reward/std": 0.42154473066329956, "rewards/tag_count_reward/mean": 0.470703125, "rewards/tag_count_reward/std": 0.4499087631702423, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 843.2421875, "completions/mean_terminated_length": 838.5177001953125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0017069215669539984, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6090561995463253, "kl": 0.0003261566162109375, "learning_rate": 2.7303754266211607e-07, "loss": 0.0058, "num_tokens": 2600680.0, "reward": 0.98583984375, "reward_std": 0.6458786725997925, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.4670529365539551, "rewards/tag_count_reward/mean": 0.56396484375, "rewards/tag_count_reward/std": 0.4507932960987091, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 754.77734375, "completions/mean_terminated_length": 749.7059326171875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.002048305880344798, "frac_reward_zero_std": 0.03125, "grad_norm": 0.665219722555773, "kl": 0.00040435791015625, "learning_rate": 3.412969283276451e-07, "loss": 0.0076, "num_tokens": 3082182.0, "reward": 1.1396484375, "reward_std": 0.6443605422973633, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.439453125, "rewards/format_reward/std": 0.49680593609809875, "rewards/tag_count_reward/mean": 0.6259765625, "rewards/tag_count_reward/std": 0.4398774802684784, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 852.431640625, "completions/mean_terminated_length": 847.7431640625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.0023896901937355977, "frac_reward_zero_std": 0.03125, "grad_norm": 0.619651959888123, "kl": 0.000385284423828125, "learning_rate": 4.0955631399317407e-07, "loss": -0.0157, "num_tokens": 3596643.0, "reward": 0.90087890625, "reward_std": 0.6476004123687744, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.2890625, "rewards/format_reward/std": 0.45377036929130554, "rewards/tag_count_reward/mean": 0.51611328125, "rewards/tag_count_reward/std": 0.4587794840335846, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 825.439453125, "completions/mean_terminated_length": 825.439453125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.0027310745071263977, "frac_reward_zero_std": 0.03125, "grad_norm": 0.6552995456858007, "kl": 0.0004863739013671875, "learning_rate": 4.778156996587031e-07, "loss": 0.0157, "num_tokens": 4101604.0, "reward": 0.79638671875, "reward_std": 0.621026873588562, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.216796875, "rewards/format_reward/std": 0.4124660789966583, "rewards/tag_count_reward/mean": 0.45849609375, "rewards/tag_count_reward/std": 0.456911563873291, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 774.205078125, "completions/mean_terminated_length": 774.205078125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.0030724588205171973, "frac_reward_zero_std": 0.0, "grad_norm": 0.5615118222551848, "kl": 0.0005502700805664062, "learning_rate": 5.460750853242321e-07, "loss": -0.012, "num_tokens": 4577421.0, "reward": 1.091796875, "reward_std": 0.6412665843963623, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.39453125, "rewards/format_reward/std": 0.4892277717590332, "rewards/tag_count_reward/mean": 0.603515625, "rewards/tag_count_reward/std": 0.44653189182281494, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 840.119140625, "completions/mean_terminated_length": 835.3823852539062, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.003413843133907997, "frac_reward_zero_std": 0.0, "grad_norm": 0.46783707291497556, "kl": 0.00124359130859375, "learning_rate": 6.143344709897612e-07, "loss": 0.0163, "num_tokens": 5085130.0, "reward": 1.3984375, "reward_std": 0.6860847473144531, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.560546875, "rewards/format_reward/std": 0.49680593609809875, "rewards/tag_count_reward/mean": 0.787109375, "rewards/tag_count_reward/std": 0.3384808897972107, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 846.595703125, "completions/mean_terminated_length": 844.24462890625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.003755227447298797, "frac_reward_zero_std": 0.03125, "grad_norm": 0.33444788015740484, "kl": 0.0016937255859375, "learning_rate": 6.825938566552902e-07, "loss": -0.0046, "num_tokens": 5599803.0, "reward": 1.4619140625, "reward_std": 0.6533540487289429, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.572265625, "rewards/format_reward/std": 0.4952339828014374, "rewards/tag_count_reward/mean": 0.8134765625, "rewards/tag_count_reward/std": 0.3133900463581085, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 831.51953125, "completions/mean_terminated_length": 826.7490844726562, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.004096611760689596, "frac_reward_zero_std": 0.0, "grad_norm": 0.4519456045319824, "kl": 0.00231170654296875, "learning_rate": 7.508532423208192e-07, "loss": 0.0272, "num_tokens": 6102549.0, "reward": 1.3984375, "reward_std": 0.6598341464996338, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.552734375, "rewards/format_reward/std": 0.4976975917816162, "rewards/tag_count_reward/mean": 0.787109375, "rewards/tag_count_reward/std": 0.35194432735443115, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 689.5390625, "completions/mean_terminated_length": 689.5390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.004437996074080396, "frac_reward_zero_std": 0.03125, "grad_norm": 0.36242418534611626, "kl": 0.003021240234375, "learning_rate": 8.191126279863481e-07, "loss": 0.022, "num_tokens": 6537033.0, "reward": 1.75, "reward_std": 0.5517961978912354, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42402184009552, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16699250042438507, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 789.5703125, "completions/mean_terminated_length": 789.5703125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.0047793803874711955, "frac_reward_zero_std": 0.03125, "grad_norm": 0.312958751816161, "kl": 0.00263214111328125, "learning_rate": 8.873720136518772e-07, "loss": 0.0136, "num_tokens": 7024189.0, "reward": 1.76318359375, "reward_std": 0.5157672166824341, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.794921875, "rewards/format_reward/std": 0.4041535556316376, "rewards/tag_count_reward/mean": 0.92724609375, "rewards/tag_count_reward/std": 0.16538763046264648, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 806.23828125, "completions/mean_terminated_length": 789.0257568359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.005120764700861995, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2728655650595758, "kl": 0.00342559814453125, "learning_rate": 9.556313993174062e-07, "loss": 0.0274, "num_tokens": 7519767.0, "reward": 1.80908203125, "reward_std": 0.48705413937568665, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.810546875, "rewards/format_reward/std": 0.3922513723373413, "rewards/tag_count_reward/mean": 0.93212890625, "rewards/tag_count_reward/std": 0.15844757854938507, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 750.650390625, "completions/mean_terminated_length": 750.650390625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.0054621490142527955, "frac_reward_zero_std": 0.0625, "grad_norm": 0.29233778309185243, "kl": 0.003498077392578125, "learning_rate": 1.0238907849829352e-06, "loss": 0.0023, "num_tokens": 7984900.0, "reward": 1.849609375, "reward_std": 0.49978017807006836, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.837890625, "rewards/format_reward/std": 0.3689115643501282, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14141270518302917, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 749.4296875, "completions/mean_terminated_length": 746.888427734375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.005803533327643595, "frac_reward_zero_std": 0.15625, "grad_norm": 0.2747894834993212, "kl": 0.004058837890625, "learning_rate": 1.0921501706484643e-06, "loss": 0.0081, "num_tokens": 8449968.0, "reward": 1.93408203125, "reward_std": 0.3827357292175293, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.29713961482048035, "rewards/tag_count_reward/mean": 0.95751953125, "rewards/tag_count_reward/std": 0.14172235131263733, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 725.5703125, "completions/mean_terminated_length": 725.5703125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.006144917641034395, "frac_reward_zero_std": 0.34375, "grad_norm": 0.26757421802014325, "kl": 0.0064849853515625, "learning_rate": 1.1604095563139933e-06, "loss": 0.0107, "num_tokens": 8910052.0, "reward": 1.9697265625, "reward_std": 0.22774367034435272, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.07015689462423325, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 784.65234375, "completions/mean_terminated_length": 784.65234375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.006486301954425194, "frac_reward_zero_std": 0.375, "grad_norm": 0.2145392633389538, "kl": 0.0089569091796875, "learning_rate": 1.2286689419795223e-06, "loss": 0.0052, "num_tokens": 9394770.0, "reward": 2.0048828125, "reward_std": 0.1971101015806198, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.061630137264728546, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 678.298828125, "completions/mean_terminated_length": 678.298828125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.006827686267815994, "frac_reward_zero_std": 0.375, "grad_norm": 0.26309752000387737, "kl": 0.009735107421875, "learning_rate": 1.2969283276450511e-06, "loss": 0.0213, "num_tokens": 9824907.0, "reward": 1.984375, "reward_std": 0.17482058703899384, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.06678669899702072, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 713.42578125, "completions/mean_terminated_length": 713.42578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.007169070581206793, "frac_reward_zero_std": 0.40625, "grad_norm": 0.24042019398328887, "kl": 0.010223388671875, "learning_rate": 1.3651877133105804e-06, "loss": 0.0117, "num_tokens": 10269173.0, "reward": 1.9990234375, "reward_std": 0.18747201561927795, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.08145132660865784, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 712.986328125, "completions/mean_terminated_length": 705.117919921875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.007510454894597594, "frac_reward_zero_std": 0.375, "grad_norm": 0.21445290967881397, "kl": 0.0095672607421875, "learning_rate": 1.4334470989761092e-06, "loss": 0.05, "num_tokens": 10722766.0, "reward": 2.013671875, "reward_std": 0.22162634134292603, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.10658079385757446, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 727.134765625, "completions/mean_terminated_length": 724.5499267578125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.007851839207988393, "frac_reward_zero_std": 0.28125, "grad_norm": 0.30206462169054304, "kl": 0.0109100341796875, "learning_rate": 1.5017064846416384e-06, "loss": 0.0261, "num_tokens": 11172339.0, "reward": 1.96435546875, "reward_std": 0.24655835330486298, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.97216796875, "rewards/tag_count_reward/std": 0.12248162925243378, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 659.625, "completions/mean_terminated_length": 659.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.008193223521379193, "frac_reward_zero_std": 0.46875, "grad_norm": 0.29061370269868153, "kl": 0.0100250244140625, "learning_rate": 1.5699658703071675e-06, "loss": 0.0118, "num_tokens": 11595315.0, "reward": 1.97314453125, "reward_std": 0.16807854175567627, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.09929264336824417, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 765.0703125, "completions/mean_terminated_length": 760.0392456054688, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.008534607834769992, "frac_reward_zero_std": 0.5, "grad_norm": 0.22452790659116365, "kl": 0.0085601806640625, "learning_rate": 1.6382252559726963e-06, "loss": 0.0234, "num_tokens": 12069767.0, "reward": 2.015625, "reward_std": 0.14892420172691345, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.0727052316069603, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 641.421875, "completions/mean_terminated_length": 635.9059448242188, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.008875992148160792, "frac_reward_zero_std": 0.46875, "grad_norm": 1.1022754621945696, "kl": 0.0167236328125, "learning_rate": 1.7064846416382255e-06, "loss": 0.0233, "num_tokens": 12482335.0, "reward": 2.048828125, "reward_std": 0.1961648017168045, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 649.701171875, "completions/mean_terminated_length": 649.701171875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.009217376461551591, "frac_reward_zero_std": 0.625, "grad_norm": 0.18614904028242898, "kl": 0.00786590576171875, "learning_rate": 1.7747440273037543e-06, "loss": 0.0075, "num_tokens": 12889894.0, "reward": 2.00634765625, "reward_std": 0.12228904664516449, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 727.623046875, "completions/mean_terminated_length": 727.623046875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.009558760774942391, "frac_reward_zero_std": 0.78125, "grad_norm": 0.1373132023432314, "kl": 0.00531768798828125, "learning_rate": 1.8430034129692834e-06, "loss": 0.0005, "num_tokens": 13357157.0, "reward": 2.02197265625, "reward_std": 0.07914142310619354, "rewards/accuracy_reward/mean": 0.02822580561041832, "rewards/accuracy_reward/std": 0.1657845675945282, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 766.595703125, "completions/mean_terminated_length": 759.0432739257812, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.00990014508833319, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1412372841169799, "kl": 0.006683349609375, "learning_rate": 1.9112627986348124e-06, "loss": 0.0299, "num_tokens": 13830550.0, "reward": 2.03955078125, "reward_std": 0.14909085631370544, "rewards/accuracy_reward/mean": 0.06451612710952759, "rewards/accuracy_reward/std": 0.2459181249141693, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06310669332742691, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 756.369140625, "completions/mean_terminated_length": 756.369140625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.01024152940172399, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19341596718299997, "kl": 0.00778961181640625, "learning_rate": 1.9795221843003416e-06, "loss": 0.0012, "num_tokens": 14303411.0, "reward": 2.0859375, "reward_std": 0.14812877774238586, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 764.876953125, "completions/mean_terminated_length": 764.876953125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.01058291371511479, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17601401715831713, "kl": 0.00745391845703125, "learning_rate": 2.0477815699658705e-06, "loss": 0.0127, "num_tokens": 14779444.0, "reward": 2.0908203125, "reward_std": 0.15102121233940125, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03484956547617912, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 787.1796875, "completions/mean_terminated_length": 786.3972778320312, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.010924298028505591, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15293538943718132, "kl": 0.007781982421875, "learning_rate": 2.1160409556313997e-06, "loss": 0.0096, "num_tokens": 15265536.0, "reward": 2.07958984375, "reward_std": 0.17488238215446472, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 818.80078125, "completions/mean_terminated_length": 809.1220703125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.01126568234189639, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1602668773904955, "kl": 0.0070648193359375, "learning_rate": 2.1843003412969285e-06, "loss": 0.0217, "num_tokens": 15775130.0, "reward": 2.015625, "reward_std": 0.13852274417877197, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 766.484375, "completions/mean_terminated_length": 758.9312744140625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.01160706665528719, "frac_reward_zero_std": 0.625, "grad_norm": 0.15354791378667002, "kl": 0.008209228515625, "learning_rate": 2.2525597269624573e-06, "loss": 0.0198, "num_tokens": 16246466.0, "reward": 2.01904296875, "reward_std": 0.15988598763942719, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.06675629317760468, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 809.111328125, "completions/mean_terminated_length": 799.3563232421875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.01194845096867799, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1644315292317749, "kl": 0.009307861328125, "learning_rate": 2.3208191126279866e-06, "loss": 0.0284, "num_tokens": 16748379.0, "reward": 2.1220703125, "reward_std": 0.22698579728603363, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.060186829417943954, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 750.615234375, "completions/mean_terminated_length": 748.0762939453125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.01228983528206879, "frac_reward_zero_std": 0.53125, "grad_norm": 0.17525465881442223, "kl": 0.008514404296875, "learning_rate": 2.3890784982935154e-06, "loss": 0.0104, "num_tokens": 17222022.0, "reward": 2.06298828125, "reward_std": 0.1706467717885971, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 817.125, "completions/mean_terminated_length": 807.4330444335938, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.012631219595459589, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13541527166507658, "kl": 0.0085601806640625, "learning_rate": 2.4573378839590446e-06, "loss": 0.031, "num_tokens": 17720966.0, "reward": 2.02880859375, "reward_std": 0.1527203917503357, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.0753624215722084, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 762.181640625, "completions/mean_terminated_length": 759.6653442382812, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.012972603908850388, "frac_reward_zero_std": 0.53125, "grad_norm": 0.28583935597592536, "kl": 0.0110321044921875, "learning_rate": 2.5255972696245735e-06, "loss": 0.0102, "num_tokens": 18197555.0, "reward": 2.00537109375, "reward_std": 0.151970773935318, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.04515400901436806, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 799.130859375, "completions/mean_terminated_length": 799.130859375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.013313988222241188, "frac_reward_zero_std": 0.40625, "grad_norm": 0.2411281908739854, "kl": 0.009490966796875, "learning_rate": 2.5938566552901023e-06, "loss": 0.0066, "num_tokens": 18690246.0, "reward": 2.08984375, "reward_std": 0.22769387066364288, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04910992085933685, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 853.515625, "completions/mean_terminated_length": 852.3561401367188, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.013655372535631987, "frac_reward_zero_std": 0.53125, "grad_norm": 0.17682514476353361, "kl": 0.0099334716796875, "learning_rate": 2.662116040955632e-06, "loss": 0.0085, "num_tokens": 19213470.0, "reward": 2.0732421875, "reward_std": 0.1828780323266983, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 822.83984375, "completions/mean_terminated_length": 820.4422607421875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.013996756849022787, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1516435237245826, "kl": 0.0098724365234375, "learning_rate": 2.7303754266211608e-06, "loss": 0.0085, "num_tokens": 19711260.0, "reward": 2.04150390625, "reward_std": 0.13356801867485046, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 772.712890625, "completions/mean_terminated_length": 770.2172241210938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.014338141162413586, "frac_reward_zero_std": 0.40625, "grad_norm": 0.20680144948704263, "kl": 0.0105743408203125, "learning_rate": 2.7986348122866896e-06, "loss": 0.0003, "num_tokens": 20197881.0, "reward": 2.12451171875, "reward_std": 0.2566344141960144, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 915.7421875, "completions/mean_terminated_length": 913.5264282226562, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.014679525475804386, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10965918851369479, "kl": 0.008819580078125, "learning_rate": 2.8668941979522184e-06, "loss": 0.0052, "num_tokens": 20745973.0, "reward": 2.10791015625, "reward_std": 0.14487463235855103, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 846.67578125, "completions/mean_terminated_length": 837.216552734375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.015020909789195187, "frac_reward_zero_std": 0.375, "grad_norm": 0.15613792727768447, "kl": 0.011383056640625, "learning_rate": 2.9351535836177476e-06, "loss": 0.0294, "num_tokens": 21255311.0, "reward": 2.15771484375, "reward_std": 0.2871488630771637, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07204344123601913, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 851.689453125, "completions/mean_terminated_length": 849.3483276367188, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.015362294102585987, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14760441396092616, "kl": 0.0143890380859375, "learning_rate": 3.003412969283277e-06, "loss": 0.004, "num_tokens": 21778768.0, "reward": 2.07568359375, "reward_std": 0.10568341612815857, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 814.833984375, "completions/mean_terminated_length": 814.833984375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.015703678415976786, "frac_reward_zero_std": 0.40625, "grad_norm": 0.18616677701406573, "kl": 0.012603759765625, "learning_rate": 3.0716723549488057e-06, "loss": 0.0051, "num_tokens": 22276043.0, "reward": 2.111328125, "reward_std": 0.20531868934631348, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04096253588795662, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 884.90625, "completions/mean_terminated_length": 882.630126953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.016045062729367586, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14470657449684687, "kl": 0.0125579833984375, "learning_rate": 3.139931740614335e-06, "loss": 0.0056, "num_tokens": 22810571.0, "reward": 2.048828125, "reward_std": 0.15875791013240814, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.043846867978572845, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 901.37890625, "completions/mean_terminated_length": 894.620849609375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.016386447042758386, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1414827562694951, "kl": 0.010955810546875, "learning_rate": 3.2081911262798638e-06, "loss": 0.0083, "num_tokens": 23358413.0, "reward": 2.0234375, "reward_std": 0.1654457151889801, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.06179272010922432, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 906.9375, "completions/mean_terminated_length": 893.4071655273438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.016727831356149185, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13314098011248393, "kl": 0.0124053955078125, "learning_rate": 3.2764505119453926e-06, "loss": 0.0177, "num_tokens": 23901789.0, "reward": 2.06396484375, "reward_std": 0.16372129321098328, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05711371824145317, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 842.3203125, "completions/mean_terminated_length": 828.0237426757812, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.017069215669539985, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14262098714009108, "kl": 0.0130462646484375, "learning_rate": 3.3447098976109214e-06, "loss": 0.0364, "num_tokens": 24406753.0, "reward": 2.09521484375, "reward_std": 0.19180285930633545, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07204344123601913, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 841.986328125, "completions/mean_terminated_length": 841.986328125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.017410599982930784, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13546948188954713, "kl": 0.010528564453125, "learning_rate": 3.412969283276451e-06, "loss": 0.0042, "num_tokens": 24922714.0, "reward": 2.12646484375, "reward_std": 0.16320887207984924, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 851.44140625, "completions/mean_terminated_length": 851.44140625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.017751984296321584, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1418569478108374, "kl": 0.0111083984375, "learning_rate": 3.48122866894198e-06, "loss": 0.0065, "num_tokens": 25442604.0, "reward": 2.06298828125, "reward_std": 0.1530916392803192, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 802.248046875, "completions/mean_terminated_length": 799.8101806640625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.018093368609712383, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15726629329302486, "kl": 0.0141754150390625, "learning_rate": 3.5494880546075087e-06, "loss": 0.0235, "num_tokens": 25933147.0, "reward": 2.111328125, "reward_std": 0.18602468073368073, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 720.26953125, "completions/mean_terminated_length": 720.26953125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.018434752923103183, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1398131567140378, "kl": 0.0114593505859375, "learning_rate": 3.617747440273038e-06, "loss": -0.0034, "num_tokens": 26390565.0, "reward": 2.09130859375, "reward_std": 0.12315960973501205, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 841.99609375, "completions/mean_terminated_length": 841.99609375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.018776137236493982, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14460215798991333, "kl": 0.0112762451171875, "learning_rate": 3.6860068259385667e-06, "loss": 0.0009, "num_tokens": 26900163.0, "reward": 2.09716796875, "reward_std": 0.16678181290626526, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 727.873046875, "completions/mean_terminated_length": 727.873046875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.019117521549884782, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15847983093844253, "kl": 0.0118560791015625, "learning_rate": 3.7542662116040956e-06, "loss": -0.0025, "num_tokens": 27355666.0, "reward": 2.126953125, "reward_std": 0.19583982229232788, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 848.68359375, "completions/mean_terminated_length": 843.98046875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.01945890586327558, "frac_reward_zero_std": 0.5, "grad_norm": 0.16720374097356439, "kl": 0.0101318359375, "learning_rate": 3.822525597269625e-06, "loss": 0.0195, "num_tokens": 27873744.0, "reward": 2.08642578125, "reward_std": 0.19845128059387207, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.055034760385751724, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 848.81640625, "completions/mean_terminated_length": 846.4696655273438, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.01980029017666638, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1551357093456691, "kl": 0.0101470947265625, "learning_rate": 3.890784982935154e-06, "loss": -0.0002, "num_tokens": 28406306.0, "reward": 2.13427734375, "reward_std": 0.2071191370487213, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 868.041015625, "completions/mean_terminated_length": 865.7318725585938, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.02014167449005718, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15277243362939052, "kl": 0.009796142578125, "learning_rate": 3.959044368600683e-06, "loss": 0.0199, "num_tokens": 28924807.0, "reward": 2.220703125, "reward_std": 0.24765071272850037, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.42154473066329956, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 837.349609375, "completions/mean_terminated_length": 827.8169555664062, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.02048305880344798, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13685225435926437, "kl": 0.0087890625, "learning_rate": 4.027303754266212e-06, "loss": 0.0087, "num_tokens": 29442586.0, "reward": 2.09326171875, "reward_std": 0.17899833619594574, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 886.79296875, "completions/mean_terminated_length": 884.5205688476562, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.02082444311683878, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13298508159333194, "kl": 0.0106201171875, "learning_rate": 4.095563139931741e-06, "loss": 0.0107, "num_tokens": 29976384.0, "reward": 2.11865234375, "reward_std": 0.1559920608997345, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 706.08984375, "completions/mean_terminated_length": 706.08984375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.02116582743022958, "frac_reward_zero_std": 0.625, "grad_norm": 0.14406416317337442, "kl": 0.0115203857421875, "learning_rate": 4.163822525597269e-06, "loss": 0.0079, "num_tokens": 30417454.0, "reward": 2.15185546875, "reward_std": 0.1571672558784485, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 720.01953125, "completions/mean_terminated_length": 720.01953125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.021507211743620382, "frac_reward_zero_std": 0.625, "grad_norm": 0.14902486307223808, "kl": 0.0111236572265625, "learning_rate": 4.232081911262799e-06, "loss": 0.0068, "num_tokens": 30874792.0, "reward": 2.12890625, "reward_std": 0.1464948207139969, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 746.58984375, "completions/mean_terminated_length": 746.58984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.021848596057011182, "frac_reward_zero_std": 0.625, "grad_norm": 0.1300150051708278, "kl": 0.0108642578125, "learning_rate": 4.300341296928328e-06, "loss": 0.0033, "num_tokens": 31341606.0, "reward": 2.16162109375, "reward_std": 0.17012381553649902, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 727.578125, "completions/mean_terminated_length": 724.994140625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.02218998037040198, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15469557675643675, "kl": 0.010406494140625, "learning_rate": 4.368600682593857e-06, "loss": 0.013, "num_tokens": 31799886.0, "reward": 2.2236328125, "reward_std": 0.18897220492362976, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 799.56640625, "completions/mean_terminated_length": 799.56640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.02253136468379278, "frac_reward_zero_std": 0.75, "grad_norm": 0.09533756067579502, "kl": 0.009307861328125, "learning_rate": 4.436860068259386e-06, "loss": 0.002, "num_tokens": 32292240.0, "reward": 2.06884765625, "reward_std": 0.08973807096481323, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 759.392578125, "completions/mean_terminated_length": 759.392578125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.02287274899718358, "frac_reward_zero_std": 0.625, "grad_norm": 0.13817742998817506, "kl": 0.01104736328125, "learning_rate": 4.505119453924915e-06, "loss": 0.0035, "num_tokens": 32755145.0, "reward": 2.10107421875, "reward_std": 0.15076538920402527, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 744.6953125, "completions/mean_terminated_length": 742.1448364257812, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.02321413331057438, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1699752031278437, "kl": 0.0104522705078125, "learning_rate": 4.573378839590444e-06, "loss": 0.0107, "num_tokens": 33219357.0, "reward": 2.17138671875, "reward_std": 0.17760857939720154, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 768.427734375, "completions/mean_terminated_length": 765.9236450195312, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.02355551762396518, "frac_reward_zero_std": 0.75, "grad_norm": 0.1267262279839064, "kl": 0.0100555419921875, "learning_rate": 4.641638225255973e-06, "loss": 0.0016, "num_tokens": 33702840.0, "reward": 2.060546875, "reward_std": 0.09396954625844955, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 807.64453125, "completions/mean_terminated_length": 805.2172241210938, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.02389690193735598, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15300787963925722, "kl": 0.00946044921875, "learning_rate": 4.709897610921502e-06, "loss": 0.0093, "num_tokens": 34200626.0, "reward": 2.0869140625, "reward_std": 0.1690889447927475, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 762.787109375, "completions/mean_terminated_length": 760.2720336914062, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.02423828625074678, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1321676198079411, "kl": 0.010101318359375, "learning_rate": 4.778156996587031e-06, "loss": 0.0099, "num_tokens": 34672933.0, "reward": 2.095703125, "reward_std": 0.1370055377483368, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 741.2109375, "completions/mean_terminated_length": 741.2109375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.02457967056413758, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13068167613539136, "kl": 0.0119171142578125, "learning_rate": 4.84641638225256e-06, "loss": -0.0, "num_tokens": 35144193.0, "reward": 2.12939453125, "reward_std": 0.13878193497657776, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 753.27734375, "completions/mean_terminated_length": 750.74365234375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.024921054877528378, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13119415567921328, "kl": 0.0107574462890625, "learning_rate": 4.914675767918089e-06, "loss": 0.0143, "num_tokens": 35621855.0, "reward": 2.0859375, "reward_std": 0.12526729702949524, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 748.83203125, "completions/mean_terminated_length": 748.83203125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.025262439190919177, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13228613266170122, "kl": 0.01043701171875, "learning_rate": 4.982935153583618e-06, "loss": 0.0074, "num_tokens": 36084761.0, "reward": 2.14599609375, "reward_std": 0.13030928373336792, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 844.08203125, "completions/mean_terminated_length": 844.08203125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.025603823504309977, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11719312956580923, "kl": 0.0103302001953125, "learning_rate": 5.051194539249147e-06, "loss": -0.0027, "num_tokens": 36602563.0, "reward": 2.0498046875, "reward_std": 0.10757724940776825, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 790.970703125, "completions/mean_terminated_length": 790.970703125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.025945207817700777, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1399004001143959, "kl": 0.0127105712890625, "learning_rate": 5.119453924914676e-06, "loss": 0.0082, "num_tokens": 37082852.0, "reward": 2.1572265625, "reward_std": 0.17140349745750427, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 851.830078125, "completions/mean_terminated_length": 851.830078125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.026286592131091576, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13146610700581615, "kl": 0.010650634765625, "learning_rate": 5.1877133105802046e-06, "loss": 0.0109, "num_tokens": 37602125.0, "reward": 2.09326171875, "reward_std": 0.1320321261882782, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 818.9453125, "completions/mean_terminated_length": 809.2677001953125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.026627976444482376, "frac_reward_zero_std": 0.5, "grad_norm": 0.15935815577252627, "kl": 0.0115966796875, "learning_rate": 5.255972696245735e-06, "loss": 0.0254, "num_tokens": 38117793.0, "reward": 2.099609375, "reward_std": 0.18368026614189148, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.33332720398902893, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0695069283246994, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 862.716796875, "completions/mean_terminated_length": 860.3972778320312, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.026969360757873175, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14638309151877646, "kl": 0.010345458984375, "learning_rate": 5.324232081911264e-06, "loss": 0.0117, "num_tokens": 38645760.0, "reward": 2.0537109375, "reward_std": 0.17996756732463837, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 787.35546875, "completions/mean_terminated_length": 787.35546875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.027310745071263975, "frac_reward_zero_std": 0.625, "grad_norm": 0.1213320592778572, "kl": 0.0117645263671875, "learning_rate": 5.392491467576792e-06, "loss": 0.0118, "num_tokens": 39142262.0, "reward": 2.0849609375, "reward_std": 0.1529192179441452, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.06037704274058342, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 833.35546875, "completions/mean_terminated_length": 833.35546875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.027652129384654774, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1518712390644362, "kl": 0.011474609375, "learning_rate": 5.4607508532423215e-06, "loss": -0.0018, "num_tokens": 39646412.0, "reward": 2.18408203125, "reward_std": 0.19743922352790833, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 805.306640625, "completions/mean_terminated_length": 805.306640625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.027993513698045574, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15912866430296094, "kl": 0.01141357421875, "learning_rate": 5.529010238907851e-06, "loss": 0.0019, "num_tokens": 40143769.0, "reward": 2.15966796875, "reward_std": 0.16627737879753113, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 800.3046875, "completions/mean_terminated_length": 797.863037109375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.028334898011436373, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13576195671643787, "kl": 0.0109710693359375, "learning_rate": 5.597269624573379e-06, "loss": 0.0071, "num_tokens": 40633301.0, "reward": 2.05712890625, "reward_std": 0.10889958590269089, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 798.181640625, "completions/mean_terminated_length": 795.7357788085938, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.028676282324827173, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15046269275870813, "kl": 0.0116424560546875, "learning_rate": 5.665529010238908e-06, "loss": -0.0044, "num_tokens": 41122242.0, "reward": 2.0595703125, "reward_std": 0.18405753374099731, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 792.078125, "completions/mean_terminated_length": 784.6758422851562, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.029017666638217973, "frac_reward_zero_std": 0.5625, "grad_norm": 0.16535291130598712, "kl": 0.01202392578125, "learning_rate": 5.733788395904437e-06, "loss": 0.0187, "num_tokens": 41609258.0, "reward": 2.142578125, "reward_std": 0.17841148376464844, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 812.57421875, "completions/mean_terminated_length": 812.57421875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.029359050951608772, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14402724618705065, "kl": 0.011444091796875, "learning_rate": 5.802047781569966e-06, "loss": -0.0035, "num_tokens": 42116160.0, "reward": 2.11083984375, "reward_std": 0.14984716475009918, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.033087924122810364, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 731.646484375, "completions/mean_terminated_length": 729.0704345703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.02970043526499957, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16983335733608973, "kl": 0.0141448974609375, "learning_rate": 5.870307167235495e-06, "loss": 0.0198, "num_tokens": 42578267.0, "reward": 2.13525390625, "reward_std": 0.18807247281074524, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 796.76171875, "completions/mean_terminated_length": 796.76171875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.030041819578390375, "frac_reward_zero_std": 0.375, "grad_norm": 0.20496555064438712, "kl": 0.013885498046875, "learning_rate": 5.938566552901024e-06, "loss": 0.0053, "num_tokens": 43087441.0, "reward": 2.138671875, "reward_std": 0.23711621761322021, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07124487310647964, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 857.607421875, "completions/mean_terminated_length": 855.2778930664062, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.030383203891781174, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1347164730996847, "kl": 0.010772705078125, "learning_rate": 6.006825938566554e-06, "loss": 0.0134, "num_tokens": 43605816.0, "reward": 2.09521484375, "reward_std": 0.17403262853622437, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 850.666015625, "completions/mean_terminated_length": 845.9706420898438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.030724588205171974, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12016585319330404, "kl": 0.010833740234375, "learning_rate": 6.075085324232083e-06, "loss": 0.0083, "num_tokens": 44122045.0, "reward": 2.1025390625, "reward_std": 0.14531686902046204, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 800.958984375, "completions/mean_terminated_length": 800.958984375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.031065972518562773, "frac_reward_zero_std": 0.625, "grad_norm": 0.13900997260178893, "kl": 0.0114898681640625, "learning_rate": 6.143344709897611e-06, "loss": 0.0027, "num_tokens": 44613064.0, "reward": 2.107421875, "reward_std": 0.16039569675922394, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 932.52734375, "completions/mean_terminated_length": 932.52734375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.03140735683195357, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11082713979873099, "kl": 0.01068115234375, "learning_rate": 6.211604095563141e-06, "loss": -0.0, "num_tokens": 45188038.0, "reward": 2.0654296875, "reward_std": 0.12064535915851593, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.26366615295410156, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 798.77734375, "completions/mean_terminated_length": 798.77734375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.03174874114534437, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1739922500997126, "kl": 0.0142974853515625, "learning_rate": 6.27986348122867e-06, "loss": 0.0015, "num_tokens": 45671780.0, "reward": 2.12353515625, "reward_std": 0.18916988372802734, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 824.947265625, "completions/mean_terminated_length": 824.947265625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.03209012545873517, "frac_reward_zero_std": 0.53125, "grad_norm": 0.150345548109226, "kl": 0.0130462646484375, "learning_rate": 6.348122866894198e-06, "loss": 0.0017, "num_tokens": 46177449.0, "reward": 2.0986328125, "reward_std": 0.17094185948371887, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 875.662109375, "completions/mean_terminated_length": 873.367919921875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.03243150977212597, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14453312653089773, "kl": 0.0129547119140625, "learning_rate": 6.4163822525597275e-06, "loss": 0.0099, "num_tokens": 46713484.0, "reward": 2.16943359375, "reward_std": 0.1710798442363739, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 773.3359375, "completions/mean_terminated_length": 770.8414916992188, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.03277289408551677, "frac_reward_zero_std": 0.5, "grad_norm": 0.16922953822252232, "kl": 0.01519775390625, "learning_rate": 6.484641638225257e-06, "loss": 0.0224, "num_tokens": 47184888.0, "reward": 2.1337890625, "reward_std": 0.17767873406410217, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 798.6796875, "completions/mean_terminated_length": 793.7804565429688, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.03311427839890757, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11781296551157633, "kl": 0.012603759765625, "learning_rate": 6.552901023890785e-06, "loss": -0.0017, "num_tokens": 47675492.0, "reward": 2.1181640625, "reward_std": 0.11990725994110107, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 811.14453125, "completions/mean_terminated_length": 808.7240600585938, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.03345566271229837, "frac_reward_zero_std": 0.625, "grad_norm": 0.13804520944937634, "kl": 0.0133819580078125, "learning_rate": 6.621160409556314e-06, "loss": 0.0143, "num_tokens": 48177326.0, "reward": 2.07275390625, "reward_std": 0.15440312027931213, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 756.46875, "completions/mean_terminated_length": 755.1624145507812, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.03379704702568917, "frac_reward_zero_std": 0.78125, "grad_norm": 14.773057563874916, "kl": 0.1103363037109375, "learning_rate": 6.689419795221843e-06, "loss": 0.0091, "num_tokens": 48647150.0, "reward": 2.0703125, "reward_std": 0.08849336206912994, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 797.341796875, "completions/mean_terminated_length": 794.1333618164062, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.03413843133907997, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1312800408165888, "kl": 0.0168304443359375, "learning_rate": 6.757679180887372e-06, "loss": 0.0151, "num_tokens": 49138157.0, "reward": 2.06787109375, "reward_std": 0.1108778864145279, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 806.927734375, "completions/mean_terminated_length": 806.927734375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.03447981565247077, "frac_reward_zero_std": 0.6875, "grad_norm": 0.139579446612696, "kl": 0.0162200927734375, "learning_rate": 6.825938566552902e-06, "loss": 0.0004, "num_tokens": 49628200.0, "reward": 2.05224609375, "reward_std": 0.12144151329994202, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 776.900390625, "completions/mean_terminated_length": 776.900390625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.03482119996586157, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10980986552426582, "kl": 0.0139923095703125, "learning_rate": 6.894197952218431e-06, "loss": -0.0001, "num_tokens": 50108789.0, "reward": 2.08154296875, "reward_std": 0.10082019865512848, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 723.1796875, "completions/mean_terminated_length": 723.1796875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.03516258427925237, "frac_reward_zero_std": 0.53125, "grad_norm": 0.173467658688331, "kl": 0.0151214599609375, "learning_rate": 6.96245733788396e-06, "loss": -0.002, "num_tokens": 50562545.0, "reward": 2.0888671875, "reward_std": 0.1717228889465332, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 770.9140625, "completions/mean_terminated_length": 770.9140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.03550396859264317, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14734989494462525, "kl": 0.0144195556640625, "learning_rate": 7.030716723549489e-06, "loss": 0.0074, "num_tokens": 51044453.0, "reward": 2.1845703125, "reward_std": 0.22156678140163422, "rewards/accuracy_reward/mean": 0.19556452333927155, "rewards/accuracy_reward/std": 0.3970351219177246, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 788.359375, "completions/mean_terminated_length": 783.419677734375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.03584535290603397, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14202678434724195, "kl": 0.014312744140625, "learning_rate": 7.098976109215017e-06, "loss": 0.0228, "num_tokens": 51530109.0, "reward": 2.107421875, "reward_std": 0.17038534581661224, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 748.271484375, "completions/mean_terminated_length": 748.271484375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.03618673721942477, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1229791689199644, "kl": 0.0144500732421875, "learning_rate": 7.167235494880547e-06, "loss": 0.0092, "num_tokens": 52003784.0, "reward": 2.150390625, "reward_std": 0.1716441512107849, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 788.154296875, "completions/mean_terminated_length": 788.154296875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.036528121532815566, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13187504103241537, "kl": 0.0145111083984375, "learning_rate": 7.235494880546076e-06, "loss": 0.0111, "num_tokens": 52490167.0, "reward": 2.11279296875, "reward_std": 0.16934418678283691, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 832.939453125, "completions/mean_terminated_length": 830.5616455078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.036869505846206366, "frac_reward_zero_std": 0.625, "grad_norm": 0.12148007824629696, "kl": 0.01519775390625, "learning_rate": 7.303754266211604e-06, "loss": 0.0021, "num_tokens": 53004264.0, "reward": 2.11962890625, "reward_std": 0.15868374705314636, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 808.375, "completions/mean_terminated_length": 805.9490966796875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.037210890159597165, "frac_reward_zero_std": 0.5, "grad_norm": 0.16250981672801032, "kl": 0.016326904296875, "learning_rate": 7.3720136518771335e-06, "loss": 0.004, "num_tokens": 53495400.0, "reward": 2.142578125, "reward_std": 0.17896346747875214, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 845.50390625, "completions/mean_terminated_length": 843.1506958007812, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.037552274472987965, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11967100958582799, "kl": 0.015869140625, "learning_rate": 7.440273037542663e-06, "loss": 0.0093, "num_tokens": 54009002.0, "reward": 2.06689453125, "reward_std": 0.13457003235816956, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 805.09375, "completions/mean_terminated_length": 805.09375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.037893658786378764, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13446570626059617, "kl": 0.016143798828125, "learning_rate": 7.508532423208191e-06, "loss": 0.0009, "num_tokens": 54503706.0, "reward": 2.16162109375, "reward_std": 0.16743090748786926, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 818.98046875, "completions/mean_terminated_length": 818.98046875, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.038235043099769564, "frac_reward_zero_std": 0.625, "grad_norm": 0.12987160821238156, "kl": 0.01837158203125, "learning_rate": 7.57679180887372e-06, "loss": -0.0022, "num_tokens": 55003584.0, "reward": 2.13232421875, "reward_std": 0.15996167063713074, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 794.775390625, "completions/mean_terminated_length": 791.1961059570312, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.038576427413160363, "frac_reward_zero_std": 0.75, "grad_norm": 0.23003810330297964, "kl": 0.0301513671875, "learning_rate": 7.64505119453925e-06, "loss": 0.0014, "num_tokens": 55490365.0, "reward": 2.13427734375, "reward_std": 0.08840136229991913, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.3546403646469116, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 849.607421875, "completions/mean_terminated_length": 849.607421875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.03891781172655116, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10831987736242026, "kl": 0.01873779296875, "learning_rate": 7.713310580204779e-06, "loss": 0.0105, "num_tokens": 56009876.0, "reward": 2.08154296875, "reward_std": 0.1053328663110733, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 874.328125, "completions/mean_terminated_length": 872.0313110351562, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.03925919603994196, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12393681387888436, "kl": 0.0196533203125, "learning_rate": 7.781569965870308e-06, "loss": 0.0123, "num_tokens": 56531372.0, "reward": 2.23681640625, "reward_std": 0.18657714128494263, "rewards/accuracy_reward/mean": 0.240234375, "rewards/accuracy_reward/std": 0.4276435375213623, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 862.244140625, "completions/mean_terminated_length": 856.58154296875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.03960058035333276, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14589664735270078, "kl": 0.02691650390625, "learning_rate": 7.849829351535837e-06, "loss": 0.0124, "num_tokens": 57048665.0, "reward": 2.1171875, "reward_std": 0.16706842184066772, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 799.126953125, "completions/mean_terminated_length": 793.290771484375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.03994196466672356, "frac_reward_zero_std": 0.5625, "grad_norm": 23.36638142793221, "kl": 0.61322021484375, "learning_rate": 7.918088737201367e-06, "loss": 0.0436, "num_tokens": 57536714.0, "reward": 2.11669921875, "reward_std": 0.15579885244369507, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 811.896484375, "completions/mean_terminated_length": 811.896484375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.04028334898011436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13393104094411312, "kl": 0.022979736328125, "learning_rate": 7.986348122866894e-06, "loss": 0.0056, "num_tokens": 58037477.0, "reward": 2.1171875, "reward_std": 0.1605905294418335, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 894.486328125, "completions/mean_terminated_length": 892.2289428710938, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.04062473329350516, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15552137557378531, "kl": 0.02105712890625, "learning_rate": 8.054607508532423e-06, "loss": 0.0177, "num_tokens": 58586622.0, "reward": 2.146484375, "reward_std": 0.21458591520786285, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 794.009765625, "completions/mean_terminated_length": 794.009765625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.04096611760689596, "frac_reward_zero_std": 0.46875, "grad_norm": 0.16520585369284563, "kl": 0.023773193359375, "learning_rate": 8.122866894197953e-06, "loss": 0.0142, "num_tokens": 59069507.0, "reward": 2.16015625, "reward_std": 0.20333366096019745, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 842.58984375, "completions/mean_terminated_length": 837.3753051757812, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.04130750192028676, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14565391936728855, "kl": 0.03326416015625, "learning_rate": 8.191126279863482e-06, "loss": 0.0206, "num_tokens": 59583393.0, "reward": 2.083984375, "reward_std": 0.15823453664779663, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 889.33203125, "completions/mean_terminated_length": 889.33203125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.04164888623367756, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1228738239815732, "kl": 0.021514892578125, "learning_rate": 8.259385665529011e-06, "loss": 0.0146, "num_tokens": 60136875.0, "reward": 2.1875, "reward_std": 0.14405205845832825, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 784.853515625, "completions/mean_terminated_length": 784.853515625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.04199027054706836, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13651652066975964, "kl": 0.023162841796875, "learning_rate": 8.327645051194539e-06, "loss": 0.0096, "num_tokens": 60622208.0, "reward": 2.14306640625, "reward_std": 0.1921081840991974, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 708.06640625, "completions/mean_terminated_length": 708.06640625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.04233165486045916, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15755310853648846, "kl": 0.0244140625, "learning_rate": 8.395904436860068e-06, "loss": 0.0054, "num_tokens": 61063122.0, "reward": 2.16259765625, "reward_std": 0.20249128341674805, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 717.90625, "completions/mean_terminated_length": 716.870849609375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.04267303917384996, "frac_reward_zero_std": 0.625, "grad_norm": 0.15758264995170257, "kl": 0.03106689453125, "learning_rate": 8.464163822525599e-06, "loss": -0.0009, "num_tokens": 61523122.0, "reward": 2.0849609375, "reward_std": 0.15378868579864502, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 780.58203125, "completions/mean_terminated_length": 780.58203125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.043014423487240765, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12745665302309464, "kl": 0.022247314453125, "learning_rate": 8.532423208191128e-06, "loss": 0.008, "num_tokens": 62008972.0, "reward": 2.0625, "reward_std": 0.12641537189483643, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 664.947265625, "completions/mean_terminated_length": 664.947265625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.043355807800631564, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12152771242276154, "kl": 0.023651123046875, "learning_rate": 8.600682593856656e-06, "loss": 0.0024, "num_tokens": 62425729.0, "reward": 2.09130859375, "reward_std": 0.10461260378360748, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 716.982421875, "completions/mean_terminated_length": 716.982421875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.043697192114022364, "frac_reward_zero_std": 0.625, "grad_norm": 0.13949923775163436, "kl": 0.020904541015625, "learning_rate": 8.668941979522185e-06, "loss": -0.0023, "num_tokens": 62879656.0, "reward": 2.06103515625, "reward_std": 0.13731415569782257, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 737.13671875, "completions/mean_terminated_length": 737.13671875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.04403857642741316, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12412023500467365, "kl": 0.020538330078125, "learning_rate": 8.737201365187714e-06, "loss": -0.0044, "num_tokens": 63343470.0, "reward": 2.103515625, "reward_std": 0.1237497627735138, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 760.24609375, "completions/mean_terminated_length": 756.8843383789062, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.04437996074080396, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1570309229477717, "kl": 0.03814697265625, "learning_rate": 8.805460750853243e-06, "loss": 0.0032, "num_tokens": 63818172.0, "reward": 2.1064453125, "reward_std": 0.13378655910491943, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 780.78125, "completions/mean_terminated_length": 780.78125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.04472134505419476, "frac_reward_zero_std": 0.5, "grad_norm": 0.14625134707123474, "kl": 0.01922607421875, "learning_rate": 8.873720136518773e-06, "loss": 0.0014, "num_tokens": 64302908.0, "reward": 2.119140625, "reward_std": 0.17071321606636047, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 836.42578125, "completions/mean_terminated_length": 835.2974243164062, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.04506272936758556, "frac_reward_zero_std": 0.59375, "grad_norm": 1.775331461244142, "kl": 0.028228759765625, "learning_rate": 8.9419795221843e-06, "loss": 0.0075, "num_tokens": 64824534.0, "reward": 2.14990234375, "reward_std": 0.15210983157157898, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 890.31640625, "completions/mean_terminated_length": 877.4012451171875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.04540411368097636, "frac_reward_zero_std": 0.625, "grad_norm": 0.1672070751839777, "kl": 0.02508544921875, "learning_rate": 9.01023890784983e-06, "loss": 0.0139, "num_tokens": 65360424.0, "reward": 2.0751953125, "reward_std": 0.14532876014709473, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 858.533203125, "completions/mean_terminated_length": 849.1672973632812, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.04574549799436716, "frac_reward_zero_std": 0.625, "grad_norm": 0.11883278298681166, "kl": 0.020599365234375, "learning_rate": 9.078498293515359e-06, "loss": 0.0198, "num_tokens": 65879625.0, "reward": 2.05126953125, "reward_std": 0.16094663739204407, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 805.9921875, "completions/mean_terminated_length": 805.9921875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.04608688230775796, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1586830098285242, "kl": 0.024078369140625, "learning_rate": 9.146757679180888e-06, "loss": 0.0137, "num_tokens": 66373109.0, "reward": 2.236328125, "reward_std": 0.18574437499046326, "rewards/accuracy_reward/mean": 0.236328125, "rewards/accuracy_reward/std": 0.42524150013923645, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 855.70703125, "completions/mean_terminated_length": 852.61962890625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.04642826662114876, "frac_reward_zero_std": 0.65625, "grad_norm": 1.2711689890858628, "kl": 0.02783203125, "learning_rate": 9.215017064846417e-06, "loss": 0.0074, "num_tokens": 66895471.0, "reward": 2.1103515625, "reward_std": 0.12417568266391754, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 937.83984375, "completions/mean_terminated_length": 915.7591552734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.04676965093453956, "frac_reward_zero_std": 0.625, "grad_norm": 29.696749998410123, "kl": 0.90380859375, "learning_rate": 9.283276450511946e-06, "loss": 0.0513, "num_tokens": 67492797.0, "reward": 2.0380859375, "reward_std": 0.14263811707496643, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.0771619975566864, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 919.318359375, "completions/mean_terminated_length": 917.1095581054688, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.04711103524793036, "frac_reward_zero_std": 0.75, "grad_norm": 0.1136252490093659, "kl": 0.025482177734375, "learning_rate": 9.351535836177476e-06, "loss": 0.0027, "num_tokens": 68040480.0, "reward": 2.08740234375, "reward_std": 0.0996030941605568, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 988.435546875, "completions/mean_terminated_length": 966.34130859375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.04745241956132116, "frac_reward_zero_std": 0.625, "grad_norm": 211.45893338604253, "kl": 20.14398193359375, "learning_rate": 9.419795221843005e-06, "loss": 0.8204, "num_tokens": 68633599.0, "reward": 2.09130859375, "reward_std": 0.1526174247264862, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.06489825993776321, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 939.4296875, "completions/mean_terminated_length": 928.4970703125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.04779380387471196, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1388573447300414, "kl": 0.025726318359375, "learning_rate": 9.488054607508534e-06, "loss": 0.0128, "num_tokens": 69191627.0, "reward": 2.12255859375, "reward_std": 0.14924678206443787, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.3464137017726898, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 934.23828125, "completions/mean_terminated_length": 932.0587158203125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.04813518818810276, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14562856912141742, "kl": 0.02642822265625, "learning_rate": 9.556313993174062e-06, "loss": 0.009, "num_tokens": 69753093.0, "reward": 2.1845703125, "reward_std": 0.16715767979621887, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 932.203125, "completions/mean_terminated_length": 927.8275146484375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.04847657250149356, "frac_reward_zero_std": 0.5, "grad_norm": 0.1357094737964101, "kl": 0.024169921875, "learning_rate": 9.62457337883959e-06, "loss": 0.0145, "num_tokens": 70319277.0, "reward": 2.10205078125, "reward_std": 0.19469258189201355, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 820.94921875, "completions/mean_terminated_length": 820.94921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.04881795681488436, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13978783614094345, "kl": 0.025726318359375, "learning_rate": 9.69283276450512e-06, "loss": 0.0016, "num_tokens": 70818435.0, "reward": 2.1396484375, "reward_std": 0.17956265807151794, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 777.27734375, "completions/mean_terminated_length": 777.27734375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.04915934112827516, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13688798655672554, "kl": 0.027618408203125, "learning_rate": 9.76109215017065e-06, "loss": 0.0113, "num_tokens": 71302673.0, "reward": 2.109375, "reward_std": 0.14033456146717072, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 664.71484375, "completions/mean_terminated_length": 664.71484375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.049500725441665956, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16457664195291086, "kl": 0.031341552734375, "learning_rate": 9.829351535836179e-06, "loss": 0.0015, "num_tokens": 71718943.0, "reward": 2.140625, "reward_std": 0.1254390925168991, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 753.400390625, "completions/mean_terminated_length": 750.866943359375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.049842109755056756, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13289543245894916, "kl": 0.027984619140625, "learning_rate": 9.897610921501706e-06, "loss": 0.0143, "num_tokens": 72184492.0, "reward": 2.1474609375, "reward_std": 0.14891016483306885, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 716.302734375, "completions/mean_terminated_length": 716.302734375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.050183494068447555, "frac_reward_zero_std": 0.59375, "grad_norm": 0.140218639091156, "kl": 0.028564453125, "learning_rate": 9.965870307167235e-06, "loss": 0.0163, "num_tokens": 72635991.0, "reward": 2.12744140625, "reward_std": 0.1545775830745697, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.3464137017726898, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 766.693359375, "completions/mean_terminated_length": 766.693359375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.050524878381838355, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13423928584329878, "kl": 0.02728271484375, "learning_rate": 1.0034129692832766e-05, "loss": 0.0131, "num_tokens": 73112922.0, "reward": 2.17041015625, "reward_std": 0.17447277903556824, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 813.576171875, "completions/mean_terminated_length": 813.576171875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.050866262695229154, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11753861971917089, "kl": 0.02410888671875, "learning_rate": 1.0102389078498294e-05, "loss": 0.0059, "num_tokens": 73613185.0, "reward": 2.1025390625, "reward_std": 0.11042675375938416, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 739.36328125, "completions/mean_terminated_length": 734.2314453125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.051207647008619954, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11928691831416911, "kl": 0.029327392578125, "learning_rate": 1.0170648464163823e-05, "loss": 0.0119, "num_tokens": 74072555.0, "reward": 2.13330078125, "reward_std": 0.12418802082538605, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 740.3046875, "completions/mean_terminated_length": 740.3046875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.051549031322010753, "frac_reward_zero_std": 0.75, "grad_norm": 0.12225419403072507, "kl": 0.027191162109375, "learning_rate": 1.0238907849829352e-05, "loss": -0.0027, "num_tokens": 74539239.0, "reward": 2.0615234375, "reward_std": 0.09262566268444061, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 798.265625, "completions/mean_terminated_length": 798.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.05189041563540155, "frac_reward_zero_std": 0.625, "grad_norm": 0.14291999975728545, "kl": 0.027313232421875, "learning_rate": 1.0307167235494882e-05, "loss": 0.0023, "num_tokens": 75037919.0, "reward": 2.14013671875, "reward_std": 0.15727964043617249, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 851.537109375, "completions/mean_terminated_length": 851.537109375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.05223179994879235, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14441667515682752, "kl": 0.02435302734375, "learning_rate": 1.0375426621160409e-05, "loss": 0.0029, "num_tokens": 75554098.0, "reward": 2.05517578125, "reward_std": 0.14818859100341797, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 771.70703125, "completions/mean_terminated_length": 769.2094116210938, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.05257318426218315, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1491113926273657, "kl": 0.02691650390625, "learning_rate": 1.044368600682594e-05, "loss": 0.0026, "num_tokens": 76029004.0, "reward": 2.0791015625, "reward_std": 0.15503205358982086, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 761.845703125, "completions/mean_terminated_length": 761.845703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.05291456857557395, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1575088547931706, "kl": 0.027679443359375, "learning_rate": 1.051194539249147e-05, "loss": 0.0024, "num_tokens": 76492029.0, "reward": 2.08544921875, "reward_std": 0.19507813453674316, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 842.49609375, "completions/mean_terminated_length": 837.7686767578125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.05325595288896475, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11822692096080516, "kl": 0.024688720703125, "learning_rate": 1.0580204778156997e-05, "loss": 0.0164, "num_tokens": 77002011.0, "reward": 2.11962890625, "reward_std": 0.16266947984695435, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 756.525390625, "completions/mean_terminated_length": 756.525390625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.05359733720235555, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1770235349270608, "kl": 0.026641845703125, "learning_rate": 1.0648464163822528e-05, "loss": 0.0049, "num_tokens": 77464824.0, "reward": 2.142578125, "reward_std": 0.16239947080612183, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 845.966796875, "completions/mean_terminated_length": 845.966796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05393872151574635, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13619097639363412, "kl": 0.02520751953125, "learning_rate": 1.0716723549488055e-05, "loss": 0.0016, "num_tokens": 77984055.0, "reward": 2.09326171875, "reward_std": 0.14367687702178955, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 879.0, "completions/mean_terminated_length": 879.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.05428010582913715, "frac_reward_zero_std": 0.75, "grad_norm": 0.11208350071598998, "kl": 0.026611328125, "learning_rate": 1.0784982935153585e-05, "loss": 0.0015, "num_tokens": 78519959.0, "reward": 2.091796875, "reward_std": 0.10217027366161346, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 920.396484375, "completions/mean_terminated_length": 916.1353759765625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.05462149014252795, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11225178390434097, "kl": 0.0426025390625, "learning_rate": 1.0853242320819112e-05, "loss": 0.0166, "num_tokens": 79073746.0, "reward": 2.13427734375, "reward_std": 0.1238735169172287, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 927.939453125, "completions/mean_terminated_length": 923.547119140625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.05496287445591875, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1343829496504795, "kl": 0.026275634765625, "learning_rate": 1.0921501706484643e-05, "loss": 0.0221, "num_tokens": 79636259.0, "reward": 2.14013671875, "reward_std": 0.13714823126792908, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 897.39453125, "completions/mean_terminated_length": 888.3346557617188, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.05530425876930955, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1136040890486408, "kl": 0.02593994140625, "learning_rate": 1.098976109215017e-05, "loss": 0.0161, "num_tokens": 80178029.0, "reward": 2.09130859375, "reward_std": 0.1323082149028778, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 943.642578125, "completions/mean_terminated_length": 937.1336059570312, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.05564564308270035, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13734077663722105, "kl": 0.027313232421875, "learning_rate": 1.1058020477815702e-05, "loss": 0.0281, "num_tokens": 80743334.0, "reward": 2.10546875, "reward_std": 0.1802365481853485, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.060289934277534485, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 947.150390625, "completions/mean_terminated_length": 934.0968627929688, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.05598702739609115, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13251484437171362, "kl": 0.026947021484375, "learning_rate": 1.1126279863481229e-05, "loss": 0.0236, "num_tokens": 81311843.0, "reward": 2.09814453125, "reward_std": 0.17692013084888458, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0814909115433693, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 925.6328125, "completions/mean_terminated_length": 925.6328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05632841170948195, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12220602159910018, "kl": 0.027496337890625, "learning_rate": 1.1194539249146758e-05, "loss": 0.0132, "num_tokens": 81867319.0, "reward": 2.08251953125, "reward_std": 0.13102984428405762, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 846.525390625, "completions/mean_terminated_length": 844.1741333007812, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.05666979602287275, "frac_reward_zero_std": 0.5, "grad_norm": 0.13773337283759685, "kl": 0.025482177734375, "learning_rate": 1.126279863481229e-05, "loss": 0.0204, "num_tokens": 82383444.0, "reward": 2.1259765625, "reward_std": 0.19696414470672607, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 860.966796875, "completions/mean_terminated_length": 858.643798828125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.057011180336263546, "frac_reward_zero_std": 0.59375, "grad_norm": 0.136329055708113, "kl": 0.026580810546875, "learning_rate": 1.1331058020477817e-05, "loss": 0.0207, "num_tokens": 82900499.0, "reward": 2.091796875, "reward_std": 0.15305736660957336, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 808.658203125, "completions/mean_terminated_length": 803.2180786132812, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.057352564649654346, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13503109068655123, "kl": 0.0469970703125, "learning_rate": 1.1399317406143346e-05, "loss": 0.018, "num_tokens": 83407604.0, "reward": 2.07763671875, "reward_std": 0.1351052224636078, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 678.689453125, "completions/mean_terminated_length": 678.689453125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.057693948963045145, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15563234289455563, "kl": 0.03271484375, "learning_rate": 1.1467576791808874e-05, "loss": 0.0049, "num_tokens": 83834085.0, "reward": 2.15625, "reward_std": 0.15748226642608643, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 781.623046875, "completions/mean_terminated_length": 781.623046875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.058035333276435945, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12253344571814269, "kl": 0.03466796875, "learning_rate": 1.1535836177474405e-05, "loss": 0.0041, "num_tokens": 84318788.0, "reward": 2.041015625, "reward_std": 0.10194280743598938, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 767.984375, "completions/mean_terminated_length": 765.4794311523438, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.058376717589826745, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1197193218592598, "kl": 0.03076171875, "learning_rate": 1.1604095563139932e-05, "loss": 0.0141, "num_tokens": 84795132.0, "reward": 2.15185546875, "reward_std": 0.13680344820022583, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 766.482421875, "completions/mean_terminated_length": 766.482421875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.058718101903217544, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14870877342289215, "kl": 0.031890869140625, "learning_rate": 1.1672354948805463e-05, "loss": 0.0032, "num_tokens": 85278835.0, "reward": 2.11083984375, "reward_std": 0.15709464251995087, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 844.134765625, "completions/mean_terminated_length": 841.7788696289062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.059059486216608344, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12082804099168512, "kl": 0.02886962890625, "learning_rate": 1.174061433447099e-05, "loss": 0.018, "num_tokens": 85792376.0, "reward": 2.15185546875, "reward_std": 0.14056313037872314, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 786.158203125, "completions/mean_terminated_length": 786.158203125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.05940087052999914, "frac_reward_zero_std": 0.5, "grad_norm": 0.1396461425948652, "kl": 0.02984619140625, "learning_rate": 1.180887372013652e-05, "loss": 0.0062, "num_tokens": 86280537.0, "reward": 2.189453125, "reward_std": 0.1953611522912979, "rewards/accuracy_reward/mean": 0.19556452333927155, "rewards/accuracy_reward/std": 0.3970350921154022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 769.63671875, "completions/mean_terminated_length": 769.63671875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.05974225484338994, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12399933034645906, "kl": 0.030914306640625, "learning_rate": 1.1877133105802047e-05, "loss": 0.0048, "num_tokens": 86757967.0, "reward": 2.1640625, "reward_std": 0.13375158607959747, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 768.486328125, "completions/mean_terminated_length": 768.486328125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.06008363915678075, "frac_reward_zero_std": 0.53125, "grad_norm": 0.17134182438014373, "kl": 0.03619384765625, "learning_rate": 1.1945392491467578e-05, "loss": -0.0054, "num_tokens": 87236920.0, "reward": 2.12255859375, "reward_std": 0.1612197607755661, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 824.662109375, "completions/mean_terminated_length": 822.26806640625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.06042502347017155, "frac_reward_zero_std": 0.625, "grad_norm": 0.1218083368490041, "kl": 0.03265380859375, "learning_rate": 1.2013651877133108e-05, "loss": 0.019, "num_tokens": 87742251.0, "reward": 2.15966796875, "reward_std": 0.15033157169818878, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 816.025390625, "completions/mean_terminated_length": 816.025390625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.06076640778356235, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12499063144924824, "kl": 0.0338134765625, "learning_rate": 1.2081911262798635e-05, "loss": 0.0056, "num_tokens": 88244824.0, "reward": 2.146484375, "reward_std": 0.13786455988883972, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 818.732421875, "completions/mean_terminated_length": 818.732421875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.06110779209695315, "frac_reward_zero_std": 0.75, "grad_norm": 0.10317506397002195, "kl": 0.0330810546875, "learning_rate": 1.2150170648464166e-05, "loss": 0.0026, "num_tokens": 88743759.0, "reward": 2.09326171875, "reward_std": 0.09420640766620636, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 856.703125, "completions/mean_terminated_length": 856.703125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.06144917641034395, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11599089080710942, "kl": 0.0345458984375, "learning_rate": 1.2218430034129694e-05, "loss": 0.0072, "num_tokens": 89277111.0, "reward": 2.06640625, "reward_std": 0.09743183106184006, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 751.109375, "completions/mean_terminated_length": 751.109375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.06179056072373475, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12655031798422983, "kl": 0.0357666015625, "learning_rate": 1.2286689419795223e-05, "loss": 0.0108, "num_tokens": 89748959.0, "reward": 2.15625, "reward_std": 0.11618966609239578, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 832.57421875, "completions/mean_terminated_length": 832.57421875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06213194503712555, "frac_reward_zero_std": 0.75, "grad_norm": 0.105943027262723, "kl": 0.03717041015625, "learning_rate": 1.2354948805460752e-05, "loss": -0.0034, "num_tokens": 90259013.0, "reward": 2.052734375, "reward_std": 0.0915808230638504, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 765.556640625, "completions/mean_terminated_length": 765.556640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.062473329350516346, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12891613600834045, "kl": 0.03704833984375, "learning_rate": 1.2423208191126281e-05, "loss": 0.0001, "num_tokens": 90732962.0, "reward": 2.078125, "reward_std": 0.11533089727163315, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 829.21484375, "completions/mean_terminated_length": 829.21484375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.06281471366390715, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11497017257370187, "kl": 0.0352783203125, "learning_rate": 1.2491467576791809e-05, "loss": 0.0107, "num_tokens": 91239136.0, "reward": 2.09619140625, "reward_std": 0.12711459398269653, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 771.150390625, "completions/mean_terminated_length": 768.6516723632812, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.06315609797729795, "frac_reward_zero_std": 0.625, "grad_norm": 0.12216743963019966, "kl": 0.0382080078125, "learning_rate": 1.255972696245734e-05, "loss": 0.0087, "num_tokens": 91721277.0, "reward": 2.1318359375, "reward_std": 0.14486746490001678, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 802.814453125, "completions/mean_terminated_length": 800.377685546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.06349748229068874, "frac_reward_zero_std": 0.40625, "grad_norm": 0.18498185707347292, "kl": 0.0367431640625, "learning_rate": 1.2627986348122867e-05, "loss": 0.0222, "num_tokens": 92219646.0, "reward": 2.14697265625, "reward_std": 0.232492595911026, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 755.892578125, "completions/mean_terminated_length": 755.892578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.06383886660407954, "frac_reward_zero_std": 0.5, "grad_norm": 0.153740072271737, "kl": 0.03851318359375, "learning_rate": 1.2696245733788397e-05, "loss": 0.0236, "num_tokens": 92687239.0, "reward": 2.1259765625, "reward_std": 0.17176616191864014, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 847.017578125, "completions/mean_terminated_length": 847.017578125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.06418025091747034, "frac_reward_zero_std": 0.75, "grad_norm": 0.10923769505945445, "kl": 0.0367431640625, "learning_rate": 1.2764505119453924e-05, "loss": 0.0053, "num_tokens": 93204400.0, "reward": 2.0791015625, "reward_std": 0.09838822484016418, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 728.189453125, "completions/mean_terminated_length": 725.6066284179688, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.06452163523086114, "frac_reward_zero_std": 0.625, "grad_norm": 0.1528345230700535, "kl": 0.0394287109375, "learning_rate": 1.2832764505119455e-05, "loss": 0.0117, "num_tokens": 93659393.0, "reward": 2.083984375, "reward_std": 0.1433878242969513, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 805.837890625, "completions/mean_terminated_length": 805.837890625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.06486301954425194, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14430566553715315, "kl": 0.03741455078125, "learning_rate": 1.2901023890784984e-05, "loss": 0.0009, "num_tokens": 94150094.0, "reward": 2.12939453125, "reward_std": 0.17342619597911835, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 819.873046875, "completions/mean_terminated_length": 817.4696655273438, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.06520440385764274, "frac_reward_zero_std": 0.5, "grad_norm": 0.15181760085996146, "kl": 0.03948974609375, "learning_rate": 1.2969283276450513e-05, "loss": 0.0176, "num_tokens": 94657101.0, "reward": 2.1435546875, "reward_std": 0.1921914666891098, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 867.4140625, "completions/mean_terminated_length": 865.1036987304688, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.06554578817103354, "frac_reward_zero_std": 0.625, "grad_norm": 0.1534257910706746, "kl": 0.04296875, "learning_rate": 1.3037542662116043e-05, "loss": 0.0127, "num_tokens": 95182977.0, "reward": 2.0458984375, "reward_std": 0.14404582977294922, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 834.265625, "completions/mean_terminated_length": 834.265625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.06588717248442434, "frac_reward_zero_std": 0.5, "grad_norm": 0.1605823338113651, "kl": 0.041748046875, "learning_rate": 1.310580204778157e-05, "loss": 0.0056, "num_tokens": 95699497.0, "reward": 2.02685546875, "reward_std": 0.20409616827964783, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.0543358214199543, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 774.791015625, "completions/mean_terminated_length": 774.791015625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.06622855679781514, "frac_reward_zero_std": 0.09375, "grad_norm": 0.23348827043764103, "kl": 0.04644775390625, "learning_rate": 1.3174061433447101e-05, "loss": 0.0067, "num_tokens": 96179934.0, "reward": 1.92822265625, "reward_std": 0.4242745637893677, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756475806236267, "rewards/format_reward/mean": 0.87890625, "rewards/format_reward/std": 0.3265552520751953, "rewards/tag_count_reward/mean": 0.96923828125, "rewards/tag_count_reward/std": 0.08220306783914566, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 793.267578125, "completions/mean_terminated_length": 793.267578125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.06656994111120594, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17606760587146053, "kl": 0.04241943359375, "learning_rate": 1.3242320819112629e-05, "loss": 0.0111, "num_tokens": 96665159.0, "reward": 2.11474609375, "reward_std": 0.2291555404663086, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.04999455437064171, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 837.728515625, "completions/mean_terminated_length": 835.3600463867188, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06691132542459674, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12497162367393376, "kl": 0.042724609375, "learning_rate": 1.3310580204778158e-05, "loss": 0.0058, "num_tokens": 97175756.0, "reward": 2.046875, "reward_std": 0.16321980953216553, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 847.27734375, "completions/mean_terminated_length": 815.9960327148438, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.06725270973798754, "frac_reward_zero_std": 0.1875, "grad_norm": 0.26058449023408076, "kl": 0.04754638671875, "learning_rate": 1.3378839590443686e-05, "loss": 0.1057, "num_tokens": 97690378.0, "reward": 2.0732421875, "reward_std": 0.19499725103378296, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.09420246630907059, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 838.201171875, "completions/mean_terminated_length": 838.201171875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.06759409405137834, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11242935883527941, "kl": 0.03851318359375, "learning_rate": 1.3447098976109216e-05, "loss": 0.0065, "num_tokens": 98196497.0, "reward": 2.111328125, "reward_std": 0.12647771835327148, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 800.19140625, "completions/mean_terminated_length": 797.74951171875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06793547836476914, "frac_reward_zero_std": 0.625, "grad_norm": 0.12767124172803032, "kl": 0.040771484375, "learning_rate": 1.3515358361774744e-05, "loss": -0.0002, "num_tokens": 98689139.0, "reward": 2.140625, "reward_std": 0.15148448944091797, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 839.45703125, "completions/mean_terminated_length": 837.0919799804688, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.06827686267815994, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09255762955865272, "kl": 0.04180908203125, "learning_rate": 1.3583617747440275e-05, "loss": 0.0089, "num_tokens": 99192285.0, "reward": 2.07421875, "reward_std": 0.10983555018901825, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 838.986328125, "completions/mean_terminated_length": 838.986328125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.06861824699155074, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11370745735714806, "kl": 0.04022216796875, "learning_rate": 1.3651877133105804e-05, "loss": 0.0079, "num_tokens": 99704998.0, "reward": 2.0830078125, "reward_std": 0.1041383445262909, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 767.84765625, "completions/mean_terminated_length": 766.4520263671875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.06895963130494154, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14931082807102003, "kl": 0.05902099609375, "learning_rate": 1.3720136518771332e-05, "loss": 0.012, "num_tokens": 100187912.0, "reward": 2.16015625, "reward_std": 0.17492808401584625, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 805.5, "completions/mean_terminated_length": 805.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.06930101561833234, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1560195704919149, "kl": 0.04180908203125, "learning_rate": 1.3788395904436863e-05, "loss": -0.0041, "num_tokens": 100684296.0, "reward": 2.0654296875, "reward_std": 0.15003928542137146, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 653.884765625, "completions/mean_terminated_length": 652.6810302734375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.06964239993172314, "frac_reward_zero_std": 0.5, "grad_norm": 0.2414370899443503, "kl": 0.0753173828125, "learning_rate": 1.385665529010239e-05, "loss": -0.0075, "num_tokens": 101096989.0, "reward": 2.18603515625, "reward_std": 0.2050938606262207, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 774.412109375, "completions/mean_terminated_length": 774.412109375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.06998378424511394, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13788676915508966, "kl": 0.04345703125, "learning_rate": 1.392491467576792e-05, "loss": 0.0058, "num_tokens": 101572512.0, "reward": 2.0732421875, "reward_std": 0.13043171167373657, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 753.552734375, "completions/mean_terminated_length": 753.552734375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.07032516855850474, "frac_reward_zero_std": 0.59375, "grad_norm": 0.17180746455244172, "kl": 0.0438232421875, "learning_rate": 1.3993174061433447e-05, "loss": 0.0087, "num_tokens": 102049211.0, "reward": 2.1103515625, "reward_std": 0.1656356155872345, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 660.513671875, "completions/mean_terminated_length": 659.5361938476562, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.07066655287189554, "frac_reward_zero_std": 0.75, "grad_norm": 0.15190415782324365, "kl": 0.07513427734375, "learning_rate": 1.4061433447098978e-05, "loss": -0.0063, "num_tokens": 102468418.0, "reward": 2.0595703125, "reward_std": 0.10607418417930603, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 582.7265625, "completions/mean_terminated_length": 579.8590698242188, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.07100793718528634, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12997009135925297, "kl": 0.0628662109375, "learning_rate": 1.4129692832764506e-05, "loss": 0.0007, "num_tokens": 102845190.0, "reward": 2.11865234375, "reward_std": 0.11820854246616364, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 642.28515625, "completions/mean_terminated_length": 642.28515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.07134932149867713, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1923476435117786, "kl": 0.064697265625, "learning_rate": 1.4197952218430035e-05, "loss": 0.0137, "num_tokens": 103254312.0, "reward": 2.1123046875, "reward_std": 0.20182780921459198, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 615.8359375, "completions/mean_terminated_length": 613.8333740234375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.07169070581206793, "frac_reward_zero_std": 0.6875, "grad_norm": 0.34994112288612084, "kl": 0.10986328125, "learning_rate": 1.4266211604095564e-05, "loss": 0.0077, "num_tokens": 103644516.0, "reward": 2.05029296875, "reward_std": 0.12004424631595612, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 678.701171875, "completions/mean_terminated_length": 678.1995849609375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07203209012545873, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1501660733218651, "kl": 0.080322265625, "learning_rate": 1.4334470989761093e-05, "loss": -0.0009, "num_tokens": 104074683.0, "reward": 2.1484375, "reward_std": 0.15458112955093384, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 714.306640625, "completions/mean_terminated_length": 714.306640625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07237347443884953, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13878688742535583, "kl": 0.0601806640625, "learning_rate": 1.4402730375426624e-05, "loss": 0.0043, "num_tokens": 104528040.0, "reward": 2.10791015625, "reward_std": 0.10487570613622665, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 825.927734375, "completions/mean_terminated_length": 818.7249755859375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07271485875224033, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14206806949863637, "kl": 0.05316162109375, "learning_rate": 1.4470989761092152e-05, "loss": 0.0001, "num_tokens": 105043587.0, "reward": 2.0888671875, "reward_std": 0.17081201076507568, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 753.8359375, "completions/mean_terminated_length": 753.8359375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.07305624306563113, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11066360149316294, "kl": 0.05523681640625, "learning_rate": 1.4539249146757681e-05, "loss": 0.0085, "num_tokens": 105510031.0, "reward": 2.0390625, "reward_std": 0.08203192055225372, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 820.064453125, "completions/mean_terminated_length": 820.064453125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.07339762737902193, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11683581326795346, "kl": 0.0531005859375, "learning_rate": 1.4607508532423209e-05, "loss": -0.0036, "num_tokens": 106014928.0, "reward": 2.07373046875, "reward_std": 0.10808777064085007, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 881.078125, "completions/mean_terminated_length": 881.078125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.07373901169241273, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11135721106525936, "kl": 0.04913330078125, "learning_rate": 1.467576791808874e-05, "loss": 0.0128, "num_tokens": 106547048.0, "reward": 2.13671875, "reward_std": 0.13178187608718872, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 899.015625, "completions/mean_terminated_length": 899.015625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.07408039600580353, "frac_reward_zero_std": 0.65625, "grad_norm": 0.17342700103013456, "kl": 0.05108642578125, "learning_rate": 1.4744027303754267e-05, "loss": 0.0165, "num_tokens": 107089248.0, "reward": 2.115234375, "reward_std": 0.14372749626636505, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 852.74609375, "completions/mean_terminated_length": 852.74609375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.07442178031919433, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12530949852364104, "kl": 0.0540771484375, "learning_rate": 1.4812286689419796e-05, "loss": 0.0174, "num_tokens": 107605886.0, "reward": 2.09326171875, "reward_std": 0.09841175377368927, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 913.04296875, "completions/mean_terminated_length": 904.1063232421875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.07476316463258513, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10517861475734508, "kl": 0.0501708984375, "learning_rate": 1.4880546075085325e-05, "loss": 0.0119, "num_tokens": 108169700.0, "reward": 2.0888671875, "reward_std": 0.13363876938819885, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 881.4140625, "completions/mean_terminated_length": 881.4140625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.07510454894597593, "frac_reward_zero_std": 0.625, "grad_norm": 0.12161946888216645, "kl": 0.05035400390625, "learning_rate": 1.4948805460750855e-05, "loss": 0.0074, "num_tokens": 108707768.0, "reward": 2.09912109375, "reward_std": 0.13659736514091492, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 853.765625, "completions/mean_terminated_length": 853.765625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.07544593325936673, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1544372551126634, "kl": 0.0484619140625, "learning_rate": 1.5017064846416382e-05, "loss": 0.008, "num_tokens": 109223248.0, "reward": 2.11474609375, "reward_std": 0.1904720515012741, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 781.4609375, "completions/mean_terminated_length": 781.4609375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.07578731757275753, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11161178093920547, "kl": 0.05352783203125, "learning_rate": 1.5085324232081913e-05, "loss": 0.0038, "num_tokens": 109707676.0, "reward": 2.154296875, "reward_std": 0.10909011214971542, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 771.978515625, "completions/mean_terminated_length": 769.4813842773438, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.07612870188614833, "frac_reward_zero_std": 0.625, "grad_norm": 0.8397392186157854, "kl": 0.05279541015625, "learning_rate": 1.515358361774744e-05, "loss": 0.018, "num_tokens": 110187121.0, "reward": 2.14453125, "reward_std": 0.1571779102087021, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 753.470703125, "completions/mean_terminated_length": 753.470703125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.07647008619953913, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13024321393607402, "kl": 0.0572509765625, "learning_rate": 1.522184300341297e-05, "loss": 0.0052, "num_tokens": 110655634.0, "reward": 2.0546875, "reward_std": 0.1049705445766449, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 737.85546875, "completions/mean_terminated_length": 737.85546875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.07681147051292993, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11578378454876567, "kl": 0.054931640625, "learning_rate": 1.52901023890785e-05, "loss": 0.002, "num_tokens": 111119960.0, "reward": 2.0703125, "reward_std": 0.11778736114501953, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 730.658203125, "completions/mean_terminated_length": 730.658203125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.07715285482632073, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1432708176903187, "kl": 0.05450439453125, "learning_rate": 1.5358361774744027e-05, "loss": 0.012, "num_tokens": 111583177.0, "reward": 2.115234375, "reward_std": 0.16122952103614807, "rewards/accuracy_reward/mean": 0.11895161122083664, "rewards/accuracy_reward/std": 0.3240584135055542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 790.8046875, "completions/mean_terminated_length": 790.8046875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.07749423913971153, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12491908633437313, "kl": 0.05242919921875, "learning_rate": 1.5426621160409558e-05, "loss": -0.002, "num_tokens": 112072869.0, "reward": 2.08984375, "reward_std": 0.13618168234825134, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 838.80078125, "completions/mean_terminated_length": 838.80078125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.07783562345310233, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1376432025244319, "kl": 0.0511474609375, "learning_rate": 1.5494880546075085e-05, "loss": -0.0023, "num_tokens": 112579167.0, "reward": 2.0712890625, "reward_std": 0.14074990153312683, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1924.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 818.17578125, "completions/mean_terminated_length": 818.17578125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.07817700776649313, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12175992150997472, "kl": 0.0501708984375, "learning_rate": 1.5563139931740616e-05, "loss": 0.0137, "num_tokens": 113088329.0, "reward": 2.138671875, "reward_std": 0.1669841706752777, "rewards/accuracy_reward/mean": 0.14314515888690948, "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 902.98828125, "completions/mean_terminated_length": 902.98828125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.07851839207988393, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10308985147187796, "kl": 0.0496826171875, "learning_rate": 1.5631399317406144e-05, "loss": -0.0005, "num_tokens": 113640195.0, "reward": 2.048828125, "reward_std": 0.08643084019422531, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 863.966796875, "completions/mean_terminated_length": 861.6497192382812, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.07885977639327472, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14411715867408834, "kl": 0.050048828125, "learning_rate": 1.5699658703071675e-05, "loss": 0.0042, "num_tokens": 114164530.0, "reward": 2.107421875, "reward_std": 0.16251389682292938, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 840.857421875, "completions/mean_terminated_length": 840.857421875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.07920116070666552, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11058437690606583, "kl": 0.0478515625, "learning_rate": 1.5767918088737202e-05, "loss": 0.0086, "num_tokens": 114680505.0, "reward": 2.0498046875, "reward_std": 0.13923382759094238, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 871.998046875, "completions/mean_terminated_length": 869.6966552734375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07954254502005632, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11782399762849323, "kl": 0.04913330078125, "learning_rate": 1.5836177474402733e-05, "loss": 0.0198, "num_tokens": 115208424.0, "reward": 2.052734375, "reward_std": 0.1163494735956192, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 802.15625, "completions/mean_terminated_length": 802.15625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.07988392933344712, "frac_reward_zero_std": 0.84375, "grad_norm": 0.08168179379577921, "kl": 0.05584716796875, "learning_rate": 1.590443686006826e-05, "loss": 0.0013, "num_tokens": 115705640.0, "reward": 2.04541015625, "reward_std": 0.07521604001522064, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 808.26953125, "completions/mean_terminated_length": 808.26953125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.08022531364683792, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12115610126401027, "kl": 0.05322265625, "learning_rate": 1.5972696245733788e-05, "loss": 0.009, "num_tokens": 116197778.0, "reward": 2.08251953125, "reward_std": 0.13635608553886414, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 763.16015625, "completions/mean_terminated_length": 760.6458129882812, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08056669796022872, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1295268184458147, "kl": 0.05419921875, "learning_rate": 1.604095563139932e-05, "loss": 0.0165, "num_tokens": 116668532.0, "reward": 2.1025390625, "reward_std": 0.16994675993919373, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 847.44140625, "completions/mean_terminated_length": 847.44140625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.08090808227361952, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12498712693399602, "kl": 0.0518798828125, "learning_rate": 1.6109215017064847e-05, "loss": 0.0115, "num_tokens": 117191142.0, "reward": 2.1064453125, "reward_std": 0.18117861449718475, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 723.119140625, "completions/mean_terminated_length": 723.119140625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.08124946658701032, "frac_reward_zero_std": 0.75, "grad_norm": 0.15185799797298982, "kl": 0.0633544921875, "learning_rate": 1.6177474402730378e-05, "loss": -0.0, "num_tokens": 117641075.0, "reward": 2.04443359375, "reward_std": 0.09933625161647797, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 719.421875, "completions/mean_terminated_length": 719.421875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.08159085090040112, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1789907588144806, "kl": 0.05731201171875, "learning_rate": 1.6245733788395905e-05, "loss": 0.0048, "num_tokens": 118095723.0, "reward": 2.111328125, "reward_std": 0.19588729739189148, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 748.248046875, "completions/mean_terminated_length": 748.248046875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.08193223521379192, "frac_reward_zero_std": 0.15625, "grad_norm": 0.21490496605666, "kl": 0.0582275390625, "learning_rate": 1.6313993174061436e-05, "loss": -0.0011, "num_tokens": 118557930.0, "reward": 1.9296875, "reward_std": 0.3996448516845703, "rewards/accuracy_reward/mean": 0.08467742055654526, "rewards/accuracy_reward/std": 0.278682142496109, "rewards/format_reward/mean": 0.884765625, "rewards/format_reward/std": 0.3196168541908264, "rewards/tag_count_reward/mean": 0.962890625, "rewards/tag_count_reward/std": 0.1205006018280983, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 673.337890625, "completions/mean_terminated_length": 673.337890625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.08227361952718272, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1969671578914452, "kl": 0.06268310546875, "learning_rate": 1.6382252559726964e-05, "loss": -0.0025, "num_tokens": 118981671.0, "reward": 2.0009765625, "reward_std": 0.29356300830841064, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.931640625, "rewards/format_reward/std": 0.25260838866233826, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.08747007697820663, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 777.1640625, "completions/mean_terminated_length": 777.1640625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.08261500384057352, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1495775322399563, "kl": 0.0511474609375, "learning_rate": 1.6450511945392495e-05, "loss": -0.0026, "num_tokens": 119463707.0, "reward": 2.1015625, "reward_std": 0.20735880732536316, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 718.203125, "completions/mean_terminated_length": 718.203125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.08295638815396432, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14435066854919656, "kl": 0.060302734375, "learning_rate": 1.6518771331058022e-05, "loss": -0.0043, "num_tokens": 119916531.0, "reward": 2.12109375, "reward_std": 0.16134843230247498, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 754.341796875, "completions/mean_terminated_length": 754.341796875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.08329777246735512, "frac_reward_zero_std": 0.625, "grad_norm": 0.12802397730769152, "kl": 0.05645751953125, "learning_rate": 1.658703071672355e-05, "loss": 0.0014, "num_tokens": 120387026.0, "reward": 2.08544921875, "reward_std": 0.1342269331216812, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 761.328125, "completions/mean_terminated_length": 761.328125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.08363915678074592, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11904279595985516, "kl": 0.055908203125, "learning_rate": 1.6655290102389077e-05, "loss": 0.0016, "num_tokens": 120856714.0, "reward": 2.03662109375, "reward_std": 0.09487161040306091, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 833.255859375, "completions/mean_terminated_length": 830.878662109375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.08398054109413672, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10419643328334756, "kl": 0.0576171875, "learning_rate": 1.6723549488054608e-05, "loss": 0.016, "num_tokens": 121368109.0, "reward": 2.06640625, "reward_std": 0.130885511636734, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 821.78125, "completions/mean_terminated_length": 821.78125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.08432192540752752, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12737462173837538, "kl": 0.0552978515625, "learning_rate": 1.6791808873720136e-05, "loss": 0.0082, "num_tokens": 121876285.0, "reward": 2.09912109375, "reward_std": 0.16816964745521545, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 825.47265625, "completions/mean_terminated_length": 824.7905883789062, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.08466330972091832, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14988861413126375, "kl": 0.06524658203125, "learning_rate": 1.6860068259385667e-05, "loss": 0.0027, "num_tokens": 122386079.0, "reward": 2.07421875, "reward_std": 0.10465700179338455, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 782.625, "completions/mean_terminated_length": 781.7299194335938, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08500469403430912, "frac_reward_zero_std": 0.6875, "grad_norm": 359.9371102655088, "kl": 6.91656494140625, "learning_rate": 1.6928327645051198e-05, "loss": 0.2721, "num_tokens": 122873295.0, "reward": 2.07275390625, "reward_std": 0.12266142666339874, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 778.259765625, "completions/mean_terminated_length": 772.037353515625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08534607834769992, "frac_reward_zero_std": 0.75, "grad_norm": 2.1571904627230327, "kl": 0.12579345703125, "learning_rate": 1.6996587030716725e-05, "loss": 0.0091, "num_tokens": 123349524.0, "reward": 2.05810546875, "reward_std": 0.11230636388063431, "rewards/accuracy_reward/mean": 0.07083333283662796, "rewards/accuracy_reward/std": 0.2568138837814331, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 808.296875, "completions/mean_terminated_length": 805.6373291015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.08568746266109073, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1545935952077004, "kl": 0.08184814453125, "learning_rate": 1.7064846416382256e-05, "loss": 0.0112, "num_tokens": 123844556.0, "reward": 2.0791015625, "reward_std": 0.11278925091028214, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 798.93359375, "completions/mean_terminated_length": 785.169677734375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.08602884697448153, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19807682397661, "kl": 0.231201171875, "learning_rate": 1.7133105802047784e-05, "loss": 0.0157, "num_tokens": 124346410.0, "reward": 2.07470703125, "reward_std": 0.1205042153596878, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 756.685546875, "completions/mean_terminated_length": 740.8912963867188, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.08637023128787233, "frac_reward_zero_std": 0.6875, "grad_norm": 0.29854602852492623, "kl": 0.279541015625, "learning_rate": 1.720136518771331e-05, "loss": 0.0136, "num_tokens": 124828297.0, "reward": 2.02392578125, "reward_std": 0.10065489262342453, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 751.599609375, "completions/mean_terminated_length": 733.325927734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.08671161560126313, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2052009752030875, "kl": 0.228759765625, "learning_rate": 1.726962457337884e-05, "loss": 0.0289, "num_tokens": 125287788.0, "reward": 2.1337890625, "reward_std": 0.21519465744495392, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 694.6796875, "completions/mean_terminated_length": 681.2410278320312, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.08705299991465393, "frac_reward_zero_std": 0.46875, "grad_norm": 2.353838266217558, "kl": 0.1640625, "learning_rate": 1.733788395904437e-05, "loss": 0.0276, "num_tokens": 125732824.0, "reward": 2.11962890625, "reward_std": 0.21155564486980438, "rewards/accuracy_reward/mean": 0.14314515888690948, "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.04249391704797745, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 707.177734375, "completions/mean_terminated_length": 695.711181640625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.08739438422804473, "frac_reward_zero_std": 0.59375, "grad_norm": 0.9548725137496432, "kl": 0.193115234375, "learning_rate": 1.7406143344709897e-05, "loss": 0.0085, "num_tokens": 126180035.0, "reward": 2.06005859375, "reward_std": 0.16362124681472778, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 781.845703125, "completions/mean_terminated_length": 763.8922119140625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.08773576854143553, "frac_reward_zero_std": 0.53125, "grad_norm": 0.22647947510546376, "kl": 0.141357421875, "learning_rate": 1.7474402730375428e-05, "loss": 0.0123, "num_tokens": 126661156.0, "reward": 2.076171875, "reward_std": 0.17785194516181946, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 829.72265625, "completions/mean_terminated_length": 823.2066650390625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.08807715285482633, "frac_reward_zero_std": 0.84375, "grad_norm": 0.21363530996461214, "kl": 0.09161376953125, "learning_rate": 1.7542662116040956e-05, "loss": 0.0167, "num_tokens": 127160566.0, "reward": 2.0166015625, "reward_std": 0.0596856027841568, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 777.017578125, "completions/mean_terminated_length": 777.017578125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.08841853716821713, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11251643324884836, "kl": 0.062255859375, "learning_rate": 1.7610921501706487e-05, "loss": 0.0066, "num_tokens": 127638463.0, "reward": 2.13134765625, "reward_std": 0.1087883859872818, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 828.14453125, "completions/mean_terminated_length": 820.575927734375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.08875992148160793, "frac_reward_zero_std": 0.75, "grad_norm": 3.199992004158806, "kl": 0.20172119140625, "learning_rate": 1.7679180887372018e-05, "loss": -0.0008, "num_tokens": 128147369.0, "reward": 2.03076171875, "reward_std": 0.0877925306558609, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 861.720703125, "completions/mean_terminated_length": 858.1572265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.08910130579499873, "frac_reward_zero_std": 0.625, "grad_norm": 0.18081466761179749, "kl": 0.1002197265625, "learning_rate": 1.7747440273037545e-05, "loss": 0.0063, "num_tokens": 128668378.0, "reward": 2.0703125, "reward_std": 0.1302781105041504, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 796.74609375, "completions/mean_terminated_length": 791.822509765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.08944269010838952, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2511194742595462, "kl": 0.1297607421875, "learning_rate": 1.7815699658703073e-05, "loss": 0.0093, "num_tokens": 129156872.0, "reward": 2.130859375, "reward_std": 0.18195047974586487, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 810.220703125, "completions/mean_terminated_length": 802.3379516601562, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.08978407442178032, "frac_reward_zero_std": 0.5, "grad_norm": 0.17884258814801382, "kl": 0.1309814453125, "learning_rate": 1.78839590443686e-05, "loss": 0.0051, "num_tokens": 129651113.0, "reward": 2.150390625, "reward_std": 0.18869252502918243, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 854.88671875, "completions/mean_terminated_length": 847.4931030273438, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09012545873517112, "frac_reward_zero_std": 0.6875, "grad_norm": 1.589935869696168, "kl": 0.165771484375, "learning_rate": 1.795221843003413e-05, "loss": 0.0074, "num_tokens": 130178015.0, "reward": 2.09130859375, "reward_std": 0.13492536544799805, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 872.5703125, "completions/mean_terminated_length": 861.2970581054688, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.09046684304856192, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15325646581710026, "kl": 0.115478515625, "learning_rate": 1.802047781569966e-05, "loss": 0.0104, "num_tokens": 130712099.0, "reward": 2.09619140625, "reward_std": 0.14421473443508148, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 932.82421875, "completions/mean_terminated_length": 927.6436767578125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.09080822736195272, "frac_reward_zero_std": 0.65625, "grad_norm": 5.507163804131017, "kl": 0.763671875, "learning_rate": 1.808873720136519e-05, "loss": 0.036, "num_tokens": 131270713.0, "reward": 2.1015625, "reward_std": 0.1281953752040863, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 887.873046875, "completions/mean_terminated_length": 885.10595703125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.09114961167534352, "frac_reward_zero_std": 0.65625, "grad_norm": 0.34134178775360924, "kl": 0.103515625, "learning_rate": 1.8156996587030717e-05, "loss": 0.0147, "num_tokens": 131809144.0, "reward": 2.14453125, "reward_std": 0.1342383325099945, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 841.32421875, "completions/mean_terminated_length": 838.9628295898438, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.09149099598873432, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12166516154879253, "kl": 0.08154296875, "learning_rate": 1.8225255972696248e-05, "loss": 0.0138, "num_tokens": 132322286.0, "reward": 2.12939453125, "reward_std": 0.13623586297035217, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 777.619140625, "completions/mean_terminated_length": 776.6536254882812, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.09183238030212512, "frac_reward_zero_std": 0.5, "grad_norm": 1.443654013334719, "kl": 0.0970458984375, "learning_rate": 1.8293515358361776e-05, "loss": 0.0158, "num_tokens": 132798827.0, "reward": 2.15087890625, "reward_std": 0.1936877965927124, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 762.041015625, "completions/mean_terminated_length": 762.041015625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.09217376461551592, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11553405890227907, "kl": 0.0706787109375, "learning_rate": 1.8361774744027307e-05, "loss": 0.0098, "num_tokens": 133265984.0, "reward": 2.10302734375, "reward_std": 0.1249895766377449, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1994.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 740.82421875, "completions/mean_terminated_length": 740.82421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.09251514892890672, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12114695461176872, "kl": 0.071044921875, "learning_rate": 1.8430034129692834e-05, "loss": 0.0027, "num_tokens": 133724822.0, "reward": 2.099609375, "reward_std": 0.11933182179927826, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 680.205078125, "completions/mean_terminated_length": 680.205078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.09285653324229752, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13430002889202572, "kl": 0.067626953125, "learning_rate": 1.8498293515358362e-05, "loss": 0.0016, "num_tokens": 134152815.0, "reward": 2.11865234375, "reward_std": 0.14578565955162048, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 716.84375, "completions/mean_terminated_length": 716.84375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.09319791755568832, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12395104596807272, "kl": 0.0684814453125, "learning_rate": 1.8566552901023893e-05, "loss": 0.0139, "num_tokens": 134603519.0, "reward": 2.111328125, "reward_std": 0.15086117386817932, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 753.46484375, "completions/mean_terminated_length": 753.46484375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.09353930186907912, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12155639272852196, "kl": 0.0621337890625, "learning_rate": 1.863481228668942e-05, "loss": 0.0028, "num_tokens": 135066301.0, "reward": 2.10400390625, "reward_std": 0.15320807695388794, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 731.736328125, "completions/mean_terminated_length": 731.736328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.09388068618246992, "frac_reward_zero_std": 0.75, "grad_norm": 0.10532726159961442, "kl": 0.06268310546875, "learning_rate": 1.870307167235495e-05, "loss": 0.0042, "num_tokens": 135529638.0, "reward": 2.1171875, "reward_std": 0.10421229898929596, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 835.513671875, "completions/mean_terminated_length": 833.140869140625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09422207049586072, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10699638443235238, "kl": 0.054443359375, "learning_rate": 1.877133105802048e-05, "loss": 0.0148, "num_tokens": 136035373.0, "reward": 2.12353515625, "reward_std": 0.14420121908187866, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 788.59375, "completions/mean_terminated_length": 783.6549682617188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.09456345480925152, "frac_reward_zero_std": 0.65625, "grad_norm": 0.117577444229771, "kl": 0.06219482421875, "learning_rate": 1.883959044368601e-05, "loss": 0.0153, "num_tokens": 136519053.0, "reward": 2.0888671875, "reward_std": 0.13002407550811768, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 775.13671875, "completions/mean_terminated_length": 775.13671875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.09490483912264232, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11337002378836218, "kl": 0.06591796875, "learning_rate": 1.8907849829351537e-05, "loss": -0.0001, "num_tokens": 136997507.0, "reward": 2.08544921875, "reward_std": 0.13769380748271942, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 782.837890625, "completions/mean_terminated_length": 782.837890625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.09524622343603312, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11347581278692712, "kl": 0.0650634765625, "learning_rate": 1.8976109215017068e-05, "loss": 0.0067, "num_tokens": 137493744.0, "reward": 2.07763671875, "reward_std": 0.1096266657114029, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 841.396484375, "completions/mean_terminated_length": 839.0352172851562, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.09558760774942392, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10968999747167223, "kl": 0.0599365234375, "learning_rate": 1.9044368600682596e-05, "loss": 0.0115, "num_tokens": 137998971.0, "reward": 2.03369140625, "reward_std": 0.10180558264255524, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 788.005859375, "completions/mean_terminated_length": 786.4990234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09592899206281472, "frac_reward_zero_std": 0.65625, "grad_norm": 723318.6472364176, "kl": 62720.048828125, "learning_rate": 1.9112627986348123e-05, "loss": 2505.5159, "num_tokens": 138482782.0, "reward": 2.0966796875, "reward_std": 0.13820309937000275, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 869.248046875, "completions/mean_terminated_length": 869.248046875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.09627037637620552, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10269849841159259, "kl": 0.06268310546875, "learning_rate": 1.918088737201365e-05, "loss": 0.0056, "num_tokens": 139003997.0, "reward": 2.08935546875, "reward_std": 0.13919848203659058, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 870.3515625, "completions/mean_terminated_length": 870.3515625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.09661176068959632, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09195470691210171, "kl": 0.0657958984375, "learning_rate": 1.924914675767918e-05, "loss": 0.0011, "num_tokens": 139543537.0, "reward": 2.05224609375, "reward_std": 0.09239605814218521, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 795.634765625, "completions/mean_terminated_length": 793.1839599609375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.09695314500298712, "frac_reward_zero_std": 0.625, "grad_norm": 0.1270516701647163, "kl": 0.0670166015625, "learning_rate": 1.9317406143344713e-05, "loss": 0.0114, "num_tokens": 140024966.0, "reward": 2.13134765625, "reward_std": 0.15914222598075867, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 828.564453125, "completions/mean_terminated_length": 828.564453125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.09729452931637791, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12844022110649664, "kl": 0.0714111328125, "learning_rate": 1.938566552901024e-05, "loss": 0.0029, "num_tokens": 140538215.0, "reward": 2.15185546875, "reward_std": 0.15210142731666565, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 840.705078125, "completions/mean_terminated_length": 833.5894165039062, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.09763591362976871, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11564333917553804, "kl": 0.0643310546875, "learning_rate": 1.945392491467577e-05, "loss": 0.0126, "num_tokens": 141053632.0, "reward": 2.09423828125, "reward_std": 0.16099828481674194, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 790.70703125, "completions/mean_terminated_length": 790.70703125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.09797729794315951, "frac_reward_zero_std": 0.625, "grad_norm": 0.12705966520662126, "kl": 0.0677490234375, "learning_rate": 1.95221843003413e-05, "loss": 0.0066, "num_tokens": 141551018.0, "reward": 2.0830078125, "reward_std": 0.14034633338451385, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 796.09765625, "completions/mean_terminated_length": 796.09765625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.09831868225655031, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09378213540206425, "kl": 0.0728759765625, "learning_rate": 1.959044368600683e-05, "loss": 0.0002, "num_tokens": 142045564.0, "reward": 2.0546875, "reward_std": 0.077679343521595, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 777.22265625, "completions/mean_terminated_length": 774.7357788085938, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09866006656994111, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1165099937348341, "kl": 0.075439453125, "learning_rate": 1.9658703071672357e-05, "loss": 0.0112, "num_tokens": 142526846.0, "reward": 2.12255859375, "reward_std": 0.12711243331432343, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 753.1015625, "completions/mean_terminated_length": 749.2137451171875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.09900145088333191, "frac_reward_zero_std": 0.71875, "grad_norm": 5.323351121305739, "kl": 0.082275390625, "learning_rate": 1.9726962457337885e-05, "loss": 0.0155, "num_tokens": 142993826.0, "reward": 2.05859375, "reward_std": 0.09919346868991852, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 810.26171875, "completions/mean_terminated_length": 810.26171875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.09934283519672271, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08596498840323069, "kl": 0.0740966796875, "learning_rate": 1.9795221843003412e-05, "loss": 0.003, "num_tokens": 143491752.0, "reward": 2.056640625, "reward_std": 0.07780591398477554, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 776.228515625, "completions/mean_terminated_length": 776.228515625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.09968421951011351, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13905891640324397, "kl": 0.0723876953125, "learning_rate": 1.9863481228668943e-05, "loss": 0.0125, "num_tokens": 143970333.0, "reward": 2.1328125, "reward_std": 0.16537627577781677, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 756.919921875, "completions/mean_terminated_length": 756.919921875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.10002560382350431, "frac_reward_zero_std": 0.625, "grad_norm": 0.2284393339170605, "kl": 0.085693359375, "learning_rate": 1.993174061433447e-05, "loss": 0.0067, "num_tokens": 144439540.0, "reward": 2.1044921875, "reward_std": 0.1444859653711319, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 727.513671875, "completions/mean_terminated_length": 727.513671875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.10036698813689511, "frac_reward_zero_std": 0.0, "grad_norm": 0.26498941994787845, "kl": 0.0794677734375, "learning_rate": 2e-05, "loss": 0.0042, "num_tokens": 144894955.0, "reward": 1.388671875, "reward_std": 0.6424903869628906, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5002445578575134, "rewards/tag_count_reward/mean": 0.857421875, "rewards/tag_count_reward/std": 0.14898085594177246, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 761.94921875, "completions/mean_terminated_length": 759.4324951171875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.10070837245028591, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2174653794711409, "kl": 0.0770263671875, "learning_rate": 1.9999992903414513e-05, "loss": 0.0129, "num_tokens": 145364481.0, "reward": 1.73779296875, "reward_std": 0.5102505683898926, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.767578125, "rewards/format_reward/std": 0.42278963327407837, "rewards/tag_count_reward/mean": 0.93701171875, "rewards/tag_count_reward/std": 0.11936826258897781, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 664.404296875, "completions/mean_terminated_length": 664.404296875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.10104975676367671, "frac_reward_zero_std": 0.28125, "grad_norm": 0.20312343138676514, "kl": 0.0782470703125, "learning_rate": 1.9999971613668125e-05, "loss": -0.0003, "num_tokens": 145782048.0, "reward": 2.05078125, "reward_std": 0.2876293659210205, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21157780289649963, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.06301766633987427, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 767.6015625, "completions/mean_terminated_length": 757.5196533203125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.10139114107706751, "frac_reward_zero_std": 0.40625, "grad_norm": 0.19470404161093027, "kl": 0.08251953125, "learning_rate": 1.999993613079105e-05, "loss": 0.0213, "num_tokens": 146258676.0, "reward": 2.0234375, "reward_std": 0.21192438900470734, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.06374134123325348, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 724.669921875, "completions/mean_terminated_length": 722.0802001953125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.10173252539045831, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15476948721127676, "kl": 0.0784912109375, "learning_rate": 1.9999886454833646e-05, "loss": 0.0194, "num_tokens": 146708907.0, "reward": 2.11279296875, "reward_std": 0.17581740021705627, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 682.673828125, "completions/mean_terminated_length": 661.0020141601562, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10207390970384911, "frac_reward_zero_std": 0.375, "grad_norm": 0.2251341290040109, "kl": 0.0906982421875, "learning_rate": 1.999982258586643e-05, "loss": 0.0571, "num_tokens": 147144468.0, "reward": 2.09228515625, "reward_std": 0.22088834643363953, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.09712290018796921, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 730.982421875, "completions/mean_terminated_length": 730.982421875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.10241529401723991, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13798804594900327, "kl": 0.07666015625, "learning_rate": 1.9999744523980044e-05, "loss": -0.0021, "num_tokens": 147605403.0, "reward": 2.02587890625, "reward_std": 0.11270694434642792, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 771.453125, "completions/mean_terminated_length": 771.453125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.10275667833063071, "frac_reward_zero_std": 0.625, "grad_norm": 0.11516823012205396, "kl": 0.075439453125, "learning_rate": 1.9999652269285282e-05, "loss": 0.01, "num_tokens": 148080467.0, "reward": 2.12158203125, "reward_std": 0.16576609015464783, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 701.0390625, "completions/mean_terminated_length": 698.4031372070312, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.10309806264402151, "frac_reward_zero_std": 0.625, "grad_norm": 0.13799144741485156, "kl": 0.082275390625, "learning_rate": 1.9999545821913088e-05, "loss": 0.011, "num_tokens": 148522919.0, "reward": 2.06689453125, "reward_std": 0.14634603261947632, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.055034760385751724, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 885.04296875, "completions/mean_terminated_length": 882.76708984375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1034394469574123, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10732307901406357, "kl": 0.0712890625, "learning_rate": 1.999942518201454e-05, "loss": 0.0087, "num_tokens": 149074957.0, "reward": 2.0703125, "reward_std": 0.12180034816265106, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 844.28125, "completions/mean_terminated_length": 841.9256591796875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.1037808312708031, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1337035480822417, "kl": 0.075927734375, "learning_rate": 1.9999290349760866e-05, "loss": 0.0008, "num_tokens": 149588973.0, "reward": 2.09765625, "reward_std": 0.16923630237579346, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 907.333984375, "completions/mean_terminated_length": 902.86083984375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.1041222155841939, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11674576888241989, "kl": 0.070556640625, "learning_rate": 1.9999141325343436e-05, "loss": 0.0107, "num_tokens": 150132616.0, "reward": 2.06787109375, "reward_std": 0.15555807948112488, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 801.359375, "completions/mean_terminated_length": 801.359375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1044635998975847, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15802830178414942, "kl": 0.0858154296875, "learning_rate": 1.9998978108973763e-05, "loss": 0.0046, "num_tokens": 150637472.0, "reward": 2.11083984375, "reward_std": 0.16202810406684875, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 781.375, "completions/mean_terminated_length": 781.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1048049842109755, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10734775770454566, "kl": 0.0849609375, "learning_rate": 1.9998800700883506e-05, "loss": 0.0018, "num_tokens": 151115424.0, "reward": 2.046875, "reward_std": 0.08764266967773438, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.2181723266839981, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 786.919921875, "completions/mean_terminated_length": 786.919921875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.1051463685243663, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1417706454773269, "kl": 0.085693359375, "learning_rate": 1.9998609101324456e-05, "loss": 0.0119, "num_tokens": 151603927.0, "reward": 2.13427734375, "reward_std": 0.16098354756832123, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 694.1953125, "completions/mean_terminated_length": 694.1953125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.1054877528377571, "frac_reward_zero_std": 0.625, "grad_norm": 0.14435566752037493, "kl": 0.09130859375, "learning_rate": 1.9998403310568557e-05, "loss": 0.0078, "num_tokens": 152041739.0, "reward": 2.06494140625, "reward_std": 0.1365382969379425, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 686.478515625, "completions/mean_terminated_length": 686.478515625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1058291371511479, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1324242004319431, "kl": 0.09814453125, "learning_rate": 1.9998183328907892e-05, "loss": 0.0023, "num_tokens": 152471168.0, "reward": 2.07275390625, "reward_std": 0.11242619156837463, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 731.83203125, "completions/mean_terminated_length": 731.83203125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1061705214645387, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12576672743647457, "kl": 0.095947265625, "learning_rate": 1.9997949156654686e-05, "loss": 0.0089, "num_tokens": 152930666.0, "reward": 2.0361328125, "reward_std": 0.0918736457824707, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 799.49609375, "completions/mean_terminated_length": 799.49609375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1065119057779295, "frac_reward_zero_std": 0.75, "grad_norm": 0.1234459606014435, "kl": 0.09814453125, "learning_rate": 1.99977007941413e-05, "loss": -0.0005, "num_tokens": 153420184.0, "reward": 2.02490234375, "reward_std": 0.06483898311853409, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 796.271484375, "completions/mean_terminated_length": 796.271484375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.1068532900913203, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10844434982700603, "kl": 0.09326171875, "learning_rate": 1.9997438241720242e-05, "loss": 0.0114, "num_tokens": 153905651.0, "reward": 2.08203125, "reward_std": 0.11786758899688721, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 852.08984375, "completions/mean_terminated_length": 852.08984375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.1071946744047111, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12871120471516137, "kl": 0.0863037109375, "learning_rate": 1.9997161499764156e-05, "loss": 0.0098, "num_tokens": 154426353.0, "reward": 2.05029296875, "reward_std": 0.0956014096736908, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 940.095703125, "completions/mean_terminated_length": 937.9276123046875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1075360587181019, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1097926515918573, "kl": 0.0828857421875, "learning_rate": 1.9996870568665825e-05, "loss": 0.0059, "num_tokens": 154993202.0, "reward": 2.05419921875, "reward_std": 0.13586637377738953, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 875.501953125, "completions/mean_terminated_length": 875.501953125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1078774430314927, "frac_reward_zero_std": 0.75, "grad_norm": 0.09309563409428152, "kl": 0.079345703125, "learning_rate": 1.9996565448838177e-05, "loss": 0.001, "num_tokens": 155523139.0, "reward": 2.07568359375, "reward_std": 0.10555722564458847, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 859.080078125, "completions/mean_terminated_length": 859.080078125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.1082188273448835, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13395003772236655, "kl": 0.0841064453125, "learning_rate": 1.999624614071427e-05, "loss": 0.004, "num_tokens": 156043980.0, "reward": 2.0712890625, "reward_std": 0.13079217076301575, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 761.1171875, "completions/mean_terminated_length": 761.1171875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1085602116582743, "frac_reward_zero_std": 0.625, "grad_norm": 0.12900462015698402, "kl": 0.091552734375, "learning_rate": 1.999591264474731e-05, "loss": 0.003, "num_tokens": 156510552.0, "reward": 2.064453125, "reward_std": 0.12237662822008133, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 810.09765625, "completions/mean_terminated_length": 810.09765625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1089015959716651, "frac_reward_zero_std": 0.75, "grad_norm": 0.09286898519575433, "kl": 0.0869140625, "learning_rate": 1.9995564961410622e-05, "loss": 0.0015, "num_tokens": 157001866.0, "reward": 2.06640625, "reward_std": 0.09413031488656998, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 836.7421875, "completions/mean_terminated_length": 836.7421875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.1092429802850559, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11236056838532936, "kl": 0.0867919921875, "learning_rate": 1.9995203091197686e-05, "loss": -0.0028, "num_tokens": 157518326.0, "reward": 2.060546875, "reward_std": 0.10475420951843262, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 847.3984375, "completions/mean_terminated_length": 847.3984375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.1095843645984467, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09266463802454053, "kl": 0.0865478515625, "learning_rate": 1.999482703462211e-05, "loss": 0.0108, "num_tokens": 158032866.0, "reward": 2.125, "reward_std": 0.11712867021560669, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 856.185546875, "completions/mean_terminated_length": 856.185546875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.1099257489118375, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11080207101822266, "kl": 0.082763671875, "learning_rate": 1.999443679221764e-05, "loss": 0.0177, "num_tokens": 158548401.0, "reward": 2.060546875, "reward_std": 0.1296931505203247, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 884.94140625, "completions/mean_terminated_length": 880.3804321289062, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1102671332252283, "frac_reward_zero_std": 0.625, "grad_norm": 0.11294706743628945, "kl": 0.0848388671875, "learning_rate": 1.999403236453815e-05, "loss": 0.006, "num_tokens": 159085715.0, "reward": 2.06689453125, "reward_std": 0.14652618765830994, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 798.814453125, "completions/mean_terminated_length": 798.814453125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.1106085175386191, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1080906427593134, "kl": 0.096923828125, "learning_rate": 1.9993613752157647e-05, "loss": 0.0099, "num_tokens": 159590788.0, "reward": 2.03662109375, "reward_std": 0.09059925377368927, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 762.958984375, "completions/mean_terminated_length": 762.958984375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.1109499018520099, "frac_reward_zero_std": 0.625, "grad_norm": 0.13458700231396795, "kl": 0.0946044921875, "learning_rate": 1.9993180955670285e-05, "loss": 0.0191, "num_tokens": 160056031.0, "reward": 2.0966796875, "reward_std": 0.13284574449062347, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 793.546875, "completions/mean_terminated_length": 793.546875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.1112912861654007, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11536034579497821, "kl": 0.0855712890625, "learning_rate": 1.9992733975690333e-05, "loss": 0.0094, "num_tokens": 160542359.0, "reward": 2.06787109375, "reward_std": 0.1171584203839302, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 783.837890625, "completions/mean_terminated_length": 781.364013671875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1116326704787915, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14045556498613573, "kl": 0.0838623046875, "learning_rate": 1.9992272812852198e-05, "loss": 0.0022, "num_tokens": 161023700.0, "reward": 2.15771484375, "reward_std": 0.1727783977985382, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 765.212890625, "completions/mean_terminated_length": 765.212890625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.1119740547921823, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13461921544123806, "kl": 0.0897216796875, "learning_rate": 1.9991797467810417e-05, "loss": -0.004, "num_tokens": 161496273.0, "reward": 2.0625, "reward_std": 0.12185370922088623, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 717.736328125, "completions/mean_terminated_length": 717.736328125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1123154391055731, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11014441235960672, "kl": 0.08349609375, "learning_rate": 1.9991307941239656e-05, "loss": 0.0019, "num_tokens": 161941610.0, "reward": 2.1484375, "reward_std": 0.13392479717731476, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 779.271484375, "completions/mean_terminated_length": 779.271484375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.1126568234189639, "frac_reward_zero_std": 0.75, "grad_norm": 0.10486840887640052, "kl": 0.084228515625, "learning_rate": 1.9990804233834707e-05, "loss": 0.0052, "num_tokens": 162423125.0, "reward": 2.02587890625, "reward_std": 0.08381806313991547, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 765.798828125, "completions/mean_terminated_length": 765.798828125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.1129982077323547, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08498585393119823, "kl": 0.0872802734375, "learning_rate": 1.9990286346310494e-05, "loss": 0.0061, "num_tokens": 162903486.0, "reward": 2.083984375, "reward_std": 0.0870293527841568, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 779.298828125, "completions/mean_terminated_length": 779.298828125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.1133395920457455, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1258417442733969, "kl": 0.0892333984375, "learning_rate": 1.9989754279402055e-05, "loss": 0.0039, "num_tokens": 163387463.0, "reward": 2.07568359375, "reward_std": 0.1277276575565338, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 822.908203125, "completions/mean_terminated_length": 822.908203125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.1136809763591363, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13731040730535207, "kl": 0.09326171875, "learning_rate": 1.998920803386457e-05, "loss": 0.0105, "num_tokens": 163892488.0, "reward": 2.10107421875, "reward_std": 0.16197827458381653, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.033087924122810364, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 792.5234375, "completions/mean_terminated_length": 792.5234375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.11402236067252709, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11240160056296827, "kl": 0.0972900390625, "learning_rate": 1.9988647610473334e-05, "loss": 0.0136, "num_tokens": 164381124.0, "reward": 2.060546875, "reward_std": 0.10805702954530716, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 735.078125, "completions/mean_terminated_length": 735.078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.11436374498591789, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1425452404716953, "kl": 0.100341796875, "learning_rate": 1.998807301002376e-05, "loss": 0.0048, "num_tokens": 164840124.0, "reward": 2.13818359375, "reward_std": 0.16893647611141205, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 759.025390625, "completions/mean_terminated_length": 759.025390625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.11470512929930869, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14623629243957023, "kl": 0.0946044921875, "learning_rate": 1.9987484233331394e-05, "loss": 0.0074, "num_tokens": 165308249.0, "reward": 2.17431640625, "reward_std": 0.24441897869110107, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1994.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 805.2421875, "completions/mean_terminated_length": 805.2421875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.11504651361269949, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11998362285294985, "kl": 0.091796875, "learning_rate": 1.9986881281231894e-05, "loss": 0.0088, "num_tokens": 165802885.0, "reward": 2.0693359375, "reward_std": 0.12693575024604797, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 752.029296875, "completions/mean_terminated_length": 744.3909912109375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.11538789792609029, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14207064244705112, "kl": 0.098388671875, "learning_rate": 1.9986264154581043e-05, "loss": 0.0295, "num_tokens": 166279140.0, "reward": 2.0703125, "reward_std": 0.14308004081249237, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 709.94140625, "completions/mean_terminated_length": 709.94140625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.11572928223948109, "frac_reward_zero_std": 0.625, "grad_norm": 0.13557320410367782, "kl": 0.1024169921875, "learning_rate": 1.9985632854254735e-05, "loss": 0.0082, "num_tokens": 166729798.0, "reward": 2.1015625, "reward_std": 0.15074807405471802, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 738.7578125, "completions/mean_terminated_length": 738.7578125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.11607066655287189, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1219205658002744, "kl": 0.09716796875, "learning_rate": 1.9984987381148987e-05, "loss": 0.0073, "num_tokens": 167185610.0, "reward": 2.05078125, "reward_std": 0.08957062661647797, "rewards/accuracy_reward/mean": 0.052419353276491165, "rewards/accuracy_reward/std": 0.22309619188308716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 766.125, "completions/mean_terminated_length": 766.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.11641205086626269, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14500163608651948, "kl": 0.0902099609375, "learning_rate": 1.9984327736179934e-05, "loss": -0.0047, "num_tokens": 167662826.0, "reward": 2.15380859375, "reward_std": 0.18714523315429688, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 785.841796875, "completions/mean_terminated_length": 785.841796875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.11675343517965349, "frac_reward_zero_std": 0.625, "grad_norm": 0.12591349890601516, "kl": 0.0882568359375, "learning_rate": 1.9983653920283816e-05, "loss": 0.0051, "num_tokens": 168146457.0, "reward": 2.087890625, "reward_std": 0.13968831300735474, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 777.6640625, "completions/mean_terminated_length": 777.6640625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.11709481949304429, "frac_reward_zero_std": 0.625, "grad_norm": 0.1515651573777293, "kl": 0.0919189453125, "learning_rate": 1.9982965934416992e-05, "loss": 0.0205, "num_tokens": 168618989.0, "reward": 2.06005859375, "reward_std": 0.13591766357421875, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 789.8203125, "completions/mean_terminated_length": 789.8203125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.11743620380643509, "frac_reward_zero_std": 0.75, "grad_norm": 0.1044941307707021, "kl": 0.0919189453125, "learning_rate": 1.9982263779555937e-05, "loss": 0.0073, "num_tokens": 169105073.0, "reward": 2.0693359375, "reward_std": 0.11075945198535919, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 804.740234375, "completions/mean_terminated_length": 804.740234375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.11777758811982589, "frac_reward_zero_std": 0.625, "grad_norm": 0.11811693056645911, "kl": 0.0867919921875, "learning_rate": 1.9981547456697224e-05, "loss": 0.0079, "num_tokens": 169601884.0, "reward": 2.142578125, "reward_std": 0.13083267211914062, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 800.126953125, "completions/mean_terminated_length": 800.126953125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.11811897243321669, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11324067144848583, "kl": 0.0889892578125, "learning_rate": 1.998081696685755e-05, "loss": 0.0042, "num_tokens": 170100397.0, "reward": 2.064453125, "reward_std": 0.09149399399757385, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 703.642578125, "completions/mean_terminated_length": 703.642578125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11846035674660749, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12645219617210404, "kl": 0.08837890625, "learning_rate": 1.9980072311073706e-05, "loss": 0.0054, "num_tokens": 170540342.0, "reward": 2.154296875, "reward_std": 0.14833936095237732, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 768.53515625, "completions/mean_terminated_length": 768.53515625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.11880174105999829, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13804678370299253, "kl": 0.0875244140625, "learning_rate": 1.99793134904026e-05, "loss": 0.0016, "num_tokens": 171016904.0, "reward": 2.13671875, "reward_std": 0.1797829270362854, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 827.408203125, "completions/mean_terminated_length": 827.408203125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.11914312537338909, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11878817593392821, "kl": 0.0821533203125, "learning_rate": 1.997854050592123e-05, "loss": 0.0044, "num_tokens": 171523385.0, "reward": 2.08544921875, "reward_std": 0.14224785566329956, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 771.5625, "completions/mean_terminated_length": 771.5625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.11948450968677989, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09769351481234359, "kl": 0.092529296875, "learning_rate": 1.9977753358726715e-05, "loss": 0.0039, "num_tokens": 171994873.0, "reward": 2.0361328125, "reward_std": 0.08903916925191879, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 834.404296875, "completions/mean_terminated_length": 834.404296875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.1198258940001707, "frac_reward_zero_std": 0.625, "grad_norm": 0.1119095327445531, "kl": 0.08544921875, "learning_rate": 1.9976952049936262e-05, "loss": 0.0028, "num_tokens": 172506936.0, "reward": 2.0947265625, "reward_std": 0.15126933157444, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 800.146484375, "completions/mean_terminated_length": 800.146484375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1201672783135615, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11402592873573691, "kl": 0.091064453125, "learning_rate": 1.9976136580687183e-05, "loss": 0.0077, "num_tokens": 172989363.0, "reward": 2.0361328125, "reward_std": 0.10147039592266083, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 833.041015625, "completions/mean_terminated_length": 833.041015625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.1205086626269523, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10423898410132625, "kl": 0.0897216796875, "learning_rate": 1.9975306952136893e-05, "loss": 0.0019, "num_tokens": 173504360.0, "reward": 2.0458984375, "reward_std": 0.09826165437698364, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 816.94921875, "completions/mean_terminated_length": 816.94921875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.1208500469403431, "frac_reward_zero_std": 0.78125, "grad_norm": 0.10439918931659659, "kl": 0.09521484375, "learning_rate": 1.9974463165462887e-05, "loss": 0.0072, "num_tokens": 174009774.0, "reward": 2.08203125, "reward_std": 0.09131664782762527, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 795.23828125, "completions/mean_terminated_length": 795.23828125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1211914312537339, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14936536535152534, "kl": 0.0938720703125, "learning_rate": 1.9973605221862776e-05, "loss": 0.0144, "num_tokens": 174497432.0, "reward": 2.08935546875, "reward_std": 0.1668163537979126, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 750.48046875, "completions/mean_terminated_length": 750.48046875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.1215328155671247, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12478994187117828, "kl": 0.09130859375, "learning_rate": 1.9972733122554246e-05, "loss": 0.0065, "num_tokens": 174967902.0, "reward": 2.0576171875, "reward_std": 0.11904998123645782, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 757.544921875, "completions/mean_terminated_length": 757.544921875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.1218741998805155, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13902654112347654, "kl": 0.093505859375, "learning_rate": 1.997184686877509e-05, "loss": 0.0191, "num_tokens": 175437461.0, "reward": 2.10888671875, "reward_std": 0.18727843463420868, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 656.646484375, "completions/mean_terminated_length": 656.646484375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1222155841939063, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12879389052284673, "kl": 0.094970703125, "learning_rate": 1.9970946461783177e-05, "loss": 0.0041, "num_tokens": 175870048.0, "reward": 2.15283203125, "reward_std": 0.12729500234127045, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 729.427734375, "completions/mean_terminated_length": 729.427734375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.1225569685072971, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1293848095615603, "kl": 0.0953369140625, "learning_rate": 1.9970031902856476e-05, "loss": 0.0027, "num_tokens": 176331019.0, "reward": 2.05859375, "reward_std": 0.1205422505736351, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 691.744140625, "completions/mean_terminated_length": 689.0900268554688, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1228983528206879, "frac_reward_zero_std": 0.78125, "grad_norm": 0.12453696141727835, "kl": 0.0986328125, "learning_rate": 1.996910319329303e-05, "loss": 0.009, "num_tokens": 176773688.0, "reward": 2.0205078125, "reward_std": 0.07692177593708038, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 699.576171875, "completions/mean_terminated_length": 699.576171875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.1232397371340787, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15078949386282842, "kl": 0.08935546875, "learning_rate": 1.9968160334410976e-05, "loss": 0.0189, "num_tokens": 177219327.0, "reward": 2.12353515625, "reward_std": 0.20991584658622742, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 656.26953125, "completions/mean_terminated_length": 656.26953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1235811214474695, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15778133737171, "kl": 0.08984375, "learning_rate": 1.9967203327548527e-05, "loss": 0.0099, "num_tokens": 177643321.0, "reward": 2.0693359375, "reward_std": 0.1533603072166443, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 663.390625, "completions/mean_terminated_length": 663.390625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1239225057608603, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13963434218825113, "kl": 0.08544921875, "learning_rate": 1.9966232174063983e-05, "loss": 0.015, "num_tokens": 178073585.0, "reward": 2.11572265625, "reward_std": 0.1529872566461563, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 762.7109375, "completions/mean_terminated_length": 762.7109375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1242638900742511, "frac_reward_zero_std": 0.625, "grad_norm": 0.11541158676667476, "kl": 0.08837890625, "learning_rate": 1.9965246875335715e-05, "loss": 0.0101, "num_tokens": 178537213.0, "reward": 2.05517578125, "reward_std": 0.14906933903694153, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 723.421875, "completions/mean_terminated_length": 723.421875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.12460527438764189, "frac_reward_zero_std": 0.5, "grad_norm": 0.1343062275933729, "kl": 0.0848388671875, "learning_rate": 1.9964247432762177e-05, "loss": 0.0031, "num_tokens": 178989029.0, "reward": 2.1357421875, "reward_std": 0.18455973267555237, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 780.552734375, "completions/mean_terminated_length": 778.0723876953125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.12494665870103269, "frac_reward_zero_std": 0.5, "grad_norm": 0.1471102895771736, "kl": 0.0889892578125, "learning_rate": 1.9963233847761896e-05, "loss": 0.0129, "num_tokens": 179471104.0, "reward": 2.09521484375, "reward_std": 0.20765116810798645, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 770.0, "completions/mean_terminated_length": 770.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.12528804301442348, "frac_reward_zero_std": 0.625, "grad_norm": 0.1249761874509325, "kl": 0.085693359375, "learning_rate": 1.9962206121773464e-05, "loss": 0.0088, "num_tokens": 179948336.0, "reward": 2.080078125, "reward_std": 0.14690595865249634, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 796.291015625, "completions/mean_terminated_length": 796.291015625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.1256294273278143, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1485732859829053, "kl": 0.0819091796875, "learning_rate": 1.9961164256255557e-05, "loss": 0.0047, "num_tokens": 180433989.0, "reward": 2.11328125, "reward_std": 0.15694251656532288, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 754.029296875, "completions/mean_terminated_length": 754.029296875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.12597081164120508, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2274863685629634, "kl": 0.102294921875, "learning_rate": 1.996010825268691e-05, "loss": 0.0111, "num_tokens": 180898228.0, "reward": 2.10986328125, "reward_std": 0.16645778715610504, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 782.28515625, "completions/mean_terminated_length": 779.8082275390625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1263121959545959, "frac_reward_zero_std": 0.5, "grad_norm": 0.14895767259855133, "kl": 0.089111328125, "learning_rate": 1.995903811256633e-05, "loss": 0.02, "num_tokens": 181380342.0, "reward": 2.1572265625, "reward_std": 0.18110236525535583, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03484956547617912, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 753.3515625, "completions/mean_terminated_length": 753.3515625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.12665358026798668, "frac_reward_zero_std": 0.75, "grad_norm": 0.10811234914160572, "kl": 0.0936279296875, "learning_rate": 1.9957953837412678e-05, "loss": 0.0055, "num_tokens": 181848346.0, "reward": 2.0380859375, "reward_std": 0.0845898687839508, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 749.42578125, "completions/mean_terminated_length": 749.42578125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.1269949645813775, "frac_reward_zero_std": 0.75, "grad_norm": 0.11296979749781694, "kl": 0.107666015625, "learning_rate": 1.9956855428764892e-05, "loss": 0.0059, "num_tokens": 182328708.0, "reward": 2.03564453125, "reward_std": 0.08562840521335602, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.045470330864191055, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 799.99609375, "completions/mean_terminated_length": 799.99609375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.12733634889476828, "frac_reward_zero_std": 0.5, "grad_norm": 0.13488082551576472, "kl": 0.0965576171875, "learning_rate": 1.9955742888181954e-05, "loss": 0.0112, "num_tokens": 182818562.0, "reward": 2.15673828125, "reward_std": 0.17301906645298004, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 795.41796875, "completions/mean_terminated_length": 792.9667358398438, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1276777332081591, "frac_reward_zero_std": 0.625, "grad_norm": 0.13106743689143652, "kl": 0.0985107421875, "learning_rate": 1.9954616217242918e-05, "loss": 0.0211, "num_tokens": 183305640.0, "reward": 2.07373046875, "reward_std": 0.12574264407157898, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 760.8125, "completions/mean_terminated_length": 755.7647705078125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.12801911752154987, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13970074175686184, "kl": 0.1051025390625, "learning_rate": 1.995347541754689e-05, "loss": 0.0139, "num_tokens": 183783048.0, "reward": 2.05322265625, "reward_std": 0.12589280307292938, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 786.85546875, "completions/mean_terminated_length": 786.85546875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.1283605018349407, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11527793642690921, "kl": 0.099853515625, "learning_rate": 1.995232049071302e-05, "loss": 0.0073, "num_tokens": 184272078.0, "reward": 2.03662109375, "reward_std": 0.10159565508365631, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 741.677734375, "completions/mean_terminated_length": 741.677734375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.12870188614833147, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0952500342421456, "kl": 0.1024169921875, "learning_rate": 1.9951151438380515e-05, "loss": 0.0121, "num_tokens": 184725097.0, "reward": 2.072265625, "reward_std": 0.07669497281312943, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 789.53515625, "completions/mean_terminated_length": 789.53515625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.1290432704617223, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11309583684200847, "kl": 0.10400390625, "learning_rate": 1.9949968262208637e-05, "loss": 0.0053, "num_tokens": 185204443.0, "reward": 2.05908203125, "reward_std": 0.09234350919723511, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 822.630859375, "completions/mean_terminated_length": 822.630859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.12938465477511307, "frac_reward_zero_std": 0.625, "grad_norm": 0.12594030065225614, "kl": 0.096923828125, "learning_rate": 1.9948770963876683e-05, "loss": 0.0008, "num_tokens": 185706158.0, "reward": 2.0625, "reward_std": 0.1284964382648468, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 859.47265625, "completions/mean_terminated_length": 859.47265625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.12972603908850389, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10220483969680058, "kl": 0.089599609375, "learning_rate": 1.9947559545084e-05, "loss": 0.0039, "num_tokens": 186227424.0, "reward": 2.134765625, "reward_std": 0.12997153401374817, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 755.724609375, "completions/mean_terminated_length": 755.724609375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.13006742340189467, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1089771471639626, "kl": 0.0987548828125, "learning_rate": 1.994633400754998e-05, "loss": -0.0012, "num_tokens": 186697731.0, "reward": 2.07177734375, "reward_std": 0.09487161040306091, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 815.75390625, "completions/mean_terminated_length": 815.75390625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.13040880771528549, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11777347714725431, "kl": 0.095703125, "learning_rate": 1.9945094353014042e-05, "loss": 0.0039, "num_tokens": 187194789.0, "reward": 2.099609375, "reward_std": 0.12485411763191223, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 783.708984375, "completions/mean_terminated_length": 783.708984375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.13075019202867627, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13942971941628468, "kl": 0.090576171875, "learning_rate": 1.9943840583235655e-05, "loss": 0.0069, "num_tokens": 187684576.0, "reward": 2.12841796875, "reward_std": 0.18511544167995453, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 759.900390625, "completions/mean_terminated_length": 759.900390625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.13109157634206708, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11615786185818612, "kl": 0.095703125, "learning_rate": 1.994257269999431e-05, "loss": 0.004, "num_tokens": 188151245.0, "reward": 2.0859375, "reward_std": 0.1336916983127594, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 727.15625, "completions/mean_terminated_length": 727.15625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.13143296065545787, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11191479876027816, "kl": 0.0953369140625, "learning_rate": 1.9941290705089545e-05, "loss": 0.0028, "num_tokens": 188623965.0, "reward": 2.05810546875, "reward_std": 0.10465522855520248, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 738.16015625, "completions/mean_terminated_length": 738.16015625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.13177434496884868, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11815457701543179, "kl": 0.0975341796875, "learning_rate": 1.9939994600340906e-05, "loss": 0.0109, "num_tokens": 189078223.0, "reward": 2.09228515625, "reward_std": 0.11820229887962341, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 760.03515625, "completions/mean_terminated_length": 757.5146484375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.13211572928223947, "frac_reward_zero_std": 0.8125, "grad_norm": 0.09455997345741289, "kl": 0.0992431640625, "learning_rate": 1.9938684387587984e-05, "loss": 0.0059, "num_tokens": 189554913.0, "reward": 2.0166015625, "reward_std": 0.06256291270256042, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 769.302734375, "completions/mean_terminated_length": 766.8004150390625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.13245711359563028, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10659184197145534, "kl": 0.0914306640625, "learning_rate": 1.993736006869038e-05, "loss": 0.0194, "num_tokens": 190043196.0, "reward": 2.060546875, "reward_std": 0.11230767518281937, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 726.56640625, "completions/mean_terminated_length": 725.4422607421875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.13279849790902107, "frac_reward_zero_std": 0.71875, "grad_norm": 96.66304257021238, "kl": 7.8531494140625, "learning_rate": 1.9936021645527734e-05, "loss": 0.3232, "num_tokens": 190500926.0, "reward": 2.05029296875, "reward_std": 0.11104878038167953, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 780.041015625, "completions/mean_terminated_length": 780.041015625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.13313988222241188, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09793672926940249, "kl": 0.09423828125, "learning_rate": 1.993466911999968e-05, "loss": 0.01, "num_tokens": 190983107.0, "reward": 2.130859375, "reward_std": 0.13393032550811768, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 772.58203125, "completions/mean_terminated_length": 772.58203125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.13348126653580267, "frac_reward_zero_std": 0.625, "grad_norm": 0.1184975759769518, "kl": 0.1004638671875, "learning_rate": 1.9933302494025885e-05, "loss": 0.0014, "num_tokens": 191463197.0, "reward": 2.09375, "reward_std": 0.14135143160820007, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 788.7265625, "completions/mean_terminated_length": 788.7265625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.13382265084919348, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10927283551568331, "kl": 0.0999755859375, "learning_rate": 1.9931921769546026e-05, "loss": 0.0065, "num_tokens": 191946033.0, "reward": 2.04296875, "reward_std": 0.10133734345436096, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 805.673828125, "completions/mean_terminated_length": 800.802001953125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.13416403516258427, "frac_reward_zero_std": 0.625, "grad_norm": 0.10870061965777136, "kl": 0.0970458984375, "learning_rate": 1.9930526948519793e-05, "loss": 0.0325, "num_tokens": 192439594.0, "reward": 2.0791015625, "reward_std": 0.1412525326013565, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 775.9453125, "completions/mean_terminated_length": 775.9453125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.13450541947597508, "frac_reward_zero_std": 0.84375, "grad_norm": 0.07227545115380704, "kl": 0.094970703125, "learning_rate": 1.9929118032926872e-05, "loss": 0.0087, "num_tokens": 192912238.0, "reward": 2.0625, "reward_std": 0.06640692055225372, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 769.130859375, "completions/mean_terminated_length": 769.130859375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.13484680378936587, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13063173463161987, "kl": 0.09912109375, "learning_rate": 1.9927695024766964e-05, "loss": 0.022, "num_tokens": 193381233.0, "reward": 2.16015625, "reward_std": 0.18865203857421875, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 766.03125, "completions/mean_terminated_length": 766.03125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.13518818810275668, "frac_reward_zero_std": 0.75, "grad_norm": 0.09277865139355225, "kl": 0.098876953125, "learning_rate": 1.9926257926059768e-05, "loss": 0.0064, "num_tokens": 193859457.0, "reward": 2.07958984375, "reward_std": 0.09778076410293579, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 757.59765625, "completions/mean_terminated_length": 757.59765625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.13552957241614746, "frac_reward_zero_std": 0.625, "grad_norm": 0.11993020369633604, "kl": 0.103515625, "learning_rate": 1.9924806738844982e-05, "loss": 0.0108, "num_tokens": 194324067.0, "reward": 2.13916015625, "reward_std": 0.14977243542671204, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 726.265625, "completions/mean_terminated_length": 726.265625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.13587095672953828, "frac_reward_zero_std": 0.625, "grad_norm": 0.12776932380566325, "kl": 0.096435546875, "learning_rate": 1.9923341465182307e-05, "loss": 0.0065, "num_tokens": 194775323.0, "reward": 2.08740234375, "reward_std": 0.12430571764707565, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 828.728515625, "completions/mean_terminated_length": 828.728515625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13621234104292906, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12625454715882156, "kl": 0.0902099609375, "learning_rate": 1.9921862107151422e-05, "loss": 0.003, "num_tokens": 195287280.0, "reward": 2.111328125, "reward_std": 0.16829097270965576, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 793.216796875, "completions/mean_terminated_length": 793.216796875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.13655372535631988, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07732472923752011, "kl": 0.0928955078125, "learning_rate": 1.992036866685201e-05, "loss": 0.0065, "num_tokens": 195783279.0, "reward": 2.068359375, "reward_std": 0.08565815538167953, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 785.986328125, "completions/mean_terminated_length": 785.986328125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1368951096697107, "frac_reward_zero_std": 0.625, "grad_norm": 0.11127956140348617, "kl": 0.08740234375, "learning_rate": 1.9918861146403733e-05, "loss": 0.0062, "num_tokens": 196276744.0, "reward": 2.111328125, "reward_std": 0.14899644255638123, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 841.876953125, "completions/mean_terminated_length": 837.7098388671875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.13723649398310148, "frac_reward_zero_std": 0.59375, "grad_norm": 6216501.869402045, "kl": 507904.06591796875, "learning_rate": 1.9917339547946247e-05, "loss": 20413.8203, "num_tokens": 196778201.0, "reward": 2.0908203125, "reward_std": 0.1522984504699707, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 738.458984375, "completions/mean_terminated_length": 738.458984375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1375778782964923, "frac_reward_zero_std": 0.5, "grad_norm": 0.14451964014750493, "kl": 0.092041015625, "learning_rate": 1.991580387363918e-05, "loss": 0.0139, "num_tokens": 197247300.0, "reward": 2.14990234375, "reward_std": 0.20068654417991638, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 803.03515625, "completions/mean_terminated_length": 800.5988159179688, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.13791926260988308, "frac_reward_zero_std": 0.625, "grad_norm": 0.11699645842370845, "kl": 0.08837890625, "learning_rate": 1.991425412566214e-05, "loss": 0.0082, "num_tokens": 197740390.0, "reward": 2.13330078125, "reward_std": 0.15015456080436707, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 786.1015625, "completions/mean_terminated_length": 786.1015625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.1382606469232739, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11249917526925826, "kl": 0.0914306640625, "learning_rate": 1.9912690306214705e-05, "loss": 0.0134, "num_tokens": 198224970.0, "reward": 2.05859375, "reward_std": 0.12219851464033127, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 779.58203125, "completions/mean_terminated_length": 774.60791015625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.13860203123666467, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1320642677483368, "kl": 0.092041015625, "learning_rate": 1.991111241751644e-05, "loss": 0.0117, "num_tokens": 198703668.0, "reward": 2.14111328125, "reward_std": 0.1906587779521942, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 712.86328125, "completions/mean_terminated_length": 712.0488891601562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1389434155500555, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0624436841935276, "kl": 0.1163330078125, "learning_rate": 1.9909520461806867e-05, "loss": 0.0183, "num_tokens": 199167870.0, "reward": 2.06689453125, "reward_std": 0.1740715652704239, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06310669332742691, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 753.5546875, "completions/mean_terminated_length": 753.5546875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.13928479986344627, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11129643815353872, "kl": 0.0987548828125, "learning_rate": 1.990791444134547e-05, "loss": 0.0057, "num_tokens": 199648426.0, "reward": 2.04931640625, "reward_std": 0.1090659350156784, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 737.716796875, "completions/mean_terminated_length": 737.716796875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1396261841768371, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12254678118921161, "kl": 0.09912109375, "learning_rate": 1.9906294358411712e-05, "loss": 0.003, "num_tokens": 200108537.0, "reward": 2.064453125, "reward_std": 0.09860946238040924, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 749.685546875, "completions/mean_terminated_length": 749.685546875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.13996756849022787, "frac_reward_zero_std": 0.78125, "grad_norm": 0.1023729567010461, "kl": 0.1021728515625, "learning_rate": 1.9904660215304994e-05, "loss": 0.0137, "num_tokens": 200588792.0, "reward": 2.0361328125, "reward_std": 0.08999897539615631, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 722.724609375, "completions/mean_terminated_length": 722.724609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1403089528036187, "frac_reward_zero_std": 0.625, "grad_norm": 0.12537023552244395, "kl": 0.0972900390625, "learning_rate": 1.9903012014344688e-05, "loss": 0.0088, "num_tokens": 201041339.0, "reward": 2.12109375, "reward_std": 0.1542215347290039, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 750.96875, "completions/mean_terminated_length": 750.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.14065033711700947, "frac_reward_zero_std": 0.75, "grad_norm": 0.11849497803441236, "kl": 0.0975341796875, "learning_rate": 1.990134975787011e-05, "loss": 0.0002, "num_tokens": 201508699.0, "reward": 2.09814453125, "reward_std": 0.10495433211326599, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.3117917478084564, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 747.59765625, "completions/mean_terminated_length": 747.59765625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.14099172143040029, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12605349309670538, "kl": 0.1016845703125, "learning_rate": 1.989967344824054e-05, "loss": 0.0018, "num_tokens": 201965949.0, "reward": 2.11181640625, "reward_std": 0.14768604934215546, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 795.33203125, "completions/mean_terminated_length": 795.33203125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.14133310574379107, "frac_reward_zero_std": 0.625, "grad_norm": 0.11384198375335428, "kl": 0.09423828125, "learning_rate": 1.989798308783518e-05, "loss": 0.0096, "num_tokens": 202466311.0, "reward": 2.0927734375, "reward_std": 0.15982502698898315, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 769.13671875, "completions/mean_terminated_length": 769.13671875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.14167449005718188, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11427982746581099, "kl": 0.096923828125, "learning_rate": 1.9896278679053193e-05, "loss": 0.0095, "num_tokens": 202939741.0, "reward": 2.07177734375, "reward_std": 0.12293452024459839, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 722.380859375, "completions/mean_terminated_length": 719.7866821289062, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.14201587437057267, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13670810574717485, "kl": 0.1055908203125, "learning_rate": 1.9894560224313676e-05, "loss": 0.0172, "num_tokens": 203392944.0, "reward": 2.11962890625, "reward_std": 0.15813785791397095, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 761.43359375, "completions/mean_terminated_length": 761.43359375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.14235725868396348, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13056808157228048, "kl": 0.0950927734375, "learning_rate": 1.989282772605566e-05, "loss": 0.0132, "num_tokens": 203860238.0, "reward": 2.162109375, "reward_std": 0.18270456790924072, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 763.6015625, "completions/mean_terminated_length": 763.6015625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.14269864299735427, "frac_reward_zero_std": 0.625, "grad_norm": 0.1207634463920805, "kl": 0.0953369140625, "learning_rate": 1.989108118673811e-05, "loss": 0.006, "num_tokens": 204332642.0, "reward": 2.0751953125, "reward_std": 0.13539619743824005, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 747.7109375, "completions/mean_terminated_length": 747.7109375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.14304002731074508, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10053563950601692, "kl": 0.088134765625, "learning_rate": 1.9889320608839924e-05, "loss": -0.0001, "num_tokens": 204793166.0, "reward": 2.1396484375, "reward_std": 0.12739671766757965, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 821.361328125, "completions/mean_terminated_length": 821.361328125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.14338141162413587, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10946102244740558, "kl": 0.0908203125, "learning_rate": 1.988754599485991e-05, "loss": 0.0014, "num_tokens": 205303783.0, "reward": 2.0546875, "reward_std": 0.12213177978992462, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 844.6640625, "completions/mean_terminated_length": 844.6640625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.14372279593752668, "frac_reward_zero_std": 0.75, "grad_norm": 0.08205350121920078, "kl": 0.0880126953125, "learning_rate": 1.9885757347316815e-05, "loss": 0.0102, "num_tokens": 205826667.0, "reward": 2.123046875, "reward_std": 0.08753518760204315, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 853.2109375, "completions/mean_terminated_length": 853.2109375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.14406418025091747, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12347902633960316, "kl": 0.0897216796875, "learning_rate": 1.9883954668749292e-05, "loss": 0.0029, "num_tokens": 206347239.0, "reward": 2.0556640625, "reward_std": 0.17140242457389832, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 842.787109375, "completions/mean_terminated_length": 842.787109375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.14440556456430828, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12432316027193507, "kl": 0.089599609375, "learning_rate": 1.9882137961715918e-05, "loss": 0.0102, "num_tokens": 206862314.0, "reward": 2.10546875, "reward_std": 0.16066935658454895, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 887.314453125, "completions/mean_terminated_length": 887.314453125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.14474694887769907, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10056519635558754, "kl": 0.0950927734375, "learning_rate": 1.9880307228795175e-05, "loss": 0.0084, "num_tokens": 207398379.0, "reward": 2.08154296875, "reward_std": 0.12119981646537781, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 853.9296875, "completions/mean_terminated_length": 853.9296875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.14508833319108988, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11728409846871944, "kl": 0.097900390625, "learning_rate": 1.9878462472585457e-05, "loss": 0.0121, "num_tokens": 207925847.0, "reward": 2.0634765625, "reward_std": 0.14816677570343018, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 776.5625, "completions/mean_terminated_length": 776.5625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.14542971750448067, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1273825892360386, "kl": 0.09912109375, "learning_rate": 1.987660369570505e-05, "loss": 0.006, "num_tokens": 208404663.0, "reward": 2.04345703125, "reward_std": 0.13023656606674194, "rewards/accuracy_reward/mean": 0.052419353276491165, "rewards/accuracy_reward/std": 0.22309619188308716, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 670.66796875, "completions/mean_terminated_length": 670.66796875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.14577110181787148, "frac_reward_zero_std": 0.625, "grad_norm": 0.12834950178488985, "kl": 0.1072998046875, "learning_rate": 1.9874730900792157e-05, "loss": -0.0076, "num_tokens": 208832237.0, "reward": 2.07470703125, "reward_std": 0.13680174946784973, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 697.533203125, "completions/mean_terminated_length": 697.533203125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.14611248613126226, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1185750395545449, "kl": 0.10400390625, "learning_rate": 1.9872844090504858e-05, "loss": 0.0113, "num_tokens": 209265790.0, "reward": 2.14404296875, "reward_std": 0.1625327467918396, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 681.158203125, "completions/mean_terminated_length": 681.158203125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.14645387044465308, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13120381078499818, "kl": 0.10107421875, "learning_rate": 1.9870943267521144e-05, "loss": 0.0024, "num_tokens": 209699743.0, "reward": 2.05517578125, "reward_std": 0.16942539811134338, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 652.611328125, "completions/mean_terminated_length": 652.611328125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.14679525475804386, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1501255294922734, "kl": 0.107421875, "learning_rate": 1.9869028434538885e-05, "loss": 0.0062, "num_tokens": 210127928.0, "reward": 2.1181640625, "reward_std": 0.1497630476951599, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 732.2265625, "completions/mean_terminated_length": 732.2265625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.14713663907143468, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11550591006501917, "kl": 0.09521484375, "learning_rate": 1.9867099594275828e-05, "loss": 0.0053, "num_tokens": 210590572.0, "reward": 2.11328125, "reward_std": 0.1307895928621292, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 741.4609375, "completions/mean_terminated_length": 741.4609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.14747802338482546, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10670120230248349, "kl": 0.1002197265625, "learning_rate": 1.9865156749469613e-05, "loss": 0.0024, "num_tokens": 211064984.0, "reward": 2.07470703125, "reward_std": 0.11027636379003525, "rewards/accuracy_reward/mean": 0.08467742055654526, "rewards/accuracy_reward/std": 0.278682142496109, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 797.234375, "completions/mean_terminated_length": 797.234375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.14781940769821628, "frac_reward_zero_std": 0.625, "grad_norm": 0.11534715749820813, "kl": 0.096435546875, "learning_rate": 1.986319990287776e-05, "loss": 0.0006, "num_tokens": 211561488.0, "reward": 2.10595703125, "reward_std": 0.12695381045341492, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 795.234375, "completions/mean_terminated_length": 795.234375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.14816079201160706, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12039878929829649, "kl": 0.10205078125, "learning_rate": 1.9861229057277643e-05, "loss": 0.002, "num_tokens": 212051240.0, "reward": 2.0498046875, "reward_std": 0.10667058080434799, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 843.646484375, "completions/mean_terminated_length": 838.923583984375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.14850217632499788, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12339222187665862, "kl": 0.09228515625, "learning_rate": 1.9859244215466525e-05, "loss": 0.0139, "num_tokens": 212568163.0, "reward": 2.0986328125, "reward_std": 0.13716205954551697, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 803.234375, "completions/mean_terminated_length": 803.234375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.14884356063838866, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09662747091356985, "kl": 0.1009521484375, "learning_rate": 1.9857245380261527e-05, "loss": 0.0125, "num_tokens": 213062683.0, "reward": 2.10595703125, "reward_std": 0.12736955285072327, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 765.3359375, "completions/mean_terminated_length": 765.3359375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.14918494495177947, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0934539858324814, "kl": 0.0926513671875, "learning_rate": 1.9855232554499623e-05, "loss": 0.0102, "num_tokens": 213542103.0, "reward": 2.158203125, "reward_std": 0.12812869250774384, "rewards/accuracy_reward/mean": 0.16330644488334656, "rewards/accuracy_reward/std": 0.37001824378967285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 794.720703125, "completions/mean_terminated_length": 794.720703125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.14952632926517026, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11376849764759775, "kl": 0.0994873046875, "learning_rate": 1.9853205741037652e-05, "loss": 0.0111, "num_tokens": 214039800.0, "reward": 2.0908203125, "reward_std": 0.1333150565624237, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 841.5625, "completions/mean_terminated_length": 839.2015380859375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.14986771357856107, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0897882542516861, "kl": 0.1080322265625, "learning_rate": 1.985116494275231e-05, "loss": 0.0124, "num_tokens": 214547400.0, "reward": 2.0361328125, "reward_std": 0.08487741649150848, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 804.33203125, "completions/mean_terminated_length": 804.33203125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.15020909789195186, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10604106556804012, "kl": 0.1015625, "learning_rate": 1.9849110162540137e-05, "loss": 0.0122, "num_tokens": 215045858.0, "reward": 2.0908203125, "reward_std": 0.12745001912117004, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 778.720703125, "completions/mean_terminated_length": 778.720703125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.15055048220534267, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1277284061117903, "kl": 0.101318359375, "learning_rate": 1.9847041403317513e-05, "loss": 0.0016, "num_tokens": 215518307.0, "reward": 2.12255859375, "reward_std": 0.12556388974189758, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 774.26171875, "completions/mean_terminated_length": 774.26171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.15089186651873346, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11848707860405584, "kl": 0.092041015625, "learning_rate": 1.9844958668020668e-05, "loss": 0.0088, "num_tokens": 216005033.0, "reward": 2.123046875, "reward_std": 0.14580363035202026, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 760.021484375, "completions/mean_terminated_length": 760.021484375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.15123325083212427, "frac_reward_zero_std": 0.625, "grad_norm": 0.12257903735026678, "kl": 0.1036376953125, "learning_rate": 1.9842861959605658e-05, "loss": 0.0057, "num_tokens": 216482692.0, "reward": 2.09326171875, "reward_std": 0.1367950737476349, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 753.251953125, "completions/mean_terminated_length": 749.8255615234375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.15157463514551506, "frac_reward_zero_std": 0.65625, "grad_norm": 135752.7436057107, "kl": 26368.076049804688, "learning_rate": 1.9840751281048382e-05, "loss": 1058.1938, "num_tokens": 216952197.0, "reward": 2.07568359375, "reward_std": 0.12454189360141754, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 795.619140625, "completions/mean_terminated_length": 795.619140625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.15191601945890587, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12884538866988032, "kl": 0.0977783203125, "learning_rate": 1.9838626635344558e-05, "loss": 0.0013, "num_tokens": 217442194.0, "reward": 2.07666015625, "reward_std": 0.15922769904136658, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756476104259491, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 770.240234375, "completions/mean_terminated_length": 770.240234375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.15225740377229666, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11455987705054997, "kl": 0.10400390625, "learning_rate": 1.9836488025509738e-05, "loss": -0.0012, "num_tokens": 217917213.0, "reward": 2.04248046875, "reward_std": 0.1004360169172287, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 725.66796875, "completions/mean_terminated_length": 725.66796875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.15259878808568747, "frac_reward_zero_std": 0.625, "grad_norm": 0.1672783274136197, "kl": 0.111083984375, "learning_rate": 1.9834335454579283e-05, "loss": 0.0057, "num_tokens": 218366243.0, "reward": 2.06640625, "reward_std": 0.14161010086536407, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 728.494140625, "completions/mean_terminated_length": 728.494140625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.15294017239907826, "frac_reward_zero_std": 0.625, "grad_norm": 0.13518540251103014, "kl": 0.0977783203125, "learning_rate": 1.9832168925608374e-05, "loss": 0.0132, "num_tokens": 218822640.0, "reward": 2.13916015625, "reward_std": 0.13829420506954193, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 724.986328125, "completions/mean_terminated_length": 724.986328125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.15328155671246907, "frac_reward_zero_std": 0.625, "grad_norm": 0.12327630642942787, "kl": 0.098388671875, "learning_rate": 1.9829988441672004e-05, "loss": 0.0051, "num_tokens": 219270201.0, "reward": 2.05322265625, "reward_std": 0.15820443630218506, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 667.314453125, "completions/mean_terminated_length": 667.314453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.15362294102585985, "frac_reward_zero_std": 0.4375, "grad_norm": 0.16671483027751446, "kl": 0.0960693359375, "learning_rate": 1.982779400586497e-05, "loss": 0.0108, "num_tokens": 219692314.0, "reward": 2.0810546875, "reward_std": 0.2437809705734253, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.04081062600016594, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 684.205078125, "completions/mean_terminated_length": 684.205078125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.15396432533925067, "frac_reward_zero_std": 0.40625, "grad_norm": 0.16833762266604888, "kl": 0.1019287109375, "learning_rate": 1.9825585621301873e-05, "loss": -0.0111, "num_tokens": 220129747.0, "reward": 2.015625, "reward_std": 0.24530720710754395, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.07374914735555649, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 616.41015625, "completions/mean_terminated_length": 616.41015625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.15430570965264145, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14264187829866457, "kl": 0.1031494140625, "learning_rate": 1.982336329111711e-05, "loss": 0.0048, "num_tokens": 220529157.0, "reward": 2.05322265625, "reward_std": 0.1648309826850891, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 593.134765625, "completions/mean_terminated_length": 593.134765625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.15464709396603227, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13882963919835797, "kl": 0.115966796875, "learning_rate": 1.9821127018464874e-05, "loss": 0.0068, "num_tokens": 220924282.0, "reward": 2.10498046875, "reward_std": 0.15197619795799255, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 624.484375, "completions/mean_terminated_length": 624.484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.15498847827942305, "frac_reward_zero_std": 0.59375, "grad_norm": 0.15034491182875342, "kl": 0.1103515625, "learning_rate": 1.9818876806519147e-05, "loss": 0.0033, "num_tokens": 221323314.0, "reward": 2.06298828125, "reward_std": 0.1407267153263092, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 682.130859375, "completions/mean_terminated_length": 682.130859375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.15532986259281387, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1158000077432017, "kl": 0.1224365234375, "learning_rate": 1.9816612658473685e-05, "loss": 0.0037, "num_tokens": 221759573.0, "reward": 2.00634765625, "reward_std": 0.08948209881782532, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 657.41015625, "completions/mean_terminated_length": 657.41015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.15567124690620465, "frac_reward_zero_std": 0.53125, "grad_norm": 0.19102590357700158, "kl": 0.119140625, "learning_rate": 1.981433457754204e-05, "loss": 0.0076, "num_tokens": 222202007.0, "reward": 2.09716796875, "reward_std": 0.1793922781944275, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 700.630859375, "completions/mean_terminated_length": 700.630859375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.15601263121959547, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3516634878692276, "kl": 0.1156005859375, "learning_rate": 1.9812042566957527e-05, "loss": 0.027, "num_tokens": 222636810.0, "reward": 1.7470703125, "reward_std": 0.5668600797653198, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.763671875, "rewards/format_reward/std": 0.42524150013923645, "rewards/tag_count_reward/mean": 0.9228515625, "rewards/tag_count_reward/std": 0.16375388205051422, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 731.7265625, "completions/mean_terminated_length": 730.001953125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.15635401553298625, "frac_reward_zero_std": 0.03125, "grad_norm": 0.31293219069124634, "kl": 0.1363525390625, "learning_rate": 1.980973662997324e-05, "loss": 0.0099, "num_tokens": 223094766.0, "reward": 0.451171875, "reward_std": 0.2733317017555237, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.21951310336589813, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1160.376953125, "completions/mean_terminated_length": 958.1607055664062, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.15669539984637706, "frac_reward_zero_std": 0.0, "grad_norm": 0.3866413195463281, "kl": 0.2275390625, "learning_rate": 1.980741676986203e-05, "loss": 0.1769, "num_tokens": 223769823.0, "reward": 0.5888671875, "reward_std": 0.3412451148033142, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4267578125, "rewards/tag_count_reward/std": 0.2598487436771393, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1281.5703125, "completions/mean_terminated_length": 1020.7434692382812, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.15703678415976785, "frac_reward_zero_std": 0.0, "grad_norm": 0.6856570164899043, "kl": 0.28466796875, "learning_rate": 1.9805082989916515e-05, "loss": 0.2106, "num_tokens": 224511507.0, "reward": 0.52197265625, "reward_std": 0.3420443534851074, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.47119140625, "rewards/tag_count_reward/std": 0.2930625379085541, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1025.3125, "completions/mean_terminated_length": 1001.9298706054688, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.15737816847315866, "frac_reward_zero_std": 0.0, "grad_norm": 0.4261369200811903, "kl": 0.237060546875, "learning_rate": 1.980273529344907e-05, "loss": 0.0478, "num_tokens": 225121123.0, "reward": 0.708984375, "reward_std": 0.2296358197927475, "rewards/accuracy_reward/mean": 0.04233871027827263, "rewards/accuracy_reward/std": 0.2015640139579773, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.18731644749641418, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 966.71875, "completions/mean_terminated_length": 957.325439453125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.15771955278654945, "frac_reward_zero_std": 0.0, "grad_norm": 4.730555944405957e+18, "kl": 3.884354678607053e+16, "learning_rate": 1.9800373683791826e-05, "loss": 1554234176700416.0, "num_tokens": 225698355.0, "reward": 0.82080078125, "reward_std": 0.252747118473053, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.73876953125, "rewards/tag_count_reward/std": 0.17624549567699432, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 899.921875, "completions/mean_terminated_length": 891.3905639648438, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.15806093709994026, "frac_reward_zero_std": 0.0, "grad_norm": 0.434749375620988, "kl": 0.1640625, "learning_rate": 1.9797998164296647e-05, "loss": 0.0056, "num_tokens": 226261211.0, "reward": 0.79443359375, "reward_std": 0.25799277424812317, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75146484375, "rewards/tag_count_reward/std": 0.25339725613594055, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 852.474609375, "completions/mean_terminated_length": 846.6063232421875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.15840232141333105, "frac_reward_zero_std": 0.0, "grad_norm": 343.034871716506, "kl": 7.47265625, "learning_rate": 1.9795608738335153e-05, "loss": 0.2867, "num_tokens": 226776926.0, "reward": 0.931640625, "reward_std": 0.30665355920791626, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.849609375, "rewards/tag_count_reward/std": 0.26696789264678955, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 774.1484375, "completions/mean_terminated_length": 768.7027587890625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.15874370572672186, "frac_reward_zero_std": 0.03125, "grad_norm": 3.369015954433405, "kl": 0.1826171875, "learning_rate": 1.9793205409298696e-05, "loss": 0.0074, "num_tokens": 227253018.0, "reward": 0.98974609375, "reward_std": 0.274214506149292, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89794921875, "rewards/tag_count_reward/std": 0.23608162999153137, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 777.634765625, "completions/mean_terminated_length": 774.5236206054688, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.15908509004011265, "frac_reward_zero_std": 0.0625, "grad_norm": 1.8624808439677152, "kl": 0.474609375, "learning_rate": 1.979078818059836e-05, "loss": 0.0172, "num_tokens": 227732415.0, "reward": 0.94287109375, "reward_std": 0.21937672793865204, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90966796875, "rewards/tag_count_reward/std": 0.2057616412639618, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 718.763671875, "completions/mean_terminated_length": 718.0997924804688, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.15942647435350346, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3218512288849305, "kl": 0.174560546875, "learning_rate": 1.9788357055664963e-05, "loss": 0.013, "num_tokens": 228183510.0, "reward": 0.97216796875, "reward_std": 0.2203109860420227, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91943359375, "rewards/tag_count_reward/std": 0.18166513741016388, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 724.560546875, "completions/mean_terminated_length": 722.8883056640625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.15976785866689425, "frac_reward_zero_std": 0.0, "grad_norm": 483.4008891124644, "kl": 16.18212890625, "learning_rate": 1.9785912037949044e-05, "loss": 0.652, "num_tokens": 228630293.0, "reward": 0.962890625, "reward_std": 0.16525369882583618, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16547498106956482, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 714.25390625, "completions/mean_terminated_length": 714.25390625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.16010924298028506, "frac_reward_zero_std": 0.21875, "grad_norm": 0.1952923448035522, "kl": 0.174560546875, "learning_rate": 1.9783453130920847e-05, "loss": 0.0038, "num_tokens": 229076743.0, "reward": 1.033203125, "reward_std": 0.14642271399497986, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.10058461129665375, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 709.486328125, "completions/mean_terminated_length": 709.486328125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.16045062729367585, "frac_reward_zero_std": 0.21875, "grad_norm": 0.19816877340401076, "kl": 0.181640625, "learning_rate": 1.9780980338070356e-05, "loss": -0.0018, "num_tokens": 229526352.0, "reward": 1.02001953125, "reward_std": 0.15582773089408875, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98095703125, "rewards/tag_count_reward/std": 0.09121737629175186, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 820.732421875, "completions/mean_terminated_length": 820.732421875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.16079201160706666, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15558714575597182, "kl": 0.167236328125, "learning_rate": 1.9778493662907237e-05, "loss": -0.001, "num_tokens": 230021783.0, "reward": 1.01123046875, "reward_std": 0.10319773107767105, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98388671875, "rewards/tag_count_reward/std": 0.09310437738895416, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 904.126953125, "completions/mean_terminated_length": 874.32666015625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.16113339592045745, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15173607782253287, "kl": 0.156494140625, "learning_rate": 1.9775993108960878e-05, "loss": 0.0584, "num_tokens": 230556040.0, "reward": 1.03515625, "reward_std": 0.1439889669418335, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.12449444830417633, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1002.30078125, "completions/mean_terminated_length": 981.4701538085938, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.16147478023384826, "frac_reward_zero_std": 0.21875, "grad_norm": 0.15622270850317543, "kl": 0.14794921875, "learning_rate": 1.9773478679780352e-05, "loss": 0.0279, "num_tokens": 231153698.0, "reward": 1.1103515625, "reward_std": 0.18391528725624084, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.12070263922214508, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1077.078125, "completions/mean_terminated_length": 1075.1781005859375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.16181616454723904, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12528339341300393, "kl": 0.148681640625, "learning_rate": 1.9770950378934433e-05, "loss": 0.0105, "num_tokens": 231779498.0, "reward": 1.0703125, "reward_std": 0.13490328192710876, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.062070440500974655, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1221.572265625, "completions/mean_terminated_length": 1213.422119140625, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.16215754886062986, "frac_reward_zero_std": 0.375, "grad_norm": 0.12023705303769512, "kl": 0.139892578125, "learning_rate": 1.9768408210011584e-05, "loss": -0.0052, "num_tokens": 232487167.0, "reward": 1.03466796875, "reward_std": 0.17465433478355408, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96240234375, "rewards/tag_count_reward/std": 0.17674899101257324, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1149.71484375, "completions/mean_terminated_length": 1133.64208984375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.16249893317402064, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11991028073810521, "kl": 0.13671875, "learning_rate": 1.9765852176619948e-05, "loss": -0.0118, "num_tokens": 233158653.0, "reward": 1.10986328125, "reward_std": 0.2174900472164154, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97705078125, "rewards/tag_count_reward/std": 0.12642802298069, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1110.96875, "completions/mean_terminated_length": 1101.727783203125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.16284031748741146, "frac_reward_zero_std": 0.4375, "grad_norm": 1.996603277920844, "kl": 0.248291015625, "learning_rate": 1.9763282282387342e-05, "loss": -0.0009, "num_tokens": 233810445.0, "reward": 1.0380859375, "reward_std": 0.17414729297161102, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9697265625, "rewards/tag_count_reward/std": 0.16347356140613556, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1122.048828125, "completions/mean_terminated_length": 1114.0966796875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.16318170180080224, "frac_reward_zero_std": 0.375, "grad_norm": 4.216682052283631, "kl": 0.3818359375, "learning_rate": 1.9760698530961268e-05, "loss": -0.0014, "num_tokens": 234467606.0, "reward": 1.06640625, "reward_std": 0.18280701339244843, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.1300208568572998, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1075.658203125, "completions/mean_terminated_length": 1042.9127197265625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.16352308611419306, "frac_reward_zero_std": 0.46875, "grad_norm": 17.863516478120648, "kl": 1.123779296875, "learning_rate": 1.9758100926008886e-05, "loss": 0.0594, "num_tokens": 235097431.0, "reward": 1.0107421875, "reward_std": 0.11170204728841782, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.1253930628299713, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1048.466796875, "completions/mean_terminated_length": 976.537353515625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.16386447042758384, "frac_reward_zero_std": 0.375, "grad_norm": 12.274691799513283, "kl": 0.93408203125, "learning_rate": 1.975548947121702e-05, "loss": 0.0561, "num_tokens": 235717798.0, "reward": 1.0859375, "reward_std": 0.15370376408100128, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.07731663435697556, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 955.6015625, "completions/mean_terminated_length": 945.041748046875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.16420585474097465, "frac_reward_zero_std": 0.5, "grad_norm": 0.6322942984275361, "kl": 0.211181640625, "learning_rate": 1.9752864170292152e-05, "loss": 0.0183, "num_tokens": 236287130.0, "reward": 1.11376953125, "reward_std": 0.14780829846858978, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.33332720398902893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.07360486686229706, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 972.826171875, "completions/mean_terminated_length": 970.7221069335938, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.16454723905436544, "frac_reward_zero_std": 0.5, "grad_norm": 0.14144045086292906, "kl": 0.16845703125, "learning_rate": 1.975022502696042e-05, "loss": 0.0122, "num_tokens": 236874817.0, "reward": 1.0322265625, "reward_std": 0.11265942454338074, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.07745862752199173, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1048.580078125, "completions/mean_terminated_length": 1048.580078125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.16488862336775625, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0825495803031627, "kl": 0.1591796875, "learning_rate": 1.97475720449676e-05, "loss": 0.0042, "num_tokens": 237489818.0, "reward": 1.0283203125, "reward_std": 0.053049236536026, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.06430104374885559, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1068.82421875, "completions/mean_terminated_length": 1066.907958984375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.16523000768114704, "frac_reward_zero_std": 0.625, "grad_norm": 0.10759118465289531, "kl": 0.15869140625, "learning_rate": 1.9744905228079117e-05, "loss": 0.0008, "num_tokens": 238122896.0, "reward": 1.04052734375, "reward_std": 0.0941610336303711, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06877271831035614, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1188.271484375, "completions/mean_terminated_length": 1177.0574951171875, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.16557139199453785, "frac_reward_zero_std": 0.4375, "grad_norm": 8.219880582521965, "kl": 0.17578125, "learning_rate": 1.9742224580080032e-05, "loss": 0.0096, "num_tokens": 238828123.0, "reward": 1.05126953125, "reward_std": 0.15758302807807922, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.11418873816728592, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1312.888671875, "completions/mean_terminated_length": 1290.7021484375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.16591277630792864, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12271186900734447, "kl": 0.141845703125, "learning_rate": 1.9739530104775033e-05, "loss": 0.0168, "num_tokens": 239587282.0, "reward": 1.0087890625, "reward_std": 0.14596453309059143, "rewards/accuracy_reward/mean": 0.03427419438958168, "rewards/accuracy_reward/std": 0.18211629986763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.13137705624103546, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1361.490234375, "completions/mean_terminated_length": 1337.9132080078125, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.16625416062131945, "frac_reward_zero_std": 0.5, "grad_norm": 0.11345716436338416, "kl": 0.139404296875, "learning_rate": 1.9736821805988436e-05, "loss": 0.0178, "num_tokens": 240366717.0, "reward": 1.021484375, "reward_std": 0.1282581388950348, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.12487763166427612, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1282.8125, "completions/mean_terminated_length": 1267.5697021484375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.16659554493471024, "frac_reward_zero_std": 0.53125, "grad_norm": 2.2349868778906936, "kl": 0.143310546875, "learning_rate": 1.9734099687564174e-05, "loss": 0.0004, "num_tokens": 241103517.0, "reward": 1.04638671875, "reward_std": 0.141056090593338, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97607421875, "rewards/tag_count_reward/std": 0.12912023067474365, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1195.703125, "completions/mean_terminated_length": 1185.596923828125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.16693692924810105, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1048536304141833, "kl": 0.13623046875, "learning_rate": 1.9731363753365792e-05, "loss": 0.0098, "num_tokens": 241806405.0, "reward": 1.0576171875, "reward_std": 0.10501043498516083, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05141965299844742, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1112.97265625, "completions/mean_terminated_length": 1107.461669921875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.16727831356149184, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12845878093525617, "kl": 0.139404296875, "learning_rate": 1.9728614007276458e-05, "loss": 0.0057, "num_tokens": 242460199.0, "reward": 1.06689453125, "reward_std": 0.1503855437040329, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.0827360525727272, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 993.984375, "completions/mean_terminated_length": 992.74365234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.16761969787488265, "frac_reward_zero_std": 0.5, "grad_norm": 1.4662174817440485, "kl": 0.196044921875, "learning_rate": 1.9725850453198927e-05, "loss": 0.0096, "num_tokens": 243048943.0, "reward": 1.1083984375, "reward_std": 0.17145679891109467, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0678301453590393, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 951.58203125, "completions/mean_terminated_length": 951.58203125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.16796108218827344, "frac_reward_zero_std": 0.5, "grad_norm": 0.12083815606886483, "kl": 0.14453125, "learning_rate": 1.972307309505556e-05, "loss": -0.0023, "num_tokens": 243616713.0, "reward": 1.08251953125, "reward_std": 0.16114963591098785, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.09004709124565125, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 890.900390625, "completions/mean_terminated_length": 890.900390625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.16830246650166425, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1147730346288353, "kl": 0.1396484375, "learning_rate": 1.972028193678831e-05, "loss": 0.0033, "num_tokens": 244153478.0, "reward": 1.04150390625, "reward_std": 0.12180577218532562, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.10924496501684189, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 840.75390625, "completions/mean_terminated_length": 840.75390625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.16864385081505504, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10865307503716587, "kl": 0.135009765625, "learning_rate": 1.9717476982358712e-05, "loss": -0.0026, "num_tokens": 244658248.0, "reward": 1.0244140625, "reward_std": 0.09113894402980804, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686307430267334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.07790146768093109, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 850.7421875, "completions/mean_terminated_length": 850.7421875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.16898523512844585, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12180480370162285, "kl": 0.138671875, "learning_rate": 1.9714658235747888e-05, "loss": 0.0082, "num_tokens": 245180580.0, "reward": 1.0185546875, "reward_std": 0.07648997008800507, "rewards/accuracy_reward/mean": 0.026209676638245583, "rewards/accuracy_reward/std": 0.1599196344614029, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06960996240377426, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 929.146484375, "completions/mean_terminated_length": 928.5126953125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.16932661944183663, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4790484900183338, "kl": 0.1309814453125, "learning_rate": 1.9711825700956537e-05, "loss": 0.0153, "num_tokens": 245732751.0, "reward": 1.115234375, "reward_std": 0.16991615295410156, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.051725368946790695, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 909.7734375, "completions/mean_terminated_length": 909.7734375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.16966800375522745, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10414123060203988, "kl": 0.126708984375, "learning_rate": 1.970897938200492e-05, "loss": 0.0052, "num_tokens": 246280891.0, "reward": 1.03466796875, "reward_std": 0.08552159368991852, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.06337865442037582, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 942.068359375, "completions/mean_terminated_length": 942.068359375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.17000938806861823, "frac_reward_zero_std": 0.625, "grad_norm": 0.10559972643744062, "kl": 0.1192626953125, "learning_rate": 1.9706119282932867e-05, "loss": 0.0091, "num_tokens": 246849406.0, "reward": 1.10888671875, "reward_std": 0.12052936106920242, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 919.779296875, "completions/mean_terminated_length": 919.779296875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.17035077238200905, "frac_reward_zero_std": 0.625, "grad_norm": 0.11663439926863461, "kl": 0.1219482421875, "learning_rate": 1.9703245407799765e-05, "loss": 0.0042, "num_tokens": 247401853.0, "reward": 1.09375, "reward_std": 0.11348139494657516, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 901.185546875, "completions/mean_terminated_length": 901.185546875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.17069215669539983, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12340759101838357, "kl": 0.1256103515625, "learning_rate": 1.9700357760684553e-05, "loss": 0.0041, "num_tokens": 247943484.0, "reward": 1.1171875, "reward_std": 0.12557722628116608, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1046.984375, "completions/mean_terminated_length": 1045.025390625, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.17103354100879065, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08181570793230143, "kl": 0.1138916015625, "learning_rate": 1.969745634568572e-05, "loss": 0.0142, "num_tokens": 248565044.0, "reward": 1.09423828125, "reward_std": 0.0938824713230133, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1135.587890625, "completions/mean_terminated_length": 1132.0098876953125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.17137492532218146, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10680584575007387, "kl": 0.11962890625, "learning_rate": 1.9694541166921295e-05, "loss": 0.0105, "num_tokens": 249225649.0, "reward": 1.04931640625, "reward_std": 0.11011934280395508, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1228.26171875, "completions/mean_terminated_length": 1223.4302978515625, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.17171630963557225, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10998780919350326, "kl": 0.115966796875, "learning_rate": 1.9691612228528838e-05, "loss": 0.0057, "num_tokens": 249943719.0, "reward": 1.10595703125, "reward_std": 0.1365422010421753, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06867539137601852, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1261.5859375, "completions/mean_terminated_length": 1252.2608642578125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.17205769394896306, "frac_reward_zero_std": 0.5, "grad_norm": 0.1118249954423882, "kl": 0.116943359375, "learning_rate": 1.9688669534665443e-05, "loss": 0.031, "num_tokens": 250673667.0, "reward": 1.1025390625, "reward_std": 0.11888115853071213, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07113077491521835, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1244.013671875, "completions/mean_terminated_length": 1236.0848388671875, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.17239907826235384, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11223576927700094, "kl": 0.1236572265625, "learning_rate": 1.9685713089507726e-05, "loss": 0.0062, "num_tokens": 251390362.0, "reward": 1.126953125, "reward_std": 0.19013258814811707, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07459938526153564, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1254.486328125, "completions/mean_terminated_length": 1241.8909912109375, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.17274046257574466, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11214220105941833, "kl": 0.1202392578125, "learning_rate": 1.968274289725182e-05, "loss": 0.0126, "num_tokens": 252116915.0, "reward": 1.12060546875, "reward_std": 0.17347687482833862, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.07032524049282074, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1248.5, "completions/mean_terminated_length": 1240.6153564453125, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.17308184688913544, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10527712108068331, "kl": 0.1187744140625, "learning_rate": 1.9679758962113367e-05, "loss": 0.0022, "num_tokens": 252842211.0, "reward": 1.10888671875, "reward_std": 0.15307745337486267, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98388671875, "rewards/tag_count_reward/std": 0.10657967627048492, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1206.34765625, "completions/mean_terminated_length": 1203.047119140625, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.17342323120252626, "frac_reward_zero_std": 0.5, "grad_norm": 0.10242902601346553, "kl": 0.1251220703125, "learning_rate": 1.9676761288327523e-05, "loss": -0.008, "num_tokens": 253547381.0, "reward": 1.03662109375, "reward_std": 0.15151987969875336, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97802734375, "rewards/tag_count_reward/std": 0.13770774006843567, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1176.501953125, "completions/mean_terminated_length": 1173.0843505859375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.17376461551591704, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11315559677882853, "kl": 0.118896484375, "learning_rate": 1.9673749880148932e-05, "loss": 0.0096, "num_tokens": 254243270.0, "reward": 1.04443359375, "reward_std": 0.14602318406105042, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98583984375, "rewards/tag_count_reward/std": 0.11023759096860886, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1156.6015625, "completions/mean_terminated_length": 1149.5826416015625, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.17410599982930786, "frac_reward_zero_std": 0.375, "grad_norm": 0.14157746537104632, "kl": 0.1175537109375, "learning_rate": 1.9670724741851737e-05, "loss": -0.0033, "num_tokens": 254914602.0, "reward": 1.0234375, "reward_std": 0.17017918825149536, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.16772332787513733, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 994.98046875, "completions/mean_terminated_length": 993.9706420898438, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.17444738414269864, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8583849761695921, "kl": 0.1337890625, "learning_rate": 1.966768587772957e-05, "loss": -0.0101, "num_tokens": 255508960.0, "reward": 1.0048828125, "reward_std": 0.13661859929561615, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.1638239026069641, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 936.1171875, "completions/mean_terminated_length": 936.1171875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.17478876845608946, "frac_reward_zero_std": 0.3125, "grad_norm": 0.14197718886264732, "kl": 0.1185302734375, "learning_rate": 1.966463329209555e-05, "loss": -0.036, "num_tokens": 256069724.0, "reward": 0.99169921875, "reward_std": 0.2092738151550293, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95068359375, "rewards/tag_count_reward/std": 0.21589146554470062, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 989.294921875, "completions/mean_terminated_length": 989.294921875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.17513015276948024, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13477310618498975, "kl": 0.1131591796875, "learning_rate": 1.9661566989282253e-05, "loss": -0.0163, "num_tokens": 256666211.0, "reward": 1.05810546875, "reward_std": 0.19433505833148956, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.13350094854831696, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 920.400390625, "completions/mean_terminated_length": 918.1937255859375, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.17547153708287105, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13320494596115542, "kl": 0.1162109375, "learning_rate": 1.965848697364174e-05, "loss": 0.0073, "num_tokens": 257222080.0, "reward": 1.08203125, "reward_std": 0.18767747282981873, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.11986454576253891, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1922.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 971.94140625, "completions/mean_terminated_length": 971.94140625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.17581292139626184, "frac_reward_zero_std": 0.53125, "grad_norm": 0.114795741555402, "kl": 0.109375, "learning_rate": 1.9655393249545528e-05, "loss": 0.0095, "num_tokens": 257809154.0, "reward": 1.12353515625, "reward_std": 0.14718443155288696, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06134068965911865, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1040.19140625, "completions/mean_terminated_length": 1034.25146484375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.17615430570965265, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10841091283598901, "kl": 0.1048583984375, "learning_rate": 1.9652285821384597e-05, "loss": 0.0152, "num_tokens": 258431140.0, "reward": 1.04296875, "reward_std": 0.13035848736763, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1030.58984375, "completions/mean_terminated_length": 1030.58984375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.17649569002304344, "frac_reward_zero_std": 0.625, "grad_norm": 0.10186122128870224, "kl": 0.1087646484375, "learning_rate": 1.9649164693569367e-05, "loss": 0.018, "num_tokens": 259041730.0, "reward": 1.10400390625, "reward_std": 0.1221393495798111, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.06520672142505646, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1067.806640625, "completions/mean_terminated_length": 1067.806640625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.17683707433643425, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10556247904350831, "kl": 0.110107421875, "learning_rate": 1.9646029870529713e-05, "loss": -0.0049, "num_tokens": 259680783.0, "reward": 1.08349609375, "reward_std": 0.12113868445158005, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.0722421184182167, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1065.76953125, "completions/mean_terminated_length": 1063.8472900390625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.17717845864982504, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11344197543634695, "kl": 0.114501953125, "learning_rate": 1.964288135671494e-05, "loss": 0.0064, "num_tokens": 260314025.0, "reward": 1.07177734375, "reward_std": 0.1476125419139862, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 1042.5625, "completions/mean_terminated_length": 1042.5625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.17751984296321585, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09978318638267382, "kl": 0.1170654296875, "learning_rate": 1.9639719156593786e-05, "loss": 0.0095, "num_tokens": 260932793.0, "reward": 1.10595703125, "reward_std": 0.11292648315429688, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 974.25390625, "completions/mean_terminated_length": 974.25390625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.17786122727660664, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11785659482382242, "kl": 0.11328125, "learning_rate": 1.963654327465442e-05, "loss": 0.0099, "num_tokens": 261520443.0, "reward": 1.05322265625, "reward_std": 0.10785128176212311, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.055034760385751724, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1054.13671875, "completions/mean_terminated_length": 1052.1917724609375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.17820261158999745, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10757781921837202, "kl": 0.1112060546875, "learning_rate": 1.9633353715404423e-05, "loss": -0.0031, "num_tokens": 262143233.0, "reward": 1.02392578125, "reward_std": 0.11254428327083588, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.10230714827775955, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 982.962890625, "completions/mean_terminated_length": 970.3340454101562, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.17854399590338824, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08350308175960736, "kl": 0.114990234375, "learning_rate": 1.9630150483370793e-05, "loss": 0.0135, "num_tokens": 262726030.0, "reward": 1.013671875, "reward_std": 0.04192390665411949, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 951.037109375, "completions/mean_terminated_length": 951.037109375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.17888538021677905, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12369051962249021, "kl": 0.11572265625, "learning_rate": 1.962693358309993e-05, "loss": 0.0078, "num_tokens": 263293185.0, "reward": 1.08642578125, "reward_std": 0.13062909245491028, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 959.0546875, "completions/mean_terminated_length": 956.9236450195312, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.17922676453016984, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10101459169651589, "kl": 0.1160888671875, "learning_rate": 1.9623703019157637e-05, "loss": 0.0069, "num_tokens": 263870973.0, "reward": 1.06494140625, "reward_std": 0.09270144999027252, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 1002.841796875, "completions/mean_terminated_length": 1002.841796875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.17956814884356065, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11560061102221568, "kl": 0.1129150390625, "learning_rate": 1.9620458796129104e-05, "loss": -0.0004, "num_tokens": 264468780.0, "reward": 1.10205078125, "reward_std": 0.163252592086792, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1019.568359375, "completions/mean_terminated_length": 1019.568359375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.17990953315695143, "frac_reward_zero_std": 0.625, "grad_norm": 0.13056978560639163, "kl": 0.1141357421875, "learning_rate": 1.9617200918618923e-05, "loss": -0.0014, "num_tokens": 265086607.0, "reward": 1.080078125, "reward_std": 0.13385246694087982, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1040.6875, "completions/mean_terminated_length": 1040.6875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.18025091747034225, "frac_reward_zero_std": 0.5625, "grad_norm": 204.84692875528327, "kl": 0.16015625, "learning_rate": 1.9613929391251042e-05, "loss": 0.0042, "num_tokens": 265696447.0, "reward": 1.1328125, "reward_std": 0.14860516786575317, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.07143239676952362, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1072.595703125, "completions/mean_terminated_length": 1070.6868896484375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.18059230178373303, "frac_reward_zero_std": 0.5625, "grad_norm": 1.4279675306930268, "kl": 0.119873046875, "learning_rate": 1.96106442186688e-05, "loss": 0.0055, "num_tokens": 266323712.0, "reward": 1.099609375, "reward_std": 0.12287362664937973, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08365631848573685, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1212.501953125, "completions/mean_terminated_length": 1204.2623291015625, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.18093368609712385, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10125202854139345, "kl": 0.107421875, "learning_rate": 1.9607345405534907e-05, "loss": 0.0013, "num_tokens": 267029249.0, "reward": 1.01904296875, "reward_std": 0.10358865559101105, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.08969622105360031, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1187.75390625, "completions/mean_terminated_length": 1177.553466796875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.18127507041051463, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09265774541390535, "kl": 0.1016845703125, "learning_rate": 1.9604032956531412e-05, "loss": -0.0084, "num_tokens": 267723587.0, "reward": 1.06201171875, "reward_std": 0.0909958928823471, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98583984375, "rewards/tag_count_reward/std": 0.09600471705198288, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1201.287109375, "completions/mean_terminated_length": 1186.1370849609375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.18161645472390545, "frac_reward_zero_std": 0.375, "grad_norm": 0.13358284350697489, "kl": 0.1142578125, "learning_rate": 1.9600706876359742e-05, "loss": 0.0207, "num_tokens": 268428278.0, "reward": 1.09716796875, "reward_std": 0.18151992559432983, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.06829868257045746, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1275.447265625, "completions/mean_terminated_length": 1244.83740234375, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.18195783903729623, "frac_reward_zero_std": 0.375, "grad_norm": 8794.206818932951, "kl": 1620.0533447265625, "learning_rate": 1.9597367169740652e-05, "loss": 64.7735, "num_tokens": 269161851.0, "reward": 1.06103515625, "reward_std": 0.15485742688179016, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98291015625, "rewards/tag_count_reward/std": 0.07867498695850372, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1118.9453125, "completions/mean_terminated_length": 1048.6807861328125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.18229922335068705, "frac_reward_zero_std": 0.40625, "grad_norm": 76326.71595031732, "kl": 7981.046875, "learning_rate": 1.9594013841414247e-05, "loss": 320.4044, "num_tokens": 269809871.0, "reward": 1.09765625, "reward_std": 0.16412407159805298, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.10807649046182632, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1284.560546875, "completions/mean_terminated_length": 1078.0718994140625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.18264060766407783, "frac_reward_zero_std": 0.15625, "grad_norm": 557074.8687008574, "kl": 33248.0, "learning_rate": 1.959064689613996e-05, "loss": 1331.2423, "num_tokens": 270547822.0, "reward": 1.0322265625, "reward_std": 0.2134747952222824, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9443359375, "rewards/tag_count_reward/std": 0.15664394199848175, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1353.1875, "completions/mean_terminated_length": 995.5029907226562, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.18298199197746864, "frac_reward_zero_std": 0.09375, "grad_norm": 90719.40849628203, "kl": 6036.0, "learning_rate": 1.9587266338696564e-05, "loss": 241.5728, "num_tokens": 271340174.0, "reward": 1.00048828125, "reward_std": 0.20038625597953796, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93408203125, "rewards/tag_count_reward/std": 0.1630660742521286, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 1473.97265625, "completions/mean_terminated_length": 1023.9512329101562, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.18332337629085943, "frac_reward_zero_std": 0.125, "grad_norm": 339.4425122649091, "kl": 70.4375, "learning_rate": 1.958387217388213e-05, "loss": 2.9132, "num_tokens": 272172304.0, "reward": 0.9736328125, "reward_std": 0.20018313825130463, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9189453125, "rewards/tag_count_reward/std": 0.18379150331020355, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 1499.20703125, "completions/mean_terminated_length": 1003.4572143554688, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.18366476060425024, "frac_reward_zero_std": 0.15625, "grad_norm": 102.45466461460207, "kl": 24.28125, "learning_rate": 1.958046440651406e-05, "loss": 1.0506, "num_tokens": 273015578.0, "reward": 1.0126953125, "reward_std": 0.21400108933448792, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9287109375, "rewards/tag_count_reward/std": 0.1663934737443924, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1362.626953125, "completions/mean_terminated_length": 952.2892456054688, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.18400614491764103, "frac_reward_zero_std": 0.21875, "grad_norm": 20.488275393898785, "kl": 3.861328125, "learning_rate": 1.957704304142906e-05, "loss": 0.2326, "num_tokens": 273795051.0, "reward": 1.0263671875, "reward_std": 0.20130415260791779, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9580078125, "rewards/tag_count_reward/std": 0.13793392479419708, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 1492.169921875, "completions/mean_terminated_length": 957.635986328125, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.18434752923103184, "frac_reward_zero_std": 0.15625, "grad_norm": 5.238027714997651, "kl": 1.357421875, "learning_rate": 1.9573608083483124e-05, "loss": 0.1241, "num_tokens": 274637650.0, "reward": 1.0078125, "reward_std": 0.1994955688714981, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.153242826461792, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 1173.232421875, "completions/mean_terminated_length": 994.097412109375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.18468891354442263, "frac_reward_zero_std": 0.4375, "grad_norm": 0.21484107353490794, "kl": 0.912109375, "learning_rate": 1.9570159537551553e-05, "loss": 0.0889, "num_tokens": 275324441.0, "reward": 1.06494140625, "reward_std": 0.15749263763427734, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.97900390625, "rewards/tag_count_reward/std": 0.09474232792854309, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 918.623046875, "completions/mean_terminated_length": 891.0798950195312, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.18503029785781344, "frac_reward_zero_std": 0.21875, "grad_norm": 27.741690042080013, "kl": 0.44189453125, "learning_rate": 1.9566697408528922e-05, "loss": 0.1027, "num_tokens": 275883480.0, "reward": 1.0712890625, "reward_std": 0.24440008401870728, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9560546875, "rewards/tag_count_reward/std": 0.1890403777360916, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1119.00390625, "completions/mean_terminated_length": 963.450927734375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.18537168217120423, "frac_reward_zero_std": 0.0, "grad_norm": 23713507.059220374, "kl": 838656.0, "learning_rate": 1.9563221701329094e-05, "loss": 33605.2539, "num_tokens": 276543610.0, "reward": 0.83935546875, "reward_std": 0.4097687900066376, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.75732421875, "rewards/tag_count_reward/std": 0.4040161371231079, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.421875, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1246.05078125, "completions/mean_terminated_length": 934.0953979492188, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.18571306648459504, "frac_reward_zero_std": 0.0, "grad_norm": 1685577.8039754147, "kl": 65024.0, "learning_rate": 1.95597324208852e-05, "loss": 2602.8381, "num_tokens": 277259604.0, "reward": 0.75, "reward_std": 0.5278667211532593, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.603515625, "rewards/tag_count_reward/std": 0.4533279240131378, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.078125, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 1145.947265625, "completions/mean_terminated_length": 820.7614135742188, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.18605445079798583, "frac_reward_zero_std": 0.03125, "grad_norm": 56.39515632562626, "kl": 2.03515625, "learning_rate": 1.9556229572149628e-05, "loss": 0.1737, "num_tokens": 277934393.0, "reward": 0.5380859375, "reward_std": 0.44693490862846375, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.4404296875, "rewards/tag_count_reward/std": 0.46064937114715576, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.046875, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 1176.025390625, "completions/mean_terminated_length": 756.4122314453125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.18639583511137664, "frac_reward_zero_std": 0.0, "grad_norm": 53.73872716114291, "kl": 2.484375, "learning_rate": 1.955271316009404e-05, "loss": 0.1983, "num_tokens": 278620262.0, "reward": 0.435546875, "reward_std": 0.4427300691604614, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.314453125, "rewards/tag_count_reward/std": 0.428036630153656, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1236.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 985.486328125, "completions/mean_terminated_length": 732.1224365234375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.18673721942476743, "frac_reward_zero_std": 0.125, "grad_norm": 4038.71234394653, "kl": 279.0, "learning_rate": 1.9549183189709335e-05, "loss": 11.2106, "num_tokens": 279207759.0, "reward": 0.15576171875, "reward_std": 0.2902292013168335, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.13427734375, "rewards/tag_count_reward/std": 0.3079378306865692, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1430.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 961.11328125, "completions/mean_terminated_length": 731.5, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.18707860373815824, "frac_reward_zero_std": 0.1875, "grad_norm": 303.27123800538965, "kl": 28.5, "learning_rate": 1.9545639666005663e-05, "loss": 1.1757, "num_tokens": 279790905.0, "reward": 0.1318359375, "reward_std": 0.23260755836963654, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.1005859375, "rewards/tag_count_reward/std": 0.2666007876396179, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 931.107421875, "completions/mean_terminated_length": 689.0294189453125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.18741998805154902, "frac_reward_zero_std": 0.1875, "grad_norm": 19.67723745912663, "kl": 4.390625, "learning_rate": 1.9542082594012406e-05, "loss": 0.1963, "num_tokens": 280341968.0, "reward": 0.14404296875, "reward_std": 0.20339080691337585, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.09521484375, "rewards/tag_count_reward/std": 0.2590656876564026, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 1098.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 776.654296875, "completions/mean_terminated_length": 664.2857666015625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.18776137236493984, "frac_reward_zero_std": 0.25, "grad_norm": 12.380428697232155, "kl": 5.046875, "learning_rate": 1.953851197877818e-05, "loss": 0.2143, "num_tokens": 280819743.0, "reward": 0.0732421875, "reward_std": 0.1378255933523178, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0537109375, "rewards/tag_count_reward/std": 0.18451793491840363, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 846.91796875, "completions/mean_terminated_length": 711.11767578125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.18810275667833062, "frac_reward_zero_std": 0.21875, "grad_norm": 17.20741584362489, "kl": 3.4375, "learning_rate": 1.9534927825370814e-05, "loss": 0.1591, "num_tokens": 281339733.0, "reward": 0.0859375, "reward_std": 0.19654256105422974, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.06640625, "rewards/tag_count_reward/std": 0.20745493471622467, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 1056.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 746.16796875, "completions/mean_terminated_length": 422.7692565917969, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.18844414099172144, "frac_reward_zero_std": 0.1875, "grad_norm": 102815.25858200013, "kl": 8016.0, "learning_rate": 1.9531330138877363e-05, "loss": 321.5016, "num_tokens": 281797083.0, "reward": 0.11767578125, "reward_std": 0.1902218759059906, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.017578125, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.08642578125, "rewards/tag_count_reward/std": 0.23193290829658508, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 940.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 745.63671875, "completions/mean_terminated_length": 476.77777099609375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.18878552530511222, "frac_reward_zero_std": 0.0, "grad_norm": 740128.518393557, "kl": 57600.0, "learning_rate": 1.9527718924404086e-05, "loss": 2303.4436, "num_tokens": 282260577.0, "reward": 0.22412109375, "reward_std": 0.30162039399147034, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.021484375, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.15966796875, "rewards/tag_count_reward/std": 0.314006507396698, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 1156.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 852.91796875, "completions/mean_terminated_length": 648.7142944335938, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.18912690961850304, "frac_reward_zero_std": 0.09375, "grad_norm": 2562.9815810343657, "kl": 266.5, "learning_rate": 1.9524094187076437e-05, "loss": 10.7013, "num_tokens": 282782327.0, "reward": 0.12548828125, "reward_std": 0.2327386736869812, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.10986328125, "rewards/tag_count_reward/std": 0.24383871257305145, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 874.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 757.830078125, "completions/mean_terminated_length": 367.20001220703125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.18946829393189382, "frac_reward_zero_std": 0.0, "grad_norm": 17.33861242127036, "kl": 3.67578125, "learning_rate": 1.9520455932039074e-05, "loss": 0.1796, "num_tokens": 283258864.0, "reward": 0.2060546875, "reward_std": 0.28254348039627075, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.25365594029426575, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 655.7578125, "completions/mean_terminated_length": 466.90911865234375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.18980967824528464, "frac_reward_zero_std": 0.0, "grad_norm": 14.998085012093503, "kl": 5.0234375, "learning_rate": 1.9516804164455828e-05, "loss": 0.2185, "num_tokens": 283679524.0, "reward": 0.158203125, "reward_std": 0.22537630796432495, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.150390625, "rewards/tag_count_reward/std": 0.21916458010673523, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 824.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 637.2421875, "completions/mean_terminated_length": 441.8000183105469, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.19015106255867542, "frac_reward_zero_std": 0.0, "grad_norm": 14.808593142359962, "kl": 4.40625, "learning_rate": 1.9513138889509717e-05, "loss": 0.1853, "num_tokens": 284085760.0, "reward": 0.189453125, "reward_std": 0.22907450795173645, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.005859375, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.181640625, "rewards/tag_count_reward/std": 0.22331088781356812, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 898.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 589.955078125, "completions/mean_terminated_length": 376.933349609375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.19049244687206623, "frac_reward_zero_std": 0.0, "grad_norm": 6.881607575417302, "kl": 4.1796875, "learning_rate": 1.950946011240293e-05, "loss": 0.1861, "num_tokens": 284470153.0, "reward": 0.22509765625, "reward_std": 0.24013200402259827, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.017578125, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.20166015625, "rewards/tag_count_reward/std": 0.23352187871932983, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 810.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 659.93359375, "completions/mean_terminated_length": 420.952392578125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.19083383118545702, "frac_reward_zero_std": 0.0, "grad_norm": 41781.741730202346, "kl": 4544.0, "learning_rate": 1.9505767838356818e-05, "loss": 181.8075, "num_tokens": 284888103.0, "reward": 0.31884765625, "reward_std": 0.30944299697875977, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.025390625, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.29150390625, "rewards/tag_count_reward/std": 0.2600449323654175, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 876.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 668.37109375, "completions/mean_terminated_length": 380.875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.19117521549884783, "frac_reward_zero_std": 0.0, "grad_norm": 425181.16854114877, "kl": 41792.0, "learning_rate": 1.950206207261189e-05, "loss": 1673.2413, "num_tokens": 285309365.0, "reward": 0.4169921875, "reward_std": 0.3771216571331024, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.044921875, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.27708959579467773, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 688.4375, "completions/mean_terminated_length": 454.5, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.19151659981223862, "frac_reward_zero_std": 0.0, "grad_norm": 21366.439563707045, "kl": 2448.0, "learning_rate": 1.9498342820427796e-05, "loss": 97.9572, "num_tokens": 285740533.0, "reward": 0.36572265625, "reward_std": 0.30031347274780273, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.029296875, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.33251953125, "rewards/tag_count_reward/std": 0.2410879135131836, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 659.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 601.20703125, "completions/mean_terminated_length": 339.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.19185798412562943, "frac_reward_zero_std": 0.03125, "grad_norm": 39.24729646734868, "kl": 10.0625, "learning_rate": 1.949461008708334e-05, "loss": 0.4176, "num_tokens": 286135711.0, "reward": 0.31787109375, "reward_std": 0.2223648726940155, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.30615234375, "rewards/tag_count_reward/std": 0.21478210389614105, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 491.48046875, "completions/mean_terminated_length": 287.79168701171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.19219936843902022, "frac_reward_zero_std": 0.03125, "grad_norm": 20.179966977857944, "kl": 5.0625, "learning_rate": 1.9490863877876455e-05, "loss": 0.2292, "num_tokens": 286468485.0, "reward": 0.361328125, "reward_std": 0.2560356557369232, "rewards/accuracy_reward/mean": 0.004032257944345474, "rewards/accuracy_reward/std": 0.06343588978052139, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.326171875, "rewards/tag_count_reward/std": 0.2357662171125412, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 642.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 524.189453125, "completions/mean_terminated_length": 335.71429443359375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.19254075275241103, "frac_reward_zero_std": 0.0625, "grad_norm": 13.69201864353312, "kl": 5.984375, "learning_rate": 1.94871041981242e-05, "loss": 0.2551, "num_tokens": 286829654.0, "reward": 0.31982421875, "reward_std": 0.19106550514698029, "rewards/accuracy_reward/mean": 0.009765625, "rewards/accuracy_reward/std": 0.09843364357948303, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.29052734375, "rewards/tag_count_reward/std": 0.18224281072616577, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 612.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 427.123046875, "completions/mean_terminated_length": 35.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.19288213706580182, "frac_reward_zero_std": 0.0, "grad_norm": 32.14341678143367, "kl": 6.734375, "learning_rate": 1.9483331053162747e-05, "loss": 0.2715, "num_tokens": 287128613.0, "reward": 0.205078125, "reward_std": 0.17534798383712769, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.205078125, "rewards/tag_count_reward/std": 0.18881024420261383, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 869.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 493.58984375, "completions/mean_terminated_length": 269.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.19322352137919263, "frac_reward_zero_std": 0.0, "grad_norm": 5176.7298866062765, "kl": 8.4921875, "learning_rate": 1.9479544448347393e-05, "loss": 0.3441, "num_tokens": 287469299.0, "reward": 0.15625, "reward_std": 0.18579982221126556, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.19968174397945404, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 732.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 580.501953125, "completions/mean_terminated_length": 280.4545593261719, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.19356490569258342, "frac_reward_zero_std": 0.0, "grad_norm": 192.4818269043784, "kl": 4.546875, "learning_rate": 1.9475744389052527e-05, "loss": 0.2005, "num_tokens": 287857380.0, "reward": 0.189453125, "reward_std": 0.21463382244110107, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.189453125, "rewards/tag_count_reward/std": 0.22932320833206177, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 1347.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 774.958984375, "completions/mean_terminated_length": 524.888916015625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.19390629000597423, "frac_reward_zero_std": 0.0, "grad_norm": 2720.316164371927, "kl": 4.01953125, "learning_rate": 1.9471930880671635e-05, "loss": 0.168, "num_tokens": 288333087.0, "reward": 0.19287109375, "reward_std": 0.22268745303153992, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.19091796875, "rewards/tag_count_reward/std": 0.22513431310653687, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 1062.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 764.626953125, "completions/mean_terminated_length": 441.5882263183594, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.19424767431936502, "frac_reward_zero_std": 0.0, "grad_norm": 611.4505134516988, "kl": 3.67578125, "learning_rate": 1.94681039286173e-05, "loss": 0.1704, "num_tokens": 288803120.0, "reward": 0.1728515625, "reward_std": 0.21897584199905396, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.2421068400144577, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 1387.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 1018.732421875, "completions/mean_terminated_length": 415.76470947265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.19458905863275583, "frac_reward_zero_std": 0.0, "grad_norm": 101.42138707180158, "kl": 3.15234375, "learning_rate": 1.9464263538321176e-05, "loss": 0.1679, "num_tokens": 289403639.0, "reward": 0.22705078125, "reward_std": 0.24914902448654175, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.22705078125, "rewards/tag_count_reward/std": 0.2595263123512268, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1074.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 937.4609375, "completions/mean_terminated_length": 633.4583740234375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.19493044294614661, "frac_reward_zero_std": 0.0, "grad_norm": 3512.889882506308, "kl": 3.7734375, "learning_rate": 1.9460409715233996e-05, "loss": 0.1789, "num_tokens": 289973811.0, "reward": 0.228515625, "reward_std": 0.2606068253517151, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.228515625, "rewards/tag_count_reward/std": 0.2815725803375244, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1245.166015625, "completions/mean_terminated_length": 505.7872314453125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.19527182725953743, "frac_reward_zero_std": 0.0, "grad_norm": 189.48238584308018, "kl": 3.328125, "learning_rate": 1.945654246482556e-05, "loss": 0.2144, "num_tokens": 290693576.0, "reward": 0.279296875, "reward_std": 0.28431689739227295, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.279296875, "rewards/tag_count_reward/std": 0.3046782910823822, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1602.7109375, "completions/mean_terminated_length": 666.551025390625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.19561321157292821, "frac_reward_zero_std": 0.0, "grad_norm": 95.99827734378142, "kl": 3.8515625, "learning_rate": 1.945266179258472e-05, "loss": 0.231, "num_tokens": 291596324.0, "reward": 0.32763671875, "reward_std": 0.28740912675857544, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.32763671875, "rewards/tag_count_reward/std": 0.3111635744571686, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1820.50390625, "completions/mean_terminated_length": 690.2083740234375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.19595459588631903, "frac_reward_zero_std": 0.0, "grad_norm": 737.993771223648, "kl": 5.515625, "learning_rate": 1.944876770401938e-05, "loss": 0.2986, "num_tokens": 292610870.0, "reward": 0.298828125, "reward_std": 0.29840362071990967, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.298828125, "rewards/tag_count_reward/std": 0.3136780560016632, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1908.912109375, "completions/mean_terminated_length": 718.5399780273438, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.1962959801997098, "frac_reward_zero_std": 0.0, "grad_norm": 140.49826403785798, "kl": 10.921875, "learning_rate": 1.9444860204656494e-05, "loss": 0.5599, "num_tokens": 293667945.0, "reward": 0.26171875, "reward_std": 0.3236551284790039, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.26171875, "rewards/tag_count_reward/std": 0.3229774832725525, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1656.232421875, "completions/mean_terminated_length": 661.442138671875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.19663736451310063, "frac_reward_zero_std": 0.0, "grad_norm": 20.093552331040637, "kl": 12.265625, "learning_rate": 1.9440939300042027e-05, "loss": 0.6548, "num_tokens": 294606528.0, "reward": 0.38671875, "reward_std": 0.3448584973812103, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.36493757367134094, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 1512.25390625, "completions/mean_terminated_length": 641.7938232421875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.1969787488264914, "frac_reward_zero_std": 0.0, "grad_norm": 34.18217758930206, "kl": 5.328125, "learning_rate": 1.9437004995741e-05, "loss": 0.3581, "num_tokens": 295466226.0, "reward": 0.39697265625, "reward_std": 0.3343315124511719, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39697265625, "rewards/tag_count_reward/std": 0.35871267318725586, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1445.708984375, "completions/mean_terminated_length": 671.41796875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.19732013313988223, "frac_reward_zero_std": 0.0, "grad_norm": 22.081398939809176, "kl": 5.453125, "learning_rate": 1.9433057297337432e-05, "loss": 0.372, "num_tokens": 296288493.0, "reward": 0.44677734375, "reward_std": 0.36073029041290283, "rewards/accuracy_reward/mean": 0.002016128972172737, "rewards/accuracy_reward/std": 0.044901326298713684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44482421875, "rewards/tag_count_reward/std": 0.38087546825408936, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 1149.0703125, "completions/mean_terminated_length": 656.8143920898438, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.197661517453273, "frac_reward_zero_std": 0.0, "grad_norm": 183.60443795930576, "kl": 27.125, "learning_rate": 1.942909621043436e-05, "loss": 1.2335, "num_tokens": 296961809.0, "reward": 0.5234375, "reward_std": 0.32795244455337524, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5234375, "rewards/tag_count_reward/std": 0.37820711731910706, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.578125, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1133.4609375, "completions/mean_terminated_length": 645.375732421875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.19800290176666382, "frac_reward_zero_std": 0.0, "grad_norm": 50.68176971807628, "kl": 14.328125, "learning_rate": 1.9425121740653824e-05, "loss": 0.7209, "num_tokens": 297626749.0, "reward": 0.55029296875, "reward_std": 0.3315295875072479, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55029296875, "rewards/tag_count_reward/std": 0.3648377060890198, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 1687.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 1164.78515625, "completions/mean_terminated_length": 635.2667236328125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1983442860800546, "frac_reward_zero_std": 0.0, "grad_norm": 31.674548395506253, "kl": 4.6484375, "learning_rate": 1.9421133893636856e-05, "loss": 0.2835, "num_tokens": 298310943.0, "reward": 0.4921875, "reward_std": 0.326386034488678, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.35216689109802246, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.828125, "completions/max_length": 1532.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 1146.15234375, "completions/mean_terminated_length": 729.5812377929688, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19868567039344542, "frac_reward_zero_std": 0.0, "grad_norm": 19.60879034961793, "kl": 4.15234375, "learning_rate": 1.941713267504347e-05, "loss": 0.268, "num_tokens": 298989469.0, "reward": 0.50830078125, "reward_std": 0.3417467474937439, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.50830078125, "rewards/tag_count_reward/std": 0.357070654630661, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.65625, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 1068.328125, "completions/mean_terminated_length": 727.5059204101562, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1990270547068362, "frac_reward_zero_std": 0.0, "grad_norm": 10.90468122589196, "kl": 7.3671875, "learning_rate": 1.9413118090552672e-05, "loss": 0.4032, "num_tokens": 299613477.0, "reward": 0.6025390625, "reward_std": 0.34706878662109375, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5966796875, "rewards/tag_count_reward/std": 0.3492775857448578, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 1286.455078125, "completions/mean_terminated_length": 801.2512817382812, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.19936843902022702, "frac_reward_zero_std": 0.0, "grad_norm": 108.65200021240719, "kl": 28.6875, "learning_rate": 1.9409090145862427e-05, "loss": 1.3109, "num_tokens": 300359726.0, "reward": 0.64208984375, "reward_std": 0.3368116021156311, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.63818359375, "rewards/tag_count_reward/std": 0.34525299072265625, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 1622.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 1075.6484375, "completions/mean_terminated_length": 715.6327514648438, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1997098233336178, "frac_reward_zero_std": 0.0, "grad_norm": 95.48081041322044, "kl": 27.15625, "learning_rate": 1.9405048846689668e-05, "loss": 1.2641, "num_tokens": 300985402.0, "reward": 0.67041015625, "reward_std": 0.33401262760162354, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66845703125, "rewards/tag_count_reward/std": 0.33444783091545105, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 1293.7265625, "completions/mean_terminated_length": 776.15087890625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20005120764700862, "frac_reward_zero_std": 0.0, "grad_norm": 4.388943580338186, "kl": 10.3671875, "learning_rate": 1.9400994198770277e-05, "loss": 0.6473, "num_tokens": 301731406.0, "reward": 0.69580078125, "reward_std": 0.3429878056049347, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68408203125, "rewards/tag_count_reward/std": 0.33788153529167175, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 1420.02734375, "completions/mean_terminated_length": 708.1808471679688, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2003925919603994, "frac_reward_zero_std": 0.0, "grad_norm": 21.29695891478487, "kl": 5.796875, "learning_rate": 1.9396926207859085e-05, "loss": 0.4327, "num_tokens": 302546524.0, "reward": 0.6640625, "reward_std": 0.30997902154922485, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.3141164183616638, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1185.01953125, "completions/mean_terminated_length": 696.609375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.20073397627379022, "frac_reward_zero_std": 0.0, "grad_norm": 52.623119865386975, "kl": 6.734375, "learning_rate": 1.9392844879729864e-05, "loss": 0.5239, "num_tokens": 303234614.0, "reward": 0.6357421875, "reward_std": 0.33507442474365234, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6357421875, "rewards/tag_count_reward/std": 0.3360030949115753, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 953.7109375, "completions/mean_terminated_length": 539.3390502929688, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.201075360587181, "frac_reward_zero_std": 0.0, "grad_norm": 8.954274309313597, "kl": 8.5625, "learning_rate": 1.938875022017531e-05, "loss": 0.5897, "num_tokens": 303806594.0, "reward": 0.6416015625, "reward_std": 0.3112793564796448, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6416015625, "rewards/tag_count_reward/std": 0.3250286281108856, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 731.966796875, "completions/mean_terminated_length": 489.2642822265625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.20141674490057182, "frac_reward_zero_std": 0.0, "grad_norm": 61.22114894433112, "kl": 17.078125, "learning_rate": 1.9384642235007044e-05, "loss": 0.9265, "num_tokens": 304254465.0, "reward": 0.65966796875, "reward_std": 0.30554234981536865, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.65771484375, "rewards/tag_count_reward/std": 0.30633246898651123, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.859375, "completions/max_length": 1099.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 623.7734375, "completions/mean_terminated_length": 496.31732177734375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2017581292139626, "frac_reward_zero_std": 0.0, "grad_norm": 44.585637016066556, "kl": 4.44921875, "learning_rate": 1.9380520930055603e-05, "loss": 0.3457, "num_tokens": 304661133.0, "reward": 0.7353515625, "reward_std": 0.2753329277038574, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.26539376378059387, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 1149.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 587.85546875, "completions/mean_terminated_length": 460.69744873046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.20209951352735342, "frac_reward_zero_std": 0.0, "grad_norm": 630.0363657589487, "kl": 4.62109375, "learning_rate": 1.9376386311170416e-05, "loss": 0.367, "num_tokens": 305041427.0, "reward": 0.74169921875, "reward_std": 0.28393739461898804, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72802734375, "rewards/tag_count_reward/std": 0.2773769795894623, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 1153.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 540.138671875, "completions/mean_terminated_length": 430.4104309082031, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2024408978407442, "frac_reward_zero_std": 0.0, "grad_norm": 43.55449582505704, "kl": 3.4375, "learning_rate": 1.937223838421983e-05, "loss": 0.3041, "num_tokens": 305405354.0, "reward": 0.6904296875, "reward_std": 0.2723011076450348, "rewards/accuracy_reward/mean": 0.002016128972172737, "rewards/accuracy_reward/std": 0.044901326298713684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6884765625, "rewards/tag_count_reward/std": 0.2715722620487213, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 416.65625, "completions/mean_terminated_length": 350.9729919433594, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.20278228215413502, "frac_reward_zero_std": 0.0, "grad_norm": 19.882684714974282, "kl": 4.7734375, "learning_rate": 1.9368077155091064e-05, "loss": 0.2928, "num_tokens": 305694906.0, "reward": 0.73779296875, "reward_std": 0.2595180869102478, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.72021484375, "rewards/tag_count_reward/std": 0.248708114027977, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 951.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 303.142578125, "completions/mean_terminated_length": 263.45318603515625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2031236664675258, "frac_reward_zero_std": 0.0, "grad_norm": 105.41401937030716, "kl": 18.0, "learning_rate": 1.936390262969022e-05, "loss": 0.7131, "num_tokens": 305930563.0, "reward": 0.716796875, "reward_std": 0.23446841537952423, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.712890625, "rewards/tag_count_reward/std": 0.23111625015735626, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 318.939453125, "completions/mean_terminated_length": 217.03648376464844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.20346505078091662, "frac_reward_zero_std": 0.0, "grad_norm": 53.9337225064596, "kl": 14.1875, "learning_rate": 1.9359714813942272e-05, "loss": 0.7393, "num_tokens": 306165444.0, "reward": 0.68212890625, "reward_std": 0.2535245716571808, "rewards/accuracy_reward/mean": 0.008064515888690948, "rewards/accuracy_reward/std": 0.0895301103591919, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.67236328125, "rewards/tag_count_reward/std": 0.2606872022151947, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 299.05078125, "completions/mean_terminated_length": 249.6695098876953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2038064350943074, "frac_reward_zero_std": 0.0, "grad_norm": 8.368197447430646, "kl": 4.27734375, "learning_rate": 1.9355513713791063e-05, "loss": 0.2872, "num_tokens": 306396094.0, "reward": 0.70654296875, "reward_std": 0.2459806203842163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.70263671875, "rewards/tag_count_reward/std": 0.24941787123680115, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 307.96875, "completions/mean_terminated_length": 269.98358154296875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.20414781940769822, "frac_reward_zero_std": 0.0, "grad_norm": 22.883270058128822, "kl": 2.50390625, "learning_rate": 1.9351299335199285e-05, "loss": 0.1992, "num_tokens": 306641838.0, "reward": 0.79150390625, "reward_std": 0.28061577677726746, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.75634765625, "rewards/tag_count_reward/std": 0.24094915390014648, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 400.45703125, "completions/mean_terminated_length": 254.3603515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.204489203721089, "frac_reward_zero_std": 0.0, "grad_norm": 34.11496508004652, "kl": 5.69921875, "learning_rate": 1.9347071684148475e-05, "loss": 0.5384, "num_tokens": 306938696.0, "reward": 0.765625, "reward_std": 0.2806857228279114, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.27623477578163147, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.359375, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 367.283203125, "completions/mean_terminated_length": 330.72613525390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.20483058803447982, "frac_reward_zero_std": 0.0, "grad_norm": 381.90884020350205, "kl": 2.548828125, "learning_rate": 1.9342830766639013e-05, "loss": 0.1582, "num_tokens": 307212329.0, "reward": 0.77783203125, "reward_std": 0.2214535027742386, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.77197265625, "rewards/tag_count_reward/std": 0.21925558149814606, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.171875, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 486.431640625, "completions/mean_terminated_length": 446.02178955078125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.20517197234787063, "frac_reward_zero_std": 0.0, "grad_norm": 16.084375981077955, "kl": 1.30859375, "learning_rate": 1.9338576588690103e-05, "loss": 0.0801, "num_tokens": 307543142.0, "reward": 0.85498046875, "reward_std": 0.20681136846542358, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.83935546875, "rewards/tag_count_reward/std": 0.1914231777191162, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 540.681640625, "completions/mean_terminated_length": 503.53662109375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.20551335666126141, "frac_reward_zero_std": 0.0, "grad_norm": 15.941114184904277, "kl": 3.01953125, "learning_rate": 1.9334309156339778e-05, "loss": 0.1753, "num_tokens": 307904531.0, "reward": 0.90771484375, "reward_std": 0.21625730395317078, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.001953125, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.88037109375, "rewards/tag_count_reward/std": 0.18629789352416992, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 592.2421875, "completions/mean_terminated_length": 556.2927856445312, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20585474097465223, "frac_reward_zero_std": 0.03125, "grad_norm": 74.11282046679982, "kl": 8.2265625, "learning_rate": 1.933002847564487e-05, "loss": 0.4159, "num_tokens": 308288239.0, "reward": 0.93212890625, "reward_std": 0.1774749755859375, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.10772226005792618, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.91650390625, "rewards/tag_count_reward/std": 0.16473926603794098, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.171875, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 611.603515625, "completions/mean_terminated_length": 565.1939086914062, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.20619612528804301, "frac_reward_zero_std": 0.0, "grad_norm": 20.546166706061463, "kl": 3.044921875, "learning_rate": 1.932573455268103e-05, "loss": 0.2098, "num_tokens": 308685972.0, "reward": 0.95166015625, "reward_std": 0.2125062644481659, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.91259765625, "rewards/tag_count_reward/std": 0.1649363487958908, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.171875, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 594.26171875, "completions/mean_terminated_length": 554.85400390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.20653750960143383, "frac_reward_zero_std": 0.0, "grad_norm": 207.5461162410878, "kl": 4.8125, "learning_rate": 1.932142739354269e-05, "loss": 0.2663, "num_tokens": 309068986.0, "reward": 0.90673828125, "reward_std": 0.2212596833705902, "rewards/accuracy_reward/mean": 0.009765625, "rewards/accuracy_reward/std": 0.09843364357948303, "rewards/format_reward/mean": 0.009765625, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.88720703125, "rewards/tag_count_reward/std": 0.2023526281118393, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 671.5859375, "completions/mean_terminated_length": 561.7932739257812, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2068788939148246, "frac_reward_zero_std": 0.0, "grad_norm": 16837.82821606865, "kl": 11.40625, "learning_rate": 1.9317107004343078e-05, "loss": 0.6495, "num_tokens": 309496246.0, "reward": 0.85986328125, "reward_std": 0.24510937929153442, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.84814453125, "rewards/tag_count_reward/std": 0.24181610345840454, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 742.919921875, "completions/mean_terminated_length": 528.9879150390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.20722027822821543, "frac_reward_zero_std": 0.0, "grad_norm": 4401.891050117203, "kl": 18.734375, "learning_rate": 1.9312773391214197e-05, "loss": 1.0633, "num_tokens": 309956477.0, "reward": 0.9306640625, "reward_std": 0.4780745506286621, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.15234375, "rewards/format_reward/std": 0.35970520973205566, "rewards/tag_count_reward/mean": 0.7724609375, "rewards/tag_count_reward/std": 0.30884450674057007, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 736.486328125, "completions/mean_terminated_length": 574.0950927734375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.2075616625416062, "frac_reward_zero_std": 0.0, "grad_norm": 75.32932924930279, "kl": 15.171875, "learning_rate": 1.9308426560306817e-05, "loss": 0.8803, "num_tokens": 310418870.0, "reward": 0.978515625, "reward_std": 0.49280229210853577, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.19921875, "rewards/format_reward/std": 0.39980348944664, "rewards/tag_count_reward/mean": 0.779296875, "rewards/tag_count_reward/std": 0.2990053594112396, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 928.310546875, "completions/mean_terminated_length": 669.9845581054688, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.20790304685499703, "frac_reward_zero_std": 0.0, "grad_norm": 57.47602829014051, "kl": 9.578125, "learning_rate": 1.9304066517790465e-05, "loss": 0.7099, "num_tokens": 310976549.0, "reward": 0.81201171875, "reward_std": 0.4603969156742096, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.123046875, "rewards/format_reward/std": 0.32881227135658264, "rewards/tag_count_reward/mean": 0.68701171875, "rewards/tag_count_reward/std": 0.3567386567592621, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1420.0390625, "completions/mean_terminated_length": 854.0303955078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.2082444311683878, "frac_reward_zero_std": 0.0, "grad_norm": 2864.6635432926955, "kl": 10.171875, "learning_rate": 1.9299693269853433e-05, "loss": 0.6729, "num_tokens": 311797337.0, "reward": 0.486328125, "reward_std": 0.3746780455112457, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.013671875, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.47265625, "rewards/tag_count_reward/std": 0.3750203847885132, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1982.05859375, "completions/mean_terminated_length": 1344.625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.20858581548177862, "frac_reward_zero_std": 0.0625, "grad_norm": 442.35424028939866, "kl": 8.3828125, "learning_rate": 1.929530682270274e-05, "loss": 0.3695, "num_tokens": 312896903.0, "reward": 0.09130859375, "reward_std": 0.1637609899044037, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.09130859375, "rewards/tag_count_reward/std": 0.2047327607870102, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 2018.544921875, "completions/mean_terminated_length": 1293.9500732421875, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.2089271997951694, "frac_reward_zero_std": 0.3125, "grad_norm": 267.25280987257287, "kl": 6.6484375, "learning_rate": 1.9290907182564146e-05, "loss": 0.2749, "num_tokens": 314020270.0, "reward": 0.02978515625, "reward_std": 0.0758780688047409, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02978515625, "rewards/tag_count_reward/std": 0.1137191578745842, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1980.392578125, "completions/mean_terminated_length": 1369.2745361328125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.20926858410856022, "frac_reward_zero_std": 0.03125, "grad_norm": 41.31514773925588, "kl": 5.1640625, "learning_rate": 1.9286494355682138e-05, "loss": 0.2497, "num_tokens": 315111591.0, "reward": 0.1259765625, "reward_std": 0.20236381888389587, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1259765625, "rewards/tag_count_reward/std": 0.23355714976787567, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1762.51171875, "completions/mean_terminated_length": 1116.98095703125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.209609968421951, "frac_reward_zero_std": 0.0, "grad_norm": 25.426304095178384, "kl": 5.34375, "learning_rate": 1.9282068348319913e-05, "loss": 0.3644, "num_tokens": 316097757.0, "reward": 0.35498046875, "reward_std": 0.33237552642822266, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35498046875, "rewards/tag_count_reward/std": 0.3598483204841614, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1263.337890625, "completions/mean_terminated_length": 922.2869873046875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.20995135273534182, "frac_reward_zero_std": 0.0, "grad_norm": 13.605893091956311, "kl": 13.359375, "learning_rate": 1.927762916675938e-05, "loss": 0.814, "num_tokens": 316839594.0, "reward": 0.673828125, "reward_std": 0.36832356452941895, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.013671875, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.3724021017551422, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1039.39453125, "completions/mean_terminated_length": 790.0280151367188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2102927370487326, "frac_reward_zero_std": 0.0, "grad_norm": 136664.24521862817, "kl": 24.359375, "learning_rate": 1.927317681730115e-05, "loss": 1.234, "num_tokens": 317461204.0, "reward": 0.81298828125, "reward_std": 0.3747243285179138, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.2275916188955307, "rewards/tag_count_reward/mean": 0.75830078125, "rewards/tag_count_reward/std": 0.3427391052246094, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1016.400390625, "completions/mean_terminated_length": 791.1143798828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.21063412136212342, "frac_reward_zero_std": 0.0, "grad_norm": 18950.94407135541, "kl": 84.90625, "learning_rate": 1.9268711306264512e-05, "loss": 3.6701, "num_tokens": 318070097.0, "reward": 0.79296875, "reward_std": 0.35344913601875305, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.06243881583213806, "rewards/format_reward/mean": 0.021484375, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.767578125, "rewards/tag_count_reward/std": 0.32497718930244446, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 994.798828125, "completions/mean_terminated_length": 771.8525390625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2109755056755142, "frac_reward_zero_std": 0.0, "grad_norm": 19.04287156882581, "kl": 10.03125, "learning_rate": 1.926423263998745e-05, "loss": 0.6961, "num_tokens": 318654186.0, "reward": 0.85302734375, "reward_std": 0.3852663040161133, "rewards/accuracy_reward/mean": 0.001953125, "rewards/accuracy_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 0.07421875, "rewards/format_reward/std": 0.2623828947544098, "rewards/tag_count_reward/mean": 0.77685546875, "rewards/tag_count_reward/std": 0.3410678505897522, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 880.802734375, "completions/mean_terminated_length": 701.2559204101562, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.21131688998890502, "frac_reward_zero_std": 0.03125, "grad_norm": 23.536981574142764, "kl": 11.75, "learning_rate": 1.9259740824826604e-05, "loss": 0.7117, "num_tokens": 319189157.0, "reward": 0.96044921875, "reward_std": 0.3863375782966614, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.126953125, "rewards/format_reward/std": 0.33324605226516724, "rewards/tag_count_reward/mean": 0.83349609375, "rewards/tag_count_reward/std": 0.2797574996948242, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 828.3515625, "completions/mean_terminated_length": 686.3525390625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2116582743022958, "frac_reward_zero_std": 0.0, "grad_norm": 16.635017375217778, "kl": 4.8515625, "learning_rate": 1.925523586715729e-05, "loss": 0.3964, "num_tokens": 319696377.0, "reward": 1.10400390625, "reward_std": 0.44946593046188354, "rewards/accuracy_reward/mean": 0.005859375, "rewards/accuracy_reward/std": 0.07639661431312561, "rewards/format_reward/mean": 0.23046875, "rewards/format_reward/std": 0.42154473066329956, "rewards/tag_count_reward/mean": 0.86767578125, "rewards/tag_count_reward/std": 0.24569696187973022, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.828125, "completions/max_length": 1429.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 709.994140625, "completions/mean_terminated_length": 633.725341796875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.21199965861568662, "frac_reward_zero_std": 0.03125, "grad_norm": 29.09021203711036, "kl": 6.0703125, "learning_rate": 1.9250717773373463e-05, "loss": 0.3885, "num_tokens": 320143398.0, "reward": 1.21142578125, "reward_std": 0.45997172594070435, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.279296875, "rewards/format_reward/std": 0.44909247756004333, "rewards/tag_count_reward/mean": 0.90087890625, "rewards/tag_count_reward/std": 0.20166675746440887, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 735.70703125, "completions/mean_terminated_length": 655.8369140625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.2123410429290774, "frac_reward_zero_std": 0.0, "grad_norm": 28.151340291064482, "kl": 5.73046875, "learning_rate": 1.924618654988774e-05, "loss": 0.4257, "num_tokens": 320603856.0, "reward": 1.42529296875, "reward_std": 0.557567834854126, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.44921875, "rewards/format_reward/std": 0.497901052236557, "rewards/tag_count_reward/mean": 0.92529296875, "rewards/tag_count_reward/std": 0.18679475784301758, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.265625, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 714.2578125, "completions/mean_terminated_length": 668.9849853515625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.21268242724246822, "frac_reward_zero_std": 0.0, "grad_norm": 49.04143402570228, "kl": 1.71484375, "learning_rate": 1.9241642203131356e-05, "loss": 0.2287, "num_tokens": 321048948.0, "reward": 1.43505859375, "reward_std": 0.5916279554367065, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.44921875, "rewards/format_reward/std": 0.497901052236557, "rewards/tag_count_reward/mean": 0.93896484375, "rewards/tag_count_reward/std": 0.17007049918174744, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 723.447265625, "completions/mean_terminated_length": 662.8890991210938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.213023811555859, "frac_reward_zero_std": 0.0, "grad_norm": 16.508996456014064, "kl": 1.26171875, "learning_rate": 1.9237084739554186e-05, "loss": 0.2096, "num_tokens": 321528633.0, "reward": 1.38623046875, "reward_std": 0.5709192156791687, "rewards/accuracy_reward/mean": 0.030241934582591057, "rewards/accuracy_reward/std": 0.1714252382516861, "rewards/format_reward/mean": 0.423828125, "rewards/format_reward/std": 0.4946470856666565, "rewards/tag_count_reward/mean": 0.93310546875, "rewards/tag_count_reward/std": 0.17637556791305542, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.421875, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 661.662109375, "completions/mean_terminated_length": 617.8104858398438, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.21336519586924982, "frac_reward_zero_std": 0.0, "grad_norm": 9.47200998880435, "kl": 1.76171875, "learning_rate": 1.9232514165624714e-05, "loss": 0.2454, "num_tokens": 321956700.0, "reward": 1.5859375, "reward_std": 0.5828517079353333, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.607421875, "rewards/format_reward/std": 0.4888018071651459, "rewards/tag_count_reward/mean": 0.939453125, "rewards/tag_count_reward/std": 0.16327762603759766, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 608.5859375, "completions/mean_terminated_length": 589.1748046875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.2137065801826406, "frac_reward_zero_std": 0.03125, "grad_norm": 401.9848462988652, "kl": 27.5625, "learning_rate": 1.9227930487830037e-05, "loss": 1.232, "num_tokens": 322354120.0, "reward": 1.66796875, "reward_std": 0.5318648815155029, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.68359375, "rewards/format_reward/std": 0.46552830934524536, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1495569944381714, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 626.048828125, "completions/mean_terminated_length": 595.2827758789062, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.21404796449603142, "frac_reward_zero_std": 0.0, "grad_norm": 2129.0019331087365, "kl": 142.5625, "learning_rate": 1.922333371267584e-05, "loss": 5.844, "num_tokens": 322758849.0, "reward": 1.724609375, "reward_std": 0.5159881114959717, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.724609375, "rewards/format_reward/std": 0.44714778661727905, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11885584890842438, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.703125, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 607.841796875, "completions/mean_terminated_length": 581.7687377929688, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2143893488094222, "frac_reward_zero_std": 0.0, "grad_norm": 438.9369390102765, "kl": 50.1875, "learning_rate": 1.921872384668641e-05, "loss": 2.1326, "num_tokens": 323155312.0, "reward": 1.70458984375, "reward_std": 0.5371509790420532, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.71484375, "rewards/format_reward/std": 0.45193037390708923, "rewards/tag_count_reward/mean": 0.96044921875, "rewards/tag_count_reward/std": 0.1399720162153244, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 541.873046875, "completions/mean_terminated_length": 527.6092529296875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.21473073312281302, "frac_reward_zero_std": 0.0, "grad_norm": 8.035244593931369, "kl": 3.515625, "learning_rate": 1.9214100896404607e-05, "loss": 0.2415, "num_tokens": 323521471.0, "reward": 1.71240234375, "reward_std": 0.5006308555603027, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.724609375, "rewards/format_reward/std": 0.44714778661727905, "rewards/tag_count_reward/mean": 0.96630859375, "rewards/tag_count_reward/std": 0.11042812466621399, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 542.857421875, "completions/mean_terminated_length": 525.9818115234375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2150721174362038, "frac_reward_zero_std": 0.0, "grad_norm": 6.450574313884225, "kl": 1.033203125, "learning_rate": 1.9209464868391863e-05, "loss": 0.1404, "num_tokens": 323877990.0, "reward": 1.69140625, "reward_std": 0.5426321029663086, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.45732781291007996, "rewards/tag_count_reward/mean": 0.955078125, "rewards/tag_count_reward/std": 0.13520874083042145, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 511.900390625, "completions/mean_terminated_length": 495.50201416015625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.21541350174959462, "frac_reward_zero_std": 0.03125, "grad_norm": 9.594420478333612, "kl": 0.6201171875, "learning_rate": 1.9204815769228174e-05, "loss": 0.111, "num_tokens": 324220131.0, "reward": 1.67822265625, "reward_std": 0.5660897493362427, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.701171875, "rewards/format_reward/std": 0.45819199085235596, "rewards/tag_count_reward/mean": 0.94384765625, "rewards/tag_count_reward/std": 0.1591816246509552, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 484.42578125, "completions/mean_terminated_length": 475.19879150390625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2157548860629854, "frac_reward_zero_std": 0.0, "grad_norm": 4.909454593870483, "kl": 0.73046875, "learning_rate": 1.9200153605512085e-05, "loss": 0.0995, "num_tokens": 324550989.0, "reward": 1.69287109375, "reward_std": 0.5149257183074951, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.712890625, "rewards/format_reward/std": 0.45285552740097046, "rewards/tag_count_reward/mean": 0.95458984375, "rewards/tag_count_reward/std": 0.1390611082315445, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.640625, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 509.71484375, "completions/mean_terminated_length": 493.13494873046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.21609627037637621, "frac_reward_zero_std": 0.03125, "grad_norm": 7.91552869749862, "kl": 1.53125, "learning_rate": 1.9195478383860684e-05, "loss": 0.1543, "num_tokens": 324891099.0, "reward": 1.7763671875, "reward_std": 0.48613423109054565, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.41380295157432556, "rewards/tag_count_reward/mean": 0.9638671875, "rewards/tag_count_reward/std": 0.10796592384576797, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 451.150390625, "completions/mean_terminated_length": 445.481201171875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.216437654689767, "frac_reward_zero_std": 0.03125, "grad_norm": 10.8425320836682, "kl": 2.931640625, "learning_rate": 1.919079011090959e-05, "loss": 0.1808, "num_tokens": 325200568.0, "reward": 1.85546875, "reward_std": 0.37818431854248047, "rewards/accuracy_reward/mean": 0.012096773833036423, "rewards/accuracy_reward/std": 0.10942844301462173, "rewards/format_reward/mean": 0.865234375, "rewards/format_reward/std": 0.3418070077896118, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.0783722847700119, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 525.146484375, "completions/mean_terminated_length": 515.8048095703125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.21677903900315781, "frac_reward_zero_std": 0.0625, "grad_norm": 118.36484086700797, "kl": 19.40625, "learning_rate": 1.918608879331296e-05, "loss": 0.826, "num_tokens": 325554355.0, "reward": 1.89306640625, "reward_std": 0.37193071842193604, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31241437792778015, "rewards/tag_count_reward/mean": 0.97119140625, "rewards/tag_count_reward/std": 0.0977993980050087, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 540.34765625, "completions/mean_terminated_length": 532.1549072265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2171204233165486, "frac_reward_zero_std": 0.09375, "grad_norm": 50.29668709674924, "kl": 8.62109375, "learning_rate": 1.918137443774344e-05, "loss": 0.387, "num_tokens": 325912693.0, "reward": 1.9189453125, "reward_std": 0.3707965016365051, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.29713961482048035, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.0921514481306076, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.671875, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 540.947265625, "completions/mean_terminated_length": 528.433837890625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.2174618076299394, "frac_reward_zero_std": 0.09375, "grad_norm": 3.4557853313487943, "kl": 2.9677734375, "learning_rate": 1.91766470508922e-05, "loss": 0.1865, "num_tokens": 326275082.0, "reward": 1.88623046875, "reward_std": 0.40363866090774536, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.880859375, "rewards/format_reward/std": 0.32427072525024414, "rewards/tag_count_reward/mean": 0.96826171875, "rewards/tag_count_reward/std": 0.1087804064154625, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 520.98828125, "completions/mean_terminated_length": 512.777099609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2178031919433302, "frac_reward_zero_std": 0.0625, "grad_norm": 7.917572397801049, "kl": 0.6171875, "learning_rate": 1.9171906639468908e-05, "loss": 0.0786, "num_tokens": 326615956.0, "reward": 1.9423828125, "reward_std": 0.34262293577194214, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2746807038784027, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.08358203619718552, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.765625, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 510.7421875, "completions/mean_terminated_length": 500.7182922363281, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.218144576256721, "frac_reward_zero_std": 0.21875, "grad_norm": 9.273150814014235, "kl": 0.68798828125, "learning_rate": 1.9167153210201702e-05, "loss": 0.0933, "num_tokens": 326956272.0, "reward": 1.95166015625, "reward_std": 0.28634098172187805, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.935546875, "rewards/format_reward/std": 0.24579854309558868, "rewards/tag_count_reward/mean": 0.98095703125, "rewards/tag_count_reward/std": 0.08424676209688187, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 514.96875, "completions/mean_terminated_length": 514.1019897460938, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2184859605701118, "frac_reward_zero_std": 0.34375, "grad_norm": 27.89121423103594, "kl": 2.7470703125, "learning_rate": 1.916238676983721e-05, "loss": 0.1281, "num_tokens": 327301328.0, "reward": 2.009765625, "reward_std": 0.2296554446220398, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.0685935989022255, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 529.94140625, "completions/mean_terminated_length": 524.8294067382812, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2188273448835026, "frac_reward_zero_std": 0.34375, "grad_norm": 118.82588654925867, "kl": 10.2216796875, "learning_rate": 1.9157607325140522e-05, "loss": 0.4416, "num_tokens": 327652930.0, "reward": 1.99658203125, "reward_std": 0.21616914868354797, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.05635581538081169, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 500.2421875, "completions/mean_terminated_length": 495.7032165527344, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2191687291968934, "frac_reward_zero_std": 0.375, "grad_norm": 25.556274231531695, "kl": 3.17041015625, "learning_rate": 1.915281488289519e-05, "loss": 0.1605, "num_tokens": 327992254.0, "reward": 1.99853515625, "reward_std": 0.212384894490242, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.07751258462667465, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 523.125, "completions/mean_terminated_length": 517.3984375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2195101135102842, "frac_reward_zero_std": 0.34375, "grad_norm": 6.580658200618257, "kl": 0.32666015625, "learning_rate": 1.91480094499032e-05, "loss": 0.0531, "num_tokens": 328345454.0, "reward": 1.97900390625, "reward_std": 0.22517889738082886, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21157780289649963, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.06779315322637558, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 498.56640625, "completions/mean_terminated_length": 492.1457214355469, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.219851497823675, "frac_reward_zero_std": 0.28125, "grad_norm": 6.192302494807084, "kl": 0.3212890625, "learning_rate": 1.914319103298499e-05, "loss": 0.0483, "num_tokens": 328680352.0, "reward": 1.9912109375, "reward_std": 0.255888432264328, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.951171875, "rewards/format_reward/std": 0.2157193273305893, "rewards/tag_count_reward/mean": 0.9833984375, "rewards/tag_count_reward/std": 0.07145245373249054, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 525.345703125, "completions/mean_terminated_length": 522.4209594726562, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2201928821370658, "frac_reward_zero_std": 0.46875, "grad_norm": 3.9375759845522635, "kl": 0.30419921875, "learning_rate": 1.9138359638979423e-05, "loss": 0.0372, "num_tokens": 329029841.0, "reward": 1.98583984375, "reward_std": 0.16634424030780792, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.058665309101343155, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.796875, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 537.517578125, "completions/mean_terminated_length": 531.1703491210938, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2205342664504566, "frac_reward_zero_std": 0.5, "grad_norm": 5.6799442115696435, "kl": 0.6015625, "learning_rate": 1.9133515274743773e-05, "loss": 0.0648, "num_tokens": 329399530.0, "reward": 1.96435546875, "reward_std": 0.18469011783599854, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.06234500929713249, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 577.1015625, "completions/mean_terminated_length": 573.3280639648438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2208756507638474, "frac_reward_zero_std": 0.5, "grad_norm": 404.73990960555614, "kl": 46.3828125, "learning_rate": 1.9128657947153737e-05, "loss": 1.8815, "num_tokens": 329781598.0, "reward": 1.99755859375, "reward_std": 0.18550288677215576, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.0780286192893982, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 604.380859375, "completions/mean_terminated_length": 602.5068969726562, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2212170350772382, "frac_reward_zero_std": 0.375, "grad_norm": 396.6304619990057, "kl": 41.590087890625, "learning_rate": 1.9123787663103398e-05, "loss": 1.685, "num_tokens": 330168609.0, "reward": 2.03271484375, "reward_std": 0.1982235610485077, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.03936556726694107, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 627.12890625, "completions/mean_terminated_length": 624.6672973632812, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.221558419390629, "frac_reward_zero_std": 0.4375, "grad_norm": 383.44415724984606, "kl": 44.3427734375, "learning_rate": 1.9118904429505236e-05, "loss": 1.7967, "num_tokens": 330568979.0, "reward": 2.03662109375, "reward_std": 0.17710793018341064, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.04235878214240074, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 645.3125, "completions/mean_terminated_length": 642.4066772460938, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2218998037040198, "frac_reward_zero_std": 0.5, "grad_norm": 15.360655790079436, "kl": 2.684326171875, "learning_rate": 1.9114008253290108e-05, "loss": 0.1302, "num_tokens": 330985747.0, "reward": 2.111328125, "reward_std": 0.1888924390077591, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.060067638754844666, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 692.5625, "completions/mean_terminated_length": 683.0040283203125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2222411880174106, "frac_reward_zero_std": 0.40625, "grad_norm": 5.338008194490763, "kl": 1.134765625, "learning_rate": 1.9109099141407235e-05, "loss": 0.0979, "num_tokens": 331419171.0, "reward": 1.98046875, "reward_std": 0.20367029309272766, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05942792445421219, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 722.71875, "completions/mean_terminated_length": 716.4561767578125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2225825723308014, "frac_reward_zero_std": 0.4375, "grad_norm": 5.050122480796413, "kl": 0.35498046875, "learning_rate": 1.910417710082421e-05, "loss": 0.0409, "num_tokens": 331875171.0, "reward": 2.0087890625, "reward_std": 0.22241222858428955, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.06322204321622849, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.703125, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 750.97265625, "completions/mean_terminated_length": 738.1156005859375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2229239566441922, "frac_reward_zero_std": 0.34375, "grad_norm": 7.612432344026052, "kl": 0.5380859375, "learning_rate": 1.9099242138526967e-05, "loss": 0.0688, "num_tokens": 332344837.0, "reward": 1.99609375, "reward_std": 0.2702275514602661, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.947265625, "rewards/format_reward/std": 0.22372129559516907, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.06301766633987427, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 812.064453125, "completions/mean_terminated_length": 795.686279296875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.223265340957583, "frac_reward_zero_std": 0.34375, "grad_norm": 6.67535145341048, "kl": 1.02392578125, "learning_rate": 1.9094294261519787e-05, "loss": 0.0822, "num_tokens": 332847846.0, "reward": 1.97607421875, "reward_std": 0.23755869269371033, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.98779296875, "rewards/tag_count_reward/std": 0.06234500929713249, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 792.6484375, "completions/mean_terminated_length": 784.7884521484375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2236067252709738, "frac_reward_zero_std": 0.3125, "grad_norm": 3.6329726540880105, "kl": 0.3759765625, "learning_rate": 1.9089333476825265e-05, "loss": 0.0495, "num_tokens": 333331890.0, "reward": 2.0029296875, "reward_std": 0.24777719378471375, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.05074625089764595, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 777.419921875, "completions/mean_terminated_length": 767.2100219726562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2239481095843646, "frac_reward_zero_std": 0.28125, "grad_norm": 63.14404439826683, "kl": 13.65283203125, "learning_rate": 1.9084359791484338e-05, "loss": 0.5885, "num_tokens": 333811225.0, "reward": 2.06005859375, "reward_std": 0.27992069721221924, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.958984375, "rewards/format_reward/std": 0.19852031767368317, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.04483545944094658, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 800.044921875, "completions/mean_terminated_length": 790.5438232421875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.2242894938977554, "frac_reward_zero_std": 0.34375, "grad_norm": 667.793877878653, "kl": 112.0625, "learning_rate": 1.9079373212556242e-05, "loss": 4.5143, "num_tokens": 334298880.0, "reward": 1.998046875, "reward_std": 0.20138373970985413, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043540701270103455, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 778.615234375, "completions/mean_terminated_length": 773.7371826171875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2246308782111462, "frac_reward_zero_std": 0.5, "grad_norm": 287.3021892344694, "kl": 47.808837890625, "learning_rate": 1.9074373747118504e-05, "loss": 1.9272, "num_tokens": 334772171.0, "reward": 1.99658203125, "reward_std": 0.1589832752943039, "rewards/accuracy_reward/mean": 0.030241934582591057, "rewards/accuracy_reward/std": 0.1714252382516861, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.03951093927025795, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 818.017578125, "completions/mean_terminated_length": 810.4351196289062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.224972262524537, "frac_reward_zero_std": 0.28125, "grad_norm": 89.21748807295326, "kl": 20.4345703125, "learning_rate": 1.9069361402266962e-05, "loss": 0.8521, "num_tokens": 335278292.0, "reward": 2.0244140625, "reward_std": 0.25218287110328674, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.055701348930597305, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 792.76171875, "completions/mean_terminated_length": 787.1485595703125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.2253136468379278, "frac_reward_zero_std": 0.3125, "grad_norm": 7.961848609570634, "kl": 3.1083984375, "learning_rate": 1.9064336185115718e-05, "loss": 0.1515, "num_tokens": 335763466.0, "reward": 2.0087890625, "reward_std": 0.23141621053218842, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.05346047133207321, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 778.94921875, "completions/mean_terminated_length": 770.2381591796875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.2256550311513186, "frac_reward_zero_std": 0.40625, "grad_norm": 1.3934875944769056, "kl": 1.02099609375, "learning_rate": 1.9059298102797145e-05, "loss": 0.0702, "num_tokens": 336245984.0, "reward": 2.00048828125, "reward_std": 0.21550479531288147, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.98876953125, "rewards/tag_count_reward/std": 0.06632548570632935, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 742.865234375, "completions/mean_terminated_length": 736.761962890625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2259964154647094, "frac_reward_zero_std": 0.3125, "grad_norm": 3.5523223479348993, "kl": 0.49609375, "learning_rate": 1.9054247162461888e-05, "loss": 0.0424, "num_tokens": 336709595.0, "reward": 2.06591796875, "reward_std": 0.2498883306980133, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.98974609375, "rewards/tag_count_reward/std": 0.06829868257045746, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 730.70703125, "completions/mean_terminated_length": 726.2138671875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2263377997781002, "frac_reward_zero_std": 0.53125, "grad_norm": 2.787916955113399, "kl": 0.8095703125, "learning_rate": 1.9049183371278828e-05, "loss": 0.0586, "num_tokens": 337173877.0, "reward": 2.0341796875, "reward_std": 0.17134781181812286, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.059932272881269455, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 655.431640625, "completions/mean_terminated_length": 654.9080200195312, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.226679184091491, "frac_reward_zero_std": 0.5, "grad_norm": 0.7111506147017174, "kl": 0.271728515625, "learning_rate": 1.9044106736435092e-05, "loss": 0.0208, "num_tokens": 337587762.0, "reward": 2.03662109375, "reward_std": 0.15430888533592224, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05028042942285538, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 639.7578125, "completions/mean_terminated_length": 636.4181518554688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2270205684048818, "frac_reward_zero_std": 0.59375, "grad_norm": 18.686902223353304, "kl": 2.625, "learning_rate": 1.9039017265136032e-05, "loss": 0.1257, "num_tokens": 338002246.0, "reward": 1.97607421875, "reward_std": 0.14811784029006958, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.05897396802902222, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 635.35546875, "completions/mean_terminated_length": 634.001953125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2273619527182726, "frac_reward_zero_std": 0.53125, "grad_norm": 7.3754155573777185, "kl": 1.14794921875, "learning_rate": 1.903391496460522e-05, "loss": 0.0486, "num_tokens": 338412300.0, "reward": 2.07177734375, "reward_std": 0.17919717729091644, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 586.830078125, "completions/mean_terminated_length": 584.0453491210938, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2277033370316634, "frac_reward_zero_std": 0.28125, "grad_norm": 10.887755463807872, "kl": 1.90869140625, "learning_rate": 1.9028799842084442e-05, "loss": 0.0816, "num_tokens": 338797205.0, "reward": 2.01220703125, "reward_std": 0.25288158655166626, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.06284875422716141, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 603.072265625, "completions/mean_terminated_length": 599.8836669921875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.22804472134505419, "frac_reward_zero_std": 0.34375, "grad_norm": 1.1828626740784682, "kl": 1.0478515625, "learning_rate": 1.9023671904833682e-05, "loss": 0.0724, "num_tokens": 339184890.0, "reward": 2.02294921875, "reward_std": 0.2530953586101532, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.958984375, "rewards/format_reward/std": 0.19852031767368317, "rewards/tag_count_reward/mean": 0.98583984375, "rewards/tag_count_reward/std": 0.08375762403011322, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 620.169921875, "completions/mean_terminated_length": 617.2750854492188, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.228386105658445, "frac_reward_zero_std": 0.40625, "grad_norm": 1.5083432270083068, "kl": 0.2890625, "learning_rate": 1.9018531160131103e-05, "loss": 0.0329, "num_tokens": 339585713.0, "reward": 1.99462890625, "reward_std": 0.21819886565208435, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.036283548921346664, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 628.984375, "completions/mean_terminated_length": 626.7171020507812, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.22872748997183578, "frac_reward_zero_std": 0.46875, "grad_norm": 1.5110832463457617, "kl": 0.236328125, "learning_rate": 1.9013377615273054e-05, "loss": 0.0265, "num_tokens": 339995241.0, "reward": 2.0673828125, "reward_std": 0.20106813311576843, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.06938997656106949, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 622.55078125, "completions/mean_terminated_length": 621.1470947265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2290688742852266, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0939794005386914, "kl": 0.2158203125, "learning_rate": 1.9008211277574047e-05, "loss": 0.0123, "num_tokens": 340387619.0, "reward": 2.07275390625, "reward_std": 0.1632240116596222, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 689.55078125, "completions/mean_terminated_length": 689.55078125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.22941025859861738, "frac_reward_zero_std": 0.625, "grad_norm": 0.2443629141872389, "kl": 0.199462890625, "learning_rate": 1.9003032154366762e-05, "loss": 0.0158, "num_tokens": 340848573.0, "reward": 2.00537109375, "reward_std": 0.12854893505573273, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 709.365234375, "completions/mean_terminated_length": 708.1569213867188, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2297516429120082, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5054882210597255, "kl": 0.396728515625, "learning_rate": 1.8997840253002008e-05, "loss": 0.0255, "num_tokens": 341296744.0, "reward": 2.04345703125, "reward_std": 0.20704588294029236, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 714.119140625, "completions/mean_terminated_length": 714.119140625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.23009302722539898, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17614100945140954, "kl": 0.1806640625, "learning_rate": 1.8992635580848738e-05, "loss": 0.0259, "num_tokens": 341744837.0, "reward": 1.978515625, "reward_std": 0.23541906476020813, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.939453125, "rewards/format_reward/std": 0.2387305200099945, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043540701270103455, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 774.818359375, "completions/mean_terminated_length": 771.218505859375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.2304344115387898, "frac_reward_zero_std": 0.34375, "grad_norm": 22.82366035495352, "kl": 3.71875, "learning_rate": 1.8987418145294045e-05, "loss": 0.1677, "num_tokens": 342228376.0, "reward": 2.00390625, "reward_std": 0.2543540894985199, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.06925903260707855, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 773.7890625, "completions/mean_terminated_length": 773.7890625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.23077579585218058, "frac_reward_zero_std": 0.4375, "grad_norm": 0.16929337201530145, "kl": 0.170654296875, "learning_rate": 1.898218795374311e-05, "loss": 0.0145, "num_tokens": 342711148.0, "reward": 2.091796875, "reward_std": 0.20242804288864136, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 798.392578125, "completions/mean_terminated_length": 797.0587158203125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.2311171801655714, "frac_reward_zero_std": 0.46875, "grad_norm": 9.018447669248332, "kl": 2.083740234375, "learning_rate": 1.897694501361924e-05, "loss": 0.0928, "num_tokens": 343209061.0, "reward": 1.9951171875, "reward_std": 0.1840175986289978, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06218579038977623, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 803.392578125, "completions/mean_terminated_length": 800.3182983398438, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.23145856447896218, "frac_reward_zero_std": 0.46875, "grad_norm": 1.0671147728395016, "kl": 0.83056640625, "learning_rate": 1.8971689332363833e-05, "loss": 0.0499, "num_tokens": 343697662.0, "reward": 2.0771484375, "reward_std": 0.20392721891403198, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 822.130859375, "completions/mean_terminated_length": 821.6594848632812, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.231799948792353, "frac_reward_zero_std": 0.53125, "grad_norm": 0.5007265332776789, "kl": 0.205810546875, "learning_rate": 1.8966420917436357e-05, "loss": 0.0104, "num_tokens": 344202977.0, "reward": 2.0595703125, "reward_std": 0.17628931999206543, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06412246823310852, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 809.2265625, "completions/mean_terminated_length": 807.3823852539062, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.23214133310574378, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6176997264902349, "kl": 0.183349609375, "learning_rate": 1.8961139776314374e-05, "loss": 0.0082, "num_tokens": 344708357.0, "reward": 1.89208984375, "reward_std": 0.48240748047828674, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.865234375, "rewards/format_reward/std": 0.3418070077896118, "rewards/tag_count_reward/mean": 0.94482421875, "rewards/tag_count_reward/std": 0.14674407243728638, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 692.21875, "completions/mean_terminated_length": 690.998046875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.2324827174191346, "frac_reward_zero_std": 0.03125, "grad_norm": 0.3150256593380208, "kl": 0.3212890625, "learning_rate": 1.8955845916493493e-05, "loss": 0.0095, "num_tokens": 345142293.0, "reward": 1.66064453125, "reward_std": 0.6749235391616821, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.705078125, "rewards/format_reward/std": 0.4564536213874817, "rewards/tag_count_reward/mean": 0.85009765625, "rewards/tag_count_reward/std": 0.2301088571548462, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 661.826171875, "completions/mean_terminated_length": 661.5714111328125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.23282410173252538, "frac_reward_zero_std": 0.25, "grad_norm": 0.3628962307601668, "kl": 0.20458984375, "learning_rate": 1.8950539345487376e-05, "loss": 0.0207, "num_tokens": 345562572.0, "reward": 1.98974609375, "reward_std": 0.31024956703186035, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.943359375, "rewards/format_reward/std": 0.23138070106506348, "rewards/tag_count_reward/mean": 0.97802734375, "rewards/tag_count_reward/std": 0.08919412642717361, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 624.216796875, "completions/mean_terminated_length": 622.6149291992188, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2331654860459162, "frac_reward_zero_std": 0.375, "grad_norm": 1.2960423730141202, "kl": 0.342529296875, "learning_rate": 1.894522007082774e-05, "loss": 0.0208, "num_tokens": 345972555.0, "reward": 2.01220703125, "reward_std": 0.23095479607582092, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.06476560980081558, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 556.41796875, "completions/mean_terminated_length": 555.498046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.23350687035930698, "frac_reward_zero_std": 0.28125, "grad_norm": 0.49292943835790926, "kl": 0.43310546875, "learning_rate": 1.8939888100064314e-05, "loss": 0.0322, "num_tokens": 346337297.0, "reward": 1.99658203125, "reward_std": 0.2540942430496216, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.97119140625, "rewards/tag_count_reward/std": 0.11070467531681061, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 514.529296875, "completions/mean_terminated_length": 513.2549438476562, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2338482546726978, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6282785339933225, "kl": 0.48876953125, "learning_rate": 1.8934543440764854e-05, "loss": 0.041, "num_tokens": 346685488.0, "reward": 2.01171875, "reward_std": 0.1696675568819046, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.057698383927345276, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 555.544921875, "completions/mean_terminated_length": 554.7549438476562, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.23418963898608858, "frac_reward_zero_std": 0.5, "grad_norm": 31.797527351434393, "kl": 5.181884765625, "learning_rate": 1.8929186100515137e-05, "loss": 0.2172, "num_tokens": 347051815.0, "reward": 2.04296875, "reward_std": 0.20281068980693817, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 612.16015625, "completions/mean_terminated_length": 611.5968627929688, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.2345310232994794, "frac_reward_zero_std": 0.46875, "grad_norm": 20.890981587157547, "kl": 3.8681640625, "learning_rate": 1.8923816086918916e-05, "loss": 0.1621, "num_tokens": 347450249.0, "reward": 2.015625, "reward_std": 0.17003706097602844, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 836.3984375, "completions/mean_terminated_length": 832.5855102539062, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.23487240761287018, "frac_reward_zero_std": 0.34375, "grad_norm": 5.058336805530098, "kl": 1.386474609375, "learning_rate": 1.8918433407597948e-05, "loss": 0.0968, "num_tokens": 347987621.0, "reward": 1.9482421875, "reward_std": 0.20006674528121948, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.951171875, "rewards/format_reward/std": 0.2157193273305893, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.07256709784269333, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 909.94140625, "completions/mean_terminated_length": 900.9802856445312, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.235213791926261, "frac_reward_zero_std": 0.34375, "grad_norm": 0.26211177846991773, "kl": 0.255126953125, "learning_rate": 1.8913038070191964e-05, "loss": 0.0616, "num_tokens": 348539127.0, "reward": 1.95654296875, "reward_std": 0.23301362991333008, "rewards/accuracy_reward/mean": 0.026209676638245583, "rewards/accuracy_reward/std": 0.1599196344614029, "rewards/format_reward/mean": 0.939453125, "rewards/format_reward/std": 0.2387305200099945, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.047485124319791794, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1001.96875, "completions/mean_terminated_length": 997.86669921875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.23555517623965178, "frac_reward_zero_std": 0.53125, "grad_norm": 0.17401548201612058, "kl": 0.18408203125, "learning_rate": 1.8907630082358657e-05, "loss": 0.0339, "num_tokens": 349137063.0, "reward": 1.998046875, "reward_std": 0.16825857758522034, "rewards/accuracy_reward/mean": 0.038306452333927155, "rewards/accuracy_reward/std": 0.19212885200977325, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 928.494140625, "completions/mean_terminated_length": 927.3619995117188, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2358965605530426, "frac_reward_zero_std": 0.375, "grad_norm": 0.5036971340276302, "kl": 0.186767578125, "learning_rate": 1.8902209451773674e-05, "loss": 0.0319, "num_tokens": 349694676.0, "reward": 2.00732421875, "reward_std": 0.2198643684387207, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.056862205266952515, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 968.47265625, "completions/mean_terminated_length": 965.0628051757812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.23623794486643337, "frac_reward_zero_std": 0.5625, "grad_norm": 1.3651489070730412, "kl": 0.221923828125, "learning_rate": 1.889677618613061e-05, "loss": 0.0391, "num_tokens": 350285158.0, "reward": 1.98876953125, "reward_std": 0.1439269483089447, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.03951093927025795, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 843.22265625, "completions/mean_terminated_length": 842.387451171875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.2365793291798242, "frac_reward_zero_std": 0.375, "grad_norm": 0.4911460530903395, "kl": 0.198974609375, "learning_rate": 1.8891330293140995e-05, "loss": 0.0244, "num_tokens": 350792200.0, "reward": 1.9921875, "reward_std": 0.1848812848329544, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.057698383927345276, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 834.033203125, "completions/mean_terminated_length": 802.955810546875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.23692071349321497, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5012551869775229, "kl": 0.364990234375, "learning_rate": 1.8885871780534273e-05, "loss": 0.0806, "num_tokens": 351297369.0, "reward": 1.98583984375, "reward_std": 0.19380617141723633, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.09623336046934128, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 676.087890625, "completions/mean_terminated_length": 669.868408203125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.2372620978066058, "frac_reward_zero_std": 0.28125, "grad_norm": 0.3232214235901771, "kl": 0.2314453125, "learning_rate": 1.8880400656057805e-05, "loss": 0.0544, "num_tokens": 351722758.0, "reward": 2.00244140625, "reward_std": 0.18124976754188538, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.98681640625, "rewards/tag_count_reward/std": 0.06214544549584389, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 607.09765625, "completions/mean_terminated_length": 598.6051025390625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.23760348211999657, "frac_reward_zero_std": 0.28125, "grad_norm": 0.2415837596256046, "kl": 0.22705078125, "learning_rate": 1.8874916927476857e-05, "loss": 0.0704, "num_tokens": 352118664.0, "reward": 2.02783203125, "reward_std": 0.20636031031608582, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.98486328125, "rewards/tag_count_reward/std": 0.08358632773160934, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 542.462890625, "completions/mean_terminated_length": 539.5166015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2379448664333874, "frac_reward_zero_std": 0.5, "grad_norm": 0.20267637578964093, "kl": 0.22802734375, "learning_rate": 1.8869420602574574e-05, "loss": 0.0292, "num_tokens": 352478181.0, "reward": 2.04150390625, "reward_std": 0.14401453733444214, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99072265625, "rewards/tag_count_reward/std": 0.0588279627263546, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 545.5078125, "completions/mean_terminated_length": 545.5078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.23828625074677817, "frac_reward_zero_std": 0.53125, "grad_norm": 0.19128301334876777, "kl": 0.22998046875, "learning_rate": 1.8863911689151987e-05, "loss": 0.0212, "num_tokens": 352837481.0, "reward": 2.01025390625, "reward_std": 0.14744237065315247, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.05480858311057091, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 554.03515625, "completions/mean_terminated_length": 554.03515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23862763506016899, "frac_reward_zero_std": 0.5, "grad_norm": 0.19747968690212558, "kl": 0.223388671875, "learning_rate": 1.8858390195027986e-05, "loss": 0.015, "num_tokens": 353198939.0, "reward": 2.08251953125, "reward_std": 0.16467493772506714, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 522.548828125, "completions/mean_terminated_length": 522.548828125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.23896901937355977, "frac_reward_zero_std": 0.59375, "grad_norm": 0.19469135718202066, "kl": 0.220703125, "learning_rate": 1.8852856128039327e-05, "loss": 0.021, "num_tokens": 353542756.0, "reward": 2.076171875, "reward_std": 0.16269893944263458, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 573.39453125, "completions/mean_terminated_length": 573.39453125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.23931040368695058, "frac_reward_zero_std": 0.5625, "grad_norm": 0.21366183214623485, "kl": 0.197509765625, "learning_rate": 1.88473094960406e-05, "loss": 0.0122, "num_tokens": 353920990.0, "reward": 2.04296875, "reward_std": 0.16229617595672607, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 578.724609375, "completions/mean_terminated_length": 577.75146484375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2396517880003414, "frac_reward_zero_std": 0.59375, "grad_norm": 1.386081961532928, "kl": 0.414794921875, "learning_rate": 1.884175030690424e-05, "loss": 0.02, "num_tokens": 354305841.0, "reward": 2.0673828125, "reward_std": 0.13903596997261047, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 660.67578125, "completions/mean_terminated_length": 659.886474609375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.23999317231373218, "frac_reward_zero_std": 0.46875, "grad_norm": 5.529521649918981, "kl": 1.26953125, "learning_rate": 1.88361785685205e-05, "loss": 0.0608, "num_tokens": 354718475.0, "reward": 2.0126953125, "reward_std": 0.17463375627994537, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 474.82421875, "completions/mean_terminated_length": 474.1683044433594, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.240334556627123, "frac_reward_zero_std": 0.09375, "grad_norm": 1.2242242074260918, "kl": 0.470703125, "learning_rate": 1.8830594288797438e-05, "loss": 0.0817, "num_tokens": 355042897.0, "reward": 1.9697265625, "reward_std": 0.3665688633918762, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.892578125, "rewards/format_reward/std": 0.30995169281959534, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05141965299844742, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 579.00390625, "completions/mean_terminated_length": 575.84619140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.24067594094051378, "frac_reward_zero_std": 0.375, "grad_norm": 2.302909508453275, "kl": 0.41845703125, "learning_rate": 1.8824997475660925e-05, "loss": 0.0383, "num_tokens": 355421875.0, "reward": 2.06591796875, "reward_std": 0.2455468773841858, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.052656810730695724, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 814.791015625, "completions/mean_terminated_length": 808.4990234375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2410173252539046, "frac_reward_zero_std": 0.375, "grad_norm": 0.7754755818290427, "kl": 0.3037109375, "learning_rate": 1.8819388137054605e-05, "loss": 0.0503, "num_tokens": 355925560.0, "reward": 2.064453125, "reward_std": 0.23884792625904083, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.947265625, "rewards/format_reward/std": 0.22372129559516907, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04910992085933685, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 922.18359375, "completions/mean_terminated_length": 920.9080200195312, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.24135870956729538, "frac_reward_zero_std": 0.25, "grad_norm": 4.899116854167915, "kl": 0.915771484375, "learning_rate": 1.8813766280939917e-05, "loss": 0.0641, "num_tokens": 356485686.0, "reward": 1.8974609375, "reward_std": 0.3288388252258301, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.8203125, "rewards/format_reward/std": 0.38430243730545044, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 928.357421875, "completions/mean_terminated_length": 924.5560302734375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.2417000938806862, "frac_reward_zero_std": 0.53125, "grad_norm": 0.5629613524098477, "kl": 0.230224609375, "learning_rate": 1.8808131915296046e-05, "loss": 0.0362, "num_tokens": 357047821.0, "reward": 2.01708984375, "reward_std": 0.16341650485992432, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 963.044921875, "completions/mean_terminated_length": 958.91357421875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.24204147819407698, "frac_reward_zero_std": 0.625, "grad_norm": 0.29731474889409815, "kl": 0.341064453125, "learning_rate": 1.8802485048119956e-05, "loss": 0.0338, "num_tokens": 357615764.0, "reward": 2.02783203125, "reward_std": 0.1312284618616104, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 928.611328125, "completions/mean_terminated_length": 922.7086791992188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.2423828625074678, "frac_reward_zero_std": 0.5625, "grad_norm": 4.00195281806466, "kl": 1.55029296875, "learning_rate": 1.8796825687426334e-05, "loss": 0.0727, "num_tokens": 358168029.0, "reward": 2.05810546875, "reward_std": 0.15651792287826538, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 877.857421875, "completions/mean_terminated_length": 874.0098266601562, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.24272424682085858, "frac_reward_zero_std": 0.5, "grad_norm": 1.528178655131469, "kl": 0.913818359375, "learning_rate": 1.8791153841247613e-05, "loss": 0.0611, "num_tokens": 358699684.0, "reward": 2.08984375, "reward_std": 0.18509919941425323, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 769.048828125, "completions/mean_terminated_length": 766.5459594726562, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.2430656311342494, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14730965054568115, "kl": 0.201416015625, "learning_rate": 1.878546951763394e-05, "loss": 0.0273, "num_tokens": 359174621.0, "reward": 2.07080078125, "reward_std": 0.142983078956604, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 710.740234375, "completions/mean_terminated_length": 710.740234375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.24340701544764018, "frac_reward_zero_std": 0.78125, "grad_norm": 0.12152994236685485, "kl": 0.232177734375, "learning_rate": 1.8779772724653166e-05, "loss": 0.0091, "num_tokens": 359618616.0, "reward": 2.0126953125, "reward_std": 0.07051706314086914, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 641.982421875, "completions/mean_terminated_length": 640.7059326171875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.243748399761031, "frac_reward_zero_std": 0.625, "grad_norm": 1.0592912867004693, "kl": 0.244140625, "learning_rate": 1.8774063470390856e-05, "loss": 0.028, "num_tokens": 360029999.0, "reward": 2.03076171875, "reward_std": 0.11806728690862656, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 636.0859375, "completions/mean_terminated_length": 635.6046752929688, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.24408978407442178, "frac_reward_zero_std": 0.71875, "grad_norm": 0.3863060841761403, "kl": 0.330078125, "learning_rate": 1.8768341762950246e-05, "loss": 0.0228, "num_tokens": 360431755.0, "reward": 2.06494140625, "reward_std": 0.11203382909297943, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 612.021484375, "completions/mean_terminated_length": 605.542236328125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.2444311683878126, "frac_reward_zero_std": 0.5625, "grad_norm": 0.41037440076491427, "kl": 0.542236328125, "learning_rate": 1.8762607610452255e-05, "loss": 0.0586, "num_tokens": 360828806.0, "reward": 2.044921875, "reward_std": 0.1582881212234497, "rewards/accuracy_reward/mean": 0.06451612710952759, "rewards/accuracy_reward/std": 0.2459181249141693, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 591.1484375, "completions/mean_terminated_length": 590.630126953125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.24477255270120338, "frac_reward_zero_std": 0.625, "grad_norm": 4.96970918748099, "kl": 1.171875, "learning_rate": 1.8756861021035463e-05, "loss": 0.067, "num_tokens": 361210946.0, "reward": 2.05078125, "reward_std": 0.1436268389225006, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 599.83984375, "completions/mean_terminated_length": 597.005859375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.2451139370145942, "frac_reward_zero_std": 0.5, "grad_norm": 0.20399247558998893, "kl": 0.20263671875, "learning_rate": 1.8751102002856107e-05, "loss": 0.0411, "num_tokens": 361598464.0, "reward": 2.0556640625, "reward_std": 0.18719644844532013, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 651.806640625, "completions/mean_terminated_length": 651.806640625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.24545532132798498, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17399403031460803, "kl": 0.18896484375, "learning_rate": 1.8745330564088053e-05, "loss": 0.0121, "num_tokens": 362014509.0, "reward": 2.04296875, "reward_std": 0.11213831603527069, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 678.107421875, "completions/mean_terminated_length": 675.7156982421875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2457967056413758, "frac_reward_zero_std": 0.5625, "grad_norm": 3.027891147300104, "kl": 0.746826171875, "learning_rate": 1.8739546712922805e-05, "loss": 0.0434, "num_tokens": 362437284.0, "reward": 2.04638671875, "reward_std": 0.16124294698238373, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 703.4453125, "completions/mean_terminated_length": 702.8258056640625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.24613808995476658, "frac_reward_zero_std": 0.5, "grad_norm": 0.40715633425735853, "kl": 0.2841796875, "learning_rate": 1.8733750457569485e-05, "loss": 0.0269, "num_tokens": 362873752.0, "reward": 2.06494140625, "reward_std": 0.18680770695209503, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 691.232421875, "completions/mean_terminated_length": 690.6927490234375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2464794742681574, "frac_reward_zero_std": 0.71875, "grad_norm": 0.4307057327736994, "kl": 0.25244140625, "learning_rate": 1.8727941806254816e-05, "loss": 0.0165, "num_tokens": 363300287.0, "reward": 2.0048828125, "reward_std": 0.088307224214077, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 676.33984375, "completions/mean_terminated_length": 676.33984375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.24682085858154817, "frac_reward_zero_std": 0.71875, "grad_norm": 0.15257885699327675, "kl": 0.187744140625, "learning_rate": 1.8722120767223114e-05, "loss": 0.0123, "num_tokens": 363725949.0, "reward": 2.01953125, "reward_std": 0.08510598540306091, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 648.466796875, "completions/mean_terminated_length": 648.466796875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.247162242894939, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2046974178681094, "kl": 0.195556640625, "learning_rate": 1.8716287348736282e-05, "loss": 0.0019, "num_tokens": 364145020.0, "reward": 2.0546875, "reward_std": 0.09561453759670258, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 615.3359375, "completions/mean_terminated_length": 615.3359375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.24750362720832977, "frac_reward_zero_std": 0.71875, "grad_norm": 0.14579642374429877, "kl": 0.212646484375, "learning_rate": 1.871044155907379e-05, "loss": 0.0113, "num_tokens": 364541720.0, "reward": 2.0087890625, "reward_std": 0.08419187366962433, "rewards/accuracy_reward/mean": 0.018145160749554634, "rewards/accuracy_reward/std": 0.1336110383272171, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 623.505859375, "completions/mean_terminated_length": 622.5255126953125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.2478450115217206, "frac_reward_zero_std": 0.5, "grad_norm": 3.7832540403170913, "kl": 0.276611328125, "learning_rate": 1.8704583406532664e-05, "loss": 0.0283, "num_tokens": 364948891.0, "reward": 2.00244140625, "reward_std": 0.16799931228160858, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 575.7265625, "completions/mean_terminated_length": 574.3831176757812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.24818639583511137, "frac_reward_zero_std": 0.59375, "grad_norm": 1.347660868911599, "kl": 0.2421875, "learning_rate": 1.8698712899427482e-05, "loss": 0.0159, "num_tokens": 365344159.0, "reward": 2.02001953125, "reward_std": 0.146458238363266, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 588.125, "completions/mean_terminated_length": 587.612548828125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.2485277801485022, "frac_reward_zero_std": 0.625, "grad_norm": 0.45467957975565093, "kl": 0.228515625, "learning_rate": 1.8692830046090356e-05, "loss": 0.0221, "num_tokens": 365728079.0, "reward": 2.02294921875, "reward_std": 0.1189655289053917, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 581.216796875, "completions/mean_terminated_length": 580.6019897460938, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.24886916446189297, "frac_reward_zero_std": 0.59375, "grad_norm": 7.473753168298465, "kl": 1.506591796875, "learning_rate": 1.8686934854870925e-05, "loss": 0.0756, "num_tokens": 366106814.0, "reward": 1.9990234375, "reward_std": 0.1443006843328476, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 604.52734375, "completions/mean_terminated_length": 603.5733642578125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.24921054877528379, "frac_reward_zero_std": 0.40625, "grad_norm": 0.6855131020332847, "kl": 0.25732421875, "learning_rate": 1.8681027334136324e-05, "loss": 0.0261, "num_tokens": 366493948.0, "reward": 2.00439453125, "reward_std": 0.2151053249835968, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23509246110916138, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 682.09375, "completions/mean_terminated_length": 682.09375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.24955193308867457, "frac_reward_zero_std": 0.71875, "grad_norm": 0.17883139637027953, "kl": 0.22119140625, "learning_rate": 1.8675107492271208e-05, "loss": 0.0166, "num_tokens": 366927548.0, "reward": 2.01904296875, "reward_std": 0.08904820680618286, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 656.953125, "completions/mean_terminated_length": 656.26220703125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.24989331740206538, "frac_reward_zero_std": 0.53125, "grad_norm": 0.6527343473479479, "kl": 0.45068359375, "learning_rate": 1.86691753376777e-05, "loss": 0.0354, "num_tokens": 367341636.0, "reward": 2.05517578125, "reward_std": 0.1807306706905365, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 838.509765625, "completions/mean_terminated_length": 837.6888427734375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2502347017154562, "frac_reward_zero_std": 0.65625, "grad_norm": 0.5913976243898903, "kl": 0.40380859375, "learning_rate": 1.866323087877542e-05, "loss": 0.0244, "num_tokens": 367861929.0, "reward": 2.03564453125, "reward_std": 0.12295182794332504, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 936.509765625, "completions/mean_terminated_length": 851.8104858398438, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.25057608602884696, "frac_reward_zero_std": 0.0, "grad_norm": 9.376993104650992, "kl": 1.45947265625, "learning_rate": 1.865727412400143e-05, "loss": 0.2283, "num_tokens": 368429022.0, "reward": 1.8095703125, "reward_std": 0.4299490451812744, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2862374484539032, "rewards/tag_count_reward/mean": 0.8701171875, "rewards/tag_count_reward/std": 0.2432565838098526, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 800.91796875, "completions/mean_terminated_length": 800.91796875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.25091747034223777, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12312533751706578, "kl": 0.184814453125, "learning_rate": 1.865130508181026e-05, "loss": 0.0137, "num_tokens": 368922836.0, "reward": 2.04931640625, "reward_std": 0.13967756927013397, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 810.939453125, "completions/mean_terminated_length": 808.8745727539062, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2512588546556286, "frac_reward_zero_std": 0.65625, "grad_norm": 0.6801386220788931, "kl": 0.27880859375, "learning_rate": 1.864532376067387e-05, "loss": 0.0225, "num_tokens": 369421797.0, "reward": 2.0380859375, "reward_std": 0.11903989315032959, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 696.37109375, "completions/mean_terminated_length": 694.7059326171875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2516002389690194, "frac_reward_zero_std": 0.625, "grad_norm": 0.2416114046798476, "kl": 0.259521484375, "learning_rate": 1.8639330169081656e-05, "loss": 0.0232, "num_tokens": 369854019.0, "reward": 2.08154296875, "reward_std": 0.14331835508346558, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 679.125, "completions/mean_terminated_length": 679.125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.25194162328241015, "frac_reward_zero_std": 0.625, "grad_norm": 0.1550827582441562, "kl": 0.1962890625, "learning_rate": 1.863332431554042e-05, "loss": 0.0163, "num_tokens": 370280291.0, "reward": 2.080078125, "reward_std": 0.13865847885608673, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 627.578125, "completions/mean_terminated_length": 627.578125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.25228300759580097, "frac_reward_zero_std": 0.65625, "grad_norm": 0.16521930064030885, "kl": 0.20263671875, "learning_rate": 1.8627306208574372e-05, "loss": 0.011, "num_tokens": 370682283.0, "reward": 2.0634765625, "reward_std": 0.10810750722885132, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 579.525390625, "completions/mean_terminated_length": 578.9432373046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2526243919091918, "frac_reward_zero_std": 0.5, "grad_norm": 12.012118995812179, "kl": 1.742919921875, "learning_rate": 1.862127585672512e-05, "loss": 0.0722, "num_tokens": 371062088.0, "reward": 2.04150390625, "reward_std": 0.14893868565559387, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 541.7734375, "completions/mean_terminated_length": 541.7734375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2529657762225826, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1441272340302245, "kl": 0.21826171875, "learning_rate": 1.8615233268551645e-05, "loss": 0.0082, "num_tokens": 371420484.0, "reward": 2.08447265625, "reward_std": 0.12188272178173065, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 556.041015625, "completions/mean_terminated_length": 556.041015625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.25330716053597335, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6352967043344359, "kl": 0.225830078125, "learning_rate": 1.8609178452630288e-05, "loss": 0.02, "num_tokens": 371795689.0, "reward": 2.0302734375, "reward_std": 0.13423766195774078, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 560.3671875, "completions/mean_terminated_length": 559.7905883789062, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.25364854484936417, "frac_reward_zero_std": 0.75, "grad_norm": 0.6689540161918642, "kl": 0.551513671875, "learning_rate": 1.8603111417554763e-05, "loss": 0.0266, "num_tokens": 372166869.0, "reward": 2.03955078125, "reward_std": 0.09636025875806808, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 572.60546875, "completions/mean_terminated_length": 571.8043212890625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.253989929162755, "frac_reward_zero_std": 0.71875, "grad_norm": 0.46062734649060183, "kl": 0.26123046875, "learning_rate": 1.8597032171936103e-05, "loss": 0.0073, "num_tokens": 372538667.0, "reward": 2.05078125, "reward_std": 0.10989578068256378, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 576.138671875, "completions/mean_terminated_length": 576.138671875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2543313134761458, "frac_reward_zero_std": 0.65625, "grad_norm": 0.26330289222148356, "kl": 0.210205078125, "learning_rate": 1.8590940724402703e-05, "loss": 0.009, "num_tokens": 372914482.0, "reward": 2.09716796875, "reward_std": 0.13630448281764984, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 583.45703125, "completions/mean_terminated_length": 583.45703125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.25467269778953655, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16390933925984055, "kl": 0.2119140625, "learning_rate": 1.8584837083600244e-05, "loss": 0.0138, "num_tokens": 373296284.0, "reward": 2.0791015625, "reward_std": 0.16289277374744415, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 625.23046875, "completions/mean_terminated_length": 625.23046875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.25501408210292736, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19531205239149713, "kl": 0.208740234375, "learning_rate": 1.8578721258191736e-05, "loss": 0.0165, "num_tokens": 373703762.0, "reward": 2.05029296875, "reward_std": 0.15863493084907532, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 758.078125, "completions/mean_terminated_length": 758.078125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.2553554664163182, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10601486047882147, "kl": 0.189697265625, "learning_rate": 1.857259325685747e-05, "loss": 0.0157, "num_tokens": 374171306.0, "reward": 2.05224609375, "reward_std": 0.11080821603536606, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 797.501953125, "completions/mean_terminated_length": 797.501953125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.255696850729709, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14340281866806404, "kl": 0.192138671875, "learning_rate": 1.8566453088295023e-05, "loss": 0.0203, "num_tokens": 374671067.0, "reward": 2.01416015625, "reward_std": 0.1685619056224823, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 799.763671875, "completions/mean_terminated_length": 798.1412353515625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.25603823504309975, "frac_reward_zero_std": 0.40625, "grad_norm": 0.7813702715248106, "kl": 0.212646484375, "learning_rate": 1.8560300761219246e-05, "loss": 0.0229, "num_tokens": 375161490.0, "reward": 2.0517578125, "reward_std": 0.23468706011772156, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 851.33203125, "completions/mean_terminated_length": 851.33203125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.25637961935649056, "frac_reward_zero_std": 0.34375, "grad_norm": 0.14548098494792291, "kl": 0.1796875, "learning_rate": 1.8554136284362236e-05, "loss": 0.0258, "num_tokens": 375682412.0, "reward": 2.08447265625, "reward_std": 0.24018244445323944, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 920.787109375, "completions/mean_terminated_length": 919.4412231445312, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2567210036698814, "frac_reward_zero_std": 0.4375, "grad_norm": 0.18798318348911616, "kl": 0.3515625, "learning_rate": 1.8547959666473347e-05, "loss": 0.0189, "num_tokens": 376239535.0, "reward": 2.0302734375, "reward_std": 0.21329021453857422, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.1939331740140915, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 870.8828125, "completions/mean_terminated_length": 870.1702270507812, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.2570623879832722, "frac_reward_zero_std": 0.46875, "grad_norm": 0.6464231346097012, "kl": 0.330078125, "learning_rate": 1.8541770916319152e-05, "loss": 0.0231, "num_tokens": 376768179.0, "reward": 2.048828125, "reward_std": 0.19084051251411438, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 816.009765625, "completions/mean_terminated_length": 816.009765625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.25740377229666295, "frac_reward_zero_std": 0.3125, "grad_norm": 0.15473962058177768, "kl": 0.173095703125, "learning_rate": 1.853557004268345e-05, "loss": 0.008, "num_tokens": 377271944.0, "reward": 2.02978515625, "reward_std": 0.2256394624710083, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 793.01171875, "completions/mean_terminated_length": 792.0391235351562, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.25774515661005376, "frac_reward_zero_std": 0.34375, "grad_norm": 0.17580832278859576, "kl": 0.3271484375, "learning_rate": 1.8529357054367252e-05, "loss": 0.0244, "num_tokens": 377757022.0, "reward": 2.060546875, "reward_std": 0.25521421432495117, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 763.109375, "completions/mean_terminated_length": 763.109375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.2580865409234446, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12206212618349961, "kl": 0.18505859375, "learning_rate": 1.8523131960188757e-05, "loss": 0.014, "num_tokens": 378229414.0, "reward": 2.05615234375, "reward_std": 0.1324760615825653, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 772.30078125, "completions/mean_terminated_length": 770.7117919921875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2584279252368354, "frac_reward_zero_std": 0.46875, "grad_norm": 0.6559061690627144, "kl": 0.271240234375, "learning_rate": 1.8516894768983346e-05, "loss": 0.0285, "num_tokens": 378709072.0, "reward": 2.044921875, "reward_std": 0.19025182723999023, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 772.759765625, "completions/mean_terminated_length": 772.759765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.25876930955022615, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1237717232088125, "kl": 0.1611328125, "learning_rate": 1.8510645489603575e-05, "loss": 0.018, "num_tokens": 379189973.0, "reward": 2.09765625, "reward_std": 0.20579296350479126, "rewards/accuracy_reward/mean": 0.11895161122083664, "rewards/accuracy_reward/std": 0.3240584135055542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 669.138671875, "completions/mean_terminated_length": 669.138671875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.25911069386361696, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10808203386711653, "kl": 0.17626953125, "learning_rate": 1.8504384130919145e-05, "loss": 0.0128, "num_tokens": 379610988.0, "reward": 2.04248046875, "reward_std": 0.10772188752889633, "rewards/accuracy_reward/mean": 0.052419353276491165, "rewards/accuracy_reward/std": 0.22309619188308716, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 650.251953125, "completions/mean_terminated_length": 650.251953125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.25945207817700777, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14470601255028265, "kl": 0.188232421875, "learning_rate": 1.8498110701816916e-05, "loss": 0.0087, "num_tokens": 380029949.0, "reward": 2.07373046875, "reward_std": 0.1786130964756012, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 679.9609375, "completions/mean_terminated_length": 679.136962890625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.2597934624903986, "frac_reward_zero_std": 0.6875, "grad_norm": 1.7490038406360877, "kl": 0.528564453125, "learning_rate": 1.8491825211200872e-05, "loss": 0.0231, "num_tokens": 380459817.0, "reward": 2.080078125, "reward_std": 0.11415327340364456, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 657.392578125, "completions/mean_terminated_length": 657.392578125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.26013484680378934, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11639610835252594, "kl": 0.18310546875, "learning_rate": 1.8485527667992115e-05, "loss": 0.0112, "num_tokens": 380875666.0, "reward": 2.05078125, "reward_std": 0.13365308940410614, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 625.142578125, "completions/mean_terminated_length": 625.142578125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.26047623111718016, "frac_reward_zero_std": 0.625, "grad_norm": 0.13364002177266932, "kl": 0.194091796875, "learning_rate": 1.847921808112886e-05, "loss": 0.0085, "num_tokens": 381281835.0, "reward": 2.072265625, "reward_std": 0.14120477437973022, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 627.572265625, "completions/mean_terminated_length": 626.618408203125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.26081761543057097, "frac_reward_zero_std": 0.71875, "grad_norm": 0.629955844687579, "kl": 0.411865234375, "learning_rate": 1.847289645956641e-05, "loss": 0.0215, "num_tokens": 381680144.0, "reward": 2.0078125, "reward_std": 0.09210042655467987, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 701.23828125, "completions/mean_terminated_length": 699.8414916992188, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2611589997439618, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3185685051138373, "kl": 0.29296875, "learning_rate": 1.8466562812277146e-05, "loss": 0.018, "num_tokens": 382116410.0, "reward": 2.00634765625, "reward_std": 0.13277921080589294, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 687.39453125, "completions/mean_terminated_length": 685.9686889648438, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.26150038405735254, "frac_reward_zero_std": 0.5, "grad_norm": 12.495950996142879, "kl": 0.45458984375, "learning_rate": 1.8460217148250525e-05, "loss": 0.019, "num_tokens": 382560084.0, "reward": 2.07373046875, "reward_std": 0.19261382520198822, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 790.216796875, "completions/mean_terminated_length": 790.216796875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.26184176837074336, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10470566315731711, "kl": 0.1796875, "learning_rate": 1.845385947649306e-05, "loss": 0.0066, "num_tokens": 383047395.0, "reward": 2.05322265625, "reward_std": 0.11222708970308304, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 781.5546875, "completions/mean_terminated_length": 781.5546875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.26218315268413417, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1035932764208866, "kl": 0.177490234375, "learning_rate": 1.8447489806028298e-05, "loss": 0.0107, "num_tokens": 383528687.0, "reward": 2.04638671875, "reward_std": 0.13478940725326538, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 757.4609375, "completions/mean_terminated_length": 756.5303344726562, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.262524536997525, "frac_reward_zero_std": 0.71875, "grad_norm": 0.5498276163488196, "kl": 0.247314453125, "learning_rate": 1.8441108145896825e-05, "loss": 0.0104, "num_tokens": 383991979.0, "reward": 2.0126953125, "reward_std": 0.08818808197975159, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 849.4609375, "completions/mean_terminated_length": 849.4609375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.26286592131091574, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11129602862606125, "kl": 0.177001953125, "learning_rate": 1.8434714505156236e-05, "loss": 0.0109, "num_tokens": 384514135.0, "reward": 2.037109375, "reward_std": 0.11093294620513916, "rewards/accuracy_reward/mean": 0.04435483738780022, "rewards/accuracy_reward/std": 0.2060900777578354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 793.552734375, "completions/mean_terminated_length": 793.552734375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.26320730562430655, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11247406508399456, "kl": 0.17333984375, "learning_rate": 1.8428308892881143e-05, "loss": 0.0057, "num_tokens": 385011026.0, "reward": 2.13037109375, "reward_std": 0.15163201093673706, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 737.58984375, "completions/mean_terminated_length": 737.58984375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.26354868993769737, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08762040222114603, "kl": 0.191650390625, "learning_rate": 1.8421891318163128e-05, "loss": 0.0028, "num_tokens": 385467104.0, "reward": 2.02294921875, "reward_std": 0.07919417321681976, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 733.34765625, "completions/mean_terminated_length": 732.2510375976562, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2638900742510882, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9412309719028507, "kl": 0.1923828125, "learning_rate": 1.8415461790110778e-05, "loss": 0.0204, "num_tokens": 385922818.0, "reward": 2.01611328125, "reward_std": 0.09083002805709839, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 682.0, "completions/mean_terminated_length": 682.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.26423145856447894, "frac_reward_zero_std": 0.40625, "grad_norm": 0.21167791802435107, "kl": 0.184326171875, "learning_rate": 1.8409020317849622e-05, "loss": 0.0134, "num_tokens": 386355586.0, "reward": 2.11328125, "reward_std": 0.2233310341835022, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 694.28515625, "completions/mean_terminated_length": 694.28515625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.26457284287786975, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1161274935831768, "kl": 0.190185546875, "learning_rate": 1.840256691052216e-05, "loss": 0.0168, "num_tokens": 386788036.0, "reward": 2.09716796875, "reward_std": 0.13080579042434692, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 679.623046875, "completions/mean_terminated_length": 679.623046875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.26491422719126057, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12008158818626126, "kl": 0.18408203125, "learning_rate": 1.8396101577287815e-05, "loss": 0.011, "num_tokens": 387216323.0, "reward": 2.0302734375, "reward_std": 0.1153111457824707, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 704.48828125, "completions/mean_terminated_length": 704.48828125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2652556115046514, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10229505701276155, "kl": 0.183837890625, "learning_rate": 1.838962432732295e-05, "loss": 0.0128, "num_tokens": 387662141.0, "reward": 2.076171875, "reward_std": 0.12295899540185928, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 704.380859375, "completions/mean_terminated_length": 704.380859375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.26559699581804214, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14499392305079317, "kl": 0.181640625, "learning_rate": 1.8383135169820835e-05, "loss": 0.0205, "num_tokens": 388106320.0, "reward": 2.05810546875, "reward_std": 0.16483643651008606, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 750.404296875, "completions/mean_terminated_length": 750.404296875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.26593838013143295, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1400327087029648, "kl": 0.159912109375, "learning_rate": 1.8376634113991643e-05, "loss": 0.006, "num_tokens": 388574895.0, "reward": 2.15771484375, "reward_std": 0.21827611327171326, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 684.05078125, "completions/mean_terminated_length": 684.05078125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.26627976444482376, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2314542057773493, "kl": 0.174072265625, "learning_rate": 1.8370121169062436e-05, "loss": 0.0251, "num_tokens": 389003529.0, "reward": 2.0927734375, "reward_std": 0.1783510148525238, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 789.095703125, "completions/mean_terminated_length": 789.095703125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.2666211487582146, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12023436545226494, "kl": 0.1494140625, "learning_rate": 1.8363596344277145e-05, "loss": 0.0194, "num_tokens": 389486746.0, "reward": 2.08837890625, "reward_std": 0.16241201758384705, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 786.13671875, "completions/mean_terminated_length": 786.13671875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.26696253307160533, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11863646988019184, "kl": 0.145751953125, "learning_rate": 1.8357059648896564e-05, "loss": 0.0272, "num_tokens": 389969840.0, "reward": 2.041015625, "reward_std": 0.15181958675384521, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486270010471344, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 845.890625, "completions/mean_terminated_length": 844.391357421875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.26730391738499615, "frac_reward_zero_std": 0.71875, "grad_norm": 4.343193784969712, "kl": 0.65283203125, "learning_rate": 1.8350511092198344e-05, "loss": 0.0292, "num_tokens": 390482872.0, "reward": 2.00927734375, "reward_std": 0.10102302581071854, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 814.0859375, "completions/mean_terminated_length": 814.0859375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.26764530169838696, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11393435706335321, "kl": 0.146728515625, "learning_rate": 1.834395068347695e-05, "loss": 0.0226, "num_tokens": 390979700.0, "reward": 2.009765625, "reward_std": 0.16558581590652466, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.962890625, "rewards/format_reward/std": 0.18921469151973724, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 808.41796875, "completions/mean_terminated_length": 808.41796875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2679866860117778, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09826857936057935, "kl": 0.149658203125, "learning_rate": 1.8337378432043695e-05, "loss": 0.0113, "num_tokens": 391472362.0, "reward": 2.06494140625, "reward_std": 0.08695326000452042, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 823.6171875, "completions/mean_terminated_length": 823.6171875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.26832807032516853, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11157704279870347, "kl": 0.14453125, "learning_rate": 1.833079434722668e-05, "loss": 0.0137, "num_tokens": 391972966.0, "reward": 2.06689453125, "reward_std": 0.1569683849811554, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 763.5390625, "completions/mean_terminated_length": 763.5390625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.26866945463855935, "frac_reward_zero_std": 0.625, "grad_norm": 0.10996620745231872, "kl": 0.151611328125, "learning_rate": 1.8324198438370814e-05, "loss": 0.0002, "num_tokens": 392443930.0, "reward": 2.08203125, "reward_std": 0.14443901181221008, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 802.28125, "completions/mean_terminated_length": 785.013916015625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.26901083895195016, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3008633999941177, "kl": 0.154296875, "learning_rate": 1.8317590714837784e-05, "loss": 0.0173, "num_tokens": 392935834.0, "reward": 2.068359375, "reward_std": 0.16907358169555664, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 736.58984375, "completions/mean_terminated_length": 736.58984375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.269352223265341, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14250438021340822, "kl": 0.158935546875, "learning_rate": 1.831097118600604e-05, "loss": 0.0166, "num_tokens": 393394888.0, "reward": 2.08154296875, "reward_std": 0.20519259572029114, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 746.865234375, "completions/mean_terminated_length": 745.6849365234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.26969360757873173, "frac_reward_zero_std": 0.46875, "grad_norm": 2.1836563738792476, "kl": 0.9794921875, "learning_rate": 1.8304339861270795e-05, "loss": 0.0591, "num_tokens": 393857491.0, "reward": 2.05224609375, "reward_std": 0.1944778859615326, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 678.462890625, "completions/mean_terminated_length": 678.462890625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.27003499189212254, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11325936140737657, "kl": 0.165771484375, "learning_rate": 1.8297696750044e-05, "loss": 0.0155, "num_tokens": 394275728.0, "reward": 2.046875, "reward_std": 0.10372312366962433, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 712.642578125, "completions/mean_terminated_length": 712.642578125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.27037637620551336, "frac_reward_zero_std": 0.625, "grad_norm": 0.12345280537198731, "kl": 0.158447265625, "learning_rate": 1.829104186175434e-05, "loss": 0.0078, "num_tokens": 394722489.0, "reward": 1.9921875, "reward_std": 0.11857917159795761, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 672.787109375, "completions/mean_terminated_length": 672.1487426757812, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.27071776051890417, "frac_reward_zero_std": 0.5, "grad_norm": 0.28666171140817437, "kl": 0.33251953125, "learning_rate": 1.828437520584721e-05, "loss": 0.0271, "num_tokens": 395152316.0, "reward": 2.0556640625, "reward_std": 0.16792310774326324, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 664.666015625, "completions/mean_terminated_length": 664.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.27105914483229493, "frac_reward_zero_std": 0.53125, "grad_norm": 0.526230994086242, "kl": 0.169921875, "learning_rate": 1.8277696791784703e-05, "loss": 0.0201, "num_tokens": 395596209.0, "reward": 2.060546875, "reward_std": 0.148185133934021, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 680.638671875, "completions/mean_terminated_length": 680.638671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.27140052914568574, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10604410480230819, "kl": 0.160888671875, "learning_rate": 1.8271006629045618e-05, "loss": 0.0085, "num_tokens": 396018488.0, "reward": 2.04345703125, "reward_std": 0.11875543743371964, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 706.01953125, "completions/mean_terminated_length": 706.01953125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.27174191345907656, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11236794403418786, "kl": 0.160400390625, "learning_rate": 1.826430472712541e-05, "loss": 0.0114, "num_tokens": 396462242.0, "reward": 2.0302734375, "reward_std": 0.10690569877624512, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 656.169921875, "completions/mean_terminated_length": 656.169921875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.27208329777246737, "frac_reward_zero_std": 0.5, "grad_norm": 0.1340799687637902, "kl": 0.153564453125, "learning_rate": 1.82575910955362e-05, "loss": 0.0178, "num_tokens": 396882425.0, "reward": 2.1552734375, "reward_std": 0.21476086974143982, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 727.322265625, "completions/mean_terminated_length": 727.322265625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.2724246820858581, "frac_reward_zero_std": 0.625, "grad_norm": 0.11616109438930641, "kl": 0.145751953125, "learning_rate": 1.8250865743806766e-05, "loss": 0.0155, "num_tokens": 397343390.0, "reward": 2.091796875, "reward_std": 0.12599126994609833, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 748.94921875, "completions/mean_terminated_length": 747.874755859375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.27276606639924894, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5100855601403236, "kl": 0.18603515625, "learning_rate": 1.8244128681482513e-05, "loss": 0.0077, "num_tokens": 397803268.0, "reward": 2.08544921875, "reward_std": 0.1205199733376503, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 762.095703125, "completions/mean_terminated_length": 762.095703125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.27310745071263975, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10334264094490109, "kl": 0.135986328125, "learning_rate": 1.8237379918125468e-05, "loss": 0.01, "num_tokens": 398274981.0, "reward": 2.06591796875, "reward_std": 0.10109330713748932, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 802.373046875, "completions/mean_terminated_length": 802.373046875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.27344883502603057, "frac_reward_zero_std": 0.625, "grad_norm": 0.10817640236719489, "kl": 0.140380859375, "learning_rate": 1.823061946331427e-05, "loss": -0.0002, "num_tokens": 398768452.0, "reward": 2.07861328125, "reward_std": 0.1094517633318901, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 804.87109375, "completions/mean_terminated_length": 802.4383544921875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2737902193394214, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08619796525102705, "kl": 0.1337890625, "learning_rate": 1.822384732664414e-05, "loss": 0.0048, "num_tokens": 399262274.0, "reward": 2.0634765625, "reward_std": 0.10609422624111176, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 758.041015625, "completions/mean_terminated_length": 758.041015625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.27413160365281214, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09234022363489718, "kl": 0.1396484375, "learning_rate": 1.821706351772689e-05, "loss": 0.0121, "num_tokens": 399730519.0, "reward": 2.10107421875, "reward_std": 0.12858039140701294, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 739.275390625, "completions/mean_terminated_length": 739.275390625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.27447298796620295, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12256403714731438, "kl": 0.148681640625, "learning_rate": 1.8210268046190905e-05, "loss": 0.0175, "num_tokens": 400186740.0, "reward": 2.083984375, "reward_std": 0.1559392511844635, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 771.359375, "completions/mean_terminated_length": 767.2745361328125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.27481437227959377, "frac_reward_zero_std": 0.625, "grad_norm": 0.4234930487088023, "kl": 0.156494140625, "learning_rate": 1.8203460921681104e-05, "loss": 0.018, "num_tokens": 400659772.0, "reward": 2.0830078125, "reward_std": 0.1593989133834839, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 731.833984375, "completions/mean_terminated_length": 731.833984375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2751557565929846, "frac_reward_zero_std": 0.75, "grad_norm": 0.10491916166026827, "kl": 0.153076171875, "learning_rate": 1.819664215385896e-05, "loss": 0.0053, "num_tokens": 401123191.0, "reward": 2.01953125, "reward_std": 0.0921592265367508, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 686.142578125, "completions/mean_terminated_length": 686.142578125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.27549714090637534, "frac_reward_zero_std": 0.8125, "grad_norm": 0.08993348294107091, "kl": 0.165283203125, "learning_rate": 1.818981175240246e-05, "loss": 0.0083, "num_tokens": 401578288.0, "reward": 2.02197265625, "reward_std": 0.0704539567232132, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 696.314453125, "completions/mean_terminated_length": 695.7025146484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.27583852521976615, "frac_reward_zero_std": 0.625, "grad_norm": 0.4796604575225995, "kl": 0.358642578125, "learning_rate": 1.818296972700612e-05, "loss": 0.0255, "num_tokens": 402015649.0, "reward": 2.1044921875, "reward_std": 0.15991780161857605, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 699.32421875, "completions/mean_terminated_length": 699.32421875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.27617990953315696, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12908664943990938, "kl": 0.15283203125, "learning_rate": 1.8176116087380933e-05, "loss": 0.0184, "num_tokens": 402464711.0, "reward": 2.13134765625, "reward_std": 0.18136432766914368, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 664.51953125, "completions/mean_terminated_length": 664.51953125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2765212938465478, "frac_reward_zero_std": 0.75, "grad_norm": 0.10850592215651052, "kl": 0.161865234375, "learning_rate": 1.8169250843254397e-05, "loss": 0.0052, "num_tokens": 402883953.0, "reward": 2.08203125, "reward_std": 0.10144392400979996, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 677.97265625, "completions/mean_terminated_length": 677.97265625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.27686267815993854, "frac_reward_zero_std": 0.75, "grad_norm": 0.12324422236554995, "kl": 0.1650390625, "learning_rate": 1.8162374004370463e-05, "loss": 0.0095, "num_tokens": 403313747.0, "reward": 2.09912109375, "reward_std": 0.09873927384614944, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 656.205078125, "completions/mean_terminated_length": 656.205078125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.27720406247332935, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1753979374585878, "kl": 0.169677734375, "learning_rate": 1.815548558048955e-05, "loss": 0.0041, "num_tokens": 403735180.0, "reward": 2.0830078125, "reward_std": 0.15143023431301117, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 733.494140625, "completions/mean_terminated_length": 733.494140625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.27754544678672016, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1008297288901874, "kl": 0.16259765625, "learning_rate": 1.814858558138851e-05, "loss": 0.0073, "num_tokens": 404193913.0, "reward": 2.04736328125, "reward_std": 0.10621218383312225, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 740.466796875, "completions/mean_terminated_length": 739.0488891601562, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.277886831100111, "frac_reward_zero_std": 0.625, "grad_norm": 1.7453529829896646, "kl": 0.5810546875, "learning_rate": 1.8141674016860636e-05, "loss": 0.0255, "num_tokens": 404652248.0, "reward": 2.017578125, "reward_std": 0.12528301775455475, "rewards/accuracy_reward/mean": 0.03629032149910927, "rewards/accuracy_reward/std": 0.1872003823518753, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 734.18359375, "completions/mean_terminated_length": 734.18359375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.27822821541350173, "frac_reward_zero_std": 0.625, "grad_norm": 0.15770901270070342, "kl": 0.162353515625, "learning_rate": 1.813475089671563e-05, "loss": 0.0102, "num_tokens": 405116054.0, "reward": 2.0693359375, "reward_std": 0.13586071133613586, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 786.173828125, "completions/mean_terminated_length": 786.173828125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.27856959972689255, "frac_reward_zero_std": 0.75, "grad_norm": 0.09160596486403562, "kl": 0.156005859375, "learning_rate": 1.8127816230779588e-05, "loss": 0.0071, "num_tokens": 405607791.0, "reward": 2.08349609375, "reward_std": 0.08277321606874466, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 773.046875, "completions/mean_terminated_length": 773.046875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.27891098404028336, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10592538163714786, "kl": 0.15673828125, "learning_rate": 1.812087002889501e-05, "loss": 0.0029, "num_tokens": 406085223.0, "reward": 2.041015625, "reward_std": 0.11905106902122498, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 736.5625, "completions/mean_terminated_length": 736.5625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2792523683536742, "frac_reward_zero_std": 0.59375, "grad_norm": 7.568882759930803, "kl": 0.1689453125, "learning_rate": 1.811391230092075e-05, "loss": 0.0014, "num_tokens": 406541047.0, "reward": 2.09326171875, "reward_std": 0.1544884890317917, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 814.423828125, "completions/mean_terminated_length": 813.7025146484375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.27959375266706493, "frac_reward_zero_std": 0.5, "grad_norm": 0.15562308709984868, "kl": 0.149658203125, "learning_rate": 1.8106943056732036e-05, "loss": 0.0201, "num_tokens": 407044752.0, "reward": 2.06982421875, "reward_std": 0.18778389692306519, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 769.005859375, "completions/mean_terminated_length": 767.890380859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.27993513698045575, "frac_reward_zero_std": 0.46875, "grad_norm": 6.1538759735765005, "kl": 0.19677734375, "learning_rate": 1.8099962306220427e-05, "loss": 0.0224, "num_tokens": 407516611.0, "reward": 2.09912109375, "reward_std": 0.22144785523414612, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 765.951171875, "completions/mean_terminated_length": 765.951171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.28027652129384656, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10925709847567679, "kl": 0.146728515625, "learning_rate": 1.8092970059293837e-05, "loss": 0.0096, "num_tokens": 407988954.0, "reward": 2.1064453125, "reward_std": 0.1491161286830902, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 752.03125, "completions/mean_terminated_length": 749.4951171875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.2806179056072374, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1214536960658244, "kl": 0.16455078125, "learning_rate": 1.8085966325876465e-05, "loss": 0.0149, "num_tokens": 408461066.0, "reward": 2.10400390625, "reward_std": 0.11695725470781326, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 725.796875, "completions/mean_terminated_length": 725.796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.28095928992062813, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14289585907195204, "kl": 0.16357421875, "learning_rate": 1.807895111590884e-05, "loss": 0.0152, "num_tokens": 408917746.0, "reward": 2.10107421875, "reward_std": 0.13706931471824646, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 731.294921875, "completions/mean_terminated_length": 731.294921875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.28130067423401894, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12828320523651698, "kl": 0.1611328125, "learning_rate": 1.8071924439347765e-05, "loss": 0.0114, "num_tokens": 409380473.0, "reward": 2.119140625, "reward_std": 0.1706247627735138, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 731.677734375, "completions/mean_terminated_length": 731.677734375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.28164205854740976, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1098158511006886, "kl": 0.165771484375, "learning_rate": 1.8064886306166326e-05, "loss": 0.0171, "num_tokens": 409837492.0, "reward": 2.0732421875, "reward_std": 0.1250484138727188, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 740.875, "completions/mean_terminated_length": 740.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.28198344286080057, "frac_reward_zero_std": 0.75, "grad_norm": 0.10072106186464463, "kl": 0.163330078125, "learning_rate": 1.805783672635386e-05, "loss": 0.0099, "num_tokens": 410297380.0, "reward": 2.0244140625, "reward_std": 0.08736822754144669, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 694.8984375, "completions/mean_terminated_length": 694.8984375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.28232482717419133, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12024122599012944, "kl": 0.15380859375, "learning_rate": 1.805077570991596e-05, "loss": 0.0082, "num_tokens": 410753616.0, "reward": 2.11572265625, "reward_std": 0.1779591143131256, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 749.501953125, "completions/mean_terminated_length": 749.501953125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.28266621148758214, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08710801162506951, "kl": 0.141845703125, "learning_rate": 1.8043703266874447e-05, "loss": 0.003, "num_tokens": 411223153.0, "reward": 2.06396484375, "reward_std": 0.11798909306526184, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 755.873046875, "completions/mean_terminated_length": 755.873046875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.28300759580097296, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10843253318636724, "kl": 0.1611328125, "learning_rate": 1.803661940726736e-05, "loss": 0.0021, "num_tokens": 411695520.0, "reward": 2.02978515625, "reward_std": 0.13508902490139008, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 763.615234375, "completions/mean_terminated_length": 762.4735717773438, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.28334898011436377, "frac_reward_zero_std": 0.625, "grad_norm": 0.8014249139073002, "kl": 0.18359375, "learning_rate": 1.8029524141148943e-05, "loss": 0.0186, "num_tokens": 412173883.0, "reward": 2.02197265625, "reward_std": 0.12161684036254883, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 765.41015625, "completions/mean_terminated_length": 765.41015625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.2836903644277545, "frac_reward_zero_std": 0.75, "grad_norm": 0.08415663836957323, "kl": 0.15380859375, "learning_rate": 1.802241747858963e-05, "loss": 0.0102, "num_tokens": 412646109.0, "reward": 2.04638671875, "reward_std": 0.08948136866092682, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 700.162109375, "completions/mean_terminated_length": 700.162109375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.28403174874114534, "frac_reward_zero_std": 0.625, "grad_norm": 0.10902291089552006, "kl": 0.154052734375, "learning_rate": 1.8015299429676023e-05, "loss": 0.0054, "num_tokens": 413092704.0, "reward": 2.11376953125, "reward_std": 0.12986154854297638, "rewards/accuracy_reward/mean": 0.12916666269302368, "rewards/accuracy_reward/std": 0.3357342481613159, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 735.740234375, "completions/mean_terminated_length": 735.1820068359375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.28437313305453615, "frac_reward_zero_std": 0.59375, "grad_norm": 0.4695835705162757, "kl": 0.15478515625, "learning_rate": 1.8008170004510894e-05, "loss": 0.0141, "num_tokens": 413567451.0, "reward": 2.041015625, "reward_std": 0.1445073038339615, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23102475702762604, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 695.015625, "completions/mean_terminated_length": 694.2348022460938, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.28471451736792697, "frac_reward_zero_std": 0.59375, "grad_norm": 0.27720830849327904, "kl": 0.154541015625, "learning_rate": 1.800102921321316e-05, "loss": 0.0089, "num_tokens": 414012515.0, "reward": 2.1181640625, "reward_std": 0.16784319281578064, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 719.28515625, "completions/mean_terminated_length": 718.6653442382812, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.2850559016813177, "frac_reward_zero_std": 0.5, "grad_norm": 0.5718871405077267, "kl": 0.201171875, "learning_rate": 1.7993877065917863e-05, "loss": 0.0204, "num_tokens": 414463045.0, "reward": 2.1337890625, "reward_std": 0.18239185214042664, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 700.416015625, "completions/mean_terminated_length": 700.416015625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.28539728599470854, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13200335537451066, "kl": 0.1484375, "learning_rate": 1.7986713572776175e-05, "loss": 0.0137, "num_tokens": 414906234.0, "reward": 2.16259765625, "reward_std": 0.2214335799217224, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 756.478515625, "completions/mean_terminated_length": 756.478515625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.28573867030809935, "frac_reward_zero_std": 0.5, "grad_norm": 0.12209675970721624, "kl": 0.142333984375, "learning_rate": 1.7979538743955357e-05, "loss": 0.0107, "num_tokens": 415374495.0, "reward": 2.07568359375, "reward_std": 0.17172186076641083, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 809.1015625, "completions/mean_terminated_length": 809.1015625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.28608005462149017, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10580531119843611, "kl": 0.141845703125, "learning_rate": 1.7972352589638774e-05, "loss": 0.0058, "num_tokens": 415872963.0, "reward": 2.05029296875, "reward_std": 0.09452622383832932, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 763.982421875, "completions/mean_terminated_length": 763.982421875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.2864214389348809, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12781492261669744, "kl": 0.141357421875, "learning_rate": 1.796515512002585e-05, "loss": 0.0085, "num_tokens": 416340842.0, "reward": 2.07080078125, "reward_std": 0.1711217164993286, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 765.71484375, "completions/mean_terminated_length": 765.71484375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.28676282324827174, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12511660015043094, "kl": 0.14501953125, "learning_rate": 1.795794634533208e-05, "loss": 0.0222, "num_tokens": 416812296.0, "reward": 2.033203125, "reward_std": 0.15441367030143738, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 800.00390625, "completions/mean_terminated_length": 800.00390625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.28710420756166255, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12405870751781571, "kl": 0.144287109375, "learning_rate": 1.7950726275789e-05, "loss": 0.0065, "num_tokens": 417302202.0, "reward": 2.056640625, "reward_std": 0.15289202332496643, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 731.71484375, "completions/mean_terminated_length": 731.71484375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.28744559187505336, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1375044645583613, "kl": 0.14453125, "learning_rate": 1.7943494921644182e-05, "loss": 0.0144, "num_tokens": 417750104.0, "reward": 2.11962890625, "reward_std": 0.20312078297138214, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 793.8359375, "completions/mean_terminated_length": 793.8359375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.2877869761884441, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3376224352178844, "kl": 0.193359375, "learning_rate": 1.7936252293161205e-05, "loss": 0.0125, "num_tokens": 418242756.0, "reward": 2.09033203125, "reward_std": 0.16102544963359833, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 813.646484375, "completions/mean_terminated_length": 813.646484375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.28812836050183493, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10884380728550097, "kl": 0.141357421875, "learning_rate": 1.792899840061966e-05, "loss": 0.0146, "num_tokens": 418742303.0, "reward": 2.0615234375, "reward_std": 0.1450706124305725, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04109063372015953, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 734.34375, "completions/mean_terminated_length": 728.0059204101562, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.28846974481522575, "frac_reward_zero_std": 0.03125, "grad_norm": 138.40942107037756, "kl": 18.49755859375, "learning_rate": 1.792173325431512e-05, "loss": 0.7716, "num_tokens": 419199055.0, "reward": 1.6923828125, "reward_std": 0.6239789724349976, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.751953125, "rewards/format_reward/std": 0.4323015511035919, "rewards/tag_count_reward/mean": 0.8798828125, "rewards/tag_count_reward/std": 0.21553099155426025, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 774.8828125, "completions/mean_terminated_length": 757.2356567382812, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.28881112912861656, "frac_reward_zero_std": 0.0, "grad_norm": 0.5436295355045321, "kl": 0.194580078125, "learning_rate": 1.7914456864559125e-05, "loss": 0.0293, "num_tokens": 419674483.0, "reward": 1.1025390625, "reward_std": 0.7295447587966919, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.32421875, "rewards/format_reward/std": 0.4685399830341339, "rewards/tag_count_reward/mean": 0.6669921875, "rewards/tag_count_reward/std": 0.24519717693328857, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 754.38671875, "completions/mean_terminated_length": 733.8532104492188, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.2891525134420073, "frac_reward_zero_std": 0.0, "grad_norm": 0.23857191052298732, "kl": 0.1982421875, "learning_rate": 1.790716924167919e-05, "loss": 0.0331, "num_tokens": 420141721.0, "reward": 1.17822265625, "reward_std": 0.7541235685348511, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.48028653860092163, "rewards/tag_count_reward/mean": 0.69189453125, "rewards/tag_count_reward/std": 0.24613991379737854, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 748.083984375, "completions/mean_terminated_length": 721.4091796875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.28949389775539813, "frac_reward_zero_std": 0.0, "grad_norm": 13.187006576600316, "kl": 2.055908203125, "learning_rate": 1.7899870396018758e-05, "loss": 0.124, "num_tokens": 420614292.0, "reward": 1.53173828125, "reward_std": 0.7384889125823975, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.603515625, "rewards/format_reward/std": 0.4896455705165863, "rewards/tag_count_reward/mean": 0.83837890625, "rewards/tag_count_reward/std": 0.22638291120529175, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 687.603515625, "completions/mean_terminated_length": 666.0099487304688, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.28983528206878895, "frac_reward_zero_std": 0.0, "grad_norm": 13.931952792215222, "kl": 3.45751953125, "learning_rate": 1.7892560337937204e-05, "loss": 0.1372, "num_tokens": 421047897.0, "reward": 1.6708984375, "reward_std": 0.6547743082046509, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.693359375, "rewards/format_reward/std": 0.4615498185157776, "rewards/tag_count_reward/mean": 0.8974609375, "rewards/tag_count_reward/std": 0.17896582186222076, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 713.8046875, "completions/mean_terminated_length": 639.5299072265625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.29017666638217976, "frac_reward_zero_std": 0.0, "grad_norm": 4.292722459768836, "kl": 1.353759765625, "learning_rate": 1.7885239077809818e-05, "loss": 0.1495, "num_tokens": 421498261.0, "reward": 1.62060546875, "reward_std": 0.653489351272583, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.689453125, "rewards/format_reward/std": 0.46317005157470703, "rewards/tag_count_reward/mean": 0.87060546875, "rewards/tag_count_reward/std": 0.2149733155965805, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 865.8828125, "completions/mean_terminated_length": 640.455810546875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2905180506955705, "frac_reward_zero_std": 0.0, "grad_norm": 0.6555355608665298, "kl": 0.4091796875, "learning_rate": 1.787790662602779e-05, "loss": 0.1763, "num_tokens": 422026297.0, "reward": 1.59423828125, "reward_std": 0.5975843071937561, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.744140625, "rewards/format_reward/std": 0.43676990270614624, "rewards/tag_count_reward/mean": 0.82080078125, "rewards/tag_count_reward/std": 0.24628739058971405, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 751.3984375, "completions/mean_terminated_length": 617.2672119140625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.29085943500896133, "frac_reward_zero_std": 0.0, "grad_norm": 23.027210644227395, "kl": 0.67236328125, "learning_rate": 1.7870562992998195e-05, "loss": 0.1959, "num_tokens": 422498693.0, "reward": 1.6650390625, "reward_std": 0.5706939101219177, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.7734375, "rewards/format_reward/std": 0.4190165400505066, "rewards/tag_count_reward/mean": 0.8603515625, "rewards/tag_count_reward/std": 0.22180603444576263, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 703.046875, "completions/mean_terminated_length": 595.2235717773438, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.29120081932235214, "frac_reward_zero_std": 0.0, "grad_norm": 1.1507862452588027, "kl": 0.373046875, "learning_rate": 1.7863208189143982e-05, "loss": 0.1778, "num_tokens": 422939069.0, "reward": 1.68505859375, "reward_std": 0.584242582321167, "rewards/accuracy_reward/mean": 0.04435483738780022, "rewards/accuracy_reward/std": 0.2060900777578354, "rewards/format_reward/mean": 0.759765625, "rewards/format_reward/std": 0.4276435375213623, "rewards/tag_count_reward/mean": 0.88232421875, "rewards/tag_count_reward/std": 0.20015910267829895, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 583.783203125, "completions/mean_terminated_length": 580.9177856445312, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.29154220363574296, "frac_reward_zero_std": 0.0, "grad_norm": 0.2401828778142307, "kl": 0.2412109375, "learning_rate": 1.785584222490394e-05, "loss": 0.0246, "num_tokens": 423310574.0, "reward": 1.78564453125, "reward_std": 0.515559732913971, "rewards/accuracy_reward/mean": 0.038306452333927155, "rewards/accuracy_reward/std": 0.19212882220745087, "rewards/format_reward/mean": 0.810546875, "rewards/format_reward/std": 0.3922513723373413, "rewards/tag_count_reward/mean": 0.93798828125, "rewards/tag_count_reward/std": 0.1370190531015396, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 611.001953125, "completions/mean_terminated_length": 607.6569213867188, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.2918835879491337, "frac_reward_zero_std": 0.03125, "grad_norm": 0.5355924660967324, "kl": 0.247314453125, "learning_rate": 1.7848465110732713e-05, "loss": 0.0273, "num_tokens": 423703535.0, "reward": 1.8037109375, "reward_std": 0.4583161771297455, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.837890625, "rewards/format_reward/std": 0.3689115643501282, "rewards/tag_count_reward/mean": 0.9482421875, "rewards/tag_count_reward/std": 0.12708856165409088, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 661.87890625, "completions/mean_terminated_length": 654.3372802734375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.29222497226252453, "frac_reward_zero_std": 0.0625, "grad_norm": 1.4225076853635603, "kl": 0.274658203125, "learning_rate": 1.7841076857100766e-05, "loss": 0.0609, "num_tokens": 424126721.0, "reward": 1.93310546875, "reward_std": 0.3690539300441742, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.904296875, "rewards/format_reward/std": 0.2944713830947876, "rewards/tag_count_reward/mean": 0.95654296875, "rewards/tag_count_reward/std": 0.1456853747367859, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 688.2890625, "completions/mean_terminated_length": 684.680419921875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.29256635657591534, "frac_reward_zero_std": 0.125, "grad_norm": 0.7764304456432485, "kl": 0.222900390625, "learning_rate": 1.7833677474494374e-05, "loss": 0.0309, "num_tokens": 424579365.0, "reward": 2.0224609375, "reward_std": 0.37031668424606323, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2422981858253479, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.09585157036781311, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 681.396484375, "completions/mean_terminated_length": 677.994140625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.29290774088930616, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3289088690367107, "kl": 0.2021484375, "learning_rate": 1.7826266973415605e-05, "loss": 0.0347, "num_tokens": 425006608.0, "reward": 1.98095703125, "reward_std": 0.24361704289913177, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.97900390625, "rewards/tag_count_reward/std": 0.09212422370910645, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 677.013671875, "completions/mean_terminated_length": 676.0371704101562, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2932491252026969, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3474301499487671, "kl": 0.262451171875, "learning_rate": 1.7818845364382314e-05, "loss": 0.0338, "num_tokens": 425437559.0, "reward": 2.0166015625, "reward_std": 0.17785173654556274, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.07392385601997375, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 664.509765625, "completions/mean_terminated_length": 663.5264282226562, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2935905095160877, "frac_reward_zero_std": 0.28125, "grad_norm": 0.47294800808833254, "kl": 0.249755859375, "learning_rate": 1.7811412657928112e-05, "loss": 0.0253, "num_tokens": 425855932.0, "reward": 2.0439453125, "reward_std": 0.1813722550868988, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.05922659486532211, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 698.40234375, "completions/mean_terminated_length": 696.298095703125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.29393189382947854, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7446852645992398, "kl": 0.17578125, "learning_rate": 1.780396886460237e-05, "loss": 0.0304, "num_tokens": 426289530.0, "reward": 2.064453125, "reward_std": 0.15716159343719482, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05608600005507469, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 713.0078125, "completions/mean_terminated_length": 713.0078125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.29427327814286935, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13743918377758527, "kl": 0.179443359375, "learning_rate": 1.779651399497019e-05, "loss": 0.0077, "num_tokens": 426736254.0, "reward": 2.1279296875, "reward_std": 0.2177475392818451, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 723.310546875, "completions/mean_terminated_length": 723.310546875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2946146624562601, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13093084919386216, "kl": 0.180419921875, "learning_rate": 1.7789048059612397e-05, "loss": 0.0243, "num_tokens": 427184685.0, "reward": 2.11572265625, "reward_std": 0.190769761800766, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 753.681640625, "completions/mean_terminated_length": 751.1487426757812, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.2949560467696509, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1471830654304367, "kl": 0.16748046875, "learning_rate": 1.7781571069125515e-05, "loss": 0.0354, "num_tokens": 427656090.0, "reward": 2.0576171875, "reward_std": 0.15316948294639587, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05811915174126625, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 727.435546875, "completions/mean_terminated_length": 724.8512573242188, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.29529743108304174, "frac_reward_zero_std": 0.625, "grad_norm": 0.14905966011088326, "kl": 0.176025390625, "learning_rate": 1.7774083034121777e-05, "loss": 0.0221, "num_tokens": 428112025.0, "reward": 2.05224609375, "reward_std": 0.14532890915870667, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 716.3671875, "completions/mean_terminated_length": 713.76123046875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.29563881539643255, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1289320288832047, "kl": 0.168701171875, "learning_rate": 1.7766583965229064e-05, "loss": 0.0315, "num_tokens": 428559525.0, "reward": 2.04833984375, "reward_std": 0.12554889917373657, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.0593937449157238, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 697.7109375, "completions/mean_terminated_length": 697.7109375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.2959801997098233, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11597583413673374, "kl": 0.1640625, "learning_rate": 1.7759073873090944e-05, "loss": 0.0154, "num_tokens": 429007313.0, "reward": 2.05517578125, "reward_std": 0.11781644076108932, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 722.93359375, "completions/mean_terminated_length": 720.3405151367188, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.2963215840232141, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10189706896188963, "kl": 0.161376953125, "learning_rate": 1.7751552768366616e-05, "loss": 0.0162, "num_tokens": 429468239.0, "reward": 2.05810546875, "reward_std": 0.08332538604736328, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 721.38671875, "completions/mean_terminated_length": 720.4031372070312, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.29666296833660494, "frac_reward_zero_std": 0.75, "grad_norm": 5.134243635422457, "kl": 1.113037109375, "learning_rate": 1.774402066173091e-05, "loss": 0.053, "num_tokens": 429920085.0, "reward": 2.0703125, "reward_std": 0.08982278406620026, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 675.1875, "completions/mean_terminated_length": 672.5009765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.29700435264999575, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13780214136453653, "kl": 0.169921875, "learning_rate": 1.7736477563874274e-05, "loss": 0.0201, "num_tokens": 430340853.0, "reward": 2.0927734375, "reward_std": 0.1824759840965271, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 634.802734375, "completions/mean_terminated_length": 634.802734375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2973457369633865, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12059918184812497, "kl": 0.180419921875, "learning_rate": 1.7728923485502757e-05, "loss": 0.0156, "num_tokens": 430749840.0, "reward": 2.06201171875, "reward_std": 0.10005573183298111, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 653.513671875, "completions/mean_terminated_length": 653.513671875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2976871212767773, "frac_reward_zero_std": 0.5, "grad_norm": 0.14803594666341344, "kl": 0.175048828125, "learning_rate": 1.7721358437337996e-05, "loss": 0.02, "num_tokens": 431167383.0, "reward": 2.06494140625, "reward_std": 0.1887962818145752, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 626.857421875, "completions/mean_terminated_length": 626.857421875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.29802850559016814, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1201801147685192, "kl": 0.175048828125, "learning_rate": 1.7713782430117185e-05, "loss": 0.0151, "num_tokens": 431570782.0, "reward": 2.0966796875, "reward_std": 0.1113523319363594, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 636.666015625, "completions/mean_terminated_length": 635.622314453125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.29836988990355895, "frac_reward_zero_std": 0.6875, "grad_norm": 10.739399947094752, "kl": 3.961669921875, "learning_rate": 1.7706195474593085e-05, "loss": 0.1703, "num_tokens": 431974931.0, "reward": 2.03466796875, "reward_std": 0.11024978756904602, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 629.6484375, "completions/mean_terminated_length": 629.6484375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2987112742169497, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12057673478784812, "kl": 0.18310546875, "learning_rate": 1.7698597581533992e-05, "loss": 0.016, "num_tokens": 432395167.0, "reward": 2.04296875, "reward_std": 0.11948468536138535, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 637.244140625, "completions/mean_terminated_length": 637.244140625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.2990526585303405, "frac_reward_zero_std": 0.625, "grad_norm": 0.118794681235932, "kl": 0.18017578125, "learning_rate": 1.7690988761723727e-05, "loss": 0.0075, "num_tokens": 432800140.0, "reward": 2.0849609375, "reward_std": 0.1491813063621521, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 614.341796875, "completions/mean_terminated_length": 612.7201538085938, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.29939404284373133, "frac_reward_zero_std": 0.75, "grad_norm": 1.8207949524622329, "kl": 1.040283203125, "learning_rate": 1.7683369025961613e-05, "loss": 0.0548, "num_tokens": 433200571.0, "reward": 2.0224609375, "reward_std": 0.10247913002967834, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 637.42578125, "completions/mean_terminated_length": 636.2700805664062, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.29973542715712215, "frac_reward_zero_std": 0.75, "grad_norm": 0.475003901036567, "kl": 0.192138671875, "learning_rate": 1.7675738385062474e-05, "loss": 0.0156, "num_tokens": 433607477.0, "reward": 2.0302734375, "reward_std": 0.09094181656837463, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 646.8515625, "completions/mean_terminated_length": 646.8515625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3000768114705129, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12185634277861654, "kl": 0.18505859375, "learning_rate": 1.766809684985661e-05, "loss": 0.0053, "num_tokens": 434039721.0, "reward": 2.06494140625, "reward_std": 0.1280735433101654, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 692.435546875, "completions/mean_terminated_length": 692.435546875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3004181957839037, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11305894803304335, "kl": 0.186767578125, "learning_rate": 1.766044443118978e-05, "loss": 0.0072, "num_tokens": 434497032.0, "reward": 2.06005859375, "reward_std": 0.10082658380270004, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 695.228515625, "completions/mean_terminated_length": 694.5890502929688, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.30075958009729453, "frac_reward_zero_std": 0.65625, "grad_norm": 0.883826563786592, "kl": 0.629150390625, "learning_rate": 1.7652781139923195e-05, "loss": 0.0335, "num_tokens": 434930397.0, "reward": 2.048828125, "reward_std": 0.13716641068458557, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 730.013671875, "completions/mean_terminated_length": 730.013671875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.30110096441068535, "frac_reward_zero_std": 0.625, "grad_norm": 0.12856024614763406, "kl": 0.182861328125, "learning_rate": 1.76451069869335e-05, "loss": 0.0013, "num_tokens": 435389492.0, "reward": 2.09033203125, "reward_std": 0.13478220999240875, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 797.267578125, "completions/mean_terminated_length": 797.267578125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3014423487240761, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13687624208712426, "kl": 0.171630859375, "learning_rate": 1.763742198311274e-05, "loss": 0.0225, "num_tokens": 435882141.0, "reward": 2.029296875, "reward_std": 0.22515061497688293, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.043540701270103455, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 800.611328125, "completions/mean_terminated_length": 800.611328125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3017837330374669, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11181325723102226, "kl": 0.173828125, "learning_rate": 1.762972613936838e-05, "loss": 0.0133, "num_tokens": 436376038.0, "reward": 2.01904296875, "reward_std": 0.1313178837299347, "rewards/accuracy_reward/mean": 0.03629032149910927, "rewards/accuracy_reward/std": 0.1872003823518753, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 817.92578125, "completions/mean_terminated_length": 814.8333740234375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.30212511735085773, "frac_reward_zero_std": 0.5, "grad_norm": 0.3899965038668557, "kl": 0.172607421875, "learning_rate": 1.7622019466623265e-05, "loss": 0.0241, "num_tokens": 436874672.0, "reward": 2.06201171875, "reward_std": 0.18259915709495544, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.06124715134501457, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 758.56640625, "completions/mean_terminated_length": 758.56640625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.30246650166424854, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13318024222299094, "kl": 0.178955078125, "learning_rate": 1.76143019758156e-05, "loss": 0.0093, "num_tokens": 437344114.0, "reward": 2.07421875, "reward_std": 0.15045064687728882, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 781.81640625, "completions/mean_terminated_length": 781.81640625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.3028078859776393, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11542895104493078, "kl": 0.16748046875, "learning_rate": 1.760657367789895e-05, "loss": 0.0077, "num_tokens": 437861332.0, "reward": 2.10546875, "reward_std": 0.14670313894748688, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 784.841796875, "completions/mean_terminated_length": 784.841796875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.3031492702910301, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11033139507818673, "kl": 0.171142578125, "learning_rate": 1.7598834583842234e-05, "loss": 0.0068, "num_tokens": 438354483.0, "reward": 2.015625, "reward_std": 0.10294695198535919, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 719.240234375, "completions/mean_terminated_length": 719.240234375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.30349065460442093, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12788389288611796, "kl": 0.164794921875, "learning_rate": 1.759108470462967e-05, "loss": 0.005, "num_tokens": 438802734.0, "reward": 2.09033203125, "reward_std": 0.16216400265693665, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 720.640625, "completions/mean_terminated_length": 719.8356323242188, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.30383203891781174, "frac_reward_zero_std": 0.5, "grad_norm": 0.5209162665303292, "kl": 0.175537109375, "learning_rate": 1.7583324051260795e-05, "loss": 0.0174, "num_tokens": 439248582.0, "reward": 2.12841796875, "reward_std": 0.18938778340816498, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 770.431640625, "completions/mean_terminated_length": 770.431640625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.3041734232312025, "frac_reward_zero_std": 0.625, "grad_norm": 0.12248448310578118, "kl": 0.169921875, "learning_rate": 1.7575552634750438e-05, "loss": 0.0041, "num_tokens": 439725747.0, "reward": 2.03076171875, "reward_std": 0.12757860124111176, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 735.6015625, "completions/mean_terminated_length": 735.6015625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.3045148075445933, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0755121623957933, "kl": 0.171630859375, "learning_rate": 1.7567770466128705e-05, "loss": 0.0071, "num_tokens": 440175143.0, "reward": 2.04150390625, "reward_std": 0.06913109123706818, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 708.11328125, "completions/mean_terminated_length": 708.11328125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3048561918579841, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12188561719943747, "kl": 0.166015625, "learning_rate": 1.7559977556440956e-05, "loss": 0.0057, "num_tokens": 440620721.0, "reward": 2.07861328125, "reward_std": 0.16396740078926086, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 714.4609375, "completions/mean_terminated_length": 714.4609375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.30519757617137494, "frac_reward_zero_std": 0.5, "grad_norm": 0.1286747242044125, "kl": 0.16162109375, "learning_rate": 1.755217391674781e-05, "loss": 0.0113, "num_tokens": 441073885.0, "reward": 2.09423828125, "reward_std": 0.19688382744789124, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 675.939453125, "completions/mean_terminated_length": 675.939453125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3055389604847657, "frac_reward_zero_std": 0.625, "grad_norm": 0.11478679488146229, "kl": 0.16259765625, "learning_rate": 1.75443595581251e-05, "loss": 0.0008, "num_tokens": 441494254.0, "reward": 2.14501953125, "reward_std": 0.15489605069160461, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03651979938149452, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 685.412109375, "completions/mean_terminated_length": 683.8215942382812, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3058803447981565, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8492009911347723, "kl": 0.2041015625, "learning_rate": 1.7536534491663876e-05, "loss": 0.0143, "num_tokens": 441923681.0, "reward": 2.01611328125, "reward_std": 0.1127190813422203, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 749.62109375, "completions/mean_terminated_length": 749.62109375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.3062217291115473, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11286325310331222, "kl": 0.162353515625, "learning_rate": 1.7528698728470393e-05, "loss": 0.0076, "num_tokens": 442402543.0, "reward": 2.1015625, "reward_std": 0.1622658222913742, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 719.3203125, "completions/mean_terminated_length": 718.4422607421875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.30656311342493814, "frac_reward_zero_std": 0.59375, "grad_norm": 1.6712438545442452, "kl": 0.56640625, "learning_rate": 1.752085227966608e-05, "loss": 0.0362, "num_tokens": 442848739.0, "reward": 2.048828125, "reward_std": 0.15790215134620667, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.26366615295410156, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 776.017578125, "completions/mean_terminated_length": 776.017578125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3069044977383289, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12817912081129826, "kl": 0.1630859375, "learning_rate": 1.7512995156387543e-05, "loss": 0.0021, "num_tokens": 443327500.0, "reward": 2.06689453125, "reward_std": 0.18810415267944336, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 809.013671875, "completions/mean_terminated_length": 807.4148559570312, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3072458820517197, "frac_reward_zero_std": 0.6875, "grad_norm": 0.35677894640245755, "kl": 0.178955078125, "learning_rate": 1.7505127369786524e-05, "loss": 0.0052, "num_tokens": 443817907.0, "reward": 2.0166015625, "reward_std": 0.09886808693408966, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 902.416015625, "completions/mean_terminated_length": 902.416015625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.3075872663651105, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07353995685824735, "kl": 0.1494140625, "learning_rate": 1.7497248931029912e-05, "loss": 0.0061, "num_tokens": 444371592.0, "reward": 2.01416015625, "reward_std": 0.06819306313991547, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 835.6171875, "completions/mean_terminated_length": 835.6171875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.30792865067850134, "frac_reward_zero_std": 0.625, "grad_norm": 0.11274626800358435, "kl": 0.161376953125, "learning_rate": 1.748935985129971e-05, "loss": 0.0159, "num_tokens": 444888452.0, "reward": 2.10791015625, "reward_std": 0.1318867951631546, "rewards/accuracy_reward/mean": 0.12298387289047241, "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 836.61328125, "completions/mean_terminated_length": 833.9020385742188, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.30827003499189215, "frac_reward_zero_std": 0.53125, "grad_norm": 0.9265141115980652, "kl": 0.263916015625, "learning_rate": 1.748146014179302e-05, "loss": 0.0204, "num_tokens": 445402318.0, "reward": 2.07421875, "reward_std": 0.1716192215681076, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 858.029296875, "completions/mean_terminated_length": 858.029296875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3086114193052829, "frac_reward_zero_std": 0.625, "grad_norm": 0.1182014562649975, "kl": 0.1494140625, "learning_rate": 1.7473549813722037e-05, "loss": 0.0096, "num_tokens": 445929213.0, "reward": 2.0517578125, "reward_std": 0.1412045657634735, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 788.666015625, "completions/mean_terminated_length": 786.9021606445312, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3089528036186737, "frac_reward_zero_std": 0.625, "grad_norm": 0.6594395005297228, "kl": 0.3203125, "learning_rate": 1.7465628878314024e-05, "loss": 0.0295, "num_tokens": 446413538.0, "reward": 2.07861328125, "reward_std": 0.1369258016347885, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 694.875, "completions/mean_terminated_length": 694.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.30929418793206453, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13829957627569922, "kl": 0.168701171875, "learning_rate": 1.74576973468113e-05, "loss": 0.0083, "num_tokens": 446855650.0, "reward": 2.04150390625, "reward_std": 0.08970718830823898, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 703.25, "completions/mean_terminated_length": 703.25, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.30963557224545535, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10281008329325855, "kl": 0.1630859375, "learning_rate": 1.7449755230471225e-05, "loss": 0.0077, "num_tokens": 447299138.0, "reward": 2.06640625, "reward_std": 0.10322566330432892, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 598.23828125, "completions/mean_terminated_length": 597.3307495117188, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3099769565588461, "frac_reward_zero_std": 0.46875, "grad_norm": 0.5850812387428729, "kl": 0.33642578125, "learning_rate": 1.7441802540566176e-05, "loss": 0.0231, "num_tokens": 447689452.0, "reward": 2.125, "reward_std": 0.1814446747303009, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 589.427734375, "completions/mean_terminated_length": 588.5440063476562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3103183408722369, "frac_reward_zero_std": 0.5, "grad_norm": 0.2759012065566434, "kl": 0.3466796875, "learning_rate": 1.7433839288383547e-05, "loss": 0.0221, "num_tokens": 448070455.0, "reward": 2.0673828125, "reward_std": 0.17936529219150543, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04666558653116226, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 595.720703125, "completions/mean_terminated_length": 595.720703125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.31065972518562773, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12543113076207227, "kl": 0.206298828125, "learning_rate": 1.7425865485225717e-05, "loss": 0.0047, "num_tokens": 448454792.0, "reward": 2.0380859375, "reward_std": 0.11656804382801056, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 593.21484375, "completions/mean_terminated_length": 592.4031372070312, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.31100110949901855, "frac_reward_zero_std": 0.59375, "grad_norm": 0.9971491454126653, "kl": 0.457275390625, "learning_rate": 1.741788114241004e-05, "loss": 0.0182, "num_tokens": 448845814.0, "reward": 2.03955078125, "reward_std": 0.14698781073093414, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 607.490234375, "completions/mean_terminated_length": 607.490234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3113424938124093, "frac_reward_zero_std": 0.75, "grad_norm": 0.11388846774344646, "kl": 0.19189453125, "learning_rate": 1.7409886271268826e-05, "loss": 0.0095, "num_tokens": 449237953.0, "reward": 2.0244140625, "reward_std": 0.0951005071401596, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03484956547617912, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 631.9609375, "completions/mean_terminated_length": 631.9609375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3116838781258001, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14712415137649376, "kl": 0.182373046875, "learning_rate": 1.740188088314934e-05, "loss": 0.0113, "num_tokens": 449640045.0, "reward": 2.07275390625, "reward_std": 0.1839047074317932, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 731.697265625, "completions/mean_terminated_length": 731.697265625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.31202526243919093, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0918537346557698, "kl": 0.1845703125, "learning_rate": 1.7393864989413766e-05, "loss": 0.0067, "num_tokens": 450103058.0, "reward": 1.99755859375, "reward_std": 0.06731453537940979, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08812850713729858, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 776.28125, "completions/mean_terminated_length": 776.28125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.31236664675258174, "frac_reward_zero_std": 0.875, "grad_norm": 0.06636275504155495, "kl": 0.175048828125, "learning_rate": 1.7385838601439193e-05, "loss": 0.0074, "num_tokens": 450587714.0, "reward": 1.9599609375, "reward_std": 0.04896535724401474, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.13230475783348083, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 849.103515625, "completions/mean_terminated_length": 846.75732421875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3127080310659725, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11722738123391277, "kl": 0.16796875, "learning_rate": 1.7377801730617614e-05, "loss": 0.0167, "num_tokens": 451125735.0, "reward": 2.05615234375, "reward_std": 0.18076300621032715, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05711371824145317, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 858.3984375, "completions/mean_terminated_length": 857.364013671875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.3130494153793633, "frac_reward_zero_std": 0.625, "grad_norm": 0.39030528741584, "kl": 0.20068359375, "learning_rate": 1.7369754388355894e-05, "loss": 0.024, "num_tokens": 451648051.0, "reward": 2.0634765625, "reward_std": 0.15059462189674377, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 876.21484375, "completions/mean_terminated_length": 875.0567626953125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.31339079969275413, "frac_reward_zero_std": 0.59375, "grad_norm": 0.42882571956938864, "kl": 0.165771484375, "learning_rate": 1.736169658607577e-05, "loss": 0.0173, "num_tokens": 452176097.0, "reward": 2.09228515625, "reward_std": 0.16378751397132874, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 788.142578125, "completions/mean_terminated_length": 788.142578125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.31373218400614494, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10735331440636371, "kl": 0.16552734375, "learning_rate": 1.7353628335213813e-05, "loss": 0.0034, "num_tokens": 452662234.0, "reward": 2.056640625, "reward_std": 0.11576303839683533, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 839.2734375, "completions/mean_terminated_length": 839.2734375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3140735683195357, "frac_reward_zero_std": 0.625, "grad_norm": 0.10270939136510401, "kl": 0.163330078125, "learning_rate": 1.7345549647221432e-05, "loss": 0.0143, "num_tokens": 453172982.0, "reward": 2.05419921875, "reward_std": 0.1200198233127594, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 769.404296875, "completions/mean_terminated_length": 769.404296875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.3144149526329265, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11000979127382032, "kl": 0.165283203125, "learning_rate": 1.7337460533564846e-05, "loss": 0.0108, "num_tokens": 453653989.0, "reward": 2.146484375, "reward_std": 0.13933143019676208, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 834.2265625, "completions/mean_terminated_length": 834.2265625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3147563369463173, "frac_reward_zero_std": 0.5, "grad_norm": 0.12030907506157121, "kl": 0.155517578125, "learning_rate": 1.7329361005725072e-05, "loss": 0.0101, "num_tokens": 454171433.0, "reward": 2.0732421875, "reward_std": 0.1918981373310089, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 730.82421875, "completions/mean_terminated_length": 730.82421875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.31509772125970814, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09077118267614485, "kl": 0.16943359375, "learning_rate": 1.732125107519791e-05, "loss": 0.0116, "num_tokens": 454629471.0, "reward": 2.04150390625, "reward_std": 0.11122649163007736, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 728.134765625, "completions/mean_terminated_length": 728.134765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3154391055730989, "frac_reward_zero_std": 0.75, "grad_norm": 0.08535803723683133, "kl": 0.157958984375, "learning_rate": 1.7313130753493917e-05, "loss": 0.0087, "num_tokens": 455080308.0, "reward": 2.0234375, "reward_std": 0.08702600002288818, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 702.357421875, "completions/mean_terminated_length": 702.357421875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3157804898864897, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09764010308201362, "kl": 0.169921875, "learning_rate": 1.7305000052138416e-05, "loss": 0.0064, "num_tokens": 455518475.0, "reward": 2.015625, "reward_std": 0.09605925530195236, "rewards/accuracy_reward/mean": 0.026209676638245583, "rewards/accuracy_reward/std": 0.1599196344614029, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 674.208984375, "completions/mean_terminated_length": 673.5714111328125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3161218741998805, "frac_reward_zero_std": 0.5625, "grad_norm": 0.31543891678852914, "kl": 0.166015625, "learning_rate": 1.7296858982671442e-05, "loss": 0.0206, "num_tokens": 455975206.0, "reward": 2.052734375, "reward_std": 0.1743868887424469, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21157780289649963, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.06179272010922432, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 697.205078125, "completions/mean_terminated_length": 697.205078125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.31646325851327134, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11157989622789968, "kl": 0.159912109375, "learning_rate": 1.7288707556647754e-05, "loss": 0.0043, "num_tokens": 456429583.0, "reward": 2.046875, "reward_std": 0.1466883420944214, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 695.4375, "completions/mean_terminated_length": 695.4375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3168046428266621, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16888815014132, "kl": 0.164306640625, "learning_rate": 1.7280545785636812e-05, "loss": 0.0117, "num_tokens": 456873375.0, "reward": 2.09619140625, "reward_std": 0.1880887746810913, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 722.38671875, "completions/mean_terminated_length": 722.38671875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.3171460271400529, "frac_reward_zero_std": 0.625, "grad_norm": 0.12431878561187931, "kl": 0.1552734375, "learning_rate": 1.7272373681222762e-05, "loss": 0.0097, "num_tokens": 457321749.0, "reward": 2.0341796875, "reward_std": 0.1219729483127594, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 714.369140625, "completions/mean_terminated_length": 714.369140625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3174874114534437, "frac_reward_zero_std": 0.75, "grad_norm": 0.09612268875543013, "kl": 0.1630859375, "learning_rate": 1.72641912550044e-05, "loss": 0.0123, "num_tokens": 457768978.0, "reward": 2.03466796875, "reward_std": 0.09791651368141174, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 692.939453125, "completions/mean_terminated_length": 692.939453125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.31782879576683454, "frac_reward_zero_std": 0.625, "grad_norm": 0.11363118949506135, "kl": 0.156494140625, "learning_rate": 1.7255998518595193e-05, "loss": 0.0132, "num_tokens": 458214467.0, "reward": 2.10302734375, "reward_std": 0.14530816674232483, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 730.79296875, "completions/mean_terminated_length": 730.79296875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3181701800802253, "frac_reward_zero_std": 0.59375, "grad_norm": 1.7092199453543464, "kl": 0.158447265625, "learning_rate": 1.724779548362323e-05, "loss": 0.0266, "num_tokens": 458669289.0, "reward": 2.07861328125, "reward_std": 0.15120935440063477, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 703.84765625, "completions/mean_terminated_length": 703.84765625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.3185115643936161, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10758074741929824, "kl": 0.159423828125, "learning_rate": 1.7239582161731218e-05, "loss": 0.0039, "num_tokens": 459110523.0, "reward": 2.03564453125, "reward_std": 0.12083861231803894, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 814.43359375, "completions/mean_terminated_length": 811.0235595703125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3188529487070069, "frac_reward_zero_std": 0.65625, "grad_norm": 0.40955466213764824, "kl": 0.149658203125, "learning_rate": 1.723135856457647e-05, "loss": 0.0218, "num_tokens": 459610169.0, "reward": 2.06005859375, "reward_std": 0.12617143988609314, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 750.826171875, "completions/mean_terminated_length": 750.826171875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.31919433302039774, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11232609593579254, "kl": 0.148193359375, "learning_rate": 1.7223124703830864e-05, "loss": 0.0098, "num_tokens": 460105680.0, "reward": 2.11474609375, "reward_std": 0.1420927196741104, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 820.673828125, "completions/mean_terminated_length": 820.673828125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3195357173337885, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10084092780503634, "kl": 0.1435546875, "learning_rate": 1.7214880591180874e-05, "loss": 0.0064, "num_tokens": 460613849.0, "reward": 2.0546875, "reward_std": 0.10244170576334, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 793.9296875, "completions/mean_terminated_length": 793.9296875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3198771016471793, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12996469653356085, "kl": 0.156982421875, "learning_rate": 1.72066262383275e-05, "loss": 0.0164, "num_tokens": 461104325.0, "reward": 2.04150390625, "reward_std": 0.13467730581760406, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 812.189453125, "completions/mean_terminated_length": 812.189453125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.3202184859605701, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1239773375420398, "kl": 0.151611328125, "learning_rate": 1.7198361656986294e-05, "loss": 0.0145, "num_tokens": 461600310.0, "reward": 2.0537109375, "reward_std": 0.11437451094388962, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 851.287109375, "completions/mean_terminated_length": 851.287109375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.32055987027396093, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08849995873251533, "kl": 0.146240234375, "learning_rate": 1.7190086858887317e-05, "loss": 0.0162, "num_tokens": 462116361.0, "reward": 2.0498046875, "reward_std": 0.08072778582572937, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 787.79296875, "completions/mean_terminated_length": 787.79296875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.3209012545873517, "frac_reward_zero_std": 0.625, "grad_norm": 0.11208181572289193, "kl": 0.150634765625, "learning_rate": 1.7181801855775125e-05, "loss": 0.0135, "num_tokens": 462599391.0, "reward": 2.04345703125, "reward_std": 0.13740262389183044, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 769.517578125, "completions/mean_terminated_length": 769.517578125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3212426389007425, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0979683038715685, "kl": 0.1484375, "learning_rate": 1.717350665940877e-05, "loss": -0.0004, "num_tokens": 463074248.0, "reward": 2.060546875, "reward_std": 0.12064187228679657, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 803.068359375, "completions/mean_terminated_length": 801.4324951171875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.3215840232141333, "frac_reward_zero_std": 0.59375, "grad_norm": 3.5049250268238925, "kl": 1.45654296875, "learning_rate": 1.716520128156176e-05, "loss": 0.0755, "num_tokens": 463590443.0, "reward": 2.01416015625, "reward_std": 0.12354997545480728, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 713.8203125, "completions/mean_terminated_length": 713.8203125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.32192540752752413, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1330431198579052, "kl": 0.152587890625, "learning_rate": 1.715688573402207e-05, "loss": 0.0238, "num_tokens": 464036751.0, "reward": 2.13916015625, "reward_std": 0.19189384579658508, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 706.38671875, "completions/mean_terminated_length": 706.38671875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3222667918409149, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10660590602115394, "kl": 0.1484375, "learning_rate": 1.7148560028592086e-05, "loss": 0.0088, "num_tokens": 464480309.0, "reward": 2.08056640625, "reward_std": 0.11891620606184006, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 698.810546875, "completions/mean_terminated_length": 698.810546875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3226081761543057, "frac_reward_zero_std": 0.75, "grad_norm": 0.08964055167229135, "kl": 0.15234375, "learning_rate": 1.7140224177088638e-05, "loss": 0.0108, "num_tokens": 464918660.0, "reward": 2.041015625, "reward_std": 0.09898563474416733, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 680.447265625, "completions/mean_terminated_length": 680.447265625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3229495604676965, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08945810693232685, "kl": 0.14990234375, "learning_rate": 1.713187819134293e-05, "loss": 0.0006, "num_tokens": 465361257.0, "reward": 2.11474609375, "reward_std": 0.08778423070907593, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 730.181640625, "completions/mean_terminated_length": 730.181640625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.32329094478108733, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2179137953055067, "kl": 0.16552734375, "learning_rate": 1.7123522083200573e-05, "loss": 0.014, "num_tokens": 465826982.0, "reward": 2.0771484375, "reward_std": 0.19152310490608215, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 691.12890625, "completions/mean_terminated_length": 691.12890625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3236323290944781, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1137433895172128, "kl": 0.15185546875, "learning_rate": 1.7115155864521527e-05, "loss": 0.0125, "num_tokens": 466255688.0, "reward": 2.07763671875, "reward_std": 0.1213422641158104, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 736.26171875, "completions/mean_terminated_length": 736.26171875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3239737134078689, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1276024249759697, "kl": 0.1455078125, "learning_rate": 1.710677954718011e-05, "loss": 0.0064, "num_tokens": 466714654.0, "reward": 2.12255859375, "reward_std": 0.19984450936317444, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 698.474609375, "completions/mean_terminated_length": 698.0430297851562, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3243150977212597, "frac_reward_zero_std": 0.5, "grad_norm": 0.41856846819063476, "kl": 0.1494140625, "learning_rate": 1.7098393143064972e-05, "loss": 0.0022, "num_tokens": 467164961.0, "reward": 2.07080078125, "reward_std": 0.17175361514091492, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 765.70703125, "completions/mean_terminated_length": 765.70703125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.32465648203465053, "frac_reward_zero_std": 0.5, "grad_norm": 0.11816132398697402, "kl": 0.139892578125, "learning_rate": 1.7089996664079085e-05, "loss": 0.0162, "num_tokens": 467646555.0, "reward": 2.068359375, "reward_std": 0.18943338096141815, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 745.791015625, "completions/mean_terminated_length": 745.791015625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3249978663480413, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12246990520876544, "kl": 0.144287109375, "learning_rate": 1.708159012213971e-05, "loss": 0.0045, "num_tokens": 468113568.0, "reward": 2.09228515625, "reward_std": 0.15122801065444946, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 768.619140625, "completions/mean_terminated_length": 768.619140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3253392506614321, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10447900532206537, "kl": 0.138916015625, "learning_rate": 1.7073173529178396e-05, "loss": 0.0177, "num_tokens": 468592573.0, "reward": 2.08935546875, "reward_std": 0.14220339059829712, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 793.013671875, "completions/mean_terminated_length": 793.013671875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3256806349748229, "frac_reward_zero_std": 0.59375, "grad_norm": 0.111482799681355, "kl": 0.140380859375, "learning_rate": 1.706474689714096e-05, "loss": 0.006, "num_tokens": 469078148.0, "reward": 2.06640625, "reward_std": 0.14741763472557068, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 737.998046875, "completions/mean_terminated_length": 737.998046875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3260220192882137, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14182698982971512, "kl": 0.14501953125, "learning_rate": 1.705631023798746e-05, "loss": 0.012, "num_tokens": 469534531.0, "reward": 2.05615234375, "reward_std": 0.20222368836402893, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 746.58984375, "completions/mean_terminated_length": 744.0430297851562, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3263634036016045, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10923168211089034, "kl": 0.14697265625, "learning_rate": 1.70478635636922e-05, "loss": 0.0118, "num_tokens": 469995793.0, "reward": 2.09326171875, "reward_std": 0.15580815076828003, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 785.869140625, "completions/mean_terminated_length": 785.869140625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3267047879149953, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11466395932530962, "kl": 0.142822265625, "learning_rate": 1.703940688624368e-05, "loss": 0.0107, "num_tokens": 470481006.0, "reward": 2.109375, "reward_std": 0.16137611865997314, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 725.513671875, "completions/mean_terminated_length": 725.513671875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.3270461722283861, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10416155076069296, "kl": 0.144775390625, "learning_rate": 1.7030940217644607e-05, "loss": 0.0128, "num_tokens": 470933829.0, "reward": 2.033203125, "reward_std": 0.10728531330823898, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 725.97265625, "completions/mean_terminated_length": 722.3392333984375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3273875565417769, "frac_reward_zero_std": 0.5, "grad_norm": 0.2551463514480699, "kl": 0.1494140625, "learning_rate": 1.7022463569911875e-05, "loss": 0.0392, "num_tokens": 471383815.0, "reward": 2.103515625, "reward_std": 0.17536726593971252, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 717.33203125, "completions/mean_terminated_length": 717.33203125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3277289408551677, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1507696178328328, "kl": 0.1474609375, "learning_rate": 1.7013976955076534e-05, "loss": 0.0114, "num_tokens": 471833825.0, "reward": 2.07177734375, "reward_std": 0.18387073278427124, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 753.41015625, "completions/mean_terminated_length": 753.41015625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3280703251685585, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11608734831971788, "kl": 0.141357421875, "learning_rate": 1.7005480385183774e-05, "loss": 0.0187, "num_tokens": 472303811.0, "reward": 2.08935546875, "reward_std": 0.15826989710330963, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 799.30078125, "completions/mean_terminated_length": 799.30078125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3284117094819493, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10140429771526549, "kl": 0.139404296875, "learning_rate": 1.699697387229293e-05, "loss": 0.0059, "num_tokens": 472800957.0, "reward": 2.017578125, "reward_std": 0.11934484541416168, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686307430267334, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 747.90625, "completions/mean_terminated_length": 747.136962890625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3287530937953401, "frac_reward_zero_std": 0.65625, "grad_norm": 0.4579796482775101, "kl": 0.192138671875, "learning_rate": 1.6988457428477438e-05, "loss": 0.0166, "num_tokens": 473272093.0, "reward": 2.07763671875, "reward_std": 0.12701989710330963, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.2846112847328186, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 741.056640625, "completions/mean_terminated_length": 741.056640625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.3290944781087309, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1346553107795112, "kl": 0.140625, "learning_rate": 1.6979931065824832e-05, "loss": 0.0206, "num_tokens": 473734522.0, "reward": 2.0908203125, "reward_std": 0.1306305080652237, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 769.251953125, "completions/mean_terminated_length": 769.251953125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.3294358624221217, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11293965601204693, "kl": 0.146484375, "learning_rate": 1.6971394796436727e-05, "loss": 0.012, "num_tokens": 474212427.0, "reward": 2.04150390625, "reward_std": 0.1016392707824707, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 723.28125, "completions/mean_terminated_length": 721.4608154296875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3297772467355125, "frac_reward_zero_std": 0.71875, "grad_norm": 1.0625980269343391, "kl": 0.691650390625, "learning_rate": 1.6962848632428796e-05, "loss": 0.0401, "num_tokens": 474678699.0, "reward": 2.03759765625, "reward_std": 0.11437094211578369, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 735.24609375, "completions/mean_terminated_length": 733.75146484375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3301186310489033, "frac_reward_zero_std": 0.53125, "grad_norm": 0.30135197299548616, "kl": 0.441650390625, "learning_rate": 1.6954292585930745e-05, "loss": 0.0193, "num_tokens": 475138809.0, "reward": 2.06640625, "reward_std": 0.15931211411952972, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 721.609375, "completions/mean_terminated_length": 721.609375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3304600153622941, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13645560248080285, "kl": 0.15087890625, "learning_rate": 1.694572666908633e-05, "loss": 0.0199, "num_tokens": 475588049.0, "reward": 2.0615234375, "reward_std": 0.17054840922355652, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 722.7265625, "completions/mean_terminated_length": 722.7265625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3308013996756849, "frac_reward_zero_std": 0.625, "grad_norm": 0.11951107579142337, "kl": 0.1533203125, "learning_rate": 1.6937150894053306e-05, "loss": 0.0147, "num_tokens": 476038709.0, "reward": 2.10888671875, "reward_std": 0.14474192261695862, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 742.19921875, "completions/mean_terminated_length": 742.19921875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.3311427839890757, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10106862972388485, "kl": 0.155517578125, "learning_rate": 1.69285652730034e-05, "loss": 0.0036, "num_tokens": 476505691.0, "reward": 2.09912109375, "reward_std": 0.12245527654886246, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 752.580078125, "completions/mean_terminated_length": 752.580078125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3314841683024665, "frac_reward_zero_std": 0.6875, "grad_norm": 0.19482172225132227, "kl": 0.164306640625, "learning_rate": 1.6919969818122346e-05, "loss": 0.0075, "num_tokens": 476994420.0, "reward": 2.04638671875, "reward_std": 0.10851994156837463, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 805.109375, "completions/mean_terminated_length": 805.109375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3318255526158573, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11484212237047403, "kl": 0.15576171875, "learning_rate": 1.6911364541609818e-05, "loss": 0.0175, "num_tokens": 477487324.0, "reward": 2.056640625, "reward_std": 0.1498776227235794, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 779.755859375, "completions/mean_terminated_length": 779.755859375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3321669369292481, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12155308157968424, "kl": 0.15185546875, "learning_rate": 1.690274945567942e-05, "loss": 0.0074, "num_tokens": 477970431.0, "reward": 2.13623046875, "reward_std": 0.16103112697601318, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 814.93359375, "completions/mean_terminated_length": 814.93359375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.3325083212426389, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1042503654012412, "kl": 0.1630859375, "learning_rate": 1.6894124572558704e-05, "loss": 0.0179, "num_tokens": 478476941.0, "reward": 2.01025390625, "reward_std": 0.08982963860034943, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 743.220703125, "completions/mean_terminated_length": 743.220703125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3328497055560297, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10920455774297319, "kl": 0.16943359375, "learning_rate": 1.6885489904489113e-05, "loss": 0.0239, "num_tokens": 478936686.0, "reward": 2.09765625, "reward_std": 0.11607664078474045, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 766.107421875, "completions/mean_terminated_length": 766.107421875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3331910898694205, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1185199069161327, "kl": 0.1669921875, "learning_rate": 1.6876845463725975e-05, "loss": 0.0253, "num_tokens": 479415029.0, "reward": 2.09423828125, "reward_std": 0.14715275168418884, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 796.3984375, "completions/mean_terminated_length": 796.3984375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3335324741828113, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1357397051791917, "kl": 0.1669921875, "learning_rate": 1.686819126253849e-05, "loss": 0.03, "num_tokens": 479900625.0, "reward": 2.12109375, "reward_std": 0.2029927372932434, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 684.984375, "completions/mean_terminated_length": 684.984375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3338738584962021, "frac_reward_zero_std": 0.625, "grad_norm": 0.12396414652005495, "kl": 0.17529296875, "learning_rate": 1.685952731320972e-05, "loss": 0.0357, "num_tokens": 480337113.0, "reward": 2.017578125, "reward_std": 0.1301572173833847, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 695.830078125, "completions/mean_terminated_length": 695.830078125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3342152428095929, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09230844611373841, "kl": 0.15673828125, "learning_rate": 1.6850853628036553e-05, "loss": 0.0138, "num_tokens": 480770082.0, "reward": 2.06201171875, "reward_std": 0.09619646519422531, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 708.07421875, "completions/mean_terminated_length": 707.140869140625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3345566271229837, "frac_reward_zero_std": 0.78125, "grad_norm": 1.4258123662033362, "kl": 0.3056640625, "learning_rate": 1.6842170219329702e-05, "loss": 0.0216, "num_tokens": 481210104.0, "reward": 2.033203125, "reward_std": 0.08570607006549835, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 666.380859375, "completions/mean_terminated_length": 666.380859375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3348980114363745, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1317294288777056, "kl": 0.169921875, "learning_rate": 1.683347709941367e-05, "loss": 0.0229, "num_tokens": 481631003.0, "reward": 2.07080078125, "reward_std": 0.14431370794773102, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 663.064453125, "completions/mean_terminated_length": 663.064453125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3352393957497653, "frac_reward_zero_std": 0.625, "grad_norm": 0.12968915000698375, "kl": 0.162841796875, "learning_rate": 1.6824774280626756e-05, "loss": 0.0265, "num_tokens": 482053868.0, "reward": 2.0302734375, "reward_std": 0.11718817055225372, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 650.1015625, "completions/mean_terminated_length": 649.115478515625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3355807800631561, "frac_reward_zero_std": 0.5, "grad_norm": 0.488601997594502, "kl": 0.166259765625, "learning_rate": 1.6816061775321024e-05, "loss": 0.0267, "num_tokens": 482478976.0, "reward": 2.10205078125, "reward_std": 0.18467092514038086, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 619.05078125, "completions/mean_terminated_length": 619.05078125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3359221643765469, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1237333108803286, "kl": 0.154052734375, "learning_rate": 1.680733959586227e-05, "loss": 0.0287, "num_tokens": 482878522.0, "reward": 2.14794921875, "reward_std": 0.1604921519756317, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 631.4765625, "completions/mean_terminated_length": 631.4765625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3362635486899377, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1337506154521637, "kl": 0.16162109375, "learning_rate": 1.6798607754630043e-05, "loss": 0.0253, "num_tokens": 483296302.0, "reward": 2.1044921875, "reward_std": 0.18295548856258392, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 694.03125, "completions/mean_terminated_length": 694.03125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.3366049330033285, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10832142142794905, "kl": 0.146484375, "learning_rate": 1.678986626401759e-05, "loss": 0.0033, "num_tokens": 483738494.0, "reward": 2.03466796875, "reward_std": 0.08662690967321396, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 663.794921875, "completions/mean_terminated_length": 663.794921875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3369463173167193, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12447392574694642, "kl": 0.14892578125, "learning_rate": 1.678111513643186e-05, "loss": 0.0236, "num_tokens": 484163413.0, "reward": 2.03955078125, "reward_std": 0.11667131632566452, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 725.904296875, "completions/mean_terminated_length": 725.904296875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.33728770163011007, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08408742149999548, "kl": 0.14208984375, "learning_rate": 1.6772354384293474e-05, "loss": 0.0126, "num_tokens": 484617508.0, "reward": 2.01904296875, "reward_std": 0.07568441331386566, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 708.244140625, "completions/mean_terminated_length": 708.244140625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3376290859435009, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09697173390894445, "kl": 0.145263671875, "learning_rate": 1.6763584020036723e-05, "loss": 0.0134, "num_tokens": 485067281.0, "reward": 2.107421875, "reward_std": 0.12741202116012573, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 727.005859375, "completions/mean_terminated_length": 727.005859375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3379704702568917, "frac_reward_zero_std": 0.625, "grad_norm": 0.10834490937660494, "kl": 0.14453125, "learning_rate": 1.6754804056109528e-05, "loss": 0.0125, "num_tokens": 485527332.0, "reward": 2.0703125, "reward_std": 0.12988342344760895, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 709.625, "completions/mean_terminated_length": 709.625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.3383118545702825, "frac_reward_zero_std": 0.625, "grad_norm": 0.10738707838513738, "kl": 0.1376953125, "learning_rate": 1.674601450497345e-05, "loss": 0.0132, "num_tokens": 485978996.0, "reward": 2.087890625, "reward_std": 0.14540565013885498, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 772.873046875, "completions/mean_terminated_length": 772.873046875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.33865323888367327, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11771682156508191, "kl": 0.135009765625, "learning_rate": 1.6737215379103643e-05, "loss": 0.0133, "num_tokens": 486462643.0, "reward": 2.09033203125, "reward_std": 0.14823219180107117, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 741.7734375, "completions/mean_terminated_length": 741.7734375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3389946231970641, "frac_reward_zero_std": 0.625, "grad_norm": 0.10851857812924615, "kl": 0.1376953125, "learning_rate": 1.672840669098886e-05, "loss": 0.019, "num_tokens": 486925951.0, "reward": 2.0703125, "reward_std": 0.131509929895401, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 746.61328125, "completions/mean_terminated_length": 746.61328125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.3393360075104549, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1113695492657925, "kl": 0.135986328125, "learning_rate": 1.6719588453131417e-05, "loss": 0.0133, "num_tokens": 487390393.0, "reward": 2.07373046875, "reward_std": 0.13177603483200073, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 742.466796875, "completions/mean_terminated_length": 740.054931640625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3396773918238457, "frac_reward_zero_std": 0.71875, "grad_norm": 1.0409454720600253, "kl": 0.2822265625, "learning_rate": 1.67107606780472e-05, "loss": 0.0211, "num_tokens": 487852984.0, "reward": 2.0595703125, "reward_std": 0.10266056656837463, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 812.375, "completions/mean_terminated_length": 812.375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.34001877613723647, "frac_reward_zero_std": 0.5, "grad_norm": 0.11479058608358784, "kl": 0.13330078125, "learning_rate": 1.6701923378265615e-05, "loss": 0.0088, "num_tokens": 488349416.0, "reward": 2.0634765625, "reward_std": 0.17981986701488495, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 764.283203125, "completions/mean_terminated_length": 763.5361938476562, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.3403601604506273, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17772135303529946, "kl": 0.233642578125, "learning_rate": 1.6693076566329592e-05, "loss": 0.0182, "num_tokens": 488825081.0, "reward": 2.1181640625, "reward_std": 0.15871275961399078, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 781.01953125, "completions/mean_terminated_length": 781.01953125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3407015447640181, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11152096534924472, "kl": 0.137939453125, "learning_rate": 1.6684220254795564e-05, "loss": 0.0092, "num_tokens": 489310579.0, "reward": 2.08642578125, "reward_std": 0.15400472283363342, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 739.638671875, "completions/mean_terminated_length": 739.638671875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3410429290774089, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10247574619281355, "kl": 0.1513671875, "learning_rate": 1.6675354456233444e-05, "loss": 0.0152, "num_tokens": 489766858.0, "reward": 2.095703125, "reward_std": 0.1288703978061676, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 698.853515625, "completions/mean_terminated_length": 698.853515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.34138431339079967, "frac_reward_zero_std": 0.84375, "grad_norm": 0.07246138104930426, "kl": 0.152587890625, "learning_rate": 1.6666479183226613e-05, "loss": 0.0059, "num_tokens": 490209247.0, "reward": 2.05859375, "reward_std": 0.05342378467321396, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 702.431640625, "completions/mean_terminated_length": 702.431640625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.3417256977041905, "frac_reward_zero_std": 0.5, "grad_norm": 0.11807386421797862, "kl": 0.142333984375, "learning_rate": 1.6657594448371898e-05, "loss": 0.0085, "num_tokens": 490661436.0, "reward": 2.11474609375, "reward_std": 0.17376907169818878, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 738.513671875, "completions/mean_terminated_length": 738.25634765625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3420670820175813, "frac_reward_zero_std": 0.53125, "grad_norm": 5.154730807841425, "kl": 1.370849609375, "learning_rate": 1.664870026427955e-05, "loss": 0.0618, "num_tokens": 491127107.0, "reward": 2.10693359375, "reward_std": 0.15896931290626526, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 724.05859375, "completions/mean_terminated_length": 724.05859375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3424084663309721, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10132751119860221, "kl": 0.1513671875, "learning_rate": 1.6639796643573247e-05, "loss": 0.005, "num_tokens": 491585425.0, "reward": 2.03125, "reward_std": 0.09688520431518555, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 778.81640625, "completions/mean_terminated_length": 778.81640625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3427498506443629, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0777055635305028, "kl": 0.146484375, "learning_rate": 1.663088359889004e-05, "loss": 0.0115, "num_tokens": 492066659.0, "reward": 2.048828125, "reward_std": 0.0858922079205513, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 788.509765625, "completions/mean_terminated_length": 788.509765625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3430912349577537, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11776406571098393, "kl": 0.14892578125, "learning_rate": 1.6621961142880368e-05, "loss": 0.0199, "num_tokens": 492556040.0, "reward": 2.20263671875, "reward_std": 0.19267868995666504, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 820.6484375, "completions/mean_terminated_length": 820.6484375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3434326192711445, "frac_reward_zero_std": 0.5, "grad_norm": 0.12080109210156327, "kl": 0.1484375, "learning_rate": 1.661302928820803e-05, "loss": 0.0084, "num_tokens": 493053108.0, "reward": 2.083984375, "reward_std": 0.1757497638463974, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 847.92578125, "completions/mean_terminated_length": 847.92578125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3437740035845353, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10235498049324904, "kl": 0.150146484375, "learning_rate": 1.660408804755016e-05, "loss": 0.0119, "num_tokens": 493566366.0, "reward": 2.076171875, "reward_std": 0.11139687150716782, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 887.705078125, "completions/mean_terminated_length": 886.76318359375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.3441153878979261, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2601576088434286, "kl": 0.195556640625, "learning_rate": 1.6595137433597203e-05, "loss": 0.0171, "num_tokens": 494105639.0, "reward": 2.0185546875, "reward_std": 0.11611363291740417, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 862.65625, "completions/mean_terminated_length": 862.65625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.3444567722113169, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10503325217398018, "kl": 0.14599609375, "learning_rate": 1.6586177459052932e-05, "loss": 0.0128, "num_tokens": 494639255.0, "reward": 2.06396484375, "reward_std": 0.10696999728679657, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 771.404296875, "completions/mean_terminated_length": 771.404296875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3447981565247077, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10998253438827048, "kl": 0.148681640625, "learning_rate": 1.6577208136634383e-05, "loss": 0.0117, "num_tokens": 495115942.0, "reward": 2.154296875, "reward_std": 0.17373302578926086, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 766.7578125, "completions/mean_terminated_length": 766.7578125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3451395408380985, "frac_reward_zero_std": 0.625, "grad_norm": 0.1263993918239203, "kl": 0.1630859375, "learning_rate": 1.6568229479071875e-05, "loss": 0.0233, "num_tokens": 495589578.0, "reward": 2.06591796875, "reward_std": 0.13151970505714417, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 753.953125, "completions/mean_terminated_length": 753.953125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3454809251514893, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09834042479235831, "kl": 0.1533203125, "learning_rate": 1.6559241499108965e-05, "loss": 0.015, "num_tokens": 496054722.0, "reward": 2.05029296875, "reward_std": 0.09855898469686508, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 756.587890625, "completions/mean_terminated_length": 755.6790771484375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3458223094648801, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10751512627209683, "kl": 0.18212890625, "learning_rate": 1.655024420950245e-05, "loss": 0.0127, "num_tokens": 496532223.0, "reward": 2.08837890625, "reward_std": 0.1569109708070755, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 678.283203125, "completions/mean_terminated_length": 677.2896118164062, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.3461636937782709, "frac_reward_zero_std": 0.59375, "grad_norm": 0.22701319344348012, "kl": 0.1630859375, "learning_rate": 1.6541237623022333e-05, "loss": 0.0156, "num_tokens": 496955168.0, "reward": 2.115234375, "reward_std": 0.1663835644721985, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 707.47265625, "completions/mean_terminated_length": 707.47265625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3465050780916617, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08688393505035533, "kl": 0.135986328125, "learning_rate": 1.653222175245182e-05, "loss": 0.0075, "num_tokens": 497399426.0, "reward": 2.03759765625, "reward_std": 0.07219552993774414, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 696.53125, "completions/mean_terminated_length": 696.53125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3468464624050525, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1084926851570345, "kl": 0.14453125, "learning_rate": 1.6523196610587292e-05, "loss": 0.0107, "num_tokens": 497834850.0, "reward": 2.1005859375, "reward_std": 0.13446511328220367, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 668.41796875, "completions/mean_terminated_length": 668.41796875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.34718784671844327, "frac_reward_zero_std": 0.75, "grad_norm": 0.09655500002864409, "kl": 0.153564453125, "learning_rate": 1.651416221023828e-05, "loss": 0.0118, "num_tokens": 498268248.0, "reward": 2.04833984375, "reward_std": 0.08658716082572937, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 627.859375, "completions/mean_terminated_length": 627.859375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.3475292310318341, "frac_reward_zero_std": 0.75, "grad_norm": 0.1078426418892905, "kl": 0.154052734375, "learning_rate": 1.650511856422747e-05, "loss": 0.0074, "num_tokens": 498678832.0, "reward": 2.07470703125, "reward_std": 0.07611861079931259, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 647.873046875, "completions/mean_terminated_length": 647.873046875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.3478706153452249, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09731043458051335, "kl": 0.15185546875, "learning_rate": 1.6496065685390662e-05, "loss": 0.0099, "num_tokens": 499087359.0, "reward": 2.015625, "reward_std": 0.09398725628852844, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 631.6328125, "completions/mean_terminated_length": 631.6328125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3482119996586157, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12118684930769835, "kl": 0.139892578125, "learning_rate": 1.6487003586576755e-05, "loss": 0.0169, "num_tokens": 499491347.0, "reward": 2.1123046875, "reward_std": 0.10179172456264496, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 699.328125, "completions/mean_terminated_length": 699.328125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.34855338397200647, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08837065438241705, "kl": 0.1357421875, "learning_rate": 1.647793228064775e-05, "loss": 0.0076, "num_tokens": 499924107.0, "reward": 2.08349609375, "reward_std": 0.10669347643852234, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 715.509765625, "completions/mean_terminated_length": 715.509765625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.3488947682853973, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1044925287973444, "kl": 0.154052734375, "learning_rate": 1.6468851780478702e-05, "loss": 0.0092, "num_tokens": 500368848.0, "reward": 2.05419921875, "reward_std": 0.1069236621260643, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 697.947265625, "completions/mean_terminated_length": 697.947265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3492361525987881, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10756812894066962, "kl": 0.147216796875, "learning_rate": 1.6459762098957718e-05, "loss": 0.0078, "num_tokens": 500807141.0, "reward": 2.09521484375, "reward_std": 0.11363181471824646, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 730.529296875, "completions/mean_terminated_length": 730.529296875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3495775369121789, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1144035251532104, "kl": 0.1416015625, "learning_rate": 1.645066324898594e-05, "loss": 0.0266, "num_tokens": 501252308.0, "reward": 2.08740234375, "reward_std": 0.16449105739593506, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 796.845703125, "completions/mean_terminated_length": 796.845703125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.34991892122556967, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0800312109042223, "kl": 0.12841796875, "learning_rate": 1.644155524347753e-05, "loss": 0.021, "num_tokens": 501746629.0, "reward": 2.08935546875, "reward_std": 0.10754930973052979, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 734.62890625, "completions/mean_terminated_length": 734.62890625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.3502603055389605, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10148805764373754, "kl": 0.14208984375, "learning_rate": 1.6432438095359622e-05, "loss": 0.0154, "num_tokens": 502203863.0, "reward": 2.07177734375, "reward_std": 0.12433397024869919, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 817.763671875, "completions/mean_terminated_length": 817.763671875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3506016898523513, "frac_reward_zero_std": 0.625, "grad_norm": 0.09656010095472745, "kl": 0.1357421875, "learning_rate": 1.642331181757235e-05, "loss": 0.0198, "num_tokens": 502699918.0, "reward": 2.14794921875, "reward_std": 0.14115969836711884, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 762.408203125, "completions/mean_terminated_length": 762.408203125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3509430741657421, "frac_reward_zero_std": 0.5, "grad_norm": 0.12415835609020058, "kl": 0.139892578125, "learning_rate": 1.6414176423068794e-05, "loss": 0.0302, "num_tokens": 503168079.0, "reward": 2.14794921875, "reward_std": 0.18420571088790894, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 782.775390625, "completions/mean_terminated_length": 782.775390625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.35128445847913287, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11653324176921341, "kl": 0.141357421875, "learning_rate": 1.640503192481497e-05, "loss": 0.0232, "num_tokens": 503648028.0, "reward": 2.03466796875, "reward_std": 0.13509997725486755, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 783.763671875, "completions/mean_terminated_length": 783.763671875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.3516258427925237, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09760119967550528, "kl": 0.143310546875, "learning_rate": 1.639587833578983e-05, "loss": 0.0121, "num_tokens": 504139907.0, "reward": 2.11572265625, "reward_std": 0.17043516039848328, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 740.283203125, "completions/mean_terminated_length": 740.283203125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3519672271059145, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13481825832561992, "kl": 0.15087890625, "learning_rate": 1.638671566898521e-05, "loss": 0.0131, "num_tokens": 504611300.0, "reward": 2.10009765625, "reward_std": 0.15388405323028564, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 778.994140625, "completions/mean_terminated_length": 778.994140625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3523086114193053, "frac_reward_zero_std": 0.625, "grad_norm": 0.1203578127514915, "kl": 0.139404296875, "learning_rate": 1.6377543937405848e-05, "loss": 0.0154, "num_tokens": 505091489.0, "reward": 2.04150390625, "reward_std": 0.12500633299350739, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 755.830078125, "completions/mean_terminated_length": 755.830078125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.35264999573269606, "frac_reward_zero_std": 0.5, "grad_norm": 0.1356812911489008, "kl": 0.149169921875, "learning_rate": 1.6368363154069333e-05, "loss": 0.0075, "num_tokens": 505560042.0, "reward": 2.07568359375, "reward_std": 0.1641862839460373, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 735.33203125, "completions/mean_terminated_length": 735.33203125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3529913800460869, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10687652609183466, "kl": 0.15087890625, "learning_rate": 1.635917333200611e-05, "loss": 0.021, "num_tokens": 506013684.0, "reward": 2.0673828125, "reward_std": 0.14176851511001587, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 772.48828125, "completions/mean_terminated_length": 772.48828125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3533327643594777, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1018787431716144, "kl": 0.137451171875, "learning_rate": 1.6349974484259452e-05, "loss": 0.0247, "num_tokens": 506493358.0, "reward": 2.03466796875, "reward_std": 0.11170844733715057, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 740.65625, "completions/mean_terminated_length": 740.65625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3536741486728685, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11923612255881132, "kl": 0.1494140625, "learning_rate": 1.634076662388544e-05, "loss": 0.0071, "num_tokens": 506959758.0, "reward": 2.06787109375, "reward_std": 0.14943018555641174, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 750.474609375, "completions/mean_terminated_length": 750.474609375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.35401553298625926, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3219318892564184, "kl": 0.177734375, "learning_rate": 1.633154976395295e-05, "loss": 0.0109, "num_tokens": 507424305.0, "reward": 2.03076171875, "reward_std": 0.10356885194778442, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 805.666015625, "completions/mean_terminated_length": 804.74365234375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3543569172996501, "frac_reward_zero_std": 0.5, "grad_norm": 0.23377693145877304, "kl": 0.186767578125, "learning_rate": 1.632232391754362e-05, "loss": 0.0225, "num_tokens": 507913542.0, "reward": 2.03857421875, "reward_std": 0.15264685451984406, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 738.537109375, "completions/mean_terminated_length": 737.2817993164062, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3546983016130409, "frac_reward_zero_std": 0.21875, "grad_norm": 0.6024858689799473, "kl": 0.40234375, "learning_rate": 1.6313089097751862e-05, "loss": 0.0251, "num_tokens": 508369945.0, "reward": 1.97021484375, "reward_std": 0.3218327760696411, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.923828125, "rewards/format_reward/std": 0.26553234457969666, "rewards/tag_count_reward/mean": 0.97998046875, "rewards/tag_count_reward/std": 0.07797962427139282, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 704.62109375, "completions/mean_terminated_length": 704.62109375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3550396859264317, "frac_reward_zero_std": 0.3125, "grad_norm": 0.15798840262140484, "kl": 0.15087890625, "learning_rate": 1.630384531768481e-05, "loss": 0.0118, "num_tokens": 508808983.0, "reward": 2.046875, "reward_std": 0.28161758184432983, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.07374914735555649, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 712.705078125, "completions/mean_terminated_length": 712.705078125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.35538107023982246, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11751542989980526, "kl": 0.14306640625, "learning_rate": 1.6294592590462317e-05, "loss": 0.0174, "num_tokens": 509261984.0, "reward": 2.02294921875, "reward_std": 0.14375010132789612, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 670.7265625, "completions/mean_terminated_length": 670.7265625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3557224545532133, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1208542543836754, "kl": 0.14306640625, "learning_rate": 1.628533092921694e-05, "loss": 0.0165, "num_tokens": 509685316.0, "reward": 2.12109375, "reward_std": 0.1516994833946228, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 745.3828125, "completions/mean_terminated_length": 745.3828125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.3560638388666041, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12833089211957202, "kl": 0.142822265625, "learning_rate": 1.627606034709391e-05, "loss": 0.0123, "num_tokens": 510159288.0, "reward": 2.0751953125, "reward_std": 0.22194407880306244, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06218579038977623, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 698.01171875, "completions/mean_terminated_length": 698.01171875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3564052231799949, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13120816158674412, "kl": 0.1455078125, "learning_rate": 1.6266780857251126e-05, "loss": 0.0203, "num_tokens": 510608014.0, "reward": 2.07568359375, "reward_std": 0.196427583694458, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 679.388671875, "completions/mean_terminated_length": 679.388671875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.35674660749338566, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09513749947245091, "kl": 0.146240234375, "learning_rate": 1.6257492472859124e-05, "loss": 0.0123, "num_tokens": 511045237.0, "reward": 2.056640625, "reward_std": 0.13867145776748657, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 681.3828125, "completions/mean_terminated_length": 681.3828125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3570879918067765, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14274077871149984, "kl": 0.1474609375, "learning_rate": 1.624819520710107e-05, "loss": 0.0132, "num_tokens": 511477689.0, "reward": 2.03466796875, "reward_std": 0.1729464828968048, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 677.134765625, "completions/mean_terminated_length": 677.134765625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3574293761201673, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14131847052112795, "kl": 0.1533203125, "learning_rate": 1.6238889073172725e-05, "loss": 0.0153, "num_tokens": 511911630.0, "reward": 2.052734375, "reward_std": 0.1960866004228592, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 667.8515625, "completions/mean_terminated_length": 667.8515625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.3577707604335581, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1312579208330598, "kl": 0.14599609375, "learning_rate": 1.6229574084282455e-05, "loss": 0.0166, "num_tokens": 512343586.0, "reward": 2.0927734375, "reward_std": 0.17482374608516693, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 690.748046875, "completions/mean_terminated_length": 689.75927734375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.35811214474694886, "frac_reward_zero_std": 0.46875, "grad_norm": 0.24731700081283425, "kl": 0.249755859375, "learning_rate": 1.6220250253651176e-05, "loss": 0.009, "num_tokens": 512786353.0, "reward": 2.10107421875, "reward_std": 0.2151673436164856, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 684.328125, "completions/mean_terminated_length": 684.328125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.35845352906033967, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12942723955344682, "kl": 0.14404296875, "learning_rate": 1.6210917594512355e-05, "loss": 0.0198, "num_tokens": 513224137.0, "reward": 1.9931640625, "reward_std": 0.08819417655467987, "rewards/accuracy_reward/mean": 0.010080644860863686, "rewards/accuracy_reward/std": 0.0999959409236908, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 683.8046875, "completions/mean_terminated_length": 683.8046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3587949133737305, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12646605432531433, "kl": 0.141357421875, "learning_rate": 1.6201576120112008e-05, "loss": 0.0143, "num_tokens": 513654405.0, "reward": 2.138671875, "reward_std": 0.18069779872894287, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 731.736328125, "completions/mean_terminated_length": 731.736328125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3591362976871213, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1161463761392366, "kl": 0.140380859375, "learning_rate": 1.619222584370864e-05, "loss": 0.0227, "num_tokens": 514107902.0, "reward": 2.0263671875, "reward_std": 0.13132990896701813, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 704.833984375, "completions/mean_terminated_length": 703.2348022460938, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.35947768200051206, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2907005976448166, "kl": 0.436279296875, "learning_rate": 1.6182866778573262e-05, "loss": 0.0285, "num_tokens": 514555049.0, "reward": 2.0390625, "reward_std": 0.08232204616069794, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 720.771484375, "completions/mean_terminated_length": 720.771484375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.35981906631390287, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1264582873851103, "kl": 0.150634765625, "learning_rate": 1.617349893798935e-05, "loss": 0.0212, "num_tokens": 515018436.0, "reward": 2.04736328125, "reward_std": 0.20064862072467804, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 724.48046875, "completions/mean_terminated_length": 724.48046875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3601604506272937, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13492990276655611, "kl": 0.148193359375, "learning_rate": 1.6164122335252845e-05, "loss": 0.0223, "num_tokens": 515469466.0, "reward": 2.0673828125, "reward_std": 0.17187809944152832, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 759.28515625, "completions/mean_terminated_length": 759.28515625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3605018349406845, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10067118791735148, "kl": 0.14501953125, "learning_rate": 1.6154736983672123e-05, "loss": 0.0134, "num_tokens": 515939116.0, "reward": 2.0390625, "reward_std": 0.11673503369092941, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 730.03515625, "completions/mean_terminated_length": 729.6712036132812, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.36084321925407525, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4290933546507811, "kl": 0.166015625, "learning_rate": 1.6145342896567964e-05, "loss": 0.0143, "num_tokens": 516389070.0, "reward": 2.0517578125, "reward_std": 0.1746603399515152, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 802.431640625, "completions/mean_terminated_length": 802.431640625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.36118460356746607, "frac_reward_zero_std": 0.625, "grad_norm": 0.0988967422811646, "kl": 0.1396484375, "learning_rate": 1.6135940087273564e-05, "loss": 0.0052, "num_tokens": 516885803.0, "reward": 2.076171875, "reward_std": 0.13872270286083221, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 782.771484375, "completions/mean_terminated_length": 782.771484375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3615259878808569, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10677873736468219, "kl": 0.14306640625, "learning_rate": 1.612652856913449e-05, "loss": 0.0144, "num_tokens": 517358502.0, "reward": 2.0478515625, "reward_std": 0.1384851038455963, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 784.224609375, "completions/mean_terminated_length": 783.24072265625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3618673721942477, "frac_reward_zero_std": 0.65625, "grad_norm": 0.9208036419625282, "kl": 0.3916015625, "learning_rate": 1.611710835550867e-05, "loss": 0.0269, "num_tokens": 517840905.0, "reward": 2.0771484375, "reward_std": 0.1255410611629486, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 790.19921875, "completions/mean_terminated_length": 790.19921875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.36220875650763845, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10662613832507231, "kl": 0.135986328125, "learning_rate": 1.6107679459766368e-05, "loss": 0.0157, "num_tokens": 518341983.0, "reward": 2.1005859375, "reward_std": 0.1707250028848648, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 753.560546875, "completions/mean_terminated_length": 753.560546875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.36255014082102927, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10009577935487463, "kl": 0.13037109375, "learning_rate": 1.6098241895290186e-05, "loss": 0.0018, "num_tokens": 518807230.0, "reward": 2.08935546875, "reward_std": 0.14896059036254883, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 706.515625, "completions/mean_terminated_length": 706.515625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3628915251344201, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1217517359177203, "kl": 0.13916015625, "learning_rate": 1.608879567547502e-05, "loss": 0.0132, "num_tokens": 519242934.0, "reward": 2.0927734375, "reward_std": 0.15769505500793457, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 757.923828125, "completions/mean_terminated_length": 757.0332641601562, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3632329094478109, "frac_reward_zero_std": 0.625, "grad_norm": 0.32474297845733097, "kl": 0.13671875, "learning_rate": 1.6079340813728045e-05, "loss": 0.0138, "num_tokens": 519716271.0, "reward": 2.0830078125, "reward_std": 0.1325436234474182, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 724.548828125, "completions/mean_terminated_length": 724.548828125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.36357429376120165, "frac_reward_zero_std": 0.625, "grad_norm": 0.10833870908843549, "kl": 0.142578125, "learning_rate": 1.6069877323468713e-05, "loss": 0.01, "num_tokens": 520167000.0, "reward": 2.0859375, "reward_std": 0.1340378224849701, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 724.54296875, "completions/mean_terminated_length": 724.54296875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.36391567807459246, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11555092951251371, "kl": 0.141845703125, "learning_rate": 1.6060405218128717e-05, "loss": 0.0116, "num_tokens": 520615278.0, "reward": 2.11181640625, "reward_std": 0.16702638566493988, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1878.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 714.9453125, "completions/mean_terminated_length": 714.9453125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3642570623879833, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10960382531185933, "kl": 0.142822265625, "learning_rate": 1.6050924511151978e-05, "loss": 0.0098, "num_tokens": 521067170.0, "reward": 2.10107421875, "reward_std": 0.13619005680084229, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 701.623046875, "completions/mean_terminated_length": 701.2289428710938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3645984467013741, "frac_reward_zero_std": 0.53125, "grad_norm": 0.39504680179442847, "kl": 0.152099609375, "learning_rate": 1.6041435215994622e-05, "loss": 0.0325, "num_tokens": 521508929.0, "reward": 2.14697265625, "reward_std": 0.18705075979232788, "rewards/accuracy_reward/mean": 0.16733871400356293, "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 738.28125, "completions/mean_terminated_length": 738.28125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.36493983101476485, "frac_reward_zero_std": 0.75, "grad_norm": 0.10591993341021579, "kl": 0.14990234375, "learning_rate": 1.6031937346124973e-05, "loss": 0.0136, "num_tokens": 521973745.0, "reward": 2.01416015625, "reward_std": 0.07798224687576294, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 659.515625, "completions/mean_terminated_length": 658.5009765625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.36528121532815566, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5161412085830773, "kl": 0.155029296875, "learning_rate": 1.6022430915023516e-05, "loss": 0.0168, "num_tokens": 522395529.0, "reward": 2.07470703125, "reward_std": 0.15544013679027557, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 685.181640625, "completions/mean_terminated_length": 685.181640625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.3656225996415465, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11254109220903487, "kl": 0.14599609375, "learning_rate": 1.6012915936182892e-05, "loss": 0.0101, "num_tokens": 522836486.0, "reward": 2.08642578125, "reward_std": 0.1255168914794922, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 685.71875, "completions/mean_terminated_length": 685.71875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.3659639839549373, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11011624051556863, "kl": 0.1435546875, "learning_rate": 1.6003392423107877e-05, "loss": 0.0134, "num_tokens": 523270934.0, "reward": 2.05615234375, "reward_std": 0.1006467416882515, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 614.861328125, "completions/mean_terminated_length": 614.861328125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.36630536826832805, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11885073887582775, "kl": 0.141845703125, "learning_rate": 1.599386038931535e-05, "loss": 0.01, "num_tokens": 523675695.0, "reward": 2.08984375, "reward_std": 0.12526729702949524, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 675.955078125, "completions/mean_terminated_length": 675.955078125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.36664675258171886, "frac_reward_zero_std": 0.625, "grad_norm": 0.10527055678189372, "kl": 0.1376953125, "learning_rate": 1.5984319848334292e-05, "loss": -0.0004, "num_tokens": 524098584.0, "reward": 2.14794921875, "reward_std": 0.1463720202445984, "rewards/accuracy_reward/mean": 0.1552419364452362, "rewards/accuracy_reward/std": 0.36250078678131104, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 660.3046875, "completions/mean_terminated_length": 660.3046875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.3669881368951097, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1500767448853809, "kl": 0.140625, "learning_rate": 1.597477081370576e-05, "loss": 0.0057, "num_tokens": 524519236.0, "reward": 2.09765625, "reward_std": 0.17286168038845062, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 673.951171875, "completions/mean_terminated_length": 673.951171875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3673295212085005, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12157152371427463, "kl": 0.13623046875, "learning_rate": 1.5965213298982857e-05, "loss": 0.0234, "num_tokens": 524949995.0, "reward": 2.09375, "reward_std": 0.14858537912368774, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 677.93359375, "completions/mean_terminated_length": 677.93359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.36767090552189124, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11619607830928848, "kl": 0.141357421875, "learning_rate": 1.5955647317730727e-05, "loss": 0.022, "num_tokens": 525370233.0, "reward": 2.07666015625, "reward_std": 0.1005052700638771, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 712.478515625, "completions/mean_terminated_length": 712.478515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.36801228983528206, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13053340677245676, "kl": 0.146484375, "learning_rate": 1.5946072883526535e-05, "loss": 0.0157, "num_tokens": 525807982.0, "reward": 2.05224609375, "reward_std": 0.15917477011680603, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 681.685546875, "completions/mean_terminated_length": 681.685546875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.36835367414867287, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11511499744239027, "kl": 0.137451171875, "learning_rate": 1.5936490009959432e-05, "loss": 0.0109, "num_tokens": 526247229.0, "reward": 2.14501953125, "reward_std": 0.20348265767097473, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 781.287109375, "completions/mean_terminated_length": 781.287109375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3686950584620637, "frac_reward_zero_std": 0.625, "grad_norm": 0.11027189158266082, "kl": 0.1328125, "learning_rate": 1.5926898710630562e-05, "loss": 0.0194, "num_tokens": 526734464.0, "reward": 2.02783203125, "reward_std": 0.11309142410755157, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 744.701171875, "completions/mean_terminated_length": 744.701171875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.36903644277545444, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14011434833739855, "kl": 0.139892578125, "learning_rate": 1.5917298999153016e-05, "loss": 0.0036, "num_tokens": 527198871.0, "reward": 2.09130859375, "reward_std": 0.19020068645477295, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 763.7421875, "completions/mean_terminated_length": 762.0880737304688, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.36937782708884526, "frac_reward_zero_std": 0.5625, "grad_norm": 12.670193936561725, "kl": 2.731689453125, "learning_rate": 1.590769088915183e-05, "loss": 0.1228, "num_tokens": 527666307.0, "reward": 2.08544921875, "reward_std": 0.14716380834579468, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 782.98046875, "completions/mean_terminated_length": 782.98046875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.36971921140223607, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13585910299435458, "kl": 0.1357421875, "learning_rate": 1.589807439426396e-05, "loss": 0.01, "num_tokens": 528151465.0, "reward": 2.0400390625, "reward_std": 0.14238829910755157, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 701.296875, "completions/mean_terminated_length": 701.296875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3700605957156269, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13005122351827794, "kl": 0.1474609375, "learning_rate": 1.5888449528138256e-05, "loss": 0.0115, "num_tokens": 528587601.0, "reward": 2.087890625, "reward_std": 0.20312896370887756, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 747.662109375, "completions/mean_terminated_length": 746.9823608398438, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.37040198002901764, "frac_reward_zero_std": 0.5, "grad_norm": 0.1446306560753965, "kl": 0.220947265625, "learning_rate": 1.5878816304435457e-05, "loss": 0.0078, "num_tokens": 529054084.0, "reward": 2.09814453125, "reward_std": 0.16675487160682678, "rewards/accuracy_reward/mean": 0.1145833358168602, "rewards/accuracy_reward/std": 0.3188507556915283, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 775.65234375, "completions/mean_terminated_length": 775.65234375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.37074336434240845, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09987562107364631, "kl": 0.1455078125, "learning_rate": 1.586917473682817e-05, "loss": 0.018, "num_tokens": 529535010.0, "reward": 2.02978515625, "reward_std": 0.0999407097697258, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 750.693359375, "completions/mean_terminated_length": 750.0274047851562, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.37108474865579927, "frac_reward_zero_std": 0.5625, "grad_norm": 0.40557502151633673, "kl": 0.38134765625, "learning_rate": 1.585952483900083e-05, "loss": 0.0291, "num_tokens": 530005141.0, "reward": 2.06494140625, "reward_std": 0.12492634356021881, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 769.45703125, "completions/mean_terminated_length": 769.45703125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.3714261329691901, "frac_reward_zero_std": 0.625, "grad_norm": 0.11636064728409787, "kl": 0.150634765625, "learning_rate": 1.5849866624649698e-05, "loss": -0.0007, "num_tokens": 530479903.0, "reward": 2.09716796875, "reward_std": 0.12377262860536575, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 764.2890625, "completions/mean_terminated_length": 764.2890625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.37176751728258084, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10440670406335537, "kl": 0.1474609375, "learning_rate": 1.584020010748285e-05, "loss": 0.0127, "num_tokens": 530957267.0, "reward": 2.068359375, "reward_std": 0.10014601051807404, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 806.228515625, "completions/mean_terminated_length": 806.228515625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.37210890159597165, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1031577990654693, "kl": 0.143310546875, "learning_rate": 1.5830525301220144e-05, "loss": 0.0166, "num_tokens": 531449992.0, "reward": 2.08740234375, "reward_std": 0.15130552649497986, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 770.345703125, "completions/mean_terminated_length": 770.345703125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.37245028590936247, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11831750907116842, "kl": 0.153564453125, "learning_rate": 1.582084221959318e-05, "loss": 0.0136, "num_tokens": 531931449.0, "reward": 2.06298828125, "reward_std": 0.16237938404083252, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 691.419921875, "completions/mean_terminated_length": 690.3483276367188, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3727916702227533, "frac_reward_zero_std": 0.625, "grad_norm": 0.36863608288255845, "kl": 0.222412109375, "learning_rate": 1.5811150876345337e-05, "loss": 0.0145, "num_tokens": 532385088.0, "reward": 2.09765625, "reward_std": 0.14250105619430542, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05608600005507469, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 750.38671875, "completions/mean_terminated_length": 750.38671875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.37313305453614404, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1247587837850439, "kl": 0.153564453125, "learning_rate": 1.5801451285231698e-05, "loss": 0.0124, "num_tokens": 532848342.0, "reward": 2.0703125, "reward_std": 0.16872870922088623, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 725.34765625, "completions/mean_terminated_length": 725.34765625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.37347443884953485, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14389304790892446, "kl": 0.138427734375, "learning_rate": 1.579174346001906e-05, "loss": 0.0122, "num_tokens": 533302840.0, "reward": 2.029296875, "reward_std": 0.19183574616909027, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.03785909339785576, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 750.904296875, "completions/mean_terminated_length": 750.904296875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.37381582316292566, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12043000641166782, "kl": 0.152587890625, "learning_rate": 1.5782027414485906e-05, "loss": 0.0116, "num_tokens": 533770855.0, "reward": 2.04150390625, "reward_std": 0.12996628880500793, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 692.330078125, "completions/mean_terminated_length": 692.330078125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3741572074763165, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1186565962099324, "kl": 0.14892578125, "learning_rate": 1.5772303162422386e-05, "loss": 0.0076, "num_tokens": 534208752.0, "reward": 2.04345703125, "reward_std": 0.14387057721614838, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 698.4765625, "completions/mean_terminated_length": 698.4765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.37449859178970724, "frac_reward_zero_std": 0.6875, "grad_norm": 0.099990170285212, "kl": 0.14990234375, "learning_rate": 1.576257071763029e-05, "loss": 0.003, "num_tokens": 534639476.0, "reward": 2.0546875, "reward_std": 0.09319227933883667, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 735.607421875, "completions/mean_terminated_length": 735.607421875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.37483997610309805, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11260064026851822, "kl": 0.148193359375, "learning_rate": 1.575283009392305e-05, "loss": 0.0078, "num_tokens": 535098587.0, "reward": 2.05712890625, "reward_std": 0.13543540239334106, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 727.478515625, "completions/mean_terminated_length": 725.8215942382812, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.37518136041648886, "frac_reward_zero_std": 0.65625, "grad_norm": 0.4245813295484185, "kl": 0.151611328125, "learning_rate": 1.5743081305125703e-05, "loss": 0.0078, "num_tokens": 535558544.0, "reward": 1.982421875, "reward_std": 0.12843738496303558, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04655282944440842, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 701.02734375, "completions/mean_terminated_length": 701.02734375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3755227447298797, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09318141947046801, "kl": 0.15380859375, "learning_rate": 1.5733324365074866e-05, "loss": 0.0021, "num_tokens": 536000702.0, "reward": 2.0068359375, "reward_std": 0.06736468523740768, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12414088100194931, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 681.017578125, "completions/mean_terminated_length": 681.017578125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.37586412904327043, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11186468306747613, "kl": 0.150146484375, "learning_rate": 1.572355928761873e-05, "loss": 0.0124, "num_tokens": 536429575.0, "reward": 2.0859375, "reward_std": 0.15185217559337616, "rewards/accuracy_reward/mean": 0.10208333283662796, "rewards/accuracy_reward/std": 0.3030737340450287, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 766.03515625, "completions/mean_terminated_length": 766.03515625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.37620551335666125, "frac_reward_zero_std": 0.5, "grad_norm": 0.12422766048616421, "kl": 0.14501953125, "learning_rate": 1.571378608661704e-05, "loss": 0.0233, "num_tokens": 536907497.0, "reward": 2.0458984375, "reward_std": 0.15227805078029633, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.03799765557050705, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 667.650390625, "completions/mean_terminated_length": 667.650390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.37654689767005206, "frac_reward_zero_std": 0.5625, "grad_norm": 0.127142904607048, "kl": 0.154052734375, "learning_rate": 1.5704004775941064e-05, "loss": 0.01, "num_tokens": 537327718.0, "reward": 2.0439453125, "reward_std": 0.1421242207288742, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 711.046875, "completions/mean_terminated_length": 711.046875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3768882819834429, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12469150349534715, "kl": 0.1396484375, "learning_rate": 1.5694215369473584e-05, "loss": 0.0143, "num_tokens": 537776974.0, "reward": 2.0654296875, "reward_std": 0.11943669617176056, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 736.50390625, "completions/mean_terminated_length": 736.50390625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3772296662968337, "frac_reward_zero_std": 0.5, "grad_norm": 0.1363787846963741, "kl": 0.14794921875, "learning_rate": 1.5684417881108878e-05, "loss": 0.0124, "num_tokens": 538231200.0, "reward": 2.04296875, "reward_std": 0.18490231037139893, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 715.669921875, "completions/mean_terminated_length": 715.669921875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.37757105061022445, "frac_reward_zero_std": 0.625, "grad_norm": 0.11546113191999026, "kl": 0.147705078125, "learning_rate": 1.5674612324752683e-05, "loss": 0.0061, "num_tokens": 538674055.0, "reward": 2.02392578125, "reward_std": 0.11476990580558777, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 801.85546875, "completions/mean_terminated_length": 801.85546875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.37791243492361526, "frac_reward_zero_std": 0.625, "grad_norm": 0.11107871357116918, "kl": 0.1435546875, "learning_rate": 1.566479871432219e-05, "loss": 0.0131, "num_tokens": 539180509.0, "reward": 2.0087890625, "reward_std": 0.12662026286125183, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 715.865234375, "completions/mean_terminated_length": 715.865234375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3782538192370061, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12212465121714884, "kl": 0.143798828125, "learning_rate": 1.565497706374603e-05, "loss": 0.0167, "num_tokens": 539627816.0, "reward": 2.07080078125, "reward_std": 0.18112993240356445, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 667.673828125, "completions/mean_terminated_length": 667.673828125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.3785952035503969, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14108978148535398, "kl": 0.1513671875, "learning_rate": 1.564514738696424e-05, "loss": 0.0059, "num_tokens": 540051201.0, "reward": 2.095703125, "reward_std": 0.16567367315292358, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 725.955078125, "completions/mean_terminated_length": 725.955078125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.37893658786378764, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09489591676072741, "kl": 0.1416015625, "learning_rate": 1.5635309697928244e-05, "loss": 0.0102, "num_tokens": 540508634.0, "reward": 2.06201171875, "reward_std": 0.11667535454034805, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 738.302734375, "completions/mean_terminated_length": 738.302734375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.37927797217717846, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12543726469398866, "kl": 0.142822265625, "learning_rate": 1.5625464010600844e-05, "loss": 0.0129, "num_tokens": 540970069.0, "reward": 2.0927734375, "reward_std": 0.13369695842266083, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 699.736328125, "completions/mean_terminated_length": 698.9921875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.37961935649056927, "frac_reward_zero_std": 0.71875, "grad_norm": 0.35615177166874057, "kl": 0.164306640625, "learning_rate": 1.561561033895619e-05, "loss": 0.0143, "num_tokens": 541410238.0, "reward": 2.068359375, "reward_std": 0.0958566963672638, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 715.271484375, "completions/mean_terminated_length": 715.271484375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3799607408039601, "frac_reward_zero_std": 0.625, "grad_norm": 0.1071192805754338, "kl": 0.150390625, "learning_rate": 1.5605748696979773e-05, "loss": 0.023, "num_tokens": 541859081.0, "reward": 2.06298828125, "reward_std": 0.13727758824825287, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 733.216796875, "completions/mean_terminated_length": 733.216796875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.38030212511735084, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11669067622239754, "kl": 0.154296875, "learning_rate": 1.5595879098668385e-05, "loss": 0.0056, "num_tokens": 542307368.0, "reward": 2.119140625, "reward_std": 0.17156368494033813, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 694.783203125, "completions/mean_terminated_length": 694.783203125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.38064350943074166, "frac_reward_zero_std": 0.625, "grad_norm": 0.11397749438651483, "kl": 0.1591796875, "learning_rate": 1.5586001558030116e-05, "loss": 0.0114, "num_tokens": 542745353.0, "reward": 2.048828125, "reward_std": 0.12468095123767853, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 718.482421875, "completions/mean_terminated_length": 718.482421875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.38098489374413247, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09829545818508242, "kl": 0.14599609375, "learning_rate": 1.5576116089084327e-05, "loss": 0.0068, "num_tokens": 543193248.0, "reward": 2.0673828125, "reward_std": 0.11214835941791534, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 784.921875, "completions/mean_terminated_length": 784.921875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.3813262780575233, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08627034615890962, "kl": 0.14501953125, "learning_rate": 1.556622270586164e-05, "loss": 0.0037, "num_tokens": 543689208.0, "reward": 2.0283203125, "reward_std": 0.08119973540306091, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 730.400390625, "completions/mean_terminated_length": 730.400390625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.38166766237091404, "frac_reward_zero_std": 0.625, "grad_norm": 0.11187786379228688, "kl": 0.151123046875, "learning_rate": 1.5556321422403895e-05, "loss": 0.0133, "num_tokens": 544141973.0, "reward": 2.10498046875, "reward_std": 0.1369456648826599, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 833.21875, "completions/mean_terminated_length": 833.21875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.38200904668430485, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6103792120700328, "kl": 0.147705078125, "learning_rate": 1.5546412252764156e-05, "loss": 0.0067, "num_tokens": 544648565.0, "reward": 2.0419921875, "reward_std": 0.16991397738456726, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 716.501953125, "completions/mean_terminated_length": 716.501953125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.38235043099769567, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1248563490814579, "kl": 0.151611328125, "learning_rate": 1.5536495211006677e-05, "loss": 0.018, "num_tokens": 545097526.0, "reward": 2.0986328125, "reward_std": 0.17952154576778412, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 865.998046875, "completions/mean_terminated_length": 865.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3826918153110865, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2720141913259885, "kl": 0.296630859375, "learning_rate": 1.5526570311206884e-05, "loss": 0.0121, "num_tokens": 545628325.0, "reward": 2.10400390625, "reward_std": 0.2175171673297882, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 830.0, "completions/mean_terminated_length": 830.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.38303319962447724, "frac_reward_zero_std": 0.625, "grad_norm": 0.10406996077125634, "kl": 0.144287109375, "learning_rate": 1.5516637567451357e-05, "loss": 0.0116, "num_tokens": 546130133.0, "reward": 2.1201171875, "reward_std": 0.13167807459831238, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 860.181640625, "completions/mean_terminated_length": 859.389404296875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.38337458393786805, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2221047609777661, "kl": 0.20166015625, "learning_rate": 1.5506696993837812e-05, "loss": 0.0165, "num_tokens": 546655090.0, "reward": 2.0810546875, "reward_std": 0.16456982493400574, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 779.279296875, "completions/mean_terminated_length": 779.279296875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.38371596825125887, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09547496786808587, "kl": 0.138916015625, "learning_rate": 1.5496748604475076e-05, "loss": 0.0107, "num_tokens": 547129889.0, "reward": 2.06103515625, "reward_std": 0.13523928821086884, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 746.55859375, "completions/mean_terminated_length": 746.55859375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3840573525646497, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12107207214100976, "kl": 0.14453125, "learning_rate": 1.5486792413483057e-05, "loss": 0.0075, "num_tokens": 547595327.0, "reward": 2.06201171875, "reward_std": 0.16313436627388, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 739.5546875, "completions/mean_terminated_length": 739.5546875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.38439873687804044, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12764224924998005, "kl": 0.13525390625, "learning_rate": 1.547682843499276e-05, "loss": 0.0161, "num_tokens": 548049003.0, "reward": 2.0439453125, "reward_std": 0.16903018951416016, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 709.44140625, "completions/mean_terminated_length": 709.44140625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.38474012119143125, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1119582209547065, "kl": 0.1396484375, "learning_rate": 1.5466856683146226e-05, "loss": 0.0159, "num_tokens": 548499757.0, "reward": 2.07763671875, "reward_std": 0.12485788762569427, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 703.1640625, "completions/mean_terminated_length": 703.1640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.38508150550482206, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11173003328373164, "kl": 0.142578125, "learning_rate": 1.545687717209653e-05, "loss": 0.0097, "num_tokens": 548942209.0, "reward": 2.06884765625, "reward_std": 0.131199911236763, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 715.97265625, "completions/mean_terminated_length": 715.97265625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3854228898182129, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12038848151881942, "kl": 0.1494140625, "learning_rate": 1.5446889916007764e-05, "loss": 0.0086, "num_tokens": 549383555.0, "reward": 2.00732421875, "reward_std": 0.09086508303880692, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 702.853515625, "completions/mean_terminated_length": 702.853515625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.38576427413160363, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12068299433378184, "kl": 0.148681640625, "learning_rate": 1.5436894929055018e-05, "loss": 0.0112, "num_tokens": 549836360.0, "reward": 2.08984375, "reward_std": 0.14780564606189728, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 659.1875, "completions/mean_terminated_length": 659.1875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.38610565844499445, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12499328723464934, "kl": 0.144775390625, "learning_rate": 1.542689222542434e-05, "loss": 0.0019, "num_tokens": 550255720.0, "reward": 2.07177734375, "reward_std": 0.15623070299625397, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 696.30859375, "completions/mean_terminated_length": 696.30859375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.38644704275838526, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11642577552392654, "kl": 0.137939453125, "learning_rate": 1.5416881819312733e-05, "loss": 0.0003, "num_tokens": 550694006.0, "reward": 2.0888671875, "reward_std": 0.15700691938400269, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 726.99609375, "completions/mean_terminated_length": 726.99609375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3867884270717761, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13374938577653672, "kl": 0.138671875, "learning_rate": 1.5406863724928152e-05, "loss": 0.0016, "num_tokens": 551158772.0, "reward": 2.04443359375, "reward_std": 0.10526617616415024, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 749.322265625, "completions/mean_terminated_length": 748.2289428710938, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.38712981138516683, "frac_reward_zero_std": 0.59375, "grad_norm": 1.4192579931242442, "kl": 0.554931640625, "learning_rate": 1.5396837956489437e-05, "loss": 0.024, "num_tokens": 551627465.0, "reward": 2.04541015625, "reward_std": 0.13923919200897217, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.045280806720256805, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 763.810546875, "completions/mean_terminated_length": 763.810546875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.38747119569855765, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11672084827273409, "kl": 0.14306640625, "learning_rate": 1.5386804528226342e-05, "loss": 0.0047, "num_tokens": 552108824.0, "reward": 2.06787109375, "reward_std": 0.14892083406448364, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 804.078125, "completions/mean_terminated_length": 796.74658203125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.38781258001194846, "frac_reward_zero_std": 0.625, "grad_norm": 0.11130514507564411, "kl": 0.13916015625, "learning_rate": 1.5376763454379478e-05, "loss": 0.0115, "num_tokens": 552609696.0, "reward": 2.09521484375, "reward_std": 0.14271697402000427, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 786.548828125, "completions/mean_terminated_length": 786.548828125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3881539643253393, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09113523542881627, "kl": 0.1318359375, "learning_rate": 1.5366714749200315e-05, "loss": 0.0022, "num_tokens": 553097865.0, "reward": 2.0859375, "reward_std": 0.1497223824262619, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 838.48828125, "completions/mean_terminated_length": 836.121337890625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.38849534863873003, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10363102424763798, "kl": 0.129150390625, "learning_rate": 1.5356658426951148e-05, "loss": 0.0138, "num_tokens": 553608723.0, "reward": 2.11572265625, "reward_std": 0.15906822681427002, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 818.46875, "completions/mean_terminated_length": 818.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.38883673295212084, "frac_reward_zero_std": 0.75, "grad_norm": 0.07997500542014345, "kl": 0.134033203125, "learning_rate": 1.5346594501905094e-05, "loss": 0.0125, "num_tokens": 554103939.0, "reward": 2.03564453125, "reward_std": 0.0980663150548935, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 781.62890625, "completions/mean_terminated_length": 781.62890625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.38917811726551166, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10100518536278591, "kl": 0.144775390625, "learning_rate": 1.5336522988346047e-05, "loss": 0.0052, "num_tokens": 554587701.0, "reward": 2.0439453125, "reward_std": 0.14063362777233124, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 787.916015625, "completions/mean_terminated_length": 787.916015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.38951950157890247, "frac_reward_zero_std": 0.625, "grad_norm": 0.10450805560990872, "kl": 0.1376953125, "learning_rate": 1.5326443900568683e-05, "loss": 0.0078, "num_tokens": 555072634.0, "reward": 2.05078125, "reward_std": 0.1360301375389099, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 783.021484375, "completions/mean_terminated_length": 783.021484375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.38986088589229323, "frac_reward_zero_std": 0.625, "grad_norm": 0.10884497538871131, "kl": 0.136962890625, "learning_rate": 1.5316357252878424e-05, "loss": 0.0174, "num_tokens": 555550789.0, "reward": 2.0400390625, "reward_std": 0.13301309943199158, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 755.17578125, "completions/mean_terminated_length": 755.17578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.39020227020568404, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10985767308927434, "kl": 0.14111328125, "learning_rate": 1.5306263059591416e-05, "loss": 0.0043, "num_tokens": 556032975.0, "reward": 2.09375, "reward_std": 0.16314703226089478, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 762.078125, "completions/mean_terminated_length": 760.76123046875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.39054365451907486, "frac_reward_zero_std": 0.59375, "grad_norm": 0.31436032711843154, "kl": 0.14599609375, "learning_rate": 1.5296161335034522e-05, "loss": 0.0045, "num_tokens": 556519175.0, "reward": 2.07666015625, "reward_std": 0.140224426984787, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 751.3671875, "completions/mean_terminated_length": 751.3671875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.39088503883246567, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09368492256491569, "kl": 0.142333984375, "learning_rate": 1.52860520935453e-05, "loss": 0.0107, "num_tokens": 556987827.0, "reward": 2.04248046875, "reward_std": 0.10417771339416504, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 732.21875, "completions/mean_terminated_length": 732.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.39122642314585643, "frac_reward_zero_std": 0.625, "grad_norm": 0.12173107961196275, "kl": 0.140380859375, "learning_rate": 1.527593534947196e-05, "loss": -0.0034, "num_tokens": 557454291.0, "reward": 2.10009765625, "reward_std": 0.133047953248024, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 735.3046875, "completions/mean_terminated_length": 735.3046875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.39156780745924724, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11191342051941897, "kl": 0.146240234375, "learning_rate": 1.5265811117173373e-05, "loss": 0.0147, "num_tokens": 557908559.0, "reward": 2.0556640625, "reward_std": 0.11836553364992142, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 757.37109375, "completions/mean_terminated_length": 757.37109375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.39190919177263805, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08464126821514685, "kl": 0.137451171875, "learning_rate": 1.5255679411019037e-05, "loss": 0.0051, "num_tokens": 558383053.0, "reward": 2.05078125, "reward_std": 0.08228506147861481, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 761.30859375, "completions/mean_terminated_length": 761.30859375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.39225057608602887, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12110529456311142, "kl": 0.12890625, "learning_rate": 1.5245540245389053e-05, "loss": 0.0029, "num_tokens": 558855867.0, "reward": 2.1259765625, "reward_std": 0.19106711447238922, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 751.291015625, "completions/mean_terminated_length": 751.291015625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3925919603994196, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09524448929999507, "kl": 0.131103515625, "learning_rate": 1.5235393634674109e-05, "loss": 0.0078, "num_tokens": 559319056.0, "reward": 2.08935546875, "reward_std": 0.16247513890266418, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 754.6484375, "completions/mean_terminated_length": 754.6484375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.39293334471281044, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10772649176483311, "kl": 0.133056640625, "learning_rate": 1.5225239593275474e-05, "loss": 0.0192, "num_tokens": 559782556.0, "reward": 2.080078125, "reward_std": 0.17819766700267792, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 778.408203125, "completions/mean_terminated_length": 777.0997924804688, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.39327472902620125, "frac_reward_zero_std": 0.59375, "grad_norm": 0.27901958198521304, "kl": 0.315185546875, "learning_rate": 1.5215078135604944e-05, "loss": 0.0163, "num_tokens": 560270189.0, "reward": 2.05615234375, "reward_std": 0.14411747455596924, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 753.35546875, "completions/mean_terminated_length": 750.8218994140625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.39361611333959207, "frac_reward_zero_std": 0.625, "grad_norm": 0.19228661747264575, "kl": 0.138671875, "learning_rate": 1.520490927608485e-05, "loss": 0.0188, "num_tokens": 560744035.0, "reward": 2.07373046875, "reward_std": 0.13151831924915314, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 761.7734375, "completions/mean_terminated_length": 761.7734375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.3939574976529828, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07602852488577463, "kl": 0.1328125, "learning_rate": 1.519473302914803e-05, "loss": 0.0074, "num_tokens": 561207983.0, "reward": 2.03076171875, "reward_std": 0.07363377511501312, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 723.017578125, "completions/mean_terminated_length": 723.017578125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.39429888196637364, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09154988244793621, "kl": 0.134765625, "learning_rate": 1.5184549409237806e-05, "loss": 0.0023, "num_tokens": 561659848.0, "reward": 2.08203125, "reward_std": 0.10051137953996658, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 732.712890625, "completions/mean_terminated_length": 732.712890625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.39464026627976445, "frac_reward_zero_std": 0.84375, "grad_norm": 0.07408101419159865, "kl": 0.134765625, "learning_rate": 1.5174358430807959e-05, "loss": 0.0031, "num_tokens": 562121189.0, "reward": 2.052734375, "reward_std": 0.0634324923157692, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 790.57421875, "completions/mean_terminated_length": 788.9725952148438, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.39498165059315526, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3427760803312015, "kl": 0.15625, "learning_rate": 1.516416010832272e-05, "loss": 0.0043, "num_tokens": 562623019.0, "reward": 2.0322265625, "reward_std": 0.13698746263980865, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 745.390625, "completions/mean_terminated_length": 745.390625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.395323034906546, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11316620414343141, "kl": 0.142333984375, "learning_rate": 1.5153954456256754e-05, "loss": -0.0006, "num_tokens": 563085731.0, "reward": 2.0244140625, "reward_std": 0.07883721590042114, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 719.001953125, "completions/mean_terminated_length": 719.001953125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.39566441921993684, "frac_reward_zero_std": 0.5625, "grad_norm": 0.146432422772475, "kl": 0.135498046875, "learning_rate": 1.5143741489095104e-05, "loss": 0.0184, "num_tokens": 563532724.0, "reward": 2.0966796875, "reward_std": 0.16574768722057343, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 758.45703125, "completions/mean_terminated_length": 758.45703125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.39600580353332765, "frac_reward_zero_std": 0.75, "grad_norm": 0.15912368244737518, "kl": 0.12890625, "learning_rate": 1.5133521221333212e-05, "loss": 0.0106, "num_tokens": 564012478.0, "reward": 2.1162109375, "reward_std": 0.09916865825653076, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 689.8359375, "completions/mean_terminated_length": 688.7730102539062, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.39634718784671846, "frac_reward_zero_std": 0.46875, "grad_norm": 2.950489333418539, "kl": 0.595458984375, "learning_rate": 1.5123293667476886e-05, "loss": 0.0434, "num_tokens": 564443322.0, "reward": 2.0732421875, "reward_std": 0.2040901631116867, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 766.970703125, "completions/mean_terminated_length": 764.7274780273438, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.3966885721601092, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2684685588408513, "kl": 0.440673828125, "learning_rate": 1.5113058842042256e-05, "loss": 0.0272, "num_tokens": 564923739.0, "reward": 2.05859375, "reward_std": 0.19521546363830566, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 768.146484375, "completions/mean_terminated_length": 768.146484375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.39702995647350003, "frac_reward_zero_std": 0.625, "grad_norm": 0.10496853922830116, "kl": 0.13232421875, "learning_rate": 1.5102816759555792e-05, "loss": 0.0068, "num_tokens": 565398166.0, "reward": 2.1005859375, "reward_std": 0.1478804498910904, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 742.763671875, "completions/mean_terminated_length": 742.763671875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.39737134078689085, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12091821241248345, "kl": 0.13525390625, "learning_rate": 1.509256743455426e-05, "loss": 0.0153, "num_tokens": 565861581.0, "reward": 2.0390625, "reward_std": 0.1582350730895996, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 715.986328125, "completions/mean_terminated_length": 715.986328125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.39771272510028166, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1648945197835138, "kl": 0.13671875, "learning_rate": 1.5082310881584692e-05, "loss": 0.0133, "num_tokens": 566302694.0, "reward": 2.0322265625, "reward_std": 0.15897685289382935, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 744.01171875, "completions/mean_terminated_length": 744.01171875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3980541094136724, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12180843694584541, "kl": 0.130615234375, "learning_rate": 1.5072047115204398e-05, "loss": 0.0143, "num_tokens": 566765148.0, "reward": 2.0615234375, "reward_std": 0.18678218126296997, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 741.98046875, "completions/mean_terminated_length": 741.98046875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.39839549372706323, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10112695822504766, "kl": 0.133544921875, "learning_rate": 1.5061776149980915e-05, "loss": 0.0087, "num_tokens": 567228642.0, "reward": 2.0634765625, "reward_std": 0.11217039823532104, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 742.169921875, "completions/mean_terminated_length": 742.169921875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.39873687804045405, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11790488827820735, "kl": 0.13330078125, "learning_rate": 1.5051498000491997e-05, "loss": 0.002, "num_tokens": 567688985.0, "reward": 2.11279296875, "reward_std": 0.1840301752090454, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 793.41796875, "completions/mean_terminated_length": 793.41796875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.39907826235384486, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1725536562628815, "kl": 0.130615234375, "learning_rate": 1.5041212681325598e-05, "loss": 0.0009, "num_tokens": 568174895.0, "reward": 2.10107421875, "reward_std": 0.18563753366470337, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 849.626953125, "completions/mean_terminated_length": 849.626953125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3994196466672356, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08315423677198809, "kl": 0.123291015625, "learning_rate": 1.503092020707985e-05, "loss": 0.0066, "num_tokens": 568690192.0, "reward": 2.0908203125, "reward_std": 0.13704869151115417, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 765.41015625, "completions/mean_terminated_length": 765.41015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.39976103098062643, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0710708676471876, "kl": 0.128173828125, "learning_rate": 1.5020620592363035e-05, "loss": 0.0189, "num_tokens": 569168242.0, "reward": 2.0712890625, "reward_std": 0.09175549447536469, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 776.310546875, "completions/mean_terminated_length": 776.310546875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.40010241529401724, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10063162170619748, "kl": 0.133056640625, "learning_rate": 1.5010313851793577e-05, "loss": 0.0033, "num_tokens": 569654641.0, "reward": 2.05029296875, "reward_std": 0.14335371553897858, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 809.8046875, "completions/mean_terminated_length": 809.8046875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.40044379960740806, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1110829080389506, "kl": 0.1337890625, "learning_rate": 1.5000000000000002e-05, "loss": 0.0063, "num_tokens": 570156205.0, "reward": 2.05615234375, "reward_std": 0.14892086386680603, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 749.595703125, "completions/mean_terminated_length": 749.595703125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4007851839207988, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10033485659637532, "kl": 0.134033203125, "learning_rate": 1.498967905162094e-05, "loss": 0.005, "num_tokens": 570618238.0, "reward": 2.1025390625, "reward_std": 0.13029411435127258, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 742.443359375, "completions/mean_terminated_length": 742.443359375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.40112656823418963, "frac_reward_zero_std": 0.75, "grad_norm": 0.0789261990892619, "kl": 0.13623046875, "learning_rate": 1.4979351021305088e-05, "loss": 0.0107, "num_tokens": 571078321.0, "reward": 2.01953125, "reward_std": 0.09805534780025482, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 828.28515625, "completions/mean_terminated_length": 828.28515625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.40146795254758044, "frac_reward_zero_std": 0.625, "grad_norm": 0.0963682137612647, "kl": 0.131591796875, "learning_rate": 1.4969015923711197e-05, "loss": 0.0117, "num_tokens": 571582243.0, "reward": 2.03515625, "reward_std": 0.14946137368679047, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 804.228515625, "completions/mean_terminated_length": 804.228515625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.40180933686097126, "frac_reward_zero_std": 0.5, "grad_norm": 0.11724506193464641, "kl": 0.133544921875, "learning_rate": 1.495867377350805e-05, "loss": 0.0085, "num_tokens": 572074056.0, "reward": 2.06689453125, "reward_std": 0.19699105620384216, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 755.130859375, "completions/mean_terminated_length": 755.130859375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.402150721174362, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10968775951998334, "kl": 0.14111328125, "learning_rate": 1.4948324585374432e-05, "loss": 0.0078, "num_tokens": 572546475.0, "reward": 2.0224609375, "reward_std": 0.11352013796567917, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 762.05859375, "completions/mean_terminated_length": 762.05859375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.4024921054877528, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12228586884411488, "kl": 0.14111328125, "learning_rate": 1.493796837399913e-05, "loss": 0.0067, "num_tokens": 573017785.0, "reward": 2.06396484375, "reward_std": 0.17676088213920593, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 757.46484375, "completions/mean_terminated_length": 757.46484375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.40283348980114364, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12483420938055487, "kl": 0.141845703125, "learning_rate": 1.4927605154080885e-05, "loss": 0.005, "num_tokens": 573489511.0, "reward": 2.0791015625, "reward_std": 0.18922439217567444, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 753.580078125, "completions/mean_terminated_length": 753.580078125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.40317487411453445, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10729802080036764, "kl": 0.140625, "learning_rate": 1.4917234940328395e-05, "loss": 0.0035, "num_tokens": 573950240.0, "reward": 2.07470703125, "reward_std": 0.17150183022022247, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 760.111328125, "completions/mean_terminated_length": 760.111328125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4035162584279252, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11408417629594313, "kl": 0.137939453125, "learning_rate": 1.4906857747460284e-05, "loss": 0.0106, "num_tokens": 574421849.0, "reward": 2.048828125, "reward_std": 0.1508398950099945, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 751.8671875, "completions/mean_terminated_length": 751.8671875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.403857642741316, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1115270987499037, "kl": 0.142578125, "learning_rate": 1.4896473590205073e-05, "loss": 0.018, "num_tokens": 574891829.0, "reward": 2.05712890625, "reward_std": 0.10596694052219391, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 784.935546875, "completions/mean_terminated_length": 784.935546875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.40419902705470684, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1289061398627619, "kl": 0.1376953125, "learning_rate": 1.4886082483301181e-05, "loss": 0.0085, "num_tokens": 575378708.0, "reward": 2.09130859375, "reward_std": 0.2182164490222931, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 740.87890625, "completions/mean_terminated_length": 740.87890625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.40454041136809765, "frac_reward_zero_std": 0.625, "grad_norm": 0.11062978498347063, "kl": 0.14013671875, "learning_rate": 1.4875684441496883e-05, "loss": 0.007, "num_tokens": 575849142.0, "reward": 2.0869140625, "reward_std": 0.1504589468240738, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 744.146484375, "completions/mean_terminated_length": 744.146484375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4048817956814884, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10940810331961559, "kl": 0.1376953125, "learning_rate": 1.4865279479550292e-05, "loss": 0.0044, "num_tokens": 576314769.0, "reward": 2.1123046875, "reward_std": 0.14095279574394226, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 767.58203125, "completions/mean_terminated_length": 767.58203125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.4052231799948792, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10674677863150626, "kl": 0.143310546875, "learning_rate": 1.4854867612229352e-05, "loss": 0.0103, "num_tokens": 576792907.0, "reward": 2.0791015625, "reward_std": 0.152098149061203, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 771.31640625, "completions/mean_terminated_length": 771.31640625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.40556456430827004, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10319086760921295, "kl": 0.137939453125, "learning_rate": 1.4844448854311806e-05, "loss": 0.0059, "num_tokens": 577266109.0, "reward": 2.0595703125, "reward_std": 0.1090644970536232, "rewards/accuracy_reward/mean": 0.06666667014360428, "rewards/accuracy_reward/std": 0.24970407783985138, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 785.892578125, "completions/mean_terminated_length": 785.892578125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.40590594862166085, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11630800175917833, "kl": 0.13427734375, "learning_rate": 1.4834023220585169e-05, "loss": 0.0048, "num_tokens": 577753590.0, "reward": 2.11376953125, "reward_std": 0.1580388844013214, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 783.119140625, "completions/mean_terminated_length": 783.119140625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4062473329350516, "frac_reward_zero_std": 0.71875, "grad_norm": 0.11206008650174434, "kl": 0.135986328125, "learning_rate": 1.4823590725846728e-05, "loss": 0.0082, "num_tokens": 578229011.0, "reward": 2.0595703125, "reward_std": 0.09226740896701813, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 780.51953125, "completions/mean_terminated_length": 780.51953125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.4065887172484424, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09817631313796393, "kl": 0.132568359375, "learning_rate": 1.4813151384903494e-05, "loss": 0.008, "num_tokens": 578704509.0, "reward": 2.07763671875, "reward_std": 0.12568223476409912, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 800.322265625, "completions/mean_terminated_length": 800.322265625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.40693010156183324, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09372910945139061, "kl": 0.1337890625, "learning_rate": 1.4802705212572215e-05, "loss": 0.0095, "num_tokens": 579193874.0, "reward": 2.0419921875, "reward_std": 0.1025063768029213, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 734.748046875, "completions/mean_terminated_length": 734.748046875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.40727148587522405, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11755571662445884, "kl": 0.140869140625, "learning_rate": 1.4792252223679308e-05, "loss": 0.0229, "num_tokens": 579649489.0, "reward": 2.07470703125, "reward_std": 0.14910444617271423, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 678.73828125, "completions/mean_terminated_length": 678.73828125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4076128701886148, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13421942465049966, "kl": 0.14111328125, "learning_rate": 1.4781792433060884e-05, "loss": 0.0167, "num_tokens": 580075291.0, "reward": 2.10302734375, "reward_std": 0.24123135209083557, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.047786012291908264, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 730.427734375, "completions/mean_terminated_length": 730.427734375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4079542545020056, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12950431301118837, "kl": 0.13623046875, "learning_rate": 1.4771325855562707e-05, "loss": 0.0105, "num_tokens": 580538166.0, "reward": 2.064453125, "reward_std": 0.14509317278862, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 669.794921875, "completions/mean_terminated_length": 669.794921875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.40829563881539643, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13398551056219313, "kl": 0.138427734375, "learning_rate": 1.4760852506040163e-05, "loss": -0.0001, "num_tokens": 580963933.0, "reward": 2.1142578125, "reward_std": 0.20395320653915405, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1566.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 740.546875, "completions/mean_terminated_length": 740.546875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.40863702312878725, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13161885586087022, "kl": 0.1357421875, "learning_rate": 1.4750372399358257e-05, "loss": 0.0113, "num_tokens": 581433925.0, "reward": 2.076171875, "reward_std": 0.21939748525619507, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.03785909339785576, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 724.75, "completions/mean_terminated_length": 724.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.408978407442178, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19944512423066896, "kl": 0.13525390625, "learning_rate": 1.4739885550391585e-05, "loss": 0.0185, "num_tokens": 581886293.0, "reward": 2.05126953125, "reward_std": 0.10695584863424301, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 722.142578125, "completions/mean_terminated_length": 722.142578125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.4093197917555688, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11257933861638127, "kl": 0.134033203125, "learning_rate": 1.4729391974024315e-05, "loss": 0.011, "num_tokens": 582331118.0, "reward": 2.0439453125, "reward_std": 0.14073872566223145, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 737.072265625, "completions/mean_terminated_length": 734.5068359375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.40966117606895963, "frac_reward_zero_std": 0.53125, "grad_norm": 8.620907178612065, "kl": 0.2314453125, "learning_rate": 1.4718891685150151e-05, "loss": 0.0061, "num_tokens": 582794291.0, "reward": 2.087890625, "reward_std": 0.1770661473274231, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 681.8984375, "completions/mean_terminated_length": 681.8984375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.41000256038235044, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14738513452997692, "kl": 0.15087890625, "learning_rate": 1.4708384698672342e-05, "loss": 0.0261, "num_tokens": 583223231.0, "reward": 2.09912109375, "reward_std": 0.19165322184562683, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 699.57421875, "completions/mean_terminated_length": 699.57421875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.41034394469574126, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12834437911726587, "kl": 0.144775390625, "learning_rate": 1.4697871029503626e-05, "loss": 0.0074, "num_tokens": 583663333.0, "reward": 2.0146484375, "reward_std": 0.18851128220558167, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.04081062600016594, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 735.349609375, "completions/mean_terminated_length": 735.349609375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.410685329009132, "frac_reward_zero_std": 0.4375, "grad_norm": 3.35063753989393, "kl": 0.153564453125, "learning_rate": 1.4687350692566236e-05, "loss": 0.0154, "num_tokens": 584126744.0, "reward": 2.0, "reward_std": 0.17500996589660645, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04626457393169403, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 730.75390625, "completions/mean_terminated_length": 730.75390625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.41102671332252283, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1264220923663278, "kl": 0.139892578125, "learning_rate": 1.4676823702791865e-05, "loss": 0.0094, "num_tokens": 584585162.0, "reward": 2.08251953125, "reward_std": 0.19279228150844574, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.05039432644844055, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 677.48828125, "completions/mean_terminated_length": 677.48828125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.41136809763591364, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1325926888346177, "kl": 0.1513671875, "learning_rate": 1.466629007512165e-05, "loss": 0.011, "num_tokens": 585015972.0, "reward": 2.09228515625, "reward_std": 0.12333377450704575, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 709.1328125, "completions/mean_terminated_length": 709.1328125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.41170948194930446, "frac_reward_zero_std": 0.46875, "grad_norm": 0.16021885027895535, "kl": 0.14501953125, "learning_rate": 1.4655749824506152e-05, "loss": 0.0184, "num_tokens": 585461336.0, "reward": 2.0859375, "reward_std": 0.186717227101326, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 763.794921875, "completions/mean_terminated_length": 763.794921875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.4120508662626952, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1213962499772317, "kl": 0.14404296875, "learning_rate": 1.4645202965905325e-05, "loss": 0.0153, "num_tokens": 585936799.0, "reward": 2.0810546875, "reward_std": 0.13808704912662506, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 719.302734375, "completions/mean_terminated_length": 719.302734375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.41239225057608603, "frac_reward_zero_std": 0.5, "grad_norm": 0.12826540547215373, "kl": 0.1455078125, "learning_rate": 1.4634649514288508e-05, "loss": 0.009, "num_tokens": 586390874.0, "reward": 2.07470703125, "reward_std": 0.16075019538402557, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 815.16015625, "completions/mean_terminated_length": 814.4794311523438, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.41273363488947684, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5862329889236116, "kl": 0.316650390625, "learning_rate": 1.4624089484634394e-05, "loss": 0.0195, "num_tokens": 586890508.0, "reward": 2.0390625, "reward_std": 0.17495597898960114, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 787.271484375, "completions/mean_terminated_length": 787.271484375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.41307501920286765, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4571532333247072, "kl": 0.151611328125, "learning_rate": 1.4613522891931013e-05, "loss": 0.0187, "num_tokens": 587369271.0, "reward": 2.05078125, "reward_std": 0.17196111381053925, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 705.759765625, "completions/mean_terminated_length": 705.759765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4134164035162584, "frac_reward_zero_std": 0.5, "grad_norm": 0.1273191717966715, "kl": 0.14208984375, "learning_rate": 1.4602949751175714e-05, "loss": 0.0172, "num_tokens": 587828940.0, "reward": 2.0341796875, "reward_std": 0.15560951828956604, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 750.4140625, "completions/mean_terminated_length": 750.4140625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.4137577878296492, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11606074148571242, "kl": 0.154541015625, "learning_rate": 1.4592370077375132e-05, "loss": 0.0204, "num_tokens": 588297136.0, "reward": 2.07373046875, "reward_std": 0.1714479923248291, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 749.65625, "completions/mean_terminated_length": 749.65625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.41409917214304004, "frac_reward_zero_std": 0.53125, "grad_norm": 0.123136371369016, "kl": 0.147705078125, "learning_rate": 1.4581783885545184e-05, "loss": 0.0124, "num_tokens": 588766000.0, "reward": 2.041015625, "reward_std": 0.14326819777488708, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 751.048828125, "completions/mean_terminated_length": 751.048828125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.41444055645643085, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3315637196891423, "kl": 0.176513671875, "learning_rate": 1.4571191190711029e-05, "loss": 0.0096, "num_tokens": 589231513.0, "reward": 2.048828125, "reward_std": 0.13293591141700745, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 719.390625, "completions/mean_terminated_length": 719.390625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4147819407698216, "frac_reward_zero_std": 0.3125, "grad_norm": 0.17210266401344534, "kl": 0.16455078125, "learning_rate": 1.456059200790706e-05, "loss": 0.0152, "num_tokens": 589677345.0, "reward": 2.0400390625, "reward_std": 0.2500760555267334, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.958984375, "rewards/format_reward/std": 0.19852031767368317, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.048670168966054916, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 698.396484375, "completions/mean_terminated_length": 698.396484375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.4151233250832124, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1893322250696519, "kl": 0.166748046875, "learning_rate": 1.4549986352176882e-05, "loss": 0.0098, "num_tokens": 590119084.0, "reward": 1.806640625, "reward_std": 0.5103014707565308, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.7734375, "rewards/format_reward/std": 0.4190165400505066, "rewards/tag_count_reward/mean": 0.939453125, "rewards/tag_count_reward/std": 0.12412548810243607, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 695.693359375, "completions/mean_terminated_length": 693.07666015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.41546470939660324, "frac_reward_zero_std": 0.0, "grad_norm": 0.36452752268868627, "kl": 0.220703125, "learning_rate": 1.4539374238573276e-05, "loss": 0.0237, "num_tokens": 590554943.0, "reward": 1.59716796875, "reward_std": 0.6430724263191223, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.63671875, "rewards/format_reward/std": 0.4814152419567108, "rewards/tag_count_reward/mean": 0.89599609375, "rewards/tag_count_reward/std": 0.15964914858341217, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 635.33984375, "completions/mean_terminated_length": 633.5137329101562, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.41580609370999405, "frac_reward_zero_std": 0.0, "grad_norm": 0.5016452340156518, "kl": 0.390869140625, "learning_rate": 1.4528755682158205e-05, "loss": 0.0391, "num_tokens": 590970941.0, "reward": 1.74169921875, "reward_std": 0.5658047795295715, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.75390625, "rewards/format_reward/std": 0.4311550557613373, "rewards/tag_count_reward/mean": 0.92333984375, "rewards/tag_count_reward/std": 0.14707578718662262, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 642.68359375, "completions/mean_terminated_length": 640.75146484375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.4161474780233848, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5300700591459354, "kl": 0.21240234375, "learning_rate": 1.4518130698002763e-05, "loss": 0.0324, "num_tokens": 591391515.0, "reward": 1.8896484375, "reward_std": 0.3853845000267029, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686305940151215, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31241437792778015, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.09735506027936935, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 583.810546875, "completions/mean_terminated_length": 583.810546875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4164888623367756, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17128061813730355, "kl": 0.17919921875, "learning_rate": 1.4507499301187173e-05, "loss": 0.0153, "num_tokens": 591774714.0, "reward": 1.99609375, "reward_std": 0.21906518936157227, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.1843547374010086, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.057698383927345276, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 574.56640625, "completions/mean_terminated_length": 573.74169921875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.41683024665016644, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14837611431749054, "kl": 0.182861328125, "learning_rate": 1.449686150680076e-05, "loss": 0.0182, "num_tokens": 592151564.0, "reward": 1.97265625, "reward_std": 0.1755637228488922, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04626457393169403, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 554.45703125, "completions/mean_terminated_length": 554.45703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.41717163096355725, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1799772988992238, "kl": 0.19287109375, "learning_rate": 1.4486217329941923e-05, "loss": 0.0127, "num_tokens": 592515862.0, "reward": 2.0263671875, "reward_std": 0.1586238294839859, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.04081062600016594, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 527.154296875, "completions/mean_terminated_length": 527.154296875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.417513015276948, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1748562743967559, "kl": 0.187255859375, "learning_rate": 1.4475566785718127e-05, "loss": 0.011, "num_tokens": 592871173.0, "reward": 1.98876953125, "reward_std": 0.13336944580078125, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.05252963304519653, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 568.060546875, "completions/mean_terminated_length": 568.060546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.4178543995903388, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1433604196471944, "kl": 0.18017578125, "learning_rate": 1.4464909889245868e-05, "loss": 0.0267, "num_tokens": 593240452.0, "reward": 1.99658203125, "reward_std": 0.1684112250804901, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.036283548921346664, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 593.826171875, "completions/mean_terminated_length": 593.826171875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.41819578390372963, "frac_reward_zero_std": 0.28125, "grad_norm": 0.17458916926454537, "kl": 0.169677734375, "learning_rate": 1.4454246655650668e-05, "loss": 0.0305, "num_tokens": 593625163.0, "reward": 1.99609375, "reward_std": 0.2163454294204712, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.07241551578044891, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 644.509765625, "completions/mean_terminated_length": 643.4539794921875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.41853716821712045, "frac_reward_zero_std": 0.59375, "grad_norm": 3.667167945828998, "kl": 0.805419921875, "learning_rate": 1.4443577100067033e-05, "loss": 0.0453, "num_tokens": 594039120.0, "reward": 2.0068359375, "reward_std": 0.13701844215393066, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06218579038977623, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 688.373046875, "completions/mean_terminated_length": 688.373046875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.4188785525305112, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13952152127161083, "kl": 0.163818359375, "learning_rate": 1.4432901237638444e-05, "loss": 0.0101, "num_tokens": 594474559.0, "reward": 2.0126953125, "reward_std": 0.1799059808254242, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 731.681640625, "completions/mean_terminated_length": 731.681640625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.419219936843902, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12138102259197521, "kl": 0.15625, "learning_rate": 1.4422219083517338e-05, "loss": 0.0089, "num_tokens": 594937020.0, "reward": 2.0419921875, "reward_std": 0.14340853691101074, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 791.974609375, "completions/mean_terminated_length": 789.5166015625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.41956132115729283, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1129454032004388, "kl": 0.146728515625, "learning_rate": 1.4411530652865077e-05, "loss": 0.0243, "num_tokens": 595427679.0, "reward": 2.025390625, "reward_std": 0.16940301656723022, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 755.21875, "completions/mean_terminated_length": 755.21875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.41990270547068365, "frac_reward_zero_std": 0.5, "grad_norm": 0.12054882080746337, "kl": 0.150146484375, "learning_rate": 1.4400835960851938e-05, "loss": 0.0014, "num_tokens": 595901087.0, "reward": 2.03076171875, "reward_std": 0.1740472912788391, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 797.73828125, "completions/mean_terminated_length": 797.73828125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4202440897840744, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09862782775924575, "kl": 0.152099609375, "learning_rate": 1.439013502265707e-05, "loss": 0.0059, "num_tokens": 596391177.0, "reward": 2.0439453125, "reward_std": 0.1142137423157692, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 773.89453125, "completions/mean_terminated_length": 773.89453125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.4205854740974652, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12276262450696711, "kl": 0.153076171875, "learning_rate": 1.4379427853468508e-05, "loss": 0.0073, "num_tokens": 596883267.0, "reward": 2.087890625, "reward_std": 0.1697334349155426, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 815.90625, "completions/mean_terminated_length": 815.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.42092685841085603, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16681278920870385, "kl": 0.15869140625, "learning_rate": 1.4368714468483115e-05, "loss": 0.0064, "num_tokens": 597383027.0, "reward": 2.04833984375, "reward_std": 0.1448490172624588, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 797.59765625, "completions/mean_terminated_length": 797.59765625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.42126824272424684, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1251616803178815, "kl": 0.156982421875, "learning_rate": 1.4357994882906586e-05, "loss": 0.0057, "num_tokens": 597870661.0, "reward": 2.037109375, "reward_std": 0.1576440930366516, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 780.177734375, "completions/mean_terminated_length": 780.177734375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4216096270376376, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10199866284105549, "kl": 0.146484375, "learning_rate": 1.4347269111953406e-05, "loss": 0.0057, "num_tokens": 598359952.0, "reward": 2.01318359375, "reward_std": 0.11291886866092682, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05285605043172836, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 877.6484375, "completions/mean_terminated_length": 877.6484375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4219510113510284, "frac_reward_zero_std": 0.5, "grad_norm": 0.10514160105592739, "kl": 0.144775390625, "learning_rate": 1.4336537170846849e-05, "loss": 0.0028, "num_tokens": 598908860.0, "reward": 2.11328125, "reward_std": 0.1941487193107605, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 776.4453125, "completions/mean_terminated_length": 775.7788696289062, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.42229239566441923, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4443495939726328, "kl": 0.25146484375, "learning_rate": 1.432579907481894e-05, "loss": 0.0137, "num_tokens": 599389008.0, "reward": 2.05517578125, "reward_std": 0.1662340760231018, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 714.177734375, "completions/mean_terminated_length": 714.177734375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.42263377997781004, "frac_reward_zero_std": 0.5, "grad_norm": 0.13559855645792462, "kl": 0.166015625, "learning_rate": 1.4315054839110445e-05, "loss": 0.0138, "num_tokens": 599838875.0, "reward": 2.06201171875, "reward_std": 0.18528705835342407, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 733.046875, "completions/mean_terminated_length": 733.046875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.4229751642912008, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10931212078282443, "kl": 0.16796875, "learning_rate": 1.4304304478970839e-05, "loss": 0.0034, "num_tokens": 600295955.0, "reward": 2.04541015625, "reward_std": 0.13272294402122498, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23102475702762604, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 729.84765625, "completions/mean_terminated_length": 729.84765625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4233165486045916, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09528557740312651, "kl": 0.162109375, "learning_rate": 1.4293548009658294e-05, "loss": 0.0088, "num_tokens": 600755077.0, "reward": 2.02587890625, "reward_std": 0.11668550223112106, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 721.328125, "completions/mean_terminated_length": 720.2034912109375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4236579329179824, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7672737519849829, "kl": 0.536376953125, "learning_rate": 1.4282785446439654e-05, "loss": 0.0283, "num_tokens": 601206461.0, "reward": 2.06396484375, "reward_std": 0.15191341936588287, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 783.3515625, "completions/mean_terminated_length": 781.8453979492188, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.42399931723137324, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3269297348345463, "kl": 0.397216796875, "learning_rate": 1.4272016804590398e-05, "loss": 0.037, "num_tokens": 601700433.0, "reward": 2.08154296875, "reward_std": 0.17030742764472961, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 691.900390625, "completions/mean_terminated_length": 691.1956787109375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.424340701544764, "frac_reward_zero_std": 0.5625, "grad_norm": 0.35196022540747884, "kl": 0.1953125, "learning_rate": 1.4261242099394654e-05, "loss": 0.0188, "num_tokens": 602137086.0, "reward": 2.03271484375, "reward_std": 0.14225342869758606, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 752.302734375, "completions/mean_terminated_length": 752.302734375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.4246820858581548, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11752061134805177, "kl": 0.158935546875, "learning_rate": 1.4250461346145142e-05, "loss": 0.0192, "num_tokens": 602606985.0, "reward": 2.13037109375, "reward_std": 0.17821475863456726, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 766.244140625, "completions/mean_terminated_length": 766.244140625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.4250234701715456, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08562671132179103, "kl": 0.14892578125, "learning_rate": 1.4239674560143168e-05, "loss": 0.0077, "num_tokens": 603081750.0, "reward": 2.03369140625, "reward_std": 0.10767760872840881, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 713.189453125, "completions/mean_terminated_length": 713.189453125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.42536485448493644, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1274490058231458, "kl": 0.162841796875, "learning_rate": 1.4228881756698603e-05, "loss": 0.0079, "num_tokens": 603540535.0, "reward": 2.06494140625, "reward_std": 0.16365396976470947, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 754.083984375, "completions/mean_terminated_length": 754.083984375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4257062387983272, "frac_reward_zero_std": 0.65625, "grad_norm": 0.6623296073522212, "kl": 0.164794921875, "learning_rate": 1.4218082951129859e-05, "loss": 0.0149, "num_tokens": 604024034.0, "reward": 2.09765625, "reward_std": 0.13985879719257355, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 726.6015625, "completions/mean_terminated_length": 726.6015625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.426047623111718, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09521097317283642, "kl": 0.1533203125, "learning_rate": 1.4207278158763861e-05, "loss": 0.0022, "num_tokens": 604484054.0, "reward": 2.0439453125, "reward_std": 0.13731977343559265, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 693.1796875, "completions/mean_terminated_length": 693.1796875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4263890074251088, "frac_reward_zero_std": 0.5, "grad_norm": 0.1328091908326859, "kl": 0.162841796875, "learning_rate": 1.419646739493604e-05, "loss": 0.0069, "num_tokens": 604916210.0, "reward": 2.078125, "reward_std": 0.18824580311775208, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 798.359375, "completions/mean_terminated_length": 798.359375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.42673039173849964, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1131846111830826, "kl": 0.14111328125, "learning_rate": 1.4185650674990297e-05, "loss": 0.0155, "num_tokens": 605402986.0, "reward": 2.064453125, "reward_std": 0.1742907464504242, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 768.013671875, "completions/mean_terminated_length": 768.013671875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.4270717760518904, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11314655787184236, "kl": 0.146484375, "learning_rate": 1.4174828014278985e-05, "loss": 0.0065, "num_tokens": 605873505.0, "reward": 2.09423828125, "reward_std": 0.17149238288402557, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 800.537109375, "completions/mean_terminated_length": 800.537109375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4274131603652812, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09251768939872458, "kl": 0.1474609375, "learning_rate": 1.4163999428162894e-05, "loss": 0.0248, "num_tokens": 606360644.0, "reward": 2.0634765625, "reward_std": 0.11611117422580719, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 763.970703125, "completions/mean_terminated_length": 763.970703125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.427754544678672, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11705036889737779, "kl": 0.14404296875, "learning_rate": 1.4153164932011223e-05, "loss": 0.0075, "num_tokens": 606839733.0, "reward": 2.0595703125, "reward_std": 0.1638772189617157, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 749.12109375, "completions/mean_terminated_length": 749.12109375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.42809592899206284, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10111998733731077, "kl": 0.144287109375, "learning_rate": 1.4142324541201553e-05, "loss": 0.0126, "num_tokens": 607307939.0, "reward": 2.05322265625, "reward_std": 0.11704547703266144, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 749.03515625, "completions/mean_terminated_length": 747.26806640625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4284373133054536, "frac_reward_zero_std": 0.625, "grad_norm": 0.201267110775523, "kl": 0.228271484375, "learning_rate": 1.4131478271119839e-05, "loss": 0.0136, "num_tokens": 607775285.0, "reward": 2.068359375, "reward_std": 0.13467703759670258, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 737.26171875, "completions/mean_terminated_length": 737.26171875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4287786976188444, "frac_reward_zero_std": 0.65625, "grad_norm": 0.099694182871476, "kl": 0.146728515625, "learning_rate": 1.4120626137160377e-05, "loss": 0.0087, "num_tokens": 608232587.0, "reward": 2.0712890625, "reward_std": 0.13254845142364502, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 732.072265625, "completions/mean_terminated_length": 732.072265625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4291200819322352, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1080296056248656, "kl": 0.1474609375, "learning_rate": 1.4109768154725783e-05, "loss": 0.0078, "num_tokens": 608687344.0, "reward": 2.06982421875, "reward_std": 0.12579593062400818, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 801.5390625, "completions/mean_terminated_length": 801.5390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.42946146624562603, "frac_reward_zero_std": 0.625, "grad_norm": 0.1041155688235961, "kl": 0.137451171875, "learning_rate": 1.4098904339226982e-05, "loss": 0.0104, "num_tokens": 609179556.0, "reward": 2.03271484375, "reward_std": 0.13523036241531372, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 755.2890625, "completions/mean_terminated_length": 755.2890625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4298028505590168, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09505534170403113, "kl": 0.14013671875, "learning_rate": 1.4088034706083173e-05, "loss": 0.0042, "num_tokens": 609647960.0, "reward": 2.02294921875, "reward_std": 0.1121893972158432, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 721.326171875, "completions/mean_terminated_length": 721.326171875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4301442348724076, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10312996181099977, "kl": 0.140869140625, "learning_rate": 1.407715927072181e-05, "loss": 0.0055, "num_tokens": 610104335.0, "reward": 2.08837890625, "reward_std": 0.12249472737312317, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 735.5859375, "completions/mean_terminated_length": 734.6966552734375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4304856191857984, "frac_reward_zero_std": 0.71875, "grad_norm": 0.6854690970025845, "kl": 0.410400390625, "learning_rate": 1.4066278048578586e-05, "loss": 0.0182, "num_tokens": 610561035.0, "reward": 2.02197265625, "reward_std": 0.10318189859390259, "rewards/accuracy_reward/mean": 0.03427419438958168, "rewards/accuracy_reward/std": 0.18211627006530762, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 785.8359375, "completions/mean_terminated_length": 785.8359375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.43082700349918923, "frac_reward_zero_std": 0.625, "grad_norm": 0.10914563075871125, "kl": 0.135498046875, "learning_rate": 1.40553910550974e-05, "loss": 0.0142, "num_tokens": 611046743.0, "reward": 2.07568359375, "reward_std": 0.15227971971035004, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 738.2265625, "completions/mean_terminated_length": 738.2265625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.43116838781258, "frac_reward_zero_std": 0.5, "grad_norm": 0.12209085736684222, "kl": 0.1416015625, "learning_rate": 1.404449830573036e-05, "loss": 0.0043, "num_tokens": 611503291.0, "reward": 2.07470703125, "reward_std": 0.1897425651550293, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 765.09765625, "completions/mean_terminated_length": 765.09765625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.4315097721259708, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08739827752936208, "kl": 0.142822265625, "learning_rate": 1.4033599815937715e-05, "loss": 0.0069, "num_tokens": 611987277.0, "reward": 2.0048828125, "reward_std": 0.09144711494445801, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 712.94921875, "completions/mean_terminated_length": 712.94921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4318511564393616, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09683507692158938, "kl": 0.141845703125, "learning_rate": 1.402269560118789e-05, "loss": 0.006, "num_tokens": 612437955.0, "reward": 2.12939453125, "reward_std": 0.14132165908813477, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 722.798828125, "completions/mean_terminated_length": 722.798828125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.43219254075275243, "frac_reward_zero_std": 0.625, "grad_norm": 0.10551480331140681, "kl": 0.144775390625, "learning_rate": 1.4011785676957423e-05, "loss": 0.0045, "num_tokens": 612895324.0, "reward": 2.11669921875, "reward_std": 0.15130965411663055, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 718.462890625, "completions/mean_terminated_length": 718.462890625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4325339250661432, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1323016327481177, "kl": 0.143798828125, "learning_rate": 1.400087005873095e-05, "loss": 0.0053, "num_tokens": 613354713.0, "reward": 2.1103515625, "reward_std": 0.23317399621009827, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05597515031695366, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 700.435546875, "completions/mean_terminated_length": 700.435546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.432875309379534, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1314408277001777, "kl": 0.149169921875, "learning_rate": 1.39899487620012e-05, "loss": 0.0225, "num_tokens": 613797000.0, "reward": 2.1298828125, "reward_std": 0.18609708547592163, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 746.125, "completions/mean_terminated_length": 746.125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4332166936929248, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09824412409323316, "kl": 0.141845703125, "learning_rate": 1.3979021802268955e-05, "loss": 0.0076, "num_tokens": 614257352.0, "reward": 2.08349609375, "reward_std": 0.15332981944084167, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 751.283203125, "completions/mean_terminated_length": 751.283203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.43355807800631563, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08456848494521416, "kl": 0.13330078125, "learning_rate": 1.396808919504303e-05, "loss": 0.0161, "num_tokens": 614719081.0, "reward": 2.0703125, "reward_std": 0.13441061973571777, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 684.302734375, "completions/mean_terminated_length": 684.302734375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.4338994623197064, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0785185038219004, "kl": 0.15283203125, "learning_rate": 1.3957150955840268e-05, "loss": 0.008, "num_tokens": 615152340.0, "reward": 2.03271484375, "reward_std": 0.08684363961219788, "rewards/accuracy_reward/mean": 0.04233871027827263, "rewards/accuracy_reward/std": 0.2015640139579773, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 748.9921875, "completions/mean_terminated_length": 748.9921875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.4342408466330972, "frac_reward_zero_std": 0.625, "grad_norm": 0.10403845097227113, "kl": 0.147705078125, "learning_rate": 1.39462071001855e-05, "loss": 0.0031, "num_tokens": 615620272.0, "reward": 2.0146484375, "reward_std": 0.12500624358654022, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 719.01953125, "completions/mean_terminated_length": 719.01953125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.434582230946488, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10964507658923169, "kl": 0.142822265625, "learning_rate": 1.3935257643611521e-05, "loss": 0.001, "num_tokens": 616065946.0, "reward": 2.0361328125, "reward_std": 0.13085219264030457, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 734.66015625, "completions/mean_terminated_length": 734.66015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4349236152598788, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10042138466515932, "kl": 0.146484375, "learning_rate": 1.3924302601659088e-05, "loss": 0.0101, "num_tokens": 616519532.0, "reward": 2.080078125, "reward_std": 0.14802411198616028, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 724.7890625, "completions/mean_terminated_length": 724.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4352649995732696, "frac_reward_zero_std": 0.5625, "grad_norm": 0.50102413868683, "kl": 0.1875, "learning_rate": 1.3913341989876876e-05, "loss": 0.0206, "num_tokens": 616964192.0, "reward": 2.06396484375, "reward_std": 0.16525918245315552, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 723.369140625, "completions/mean_terminated_length": 723.369140625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4356063838866604, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10564010032546146, "kl": 0.14794921875, "learning_rate": 1.390237582382147e-05, "loss": 0.0097, "num_tokens": 617416221.0, "reward": 2.06396484375, "reward_std": 0.15290974080562592, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 743.544921875, "completions/mean_terminated_length": 743.544921875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4359477682000512, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10809195649958775, "kl": 0.152587890625, "learning_rate": 1.3891404119057335e-05, "loss": 0.0103, "num_tokens": 617871956.0, "reward": 2.09326171875, "reward_std": 0.16202788054943085, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 751.849609375, "completions/mean_terminated_length": 751.849609375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.436289152513442, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09260092996756819, "kl": 0.146240234375, "learning_rate": 1.3880426891156799e-05, "loss": 0.01, "num_tokens": 618340295.0, "reward": 2.04833984375, "reward_std": 0.12551699578762054, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 752.669921875, "completions/mean_terminated_length": 752.669921875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.4366305368268328, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10356949091390044, "kl": 0.141845703125, "learning_rate": 1.3869444155700033e-05, "loss": 0.0095, "num_tokens": 618810846.0, "reward": 2.072265625, "reward_std": 0.15475985407829285, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 747.09765625, "completions/mean_terminated_length": 747.09765625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4369719211402236, "frac_reward_zero_std": 0.75, "grad_norm": 0.0868601413372234, "kl": 0.14111328125, "learning_rate": 1.3858455928275018e-05, "loss": 0.0062, "num_tokens": 619269712.0, "reward": 2.0537109375, "reward_std": 0.09748993813991547, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 782.8515625, "completions/mean_terminated_length": 782.8515625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.4373133054536144, "frac_reward_zero_std": 0.75, "grad_norm": 0.07818846510687122, "kl": 0.142822265625, "learning_rate": 1.3847462224477536e-05, "loss": 0.0122, "num_tokens": 619763972.0, "reward": 2.05419921875, "reward_std": 0.10337120294570923, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 710.888671875, "completions/mean_terminated_length": 710.888671875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.4376546897670052, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12159699810001272, "kl": 0.143798828125, "learning_rate": 1.3836463059911136e-05, "loss": 0.0163, "num_tokens": 620207595.0, "reward": 2.06787109375, "reward_std": 0.13302890956401825, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 796.78125, "completions/mean_terminated_length": 795.6966552734375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.437996074080396, "frac_reward_zero_std": 0.75, "grad_norm": 1.3640694557111275, "kl": 0.5654296875, "learning_rate": 1.3825458450187119e-05, "loss": 0.0242, "num_tokens": 620695947.0, "reward": 2.02880859375, "reward_std": 0.07209931313991547, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 717.423828125, "completions/mean_terminated_length": 716.4735717773438, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.4383374583937868, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3234922881114218, "kl": 0.14892578125, "learning_rate": 1.381444841092452e-05, "loss": 0.0065, "num_tokens": 621159540.0, "reward": 2.06982421875, "reward_std": 0.1805834174156189, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 791.328125, "completions/mean_terminated_length": 791.328125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.4386788427071776, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11069338750956784, "kl": 0.1337890625, "learning_rate": 1.3803432957750074e-05, "loss": 0.001, "num_tokens": 621642012.0, "reward": 2.08203125, "reward_std": 0.17460137605667114, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 767.712890625, "completions/mean_terminated_length": 767.712890625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4390202270205684, "frac_reward_zero_std": 0.625, "grad_norm": 0.10198112086459087, "kl": 0.133544921875, "learning_rate": 1.3792412106298198e-05, "loss": 0.0074, "num_tokens": 622119929.0, "reward": 2.0146484375, "reward_std": 0.12763558328151703, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 767.671875, "completions/mean_terminated_length": 767.671875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.4393616113339592, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11994176002377903, "kl": 0.134765625, "learning_rate": 1.3781385872210987e-05, "loss": 0.0081, "num_tokens": 622594609.0, "reward": 2.0498046875, "reward_std": 0.17517441511154175, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 737.19921875, "completions/mean_terminated_length": 734.634033203125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.43970299564735, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13036278306316712, "kl": 0.14111328125, "learning_rate": 1.3770354271138149e-05, "loss": 0.018, "num_tokens": 623056951.0, "reward": 2.0595703125, "reward_std": 0.18515758216381073, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 793.306640625, "completions/mean_terminated_length": 793.306640625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4400443799607408, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1144541182844824, "kl": 0.14404296875, "learning_rate": 1.3759317318737031e-05, "loss": 0.0101, "num_tokens": 623544596.0, "reward": 2.064453125, "reward_std": 0.15670624375343323, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 777.61328125, "completions/mean_terminated_length": 777.61328125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4403857642741316, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11217692482779733, "kl": 0.13671875, "learning_rate": 1.3748275030672569e-05, "loss": 0.0143, "num_tokens": 624024142.0, "reward": 2.05029296875, "reward_std": 0.14936494827270508, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 796.427734375, "completions/mean_terminated_length": 796.427734375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.4407271485875224, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10712811909330915, "kl": 0.135009765625, "learning_rate": 1.3737227422617267e-05, "loss": 0.0202, "num_tokens": 624525225.0, "reward": 2.09716796875, "reward_std": 0.1638602614402771, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 770.5234375, "completions/mean_terminated_length": 770.5234375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4410685329009132, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12252201258794644, "kl": 0.13330078125, "learning_rate": 1.3726174510251189e-05, "loss": 0.0117, "num_tokens": 625002645.0, "reward": 2.13134765625, "reward_std": 0.2145114541053772, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 788.546875, "completions/mean_terminated_length": 786.0822143554688, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.441409917214304, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10520865240730742, "kl": 0.1376953125, "learning_rate": 1.3715116309261923e-05, "loss": 0.0192, "num_tokens": 625488765.0, "reward": 2.0205078125, "reward_std": 0.12040627002716064, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 758.2109375, "completions/mean_terminated_length": 758.2109375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4417513015276948, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09038289348279592, "kl": 0.131103515625, "learning_rate": 1.3704052835344557e-05, "loss": -0.0035, "num_tokens": 625957337.0, "reward": 2.0302734375, "reward_std": 0.10662223398685455, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 746.55859375, "completions/mean_terminated_length": 746.55859375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4420926858410856, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10667694951835235, "kl": 0.135009765625, "learning_rate": 1.3692984104201672e-05, "loss": 0.0132, "num_tokens": 626421623.0, "reward": 2.09228515625, "reward_std": 0.1699477732181549, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 731.490234375, "completions/mean_terminated_length": 731.490234375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.4424340701544764, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10557234272284885, "kl": 0.13818359375, "learning_rate": 1.3681910131543308e-05, "loss": 0.0077, "num_tokens": 626889602.0, "reward": 2.07568359375, "reward_std": 0.1689925342798233, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 706.669921875, "completions/mean_terminated_length": 706.669921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4427754544678672, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13316228433107583, "kl": 0.1474609375, "learning_rate": 1.3670830933086941e-05, "loss": 0.0025, "num_tokens": 627332777.0, "reward": 2.04833984375, "reward_std": 0.1635630577802658, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 783.009765625, "completions/mean_terminated_length": 783.009765625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.443116838781258, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11479972904796651, "kl": 0.130615234375, "learning_rate": 1.3659746524557468e-05, "loss": 0.0176, "num_tokens": 627816830.0, "reward": 2.11572265625, "reward_std": 0.2114834487438202, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 723.73046875, "completions/mean_terminated_length": 723.73046875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4434582230946488, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12684434658711366, "kl": 0.135009765625, "learning_rate": 1.3648656921687182e-05, "loss": 0.0266, "num_tokens": 628265396.0, "reward": 2.0439453125, "reward_std": 0.18058337271213531, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293970108032227, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 690.72265625, "completions/mean_terminated_length": 690.72265625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4437996074080396, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13005092884237765, "kl": 0.1396484375, "learning_rate": 1.3637562140215746e-05, "loss": 0.0037, "num_tokens": 628703526.0, "reward": 2.08544921875, "reward_std": 0.17970658838748932, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 771.919921875, "completions/mean_terminated_length": 771.919921875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4441409917214304, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09963154938125991, "kl": 0.129638671875, "learning_rate": 1.3626462195890169e-05, "loss": 0.0141, "num_tokens": 629190701.0, "reward": 2.05712890625, "reward_std": 0.14664380252361298, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 744.9765625, "completions/mean_terminated_length": 744.9765625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4444823760348212, "frac_reward_zero_std": 0.5, "grad_norm": 0.11766414512295591, "kl": 0.12939453125, "learning_rate": 1.3615357104464794e-05, "loss": 0.0036, "num_tokens": 629659457.0, "reward": 2.10302734375, "reward_std": 0.22067588567733765, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 767.701171875, "completions/mean_terminated_length": 767.701171875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.444823760348212, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0910084481814581, "kl": 0.1396484375, "learning_rate": 1.3604246881701269e-05, "loss": 0.0125, "num_tokens": 630133896.0, "reward": 2.04052734375, "reward_std": 0.12136983126401901, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 751.162109375, "completions/mean_terminated_length": 751.162109375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4451651446616028, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13704934865373022, "kl": 0.134765625, "learning_rate": 1.3593131543368517e-05, "loss": 0.0092, "num_tokens": 630599995.0, "reward": 2.09375, "reward_std": 0.20178626477718353, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 781.38671875, "completions/mean_terminated_length": 781.38671875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4455065289749936, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10339754789745494, "kl": 0.13330078125, "learning_rate": 1.3582011105242734e-05, "loss": 0.0105, "num_tokens": 631078849.0, "reward": 2.09423828125, "reward_std": 0.15049782395362854, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 766.390625, "completions/mean_terminated_length": 766.390625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4458479132883844, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1192803389446433, "kl": 0.136962890625, "learning_rate": 1.3570885583107347e-05, "loss": 0.0055, "num_tokens": 631551913.0, "reward": 2.0361328125, "reward_std": 0.1454104483127594, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 833.55078125, "completions/mean_terminated_length": 833.55078125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4461892976017752, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11007605032925091, "kl": 0.135986328125, "learning_rate": 1.3559754992752998e-05, "loss": 0.0085, "num_tokens": 632065027.0, "reward": 2.04296875, "reward_std": 0.14658331871032715, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 839.701171875, "completions/mean_terminated_length": 839.701171875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.446530681915166, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10441675057721249, "kl": 0.130126953125, "learning_rate": 1.3548619349977525e-05, "loss": 0.0285, "num_tokens": 632576202.0, "reward": 2.05908203125, "reward_std": 0.1574922651052475, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 751.857421875, "completions/mean_terminated_length": 751.857421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.4468720662285568, "frac_reward_zero_std": 0.5, "grad_norm": 0.11802840346433563, "kl": 0.137451171875, "learning_rate": 1.3537478670585937e-05, "loss": 0.0164, "num_tokens": 633041521.0, "reward": 2.10595703125, "reward_std": 0.19249184429645538, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 725.703125, "completions/mean_terminated_length": 725.703125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4472134505419476, "frac_reward_zero_std": 0.625, "grad_norm": 0.10194040528461565, "kl": 0.138671875, "learning_rate": 1.352633297039039e-05, "loss": 0.0057, "num_tokens": 633508057.0, "reward": 2.0263671875, "reward_std": 0.14546069502830505, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 887.216796875, "completions/mean_terminated_length": 887.216796875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.4475548348553384, "frac_reward_zero_std": 0.5, "grad_norm": 0.10973137917669365, "kl": 0.13037109375, "learning_rate": 1.3515182265210164e-05, "loss": 0.005, "num_tokens": 634053960.0, "reward": 2.0556640625, "reward_std": 0.18812838196754456, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 772.38671875, "completions/mean_terminated_length": 772.38671875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4478962191687292, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11711280301139776, "kl": 0.135986328125, "learning_rate": 1.3504026570871649e-05, "loss": 0.0172, "num_tokens": 634538158.0, "reward": 2.10986328125, "reward_std": 0.20386144518852234, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 709.076171875, "completions/mean_terminated_length": 707.5577392578125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.44823760348212, "frac_reward_zero_std": 0.625, "grad_norm": 0.604743316387148, "kl": 0.197509765625, "learning_rate": 1.3492865903208311e-05, "loss": 0.0181, "num_tokens": 634978805.0, "reward": 2.0625, "reward_std": 0.15357205271720886, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 754.501953125, "completions/mean_terminated_length": 754.501953125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.4485789877955108, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11081506460940449, "kl": 0.14306640625, "learning_rate": 1.3481700278060681e-05, "loss": 0.0114, "num_tokens": 635450582.0, "reward": 2.05908203125, "reward_std": 0.14974841475486755, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03651979938149452, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 797.736328125, "completions/mean_terminated_length": 797.736328125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4489203721089016, "frac_reward_zero_std": 0.625, "grad_norm": 0.11105132164447885, "kl": 0.13330078125, "learning_rate": 1.3470529711276317e-05, "loss": 0.006, "num_tokens": 635954127.0, "reward": 2.07470703125, "reward_std": 0.12968049943447113, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 746.470703125, "completions/mean_terminated_length": 746.470703125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4492617564222924, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1352690597759401, "kl": 0.140869140625, "learning_rate": 1.3459354218709795e-05, "loss": 0.0156, "num_tokens": 636424816.0, "reward": 2.09423828125, "reward_std": 0.20029735565185547, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03651979938149452, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1517.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 704.197265625, "completions/mean_terminated_length": 702.75146484375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4496031407356832, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6419037797984211, "kl": 0.197021484375, "learning_rate": 1.3448173816222685e-05, "loss": 0.0276, "num_tokens": 636863381.0, "reward": 2.09765625, "reward_std": 0.1541491150856018, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.051540303975343704, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 746.23046875, "completions/mean_terminated_length": 746.23046875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.449944525049074, "frac_reward_zero_std": 0.625, "grad_norm": 0.10675660906684999, "kl": 0.132568359375, "learning_rate": 1.343698851968352e-05, "loss": 0.0093, "num_tokens": 637323995.0, "reward": 2.056640625, "reward_std": 0.1268172413110733, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 659.44921875, "completions/mean_terminated_length": 659.44921875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4502859093624648, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11493765325708068, "kl": 0.143310546875, "learning_rate": 1.3425798344967787e-05, "loss": 0.0113, "num_tokens": 637746849.0, "reward": 2.0283203125, "reward_std": 0.15502266585826874, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04666558653116226, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 689.33203125, "completions/mean_terminated_length": 689.33203125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.4506272936758556, "frac_reward_zero_std": 0.53125, "grad_norm": 0.136670016206844, "kl": 0.14111328125, "learning_rate": 1.341460330795789e-05, "loss": 0.0201, "num_tokens": 638173547.0, "reward": 2.044921875, "reward_std": 0.16070668399333954, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 685.58984375, "completions/mean_terminated_length": 682.9236450195312, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.4509686779892464, "frac_reward_zero_std": 0.28125, "grad_norm": 0.16354300103745628, "kl": 0.14404296875, "learning_rate": 1.3403403424543138e-05, "loss": 0.0346, "num_tokens": 638611673.0, "reward": 2.0458984375, "reward_std": 0.2246732860803604, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05811915174126625, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 734.41796875, "completions/mean_terminated_length": 734.41796875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4513100623026372, "frac_reward_zero_std": 0.5, "grad_norm": 0.13479385727149004, "kl": 0.135498046875, "learning_rate": 1.3392198710619716e-05, "loss": 0.0205, "num_tokens": 639065839.0, "reward": 2.0556640625, "reward_std": 0.17025014758110046, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 764.83984375, "completions/mean_terminated_length": 764.83984375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.451651446616028, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12004573257666998, "kl": 0.122802734375, "learning_rate": 1.3380989182090662e-05, "loss": 0.0088, "num_tokens": 639549165.0, "reward": 2.1044921875, "reward_std": 0.1813865453004837, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 664.169921875, "completions/mean_terminated_length": 661.4618530273438, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4519928309294188, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1102223714704991, "kl": 0.1318359375, "learning_rate": 1.336977485486586e-05, "loss": 0.0127, "num_tokens": 639965828.0, "reward": 2.12939453125, "reward_std": 0.17097508907318115, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 735.83203125, "completions/mean_terminated_length": 734.7044677734375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4523342152428096, "frac_reward_zero_std": 0.71875, "grad_norm": 0.3998679524121744, "kl": 0.193359375, "learning_rate": 1.335855574486199e-05, "loss": 0.0184, "num_tokens": 640428462.0, "reward": 2.0224609375, "reward_std": 0.1095648854970932, "rewards/accuracy_reward/mean": 0.03629032149910927, "rewards/accuracy_reward/std": 0.1872003823518753, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 713.87890625, "completions/mean_terminated_length": 713.87890625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.4526755995562004, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1289888892858963, "kl": 0.13916015625, "learning_rate": 1.3347331868002527e-05, "loss": 0.0283, "num_tokens": 640876320.0, "reward": 2.14990234375, "reward_std": 0.21471744775772095, "rewards/accuracy_reward/mean": 0.17943547666072845, "rewards/accuracy_reward/std": 0.3841039538383484, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 686.46484375, "completions/mean_terminated_length": 686.46484375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4530169838695912, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11061822569606146, "kl": 0.132568359375, "learning_rate": 1.3336103240217716e-05, "loss": 0.0144, "num_tokens": 641311086.0, "reward": 2.08837890625, "reward_std": 0.14502468705177307, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 778.28125, "completions/mean_terminated_length": 778.28125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.453358368182982, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09556484073040858, "kl": 0.127197265625, "learning_rate": 1.3324869877444536e-05, "loss": 0.0176, "num_tokens": 641800158.0, "reward": 2.07666015625, "reward_std": 0.12105336785316467, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 807.037109375, "completions/mean_terminated_length": 802.170654296875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4536997524963728, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11534635763729871, "kl": 0.11962890625, "learning_rate": 1.3313631795626691e-05, "loss": 0.0301, "num_tokens": 642296129.0, "reward": 2.02392578125, "reward_std": 0.15029552578926086, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 754.572265625, "completions/mean_terminated_length": 754.572265625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4540411368097636, "frac_reward_zero_std": 0.5, "grad_norm": 0.12343980800404916, "kl": 0.135986328125, "learning_rate": 1.3302389010714582e-05, "loss": 0.0098, "num_tokens": 642752198.0, "reward": 2.12158203125, "reward_std": 0.19590556621551514, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 729.90234375, "completions/mean_terminated_length": 729.90234375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4543825211231544, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09031289659771391, "kl": 0.136474609375, "learning_rate": 1.3291141538665291e-05, "loss": 0.0162, "num_tokens": 643213700.0, "reward": 2.08935546875, "reward_std": 0.11902754008769989, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 719.525390625, "completions/mean_terminated_length": 719.525390625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.4547239054365452, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12067283407790044, "kl": 0.142578125, "learning_rate": 1.3279889395442542e-05, "loss": 0.0161, "num_tokens": 643670689.0, "reward": 2.02197265625, "reward_std": 0.1331167221069336, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 717.576171875, "completions/mean_terminated_length": 717.576171875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.455065289749936, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11916603078421245, "kl": 0.148681640625, "learning_rate": 1.3268632597016695e-05, "loss": 0.0109, "num_tokens": 644126952.0, "reward": 2.07666015625, "reward_std": 0.1490759253501892, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 727.109375, "completions/mean_terminated_length": 727.109375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4554066740633268, "frac_reward_zero_std": 0.625, "grad_norm": 0.12387868404131037, "kl": 0.142578125, "learning_rate": 1.3257371159364723e-05, "loss": 0.013, "num_tokens": 644580320.0, "reward": 2.04736328125, "reward_std": 0.1354910433292389, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 733.837890625, "completions/mean_terminated_length": 733.837890625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4557480583767176, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11287070958015381, "kl": 0.140380859375, "learning_rate": 1.3246105098470166e-05, "loss": 0.0167, "num_tokens": 645033981.0, "reward": 2.09619140625, "reward_std": 0.15930357575416565, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 754.244140625, "completions/mean_terminated_length": 754.244140625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.45608944269010837, "frac_reward_zero_std": 0.625, "grad_norm": 0.11673940947840623, "kl": 0.140380859375, "learning_rate": 1.3234834430323144e-05, "loss": 0.0139, "num_tokens": 645508394.0, "reward": 2.01708984375, "reward_std": 0.12414534389972687, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686307430267334, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 679.619140625, "completions/mean_terminated_length": 679.619140625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4564308270034992, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12130954826000719, "kl": 0.148193359375, "learning_rate": 1.322355917092031e-05, "loss": 0.0186, "num_tokens": 645941367.0, "reward": 2.07421875, "reward_std": 0.18037612736225128, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.05361218377947807, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 767.98828125, "completions/mean_terminated_length": 767.98828125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.45677221131689, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12149539392129369, "kl": 0.139892578125, "learning_rate": 1.3212279336264824e-05, "loss": 0.0197, "num_tokens": 646416929.0, "reward": 2.08251953125, "reward_std": 0.18328940868377686, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 750.623046875, "completions/mean_terminated_length": 748.641845703125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4571135956302808, "frac_reward_zero_std": 0.5, "grad_norm": 5.522701363079265, "kl": 1.43359375, "learning_rate": 1.3200994942366358e-05, "loss": 0.0765, "num_tokens": 646875024.0, "reward": 2.1181640625, "reward_std": 0.21051523089408875, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 729.298828125, "completions/mean_terminated_length": 729.298828125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.45745497994367157, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11209099980904855, "kl": 0.1328125, "learning_rate": 1.3189706005241043e-05, "loss": 0.0054, "num_tokens": 647336153.0, "reward": 2.060546875, "reward_std": 0.1382061392068863, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 755.232421875, "completions/mean_terminated_length": 755.232421875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4577963642570624, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09377108734210092, "kl": 0.134765625, "learning_rate": 1.3178412540911456e-05, "loss": 0.008, "num_tokens": 647808832.0, "reward": 2.021484375, "reward_std": 0.08577118813991547, "rewards/accuracy_reward/mean": 0.03541666641831398, "rewards/accuracy_reward/std": 0.18502336740493774, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 846.275390625, "completions/mean_terminated_length": 843.9236450195312, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.4581377485704532, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10963141159863411, "kl": 0.134765625, "learning_rate": 1.3167114565406606e-05, "loss": 0.0194, "num_tokens": 648332845.0, "reward": 2.08251953125, "reward_std": 0.1657879650592804, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 857.26953125, "completions/mean_terminated_length": 854.9393310546875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.458479132883844, "frac_reward_zero_std": 0.5, "grad_norm": 0.12028118588843667, "kl": 0.1328125, "learning_rate": 1.3155812094761909e-05, "loss": 0.0182, "num_tokens": 648850487.0, "reward": 2.03955078125, "reward_std": 0.15923961997032166, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 799.99609375, "completions/mean_terminated_length": 799.99609375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.45882051719723477, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12230867408869066, "kl": 0.13525390625, "learning_rate": 1.3144505145019143e-05, "loss": 0.0217, "num_tokens": 649342629.0, "reward": 2.0849609375, "reward_std": 0.19504490494728088, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 808.8671875, "completions/mean_terminated_length": 808.8671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.4591619015106256, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08925557852186468, "kl": 0.129638671875, "learning_rate": 1.3133193732226463e-05, "loss": 0.0171, "num_tokens": 649833089.0, "reward": 2.076171875, "reward_std": 0.13948200643062592, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 814.384765625, "completions/mean_terminated_length": 814.384765625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.4595032858240164, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09055416223445215, "kl": 0.127197265625, "learning_rate": 1.3121877872438353e-05, "loss": 0.0014, "num_tokens": 650327830.0, "reward": 2.01611328125, "reward_std": 0.09698759019374847, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 795.884765625, "completions/mean_terminated_length": 795.884765625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.4598446701374072, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09733858848057597, "kl": 0.12841796875, "learning_rate": 1.31105575817156e-05, "loss": 0.012, "num_tokens": 650829547.0, "reward": 2.07958984375, "reward_std": 0.14305154979228973, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 731.541015625, "completions/mean_terminated_length": 730.7260131835938, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.46018605445079797, "frac_reward_zero_std": 0.46875, "grad_norm": 0.35643122367664826, "kl": 0.359619140625, "learning_rate": 1.3099232876125287e-05, "loss": 0.0334, "num_tokens": 651283440.0, "reward": 2.07080078125, "reward_std": 0.18637287616729736, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 744.732421875, "completions/mean_terminated_length": 744.391357421875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4605274387641888, "frac_reward_zero_std": 0.46875, "grad_norm": 0.9098245926762962, "kl": 0.61083984375, "learning_rate": 1.3087903771740767e-05, "loss": 0.0257, "num_tokens": 651751415.0, "reward": 2.0146484375, "reward_std": 0.1709347665309906, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 796.0234375, "completions/mean_terminated_length": 795.2720336914062, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.4608688230775796, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4393266980662319, "kl": 0.17724609375, "learning_rate": 1.3076570284641625e-05, "loss": 0.0266, "num_tokens": 652250339.0, "reward": 2.041015625, "reward_std": 0.1671517938375473, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 782.576171875, "completions/mean_terminated_length": 782.576171875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4612102073909704, "frac_reward_zero_std": 0.5, "grad_norm": 0.10772342522994058, "kl": 0.128173828125, "learning_rate": 1.3065232430913677e-05, "loss": 0.0083, "num_tokens": 652731834.0, "reward": 2.06787109375, "reward_std": 0.19128571450710297, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 726.78125, "completions/mean_terminated_length": 726.78125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.46155159170436116, "frac_reward_zero_std": 0.5, "grad_norm": 0.12590648515873745, "kl": 0.1298828125, "learning_rate": 1.3053890226648934e-05, "loss": 0.0144, "num_tokens": 653187066.0, "reward": 2.11767578125, "reward_std": 0.19094544649124146, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 780.037109375, "completions/mean_terminated_length": 778.994140625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.461892976017752, "frac_reward_zero_std": 0.59375, "grad_norm": 5.601661132509308, "kl": 1.42431640625, "learning_rate": 1.3042543687945574e-05, "loss": 0.073, "num_tokens": 653678605.0, "reward": 2.0361328125, "reward_std": 0.1438729465007782, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 803.203125, "completions/mean_terminated_length": 803.203125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4622343603311428, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08762523309079999, "kl": 0.120361328125, "learning_rate": 1.3031192830907942e-05, "loss": 0.006, "num_tokens": 654178293.0, "reward": 2.01171875, "reward_std": 0.09773427248001099, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 812.0625, "completions/mean_terminated_length": 809.6438598632812, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4625757446445336, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10430450827734056, "kl": 0.1280517578125, "learning_rate": 1.30198376716465e-05, "loss": 0.0263, "num_tokens": 654673669.0, "reward": 2.0830078125, "reward_std": 0.1471654176712036, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 765.91015625, "completions/mean_terminated_length": 765.91015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.46291712895792436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11006908822617988, "kl": 0.125732421875, "learning_rate": 1.3008478226277817e-05, "loss": 0.013, "num_tokens": 655151895.0, "reward": 1.9931640625, "reward_std": 0.11523006856441498, "rewards/accuracy_reward/mean": 0.013671875, "rewards/accuracy_reward/std": 0.1162383034825325, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 783.85546875, "completions/mean_terminated_length": 783.85546875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.4632585132713152, "frac_reward_zero_std": 0.625, "grad_norm": 0.10824349369850862, "kl": 0.127197265625, "learning_rate": 1.2997114510924549e-05, "loss": 0.0124, "num_tokens": 655631613.0, "reward": 2.04736328125, "reward_std": 0.1373661607503891, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 784.80078125, "completions/mean_terminated_length": 784.80078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.463599897584706, "frac_reward_zero_std": 0.625, "grad_norm": 0.10784487352819776, "kl": 0.1298828125, "learning_rate": 1.2985746541715414e-05, "loss": 0.0109, "num_tokens": 656123623.0, "reward": 2.103515625, "reward_std": 0.13594770431518555, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 737.380859375, "completions/mean_terminated_length": 737.380859375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.4639412818980968, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1016044159573091, "kl": 0.138916015625, "learning_rate": 1.2974374334785162e-05, "loss": 0.0059, "num_tokens": 656593178.0, "reward": 2.048828125, "reward_std": 0.10273082554340363, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 787.791015625, "completions/mean_terminated_length": 787.4833374023438, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.46428266621148756, "frac_reward_zero_std": 0.46875, "grad_norm": 0.47805689326229445, "kl": 0.13232421875, "learning_rate": 1.2962997906274563e-05, "loss": 0.0109, "num_tokens": 657103263.0, "reward": 2.05517578125, "reward_std": 0.18056881427764893, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 778.9296875, "completions/mean_terminated_length": 778.9296875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4646240505248784, "frac_reward_zero_std": 0.5, "grad_norm": 0.10633500177886639, "kl": 0.1234130859375, "learning_rate": 1.2951617272330377e-05, "loss": 0.0169, "num_tokens": 657589531.0, "reward": 2.07470703125, "reward_std": 0.18924403190612793, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 739.2421875, "completions/mean_terminated_length": 739.2421875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.4649654348382692, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11393174495319189, "kl": 0.137939453125, "learning_rate": 1.294023244910533e-05, "loss": 0.0129, "num_tokens": 658057575.0, "reward": 2.0615234375, "reward_std": 0.16483503580093384, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 731.849609375, "completions/mean_terminated_length": 731.849609375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.46530681915166, "frac_reward_zero_std": 0.59375, "grad_norm": 0.31716227854212037, "kl": 0.1435546875, "learning_rate": 1.2928843452758097e-05, "loss": 0.0148, "num_tokens": 658520730.0, "reward": 2.05322265625, "reward_std": 0.15573008358478546, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 685.275390625, "completions/mean_terminated_length": 685.275390625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.46564820346505076, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12038602884401006, "kl": 0.140625, "learning_rate": 1.2917450299453278e-05, "loss": 0.0074, "num_tokens": 658957511.0, "reward": 2.02490234375, "reward_std": 0.13523375988006592, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 701.42578125, "completions/mean_terminated_length": 701.42578125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.46598958777844157, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10526046503194103, "kl": 0.1455078125, "learning_rate": 1.2906053005361366e-05, "loss": 0.0041, "num_tokens": 659405169.0, "reward": 2.08203125, "reward_std": 0.14524486660957336, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 714.046875, "completions/mean_terminated_length": 714.046875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.4663309720918324, "frac_reward_zero_std": 0.625, "grad_norm": 0.10433649001447108, "kl": 0.140869140625, "learning_rate": 1.2894651586658736e-05, "loss": 0.0132, "num_tokens": 659851033.0, "reward": 2.0576171875, "reward_std": 0.12522368133068085, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 683.03125, "completions/mean_terminated_length": 683.03125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4666723564052232, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10947579235754248, "kl": 0.141845703125, "learning_rate": 1.2883246059527619e-05, "loss": 0.0106, "num_tokens": 660276793.0, "reward": 2.1162109375, "reward_std": 0.15700995922088623, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 715.275390625, "completions/mean_terminated_length": 715.275390625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.46701374071861396, "frac_reward_zero_std": 0.625, "grad_norm": 0.1199278286855341, "kl": 0.144775390625, "learning_rate": 1.2871836440156076e-05, "loss": 0.0124, "num_tokens": 660721126.0, "reward": 2.00927734375, "reward_std": 0.12583386898040771, "rewards/accuracy_reward/mean": 0.03427419438958168, "rewards/accuracy_reward/std": 0.18211629986763, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 689.421875, "completions/mean_terminated_length": 689.421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.46735512503200477, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11957498435038606, "kl": 0.149169921875, "learning_rate": 1.2860422744737967e-05, "loss": 0.0161, "num_tokens": 661157134.0, "reward": 2.037109375, "reward_std": 0.13582998514175415, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23102475702762604, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 690.81640625, "completions/mean_terminated_length": 690.81640625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4676965093453956, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11356918434138939, "kl": 0.148681640625, "learning_rate": 1.2849004989472955e-05, "loss": 0.0064, "num_tokens": 661601072.0, "reward": 2.056640625, "reward_std": 0.13674986362457275, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 665.39453125, "completions/mean_terminated_length": 665.39453125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4680378936587864, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13764005497066795, "kl": 0.1552734375, "learning_rate": 1.2837583190566447e-05, "loss": 0.0161, "num_tokens": 662017578.0, "reward": 2.05859375, "reward_std": 0.2201385796070099, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 719.92578125, "completions/mean_terminated_length": 719.92578125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.46837927797217715, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10980202864824798, "kl": 0.146728515625, "learning_rate": 1.2826157364229604e-05, "loss": 0.0092, "num_tokens": 662459812.0, "reward": 2.0498046875, "reward_std": 0.14599281549453735, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 716.880859375, "completions/mean_terminated_length": 716.880859375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.46872066228556797, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12947473848544563, "kl": 0.15087890625, "learning_rate": 1.2814727526679291e-05, "loss": 0.014, "num_tokens": 662904055.0, "reward": 1.9873046875, "reward_std": 0.18890196084976196, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.06760437041521072, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 723.30859375, "completions/mean_terminated_length": 723.30859375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4690620465989588, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13149374982416168, "kl": 0.154296875, "learning_rate": 1.2803293694138077e-05, "loss": 0.0201, "num_tokens": 663355573.0, "reward": 2.04345703125, "reward_std": 0.15028482675552368, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 737.638671875, "completions/mean_terminated_length": 737.638671875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.4694034309123496, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09590629984462792, "kl": 0.15380859375, "learning_rate": 1.2791855882834195e-05, "loss": 0.0025, "num_tokens": 663820108.0, "reward": 2.046875, "reward_std": 0.13260646164417267, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 751.634765625, "completions/mean_terminated_length": 751.634765625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.46974481522574035, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12168153204392887, "kl": 0.152587890625, "learning_rate": 1.278041410900152e-05, "loss": 0.0104, "num_tokens": 664288033.0, "reward": 2.04833984375, "reward_std": 0.16126099228858948, "rewards/accuracy_reward/mean": 0.06451612710952759, "rewards/accuracy_reward/std": 0.2459181249141693, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 708.94140625, "completions/mean_terminated_length": 708.94140625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.47008619953913117, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10751653019560387, "kl": 0.14990234375, "learning_rate": 1.2768968388879568e-05, "loss": 0.0046, "num_tokens": 664730387.0, "reward": 2.08642578125, "reward_std": 0.12043101340532303, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 788.494140625, "completions/mean_terminated_length": 787.136962890625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.470427583852522, "frac_reward_zero_std": 0.625, "grad_norm": 0.44814578494409846, "kl": 0.2109375, "learning_rate": 1.2757518738713432e-05, "loss": 0.0138, "num_tokens": 665220528.0, "reward": 2.03125, "reward_std": 0.11268772184848785, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 737.638671875, "completions/mean_terminated_length": 737.638671875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4707689681659128, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1218434037849159, "kl": 0.150390625, "learning_rate": 1.2746065174753808e-05, "loss": 0.0175, "num_tokens": 665678359.0, "reward": 2.0126953125, "reward_std": 0.14970192313194275, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 713.509765625, "completions/mean_terminated_length": 713.509765625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.47111035247930355, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1131419663259381, "kl": 0.147705078125, "learning_rate": 1.273460771325693e-05, "loss": 0.0102, "num_tokens": 666139580.0, "reward": 2.072265625, "reward_std": 0.1354348212480545, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 654.763671875, "completions/mean_terminated_length": 654.763671875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.47145173679269436, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12566380559598725, "kl": 0.1572265625, "learning_rate": 1.2723146370484569e-05, "loss": 0.0165, "num_tokens": 666566179.0, "reward": 2.119140625, "reward_std": 0.18576443195343018, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939760684967, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 736.671875, "completions/mean_terminated_length": 736.671875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.4717931211060852, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11016344869258989, "kl": 0.144287109375, "learning_rate": 1.2711681162704006e-05, "loss": 0.0188, "num_tokens": 667023787.0, "reward": 2.0439453125, "reward_std": 0.1414516568183899, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 740.3984375, "completions/mean_terminated_length": 740.3984375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.472134505419476, "frac_reward_zero_std": 0.625, "grad_norm": 0.10874248800336263, "kl": 0.149169921875, "learning_rate": 1.2700212106188008e-05, "loss": 0.0141, "num_tokens": 667478887.0, "reward": 2.04296875, "reward_std": 0.1476689875125885, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 658.34765625, "completions/mean_terminated_length": 657.7944946289062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.47247588973286675, "frac_reward_zero_std": 0.625, "grad_norm": 0.5175579373086712, "kl": 0.182861328125, "learning_rate": 1.2688739217214799e-05, "loss": 0.0185, "num_tokens": 667896425.0, "reward": 2.09228515625, "reward_std": 0.12420445680618286, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 640.2734375, "completions/mean_terminated_length": 640.2734375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.47281727404625756, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11297887153387592, "kl": 0.157470703125, "learning_rate": 1.2677262512068045e-05, "loss": 0.0083, "num_tokens": 668314949.0, "reward": 2.08544921875, "reward_std": 0.11170843988656998, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 690.791015625, "completions/mean_terminated_length": 690.791015625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4731586583596484, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10486668028831057, "kl": 0.150634765625, "learning_rate": 1.2665782007036835e-05, "loss": 0.0115, "num_tokens": 668748986.0, "reward": 2.0908203125, "reward_std": 0.13393868505954742, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 699.09765625, "completions/mean_terminated_length": 699.09765625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4735000426730392, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10887440834753838, "kl": 0.14208984375, "learning_rate": 1.2654297718415646e-05, "loss": 0.0126, "num_tokens": 669190796.0, "reward": 2.05126953125, "reward_std": 0.11842407286167145, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 673.078125, "completions/mean_terminated_length": 673.078125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.47384142698642995, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1307570218318361, "kl": 0.1484375, "learning_rate": 1.2642809662504321e-05, "loss": 0.0089, "num_tokens": 669614788.0, "reward": 2.14306640625, "reward_std": 0.15781265497207642, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 665.296875, "completions/mean_terminated_length": 665.296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.47418281129982076, "frac_reward_zero_std": 0.625, "grad_norm": 0.12046591292152847, "kl": 0.146728515625, "learning_rate": 1.263131785560806e-05, "loss": 0.0156, "num_tokens": 670043500.0, "reward": 2.08837890625, "reward_std": 0.1401423215866089, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 664.501953125, "completions/mean_terminated_length": 663.5831909179688, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4745241956132116, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3281499467563628, "kl": 0.2216796875, "learning_rate": 1.2619822314037376e-05, "loss": 0.0318, "num_tokens": 670461101.0, "reward": 2.0751953125, "reward_std": 0.12599295377731323, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 682.84765625, "completions/mean_terminated_length": 682.4168090820312, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4748655799266024, "frac_reward_zero_std": 0.625, "grad_norm": 0.8241996699430898, "kl": 0.342529296875, "learning_rate": 1.2608323054108089e-05, "loss": 0.0158, "num_tokens": 670899743.0, "reward": 2.0791015625, "reward_std": 0.15216538310050964, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 718.23828125, "completions/mean_terminated_length": 718.23828125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.47520696423999315, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1033010282662039, "kl": 0.141357421875, "learning_rate": 1.2596820092141295e-05, "loss": 0.0037, "num_tokens": 671352409.0, "reward": 2.037109375, "reward_std": 0.12069705873727798, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 710.130859375, "completions/mean_terminated_length": 710.130859375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.47554834855338396, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11157675589444745, "kl": 0.136474609375, "learning_rate": 1.2585313444463344e-05, "loss": 0.0137, "num_tokens": 671803292.0, "reward": 2.05419921875, "reward_std": 0.1380205750465393, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 717.228515625, "completions/mean_terminated_length": 717.228515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4758897328667748, "frac_reward_zero_std": 0.625, "grad_norm": 0.10534632003419955, "kl": 0.137451171875, "learning_rate": 1.2573803127405822e-05, "loss": 0.0244, "num_tokens": 672249729.0, "reward": 2.06591796875, "reward_std": 0.13340042531490326, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 733.5390625, "completions/mean_terminated_length": 733.5390625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4762311171801656, "frac_reward_zero_std": 0.5, "grad_norm": 0.13102288908754053, "kl": 0.13525390625, "learning_rate": 1.2562289157305514e-05, "loss": 0.0162, "num_tokens": 672710181.0, "reward": 2.0595703125, "reward_std": 0.16060984134674072, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 800.923828125, "completions/mean_terminated_length": 799.6477661132812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.47657250149355634, "frac_reward_zero_std": 0.46875, "grad_norm": 0.20912406504731834, "kl": 0.2646484375, "learning_rate": 1.2550771550504398e-05, "loss": 0.0225, "num_tokens": 673208382.0, "reward": 2.03955078125, "reward_std": 0.17146630585193634, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 822.044921875, "completions/mean_terminated_length": 814.8192749023438, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.47691388580694716, "frac_reward_zero_std": 0.625, "grad_norm": 0.10208058762065775, "kl": 0.134521484375, "learning_rate": 1.2539250323349605e-05, "loss": 0.0109, "num_tokens": 673709141.0, "reward": 2.04736328125, "reward_std": 0.14394478499889374, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 789.01171875, "completions/mean_terminated_length": 789.01171875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.47725527012033797, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12386598977625787, "kl": 0.13916015625, "learning_rate": 1.2527725492193414e-05, "loss": 0.0212, "num_tokens": 674190267.0, "reward": 2.0517578125, "reward_std": 0.15521474182605743, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 823.083984375, "completions/mean_terminated_length": 821.75537109375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4775966544337288, "frac_reward_zero_std": 0.65625, "grad_norm": 0.368055883746116, "kl": 0.25927734375, "learning_rate": 1.2516197073393217e-05, "loss": 0.0283, "num_tokens": 674709942.0, "reward": 2.07861328125, "reward_std": 0.11668345332145691, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 886.318359375, "completions/mean_terminated_length": 886.318359375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.47793803874711954, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09778772203368716, "kl": 0.13671875, "learning_rate": 1.250466508331149e-05, "loss": 0.0086, "num_tokens": 675246361.0, "reward": 2.0615234375, "reward_std": 0.1641511619091034, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 815.98828125, "completions/mean_terminated_length": 815.3580932617188, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.47827942306051036, "frac_reward_zero_std": 0.5625, "grad_norm": 0.23094092763198898, "kl": 0.20166015625, "learning_rate": 1.249312953831579e-05, "loss": 0.023, "num_tokens": 675745603.0, "reward": 2.0576171875, "reward_std": 0.14878427982330322, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 828.044921875, "completions/mean_terminated_length": 828.044921875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.47862080737390117, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12369421815943714, "kl": 0.146484375, "learning_rate": 1.2481590454778707e-05, "loss": 0.0224, "num_tokens": 676259658.0, "reward": 2.06884765625, "reward_std": 0.13193371891975403, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 828.13671875, "completions/mean_terminated_length": 828.13671875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.478962191687292, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08725018094180961, "kl": 0.151123046875, "learning_rate": 1.2470047849077864e-05, "loss": 0.0043, "num_tokens": 676769216.0, "reward": 2.005859375, "reward_std": 0.09573635458946228, "rewards/accuracy_reward/mean": 0.02016128972172737, "rewards/accuracy_reward/std": 0.14069372415542603, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 827.76953125, "completions/mean_terminated_length": 827.76953125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4793035760006828, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10153383967715773, "kl": 0.150146484375, "learning_rate": 1.2458501737595876e-05, "loss": 0.0205, "num_tokens": 677267962.0, "reward": 2.04443359375, "reward_std": 0.12584424018859863, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 819.791015625, "completions/mean_terminated_length": 819.791015625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.47964496031407355, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09403142023098786, "kl": 0.15087890625, "learning_rate": 1.2446952136720338e-05, "loss": 0.0186, "num_tokens": 677772847.0, "reward": 2.06689453125, "reward_std": 0.09169875085353851, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 780.587890625, "completions/mean_terminated_length": 780.587890625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.47998634462746437, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14868480231800701, "kl": 0.16259765625, "learning_rate": 1.2435399062843795e-05, "loss": 0.0046, "num_tokens": 678257004.0, "reward": 2.07861328125, "reward_std": 0.22086301445960999, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 776.822265625, "completions/mean_terminated_length": 776.822265625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4803277289408552, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12429890331624088, "kl": 0.15625, "learning_rate": 1.2423842532363725e-05, "loss": 0.0111, "num_tokens": 678735377.0, "reward": 2.05615234375, "reward_std": 0.14892084896564484, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 734.263671875, "completions/mean_terminated_length": 733.5459594726562, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.480669113254246, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09373975379710041, "kl": 0.14697265625, "learning_rate": 1.2412282561682506e-05, "loss": 0.0118, "num_tokens": 679192792.0, "reward": 2.08837890625, "reward_std": 0.1261497139930725, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 729.03515625, "completions/mean_terminated_length": 729.03515625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.48101049756763675, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11245728415322633, "kl": 0.157958984375, "learning_rate": 1.2400719167207404e-05, "loss": 0.0155, "num_tokens": 679650826.0, "reward": 2.0869140625, "reward_std": 0.12489424645900726, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 737.724609375, "completions/mean_terminated_length": 737.724609375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.48135188188102757, "frac_reward_zero_std": 0.625, "grad_norm": 0.11161692407018067, "kl": 0.1591796875, "learning_rate": 1.238915236535054e-05, "loss": 0.0192, "num_tokens": 680117373.0, "reward": 2.06982421875, "reward_std": 0.14718666672706604, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 757.103515625, "completions/mean_terminated_length": 757.103515625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.4816932661944184, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08502971096285838, "kl": 0.156005859375, "learning_rate": 1.2377582172528877e-05, "loss": 0.0126, "num_tokens": 680593762.0, "reward": 2.0244140625, "reward_std": 0.06337852776050568, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 770.87890625, "completions/mean_terminated_length": 770.87890625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.4820346505078092, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10312517866601541, "kl": 0.155029296875, "learning_rate": 1.2366008605164185e-05, "loss": 0.0142, "num_tokens": 681073828.0, "reward": 2.0400390625, "reward_std": 0.10710855573415756, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 751.11328125, "completions/mean_terminated_length": 751.11328125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.48237603482119995, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11728611810025624, "kl": 0.153564453125, "learning_rate": 1.2354431679683028e-05, "loss": 0.0171, "num_tokens": 681536974.0, "reward": 2.072265625, "reward_std": 0.15013298392295837, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 773.205078125, "completions/mean_terminated_length": 773.205078125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.48271741913459076, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1567827731497402, "kl": 0.149169921875, "learning_rate": 1.2342851412516732e-05, "loss": 0.0139, "num_tokens": 682015383.0, "reward": 2.01953125, "reward_std": 0.1308479905128479, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 700.10546875, "completions/mean_terminated_length": 699.5694580078125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4830588034479816, "frac_reward_zero_std": 0.59375, "grad_norm": 2.078238353149562, "kl": 0.77880859375, "learning_rate": 1.2331267820101368e-05, "loss": 0.0462, "num_tokens": 682454093.0, "reward": 2.01708984375, "reward_std": 0.14419639110565186, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.047786012291908264, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 755.80078125, "completions/mean_terminated_length": 755.80078125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4834001877613724, "frac_reward_zero_std": 0.375, "grad_norm": 0.14452781762688974, "kl": 0.1435546875, "learning_rate": 1.2319680918877732e-05, "loss": 0.019, "num_tokens": 682925847.0, "reward": 2.0712890625, "reward_std": 0.18943758308887482, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 784.046875, "completions/mean_terminated_length": 784.046875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.48374157207476315, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11758448798124368, "kl": 0.1396484375, "learning_rate": 1.2308090725291307e-05, "loss": 0.0068, "num_tokens": 683418335.0, "reward": 2.0625, "reward_std": 0.20163019001483917, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 736.12890625, "completions/mean_terminated_length": 734.1375732421875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.48408295638815396, "frac_reward_zero_std": 0.53125, "grad_norm": 1.0147705056983671, "kl": 0.6708984375, "learning_rate": 1.2296497255792246e-05, "loss": 0.0424, "num_tokens": 683882129.0, "reward": 2.091796875, "reward_std": 0.17130257189273834, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.060289934277534485, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 824.669921875, "completions/mean_terminated_length": 822.9373779296875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4844243407015448, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5891752140111766, "kl": 0.26806640625, "learning_rate": 1.228490052683537e-05, "loss": 0.0214, "num_tokens": 684383656.0, "reward": 2.10791015625, "reward_std": 0.17348438501358032, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 753.064453125, "completions/mean_terminated_length": 753.064453125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4847657250149356, "frac_reward_zero_std": 0.3125, "grad_norm": 0.15258966934499457, "kl": 0.13916015625, "learning_rate": 1.2273300554880111e-05, "loss": 0.0076, "num_tokens": 684851129.0, "reward": 2.107421875, "reward_std": 0.22518914937973022, "rewards/accuracy_reward/mean": 0.14314515888690948, "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 838.92578125, "completions/mean_terminated_length": 838.92578125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.48510710932832635, "frac_reward_zero_std": 0.375, "grad_norm": 0.12191118651899956, "kl": 0.1357421875, "learning_rate": 1.2261697356390507e-05, "loss": 0.0278, "num_tokens": 685352595.0, "reward": 2.0654296875, "reward_std": 0.219220370054245, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 896.3671875, "completions/mean_terminated_length": 896.3671875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.48544849364171716, "frac_reward_zero_std": 0.625, "grad_norm": 0.10211558670734183, "kl": 0.13427734375, "learning_rate": 1.2250090947835176e-05, "loss": 0.0244, "num_tokens": 685893951.0, "reward": 2.05078125, "reward_std": 0.15672904253005981, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 889.705078125, "completions/mean_terminated_length": 889.705078125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.485789877955108, "frac_reward_zero_std": 0.5, "grad_norm": 0.11650251203524209, "kl": 0.141845703125, "learning_rate": 1.2238481345687291e-05, "loss": 0.0201, "num_tokens": 686427064.0, "reward": 2.05810546875, "reward_std": 0.18136923015117645, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 984.369140625, "completions/mean_terminated_length": 980.1981201171875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.4861312622684988, "frac_reward_zero_std": 0.53125, "grad_norm": 0.09580626025518384, "kl": 0.132568359375, "learning_rate": 1.222686856642456e-05, "loss": 0.0246, "num_tokens": 687012069.0, "reward": 2.044921875, "reward_std": 0.17488646507263184, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 868.72265625, "completions/mean_terminated_length": 867.1721801757812, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.48647264658188955, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5634145647496309, "kl": 0.236572265625, "learning_rate": 1.2215252626529197e-05, "loss": 0.0253, "num_tokens": 687535943.0, "reward": 2.07275390625, "reward_std": 0.1872081756591797, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 846.685546875, "completions/mean_terminated_length": 841.9745483398438, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.48681403089528036, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11471686677687448, "kl": 0.143798828125, "learning_rate": 1.2203633542487907e-05, "loss": 0.0298, "num_tokens": 688049702.0, "reward": 2.00341796875, "reward_std": 0.1477867215871811, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 752.078125, "completions/mean_terminated_length": 750.2954711914062, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.48715541520867117, "frac_reward_zero_std": 0.5, "grad_norm": 0.43318634897929575, "kl": 0.142578125, "learning_rate": 1.2192011330791853e-05, "loss": 0.0446, "num_tokens": 688520206.0, "reward": 2.03564453125, "reward_std": 0.16985519230365753, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 779.240234375, "completions/mean_terminated_length": 779.240234375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.487496799522062, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11538653587561423, "kl": 0.144775390625, "learning_rate": 1.2180386007936637e-05, "loss": 0.0065, "num_tokens": 689001897.0, "reward": 2.03076171875, "reward_std": 0.15907946228981018, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 690.720703125, "completions/mean_terminated_length": 690.720703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.48783818383545274, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14111778224177565, "kl": 0.144775390625, "learning_rate": 1.2168757590422287e-05, "loss": 0.0274, "num_tokens": 689442698.0, "reward": 2.03564453125, "reward_std": 0.17580671608448029, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 714.75390625, "completions/mean_terminated_length": 714.75390625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.48817956814884356, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1336581316702294, "kl": 0.149169921875, "learning_rate": 1.2157126094753204e-05, "loss": 0.026, "num_tokens": 689894428.0, "reward": 2.04052734375, "reward_std": 0.1972595900297165, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 663.953125, "completions/mean_terminated_length": 663.1585083007812, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.48852095246223437, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4818460449611945, "kl": 0.252685546875, "learning_rate": 1.2145491537438175e-05, "loss": 0.0281, "num_tokens": 690314884.0, "reward": 2.111328125, "reward_std": 0.1938806176185608, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 640.689453125, "completions/mean_terminated_length": 640.689453125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4888623367756252, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11194834612776704, "kl": 0.14697265625, "learning_rate": 1.2133853934990323e-05, "loss": 0.0129, "num_tokens": 690733525.0, "reward": 2.03955078125, "reward_std": 0.1107923835515976, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 611.396484375, "completions/mean_terminated_length": 611.396484375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.48920372108901594, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14184206179349798, "kl": 0.171875, "learning_rate": 1.2122213303927096e-05, "loss": 0.0067, "num_tokens": 691126496.0, "reward": 2.08935546875, "reward_std": 0.17120017111301422, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 656.43359375, "completions/mean_terminated_length": 656.43359375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.48954510540240675, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12091893759659432, "kl": 0.159423828125, "learning_rate": 1.2110569660770245e-05, "loss": 0.007, "num_tokens": 691544686.0, "reward": 2.09521484375, "reward_std": 0.17082175612449646, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 636.26171875, "completions/mean_terminated_length": 636.26171875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.48988648971579757, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1436070096684287, "kl": 0.151611328125, "learning_rate": 1.2098923022045792e-05, "loss": 0.009, "num_tokens": 691951988.0, "reward": 2.12890625, "reward_std": 0.2176588475704193, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 640.671875, "completions/mean_terminated_length": 640.671875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.4902278740291884, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12244035664703626, "kl": 0.152587890625, "learning_rate": 1.2087273404284004e-05, "loss": 0.0096, "num_tokens": 692382412.0, "reward": 2.08984375, "reward_std": 0.18278810381889343, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 662.552734375, "completions/mean_terminated_length": 662.552734375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.49056925834257914, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10913381819774597, "kl": 0.150146484375, "learning_rate": 1.2075620824019385e-05, "loss": 0.0127, "num_tokens": 692806775.0, "reward": 2.09033203125, "reward_std": 0.1268255114555359, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 669.634765625, "completions/mean_terminated_length": 669.634765625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.49091064265596995, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14945018286402945, "kl": 0.15478515625, "learning_rate": 1.2063965297790642e-05, "loss": 0.0228, "num_tokens": 693232060.0, "reward": 2.021484375, "reward_std": 0.17494183778762817, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 656.544921875, "completions/mean_terminated_length": 656.544921875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.49125202696936077, "frac_reward_zero_std": 0.5, "grad_norm": 0.1372375942104128, "kl": 0.160400390625, "learning_rate": 1.2052306842140666e-05, "loss": 0.0221, "num_tokens": 693656931.0, "reward": 2.1328125, "reward_std": 0.20608699321746826, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 716.328125, "completions/mean_terminated_length": 713.7221069335938, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.4915934112827516, "frac_reward_zero_std": 0.5, "grad_norm": 0.13137611171463856, "kl": 0.148681640625, "learning_rate": 1.2040645473616496e-05, "loss": 0.0325, "num_tokens": 694112075.0, "reward": 2.04443359375, "reward_std": 0.17516373097896576, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 719.794921875, "completions/mean_terminated_length": 719.794921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.49193479559614234, "frac_reward_zero_std": 0.375, "grad_norm": 0.14785466173805598, "kl": 0.1552734375, "learning_rate": 1.202898120876932e-05, "loss": 0.0148, "num_tokens": 694574978.0, "reward": 2.1103515625, "reward_std": 0.22789224982261658, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 773.787109375, "completions/mean_terminated_length": 773.787109375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.49227617990953315, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08364129622294718, "kl": 0.140869140625, "learning_rate": 1.2017314064154422e-05, "loss": 0.0191, "num_tokens": 695055381.0, "reward": 2.04931640625, "reward_std": 0.11252880096435547, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 791.42578125, "completions/mean_terminated_length": 784.0196533203125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.49261756422292396, "frac_reward_zero_std": 0.53125, "grad_norm": 1.9909725896772261, "kl": 0.768798828125, "learning_rate": 1.2005644056331182e-05, "loss": 0.0662, "num_tokens": 695539023.0, "reward": 2.1181640625, "reward_std": 0.20678624510765076, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 863.583984375, "completions/mean_terminated_length": 861.26611328125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.4929589485363148, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11822537610263022, "kl": 0.134765625, "learning_rate": 1.199397120186304e-05, "loss": 0.0332, "num_tokens": 696059226.0, "reward": 2.07373046875, "reward_std": 0.19908522069454193, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 828.3046875, "completions/mean_terminated_length": 828.3046875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.49330033284970554, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12129830866874869, "kl": 0.1328125, "learning_rate": 1.198229551731748e-05, "loss": 0.0248, "num_tokens": 696573574.0, "reward": 2.03662109375, "reward_std": 0.16046755015850067, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 823.921875, "completions/mean_terminated_length": 823.921875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.49364171716309635, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13078596670310103, "kl": 0.134765625, "learning_rate": 1.1970617019266e-05, "loss": 0.0195, "num_tokens": 697075230.0, "reward": 2.1376953125, "reward_std": 0.2330712527036667, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698441505432, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 911.283203125, "completions/mean_terminated_length": 911.283203125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.49398310147648716, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11995753041333136, "kl": 0.135498046875, "learning_rate": 1.1958935724284091e-05, "loss": 0.028, "num_tokens": 697622879.0, "reward": 2.0244140625, "reward_std": 0.17840874195098877, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 787.970703125, "completions/mean_terminated_length": 787.970703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.494324485789878, "frac_reward_zero_std": 0.5, "grad_norm": 0.1204081681917122, "kl": 0.148681640625, "learning_rate": 1.1947251648951219e-05, "loss": 0.0231, "num_tokens": 698115184.0, "reward": 2.109375, "reward_std": 0.19203895330429077, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 816.869140625, "completions/mean_terminated_length": 816.869140625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.49466587010326873, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12411010257145606, "kl": 0.140380859375, "learning_rate": 1.1935564809850786e-05, "loss": 0.0319, "num_tokens": 698610605.0, "reward": 2.03076171875, "reward_std": 0.1961774379014969, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 775.375, "completions/mean_terminated_length": 775.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.49500725441665955, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2495241875493095, "kl": 0.14013671875, "learning_rate": 1.1923875223570124e-05, "loss": 0.0282, "num_tokens": 699091421.0, "reward": 2.0849609375, "reward_std": 0.15031418204307556, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 744.087890625, "completions/mean_terminated_length": 744.087890625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.49534863873005036, "frac_reward_zero_std": 0.625, "grad_norm": 0.09967113733205417, "kl": 0.135009765625, "learning_rate": 1.1912182906700467e-05, "loss": 0.0057, "num_tokens": 699564938.0, "reward": 2.06201171875, "reward_std": 0.12144431471824646, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 738.25390625, "completions/mean_terminated_length": 738.25390625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4956900230434412, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10685910867668401, "kl": 0.15234375, "learning_rate": 1.1900487875836913e-05, "loss": 0.0236, "num_tokens": 700028572.0, "reward": 2.06201171875, "reward_std": 0.13086006045341492, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 693.583984375, "completions/mean_terminated_length": 693.583984375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.49603140735683193, "frac_reward_zero_std": 0.625, "grad_norm": 0.10676857345304068, "kl": 0.143798828125, "learning_rate": 1.1888790147578425e-05, "loss": 0.0132, "num_tokens": 700478903.0, "reward": 2.1416015625, "reward_std": 0.15433669090270996, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 798.888671875, "completions/mean_terminated_length": 796.4442138671875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.49637279167022275, "frac_reward_zero_std": 0.625, "grad_norm": 0.10036611668082121, "kl": 0.138427734375, "learning_rate": 1.1877089738527788e-05, "loss": 0.0188, "num_tokens": 700970510.0, "reward": 2.04736328125, "reward_std": 0.12246278673410416, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 689.115234375, "completions/mean_terminated_length": 688.2367553710938, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.49671417598361356, "frac_reward_zero_std": 0.71875, "grad_norm": 0.6612182941920377, "kl": 0.17724609375, "learning_rate": 1.1865386665291591e-05, "loss": 0.0246, "num_tokens": 701404041.0, "reward": 2.0791015625, "reward_std": 0.09795548766851425, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 732.357421875, "completions/mean_terminated_length": 732.357421875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4970555602970044, "frac_reward_zero_std": 0.625, "grad_norm": 0.12176841220306704, "kl": 0.151123046875, "learning_rate": 1.1853680944480207e-05, "loss": 0.0165, "num_tokens": 701860736.0, "reward": 2.06640625, "reward_std": 0.12812179327011108, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 727.90625, "completions/mean_terminated_length": 727.90625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.49739694461039513, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12090007793940907, "kl": 0.153564453125, "learning_rate": 1.1841972592707764e-05, "loss": 0.0161, "num_tokens": 702327344.0, "reward": 2.095703125, "reward_std": 0.1708938181400299, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 696.5390625, "completions/mean_terminated_length": 696.5390625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.49773832892378594, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12909631755091366, "kl": 0.14794921875, "learning_rate": 1.1830261626592127e-05, "loss": 0.0036, "num_tokens": 702779348.0, "reward": 2.06884765625, "reward_std": 0.1564531922340393, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 682.0078125, "completions/mean_terminated_length": 682.0078125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.49807971323717676, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1265539537194441, "kl": 0.1494140625, "learning_rate": 1.1818548062754869e-05, "loss": 0.0088, "num_tokens": 703218296.0, "reward": 2.037109375, "reward_std": 0.15613995492458344, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04655282944440842, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 722.294921875, "completions/mean_terminated_length": 722.294921875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.49842109755056757, "frac_reward_zero_std": 0.625, "grad_norm": 0.11087576538684413, "kl": 0.135498046875, "learning_rate": 1.1806831917821258e-05, "loss": 0.0073, "num_tokens": 703666431.0, "reward": 2.0537109375, "reward_std": 0.12452582269906998, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 699.40625, "completions/mean_terminated_length": 699.40625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.49876248186395833, "frac_reward_zero_std": 0.625, "grad_norm": 0.1157720358646423, "kl": 0.13671875, "learning_rate": 1.1795113208420208e-05, "loss": 0.0146, "num_tokens": 704104351.0, "reward": 2.0615234375, "reward_std": 0.12760278582572937, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 709.435546875, "completions/mean_terminated_length": 709.435546875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.49910386617734914, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09518449830100098, "kl": 0.138916015625, "learning_rate": 1.1783391951184293e-05, "loss": 0.0126, "num_tokens": 704559822.0, "reward": 2.0439453125, "reward_std": 0.09462852776050568, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 737.626953125, "completions/mean_terminated_length": 737.626953125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.49944525049073996, "frac_reward_zero_std": 0.5, "grad_norm": 0.11036643387833864, "kl": 0.137451171875, "learning_rate": 1.177166816274969e-05, "loss": 0.0179, "num_tokens": 705020959.0, "reward": 2.15283203125, "reward_std": 0.21094074845314026, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 719.01171875, "completions/mean_terminated_length": 719.01171875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.49978663480413077, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11582763888797916, "kl": 0.14208984375, "learning_rate": 1.1759941859756173e-05, "loss": 0.0087, "num_tokens": 705473125.0, "reward": 2.11669921875, "reward_std": 0.1698957085609436, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 780.01171875, "completions/mean_terminated_length": 780.01171875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5001280191175216, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10818550789735636, "kl": 0.12939453125, "learning_rate": 1.174821305884708e-05, "loss": 0.0142, "num_tokens": 705959083.0, "reward": 2.06884765625, "reward_std": 0.17450200021266937, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756475806236267, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 782.345703125, "completions/mean_terminated_length": 780.7749633789062, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.5004694034309124, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2316852384972555, "kl": 0.16650390625, "learning_rate": 1.1736481776669307e-05, "loss": 0.0223, "num_tokens": 706441580.0, "reward": 2.07861328125, "reward_std": 0.203344464302063, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 713.240234375, "completions/mean_terminated_length": 713.240234375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5008107877443031, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10787577143726232, "kl": 0.13818359375, "learning_rate": 1.1724748029873254e-05, "loss": 0.007, "num_tokens": 706903111.0, "reward": 2.02685546875, "reward_std": 0.12461923062801361, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 767.189453125, "completions/mean_terminated_length": 767.189453125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5011521720576939, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11149273522228198, "kl": 0.13330078125, "learning_rate": 1.1713011835112837e-05, "loss": 0.0164, "num_tokens": 707373688.0, "reward": 2.10205078125, "reward_std": 0.17644894123077393, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 738.7578125, "completions/mean_terminated_length": 738.7578125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5014935563710847, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10825062272119503, "kl": 0.140869140625, "learning_rate": 1.1701273209045435e-05, "loss": 0.0149, "num_tokens": 707826412.0, "reward": 2.03857421875, "reward_std": 0.11989803612232208, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 798.5, "completions/mean_terminated_length": 798.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5018349406844755, "frac_reward_zero_std": 0.5, "grad_norm": 0.10877914520545448, "kl": 0.131103515625, "learning_rate": 1.1689532168331877e-05, "loss": 0.0097, "num_tokens": 708311020.0, "reward": 2.12841796875, "reward_std": 0.18617843091487885, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 747.025390625, "completions/mean_terminated_length": 744.4794311523438, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5021763249978664, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11150957722053749, "kl": 0.13525390625, "learning_rate": 1.1677788729636428e-05, "loss": 0.0276, "num_tokens": 708780873.0, "reward": 2.064453125, "reward_std": 0.1632244884967804, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756475806236267, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 779.7890625, "completions/mean_terminated_length": 779.7890625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.5025177093112572, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10515683265766367, "kl": 0.13671875, "learning_rate": 1.1666042909626747e-05, "loss": 0.0212, "num_tokens": 709261757.0, "reward": 2.0205078125, "reward_std": 0.13206824660301208, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 796.970703125, "completions/mean_terminated_length": 796.970703125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.502859093624648, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0888566889433372, "kl": 0.1243896484375, "learning_rate": 1.165429472497388e-05, "loss": 0.022, "num_tokens": 709748782.0, "reward": 2.06201171875, "reward_std": 0.11312470585107803, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 818.05078125, "completions/mean_terminated_length": 818.05078125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5032004779380388, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10266866170224061, "kl": 0.128173828125, "learning_rate": 1.1642544192352225e-05, "loss": 0.0201, "num_tokens": 710258040.0, "reward": 2.0654296875, "reward_std": 0.15808700025081635, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 805.427734375, "completions/mean_terminated_length": 805.427734375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.5035418622514295, "frac_reward_zero_std": 0.75, "grad_norm": 0.08659772371082795, "kl": 0.131591796875, "learning_rate": 1.1630791328439515e-05, "loss": 0.0155, "num_tokens": 710764291.0, "reward": 2.04052734375, "reward_std": 0.06951074302196503, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 840.802734375, "completions/mean_terminated_length": 840.802734375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5038832465648203, "frac_reward_zero_std": 0.75, "grad_norm": 0.0864258129813953, "kl": 0.13037109375, "learning_rate": 1.1619036149916792e-05, "loss": 0.0082, "num_tokens": 711280430.0, "reward": 2.04248046875, "reward_std": 0.07815548777580261, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 749.994140625, "completions/mean_terminated_length": 749.994140625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5042246308782111, "frac_reward_zero_std": 0.5, "grad_norm": 0.12107109523191074, "kl": 0.1243896484375, "learning_rate": 1.1607278673468378e-05, "loss": 0.0181, "num_tokens": 711746747.0, "reward": 2.083984375, "reward_std": 0.18681584298610687, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 828.201171875, "completions/mean_terminated_length": 828.201171875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5045660151916019, "frac_reward_zero_std": 0.5, "grad_norm": 0.11266961623713287, "kl": 0.118896484375, "learning_rate": 1.1595518915781863e-05, "loss": 0.035, "num_tokens": 712252466.0, "reward": 2.04541015625, "reward_std": 0.15068507194519043, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 773.8359375, "completions/mean_terminated_length": 772.74365234375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5049073995049927, "frac_reward_zero_std": 0.71875, "grad_norm": 0.882607515606254, "kl": 0.679931640625, "learning_rate": 1.158375689354807e-05, "loss": 0.0476, "num_tokens": 712732286.0, "reward": 2.03271484375, "reward_std": 0.09443940967321396, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 779.3125, "completions/mean_terminated_length": 779.3125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.5052487838183836, "frac_reward_zero_std": 0.5, "grad_norm": 0.11805886479922632, "kl": 0.11962890625, "learning_rate": 1.1571992623461039e-05, "loss": 0.0121, "num_tokens": 713215182.0, "reward": 2.04541015625, "reward_std": 0.158103346824646, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 770.708984375, "completions/mean_terminated_length": 770.708984375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.5055901681317744, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10180466453714049, "kl": 0.1337890625, "learning_rate": 1.1560226122218e-05, "loss": 0.0143, "num_tokens": 713689993.0, "reward": 2.02490234375, "reward_std": 0.10517555475234985, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 711.9375, "completions/mean_terminated_length": 711.9375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5059315524451652, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12140929490564265, "kl": 0.130859375, "learning_rate": 1.1548457406519355e-05, "loss": 0.0167, "num_tokens": 714133865.0, "reward": 2.0361328125, "reward_std": 0.1449190378189087, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 758.291015625, "completions/mean_terminated_length": 758.291015625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5062729367585559, "frac_reward_zero_std": 0.5, "grad_norm": 0.11551034716253313, "kl": 0.13330078125, "learning_rate": 1.1536686493068637e-05, "loss": 0.0149, "num_tokens": 714609886.0, "reward": 2.14892578125, "reward_std": 0.18989580869674683, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 735.814453125, "completions/mean_terminated_length": 734.5244750976562, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5066143210719467, "frac_reward_zero_std": 0.75, "grad_norm": 0.25683358961486313, "kl": 0.305908203125, "learning_rate": 1.1524913398572501e-05, "loss": 0.0341, "num_tokens": 715066863.0, "reward": 1.99462890625, "reward_std": 0.08257247507572174, "rewards/accuracy_reward/mean": 0.010080644860863686, "rewards/accuracy_reward/std": 0.0999959334731102, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 687.123046875, "completions/mean_terminated_length": 687.123046875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5069557053853375, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11539490923309415, "kl": 0.13330078125, "learning_rate": 1.151313813974071e-05, "loss": 0.0227, "num_tokens": 715503726.0, "reward": 2.0751953125, "reward_std": 0.13584695756435394, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 675.90234375, "completions/mean_terminated_length": 675.90234375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5072970896987283, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12100610957757525, "kl": 0.13818359375, "learning_rate": 1.1501360733286084e-05, "loss": 0.014, "num_tokens": 715943756.0, "reward": 2.05029296875, "reward_std": 0.11238718777894974, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 696.552734375, "completions/mean_terminated_length": 696.552734375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5076384740121191, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09754126857336261, "kl": 0.13427734375, "learning_rate": 1.1489581195924501e-05, "loss": 0.0124, "num_tokens": 716380695.0, "reward": 2.02685546875, "reward_std": 0.10494492948055267, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 678.365234375, "completions/mean_terminated_length": 678.365234375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.50797985832551, "frac_reward_zero_std": 0.5, "grad_norm": 0.1323248836605658, "kl": 0.141357421875, "learning_rate": 1.1477799544374861e-05, "loss": 0.0196, "num_tokens": 716812290.0, "reward": 2.06591796875, "reward_std": 0.1706896424293518, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 666.302734375, "completions/mean_terminated_length": 666.302734375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5083212426389008, "frac_reward_zero_std": 0.5, "grad_norm": 0.1299305272167997, "kl": 0.141845703125, "learning_rate": 1.1466015795359058e-05, "loss": 0.0122, "num_tokens": 717237485.0, "reward": 2.03564453125, "reward_std": 0.1661335676908493, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 725.4921875, "completions/mean_terminated_length": 725.4921875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5086626269522916, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11139650777985402, "kl": 0.14599609375, "learning_rate": 1.1454229965601973e-05, "loss": 0.0076, "num_tokens": 717690729.0, "reward": 2.111328125, "reward_std": 0.14713963866233826, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 707.265625, "completions/mean_terminated_length": 707.265625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5090040112656823, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09424739247296587, "kl": 0.146484375, "learning_rate": 1.1442442071831434e-05, "loss": 0.0137, "num_tokens": 718131121.0, "reward": 2.0439453125, "reward_std": 0.06574104726314545, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 778.314453125, "completions/mean_terminated_length": 775.8297119140625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.5093453955790731, "frac_reward_zero_std": 0.625, "grad_norm": 0.11096373332380276, "kl": 0.138916015625, "learning_rate": 1.1430652130778197e-05, "loss": 0.0235, "num_tokens": 718606706.0, "reward": 2.0849609375, "reward_std": 0.1556401401758194, "rewards/accuracy_reward/mean": 0.10080645233392715, "rewards/accuracy_reward/std": 0.30137622356414795, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 722.451171875, "completions/mean_terminated_length": 722.451171875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5096867798924639, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1262456179957242, "kl": 0.138427734375, "learning_rate": 1.1418860159175933e-05, "loss": 0.0233, "num_tokens": 719056793.0, "reward": 2.0908203125, "reward_std": 0.16160514950752258, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 774.615234375, "completions/mean_terminated_length": 774.615234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5100281642058547, "frac_reward_zero_std": 0.75, "grad_norm": 0.08201076626212273, "kl": 0.135009765625, "learning_rate": 1.1407066173761187e-05, "loss": 0.0142, "num_tokens": 719525700.0, "reward": 2.046875, "reward_std": 0.09632748365402222, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 764.568359375, "completions/mean_terminated_length": 764.568359375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5103695485192455, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13314511957254502, "kl": 0.139404296875, "learning_rate": 1.1395270191273362e-05, "loss": 0.0156, "num_tokens": 720004135.0, "reward": 2.0537109375, "reward_std": 0.19518308341503143, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 772.451171875, "completions/mean_terminated_length": 772.451171875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5107109328326364, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10572833191064648, "kl": 0.134521484375, "learning_rate": 1.13834722284547e-05, "loss": 0.0188, "num_tokens": 720473422.0, "reward": 2.09423828125, "reward_std": 0.13416516780853271, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 791.982421875, "completions/mean_terminated_length": 791.982421875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5110523171460272, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11840507183958297, "kl": 0.13427734375, "learning_rate": 1.137167230205025e-05, "loss": 0.0111, "num_tokens": 720961253.0, "reward": 2.119140625, "reward_std": 0.19228553771972656, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 807.890625, "completions/mean_terminated_length": 807.890625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.511393701459418, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11844854242589767, "kl": 0.131103515625, "learning_rate": 1.1359870428807849e-05, "loss": 0.0205, "num_tokens": 721465213.0, "reward": 2.08154296875, "reward_std": 0.15624743700027466, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 777.224609375, "completions/mean_terminated_length": 777.224609375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5117350857728087, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12068896228671235, "kl": 0.1328125, "learning_rate": 1.1348066625478096e-05, "loss": 0.0302, "num_tokens": 721946608.0, "reward": 2.0595703125, "reward_std": 0.1507698893547058, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 835.068359375, "completions/mean_terminated_length": 834.5968627929688, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.5120764700861995, "frac_reward_zero_std": 0.53125, "grad_norm": 0.21946457507677528, "kl": 0.232421875, "learning_rate": 1.1336260908814335e-05, "loss": 0.0238, "num_tokens": 722455859.0, "reward": 2.1162109375, "reward_std": 0.1923483908176422, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 741.564453125, "completions/mean_terminated_length": 741.564453125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5124178543995903, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12394145117313546, "kl": 0.135986328125, "learning_rate": 1.1324453295572619e-05, "loss": 0.023, "num_tokens": 722915012.0, "reward": 2.107421875, "reward_std": 0.1975012719631195, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 778.376953125, "completions/mean_terminated_length": 778.376953125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5127592387129811, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1136658509313669, "kl": 0.130615234375, "learning_rate": 1.1312643802511696e-05, "loss": 0.0195, "num_tokens": 723393477.0, "reward": 2.05078125, "reward_std": 0.16614919900894165, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 812.86328125, "completions/mean_terminated_length": 812.86328125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.5131006230263719, "frac_reward_zero_std": 0.59375, "grad_norm": 0.105632527265338, "kl": 0.1279296875, "learning_rate": 1.1300832446392985e-05, "loss": 0.0161, "num_tokens": 723889487.0, "reward": 2.0458984375, "reward_std": 0.1416110098361969, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 731.12109375, "completions/mean_terminated_length": 725.9569091796875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5134420073397628, "frac_reward_zero_std": 0.5, "grad_norm": 0.11937683326294028, "kl": 0.134765625, "learning_rate": 1.1289019243980537e-05, "loss": 0.034, "num_tokens": 724344957.0, "reward": 2.0693359375, "reward_std": 0.186063751578331, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.2846112847328186, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 774.974609375, "completions/mean_terminated_length": 774.974609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5137833916531536, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11415592902446431, "kl": 0.1259765625, "learning_rate": 1.1277204212041038e-05, "loss": 0.0125, "num_tokens": 724827792.0, "reward": 2.09033203125, "reward_std": 0.18566186726093292, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 805.830078125, "completions/mean_terminated_length": 805.830078125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5141247759665444, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10337167423994177, "kl": 0.11865234375, "learning_rate": 1.1265387367343763e-05, "loss": 0.0158, "num_tokens": 725321177.0, "reward": 2.0537109375, "reward_std": 0.1606258749961853, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 739.46484375, "completions/mean_terminated_length": 738.7201538085938, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.5144661602799352, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6521210359953824, "kl": 0.452880859375, "learning_rate": 1.1253568726660562e-05, "loss": 0.0446, "num_tokens": 725774135.0, "reward": 2.07568359375, "reward_std": 0.19586580991744995, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 755.908203125, "completions/mean_terminated_length": 755.908203125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5148075445933259, "frac_reward_zero_std": 0.75, "grad_norm": 0.07492495671458746, "kl": 0.128173828125, "learning_rate": 1.1241748306765836e-05, "loss": 0.0111, "num_tokens": 726245880.0, "reward": 2.0595703125, "reward_std": 0.08700430393218994, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03484956547617912, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 727.8671875, "completions/mean_terminated_length": 727.8671875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5151489289067167, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1108363549911696, "kl": 0.134765625, "learning_rate": 1.1229926124436506e-05, "loss": 0.0153, "num_tokens": 726697188.0, "reward": 2.02734375, "reward_std": 0.0990472063422203, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 751.09375, "completions/mean_terminated_length": 751.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5154903132201075, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10511038913717141, "kl": 0.13720703125, "learning_rate": 1.1218102196452003e-05, "loss": 0.0097, "num_tokens": 727158756.0, "reward": 2.0966796875, "reward_std": 0.17086532711982727, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 692.98046875, "completions/mean_terminated_length": 692.98046875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5158316975334983, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11702496820432189, "kl": 0.142578125, "learning_rate": 1.120627653959422e-05, "loss": 0.0336, "num_tokens": 727597338.0, "reward": 2.0419921875, "reward_std": 0.12333044409751892, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 741.34375, "completions/mean_terminated_length": 741.34375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5161730818468891, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11828779124386021, "kl": 0.1279296875, "learning_rate": 1.1194449170647526e-05, "loss": 0.0215, "num_tokens": 728056842.0, "reward": 2.03369140625, "reward_std": 0.15727370977401733, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 718.724609375, "completions/mean_terminated_length": 718.724609375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.51651446616028, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11876736244276428, "kl": 0.1287841796875, "learning_rate": 1.1182620106398699e-05, "loss": 0.0141, "num_tokens": 728511389.0, "reward": 2.09814453125, "reward_std": 0.17761537432670593, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 726.2734375, "completions/mean_terminated_length": 726.2734375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5168558504736708, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11651786238016033, "kl": 0.122802734375, "learning_rate": 1.1170789363636934e-05, "loss": 0.0299, "num_tokens": 728960537.0, "reward": 2.04736328125, "reward_std": 0.17077386379241943, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 700.744140625, "completions/mean_terminated_length": 699.4520263671875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5171972347870616, "frac_reward_zero_std": 0.53125, "grad_norm": 0.336025510689481, "kl": 0.198974609375, "learning_rate": 1.1158956959153808e-05, "loss": 0.0218, "num_tokens": 729399542.0, "reward": 2.07177734375, "reward_std": 0.17209449410438538, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 748.349609375, "completions/mean_terminated_length": 748.349609375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5175386191004523, "frac_reward_zero_std": 0.5, "grad_norm": 0.11967726026855514, "kl": 0.132080078125, "learning_rate": 1.1147122909743257e-05, "loss": 0.01, "num_tokens": 729868633.0, "reward": 2.14111328125, "reward_std": 0.1980966329574585, "rewards/accuracy_reward/mean": 0.15322580933570862, "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 708.431640625, "completions/mean_terminated_length": 708.431640625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5178800034138431, "frac_reward_zero_std": 0.625, "grad_norm": 0.11186985588294306, "kl": 0.13427734375, "learning_rate": 1.1135287232201546e-05, "loss": 0.0163, "num_tokens": 730313638.0, "reward": 2.07177734375, "reward_std": 0.13588345050811768, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 694.259765625, "completions/mean_terminated_length": 693.1820068359375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5182213877272339, "frac_reward_zero_std": 0.65625, "grad_norm": 0.4649100095814516, "kl": 0.16796875, "learning_rate": 1.1123449943327256e-05, "loss": 0.0221, "num_tokens": 730752651.0, "reward": 2.06103515625, "reward_std": 0.13186736404895782, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 725.73828125, "completions/mean_terminated_length": 725.73828125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5185627720406247, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11560824724094268, "kl": 0.130126953125, "learning_rate": 1.1111611059921253e-05, "loss": 0.0043, "num_tokens": 731212469.0, "reward": 2.1298828125, "reward_std": 0.2038733959197998, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 758.751953125, "completions/mean_terminated_length": 758.751953125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5189041563540155, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09865878838199368, "kl": 0.1224365234375, "learning_rate": 1.1099770598786665e-05, "loss": 0.0107, "num_tokens": 731684406.0, "reward": 2.06884765625, "reward_std": 0.1341090053319931, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 729.958984375, "completions/mean_terminated_length": 729.958984375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5192455406674064, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10127038600955574, "kl": 0.13037109375, "learning_rate": 1.1087928576728866e-05, "loss": 0.0109, "num_tokens": 732140753.0, "reward": 2.0380859375, "reward_std": 0.12117845565080643, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 713.328125, "completions/mean_terminated_length": 713.328125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5195869249807972, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1275968903135654, "kl": 0.13232421875, "learning_rate": 1.1076085010555438e-05, "loss": 0.017, "num_tokens": 732583225.0, "reward": 2.0205078125, "reward_std": 0.15440642833709717, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 701.423828125, "completions/mean_terminated_length": 701.423828125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.519928309294188, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1150740824474945, "kl": 0.1256103515625, "learning_rate": 1.1064239917076151e-05, "loss": 0.0071, "num_tokens": 733026770.0, "reward": 2.11962890625, "reward_std": 0.16918472945690155, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 734.98828125, "completions/mean_terminated_length": 734.98828125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5202696936075787, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09345050692738625, "kl": 0.1251220703125, "learning_rate": 1.1052393313102958e-05, "loss": 0.0049, "num_tokens": 733478700.0, "reward": 2.06103515625, "reward_std": 0.08742999285459518, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 745.525390625, "completions/mean_terminated_length": 745.525390625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5206110779209695, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09843632327560116, "kl": 0.12548828125, "learning_rate": 1.104054521544994e-05, "loss": 0.0062, "num_tokens": 733944777.0, "reward": 2.02685546875, "reward_std": 0.09730081260204315, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 832.255859375, "completions/mean_terminated_length": 829.876708984375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5209524622343603, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11672734042091196, "kl": 0.11962890625, "learning_rate": 1.1028695640933309e-05, "loss": 0.0131, "num_tokens": 734460876.0, "reward": 2.10302734375, "reward_std": 0.22522056102752686, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 825.845703125, "completions/mean_terminated_length": 825.845703125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5212938465477511, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09718438480411172, "kl": 0.114990234375, "learning_rate": 1.1016844606371364e-05, "loss": 0.0126, "num_tokens": 734963965.0, "reward": 2.06005859375, "reward_std": 0.15393158793449402, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 816.302734375, "completions/mean_terminated_length": 816.302734375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5216352308611419, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08692095289240236, "kl": 0.1177978515625, "learning_rate": 1.1004992128584489e-05, "loss": 0.0135, "num_tokens": 735460136.0, "reward": 2.06591796875, "reward_std": 0.12226152420043945, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 821.822265625, "completions/mean_terminated_length": 820.632080078125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5219766151745328, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22099536347244617, "kl": 0.1954345703125, "learning_rate": 1.09931382243951e-05, "loss": 0.0147, "num_tokens": 735968909.0, "reward": 2.09375, "reward_std": 0.12338299304246902, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 815.216796875, "completions/mean_terminated_length": 812.8043212890625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5223179994879236, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11046802670440688, "kl": 0.1207275390625, "learning_rate": 1.0981282910627646e-05, "loss": 0.0213, "num_tokens": 736472732.0, "reward": 2.09521484375, "reward_std": 0.16157867014408112, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 749.65234375, "completions/mean_terminated_length": 749.65234375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5226593838013144, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12031965358840221, "kl": 0.130859375, "learning_rate": 1.0969426204108584e-05, "loss": 0.0118, "num_tokens": 736935466.0, "reward": 2.07373046875, "reward_std": 0.15860888361930847, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 761.546875, "completions/mean_terminated_length": 761.546875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5230007681147051, "frac_reward_zero_std": 0.5, "grad_norm": 0.12509792282318938, "kl": 0.126953125, "learning_rate": 1.0957568121666331e-05, "loss": 0.0184, "num_tokens": 737408562.0, "reward": 2.06103515625, "reward_std": 0.16554534435272217, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 863.115234375, "completions/mean_terminated_length": 851.4299926757812, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.5233421524280959, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10446516633187018, "kl": 0.1124267578125, "learning_rate": 1.0945708680131274e-05, "loss": 0.016, "num_tokens": 737930669.0, "reward": 2.091796875, "reward_std": 0.22196388244628906, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 784.544921875, "completions/mean_terminated_length": 779.5902709960938, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5236835367414867, "frac_reward_zero_std": 0.5, "grad_norm": 0.11927110889847943, "kl": 0.125732421875, "learning_rate": 1.093384789633572e-05, "loss": 0.0279, "num_tokens": 738423172.0, "reward": 2.09619140625, "reward_std": 0.19619220495224, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 754.609375, "completions/mean_terminated_length": 754.609375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5240249210548775, "frac_reward_zero_std": 0.375, "grad_norm": 0.13579728986517536, "kl": 0.128662109375, "learning_rate": 1.0921985787113878e-05, "loss": 0.0271, "num_tokens": 738908684.0, "reward": 2.13232421875, "reward_std": 0.2229837030172348, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.958984375, "rewards/format_reward/std": 0.19852031767368317, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 698.94921875, "completions/mean_terminated_length": 698.94921875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5243663053682683, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12119920806307724, "kl": 0.1337890625, "learning_rate": 1.0910122369301843e-05, "loss": 0.0246, "num_tokens": 739350754.0, "reward": 2.05615234375, "reward_std": 0.13912621140480042, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 790.8125, "completions/mean_terminated_length": 790.8125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5247076896816592, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11171881583945915, "kl": 0.125732421875, "learning_rate": 1.0898257659737572e-05, "loss": 0.0248, "num_tokens": 739840722.0, "reward": 2.0615234375, "reward_std": 0.17539182305335999, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 719.806640625, "completions/mean_terminated_length": 719.806640625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.52504907399505, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11596676375134454, "kl": 0.1328125, "learning_rate": 1.0886391675260846e-05, "loss": 0.0009, "num_tokens": 740290399.0, "reward": 2.0859375, "reward_std": 0.1636594533920288, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 711.45703125, "completions/mean_terminated_length": 711.45703125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5253904583084408, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13385402195586205, "kl": 0.12841796875, "learning_rate": 1.0874524432713255e-05, "loss": 0.0263, "num_tokens": 740736409.0, "reward": 2.140625, "reward_std": 0.20169684290885925, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 682.701171875, "completions/mean_terminated_length": 682.701171875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5257318426218315, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11837996709811761, "kl": 0.13525390625, "learning_rate": 1.086265594893819e-05, "loss": 0.001, "num_tokens": 741169232.0, "reward": 2.1640625, "reward_std": 0.20401310920715332, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 749.099609375, "completions/mean_terminated_length": 749.099609375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5260732269352223, "frac_reward_zero_std": 0.625, "grad_norm": 0.09815701561436854, "kl": 0.131591796875, "learning_rate": 1.0850786240780787e-05, "loss": 0.0111, "num_tokens": 741631347.0, "reward": 2.09228515625, "reward_std": 0.1258563995361328, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 707.20703125, "completions/mean_terminated_length": 707.20703125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5264146112486131, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13515278685763082, "kl": 0.131103515625, "learning_rate": 1.0838915325087925e-05, "loss": 0.0126, "num_tokens": 742073181.0, "reward": 2.0537109375, "reward_std": 0.16665077209472656, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 738.9453125, "completions/mean_terminated_length": 738.9453125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.5267559955620039, "frac_reward_zero_std": 0.65625, "grad_norm": 0.11570306682650346, "kl": 0.138427734375, "learning_rate": 1.0827043218708197e-05, "loss": 0.0206, "num_tokens": 742569041.0, "reward": 2.01513671875, "reward_std": 0.095703125, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15143637359142303, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 761.083984375, "completions/mean_terminated_length": 761.083984375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5270973798753947, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12163336288600783, "kl": 0.13232421875, "learning_rate": 1.0815169938491892e-05, "loss": 0.0213, "num_tokens": 743042748.0, "reward": 2.0283203125, "reward_std": 0.16418275237083435, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 783.994140625, "completions/mean_terminated_length": 783.3385620117188, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5274387641887855, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9522413318534559, "kl": 0.395751953125, "learning_rate": 1.0803295501290954e-05, "loss": 0.0251, "num_tokens": 743534521.0, "reward": 2.05908203125, "reward_std": 0.09971685707569122, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 778.5546875, "completions/mean_terminated_length": 778.5546875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5277801485021764, "frac_reward_zero_std": 0.71875, "grad_norm": 0.25271110532289426, "kl": 0.13623046875, "learning_rate": 1.0791419923958977e-05, "loss": 0.0096, "num_tokens": 744015061.0, "reward": 2.041015625, "reward_std": 0.0848003551363945, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.2181723266839981, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 874.787109375, "completions/mean_terminated_length": 867.872314453125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5281215328155672, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10384417322930689, "kl": 0.123291015625, "learning_rate": 1.077954322335117e-05, "loss": 0.0151, "num_tokens": 744548984.0, "reward": 2.0400390625, "reward_std": 0.15596535801887512, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 797.470703125, "completions/mean_terminated_length": 797.470703125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5284629171289579, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12597861541535654, "kl": 0.131591796875, "learning_rate": 1.0767665416324338e-05, "loss": 0.0252, "num_tokens": 745045465.0, "reward": 2.0322265625, "reward_std": 0.1859951913356781, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 757.19921875, "completions/mean_terminated_length": 757.19921875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5288043014423487, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10907848946933346, "kl": 0.134765625, "learning_rate": 1.0755786519736855e-05, "loss": 0.012, "num_tokens": 745516303.0, "reward": 2.07568359375, "reward_std": 0.11845651268959045, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 804.673828125, "completions/mean_terminated_length": 804.673828125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5291456857557395, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10698810401380815, "kl": 0.130859375, "learning_rate": 1.0743906550448644e-05, "loss": 0.0102, "num_tokens": 746018216.0, "reward": 2.078125, "reward_std": 0.12904389202594757, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 795.02734375, "completions/mean_terminated_length": 795.02734375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5294870700691303, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10696917852562124, "kl": 0.128662109375, "learning_rate": 1.0732025525321145e-05, "loss": 0.0225, "num_tokens": 746537590.0, "reward": 2.03369140625, "reward_std": 0.1726887971162796, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.2029850035905838, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 715.5625, "completions/mean_terminated_length": 715.5625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5298284543825211, "frac_reward_zero_std": 0.375, "grad_norm": 0.1423660770033174, "kl": 0.1328125, "learning_rate": 1.0720143461217302e-05, "loss": 0.0257, "num_tokens": 746979366.0, "reward": 2.11376953125, "reward_std": 0.21945856511592865, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 762.57421875, "completions/mean_terminated_length": 762.57421875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.5301698386959119, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11572023154900822, "kl": 0.11962890625, "learning_rate": 1.0708260375001533e-05, "loss": 0.0298, "num_tokens": 747459196.0, "reward": 2.03271484375, "reward_std": 0.11843983829021454, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 742.94921875, "completions/mean_terminated_length": 742.94921875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5305112230093028, "frac_reward_zero_std": 0.75, "grad_norm": 0.09195555164856234, "kl": 0.1318359375, "learning_rate": 1.0696376283539704e-05, "loss": 0.0139, "num_tokens": 747926322.0, "reward": 2.0712890625, "reward_std": 0.08787450194358826, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 731.640625, "completions/mean_terminated_length": 731.640625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5308526073226936, "frac_reward_zero_std": 0.625, "grad_norm": 0.11353688626686341, "kl": 0.129150390625, "learning_rate": 1.0684491203699109e-05, "loss": 0.0296, "num_tokens": 748382986.0, "reward": 2.0703125, "reward_std": 0.13027715682983398, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 725.25, "completions/mean_terminated_length": 725.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.5311939916360843, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11992249424155096, "kl": 0.1258544921875, "learning_rate": 1.067260515234845e-05, "loss": 0.018, "num_tokens": 748833082.0, "reward": 2.1123046875, "reward_std": 0.1609947383403778, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 742.921875, "completions/mean_terminated_length": 740.9745483398438, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5315353759494751, "frac_reward_zero_std": 0.59375, "grad_norm": 1.0471632149083003, "kl": 0.181884765625, "learning_rate": 1.0660718146357793e-05, "loss": 0.0274, "num_tokens": 749299586.0, "reward": 2.05224609375, "reward_std": 0.14722761511802673, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 718.859375, "completions/mean_terminated_length": 718.859375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5318767602628659, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14275801564927007, "kl": 0.1285400390625, "learning_rate": 1.0648830202598578e-05, "loss": 0.0235, "num_tokens": 749753738.0, "reward": 2.0830078125, "reward_std": 0.21064460277557373, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 712.986328125, "completions/mean_terminated_length": 712.986328125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5322181445762567, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11188564104576774, "kl": 0.1221923828125, "learning_rate": 1.0636941337943561e-05, "loss": 0.0264, "num_tokens": 750194291.0, "reward": 2.0751953125, "reward_std": 0.1611892431974411, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 663.5234375, "completions/mean_terminated_length": 663.5234375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5325595288896475, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11951808613030045, "kl": 0.13037109375, "learning_rate": 1.062505156926681e-05, "loss": 0.0205, "num_tokens": 750622447.0, "reward": 2.04345703125, "reward_std": 0.14735190570354462, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 701.23828125, "completions/mean_terminated_length": 701.23828125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5329009132030383, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09224007909039163, "kl": 0.12939453125, "learning_rate": 1.0613160913443684e-05, "loss": 0.0145, "num_tokens": 751076025.0, "reward": 2.0361328125, "reward_std": 0.09057271480560303, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 613.86328125, "completions/mean_terminated_length": 613.86328125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5332422975164292, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13428685133670792, "kl": 0.130859375, "learning_rate": 1.060126938735079e-05, "loss": 0.0111, "num_tokens": 751468739.0, "reward": 2.0400390625, "reward_std": 0.1638714224100113, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 646.388671875, "completions/mean_terminated_length": 646.388671875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.53358368182982, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10736036193359459, "kl": 0.1314697265625, "learning_rate": 1.0589377007865973e-05, "loss": 0.0097, "num_tokens": 751880762.0, "reward": 2.01708984375, "reward_std": 0.11824160814285278, "rewards/accuracy_reward/mean": 0.038306452333927155, "rewards/accuracy_reward/std": 0.19212882220745087, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 671.720703125, "completions/mean_terminated_length": 671.720703125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5339250661432107, "frac_reward_zero_std": 0.53125, "grad_norm": 0.122712876149099, "kl": 0.13330078125, "learning_rate": 1.0577483791868292e-05, "loss": 0.0097, "num_tokens": 752314091.0, "reward": 2.08740234375, "reward_std": 0.16621175408363342, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 664.783203125, "completions/mean_terminated_length": 664.783203125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5342664504566015, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09566419753683185, "kl": 0.1280517578125, "learning_rate": 1.056558975623799e-05, "loss": 0.0142, "num_tokens": 752740748.0, "reward": 2.0732421875, "reward_std": 0.11409001797437668, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 673.857421875, "completions/mean_terminated_length": 673.857421875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.5346078347699923, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1196891848975476, "kl": 0.133056640625, "learning_rate": 1.0553694917856478e-05, "loss": 0.0167, "num_tokens": 753168195.0, "reward": 2.05029296875, "reward_std": 0.13912013173103333, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 729.11328125, "completions/mean_terminated_length": 729.11328125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5349492190833831, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11076227886768755, "kl": 0.1241455078125, "learning_rate": 1.0541799293606302e-05, "loss": 0.0039, "num_tokens": 753631037.0, "reward": 2.06689453125, "reward_std": 0.1449015736579895, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 691.544921875, "completions/mean_terminated_length": 691.544921875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5352906033967739, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11930984042056286, "kl": 0.13330078125, "learning_rate": 1.052990290037113e-05, "loss": 0.0221, "num_tokens": 754065732.0, "reward": 2.1279296875, "reward_std": 0.1822621375322342, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 751.130859375, "completions/mean_terminated_length": 751.130859375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5356319877101647, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11633657513103028, "kl": 0.129150390625, "learning_rate": 1.0518005755035708e-05, "loss": 0.0103, "num_tokens": 754525191.0, "reward": 2.14306640625, "reward_std": 0.1609470546245575, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 791.615234375, "completions/mean_terminated_length": 791.615234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5359733720235555, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10529631806357485, "kl": 0.124755859375, "learning_rate": 1.050610787448586e-05, "loss": 0.0105, "num_tokens": 755012882.0, "reward": 2.05859375, "reward_std": 0.16630053520202637, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 814.39453125, "completions/mean_terminated_length": 814.39453125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5363147563369464, "frac_reward_zero_std": 0.28125, "grad_norm": 0.14073228238462665, "kl": 0.12255859375, "learning_rate": 1.0494209275608456e-05, "loss": 0.0378, "num_tokens": 755514412.0, "reward": 2.14453125, "reward_std": 0.26720887422561646, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 776.7734375, "completions/mean_terminated_length": 776.7734375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5366561406503371, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11093165686726127, "kl": 0.1240234375, "learning_rate": 1.0482309975291373e-05, "loss": 0.0106, "num_tokens": 755993944.0, "reward": 2.103515625, "reward_std": 0.17109856009483337, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 857.001953125, "completions/mean_terminated_length": 849.9823608398438, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5369975249637279, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10829294027110595, "kl": 0.1256103515625, "learning_rate": 1.0470409990423496e-05, "loss": 0.0304, "num_tokens": 756508297.0, "reward": 2.01513671875, "reward_std": 0.15676969289779663, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 862.646484375, "completions/mean_terminated_length": 861.5303344726562, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5373389092771187, "frac_reward_zero_std": 0.4375, "grad_norm": 1.5829154863332244, "kl": 0.8052978515625, "learning_rate": 1.045850933789468e-05, "loss": 0.0622, "num_tokens": 757033204.0, "reward": 2.025390625, "reward_std": 0.20738013088703156, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 916.642578125, "completions/mean_terminated_length": 914.4285888671875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.5376802935905095, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10632914827504285, "kl": 0.1209716796875, "learning_rate": 1.0446608034595718e-05, "loss": 0.03, "num_tokens": 757583133.0, "reward": 2.0224609375, "reward_std": 0.18098169565200806, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 939.076171875, "completions/mean_terminated_length": 931.6141967773438, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5380216779039003, "frac_reward_zero_std": 0.46875, "grad_norm": 0.18862196178860804, "kl": 0.35546875, "learning_rate": 1.0434706097418338e-05, "loss": 0.0344, "num_tokens": 758153364.0, "reward": 2.03271484375, "reward_std": 0.17342175543308258, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1951.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 888.19921875, "completions/mean_terminated_length": 888.19921875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5383630622172911, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10360017813057697, "kl": 0.1190185546875, "learning_rate": 1.0422803543255165e-05, "loss": 0.0195, "num_tokens": 758683882.0, "reward": 2.14453125, "reward_std": 0.16631554067134857, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 952.126953125, "completions/mean_terminated_length": 952.126953125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.538704446530682, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0824066486410854, "kl": 0.1151123046875, "learning_rate": 1.041090038899969e-05, "loss": 0.0183, "num_tokens": 759254315.0, "reward": 2.00927734375, "reward_std": 0.11563771218061447, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 900.591796875, "completions/mean_terminated_length": 900.591796875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5390458308440728, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10670021329989685, "kl": 0.12353515625, "learning_rate": 1.039899665154627e-05, "loss": 0.0101, "num_tokens": 759801770.0, "reward": 2.07373046875, "reward_std": 0.1554080992937088, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 929.548828125, "completions/mean_terminated_length": 929.548828125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5393872151574635, "frac_reward_zero_std": 0.625, "grad_norm": 0.09526257372962679, "kl": 0.1229248046875, "learning_rate": 1.0387092347790075e-05, "loss": 0.0142, "num_tokens": 760371555.0, "reward": 2.025390625, "reward_std": 0.12805049121379852, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 910.646484375, "completions/mean_terminated_length": 908.4207153320312, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5397285994708543, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11364963030183814, "kl": 0.1259765625, "learning_rate": 1.03751874946271e-05, "loss": 0.0157, "num_tokens": 760929214.0, "reward": 2.07177734375, "reward_std": 0.1954706311225891, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 1978.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 859.265625, "completions/mean_terminated_length": 827.8152465820312, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5400699837842451, "frac_reward_zero_std": 0.3125, "grad_norm": 67.96816259814578, "kl": 5.18212890625, "learning_rate": 1.0363282108954094e-05, "loss": 0.205, "num_tokens": 761483014.0, "reward": 2.0283203125, "reward_std": 0.24300752580165863, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.955078125, "rewards/format_reward/std": 0.20733514428138733, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.1660715788602829, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 780.33984375, "completions/mean_terminated_length": 780.33984375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5404113680976359, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10909023575620387, "kl": 0.1279296875, "learning_rate": 1.035137620766858e-05, "loss": 0.0259, "num_tokens": 761954436.0, "reward": 2.0302734375, "reward_std": 0.12626439332962036, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 773.67578125, "completions/mean_terminated_length": 773.67578125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5407527524110267, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11497205620621674, "kl": 0.131103515625, "learning_rate": 1.0339469807668812e-05, "loss": 0.0228, "num_tokens": 762431070.0, "reward": 2.0498046875, "reward_std": 0.14516869187355042, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 766.2578125, "completions/mean_terminated_length": 766.2578125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5410941367244175, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10528774485925053, "kl": 0.131591796875, "learning_rate": 1.0327562925853736e-05, "loss": 0.0014, "num_tokens": 762916162.0, "reward": 2.05419921875, "reward_std": 0.11961033195257187, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 730.18359375, "completions/mean_terminated_length": 730.18359375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5414355210378083, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12201049433059749, "kl": 0.126708984375, "learning_rate": 1.0315655579123001e-05, "loss": 0.0139, "num_tokens": 763374832.0, "reward": 2.0771484375, "reward_std": 0.16443553566932678, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 722.12109375, "completions/mean_terminated_length": 722.12109375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5417769053511992, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12367028960156823, "kl": 0.13134765625, "learning_rate": 1.0303747784376905e-05, "loss": 0.0091, "num_tokens": 763837214.0, "reward": 2.0751953125, "reward_std": 0.1647929847240448, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 707.404296875, "completions/mean_terminated_length": 707.404296875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5421182896645899, "frac_reward_zero_std": 0.625, "grad_norm": 0.10196236819643086, "kl": 0.131103515625, "learning_rate": 1.0291839558516385e-05, "loss": 0.0071, "num_tokens": 764284637.0, "reward": 2.0615234375, "reward_std": 0.1465752124786377, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 752.453125, "completions/mean_terminated_length": 752.453125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5424596739779807, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10247486067914746, "kl": 0.1197509765625, "learning_rate": 1.027993091844299e-05, "loss": 0.0141, "num_tokens": 764750965.0, "reward": 2.10400390625, "reward_std": 0.1616087406873703, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 766.470703125, "completions/mean_terminated_length": 766.470703125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5428010582913715, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12109259177148002, "kl": 0.1278076171875, "learning_rate": 1.0268021881058858e-05, "loss": 0.0104, "num_tokens": 765224310.0, "reward": 2.1357421875, "reward_std": 0.18976332247257233, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 684.681640625, "completions/mean_terminated_length": 684.681640625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5431424426047623, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12286724042830263, "kl": 0.14306640625, "learning_rate": 1.0256112463266687e-05, "loss": 0.0083, "num_tokens": 765658035.0, "reward": 2.08984375, "reward_std": 0.14832675457000732, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 775.927734375, "completions/mean_terminated_length": 775.927734375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5434838269181531, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10003508825577438, "kl": 0.1199951171875, "learning_rate": 1.0244202681969717e-05, "loss": 0.008, "num_tokens": 766144158.0, "reward": 2.08251953125, "reward_std": 0.16163745522499084, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 770.755859375, "completions/mean_terminated_length": 770.755859375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5438252112315439, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10702897619701364, "kl": 0.129150390625, "learning_rate": 1.0232292554071705e-05, "loss": 0.0076, "num_tokens": 766621617.0, "reward": 2.15087890625, "reward_std": 0.1687193512916565, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698441505432, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 754.01171875, "completions/mean_terminated_length": 754.01171875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5441665955449347, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11322625007326519, "kl": 0.127197265625, "learning_rate": 1.0220382096476902e-05, "loss": 0.0269, "num_tokens": 767093863.0, "reward": 2.06884765625, "reward_std": 0.13013532757759094, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 777.21484375, "completions/mean_terminated_length": 777.21484375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5445079798583256, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09926410029675356, "kl": 0.1201171875, "learning_rate": 1.0208471326090019e-05, "loss": 0.0053, "num_tokens": 767577829.0, "reward": 2.04296875, "reward_std": 0.13821569085121155, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 756.642578125, "completions/mean_terminated_length": 756.642578125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5448493641717163, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09397553248302673, "kl": 0.1236572265625, "learning_rate": 1.0196560259816222e-05, "loss": 0.0174, "num_tokens": 768056814.0, "reward": 2.05859375, "reward_std": 0.11012051999568939, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 789.056640625, "completions/mean_terminated_length": 789.056640625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5451907484851071, "frac_reward_zero_std": 0.5, "grad_norm": 0.11446994223581387, "kl": 0.125244140625, "learning_rate": 1.0184648914561084e-05, "loss": 0.0199, "num_tokens": 768544203.0, "reward": 2.06591796875, "reward_std": 0.1760910451412201, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 705.54296875, "completions/mean_terminated_length": 705.54296875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5455321327984979, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10326625395081085, "kl": 0.128662109375, "learning_rate": 1.0172737307230584e-05, "loss": 0.0032, "num_tokens": 768993009.0, "reward": 2.03955078125, "reward_std": 0.1227213442325592, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 772.41015625, "completions/mean_terminated_length": 772.41015625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5458735171118887, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12468296713167075, "kl": 0.1231689453125, "learning_rate": 1.0160825454731072e-05, "loss": 0.0194, "num_tokens": 769470019.0, "reward": 2.07177734375, "reward_std": 0.1917804777622223, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.033087924122810364, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 795.189453125, "completions/mean_terminated_length": 795.189453125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5462149014252795, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09337801676317883, "kl": 0.1114501953125, "learning_rate": 1.0148913373969242e-05, "loss": 0.0026, "num_tokens": 769953796.0, "reward": 2.0458984375, "reward_std": 0.11707795411348343, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 756.4375, "completions/mean_terminated_length": 756.4375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5465562857386703, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09714587708208389, "kl": 0.122314453125, "learning_rate": 1.0137001081852113e-05, "loss": 0.0086, "num_tokens": 770423860.0, "reward": 2.05810546875, "reward_std": 0.12380683422088623, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 744.439453125, "completions/mean_terminated_length": 744.439453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5468976700520611, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10882230885625505, "kl": 0.1195068359375, "learning_rate": 1.0125088595287011e-05, "loss": 0.0143, "num_tokens": 770885733.0, "reward": 2.07568359375, "reward_std": 0.1425192952156067, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 774.009765625, "completions/mean_terminated_length": 771.1886596679688, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.547239054365452, "frac_reward_zero_std": 0.59375, "grad_norm": 1.1401633305208492, "kl": 0.21923828125, "learning_rate": 1.0113175931181522e-05, "loss": 0.0353, "num_tokens": 771366426.0, "reward": 2.05078125, "reward_std": 0.15540985763072968, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 749.654296875, "completions/mean_terminated_length": 749.654296875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5475804386788428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1250217335627548, "kl": 0.1190185546875, "learning_rate": 1.0101263106443501e-05, "loss": 0.0196, "num_tokens": 771838393.0, "reward": 2.126953125, "reward_std": 0.1594398319721222, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 739.357421875, "completions/mean_terminated_length": 739.357421875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5479218229922335, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12812046386025117, "kl": 0.127197265625, "learning_rate": 1.0089350137981022e-05, "loss": 0.0202, "num_tokens": 772294416.0, "reward": 2.08154296875, "reward_std": 0.1610666811466217, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 687.693359375, "completions/mean_terminated_length": 687.693359375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5482632073056243, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10458486198970329, "kl": 0.1278076171875, "learning_rate": 1.0077437042702362e-05, "loss": 0.0076, "num_tokens": 772722819.0, "reward": 2.14599609375, "reward_std": 0.15936869382858276, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 713.17578125, "completions/mean_terminated_length": 711.1549682617188, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5486045916190151, "frac_reward_zero_std": 0.65625, "grad_norm": 0.902735848284101, "kl": 0.1763916015625, "learning_rate": 1.0065523837515985e-05, "loss": 0.0115, "num_tokens": 773164157.0, "reward": 2.02392578125, "reward_std": 0.11608945578336716, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 727.740234375, "completions/mean_terminated_length": 727.740234375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5489459759324059, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11753998111270669, "kl": 0.1263427734375, "learning_rate": 1.0053610539330508e-05, "loss": 0.0161, "num_tokens": 773615192.0, "reward": 2.08349609375, "reward_std": 0.16583725810050964, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 708.650390625, "completions/mean_terminated_length": 708.650390625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5492873602457967, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11157045205180109, "kl": 0.11669921875, "learning_rate": 1.004169716505467e-05, "loss": 0.0008, "num_tokens": 774067557.0, "reward": 2.11962890625, "reward_std": 0.15714994072914124, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 687.09375, "completions/mean_terminated_length": 687.09375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5496287445591875, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11778754522393753, "kl": 0.121826171875, "learning_rate": 1.0029783731597337e-05, "loss": 0.0096, "num_tokens": 774499717.0, "reward": 2.09375, "reward_std": 0.15227776765823364, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 684.7109375, "completions/mean_terminated_length": 684.7109375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.5499701288725783, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1008876255661059, "kl": 0.12646484375, "learning_rate": 1.0017870255867446e-05, "loss": 0.0082, "num_tokens": 774932321.0, "reward": 2.0498046875, "reward_std": 0.10753877460956573, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 678.548828125, "completions/mean_terminated_length": 678.548828125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5503115131859692, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1436962200141881, "kl": 0.126220703125, "learning_rate": 1.0005956754773992e-05, "loss": 0.0089, "num_tokens": 775358874.0, "reward": 2.10498046875, "reward_std": 0.17776523530483246, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 672.21875, "completions/mean_terminated_length": 672.21875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5506528974993599, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10410950577814565, "kl": 0.131591796875, "learning_rate": 9.994043245226013e-06, "loss": 0.0009, "num_tokens": 775783338.0, "reward": 2.06005859375, "reward_std": 0.10510721802711487, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 723.7734375, "completions/mean_terminated_length": 723.7734375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5509942818127507, "frac_reward_zero_std": 0.375, "grad_norm": 0.14002456421092122, "kl": 0.130859375, "learning_rate": 9.98212974413256e-06, "loss": 0.0191, "num_tokens": 776235942.0, "reward": 2.10693359375, "reward_std": 0.2235897332429886, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 724.982421875, "completions/mean_terminated_length": 724.982421875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5513356661261415, "frac_reward_zero_std": 0.65625, "grad_norm": 0.105772666314424, "kl": 0.128662109375, "learning_rate": 9.970216268402666e-06, "loss": 0.0121, "num_tokens": 776687181.0, "reward": 1.998046875, "reward_std": 0.1144820898771286, "rewards/accuracy_reward/mean": 0.016129031777381897, "rewards/accuracy_reward/std": 0.12609896063804626, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 796.013671875, "completions/mean_terminated_length": 796.013671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5516770504395323, "frac_reward_zero_std": 0.625, "grad_norm": 0.0971984587271969, "kl": 0.12060546875, "learning_rate": 9.958302834945331e-06, "loss": 0.0131, "num_tokens": 777174900.0, "reward": 2.080078125, "reward_std": 0.1409790813922882, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 742.369140625, "completions/mean_terminated_length": 742.369140625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5520184347529231, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1047295433704583, "kl": 0.12890625, "learning_rate": 9.946389460669497e-06, "loss": 0.015, "num_tokens": 777640113.0, "reward": 2.07666015625, "reward_std": 0.1414305418729782, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 732.509765625, "completions/mean_terminated_length": 732.509765625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5523598190663139, "frac_reward_zero_std": 0.75, "grad_norm": 0.10063104164423885, "kl": 0.13427734375, "learning_rate": 9.934476162484015e-06, "loss": 0.0161, "num_tokens": 778101750.0, "reward": 2.07421875, "reward_std": 0.0985575020313263, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 785.2578125, "completions/mean_terminated_length": 785.2578125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5527012033797047, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13914609254998553, "kl": 0.1260986328125, "learning_rate": 9.922562957297641e-06, "loss": 0.023, "num_tokens": 778582106.0, "reward": 2.0810546875, "reward_std": 0.19534572958946228, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 815.0859375, "completions/mean_terminated_length": 815.0859375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5530425876930956, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11633892850478152, "kl": 0.1219482421875, "learning_rate": 9.910649862018981e-06, "loss": 0.0154, "num_tokens": 779079030.0, "reward": 2.07861328125, "reward_std": 0.137151300907135, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 804.728515625, "completions/mean_terminated_length": 804.728515625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5533839720064863, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09809390386664892, "kl": 0.126220703125, "learning_rate": 9.898736893556502e-06, "loss": 0.0175, "num_tokens": 779570203.0, "reward": 2.03662109375, "reward_std": 0.11449846625328064, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 801.333984375, "completions/mean_terminated_length": 801.333984375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5537253563198771, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12097389978755424, "kl": 0.1168212890625, "learning_rate": 9.88682406881848e-06, "loss": 0.0265, "num_tokens": 780062806.0, "reward": 2.08154296875, "reward_std": 0.20175831019878387, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 827.732421875, "completions/mean_terminated_length": 827.732421875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5540667406332679, "frac_reward_zero_std": 0.5, "grad_norm": 0.1231749617933021, "kl": 0.1177978515625, "learning_rate": 9.874911404712992e-06, "loss": 0.0334, "num_tokens": 780568509.0, "reward": 2.06103515625, "reward_std": 0.18944120407104492, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.2846112847328186, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 796.94140625, "completions/mean_terminated_length": 796.94140625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5544081249466587, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12555111756727633, "kl": 0.124755859375, "learning_rate": 9.862998918147887e-06, "loss": 0.0337, "num_tokens": 781054767.0, "reward": 1.99853515625, "reward_std": 0.1314060091972351, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 801.390625, "completions/mean_terminated_length": 801.390625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5547495092600495, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12730757490734226, "kl": 0.1142578125, "learning_rate": 9.851086626030762e-06, "loss": 0.013, "num_tokens": 781541847.0, "reward": 2.13623046875, "reward_std": 0.21087008714675903, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 752.87890625, "completions/mean_terminated_length": 752.87890625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5550908935734403, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11597386902958189, "kl": 0.1197509765625, "learning_rate": 9.839174545268931e-06, "loss": 0.0164, "num_tokens": 782007801.0, "reward": 2.2060546875, "reward_std": 0.20308831334114075, "rewards/accuracy_reward/mean": 0.22177419066429138, "rewards/accuracy_reward/std": 0.4158594012260437, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 825.783203125, "completions/mean_terminated_length": 823.391357421875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5554322778868311, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11305450834735889, "kl": 0.1148681640625, "learning_rate": 9.827262692769417e-06, "loss": 0.0352, "num_tokens": 782527306.0, "reward": 2.0576171875, "reward_std": 0.15154173970222473, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 769.98046875, "completions/mean_terminated_length": 769.98046875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.555773662200222, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10465733565924064, "kl": 0.1240234375, "learning_rate": 9.81535108543892e-06, "loss": 0.0111, "num_tokens": 783013440.0, "reward": 2.08203125, "reward_std": 0.15751886367797852, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 759.859375, "completions/mean_terminated_length": 759.859375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5561150465136127, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12478630181363277, "kl": 0.11572265625, "learning_rate": 9.803439740183783e-06, "loss": 0.0339, "num_tokens": 783482136.0, "reward": 2.11962890625, "reward_std": 0.238993838429451, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 732.59765625, "completions/mean_terminated_length": 732.59765625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5564564308270035, "frac_reward_zero_std": 0.40625, "grad_norm": 0.143426549647957, "kl": 0.1229248046875, "learning_rate": 9.791528673909983e-06, "loss": 0.0426, "num_tokens": 783934554.0, "reward": 2.07421875, "reward_std": 0.23506218194961548, "rewards/accuracy_reward/mean": 0.10483870655298233, "rewards/accuracy_reward/std": 0.30665475130081177, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 844.654296875, "completions/mean_terminated_length": 842.2993774414062, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.5567978151403943, "frac_reward_zero_std": 0.5, "grad_norm": 0.1096129282354529, "kl": 0.1060791015625, "learning_rate": 9.7796179035231e-06, "loss": 0.0203, "num_tokens": 784460841.0, "reward": 2.046875, "reward_std": 0.18307781219482422, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 775.642578125, "completions/mean_terminated_length": 775.642578125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5571391994537851, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17138102269655062, "kl": 0.1170654296875, "learning_rate": 9.767707445928297e-06, "loss": 0.0131, "num_tokens": 784943858.0, "reward": 2.06689453125, "reward_std": 0.1374157816171646, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 731.681640625, "completions/mean_terminated_length": 731.681640625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5574805837671759, "frac_reward_zero_std": 0.625, "grad_norm": 0.0988810375719296, "kl": 0.1151123046875, "learning_rate": 9.755797318030286e-06, "loss": 0.0185, "num_tokens": 785395215.0, "reward": 2.09375, "reward_std": 0.13764315843582153, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 670.365234375, "completions/mean_terminated_length": 670.365234375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5578219680805667, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11937581310236918, "kl": 0.1181640625, "learning_rate": 9.743887536733316e-06, "loss": 0.0132, "num_tokens": 785824250.0, "reward": 2.12646484375, "reward_std": 0.15344035625457764, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 712.537109375, "completions/mean_terminated_length": 712.537109375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5581633523939575, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11936061086895655, "kl": 0.1175537109375, "learning_rate": 9.731978118941143e-06, "loss": 0.0166, "num_tokens": 786271693.0, "reward": 2.0322265625, "reward_std": 0.13666076958179474, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 742.775390625, "completions/mean_terminated_length": 742.775390625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5585047367073483, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0939484948908479, "kl": 0.115478515625, "learning_rate": 9.72006908155701e-06, "loss": 0.0022, "num_tokens": 786744778.0, "reward": 2.0703125, "reward_std": 0.1268659383058548, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 733.876953125, "completions/mean_terminated_length": 733.876953125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.558846121020739, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10517899346157816, "kl": 0.120361328125, "learning_rate": 9.708160441483615e-06, "loss": 0.0093, "num_tokens": 787219099.0, "reward": 2.0693359375, "reward_std": 0.10608866065740585, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 706.37109375, "completions/mean_terminated_length": 706.37109375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5591875053341299, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09811053614028524, "kl": 0.123046875, "learning_rate": 9.6962522156231e-06, "loss": 0.0087, "num_tokens": 787667673.0, "reward": 2.05517578125, "reward_std": 0.10575816035270691, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.24946178495883942, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 650.80078125, "completions/mean_terminated_length": 649.5792236328125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5595288896475207, "frac_reward_zero_std": 0.625, "grad_norm": 23.11036029622287, "kl": 5.0032958984375, "learning_rate": 9.684344420877002e-06, "loss": 0.2111, "num_tokens": 788075043.0, "reward": 2.0986328125, "reward_std": 0.15169894695281982, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 759.3984375, "completions/mean_terminated_length": 759.3984375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5598702739609115, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10808074451080839, "kl": 0.1219482421875, "learning_rate": 9.672437074146268e-06, "loss": 0.0051, "num_tokens": 788543423.0, "reward": 2.06640625, "reward_std": 0.1531359702348709, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 735.716796875, "completions/mean_terminated_length": 735.716796875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5602116582743023, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07183306036281858, "kl": 0.1201171875, "learning_rate": 9.660530192331193e-06, "loss": 0.0047, "num_tokens": 789007614.0, "reward": 2.0849609375, "reward_std": 0.07371994853019714, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 783.87109375, "completions/mean_terminated_length": 783.87109375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5605530425876931, "frac_reward_zero_std": 0.625, "grad_norm": 0.09355296709907744, "kl": 0.1168212890625, "learning_rate": 9.64862379233142e-06, "loss": 0.0121, "num_tokens": 789492044.0, "reward": 2.09521484375, "reward_std": 0.15588593482971191, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 764.77734375, "completions/mean_terminated_length": 764.77734375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5608944269010839, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09277708952669657, "kl": 0.116455078125, "learning_rate": 9.636717891045908e-06, "loss": 0.0029, "num_tokens": 789969450.0, "reward": 2.08544921875, "reward_std": 0.11495085805654526, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 817.48046875, "completions/mean_terminated_length": 817.48046875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.5612358112144747, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09218372405784835, "kl": 0.11474609375, "learning_rate": 9.624812505372907e-06, "loss": 0.0153, "num_tokens": 790476288.0, "reward": 2.03125, "reward_std": 0.0933043360710144, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 777.34765625, "completions/mean_terminated_length": 777.34765625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5615771955278654, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1161326497584426, "kl": 0.12109375, "learning_rate": 9.612907652209928e-06, "loss": 0.011, "num_tokens": 790964338.0, "reward": 2.1396484375, "reward_std": 0.19429704546928406, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 763.646484375, "completions/mean_terminated_length": 763.646484375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5619185798412563, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12963018893439643, "kl": 0.1201171875, "learning_rate": 9.601003348453735e-06, "loss": -0.0027, "num_tokens": 791431597.0, "reward": 2.060546875, "reward_std": 0.1765592098236084, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 839.34765625, "completions/mean_terminated_length": 839.34765625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.5622599641546471, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12291552395044374, "kl": 0.12060546875, "learning_rate": 9.589099611000313e-06, "loss": 0.0132, "num_tokens": 791939471.0, "reward": 2.05908203125, "reward_std": 0.18435361981391907, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 776.15625, "completions/mean_terminated_length": 776.15625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5626013484680379, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08486284769140123, "kl": 0.11865234375, "learning_rate": 9.577196456744839e-06, "loss": 0.0089, "num_tokens": 792429599.0, "reward": 2.15234375, "reward_std": 0.12961986660957336, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 808.634765625, "completions/mean_terminated_length": 808.634765625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5629427327814287, "frac_reward_zero_std": 0.625, "grad_norm": 0.1043985653609575, "kl": 0.12109375, "learning_rate": 9.565293902581662e-06, "loss": 0.0143, "num_tokens": 792922596.0, "reward": 2.06396484375, "reward_std": 0.15178486704826355, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 733.001953125, "completions/mean_terminated_length": 732.4266357421875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5632841170948195, "frac_reward_zero_std": 0.71875, "grad_norm": 0.20012051404073092, "kl": 0.314208984375, "learning_rate": 9.553391965404287e-06, "loss": 0.0232, "num_tokens": 793376773.0, "reward": 2.0947265625, "reward_std": 0.10865526646375656, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 750.3671875, "completions/mean_terminated_length": 747.8277587890625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5636255014082103, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10316184024419989, "kl": 0.119140625, "learning_rate": 9.541490662105325e-06, "loss": 0.0299, "num_tokens": 793836465.0, "reward": 2.06298828125, "reward_std": 0.14729109406471252, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 791.51953125, "completions/mean_terminated_length": 789.0606689453125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.5639668857216011, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09498993932711947, "kl": 0.1201171875, "learning_rate": 9.529590009576507e-06, "loss": 0.0181, "num_tokens": 794327755.0, "reward": 2.0302734375, "reward_std": 0.11396025121212006, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 770.79296875, "completions/mean_terminated_length": 770.79296875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5643082700349918, "frac_reward_zero_std": 0.71875, "grad_norm": 0.10348929237714999, "kl": 0.118408203125, "learning_rate": 9.517690024708628e-06, "loss": 0.0083, "num_tokens": 794804865.0, "reward": 2.0322265625, "reward_std": 0.06926766037940979, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 734.51171875, "completions/mean_terminated_length": 734.51171875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5646496543483827, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12073707228237736, "kl": 0.120849609375, "learning_rate": 9.505790724391549e-06, "loss": 0.0173, "num_tokens": 795265127.0, "reward": 2.0615234375, "reward_std": 0.16098937392234802, "rewards/accuracy_reward/mean": 0.08467742055654526, "rewards/accuracy_reward/std": 0.278682142496109, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 733.689453125, "completions/mean_terminated_length": 733.689453125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5649910386617735, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12692842626831344, "kl": 0.1226806640625, "learning_rate": 9.493892125514142e-06, "loss": 0.0072, "num_tokens": 795719432.0, "reward": 2.0634765625, "reward_std": 0.15719586610794067, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 772.3125, "completions/mean_terminated_length": 772.3125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5653324229751643, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09486035081838114, "kl": 0.1107177734375, "learning_rate": 9.481994244964297e-06, "loss": 0.0179, "num_tokens": 796195912.0, "reward": 2.0615234375, "reward_std": 0.1088767871260643, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 718.712890625, "completions/mean_terminated_length": 718.712890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5656738072885551, "frac_reward_zero_std": 0.625, "grad_norm": 0.15207879026986051, "kl": 0.119140625, "learning_rate": 9.470097099628875e-06, "loss": 0.0247, "num_tokens": 796651557.0, "reward": 2.10302734375, "reward_std": 0.13330230116844177, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 735.05078125, "completions/mean_terminated_length": 733.9002075195312, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.5660151916019459, "frac_reward_zero_std": 0.53125, "grad_norm": 0.317096455912214, "kl": 0.16357421875, "learning_rate": 9.4582007063937e-06, "loss": 0.0159, "num_tokens": 797115375.0, "reward": 2.09033203125, "reward_std": 0.17535096406936646, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.3092404901981354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 799.478515625, "completions/mean_terminated_length": 799.478515625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5663565759153367, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08830255016824495, "kl": 0.1151123046875, "learning_rate": 9.446305082143524e-06, "loss": 0.016, "num_tokens": 797610756.0, "reward": 2.00732421875, "reward_std": 0.09046171605587006, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 751.8515625, "completions/mean_terminated_length": 751.8515625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5666979602287275, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07249893668894086, "kl": 0.10986328125, "learning_rate": 9.434410243762011e-06, "loss": 0.013, "num_tokens": 798074808.0, "reward": 2.03173828125, "reward_std": 0.07734845578670502, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 689.4453125, "completions/mean_terminated_length": 689.4453125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5670393445421182, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13462495476308953, "kl": 0.12255859375, "learning_rate": 9.42251620813171e-06, "loss": 0.0233, "num_tokens": 798515340.0, "reward": 2.1474609375, "reward_std": 0.2175661325454712, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06218579038977623, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 693.650390625, "completions/mean_terminated_length": 693.650390625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.567380728855509, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13545186247000626, "kl": 0.1251220703125, "learning_rate": 9.410622992134033e-06, "loss": 0.0292, "num_tokens": 798950921.0, "reward": 2.0830078125, "reward_std": 0.1854587346315384, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 752.267578125, "completions/mean_terminated_length": 749.7318725585938, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5677221131688999, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11233867727526971, "kl": 0.1171875, "learning_rate": 9.398730612649214e-06, "loss": 0.0309, "num_tokens": 799423538.0, "reward": 2.09814453125, "reward_std": 0.1713787019252777, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 673.607421875, "completions/mean_terminated_length": 673.607421875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5680634974822907, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13843419835538467, "kl": 0.1165771484375, "learning_rate": 9.38683908655632e-06, "loss": 0.0373, "num_tokens": 799862521.0, "reward": 2.1904296875, "reward_std": 0.28731757402420044, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 684.84375, "completions/mean_terminated_length": 684.84375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5684048817956815, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11523222282138823, "kl": 0.125244140625, "learning_rate": 9.374948430733191e-06, "loss": 0.0139, "num_tokens": 800284873.0, "reward": 2.0849609375, "reward_std": 0.18187373876571655, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 733.65234375, "completions/mean_terminated_length": 733.65234375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5687462661090723, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12117786058693565, "kl": 0.1170654296875, "learning_rate": 9.363058662056444e-06, "loss": 0.0184, "num_tokens": 800740759.0, "reward": 2.05517578125, "reward_std": 0.19659431278705597, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756476104259491, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 705.240234375, "completions/mean_terminated_length": 705.240234375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5690876504224631, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11229387073835292, "kl": 0.1180419921875, "learning_rate": 9.351169797401426e-06, "loss": 0.0229, "num_tokens": 801181858.0, "reward": 2.07958984375, "reward_std": 0.16173617541790009, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 692.0390625, "completions/mean_terminated_length": 690.8590698242188, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.5694290347358539, "frac_reward_zero_std": 0.59375, "grad_norm": 0.14176341285500807, "kl": 0.20849609375, "learning_rate": 9.339281853642207e-06, "loss": 0.0261, "num_tokens": 801633158.0, "reward": 2.029296875, "reward_std": 0.13773730397224426, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 677.255859375, "completions/mean_terminated_length": 677.255859375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5697704190492446, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12836324860792983, "kl": 0.1219482421875, "learning_rate": 9.327394847651556e-06, "loss": 0.0087, "num_tokens": 802057497.0, "reward": 2.12109375, "reward_std": 0.1478273570537567, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 814.451171875, "completions/mean_terminated_length": 814.451171875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5701118033626354, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09861643176681606, "kl": 0.1119384765625, "learning_rate": 9.315508796300893e-06, "loss": 0.0228, "num_tokens": 802559360.0, "reward": 2.07177734375, "reward_std": 0.15867668390274048, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 723.271484375, "completions/mean_terminated_length": 723.271484375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5704531876760263, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13397199552074052, "kl": 0.1221923828125, "learning_rate": 9.303623716460297e-06, "loss": 0.0141, "num_tokens": 803012923.0, "reward": 2.1279296875, "reward_std": 0.19599603116512299, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 752.43359375, "completions/mean_terminated_length": 752.43359375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.5707945719894171, "frac_reward_zero_std": 0.625, "grad_norm": 0.10097594329032793, "kl": 0.1187744140625, "learning_rate": 9.291739624998469e-06, "loss": 0.0091, "num_tokens": 803483977.0, "reward": 2.0771484375, "reward_std": 0.15199580788612366, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.284611314535141, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 801.951171875, "completions/mean_terminated_length": 801.951171875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5711359563028079, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10898937595987103, "kl": 0.1134033203125, "learning_rate": 9.2798565387827e-06, "loss": 0.0163, "num_tokens": 803983120.0, "reward": 2.080078125, "reward_std": 0.15042853355407715, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 760.044921875, "completions/mean_terminated_length": 760.044921875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.5714773406161987, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09740599431457746, "kl": 0.1123046875, "learning_rate": 9.267974474678857e-06, "loss": 0.0114, "num_tokens": 804453303.0, "reward": 2.13671875, "reward_std": 0.12738411128520966, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1435.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 749.42578125, "completions/mean_terminated_length": 748.26611328125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5718187249295895, "frac_reward_zero_std": 0.5, "grad_norm": 0.8265220851841555, "kl": 0.45849609375, "learning_rate": 9.256093449551361e-06, "loss": 0.0284, "num_tokens": 804922113.0, "reward": 2.1396484375, "reward_std": 0.19800598919391632, "rewards/accuracy_reward/mean": 0.16458334028720856, "rewards/accuracy_reward/std": 0.37119096517562866, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 783.240234375, "completions/mean_terminated_length": 783.240234375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5721601092429803, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12688455819900799, "kl": 0.120849609375, "learning_rate": 9.244213480263149e-06, "loss": 0.0243, "num_tokens": 805410556.0, "reward": 2.07373046875, "reward_std": 0.16834717988967896, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.039800092577934265, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 829.732421875, "completions/mean_terminated_length": 829.732421875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.572501493556371, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11684256243591995, "kl": 0.1171875, "learning_rate": 9.232334583675667e-06, "loss": 0.0124, "num_tokens": 805915747.0, "reward": 2.12109375, "reward_std": 0.15199978649616241, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 807.470703125, "completions/mean_terminated_length": 807.470703125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5728428778697618, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11780207016032314, "kl": 0.1131591796875, "learning_rate": 9.220456776648833e-06, "loss": 0.0116, "num_tokens": 806409044.0, "reward": 2.1025390625, "reward_std": 0.13376571238040924, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 868.337890625, "completions/mean_terminated_length": 868.337890625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5731842621831527, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10255040039297131, "kl": 0.1109619140625, "learning_rate": 9.208580076041026e-06, "loss": 0.0288, "num_tokens": 806944865.0, "reward": 2.03515625, "reward_std": 0.1338701844215393, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 876.1875, "completions/mean_terminated_length": 876.1875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5735256464965435, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10867855802861194, "kl": 0.107421875, "learning_rate": 9.196704498709049e-06, "loss": 0.0195, "num_tokens": 807473537.0, "reward": 2.0810546875, "reward_std": 0.1892920732498169, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 830.318359375, "completions/mean_terminated_length": 827.9354248046875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5738670308099343, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11170768703189375, "kl": 0.1129150390625, "learning_rate": 9.184830061508113e-06, "loss": 0.026, "num_tokens": 807988612.0, "reward": 2.0615234375, "reward_std": 0.1414109319448471, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 835.150390625, "completions/mean_terminated_length": 835.150390625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5742084151233251, "frac_reward_zero_std": 0.625, "grad_norm": 0.09415313160409361, "kl": 0.1136474609375, "learning_rate": 9.172956781291804e-06, "loss": 0.0187, "num_tokens": 808496609.0, "reward": 2.05859375, "reward_std": 0.14648395776748657, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 820.24609375, "completions/mean_terminated_length": 820.24609375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5745497994367159, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08733451932873976, "kl": 0.117919921875, "learning_rate": 9.161084674912079e-06, "loss": 0.0139, "num_tokens": 809002559.0, "reward": 2.056640625, "reward_std": 0.09198889881372452, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 802.7109375, "completions/mean_terminated_length": 802.7109375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5748911837501067, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11259016351730382, "kl": 0.1187744140625, "learning_rate": 9.149213759219216e-06, "loss": 0.0216, "num_tokens": 809491099.0, "reward": 2.115234375, "reward_std": 0.20855185389518738, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 729.064453125, "completions/mean_terminated_length": 729.064453125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5752325680634974, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09806397107771184, "kl": 0.1240234375, "learning_rate": 9.137344051061811e-06, "loss": 0.014, "num_tokens": 809943724.0, "reward": 2.12744140625, "reward_std": 0.13795016705989838, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 720.08984375, "completions/mean_terminated_length": 720.08984375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5755739523768882, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13192701727691764, "kl": 0.120361328125, "learning_rate": 9.125475567286744e-06, "loss": 0.0225, "num_tokens": 810393866.0, "reward": 2.13134765625, "reward_std": 0.20999017357826233, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 727.087890625, "completions/mean_terminated_length": 727.087890625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5759153366902791, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11841994270042469, "kl": 0.12646484375, "learning_rate": 9.11360832473916e-06, "loss": 0.0105, "num_tokens": 810851207.0, "reward": 2.03125, "reward_std": 0.15582986176013947, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 720.59375, "completions/mean_terminated_length": 720.59375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5762567210036699, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1291113992742585, "kl": 0.12255859375, "learning_rate": 9.101742340262431e-06, "loss": 0.0193, "num_tokens": 811322071.0, "reward": 2.056640625, "reward_std": 0.1552235335111618, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 709.869140625, "completions/mean_terminated_length": 709.869140625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5765981053170607, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1182906709799707, "kl": 0.1209716796875, "learning_rate": 9.089877630698159e-06, "loss": 0.0135, "num_tokens": 811772484.0, "reward": 2.0595703125, "reward_std": 0.15204128623008728, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 718.642578125, "completions/mean_terminated_length": 718.642578125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5769394896304515, "frac_reward_zero_std": 0.625, "grad_norm": 0.11824258085461084, "kl": 0.1248779296875, "learning_rate": 9.078014212886126e-06, "loss": 0.0184, "num_tokens": 812218525.0, "reward": 2.0849609375, "reward_std": 0.13960830867290497, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 709.98828125, "completions/mean_terminated_length": 709.98828125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5772808739438423, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12182591055752902, "kl": 0.124755859375, "learning_rate": 9.066152103664283e-06, "loss": 0.012, "num_tokens": 812663703.0, "reward": 2.0380859375, "reward_std": 0.16101428866386414, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 649.916015625, "completions/mean_terminated_length": 649.916015625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5776222582572331, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14040639342471148, "kl": 0.1279296875, "learning_rate": 9.054291319868727e-06, "loss": 0.0134, "num_tokens": 813072748.0, "reward": 2.10400390625, "reward_std": 0.20981869101524353, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 642.03125, "completions/mean_terminated_length": 642.03125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5779636425706238, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13141071001505225, "kl": 0.127685546875, "learning_rate": 9.04243187833367e-06, "loss": 0.0191, "num_tokens": 813480460.0, "reward": 2.115234375, "reward_std": 0.18674710392951965, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 713.8671875, "completions/mean_terminated_length": 713.8671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5783050268840146, "frac_reward_zero_std": 0.625, "grad_norm": 0.11199164874828693, "kl": 0.123291015625, "learning_rate": 9.030573795891421e-06, "loss": 0.0033, "num_tokens": 813936696.0, "reward": 2.041015625, "reward_std": 0.11918123066425323, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 739.640625, "completions/mean_terminated_length": 739.640625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5786464111974055, "frac_reward_zero_std": 0.75, "grad_norm": 0.08078067070953382, "kl": 0.1202392578125, "learning_rate": 9.018717089372356e-06, "loss": 0.0185, "num_tokens": 814399120.0, "reward": 2.00927734375, "reward_std": 0.06483898311853409, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 662.390625, "completions/mean_terminated_length": 662.390625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5789877955107963, "frac_reward_zero_std": 0.46875, "grad_norm": 0.14321450898060561, "kl": 0.12646484375, "learning_rate": 9.006861775604905e-06, "loss": 0.0195, "num_tokens": 814814552.0, "reward": 2.072265625, "reward_std": 0.17717982828617096, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 743.18359375, "completions/mean_terminated_length": 743.18359375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5793291798241871, "frac_reward_zero_std": 0.625, "grad_norm": 0.10545498962114194, "kl": 0.1175537109375, "learning_rate": 8.995007871415514e-06, "loss": 0.0165, "num_tokens": 815285398.0, "reward": 2.04833984375, "reward_std": 0.14210116863250732, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 721.8828125, "completions/mean_terminated_length": 721.8828125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5796705641375779, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09824172704402405, "kl": 0.11181640625, "learning_rate": 8.983155393628636e-06, "loss": 0.0147, "num_tokens": 815741498.0, "reward": 2.06201171875, "reward_std": 0.1546829491853714, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 738.404296875, "completions/mean_terminated_length": 738.404296875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5800119484509687, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1282097360145906, "kl": 0.116455078125, "learning_rate": 8.971304359066695e-06, "loss": 0.0123, "num_tokens": 816193481.0, "reward": 2.10986328125, "reward_std": 0.1631832867860794, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1951.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 793.787109375, "completions/mean_terminated_length": 793.787109375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5803533327643595, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10403835705370625, "kl": 0.11181640625, "learning_rate": 8.959454784550063e-06, "loss": 0.02, "num_tokens": 816694140.0, "reward": 2.064453125, "reward_std": 0.15747001767158508, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 791.525390625, "completions/mean_terminated_length": 791.525390625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5806947170777502, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1201320023062985, "kl": 0.115966796875, "learning_rate": 8.947606686897046e-06, "loss": 0.0146, "num_tokens": 817180729.0, "reward": 2.06201171875, "reward_std": 0.19902411103248596, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 796.625, "completions/mean_terminated_length": 796.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.581036101391141, "frac_reward_zero_std": 0.625, "grad_norm": 0.09985502846023117, "kl": 0.111328125, "learning_rate": 8.935760082923852e-06, "loss": 0.0147, "num_tokens": 817664617.0, "reward": 2.1083984375, "reward_std": 0.1534242033958435, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 802.353515625, "completions/mean_terminated_length": 802.353515625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5813774857045318, "frac_reward_zero_std": 0.5, "grad_norm": 0.11491571793807953, "kl": 0.1087646484375, "learning_rate": 8.923914989444567e-06, "loss": 0.0229, "num_tokens": 818157534.0, "reward": 2.10107421875, "reward_std": 0.1618068516254425, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 783.3359375, "completions/mean_terminated_length": 783.3359375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5817188700179227, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11761492577518433, "kl": 0.115478515625, "learning_rate": 8.912071423271135e-06, "loss": 0.0331, "num_tokens": 818637146.0, "reward": 2.04736328125, "reward_std": 0.14775046706199646, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 754.287109375, "completions/mean_terminated_length": 754.287109375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5820602543313135, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11844293801719137, "kl": 0.1142578125, "learning_rate": 8.900229401213335e-06, "loss": 0.0155, "num_tokens": 819104557.0, "reward": 2.1923828125, "reward_std": 0.197993665933609, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 789.75, "completions/mean_terminated_length": 789.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5824016386447043, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0898249371435702, "kl": 0.111328125, "learning_rate": 8.88838894007875e-06, "loss": 0.0151, "num_tokens": 819594253.0, "reward": 2.09521484375, "reward_std": 0.12151609361171722, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 754.564453125, "completions/mean_terminated_length": 752.0332641601562, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5827430229580951, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13164274778786497, "kl": 0.11865234375, "learning_rate": 8.876550056672747e-06, "loss": 0.023, "num_tokens": 820065422.0, "reward": 2.11474609375, "reward_std": 0.21703064441680908, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 773.16796875, "completions/mean_terminated_length": 773.16796875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5830844072714859, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12848909650895707, "kl": 0.11328125, "learning_rate": 8.864712767798458e-06, "loss": 0.012, "num_tokens": 820549140.0, "reward": 2.1318359375, "reward_std": 0.22809526324272156, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 785.712890625, "completions/mean_terminated_length": 783.24267578125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5834257915848767, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10063009471174734, "kl": 0.1151123046875, "learning_rate": 8.852877090256746e-06, "loss": 0.0258, "num_tokens": 821040433.0, "reward": 2.07275390625, "reward_std": 0.13435080647468567, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 745.029296875, "completions/mean_terminated_length": 745.029296875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5837671758982674, "frac_reward_zero_std": 0.5, "grad_norm": 0.11518879951098925, "kl": 0.1043701171875, "learning_rate": 8.841043040846192e-06, "loss": 0.0148, "num_tokens": 821504272.0, "reward": 2.0849609375, "reward_std": 0.16349591314792633, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 765.53125, "completions/mean_terminated_length": 765.53125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5841085602116582, "frac_reward_zero_std": 0.625, "grad_norm": 0.09451378831472586, "kl": 0.1180419921875, "learning_rate": 8.829210636363067e-06, "loss": 0.0119, "num_tokens": 821984288.0, "reward": 2.05322265625, "reward_std": 0.14123386144638062, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 750.9375, "completions/mean_terminated_length": 750.9375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5844499445250491, "frac_reward_zero_std": 0.65625, "grad_norm": 0.12738871978509012, "kl": 0.1182861328125, "learning_rate": 8.817379893601308e-06, "loss": 0.0269, "num_tokens": 822445008.0, "reward": 2.12255859375, "reward_std": 0.12459851801395416, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 734.75, "completions/mean_terminated_length": 734.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5847913288384399, "frac_reward_zero_std": 0.5, "grad_norm": 0.12548745956198776, "kl": 0.1199951171875, "learning_rate": 8.805550829352479e-06, "loss": 0.024, "num_tokens": 822929200.0, "reward": 2.1044921875, "reward_std": 0.19857114553451538, "rewards/accuracy_reward/mean": 0.13306452333927155, "rewards/accuracy_reward/std": 0.3399873673915863, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 720.48828125, "completions/mean_terminated_length": 720.48828125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5851327131518307, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13067347820705813, "kl": 0.1124267578125, "learning_rate": 8.793723460405781e-06, "loss": 0.0225, "num_tokens": 823382234.0, "reward": 2.15576171875, "reward_std": 0.19116157293319702, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 727.630859375, "completions/mean_terminated_length": 727.630859375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5854740974652215, "frac_reward_zero_std": 0.75, "grad_norm": 0.08867211754113792, "kl": 0.1143798828125, "learning_rate": 8.781897803548e-06, "loss": 0.0119, "num_tokens": 823832781.0, "reward": 2.05029296875, "reward_std": 0.0885101854801178, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 759.544921875, "completions/mean_terminated_length": 759.544921875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5858154817786123, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10103454978232654, "kl": 0.1099853515625, "learning_rate": 8.770073875563494e-06, "loss": 0.0155, "num_tokens": 824302868.0, "reward": 2.048828125, "reward_std": 0.10810519009828568, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 726.15625, "completions/mean_terminated_length": 725.624267578125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5861568660920031, "frac_reward_zero_std": 0.375, "grad_norm": 0.4307244808372051, "kl": 0.1162109375, "learning_rate": 8.758251693234166e-06, "loss": 0.0222, "num_tokens": 824758260.0, "reward": 2.06640625, "reward_std": 0.2200247347354889, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 674.91015625, "completions/mean_terminated_length": 673.7827758789062, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5864982504053938, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3347347894823114, "kl": 0.180908203125, "learning_rate": 8.746431273339443e-06, "loss": 0.0198, "num_tokens": 825183990.0, "reward": 2.08447265625, "reward_std": 0.11459419876337051, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 735.595703125, "completions/mean_terminated_length": 735.595703125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5868396347187846, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11394966633257567, "kl": 0.1063232421875, "learning_rate": 8.734612632656242e-06, "loss": 0.0244, "num_tokens": 825640743.0, "reward": 2.083984375, "reward_std": 0.12100543081760406, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 622.619140625, "completions/mean_terminated_length": 622.619140625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5871810190321755, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11762930078090475, "kl": 0.1209716796875, "learning_rate": 8.722795787958966e-06, "loss": 0.0175, "num_tokens": 826038036.0, "reward": 2.123046875, "reward_std": 0.15910661220550537, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 660.435546875, "completions/mean_terminated_length": 660.435546875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5875224033455663, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11478614313649892, "kl": 0.1190185546875, "learning_rate": 8.710980756019468e-06, "loss": 0.0265, "num_tokens": 826453971.0, "reward": 2.14013671875, "reward_std": 0.14428414404392242, "rewards/accuracy_reward/mean": 0.1552419364452362, "rewards/accuracy_reward/std": 0.36250078678131104, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 635.072265625, "completions/mean_terminated_length": 635.072265625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5878637876589571, "frac_reward_zero_std": 0.625, "grad_norm": 0.1317228241202134, "kl": 0.128173828125, "learning_rate": 8.69916755360702e-06, "loss": 0.0284, "num_tokens": 826863992.0, "reward": 2.0712890625, "reward_std": 0.11790765076875687, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 665.361328125, "completions/mean_terminated_length": 665.361328125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5882051719723479, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12302024509847963, "kl": 0.121337890625, "learning_rate": 8.687356197488305e-06, "loss": 0.0135, "num_tokens": 827297729.0, "reward": 2.05078125, "reward_std": 0.13364717364311218, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 683.296875, "completions/mean_terminated_length": 683.296875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5885465562857387, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08792858453091941, "kl": 0.12158203125, "learning_rate": 8.675546704427386e-06, "loss": 0.0145, "num_tokens": 827743497.0, "reward": 2.01123046875, "reward_std": 0.07496071606874466, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 671.056640625, "completions/mean_terminated_length": 669.5784912109375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5888879405991295, "frac_reward_zero_std": 0.53125, "grad_norm": 0.5285896883824923, "kl": 0.49365234375, "learning_rate": 8.663739091185668e-06, "loss": 0.0434, "num_tokens": 828163254.0, "reward": 2.009765625, "reward_std": 0.15644419193267822, "rewards/accuracy_reward/mean": 0.038306452333927155, "rewards/accuracy_reward/std": 0.19212883710861206, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 679.318359375, "completions/mean_terminated_length": 679.318359375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5892293249125202, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11312768396333188, "kl": 0.1146240234375, "learning_rate": 8.651933374521907e-06, "loss": -0.0007, "num_tokens": 828594169.0, "reward": 2.1279296875, "reward_std": 0.14746734499931335, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 673.212890625, "completions/mean_terminated_length": 673.212890625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.589570709225911, "frac_reward_zero_std": 0.75, "grad_norm": 0.09426597280676542, "kl": 0.1180419921875, "learning_rate": 8.640129571192155e-06, "loss": 0.0107, "num_tokens": 829024134.0, "reward": 2.04052734375, "reward_std": 0.08832971751689911, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 660.369140625, "completions/mean_terminated_length": 660.369140625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5899120935393019, "frac_reward_zero_std": 0.5, "grad_norm": 0.1444491550002601, "kl": 0.1220703125, "learning_rate": 8.628327697949752e-06, "loss": 0.0316, "num_tokens": 829436995.0, "reward": 2.0205078125, "reward_std": 0.16018596291542053, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.2147994488477707, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 670.33203125, "completions/mean_terminated_length": 670.33203125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.5902534778526927, "frac_reward_zero_std": 0.625, "grad_norm": 0.12034844763344131, "kl": 0.120361328125, "learning_rate": 8.616527771545302e-06, "loss": 0.0184, "num_tokens": 829875933.0, "reward": 2.09619140625, "reward_std": 0.13374103605747223, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 722.615234375, "completions/mean_terminated_length": 722.615234375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5905948621660835, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0920056327401717, "kl": 0.1165771484375, "learning_rate": 8.604729808726643e-06, "loss": 0.0157, "num_tokens": 830330056.0, "reward": 2.0625, "reward_std": 0.09287214279174805, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 762.130859375, "completions/mean_terminated_length": 762.130859375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5909362464794743, "frac_reward_zero_std": 0.625, "grad_norm": 0.10189903567232457, "kl": 0.1129150390625, "learning_rate": 8.592933826238818e-06, "loss": 0.0086, "num_tokens": 830801643.0, "reward": 2.0595703125, "reward_std": 0.11605371534824371, "rewards/accuracy_reward/mean": 0.07459677755832672, "rewards/accuracy_reward/std": 0.263004869222641, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 743.70703125, "completions/mean_terminated_length": 743.70703125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.5912776307928651, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13478441692036364, "kl": 0.12109375, "learning_rate": 8.58113984082407e-06, "loss": 0.011, "num_tokens": 831267813.0, "reward": 2.1025390625, "reward_std": 0.18597349524497986, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 756.228515625, "completions/mean_terminated_length": 756.228515625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5916190151062559, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12080441489289528, "kl": 0.1064453125, "learning_rate": 8.569347869221805e-06, "loss": 0.028, "num_tokens": 831736042.0, "reward": 2.06201171875, "reward_std": 0.1543799489736557, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 795.259765625, "completions/mean_terminated_length": 795.259765625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.5919603994196466, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11915405349535889, "kl": 0.112548828125, "learning_rate": 8.557557928168568e-06, "loss": 0.021, "num_tokens": 832227951.0, "reward": 2.0869140625, "reward_std": 0.18059974908828735, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04666558653116226, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 724.201171875, "completions/mean_terminated_length": 724.201171875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.5923017837330374, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12099636267002283, "kl": 0.12646484375, "learning_rate": 8.545770034398028e-06, "loss": 0.0157, "num_tokens": 832675622.0, "reward": 2.03662109375, "reward_std": 0.13372144103050232, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 827.517578125, "completions/mean_terminated_length": 827.517578125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.5926431680464282, "frac_reward_zero_std": 0.71875, "grad_norm": 0.07725640355456596, "kl": 0.1124267578125, "learning_rate": 8.533984204640942e-06, "loss": 0.0085, "num_tokens": 833178863.0, "reward": 2.068359375, "reward_std": 0.10778524726629257, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 784.2421875, "completions/mean_terminated_length": 784.2421875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.5929845523598191, "frac_reward_zero_std": 0.375, "grad_norm": 0.13640839283363698, "kl": 0.115478515625, "learning_rate": 8.522200455625144e-06, "loss": 0.0371, "num_tokens": 833656443.0, "reward": 2.0888671875, "reward_std": 0.22461305558681488, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 776.7421875, "completions/mean_terminated_length": 776.7421875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5933259366732099, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10341200428634172, "kl": 0.111572265625, "learning_rate": 8.5104188040755e-06, "loss": 0.018, "num_tokens": 834134535.0, "reward": 2.052734375, "reward_std": 0.12303133308887482, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 791.220703125, "completions/mean_terminated_length": 791.220703125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5936673209866007, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08710673439732855, "kl": 0.11279296875, "learning_rate": 8.498639266713918e-06, "loss": 0.0109, "num_tokens": 834619880.0, "reward": 2.041015625, "reward_std": 0.0985880047082901, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 768.755859375, "completions/mean_terminated_length": 768.755859375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5940087052999915, "frac_reward_zero_std": 0.5, "grad_norm": 0.14155622240518898, "kl": 0.119384765625, "learning_rate": 8.486861860259294e-06, "loss": 0.029, "num_tokens": 835097115.0, "reward": 2.1103515625, "reward_std": 0.18538478016853333, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 740.830078125, "completions/mean_terminated_length": 740.830078125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5943500896133823, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11820814729284025, "kl": 0.1160888671875, "learning_rate": 8.475086601427499e-06, "loss": 0.0058, "num_tokens": 835558900.0, "reward": 2.107421875, "reward_std": 0.1810828000307083, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 820.1328125, "completions/mean_terminated_length": 820.1328125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.594691473926773, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10305443613493172, "kl": 0.1151123046875, "learning_rate": 8.463313506931365e-06, "loss": 0.0141, "num_tokens": 836063528.0, "reward": 2.00146484375, "reward_std": 0.1028485894203186, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 762.73046875, "completions/mean_terminated_length": 762.73046875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5950328582401638, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10534970063828257, "kl": 0.1124267578125, "learning_rate": 8.45154259348065e-06, "loss": 0.0207, "num_tokens": 836526558.0, "reward": 2.0849609375, "reward_std": 0.15452617406845093, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 750.92578125, "completions/mean_terminated_length": 750.0880737304688, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5953742425535546, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11208011732379371, "kl": 0.1239013671875, "learning_rate": 8.439773877782001e-06, "loss": 0.0161, "num_tokens": 837000456.0, "reward": 2.05078125, "reward_std": 0.15443411469459534, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 757.64453125, "completions/mean_terminated_length": 757.64453125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5957156268669455, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11891461956855327, "kl": 0.1158447265625, "learning_rate": 8.428007376538963e-06, "loss": 0.0098, "num_tokens": 837473922.0, "reward": 2.04931640625, "reward_std": 0.11393091082572937, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 839.38671875, "completions/mean_terminated_length": 839.38671875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.5960570111803363, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09051857261698004, "kl": 0.1094970703125, "learning_rate": 8.416243106451933e-06, "loss": 0.0016, "num_tokens": 837985432.0, "reward": 2.04443359375, "reward_std": 0.09567949175834656, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 753.1875, "completions/mean_terminated_length": 753.1875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5963983954937271, "frac_reward_zero_std": 0.625, "grad_norm": 0.09379823336632787, "kl": 0.1165771484375, "learning_rate": 8.404481084218142e-06, "loss": 0.0083, "num_tokens": 838455960.0, "reward": 2.0576171875, "reward_std": 0.14829078316688538, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 732.455078125, "completions/mean_terminated_length": 729.880615234375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5967397798071179, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12173062194834276, "kl": 0.113037109375, "learning_rate": 8.392721326531624e-06, "loss": 0.0278, "num_tokens": 838912561.0, "reward": 2.115234375, "reward_std": 0.17295482754707336, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 765.279296875, "completions/mean_terminated_length": 762.7691040039062, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5970811641205087, "frac_reward_zero_std": 0.5, "grad_norm": 0.6130110369878674, "kl": 0.12060546875, "learning_rate": 8.380963850083213e-06, "loss": 0.015, "num_tokens": 839383152.0, "reward": 2.02685546875, "reward_std": 0.1556621938943863, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 737.33984375, "completions/mean_terminated_length": 737.33984375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.5974225484338994, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09629067137148088, "kl": 0.1175537109375, "learning_rate": 8.369208671560489e-06, "loss": 0.0045, "num_tokens": 839839198.0, "reward": 2.08837890625, "reward_std": 0.11453113704919815, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 743.5625, "completions/mean_terminated_length": 743.5625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5977639327472902, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1127819435567702, "kl": 0.10986328125, "learning_rate": 8.357455807647778e-06, "loss": 0.0275, "num_tokens": 840298142.0, "reward": 2.123046875, "reward_std": 0.1748436689376831, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 745.740234375, "completions/mean_terminated_length": 745.740234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.598105317060681, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1125621031244264, "kl": 0.1160888671875, "learning_rate": 8.345705275026124e-06, "loss": 0.0108, "num_tokens": 840757113.0, "reward": 2.10986328125, "reward_std": 0.18233756721019745, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 775.6328125, "completions/mean_terminated_length": 775.6328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5984467013740719, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1120434897059448, "kl": 0.104248046875, "learning_rate": 8.333957090373256e-06, "loss": 0.0373, "num_tokens": 841247981.0, "reward": 2.08740234375, "reward_std": 0.15772193670272827, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 763.1015625, "completions/mean_terminated_length": 763.1015625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5987880856874627, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08509697459648374, "kl": 0.111083984375, "learning_rate": 8.322211270363575e-06, "loss": 0.0244, "num_tokens": 841715729.0, "reward": 2.0908203125, "reward_std": 0.10108834505081177, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 712.564453125, "completions/mean_terminated_length": 712.564453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5991294700008535, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13340411597129884, "kl": 0.1182861328125, "learning_rate": 8.310467831668128e-06, "loss": 0.0226, "num_tokens": 842170594.0, "reward": 2.18017578125, "reward_std": 0.19096018373966217, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 817.205078125, "completions/mean_terminated_length": 817.205078125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5994708543142443, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08830363737294054, "kl": 0.1170654296875, "learning_rate": 8.29872679095457e-06, "loss": 0.0082, "num_tokens": 842675419.0, "reward": 2.08154296875, "reward_std": 0.12526145577430725, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 797.662109375, "completions/mean_terminated_length": 796.1995849609375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.5998122386276351, "frac_reward_zero_std": 0.59375, "grad_norm": 0.4610866069648255, "kl": 0.3275146484375, "learning_rate": 8.286988164887167e-06, "loss": 0.0215, "num_tokens": 843173710.0, "reward": 2.0771484375, "reward_std": 0.1570359468460083, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 747.337890625, "completions/mean_terminated_length": 747.337890625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6001536229410258, "frac_reward_zero_std": 0.5, "grad_norm": 0.1208452917239761, "kl": 0.11669921875, "learning_rate": 8.275251970126748e-06, "loss": 0.0153, "num_tokens": 843640107.0, "reward": 2.09375, "reward_std": 0.1711398959159851, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 735.408203125, "completions/mean_terminated_length": 735.408203125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6004950072544166, "frac_reward_zero_std": 0.625, "grad_norm": 0.09913884890486574, "kl": 0.107421875, "learning_rate": 8.263518223330698e-06, "loss": 0.0178, "num_tokens": 844091708.0, "reward": 2.1083984375, "reward_std": 0.15055307745933533, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 746.990234375, "completions/mean_terminated_length": 746.990234375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6008363915678074, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12673565761074893, "kl": 0.1107177734375, "learning_rate": 8.251786941152922e-06, "loss": 0.0195, "num_tokens": 844560231.0, "reward": 2.115234375, "reward_std": 0.20644697546958923, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 730.072265625, "completions/mean_terminated_length": 730.072265625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6011777758811983, "frac_reward_zero_std": 0.5, "grad_norm": 0.11272087870963732, "kl": 0.1103515625, "learning_rate": 8.240058140243834e-06, "loss": 0.0123, "num_tokens": 845015580.0, "reward": 2.1181640625, "reward_std": 0.18700222671031952, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 799.150390625, "completions/mean_terminated_length": 799.150390625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6015191601945891, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10266178107324443, "kl": 0.1085205078125, "learning_rate": 8.228331837250313e-06, "loss": 0.0167, "num_tokens": 845505369.0, "reward": 2.10888671875, "reward_std": 0.14902232587337494, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 683.953125, "completions/mean_terminated_length": 683.953125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.6018605445079799, "frac_reward_zero_std": 0.625, "grad_norm": 0.11164860213646749, "kl": 0.1134033203125, "learning_rate": 8.21660804881571e-06, "loss": 0.0137, "num_tokens": 845931857.0, "reward": 2.10693359375, "reward_std": 0.14974334836006165, "rewards/accuracy_reward/mean": 0.11895161122083664, "rewards/accuracy_reward/std": 0.3240583837032318, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1300.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 687.865234375, "completions/mean_terminated_length": 687.865234375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6022019288213707, "frac_reward_zero_std": 0.5, "grad_norm": 0.1198992314910159, "kl": 0.1171875, "learning_rate": 8.204886791579794e-06, "loss": -0.0037, "num_tokens": 846371404.0, "reward": 2.15234375, "reward_std": 0.20520812273025513, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 732.673828125, "completions/mean_terminated_length": 730.0997924804688, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6025433131347615, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12544186305454177, "kl": 0.1112060546875, "learning_rate": 8.193168082178746e-06, "loss": 0.0301, "num_tokens": 846829093.0, "reward": 2.126953125, "reward_std": 0.20278823375701904, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 753.337890625, "completions/mean_terminated_length": 753.337890625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6028846974481522, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11250063934307421, "kl": 0.11279296875, "learning_rate": 8.181451937245131e-06, "loss": 0.0319, "num_tokens": 847295106.0, "reward": 2.07763671875, "reward_std": 0.16190803050994873, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 738.763671875, "completions/mean_terminated_length": 738.763671875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.603226081761543, "frac_reward_zero_std": 0.5, "grad_norm": 0.12822708883412523, "kl": 0.1102294921875, "learning_rate": 8.169738373407878e-06, "loss": 0.0226, "num_tokens": 847757769.0, "reward": 2.08154296875, "reward_std": 0.15789592266082764, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 732.212890625, "completions/mean_terminated_length": 732.212890625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6035674660749338, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11658849220809443, "kl": 0.12109375, "learning_rate": 8.158027407292241e-06, "loss": 0.0242, "num_tokens": 848219302.0, "reward": 2.126953125, "reward_std": 0.1460181176662445, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 676.83203125, "completions/mean_terminated_length": 676.83203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6039088503883246, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10274897813828664, "kl": 0.1177978515625, "learning_rate": 8.146319055519798e-06, "loss": 0.0205, "num_tokens": 848643456.0, "reward": 2.14599609375, "reward_std": 0.11638574302196503, "rewards/accuracy_reward/mean": 0.16330644488334656, "rewards/accuracy_reward/std": 0.37001824378967285, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 750.62109375, "completions/mean_terminated_length": 750.62109375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6042502347017155, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11740698545825126, "kl": 0.1107177734375, "learning_rate": 8.134613334708412e-06, "loss": 0.0215, "num_tokens": 849106510.0, "reward": 2.05712890625, "reward_std": 0.13969388604164124, "rewards/accuracy_reward/mean": 0.08064515888690948, "rewards/accuracy_reward/std": 0.2725643217563629, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.050564687699079514, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 784.505859375, "completions/mean_terminated_length": 784.505859375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.6045916190151063, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08969195089306231, "kl": 0.1181640625, "learning_rate": 8.122910261472214e-06, "loss": 0.0056, "num_tokens": 849597473.0, "reward": 2.06201171875, "reward_std": 0.10555906593799591, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 769.95703125, "completions/mean_terminated_length": 769.95703125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.6049330033284971, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10904514848061798, "kl": 0.1158447265625, "learning_rate": 8.111209852421577e-06, "loss": -0.0007, "num_tokens": 850073515.0, "reward": 2.0888671875, "reward_std": 0.15928639471530914, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 789.7109375, "completions/mean_terminated_length": 789.7109375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6052743876418879, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11922955393619367, "kl": 0.113037109375, "learning_rate": 8.099512124163087e-06, "loss": 0.0161, "num_tokens": 850564231.0, "reward": 2.08935546875, "reward_std": 0.1912645846605301, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 844.3046875, "completions/mean_terminated_length": 841.9490966796875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6056157719552786, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10404042361391928, "kl": 0.1041259765625, "learning_rate": 8.087817093299538e-06, "loss": 0.019, "num_tokens": 851080499.0, "reward": 2.12060546875, "reward_std": 0.17954251170158386, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 725.833984375, "completions/mean_terminated_length": 725.833984375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6059571562686694, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11914915361478341, "kl": 0.11572265625, "learning_rate": 8.076124776429879e-06, "loss": 0.0063, "num_tokens": 851537854.0, "reward": 2.07421875, "reward_std": 0.14707332849502563, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 782.15625, "completions/mean_terminated_length": 782.15625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6062985405820602, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10925985855820844, "kl": 0.1103515625, "learning_rate": 8.064435190149218e-06, "loss": 0.0178, "num_tokens": 852021118.0, "reward": 2.07470703125, "reward_std": 0.1562289297580719, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 792.482421875, "completions/mean_terminated_length": 790.0254516601562, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.606639924895451, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11178666951459967, "kl": 0.1121826171875, "learning_rate": 8.052748351048785e-06, "loss": 0.0175, "num_tokens": 852504613.0, "reward": 2.09228515625, "reward_std": 0.14890944957733154, "rewards/accuracy_reward/mean": 0.10685484111309052, "rewards/accuracy_reward/std": 0.30924052000045776, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 751.673828125, "completions/mean_terminated_length": 751.673828125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6069813092088419, "frac_reward_zero_std": 0.5, "grad_norm": 0.11251647150053617, "kl": 0.1112060546875, "learning_rate": 8.041064275715909e-06, "loss": 0.0197, "num_tokens": 852971550.0, "reward": 2.1376953125, "reward_std": 0.1981278657913208, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 818.75, "completions/mean_terminated_length": 818.75, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.6073226935222327, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11419043708726176, "kl": 0.11181640625, "learning_rate": 8.029382980734e-06, "loss": 0.0103, "num_tokens": 853470846.0, "reward": 2.06884765625, "reward_std": 0.15891560912132263, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 900.443359375, "completions/mean_terminated_length": 900.443359375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6076640778356235, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09549636357247325, "kl": 0.104248046875, "learning_rate": 8.017704482682522e-06, "loss": 0.0188, "num_tokens": 854019825.0, "reward": 2.091796875, "reward_std": 0.17787672579288483, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 749.11328125, "completions/mean_terminated_length": 749.11328125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6080054621490143, "frac_reward_zero_std": 0.625, "grad_norm": 0.11430027984468388, "kl": 0.1220703125, "learning_rate": 8.006028798136962e-06, "loss": 0.0071, "num_tokens": 854487499.0, "reward": 2.08544921875, "reward_std": 0.1477096974849701, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 775.7421875, "completions/mean_terminated_length": 775.7421875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.608346846462405, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1019112212259972, "kl": 0.1187744140625, "learning_rate": 7.994355943668821e-06, "loss": 0.0121, "num_tokens": 854960887.0, "reward": 2.095703125, "reward_std": 0.16481684148311615, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 781.423828125, "completions/mean_terminated_length": 781.423828125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6086882307757958, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2044359523920352, "kl": 0.120849609375, "learning_rate": 7.98268593584558e-06, "loss": 0.0123, "num_tokens": 855444688.0, "reward": 2.06884765625, "reward_std": 0.12685802578926086, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 787.298828125, "completions/mean_terminated_length": 787.298828125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6090296150891866, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11676210620667331, "kl": 0.1212158203125, "learning_rate": 7.97101879123068e-06, "loss": 0.0277, "num_tokens": 855940425.0, "reward": 2.06591796875, "reward_std": 0.1413867175579071, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 793.197265625, "completions/mean_terminated_length": 792.135009765625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6093709994025774, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11988720988984117, "kl": 0.212646484375, "learning_rate": 7.959354526383502e-06, "loss": 0.0309, "num_tokens": 856429838.0, "reward": 2.06396484375, "reward_std": 0.15131616592407227, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 717.537109375, "completions/mean_terminated_length": 717.537109375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6097123837159683, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09654415015003837, "kl": 0.12255859375, "learning_rate": 7.947693157859338e-06, "loss": 0.0104, "num_tokens": 856881521.0, "reward": 2.03369140625, "reward_std": 0.11611723154783249, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 739.205078125, "completions/mean_terminated_length": 739.205078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6100537680293591, "frac_reward_zero_std": 0.625, "grad_norm": 0.10086344201091213, "kl": 0.1201171875, "learning_rate": 7.93603470220936e-06, "loss": 0.0172, "num_tokens": 857346122.0, "reward": 2.11572265625, "reward_std": 0.1356073021888733, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 784.818359375, "completions/mean_terminated_length": 784.818359375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6103951523427499, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12216219410967954, "kl": 0.110107421875, "learning_rate": 7.92437917598062e-06, "loss": 0.0314, "num_tokens": 857832077.0, "reward": 2.111328125, "reward_std": 0.19313117861747742, "rewards/accuracy_reward/mean": 0.12298387289047241, "rewards/accuracy_reward/std": 0.32875028252601624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 771.81640625, "completions/mean_terminated_length": 771.81640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.6107365366561407, "frac_reward_zero_std": 0.625, "grad_norm": 0.0973923021297237, "kl": 0.113037109375, "learning_rate": 7.912726595716e-06, "loss": 0.0016, "num_tokens": 858309055.0, "reward": 2.09912109375, "reward_std": 0.15064918994903564, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 757.990234375, "completions/mean_terminated_length": 757.990234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6110779209695314, "frac_reward_zero_std": 0.625, "grad_norm": 0.10417005714876888, "kl": 0.1158447265625, "learning_rate": 7.901076977954213e-06, "loss": 0.0353, "num_tokens": 858786810.0, "reward": 2.07421875, "reward_std": 0.16046275198459625, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 774.439453125, "completions/mean_terminated_length": 774.439453125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6114193052829222, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10961333475773968, "kl": 0.1162109375, "learning_rate": 7.889430339229755e-06, "loss": 0.0205, "num_tokens": 859265019.0, "reward": 2.06201171875, "reward_std": 0.13885700702667236, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 737.080078125, "completions/mean_terminated_length": 737.080078125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.611760689596313, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12499361115099049, "kl": 0.114501953125, "learning_rate": 7.877786696072907e-06, "loss": 0.0222, "num_tokens": 859716772.0, "reward": 2.0771484375, "reward_std": 0.1587916910648346, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 775.74609375, "completions/mean_terminated_length": 775.74609375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6121020739097038, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11542783457781464, "kl": 0.111083984375, "learning_rate": 7.86614606500968e-06, "loss": 0.0275, "num_tokens": 860199906.0, "reward": 2.03125, "reward_std": 0.16528168320655823, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 735.671875, "completions/mean_terminated_length": 735.671875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6124434582230946, "frac_reward_zero_std": 0.625, "grad_norm": 0.1135191588287713, "kl": 0.1103515625, "learning_rate": 7.854508462561829e-06, "loss": 0.0188, "num_tokens": 860662826.0, "reward": 2.0810546875, "reward_std": 0.13564833998680115, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 686.736328125, "completions/mean_terminated_length": 686.736328125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6127848425364855, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13557291423948278, "kl": 0.1163330078125, "learning_rate": 7.842873905246798e-06, "loss": 0.0219, "num_tokens": 861108659.0, "reward": 2.10888671875, "reward_std": 0.2298424243927002, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 762.36328125, "completions/mean_terminated_length": 762.36328125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6131262268498763, "frac_reward_zero_std": 0.5, "grad_norm": 0.12099528451216068, "kl": 0.1121826171875, "learning_rate": 7.831242409577717e-06, "loss": 0.016, "num_tokens": 861586749.0, "reward": 2.03125, "reward_std": 0.13211944699287415, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 715.16796875, "completions/mean_terminated_length": 715.16796875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6134676111632671, "frac_reward_zero_std": 0.5, "grad_norm": 0.12934820572017192, "kl": 0.11669921875, "learning_rate": 7.819613992063361e-06, "loss": 0.0228, "num_tokens": 862035763.0, "reward": 2.0498046875, "reward_std": 0.1588086187839508, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 794.5, "completions/mean_terminated_length": 794.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6138089954766578, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11197115988248946, "kl": 0.1112060546875, "learning_rate": 7.807988669208152e-06, "loss": 0.0291, "num_tokens": 862539011.0, "reward": 2.0380859375, "reward_std": 0.1644839197397232, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 727.509765625, "completions/mean_terminated_length": 727.509765625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6141503797900486, "frac_reward_zero_std": 0.625, "grad_norm": 0.10833220558015971, "kl": 0.11767578125, "learning_rate": 7.7963664575121e-06, "loss": 0.0208, "num_tokens": 862993880.0, "reward": 2.05517578125, "reward_std": 0.1424437314271927, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 719.5703125, "completions/mean_terminated_length": 719.5703125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6144917641034394, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12210755752738317, "kl": 0.1107177734375, "learning_rate": 7.784747373470806e-06, "loss": 0.0272, "num_tokens": 863448844.0, "reward": 2.10400390625, "reward_std": 0.1513235867023468, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 694.37890625, "completions/mean_terminated_length": 694.37890625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.6148331484168302, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12401696203924381, "kl": 0.115478515625, "learning_rate": 7.773131433575444e-06, "loss": 0.0159, "num_tokens": 863879246.0, "reward": 2.10009765625, "reward_std": 0.1937132626771927, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 657.94921875, "completions/mean_terminated_length": 657.94921875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.615174532730221, "frac_reward_zero_std": 0.5, "grad_norm": 0.1213024832898883, "kl": 0.1112060546875, "learning_rate": 7.761518654312712e-06, "loss": 0.0238, "num_tokens": 864295508.0, "reward": 2.146484375, "reward_std": 0.19504587352275848, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 722.181640625, "completions/mean_terminated_length": 722.181640625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6155159170436119, "frac_reward_zero_std": 0.5, "grad_norm": 0.11547567028441856, "kl": 0.1168212890625, "learning_rate": 7.749909052164825e-06, "loss": 0.0245, "num_tokens": 864741441.0, "reward": 2.04833984375, "reward_std": 0.17442584037780762, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 779.302734375, "completions/mean_terminated_length": 779.302734375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6158573013570027, "frac_reward_zero_std": 0.5, "grad_norm": 0.12163451033230231, "kl": 0.1097412109375, "learning_rate": 7.738302643609498e-06, "loss": 0.0151, "num_tokens": 865228764.0, "reward": 2.060546875, "reward_std": 0.19544458389282227, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 741.841796875, "completions/mean_terminated_length": 741.841796875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6161986856703935, "frac_reward_zero_std": 0.5, "grad_norm": 0.12426286145093025, "kl": 0.1107177734375, "learning_rate": 7.726699445119892e-06, "loss": 0.0206, "num_tokens": 865687627.0, "reward": 2.09228515625, "reward_std": 0.1726551055908203, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 738.26953125, "completions/mean_terminated_length": 738.26953125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6165400699837843, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09350424795083827, "kl": 0.110595703125, "learning_rate": 7.715099473164633e-06, "loss": 0.0148, "num_tokens": 866152757.0, "reward": 2.10791015625, "reward_std": 0.12918229401111603, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 739.19140625, "completions/mean_terminated_length": 739.19140625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.616881454297175, "frac_reward_zero_std": 0.375, "grad_norm": 0.14853034465892936, "kl": 0.120849609375, "learning_rate": 7.703502744207756e-06, "loss": 0.0212, "num_tokens": 866632167.0, "reward": 2.0703125, "reward_std": 0.21492625772953033, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 783.904296875, "completions/mean_terminated_length": 781.4305419921875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6172228386105658, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11905492733190563, "kl": 0.10888671875, "learning_rate": 7.691909274708698e-06, "loss": 0.0223, "num_tokens": 867121974.0, "reward": 2.06787109375, "reward_std": 0.18449586629867554, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 718.80859375, "completions/mean_terminated_length": 716.2074584960938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6175642229239566, "frac_reward_zero_std": 0.5, "grad_norm": 0.14840856976903263, "kl": 0.1102294921875, "learning_rate": 7.68031908112227e-06, "loss": 0.0238, "num_tokens": 867568004.0, "reward": 2.0322265625, "reward_std": 0.1527005136013031, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 725.30859375, "completions/mean_terminated_length": 725.30859375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.6179056072373474, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1253800923915706, "kl": 0.1123046875, "learning_rate": 7.66873217989863e-06, "loss": 0.02, "num_tokens": 868022834.0, "reward": 2.10888671875, "reward_std": 0.19830745458602905, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 722.875, "completions/mean_terminated_length": 722.875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6182469915507383, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12412454670133881, "kl": 0.109619140625, "learning_rate": 7.657148587483271e-06, "loss": 0.0067, "num_tokens": 868475538.0, "reward": 2.0849609375, "reward_std": 0.1951552927494049, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 693.583984375, "completions/mean_terminated_length": 693.583984375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6185883758641291, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10767262784937802, "kl": 0.11279296875, "learning_rate": 7.645568320316975e-06, "loss": 0.0178, "num_tokens": 868907581.0, "reward": 2.0986328125, "reward_std": 0.1337239146232605, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 707.111328125, "completions/mean_terminated_length": 707.111328125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6189297601775199, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10177315370404647, "kl": 0.1162109375, "learning_rate": 7.633991394835818e-06, "loss": 0.0222, "num_tokens": 869347942.0, "reward": 2.0498046875, "reward_std": 0.11953511089086533, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 770.62109375, "completions/mean_terminated_length": 770.62109375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.6192711444909107, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09937571933672605, "kl": 0.1068115234375, "learning_rate": 7.622417827471126e-06, "loss": 0.017, "num_tokens": 869824644.0, "reward": 2.04638671875, "reward_std": 0.11281375586986542, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 775.984375, "completions/mean_terminated_length": 773.4951171875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6196125288043014, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11709790379811309, "kl": 0.1068115234375, "learning_rate": 7.610847634649459e-06, "loss": 0.025, "num_tokens": 870306604.0, "reward": 2.10205078125, "reward_std": 0.2027372121810913, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 747.7734375, "completions/mean_terminated_length": 747.7734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6199539131176922, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10264603575810574, "kl": 0.1119384765625, "learning_rate": 7.599280832792596e-06, "loss": 0.0103, "num_tokens": 870769496.0, "reward": 2.0498046875, "reward_std": 0.13379411399364471, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 712.9921875, "completions/mean_terminated_length": 712.9921875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.620295297431083, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13579250622411182, "kl": 0.1197509765625, "learning_rate": 7.587717438317497e-06, "loss": 0.0134, "num_tokens": 871234740.0, "reward": 2.0283203125, "reward_std": 0.14487597346305847, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 688.94140625, "completions/mean_terminated_length": 688.94140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6206366817444738, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11517707407783338, "kl": 0.114990234375, "learning_rate": 7.576157467636278e-06, "loss": 0.0217, "num_tokens": 871677430.0, "reward": 2.1220703125, "reward_std": 0.15377512574195862, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 756.91796875, "completions/mean_terminated_length": 754.391357421875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6209780660578647, "frac_reward_zero_std": 0.625, "grad_norm": 0.10625096140224645, "kl": 0.1138916015625, "learning_rate": 7.564600937156207e-06, "loss": 0.0126, "num_tokens": 872152364.0, "reward": 2.04541015625, "reward_std": 0.14102429151535034, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 759.880859375, "completions/mean_terminated_length": 759.880859375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6213194503712555, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12406955257550784, "kl": 0.113525390625, "learning_rate": 7.553047863279664e-06, "loss": 0.0374, "num_tokens": 872623279.0, "reward": 2.12109375, "reward_std": 0.18645180761814117, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 754.072265625, "completions/mean_terminated_length": 752.74951171875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6216608346846463, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2201072730428638, "kl": 0.3199462890625, "learning_rate": 7.541498262404126e-06, "loss": 0.0209, "num_tokens": 873100852.0, "reward": 2.05908203125, "reward_std": 0.10935494303703308, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 752.431640625, "completions/mean_terminated_length": 752.431640625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6220022189980371, "frac_reward_zero_std": 0.5, "grad_norm": 0.1270947394174133, "kl": 0.1146240234375, "learning_rate": 7.529952150922136e-06, "loss": 0.0179, "num_tokens": 873572161.0, "reward": 2.06982421875, "reward_std": 0.17477965354919434, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 704.431640625, "completions/mean_terminated_length": 704.431640625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6223436033114278, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13935124624276699, "kl": 0.1187744140625, "learning_rate": 7.518409545221297e-06, "loss": 0.0204, "num_tokens": 874017934.0, "reward": 2.11328125, "reward_std": 0.19128042459487915, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 704.458984375, "completions/mean_terminated_length": 704.458984375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.6226849876248186, "frac_reward_zero_std": 0.5, "grad_norm": 0.1371485187691939, "kl": 0.1134033203125, "learning_rate": 7.506870461684215e-06, "loss": 0.0211, "num_tokens": 874459001.0, "reward": 2.0927734375, "reward_std": 0.16934379935264587, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 690.099609375, "completions/mean_terminated_length": 690.099609375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6230263719382094, "frac_reward_zero_std": 0.46875, "grad_norm": 0.15291438221037168, "kl": 0.1248779296875, "learning_rate": 7.4953349166885125e-06, "loss": 0.0266, "num_tokens": 874888812.0, "reward": 2.0732421875, "reward_std": 0.16498547792434692, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 738.404296875, "completions/mean_terminated_length": 738.404296875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.6233677562516002, "frac_reward_zero_std": 0.375, "grad_norm": 0.13278940559246163, "kl": 0.113525390625, "learning_rate": 7.4838029266067865e-06, "loss": 0.0048, "num_tokens": 875348955.0, "reward": 2.09375, "reward_std": 0.23697246611118317, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 743.61328125, "completions/mean_terminated_length": 743.61328125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.623709140564991, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1467473588804871, "kl": 0.1123046875, "learning_rate": 7.4722745078065875e-06, "loss": 0.032, "num_tokens": 875813829.0, "reward": 2.07373046875, "reward_std": 0.18468967080116272, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 738.630859375, "completions/mean_terminated_length": 738.630859375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6240505248783819, "frac_reward_zero_std": 0.65625, "grad_norm": 0.090101584424931, "kl": 0.11083984375, "learning_rate": 7.460749676650397e-06, "loss": 0.015, "num_tokens": 876279224.0, "reward": 2.1044921875, "reward_std": 0.10039908438920975, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 793.779296875, "completions/mean_terminated_length": 793.779296875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6243919091917727, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11585994774202546, "kl": 0.1134033203125, "learning_rate": 7.449228449495608e-06, "loss": 0.0159, "num_tokens": 876766887.0, "reward": 2.02734375, "reward_std": 0.1312352418899536, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 746.361328125, "completions/mean_terminated_length": 746.361328125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6247332935051635, "frac_reward_zero_std": 0.40625, "grad_norm": 0.139832522460302, "kl": 0.1146240234375, "learning_rate": 7.43771084269449e-06, "loss": 0.0327, "num_tokens": 877231392.0, "reward": 2.08642578125, "reward_std": 0.21478921175003052, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 728.001953125, "completions/mean_terminated_length": 728.001953125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6250746778185542, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11773418254232398, "kl": 0.1131591796875, "learning_rate": 7.426196872594182e-06, "loss": 0.0104, "num_tokens": 877692897.0, "reward": 2.18310546875, "reward_std": 0.21614456176757812, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 793.921875, "completions/mean_terminated_length": 793.921875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.625416062131945, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1081173063103935, "kl": 0.1114501953125, "learning_rate": 7.414686555536659e-06, "loss": 0.0041, "num_tokens": 878187993.0, "reward": 2.05712890625, "reward_std": 0.1529437005519867, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 739.4296875, "completions/mean_terminated_length": 739.4296875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6257574464453358, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12154832712047335, "kl": 0.1175537109375, "learning_rate": 7.403179907858708e-06, "loss": 0.0188, "num_tokens": 878651765.0, "reward": 2.10498046875, "reward_std": 0.14810703694820404, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 826.279296875, "completions/mean_terminated_length": 826.279296875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.6260988307587266, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08777947831905504, "kl": 0.1097412109375, "learning_rate": 7.391676945891914e-06, "loss": -0.0011, "num_tokens": 879158260.0, "reward": 2.03076171875, "reward_std": 0.10307788103818893, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 790.22265625, "completions/mean_terminated_length": 790.22265625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6264402150721174, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12409207934858053, "kl": 0.11572265625, "learning_rate": 7.380177685962629e-06, "loss": 0.0117, "num_tokens": 879645910.0, "reward": 2.06103515625, "reward_std": 0.18182456493377686, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 787.892578125, "completions/mean_terminated_length": 787.892578125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6267815993855083, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09942777854302086, "kl": 0.115966796875, "learning_rate": 7.368682144391944e-06, "loss": 0.015, "num_tokens": 880140575.0, "reward": 2.0693359375, "reward_std": 0.11312709748744965, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 775.197265625, "completions/mean_terminated_length": 775.197265625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6271229836988991, "frac_reward_zero_std": 0.5, "grad_norm": 0.12218461525716967, "kl": 0.113525390625, "learning_rate": 7.357190337495681e-06, "loss": 0.0106, "num_tokens": 880615716.0, "reward": 2.09033203125, "reward_std": 0.15205441415309906, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 795.7421875, "completions/mean_terminated_length": 795.7421875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6274643680122899, "frac_reward_zero_std": 0.625, "grad_norm": 0.09756384061653706, "kl": 0.1092529296875, "learning_rate": 7.345702281584357e-06, "loss": 0.0148, "num_tokens": 881103696.0, "reward": 2.0927734375, "reward_std": 0.13996046781539917, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 798.703125, "completions/mean_terminated_length": 798.703125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6278057523256806, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10877530970789206, "kl": 0.1085205078125, "learning_rate": 7.334217992963166e-06, "loss": 0.0187, "num_tokens": 881594856.0, "reward": 2.08447265625, "reward_std": 0.15407171845436096, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 792.072265625, "completions/mean_terminated_length": 792.072265625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6281471366390714, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08052140423868241, "kl": 0.1065673828125, "learning_rate": 7.322737487931957e-06, "loss": 0.0091, "num_tokens": 882075501.0, "reward": 2.0966796875, "reward_std": 0.11806757003068924, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 789.93359375, "completions/mean_terminated_length": 789.93359375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6284885209524622, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13299408982706196, "kl": 0.1168212890625, "learning_rate": 7.311260782785208e-06, "loss": 0.0161, "num_tokens": 882566683.0, "reward": 2.099609375, "reward_std": 0.21740272641181946, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 813.046875, "completions/mean_terminated_length": 813.046875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.628829905265853, "frac_reward_zero_std": 0.5, "grad_norm": 0.11695204979680662, "kl": 0.108154296875, "learning_rate": 7.299787893811998e-06, "loss": 0.0194, "num_tokens": 883065443.0, "reward": 2.06396484375, "reward_std": 0.17839419841766357, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 788.498046875, "completions/mean_terminated_length": 788.498046875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6291712895792438, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11546464648117089, "kl": 0.109130859375, "learning_rate": 7.288318837295997e-06, "loss": 0.0342, "num_tokens": 883542514.0, "reward": 2.1259765625, "reward_std": 0.18808680772781372, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 760.421875, "completions/mean_terminated_length": 760.421875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6295126738926347, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11548845101125063, "kl": 0.1143798828125, "learning_rate": 7.276853629515434e-06, "loss": 0.0119, "num_tokens": 884013466.0, "reward": 2.08251953125, "reward_std": 0.14010325074195862, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 738.38671875, "completions/mean_terminated_length": 738.38671875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6298540582060255, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11024237581678883, "kl": 0.10986328125, "learning_rate": 7.2653922867430725e-06, "loss": 0.0286, "num_tokens": 884469072.0, "reward": 2.09033203125, "reward_std": 0.13615520298480988, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 777.015625, "completions/mean_terminated_length": 777.015625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6301954425194163, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11645330634314052, "kl": 0.1102294921875, "learning_rate": 7.253934825246194e-06, "loss": 0.0224, "num_tokens": 884953496.0, "reward": 2.099609375, "reward_std": 0.19707424938678741, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 767.666015625, "completions/mean_terminated_length": 767.666015625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.630536826832807, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1045890817412583, "kl": 0.109130859375, "learning_rate": 7.242481261286569e-06, "loss": 0.014, "num_tokens": 885424061.0, "reward": 2.09521484375, "reward_std": 0.1730721890926361, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 722.896484375, "completions/mean_terminated_length": 722.896484375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6308782111461978, "frac_reward_zero_std": 0.4375, "grad_norm": 0.134101987755741, "kl": 0.116943359375, "learning_rate": 7.231031611120438e-06, "loss": 0.0253, "num_tokens": 885873256.0, "reward": 2.037109375, "reward_std": 0.18470311164855957, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 748.2265625, "completions/mean_terminated_length": 748.2265625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6312195954595886, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11089886115592343, "kl": 0.1138916015625, "learning_rate": 7.219585890998481e-06, "loss": 0.035, "num_tokens": 886341084.0, "reward": 2.03369140625, "reward_std": 0.1583920419216156, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 716.533203125, "completions/mean_terminated_length": 716.533203125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6315609797729794, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11747415900211204, "kl": 0.1168212890625, "learning_rate": 7.2081441171658096e-06, "loss": 0.007, "num_tokens": 886809805.0, "reward": 2.0830078125, "reward_std": 0.17258861660957336, "rewards/accuracy_reward/mean": 0.0947580635547638, "rewards/accuracy_reward/std": 0.29317617416381836, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 725.5234375, "completions/mean_terminated_length": 725.5234375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6319023640863702, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1503740022548012, "kl": 0.113037109375, "learning_rate": 7.196706305861925e-06, "loss": 0.0258, "num_tokens": 887273625.0, "reward": 2.1630859375, "reward_std": 0.22948221862316132, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 796.123046875, "completions/mean_terminated_length": 796.123046875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.632243748399761, "frac_reward_zero_std": 0.5, "grad_norm": 0.1055115767020403, "kl": 0.1053466796875, "learning_rate": 7.185272473320709e-06, "loss": 0.0327, "num_tokens": 887762632.0, "reward": 2.07666015625, "reward_std": 0.18790292739868164, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 755.91015625, "completions/mean_terminated_length": 755.91015625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6325851327131519, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10283955709365217, "kl": 0.10791015625, "learning_rate": 7.173842635770398e-06, "loss": 0.0148, "num_tokens": 888245306.0, "reward": 2.0205078125, "reward_std": 0.12582767009735107, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 824.884765625, "completions/mean_terminated_length": 824.884765625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6329265170265427, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09994512249122461, "kl": 0.1036376953125, "learning_rate": 7.162416809433554e-06, "loss": 0.012, "num_tokens": 888747663.0, "reward": 2.0947265625, "reward_std": 0.17097747325897217, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 727.232421875, "completions/mean_terminated_length": 727.232421875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6332679013399334, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09871471269894755, "kl": 0.1121826171875, "learning_rate": 7.150995010527049e-06, "loss": 0.0123, "num_tokens": 889202854.0, "reward": 2.06005859375, "reward_std": 0.10318300127983093, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 736.55859375, "completions/mean_terminated_length": 736.55859375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6336092856533242, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1112212373749632, "kl": 0.106201171875, "learning_rate": 7.1395772552620334e-06, "loss": 0.0152, "num_tokens": 889660244.0, "reward": 2.0947265625, "reward_std": 0.1463322937488556, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 733.673828125, "completions/mean_terminated_length": 732.3972778320312, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.633950669966715, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4822778329996438, "kl": 0.1619873046875, "learning_rate": 7.128163559843928e-06, "loss": 0.0201, "num_tokens": 890122541.0, "reward": 2.0322265625, "reward_std": 0.17077302932739258, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.06760437041521072, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 720.359375, "completions/mean_terminated_length": 720.359375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6342920542801058, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11737998182769518, "kl": 0.119873046875, "learning_rate": 7.116753940472381e-06, "loss": 0.0167, "num_tokens": 890597733.0, "reward": 2.06298828125, "reward_std": 0.18473242223262787, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 692.767578125, "completions/mean_terminated_length": 692.767578125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6346334385934966, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1151821493284125, "kl": 0.11328125, "learning_rate": 7.105348413341264e-06, "loss": 0.0077, "num_tokens": 891033422.0, "reward": 2.08642578125, "reward_std": 0.17273347079753876, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 748.259765625, "completions/mean_terminated_length": 748.259765625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6349748229068874, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09500354730462601, "kl": 0.1051025390625, "learning_rate": 7.093946994638638e-06, "loss": 0.0234, "num_tokens": 891503539.0, "reward": 2.0712890625, "reward_std": 0.14338994026184082, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 768.02734375, "completions/mean_terminated_length": 768.02734375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6353162072202783, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10112317320355677, "kl": 0.10546875, "learning_rate": 7.082549700546726e-06, "loss": 0.0091, "num_tokens": 891990257.0, "reward": 2.06982421875, "reward_std": 0.11766356229782104, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 814.408203125, "completions/mean_terminated_length": 814.408203125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.6356575915336691, "frac_reward_zero_std": 0.5, "grad_norm": 0.11145871642943866, "kl": 0.1060791015625, "learning_rate": 7.071156547241904e-06, "loss": 0.0284, "num_tokens": 892487986.0, "reward": 2.06689453125, "reward_std": 0.16502851247787476, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 715.91796875, "completions/mean_terminated_length": 715.91796875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6359989758470598, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1341777556326104, "kl": 0.116455078125, "learning_rate": 7.059767550894672e-06, "loss": 0.0219, "num_tokens": 892937160.0, "reward": 2.171875, "reward_std": 0.22849588096141815, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 749.3203125, "completions/mean_terminated_length": 749.3203125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6363403601604506, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1262343627442371, "kl": 0.10888671875, "learning_rate": 7.048382727669625e-06, "loss": 0.0186, "num_tokens": 893403756.0, "reward": 2.146484375, "reward_std": 0.2205982506275177, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 750.31640625, "completions/mean_terminated_length": 750.31640625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6366817444738414, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12699564399127894, "kl": 0.1168212890625, "learning_rate": 7.0370020937254376e-06, "loss": 0.0085, "num_tokens": 893884926.0, "reward": 2.13330078125, "reward_std": 0.17491000890731812, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.04790584370493889, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 717.517578125, "completions/mean_terminated_length": 717.517578125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6370231287872322, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1237976215160175, "kl": 0.112548828125, "learning_rate": 7.0256256652148435e-06, "loss": 0.006, "num_tokens": 894332631.0, "reward": 2.10791015625, "reward_std": 0.1779700517654419, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 712.57421875, "completions/mean_terminated_length": 712.57421875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.637364513100623, "frac_reward_zero_std": 0.5, "grad_norm": 0.12794897764123134, "kl": 0.1131591796875, "learning_rate": 7.01425345828459e-06, "loss": 0.0241, "num_tokens": 894779117.0, "reward": 2.17919921875, "reward_std": 0.1993003636598587, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 739.34765625, "completions/mean_terminated_length": 739.34765625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6377058974140138, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10493277825765941, "kl": 0.1070556640625, "learning_rate": 7.002885489075454e-06, "loss": 0.0108, "num_tokens": 895237807.0, "reward": 2.1982421875, "reward_std": 0.1419886350631714, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 754.27734375, "completions/mean_terminated_length": 753.4755249023438, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.6380472817274047, "frac_reward_zero_std": 0.28125, "grad_norm": 0.1955659641477732, "kl": 0.1575927734375, "learning_rate": 6.991521773722186e-06, "loss": 0.017, "num_tokens": 895707693.0, "reward": 2.1103515625, "reward_std": 0.24038590490818024, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.05597515031695366, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 748.755859375, "completions/mean_terminated_length": 748.755859375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6383886660407955, "frac_reward_zero_std": 0.625, "grad_norm": 0.11036419216628465, "kl": 0.12060546875, "learning_rate": 6.9801623283535035e-06, "loss": 0.019, "num_tokens": 896192224.0, "reward": 2.078125, "reward_std": 0.1270020306110382, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 771.3515625, "completions/mean_terminated_length": 771.3515625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6387300503541862, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11552736632684665, "kl": 0.106201171875, "learning_rate": 6.968807169092059e-06, "loss": 0.0129, "num_tokens": 896673748.0, "reward": 2.177734375, "reward_std": 0.1866581290960312, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 763.18359375, "completions/mean_terminated_length": 763.18359375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.639071434667577, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10447884094357922, "kl": 0.107421875, "learning_rate": 6.957456312054429e-06, "loss": 0.0288, "num_tokens": 897144754.0, "reward": 2.0888671875, "reward_std": 0.11888891458511353, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 782.1484375, "completions/mean_terminated_length": 782.1484375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6394128189809678, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1324742902128546, "kl": 0.112548828125, "learning_rate": 6.946109773351073e-06, "loss": 0.0134, "num_tokens": 897621022.0, "reward": 2.1123046875, "reward_std": 0.16201981902122498, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 735.70703125, "completions/mean_terminated_length": 735.70703125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6397542032943586, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10752651420031235, "kl": 0.10595703125, "learning_rate": 6.934767569086325e-06, "loss": 0.0245, "num_tokens": 898102792.0, "reward": 2.041015625, "reward_std": 0.10137581825256348, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 761.21875, "completions/mean_terminated_length": 761.21875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6400955876077494, "frac_reward_zero_std": 0.5, "grad_norm": 0.12333462482608148, "kl": 0.11083984375, "learning_rate": 6.923429715358377e-06, "loss": 0.0162, "num_tokens": 898584168.0, "reward": 2.09716796875, "reward_std": 0.20525075495243073, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 716.9765625, "completions/mean_terminated_length": 716.9765625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6404369719211402, "frac_reward_zero_std": 0.5, "grad_norm": 0.13158545761735108, "kl": 0.1163330078125, "learning_rate": 6.912096228259237e-06, "loss": 0.0085, "num_tokens": 899027420.0, "reward": 2.11083984375, "reward_std": 0.18843549489974976, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 772.859375, "completions/mean_terminated_length": 770.364013671875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.640778356234531, "frac_reward_zero_std": 0.5, "grad_norm": 0.12091524332903889, "kl": 0.11279296875, "learning_rate": 6.900767123874714e-06, "loss": 0.0259, "num_tokens": 899505188.0, "reward": 2.076171875, "reward_std": 0.18987596035003662, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 713.0390625, "completions/mean_terminated_length": 713.0390625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.6411197405479219, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13389639888254964, "kl": 0.1092529296875, "learning_rate": 6.889442418284402e-06, "loss": 0.0153, "num_tokens": 899952456.0, "reward": 2.0185546875, "reward_std": 0.1252184510231018, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 757.5546875, "completions/mean_terminated_length": 757.5546875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6414611248613126, "frac_reward_zero_std": 0.5, "grad_norm": 0.13660049195516868, "kl": 0.1181640625, "learning_rate": 6.878122127561653e-06, "loss": 0.0311, "num_tokens": 900413956.0, "reward": 2.0869140625, "reward_std": 0.18045789003372192, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 735.01953125, "completions/mean_terminated_length": 735.01953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6418025091747034, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11627859326637391, "kl": 0.1097412109375, "learning_rate": 6.866806267773539e-06, "loss": 0.0148, "num_tokens": 900881342.0, "reward": 2.15234375, "reward_std": 0.173303484916687, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 738.970703125, "completions/mean_terminated_length": 738.970703125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6421438934880942, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08231364043701539, "kl": 0.1141357421875, "learning_rate": 6.855494854980857e-06, "loss": 0.0134, "num_tokens": 901342223.0, "reward": 2.03076171875, "reward_std": 0.08003926277160645, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 768.29296875, "completions/mean_terminated_length": 768.29296875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.642485277801485, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10703016822304506, "kl": 0.109619140625, "learning_rate": 6.8441879052380935e-06, "loss": 0.0104, "num_tokens": 901822245.0, "reward": 2.048828125, "reward_std": 0.1553611159324646, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 742.34765625, "completions/mean_terminated_length": 742.34765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6428266621148758, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1237014522201562, "kl": 0.1058349609375, "learning_rate": 6.832885434593393e-06, "loss": 0.015, "num_tokens": 902289351.0, "reward": 2.08740234375, "reward_std": 0.19479620456695557, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 774.416015625, "completions/mean_terminated_length": 774.416015625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6431680464282666, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08556288249180324, "kl": 0.0994873046875, "learning_rate": 6.821587459088543e-06, "loss": 0.0123, "num_tokens": 902769356.0, "reward": 2.09912109375, "reward_std": 0.12341275066137314, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 819.67578125, "completions/mean_terminated_length": 819.67578125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.6435094307416575, "frac_reward_zero_std": 0.625, "grad_norm": 0.09197594735025183, "kl": 0.093994140625, "learning_rate": 6.810293994758962e-06, "loss": 0.0182, "num_tokens": 903276374.0, "reward": 2.0380859375, "reward_std": 0.11091610789299011, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 768.02734375, "completions/mean_terminated_length": 768.02734375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6438508150550483, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10854897139549226, "kl": 0.10693359375, "learning_rate": 6.7990050576336455e-06, "loss": 0.0188, "num_tokens": 903751508.0, "reward": 2.1240234375, "reward_std": 0.15097174048423767, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 743.326171875, "completions/mean_terminated_length": 743.326171875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.644192199368439, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1035362502643167, "kl": 0.10693359375, "learning_rate": 6.7877206637351775e-06, "loss": 0.0014, "num_tokens": 904221915.0, "reward": 2.08984375, "reward_std": 0.13489387929439545, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 751.83984375, "completions/mean_terminated_length": 751.83984375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6445335836818298, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12116585212644833, "kl": 0.10498046875, "learning_rate": 6.776440829079694e-06, "loss": 0.0169, "num_tokens": 904686313.0, "reward": 2.03515625, "reward_std": 0.1532997339963913, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 707.73828125, "completions/mean_terminated_length": 707.73828125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6448749679952206, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09989402009535851, "kl": 0.11083984375, "learning_rate": 6.765165569676857e-06, "loss": -0.0027, "num_tokens": 905132019.0, "reward": 2.04052734375, "reward_std": 0.11455029249191284, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 715.904296875, "completions/mean_terminated_length": 715.904296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6452163523086114, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10882784069538534, "kl": 0.1177978515625, "learning_rate": 6.753894901529834e-06, "loss": 0.0189, "num_tokens": 905574978.0, "reward": 2.06103515625, "reward_std": 0.1355074942111969, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 750.5234375, "completions/mean_terminated_length": 750.5234375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.6455577366220022, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11568502273756892, "kl": 0.1041259765625, "learning_rate": 6.742628840635283e-06, "loss": 0.017, "num_tokens": 906042702.0, "reward": 2.1005859375, "reward_std": 0.1952502727508545, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 804.84375, "completions/mean_terminated_length": 804.84375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.645899120935393, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10915731049510304, "kl": 0.1025390625, "learning_rate": 6.731367402983307e-06, "loss": 0.0077, "num_tokens": 906538798.0, "reward": 2.04443359375, "reward_std": 0.14936819672584534, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24230584502220154, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 752.939453125, "completions/mean_terminated_length": 752.939453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6462405052487838, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10777732563901612, "kl": 0.1083984375, "learning_rate": 6.720110604557461e-06, "loss": 0.0037, "num_tokens": 907013183.0, "reward": 2.11572265625, "reward_std": 0.1548643410205841, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 769.15234375, "completions/mean_terminated_length": 769.15234375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6465818895621747, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12609981496101494, "kl": 0.11328125, "learning_rate": 6.708858461334713e-06, "loss": 0.0047, "num_tokens": 907485901.0, "reward": 2.048828125, "reward_std": 0.16610682010650635, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 809.21875, "completions/mean_terminated_length": 809.21875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6469232738755654, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10488537871950405, "kl": 0.10595703125, "learning_rate": 6.697610989285419e-06, "loss": 0.0042, "num_tokens": 907986685.0, "reward": 2.119140625, "reward_std": 0.17809134721755981, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 743.234375, "completions/mean_terminated_length": 743.234375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6472646581889562, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11100948315945218, "kl": 0.10986328125, "learning_rate": 6.686368204373311e-06, "loss": 0.014, "num_tokens": 908467365.0, "reward": 2.1171875, "reward_std": 0.18296507000923157, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 739.7890625, "completions/mean_terminated_length": 739.7890625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.647606042502347, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1298478178916058, "kl": 0.10791015625, "learning_rate": 6.675130122555469e-06, "loss": 0.0199, "num_tokens": 908929593.0, "reward": 2.103515625, "reward_std": 0.22521573305130005, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 721.90234375, "completions/mean_terminated_length": 721.90234375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6479474268157378, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08377206101405893, "kl": 0.1142578125, "learning_rate": 6.663896759782289e-06, "loss": 0.0216, "num_tokens": 909379943.0, "reward": 2.02685546875, "reward_std": 0.09388723969459534, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 754.9609375, "completions/mean_terminated_length": 754.9609375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6482888111291286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10318965695303803, "kl": 0.11474609375, "learning_rate": 6.652668131997475e-06, "loss": 0.011, "num_tokens": 909848803.0, "reward": 2.07177734375, "reward_std": 0.10994528979063034, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.045533329248428345, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 723.71484375, "completions/mean_terminated_length": 721.123291015625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6486301954425194, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12655694042338375, "kl": 0.1162109375, "learning_rate": 6.641444255138013e-06, "loss": 0.0114, "num_tokens": 910305489.0, "reward": 2.05615234375, "reward_std": 0.21592871844768524, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 733.98828125, "completions/mean_terminated_length": 733.98828125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6489715797559102, "frac_reward_zero_std": 0.625, "grad_norm": 0.10322278363948188, "kl": 0.1099853515625, "learning_rate": 6.6302251451341435e-06, "loss": 0.0111, "num_tokens": 910762779.0, "reward": 2.06982421875, "reward_std": 0.131458580493927, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 810.7421875, "completions/mean_terminated_length": 808.3209228515625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6493129640693011, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12195824446210489, "kl": 0.10986328125, "learning_rate": 6.619010817909339e-06, "loss": 0.016, "num_tokens": 911260919.0, "reward": 2.08203125, "reward_std": 0.1935141384601593, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 790.513671875, "completions/mean_terminated_length": 788.8434448242188, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.6496543483826919, "frac_reward_zero_std": 0.5625, "grad_norm": 1.1881271755988667, "kl": 0.54248046875, "learning_rate": 6.6078012893802886e-06, "loss": 0.0382, "num_tokens": 911750878.0, "reward": 2.08837890625, "reward_std": 0.17726829648017883, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 789.83984375, "completions/mean_terminated_length": 789.83984375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6499957326960826, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0890726345517265, "kl": 0.1107177734375, "learning_rate": 6.596596575456866e-06, "loss": 0.0072, "num_tokens": 912240236.0, "reward": 2.0859375, "reward_std": 0.1167372614145279, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 804.416015625, "completions/mean_terminated_length": 804.416015625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6503371170094734, "frac_reward_zero_std": 0.75, "grad_norm": 0.08291838527726643, "kl": 0.10546875, "learning_rate": 6.585396692042113e-06, "loss": 0.0153, "num_tokens": 912736097.0, "reward": 2.01123046875, "reward_std": 0.08344895392656326, "rewards/accuracy_reward/mean": 0.025390625, "rewards/accuracy_reward/std": 0.15746226906776428, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 819.486328125, "completions/mean_terminated_length": 817.0822143554688, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6506785013228642, "frac_reward_zero_std": 0.375, "grad_norm": 0.1342577762619219, "kl": 0.109375, "learning_rate": 6.574201655032216e-06, "loss": 0.0248, "num_tokens": 913239098.0, "reward": 2.109375, "reward_std": 0.2357736974954605, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 799.16796875, "completions/mean_terminated_length": 799.16796875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.651019885636255, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10806983382463835, "kl": 0.113037109375, "learning_rate": 6.5630114803164835e-06, "loss": 0.0261, "num_tokens": 913732208.0, "reward": 2.0732421875, "reward_std": 0.1756751537322998, "rewards/accuracy_reward/mean": 0.1041666641831398, "rewards/accuracy_reward/std": 0.3057953417301178, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 778.818359375, "completions/mean_terminated_length": 778.818359375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6513612699496458, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10529594974094489, "kl": 0.11083984375, "learning_rate": 6.551826183777319e-06, "loss": 0.0218, "num_tokens": 914208611.0, "reward": 2.07373046875, "reward_std": 0.17001958191394806, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 822.48828125, "completions/mean_terminated_length": 822.48828125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6517026542630366, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10495406304147742, "kl": 0.101318359375, "learning_rate": 6.54064578129021e-06, "loss": 0.0348, "num_tokens": 914722813.0, "reward": 2.04443359375, "reward_std": 0.1393882930278778, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 789.505859375, "completions/mean_terminated_length": 789.505859375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6520440385764275, "frac_reward_zero_std": 0.5, "grad_norm": 0.11186689243304716, "kl": 0.103759765625, "learning_rate": 6.5294702887236886e-06, "loss": 0.0172, "num_tokens": 915211744.0, "reward": 2.07421875, "reward_std": 0.1597701907157898, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 772.572265625, "completions/mean_terminated_length": 772.572265625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6523854228898183, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1018495331749366, "kl": 0.1107177734375, "learning_rate": 6.518299721939323e-06, "loss": 0.0125, "num_tokens": 915688885.0, "reward": 2.033203125, "reward_std": 0.14271622896194458, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 758.892578125, "completions/mean_terminated_length": 758.892578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.652726807203209, "frac_reward_zero_std": 0.625, "grad_norm": 0.10603497264805886, "kl": 0.105224609375, "learning_rate": 6.5071340967916894e-06, "loss": 0.0077, "num_tokens": 916156078.0, "reward": 2.14501953125, "reward_std": 0.14079482853412628, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 785.23046875, "completions/mean_terminated_length": 785.23046875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6530681915165998, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1105152712485825, "kl": 0.106201171875, "learning_rate": 6.495973429128353e-06, "loss": 0.0197, "num_tokens": 916638692.0, "reward": 2.03564453125, "reward_std": 0.14181217551231384, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04269581660628319, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 742.501953125, "completions/mean_terminated_length": 741.7123413085938, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6534095758299906, "frac_reward_zero_std": 0.625, "grad_norm": 0.5468740494860957, "kl": 0.1298828125, "learning_rate": 6.484817734789839e-06, "loss": 0.0184, "num_tokens": 917100021.0, "reward": 2.07080078125, "reward_std": 0.12855255603790283, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 748.85546875, "completions/mean_terminated_length": 748.85546875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6537509601433814, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09704350852310249, "kl": 0.111572265625, "learning_rate": 6.473667029609614e-06, "loss": 0.0086, "num_tokens": 917564267.0, "reward": 2.05126953125, "reward_std": 0.0914858877658844, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 691.2265625, "completions/mean_terminated_length": 691.2265625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6540923444567722, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09797228568868388, "kl": 0.1097412109375, "learning_rate": 6.462521329414066e-06, "loss": 0.0092, "num_tokens": 918002671.0, "reward": 2.0673828125, "reward_std": 0.12022743374109268, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 817.21484375, "completions/mean_terminated_length": 816.4912109375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.654433728770163, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26516672892158427, "kl": 0.2789306640625, "learning_rate": 6.451380650022477e-06, "loss": 0.0267, "num_tokens": 918503181.0, "reward": 2.08203125, "reward_std": 0.1743844449520111, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 767.421875, "completions/mean_terminated_length": 767.421875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6547751130835539, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09534285234043377, "kl": 0.102783203125, "learning_rate": 6.440245007247003e-06, "loss": -0.0004, "num_tokens": 918979605.0, "reward": 2.10205078125, "reward_std": 0.1121005192399025, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 748.318359375, "completions/mean_terminated_length": 748.318359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6551164973969447, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11598228946891873, "kl": 0.112548828125, "learning_rate": 6.429114416892655e-06, "loss": 0.0153, "num_tokens": 919447928.0, "reward": 2.10986328125, "reward_std": 0.1866188794374466, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 731.033203125, "completions/mean_terminated_length": 731.033203125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6554578817103354, "frac_reward_zero_std": 0.5, "grad_norm": 0.11666005562856918, "kl": 0.1065673828125, "learning_rate": 6.417988894757267e-06, "loss": 0.0235, "num_tokens": 919903321.0, "reward": 2.09716796875, "reward_std": 0.18303075432777405, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 756.451171875, "completions/mean_terminated_length": 756.451171875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6557992660237262, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10382502678853774, "kl": 0.106201171875, "learning_rate": 6.406868456631483e-06, "loss": 0.0267, "num_tokens": 920372768.0, "reward": 2.046875, "reward_std": 0.1219031810760498, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 822.654296875, "completions/mean_terminated_length": 822.654296875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.656140650337117, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08905838318343787, "kl": 0.1025390625, "learning_rate": 6.395753118298735e-06, "loss": 0.0121, "num_tokens": 920880367.0, "reward": 2.0751953125, "reward_std": 0.1314471960067749, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 821.8515625, "completions/mean_terminated_length": 821.8515625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.6564820346505078, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0987915150569123, "kl": 0.101806640625, "learning_rate": 6.384642895535209e-06, "loss": 0.022, "num_tokens": 921398643.0, "reward": 2.0078125, "reward_std": 0.08957062661647797, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 725.3125, "completions/mean_terminated_length": 725.3125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6568234189638986, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13724186345425296, "kl": 0.1075439453125, "learning_rate": 6.373537804109834e-06, "loss": 0.0291, "num_tokens": 921846323.0, "reward": 2.09521484375, "reward_std": 0.22282516956329346, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.32166779041290283, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 779.26171875, "completions/mean_terminated_length": 779.26171875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6571648032772894, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09940879689255935, "kl": 0.1029052734375, "learning_rate": 6.3624378597842564e-06, "loss": 0.0076, "num_tokens": 922326313.0, "reward": 2.0849609375, "reward_std": 0.153800830245018, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 725.3671875, "completions/mean_terminated_length": 725.3671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6575061875906802, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09732146137007397, "kl": 0.1048583984375, "learning_rate": 6.35134307831282e-06, "loss": 0.0062, "num_tokens": 922779221.0, "reward": 2.03662109375, "reward_std": 0.10867195576429367, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 780.150390625, "completions/mean_terminated_length": 777.6692504882812, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6578475719040711, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08804244986286415, "kl": 0.0972900390625, "learning_rate": 6.340253475442532e-06, "loss": 0.016, "num_tokens": 923258594.0, "reward": 2.11474609375, "reward_std": 0.12574678659439087, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 761.216796875, "completions/mean_terminated_length": 761.216796875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.6581889562174618, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13811833467803755, "kl": 0.1016845703125, "learning_rate": 6.329169066913063e-06, "loss": 0.0174, "num_tokens": 923742833.0, "reward": 2.0732421875, "reward_std": 0.19192902743816376, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 742.115234375, "completions/mean_terminated_length": 742.115234375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6585303405308526, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1144155142264883, "kl": 0.106201171875, "learning_rate": 6.318089868456696e-06, "loss": 0.0159, "num_tokens": 924211916.0, "reward": 2.1337890625, "reward_std": 0.18463414907455444, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 745.126953125, "completions/mean_terminated_length": 745.126953125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6588717248442434, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10512284935647116, "kl": 0.10205078125, "learning_rate": 6.307015895798332e-06, "loss": 0.0075, "num_tokens": 924677725.0, "reward": 2.0439453125, "reward_std": 0.12081440538167953, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 832.990234375, "completions/mean_terminated_length": 832.990234375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6592131091576342, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10597389927273715, "kl": 0.1029052734375, "learning_rate": 6.295947164655447e-06, "loss": 0.0101, "num_tokens": 925189336.0, "reward": 2.09326171875, "reward_std": 0.15079665184020996, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 702.7578125, "completions/mean_terminated_length": 702.7578125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.659554493471025, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12090184035137186, "kl": 0.1011962890625, "learning_rate": 6.2848836907380805e-06, "loss": 0.0109, "num_tokens": 925627916.0, "reward": 2.09375, "reward_std": 0.17701753973960876, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.06609638035297394, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 736.34765625, "completions/mean_terminated_length": 736.34765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6598958777844158, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09991633249343379, "kl": 0.1046142578125, "learning_rate": 6.2738254897488114e-06, "loss": 0.0115, "num_tokens": 926086190.0, "reward": 2.03173828125, "reward_std": 0.11204035580158234, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 759.142578125, "completions/mean_terminated_length": 759.142578125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.6602372620978066, "frac_reward_zero_std": 0.46875, "grad_norm": 0.127852294248724, "kl": 0.1077880859375, "learning_rate": 6.262772577382736e-06, "loss": 0.0238, "num_tokens": 926555607.0, "reward": 2.078125, "reward_std": 0.18003851175308228, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 745.623046875, "completions/mean_terminated_length": 745.623046875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6605786464111975, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10339688683144788, "kl": 0.10498046875, "learning_rate": 6.251724969327435e-06, "loss": 0.0165, "num_tokens": 927019638.0, "reward": 2.0419921875, "reward_std": 0.14049693942070007, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 763.23046875, "completions/mean_terminated_length": 763.23046875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6609200307245882, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12052456486725667, "kl": 0.096435546875, "learning_rate": 6.240682681262972e-06, "loss": 0.0038, "num_tokens": 927492988.0, "reward": 2.10400390625, "reward_std": 0.1976318359375, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 779.9609375, "completions/mean_terminated_length": 779.9609375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.661261415037979, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10309717645208706, "kl": 0.1048583984375, "learning_rate": 6.229645728861854e-06, "loss": 0.0078, "num_tokens": 927974216.0, "reward": 2.03662109375, "reward_std": 0.1455770879983902, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 774.54296875, "completions/mean_terminated_length": 774.54296875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6616027993513698, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12986078409921564, "kl": 0.1031494140625, "learning_rate": 6.218614127789018e-06, "loss": 0.0137, "num_tokens": 928460046.0, "reward": 2.10205078125, "reward_std": 0.21810871362686157, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 806.978515625, "completions/mean_terminated_length": 806.978515625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.6619441836647606, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11961868903741438, "kl": 0.1019287109375, "learning_rate": 6.207587893701801e-06, "loss": 0.0267, "num_tokens": 928952851.0, "reward": 2.087890625, "reward_std": 0.18828606605529785, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 754.37109375, "completions/mean_terminated_length": 754.37109375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6622855679781514, "frac_reward_zero_std": 0.625, "grad_norm": 0.10047610754551324, "kl": 0.1092529296875, "learning_rate": 6.19656704224993e-06, "loss": 0.0144, "num_tokens": 929414497.0, "reward": 2.06982421875, "reward_std": 0.142716646194458, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 803.888671875, "completions/mean_terminated_length": 803.888671875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.6626269522915422, "frac_reward_zero_std": 0.5, "grad_norm": 0.11065204821839576, "kl": 0.100830078125, "learning_rate": 6.185551589075483e-06, "loss": 0.0142, "num_tokens": 929905112.0, "reward": 2.09130859375, "reward_std": 0.16599507629871368, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 776.640625, "completions/mean_terminated_length": 776.640625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.662968336604933, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12877473214894386, "kl": 0.102783203125, "learning_rate": 6.174541549812882e-06, "loss": 0.0183, "num_tokens": 930377424.0, "reward": 2.1044921875, "reward_std": 0.19224420189857483, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 825.927734375, "completions/mean_terminated_length": 825.927734375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6633097209183239, "frac_reward_zero_std": 0.5, "grad_norm": 0.1131674520392799, "kl": 0.0972900390625, "learning_rate": 6.163536940088867e-06, "loss": 0.0054, "num_tokens": 930889803.0, "reward": 2.064453125, "reward_std": 0.1736922413110733, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 826.912109375, "completions/mean_terminated_length": 826.912109375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6636511052317146, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10160856617066724, "kl": 0.0960693359375, "learning_rate": 6.152537775522467e-06, "loss": 0.0202, "num_tokens": 931389246.0, "reward": 2.1142578125, "reward_std": 0.1359277218580246, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 808.966796875, "completions/mean_terminated_length": 808.966796875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6639924895451054, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10084700785644309, "kl": 0.0975341796875, "learning_rate": 6.141544071724983e-06, "loss": 0.0155, "num_tokens": 931884845.0, "reward": 2.05224609375, "reward_std": 0.14772072434425354, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 814.330078125, "completions/mean_terminated_length": 814.330078125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.6643338738584962, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11015180088086696, "kl": 0.10205078125, "learning_rate": 6.130555844299972e-06, "loss": 0.0162, "num_tokens": 932388566.0, "reward": 2.052734375, "reward_std": 0.16712047159671783, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 762.732421875, "completions/mean_terminated_length": 762.732421875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.664675258171887, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09933961610493047, "kl": 0.107666015625, "learning_rate": 6.119573108843205e-06, "loss": 0.015, "num_tokens": 932869869.0, "reward": 2.04052734375, "reward_std": 0.11283182352781296, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 771.8984375, "completions/mean_terminated_length": 769.4011840820312, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6650166424852778, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11200395518419137, "kl": 0.105712890625, "learning_rate": 6.1085958809426695e-06, "loss": 0.0344, "num_tokens": 933348217.0, "reward": 2.099609375, "reward_std": 0.2121584415435791, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 766.509765625, "completions/mean_terminated_length": 766.509765625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6653580267986686, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09961798541812392, "kl": 0.1015625, "learning_rate": 6.097624176178534e-06, "loss": 0.0051, "num_tokens": 933824462.0, "reward": 2.0888671875, "reward_std": 0.13049477338790894, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 825.943359375, "completions/mean_terminated_length": 823.5518798828125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6656994111120594, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09280574498637684, "kl": 0.098388671875, "learning_rate": 6.086658010123125e-06, "loss": 0.0162, "num_tokens": 934335393.0, "reward": 2.05419921875, "reward_std": 0.13643795251846313, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 793.9609375, "completions/mean_terminated_length": 793.9609375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.6660407954254502, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11056606273782156, "kl": 0.102783203125, "learning_rate": 6.075697398340913e-06, "loss": 0.0235, "num_tokens": 934831661.0, "reward": 2.05615234375, "reward_std": 0.17489361763000488, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 703.908203125, "completions/mean_terminated_length": 703.908203125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.666382179738841, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1148435107795283, "kl": 0.1119384765625, "learning_rate": 6.064742356388478e-06, "loss": 0.0029, "num_tokens": 935284942.0, "reward": 2.07763671875, "reward_std": 0.15107226371765137, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 702.076171875, "completions/mean_terminated_length": 702.076171875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6667235640522318, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11907838453031888, "kl": 0.104736328125, "learning_rate": 6.0537928998145045e-06, "loss": 0.0175, "num_tokens": 935728133.0, "reward": 2.06640625, "reward_std": 0.15218521654605865, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 758.890625, "completions/mean_terminated_length": 758.890625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6670649483656226, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07792898134617104, "kl": 0.102294921875, "learning_rate": 6.042849044159734e-06, "loss": 0.0097, "num_tokens": 936208605.0, "reward": 2.046875, "reward_std": 0.08700709789991379, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 825.296875, "completions/mean_terminated_length": 825.296875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6674063326790134, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08125644484792041, "kl": 0.0965576171875, "learning_rate": 6.031910804956971e-06, "loss": 0.0095, "num_tokens": 936714517.0, "reward": 2.0322265625, "reward_std": 0.09020845592021942, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 824.177734375, "completions/mean_terminated_length": 822.994140625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6677477169924042, "frac_reward_zero_std": 0.5625, "grad_norm": 0.25755206381367257, "kl": 0.2420654296875, "learning_rate": 6.020978197731049e-06, "loss": 0.0321, "num_tokens": 937219456.0, "reward": 2.046875, "reward_std": 0.17474335432052612, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 790.701171875, "completions/mean_terminated_length": 790.701171875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.668089101305795, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09736689396410013, "kl": 0.0972900390625, "learning_rate": 6.0100512379988e-06, "loss": 0.0149, "num_tokens": 937707303.0, "reward": 2.04296875, "reward_std": 0.1225418820977211, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 796.291015625, "completions/mean_terminated_length": 796.291015625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.6684304856191858, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09194328347143928, "kl": 0.09765625, "learning_rate": 5.99912994126905e-06, "loss": 0.0086, "num_tokens": 938193420.0, "reward": 2.0859375, "reward_std": 0.12111739814281464, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 770.86328125, "completions/mean_terminated_length": 770.2113647460938, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6687718699325766, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5608597993527045, "kl": 0.1497802734375, "learning_rate": 5.988214323042581e-06, "loss": 0.0152, "num_tokens": 938666534.0, "reward": 2.060546875, "reward_std": 0.16424080729484558, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 793.724609375, "completions/mean_terminated_length": 793.724609375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6691132542459673, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11537016325009912, "kl": 0.0948486328125, "learning_rate": 5.977304398812113e-06, "loss": 0.0169, "num_tokens": 939166633.0, "reward": 2.06689453125, "reward_std": 0.1524951010942459, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 730.05078125, "completions/mean_terminated_length": 730.05078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6694546385593582, "frac_reward_zero_std": 0.5, "grad_norm": 0.12097180400471165, "kl": 0.100830078125, "learning_rate": 5.966400184062289e-06, "loss": 0.0259, "num_tokens": 939625251.0, "reward": 2.1005859375, "reward_std": 0.17789947986602783, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 736.38671875, "completions/mean_terminated_length": 736.38671875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.669796022872749, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10852407789939425, "kl": 0.1002197265625, "learning_rate": 5.955501694269646e-06, "loss": 0.0238, "num_tokens": 940081817.0, "reward": 2.17333984375, "reward_std": 0.17577509582042694, "rewards/accuracy_reward/mean": 0.19354838132858276, "rewards/accuracy_reward/std": 0.39547789096832275, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 698.388671875, "completions/mean_terminated_length": 698.388671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6701374071861398, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13431974445694114, "kl": 0.1043701171875, "learning_rate": 5.9446089449026e-06, "loss": 0.0202, "num_tokens": 940519760.0, "reward": 2.08740234375, "reward_std": 0.18977874517440796, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 723.326171875, "completions/mean_terminated_length": 723.326171875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6704787914995306, "frac_reward_zero_std": 0.625, "grad_norm": 0.10041880767498992, "kl": 0.10107421875, "learning_rate": 5.933721951421416e-06, "loss": 0.0083, "num_tokens": 940973831.0, "reward": 2.06787109375, "reward_std": 0.13596998155117035, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 712.4375, "completions/mean_terminated_length": 712.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6708201758129214, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11184770498777245, "kl": 0.0975341796875, "learning_rate": 5.9228407292781945e-06, "loss": 0.0177, "num_tokens": 941429223.0, "reward": 2.15576171875, "reward_std": 0.1653352677822113, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 764.525390625, "completions/mean_terminated_length": 763.5087890625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6711615601263122, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2780127624346535, "kl": 0.1826171875, "learning_rate": 5.911965293916831e-06, "loss": 0.0173, "num_tokens": 941897316.0, "reward": 2.0361328125, "reward_std": 0.1485709846019745, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 729.966796875, "completions/mean_terminated_length": 729.966796875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.671502944439703, "frac_reward_zero_std": 0.5, "grad_norm": 0.16999121802483305, "kl": 0.098388671875, "learning_rate": 5.901095660773021e-06, "loss": 0.024, "num_tokens": 942355907.0, "reward": 2.06494140625, "reward_std": 0.17513471841812134, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 706.84375, "completions/mean_terminated_length": 706.84375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6718443287530937, "frac_reward_zero_std": 0.59375, "grad_norm": 0.110465573603467, "kl": 0.1024169921875, "learning_rate": 5.8902318452742195e-06, "loss": 0.0178, "num_tokens": 942798051.0, "reward": 2.07080078125, "reward_std": 0.12629035115242004, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 733.244140625, "completions/mean_terminated_length": 733.244140625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6721857130664846, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10718891593055058, "kl": 0.100341796875, "learning_rate": 5.879373862839626e-06, "loss": 0.0326, "num_tokens": 943257632.0, "reward": 2.0234375, "reward_std": 0.16227638721466064, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 745.9921875, "completions/mean_terminated_length": 745.9921875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6725270973798754, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1287739010112158, "kl": 0.0994873046875, "learning_rate": 5.868521728880164e-06, "loss": 0.0166, "num_tokens": 943730444.0, "reward": 2.05078125, "reward_std": 0.20202039182186127, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 702.224609375, "completions/mean_terminated_length": 702.224609375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6728684816932662, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10865199997245856, "kl": 0.104248046875, "learning_rate": 5.857675458798453e-06, "loss": 0.0052, "num_tokens": 944165407.0, "reward": 2.05419921875, "reward_std": 0.11590683460235596, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 709.591796875, "completions/mean_terminated_length": 709.591796875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.673209866006657, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13410568064677794, "kl": 0.09814453125, "learning_rate": 5.846835067988782e-06, "loss": 0.0193, "num_tokens": 944616558.0, "reward": 2.0791015625, "reward_std": 0.16934587061405182, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 708.65625, "completions/mean_terminated_length": 708.65625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6735512503200478, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12694905602270184, "kl": 0.1025390625, "learning_rate": 5.836000571837106e-06, "loss": 0.0157, "num_tokens": 945064302.0, "reward": 2.14501953125, "reward_std": 0.21507133543491364, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 758.404296875, "completions/mean_terminated_length": 758.404296875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6738926346334386, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09107222704982779, "kl": 0.0989990234375, "learning_rate": 5.825171985721017e-06, "loss": 0.0137, "num_tokens": 945544557.0, "reward": 2.068359375, "reward_std": 0.10336729884147644, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 777.33203125, "completions/mean_terminated_length": 775.9041137695312, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6742340189468294, "frac_reward_zero_std": 0.5625, "grad_norm": 1.80989979279191, "kl": 0.6285400390625, "learning_rate": 5.814349325009705e-06, "loss": 0.045, "num_tokens": 946031047.0, "reward": 2.0703125, "reward_std": 0.1631918102502823, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 754.150390625, "completions/mean_terminated_length": 754.150390625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6745754032602201, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10442021343662185, "kl": 0.0999755859375, "learning_rate": 5.803532605063962e-06, "loss": 0.0067, "num_tokens": 946510996.0, "reward": 2.05322265625, "reward_std": 0.1308140754699707, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 767.4140625, "completions/mean_terminated_length": 767.4140625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.674916787573611, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1101887942062251, "kl": 0.099365234375, "learning_rate": 5.792721841236143e-06, "loss": 0.0194, "num_tokens": 946986008.0, "reward": 2.10009765625, "reward_std": 0.17478403449058533, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 753.6640625, "completions/mean_terminated_length": 753.6640625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6752581718870018, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10700396600743019, "kl": 0.100830078125, "learning_rate": 5.781917048870145e-06, "loss": 0.0074, "num_tokens": 947452028.0, "reward": 2.08203125, "reward_std": 0.14430683851242065, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 801.091796875, "completions/mean_terminated_length": 801.091796875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6755995562003926, "frac_reward_zero_std": 0.625, "grad_norm": 0.10138526791084079, "kl": 0.093994140625, "learning_rate": 5.771118243301401e-06, "loss": 0.0046, "num_tokens": 947937275.0, "reward": 2.03369140625, "reward_std": 0.11363008618354797, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 804.037109375, "completions/mean_terminated_length": 804.037109375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6759409405137834, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09704533673726608, "kl": 0.0953369140625, "learning_rate": 5.760325439856833e-06, "loss": 0.008, "num_tokens": 948429262.0, "reward": 2.0703125, "reward_std": 0.11436352133750916, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 849.341796875, "completions/mean_terminated_length": 849.341796875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.6762823248271742, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11964096565141528, "kl": 0.096923828125, "learning_rate": 5.749538653854861e-06, "loss": 0.0023, "num_tokens": 948963485.0, "reward": 2.1806640625, "reward_std": 0.2459474503993988, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 815.490234375, "completions/mean_terminated_length": 815.490234375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.676623709140565, "frac_reward_zero_std": 0.40625, "grad_norm": 0.126109507978867, "kl": 0.1019287109375, "learning_rate": 5.7387579006053475e-06, "loss": 0.0215, "num_tokens": 949459272.0, "reward": 2.18212890625, "reward_std": 0.23353315889835358, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39069411158561707, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 867.97265625, "completions/mean_terminated_length": 867.97265625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6769650934539558, "frac_reward_zero_std": 0.625, "grad_norm": 0.0931777876924212, "kl": 0.09912109375, "learning_rate": 5.727983195409603e-06, "loss": 0.0068, "num_tokens": 949993994.0, "reward": 2.03564453125, "reward_std": 0.12351667881011963, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 803.50390625, "completions/mean_terminated_length": 803.50390625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6773064777673465, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10410692955959555, "kl": 0.0924072265625, "learning_rate": 5.717214553560353e-06, "loss": 0.0263, "num_tokens": 950485740.0, "reward": 2.1533203125, "reward_std": 0.1994428038597107, "rewards/accuracy_reward/mean": 0.1713709682226181, "rewards/accuracy_reward/std": 0.3772132694721222, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 874.4453125, "completions/mean_terminated_length": 874.4453125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6776478620807374, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10958907066689572, "kl": 0.0972900390625, "learning_rate": 5.706451990341706e-06, "loss": 0.0232, "num_tokens": 951014944.0, "reward": 2.11181640625, "reward_std": 0.19672146439552307, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 875.322265625, "completions/mean_terminated_length": 873.0274047851562, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6779892463941282, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09537727253250773, "kl": 0.10107421875, "learning_rate": 5.695695521029163e-06, "loss": 0.0132, "num_tokens": 951552853.0, "reward": 2.04296875, "reward_std": 0.1395440697669983, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 867.50390625, "completions/mean_terminated_length": 855.8619384765625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.678330630707519, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12669912918541315, "kl": 0.1005859375, "learning_rate": 5.684945160889556e-06, "loss": 0.0269, "num_tokens": 952083911.0, "reward": 2.07666015625, "reward_std": 0.2145688831806183, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99267578125, "rewards/tag_count_reward/std": 0.06310669332742691, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 879.841796875, "completions/mean_terminated_length": 879.841796875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.6786720150209098, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10975991758079864, "kl": 0.0926513671875, "learning_rate": 5.6742009251810614e-06, "loss": 0.0214, "num_tokens": 952616758.0, "reward": 2.08447265625, "reward_std": 0.16679877042770386, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 874.638671875, "completions/mean_terminated_length": 874.638671875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6790133993343006, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11781269613578704, "kl": 0.096435546875, "learning_rate": 5.663462829153153e-06, "loss": 0.0249, "num_tokens": 953148109.0, "reward": 2.09716796875, "reward_std": 0.20398655533790588, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 851.29296875, "completions/mean_terminated_length": 851.29296875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.6793547836476914, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1266017967523318, "kl": 0.100341796875, "learning_rate": 5.652730888046601e-06, "loss": 0.023, "num_tokens": 953672931.0, "reward": 2.09375, "reward_std": 0.2647911310195923, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 819.2421875, "completions/mean_terminated_length": 819.2421875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6796961679610822, "frac_reward_zero_std": 0.5, "grad_norm": 0.13144880972824774, "kl": 0.09814453125, "learning_rate": 5.642005117093419e-06, "loss": 0.0071, "num_tokens": 954178767.0, "reward": 2.0517578125, "reward_std": 0.1548386961221695, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 770.671875, "completions/mean_terminated_length": 770.671875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6800375522744729, "frac_reward_zero_std": 0.375, "grad_norm": 0.13380880678562446, "kl": 0.102294921875, "learning_rate": 5.631285531516885e-06, "loss": 0.0256, "num_tokens": 954649607.0, "reward": 2.11181640625, "reward_std": 0.21998533606529236, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 764.33203125, "completions/mean_terminated_length": 764.33203125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.6803789365878637, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12582038513490854, "kl": 0.111572265625, "learning_rate": 5.620572146531493e-06, "loss": 0.0088, "num_tokens": 955126401.0, "reward": 2.06884765625, "reward_std": 0.19668205082416534, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 747.205078125, "completions/mean_terminated_length": 747.205078125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6807203209012546, "frac_reward_zero_std": 0.5, "grad_norm": 0.10709419006485954, "kl": 0.1019287109375, "learning_rate": 5.6098649773429295e-06, "loss": 0.0197, "num_tokens": 955596922.0, "reward": 2.09423828125, "reward_std": 0.18940842151641846, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 774.658203125, "completions/mean_terminated_length": 774.658203125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.6810617052146454, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12023548122240334, "kl": 0.0966796875, "learning_rate": 5.599164039148066e-06, "loss": 0.0275, "num_tokens": 956073339.0, "reward": 2.015625, "reward_std": 0.180311381816864, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 777.43359375, "completions/mean_terminated_length": 777.43359375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6814030895280362, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14900464730687055, "kl": 0.1038818359375, "learning_rate": 5.588469347134926e-06, "loss": 0.0287, "num_tokens": 956557961.0, "reward": 2.06787109375, "reward_std": 0.2322199046611786, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 737.025390625, "completions/mean_terminated_length": 737.025390625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.681744473841427, "frac_reward_zero_std": 0.625, "grad_norm": 0.11066143815399011, "kl": 0.1065673828125, "learning_rate": 5.5777809164826665e-06, "loss": 0.0078, "num_tokens": 957016614.0, "reward": 2.1064453125, "reward_std": 0.12511879205703735, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 724.130859375, "completions/mean_terminated_length": 724.130859375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6820858581548178, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10245342651836052, "kl": 0.099365234375, "learning_rate": 5.567098762361559e-06, "loss": 0.0008, "num_tokens": 957461801.0, "reward": 2.12158203125, "reward_std": 0.161658376455307, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 779.623046875, "completions/mean_terminated_length": 777.140869140625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6824272424682086, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11700928358276662, "kl": 0.096435546875, "learning_rate": 5.5564228999329695e-06, "loss": 0.0265, "num_tokens": 957946840.0, "reward": 2.111328125, "reward_std": 0.18248909711837769, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 740.7265625, "completions/mean_terminated_length": 738.7808227539062, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6827686267815993, "frac_reward_zero_std": 0.5, "grad_norm": 0.2901385248374408, "kl": 0.2987060546875, "learning_rate": 5.545753344349334e-06, "loss": 0.0141, "num_tokens": 958419868.0, "reward": 2.1025390625, "reward_std": 0.21510764956474304, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 736.998046875, "completions/mean_terminated_length": 736.998046875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6831100110949901, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1366477600290527, "kl": 0.107666015625, "learning_rate": 5.535090110754131e-06, "loss": 0.002, "num_tokens": 958887019.0, "reward": 2.06201171875, "reward_std": 0.16876666247844696, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 729.4921875, "completions/mean_terminated_length": 729.4921875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.683451395408381, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1261084392841696, "kl": 0.1026611328125, "learning_rate": 5.52443321428188e-06, "loss": 0.0321, "num_tokens": 959346471.0, "reward": 2.09033203125, "reward_std": 0.18392547965049744, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 701.6015625, "completions/mean_terminated_length": 701.6015625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6837927797217718, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13162528620229105, "kl": 0.1011962890625, "learning_rate": 5.513782670058081e-06, "loss": 0.0204, "num_tokens": 959790843.0, "reward": 2.1015625, "reward_std": 0.21937954425811768, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 740.13671875, "completions/mean_terminated_length": 740.13671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6841341640351626, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1195063432897991, "kl": 0.106201171875, "learning_rate": 5.503138493199247e-06, "loss": 0.0122, "num_tokens": 960255393.0, "reward": 2.04833984375, "reward_std": 0.1457308530807495, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 718.6640625, "completions/mean_terminated_length": 718.6640625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6844755483485534, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11217202276918974, "kl": 0.10302734375, "learning_rate": 5.492500698812828e-06, "loss": 0.023, "num_tokens": 960724021.0, "reward": 2.1474609375, "reward_std": 0.15821854770183563, "rewards/accuracy_reward/mean": 0.16532258689403534, "rewards/accuracy_reward/std": 0.37184643745422363, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 681.869140625, "completions/mean_terminated_length": 681.869140625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6848169326619442, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14515098183111969, "kl": 0.105712890625, "learning_rate": 5.481869301997236e-06, "loss": 0.0174, "num_tokens": 961155842.0, "reward": 2.12109375, "reward_std": 0.23377162218093872, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 710.53515625, "completions/mean_terminated_length": 710.53515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.685158316975335, "frac_reward_zero_std": 0.75, "grad_norm": 0.08575803946602994, "kl": 0.1102294921875, "learning_rate": 5.4712443178417965e-06, "loss": 0.0099, "num_tokens": 961608660.0, "reward": 2.05322265625, "reward_std": 0.09000095725059509, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 716.427734375, "completions/mean_terminated_length": 716.427734375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6854997012887258, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11257631093561035, "kl": 0.100830078125, "learning_rate": 5.4606257614267255e-06, "loss": 0.0239, "num_tokens": 962057119.0, "reward": 2.15673828125, "reward_std": 0.19257782399654388, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 700.259765625, "completions/mean_terminated_length": 699.4912109375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6858410856021165, "frac_reward_zero_std": 0.59375, "grad_norm": 0.37059328753540943, "kl": 0.153564453125, "learning_rate": 5.450013647823125e-06, "loss": 0.0304, "num_tokens": 962517316.0, "reward": 2.0625, "reward_std": 0.14474454522132874, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 748.1953125, "completions/mean_terminated_length": 748.1953125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6861824699155074, "frac_reward_zero_std": 0.625, "grad_norm": 0.10674613939226914, "kl": 0.1019287109375, "learning_rate": 5.4394079920929425e-06, "loss": 0.0046, "num_tokens": 962986776.0, "reward": 2.04736328125, "reward_std": 0.13929228484630585, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 759.26171875, "completions/mean_terminated_length": 759.26171875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6865238542288982, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10146917168250959, "kl": 0.1064453125, "learning_rate": 5.428808809288976e-06, "loss": 0.01, "num_tokens": 963461422.0, "reward": 2.04296875, "reward_std": 0.12040339410305023, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 761.62890625, "completions/mean_terminated_length": 761.62890625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.686865238542289, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08856412347355888, "kl": 0.0975341796875, "learning_rate": 5.418216114454818e-06, "loss": 0.0146, "num_tokens": 963930288.0, "reward": 2.048828125, "reward_std": 0.1311972737312317, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 722.6484375, "completions/mean_terminated_length": 722.6484375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.6872066228556798, "frac_reward_zero_std": 0.625, "grad_norm": 0.13811469714687613, "kl": 0.106689453125, "learning_rate": 5.407629922624866e-06, "loss": 0.0066, "num_tokens": 964392732.0, "reward": 2.107421875, "reward_std": 0.15402653813362122, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 746.849609375, "completions/mean_terminated_length": 746.849609375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6875480071690706, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10743556415161466, "kl": 0.101806640625, "learning_rate": 5.397050248824291e-06, "loss": 0.0266, "num_tokens": 964860383.0, "reward": 2.05126953125, "reward_std": 0.13712453842163086, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 786.552734375, "completions/mean_terminated_length": 786.552734375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6878893914824614, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10711717909077777, "kl": 0.0989990234375, "learning_rate": 5.386477108068988e-06, "loss": 0.0131, "num_tokens": 965338522.0, "reward": 2.0361328125, "reward_std": 0.13219377398490906, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 739.498046875, "completions/mean_terminated_length": 739.498046875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6882307757958522, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11683600295941216, "kl": 0.1055908203125, "learning_rate": 5.37591051536561e-06, "loss": 0.0071, "num_tokens": 965804505.0, "reward": 2.05078125, "reward_std": 0.17575326561927795, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 778.916015625, "completions/mean_terminated_length": 776.4324951171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6885721601092429, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12112519873522337, "kl": 0.0916748046875, "learning_rate": 5.3653504857114935e-06, "loss": 0.015, "num_tokens": 966287694.0, "reward": 2.12646484375, "reward_std": 0.20306572318077087, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 740.67578125, "completions/mean_terminated_length": 740.67578125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6889135444226338, "frac_reward_zero_std": 0.5625, "grad_norm": 4.008928472882253, "kl": 0.1033935546875, "learning_rate": 5.354797034094679e-06, "loss": 0.0298, "num_tokens": 966746984.0, "reward": 2.10400390625, "reward_std": 0.17457887530326843, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 785.984375, "completions/mean_terminated_length": 784.98828125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6892549287360246, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1556942786086666, "kl": 0.1612548828125, "learning_rate": 5.344250175493849e-06, "loss": 0.018, "num_tokens": 967241760.0, "reward": 2.0869140625, "reward_std": 0.157125324010849, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 759.87890625, "completions/mean_terminated_length": 757.3580932617188, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6895963130494154, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1102269179250642, "kl": 0.1019287109375, "learning_rate": 5.333709924878347e-06, "loss": 0.0269, "num_tokens": 967705090.0, "reward": 2.04541015625, "reward_std": 0.17173349857330322, "rewards/accuracy_reward/mean": 0.06451612710952759, "rewards/accuracy_reward/std": 0.2459181249141693, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 799.93359375, "completions/mean_terminated_length": 799.93359375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6899376973628062, "frac_reward_zero_std": 0.5, "grad_norm": 0.10751049456229474, "kl": 0.094482421875, "learning_rate": 5.3231762972081406e-06, "loss": 0.0163, "num_tokens": 968202080.0, "reward": 2.07568359375, "reward_std": 0.18714803457260132, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 761.5625, "completions/mean_terminated_length": 761.5625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.690279081676197, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11808715308848317, "kl": 0.104248046875, "learning_rate": 5.312649307433768e-06, "loss": 0.0104, "num_tokens": 968668960.0, "reward": 2.052734375, "reward_std": 0.1914394199848175, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04910992085933685, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 772.0546875, "completions/mean_terminated_length": 772.0546875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6906204659895878, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12746979515090792, "kl": 0.10107421875, "learning_rate": 5.3021289704963785e-06, "loss": 0.0136, "num_tokens": 969136700.0, "reward": 2.06201171875, "reward_std": 0.17832285165786743, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 732.26171875, "completions/mean_terminated_length": 732.26171875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6909618503029786, "frac_reward_zero_std": 0.5, "grad_norm": 0.12345740840164675, "kl": 0.1025390625, "learning_rate": 5.291615301327662e-06, "loss": 0.021, "num_tokens": 969596066.0, "reward": 2.1181640625, "reward_std": 0.16829481720924377, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.022097086533904076, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 745.625, "completions/mean_terminated_length": 745.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6913032346163693, "frac_reward_zero_std": 0.5, "grad_norm": 0.12267508645302172, "kl": 0.1025390625, "learning_rate": 5.281108314849851e-06, "loss": 0.0021, "num_tokens": 970066146.0, "reward": 2.0244140625, "reward_std": 0.16133596003055573, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 736.958984375, "completions/mean_terminated_length": 736.958984375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6916446189297601, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11981446337399367, "kl": 0.10107421875, "learning_rate": 5.270608025975687e-06, "loss": 0.0084, "num_tokens": 970524381.0, "reward": 2.13134765625, "reward_std": 0.19658315181732178, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 778.08984375, "completions/mean_terminated_length": 778.08984375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.691986003243151, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1107831472524514, "kl": 0.09912109375, "learning_rate": 5.260114449608415e-06, "loss": 0.018, "num_tokens": 970999867.0, "reward": 2.064453125, "reward_std": 0.15245652198791504, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 814.8046875, "completions/mean_terminated_length": 814.8046875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6923273875565418, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11208166130328234, "kl": 0.0970458984375, "learning_rate": 5.249627600641747e-06, "loss": 0.0147, "num_tokens": 971509543.0, "reward": 2.07177734375, "reward_std": 0.15287816524505615, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 746.79296875, "completions/mean_terminated_length": 746.79296875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6926687718699326, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10649324647527986, "kl": 0.0986328125, "learning_rate": 5.23914749395984e-06, "loss": 0.0001, "num_tokens": 971988957.0, "reward": 2.11767578125, "reward_std": 0.14742454886436462, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 732.986328125, "completions/mean_terminated_length": 730.4129028320312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6930101561833234, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10823831898725403, "kl": 0.0992431640625, "learning_rate": 5.2286741444372975e-06, "loss": 0.018, "num_tokens": 972444982.0, "reward": 2.08642578125, "reward_std": 0.17701706290245056, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 779.181640625, "completions/mean_terminated_length": 779.181640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6933515404967142, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11264447597913584, "kl": 0.099853515625, "learning_rate": 5.2182075669391164e-06, "loss": 0.0075, "num_tokens": 972932803.0, "reward": 2.05859375, "reward_std": 0.16507941484451294, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 698.1796875, "completions/mean_terminated_length": 697.5264282226562, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.693692924810105, "frac_reward_zero_std": 0.4375, "grad_norm": 0.21127839925258773, "kl": 0.1881103515625, "learning_rate": 5.207747776320695e-06, "loss": 0.0218, "num_tokens": 973361519.0, "reward": 2.1083984375, "reward_std": 0.22784870862960815, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 730.546875, "completions/mean_terminated_length": 727.3471069335938, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6940343091234957, "frac_reward_zero_std": 0.625, "grad_norm": 0.19957019610542714, "kl": 0.175048828125, "learning_rate": 5.1972947874277915e-06, "loss": 0.0166, "num_tokens": 973837607.0, "reward": 2.091796875, "reward_std": 0.14681211113929749, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 769.0390625, "completions/mean_terminated_length": 769.0390625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6943756934368865, "frac_reward_zero_std": 0.625, "grad_norm": 0.09508792006478006, "kl": 0.1002197265625, "learning_rate": 5.186848615096505e-06, "loss": 0.0115, "num_tokens": 974309115.0, "reward": 2.12841796875, "reward_std": 0.1327778697013855, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 729.931640625, "completions/mean_terminated_length": 729.931640625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6947170777502774, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10752882568130481, "kl": 0.1004638671875, "learning_rate": 5.176409274153277e-06, "loss": 0.0022, "num_tokens": 974767016.0, "reward": 2.10498046875, "reward_std": 0.16479960083961487, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 779.91796875, "completions/mean_terminated_length": 779.91796875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6950584620636682, "frac_reward_zero_std": 0.625, "grad_norm": 0.10372097679380907, "kl": 0.1041259765625, "learning_rate": 5.165976779414832e-06, "loss": 0.0134, "num_tokens": 975246670.0, "reward": 2.0888671875, "reward_std": 0.14784477651119232, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 785.091796875, "completions/mean_terminated_length": 785.091796875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.695399846377059, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10638264723540172, "kl": 0.101806640625, "learning_rate": 5.1555511456882e-06, "loss": 0.0162, "num_tokens": 975738237.0, "reward": 2.05224609375, "reward_std": 0.14845740795135498, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 739.21484375, "completions/mean_terminated_length": 739.21484375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6957412306904498, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10031769406176394, "kl": 0.1051025390625, "learning_rate": 5.145132387770648e-06, "loss": 0.0082, "num_tokens": 976198651.0, "reward": 2.03955078125, "reward_std": 0.12245914340019226, "rewards/accuracy_reward/mean": 0.058467742055654526, "rewards/accuracy_reward/std": 0.23486268520355225, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 705.669921875, "completions/mean_terminated_length": 705.669921875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6960826150038406, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12616531304885203, "kl": 0.1043701171875, "learning_rate": 5.134720520449711e-06, "loss": 0.0095, "num_tokens": 976647282.0, "reward": 2.08740234375, "reward_std": 0.18612819910049438, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.31179171800613403, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 750.4765625, "completions/mean_terminated_length": 750.4765625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6964239993172314, "frac_reward_zero_std": 0.625, "grad_norm": 0.09446758455188235, "kl": 0.1011962890625, "learning_rate": 5.124315558503122e-06, "loss": 0.0149, "num_tokens": 977114630.0, "reward": 2.0947265625, "reward_std": 0.1494864821434021, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 740.236328125, "completions/mean_terminated_length": 740.236328125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6967653836306221, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13865660756287132, "kl": 0.1094970703125, "learning_rate": 5.113917516698819e-06, "loss": 0.0085, "num_tokens": 977580751.0, "reward": 2.04736328125, "reward_std": 0.11193152517080307, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 690.8125, "completions/mean_terminated_length": 690.8125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6971067679440129, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1031543455339039, "kl": 0.1065673828125, "learning_rate": 5.103526409794928e-06, "loss": 0.0086, "num_tokens": 978021343.0, "reward": 2.15576171875, "reward_std": 0.12713713943958282, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 739.8984375, "completions/mean_terminated_length": 739.8984375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6974481522574038, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10916169321039203, "kl": 0.1077880859375, "learning_rate": 5.0931422525397175e-06, "loss": 0.0112, "num_tokens": 978479467.0, "reward": 2.11474609375, "reward_std": 0.14108845591545105, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 745.154296875, "completions/mean_terminated_length": 745.154296875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6977895365707946, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11884375073765062, "kl": 0.105224609375, "learning_rate": 5.082765059671607e-06, "loss": 0.0142, "num_tokens": 978941034.0, "reward": 2.04052734375, "reward_std": 0.14127466082572937, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 729.376953125, "completions/mean_terminated_length": 729.376953125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6981309208841854, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11682519425628354, "kl": 0.1031494140625, "learning_rate": 5.0723948459191194e-06, "loss": 0.0142, "num_tokens": 979401643.0, "reward": 2.10595703125, "reward_std": 0.16969463229179382, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 743.8671875, "completions/mean_terminated_length": 743.1917724609375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6984723051975762, "frac_reward_zero_std": 0.6875, "grad_norm": 3.0591362554499257, "kl": 0.7757568359375, "learning_rate": 5.062031626000873e-06, "loss": 0.0446, "num_tokens": 979868519.0, "reward": 2.03466796875, "reward_std": 0.10062548518180847, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899640560150146, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 746.9921875, "completions/mean_terminated_length": 746.9921875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.698813689510967, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12870376355540256, "kl": 0.1060791015625, "learning_rate": 5.05167541462557e-06, "loss": 0.0055, "num_tokens": 980329795.0, "reward": 2.033203125, "reward_std": 0.2012486308813095, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 712.279296875, "completions/mean_terminated_length": 712.279296875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.6991550738243578, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08350991339142015, "kl": 0.107666015625, "learning_rate": 5.041326226491951e-06, "loss": 0.0128, "num_tokens": 980774354.0, "reward": 2.056640625, "reward_std": 0.11313889920711517, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 729.115234375, "completions/mean_terminated_length": 729.115234375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6994964581377485, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09399293238443626, "kl": 0.1065673828125, "learning_rate": 5.030984076288805e-06, "loss": 0.0199, "num_tokens": 981227405.0, "reward": 2.03857421875, "reward_std": 0.09750792384147644, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 740.78125, "completions/mean_terminated_length": 740.78125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.6998378424511393, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12531525402919874, "kl": 0.11083984375, "learning_rate": 5.020648978694913e-06, "loss": 0.0211, "num_tokens": 981688317.0, "reward": 2.09423828125, "reward_std": 0.21212685108184814, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 774.3359375, "completions/mean_terminated_length": 771.8434448242188, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7001792267645301, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09777271631782702, "kl": 0.0958251953125, "learning_rate": 5.010320948379064e-06, "loss": 0.0127, "num_tokens": 982164025.0, "reward": 2.07421875, "reward_std": 0.1544087529182434, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 784.30078125, "completions/mean_terminated_length": 784.30078125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.700520611077921, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10991477094332852, "kl": 0.1021728515625, "learning_rate": 5.000000000000003e-06, "loss": 0.0082, "num_tokens": 982647811.0, "reward": 2.03076171875, "reward_std": 0.14953526854515076, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 810.10546875, "completions/mean_terminated_length": 810.10546875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.7008619953913118, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09869394424636016, "kl": 0.09521484375, "learning_rate": 4.989686148206425e-06, "loss": 0.0177, "num_tokens": 983144537.0, "reward": 2.05859375, "reward_std": 0.1433883011341095, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 805.900390625, "completions/mean_terminated_length": 803.4696655273438, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7012033797047026, "frac_reward_zero_std": 0.5, "grad_norm": 0.1149187818207568, "kl": 0.1002197265625, "learning_rate": 4.979379407636966e-06, "loss": 0.0318, "num_tokens": 983639670.0, "reward": 2.06591796875, "reward_std": 0.19060955941677094, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 736.306640625, "completions/mean_terminated_length": 736.306640625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7015447640180934, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11205031401312786, "kl": 0.1077880859375, "learning_rate": 4.96907979292015e-06, "loss": 0.0042, "num_tokens": 984100995.0, "reward": 2.10107421875, "reward_std": 0.1539515256881714, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 778.015625, "completions/mean_terminated_length": 778.015625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7018861483314842, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11358743310292035, "kl": 0.1021728515625, "learning_rate": 4.958787318674403e-06, "loss": 0.0151, "num_tokens": 984576731.0, "reward": 2.11083984375, "reward_std": 0.14258147776126862, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 819.302734375, "completions/mean_terminated_length": 816.8982543945312, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7022275326448749, "frac_reward_zero_std": 0.5, "grad_norm": 0.1146231796432012, "kl": 0.103759765625, "learning_rate": 4.948501999508003e-06, "loss": 0.0356, "num_tokens": 985077590.0, "reward": 2.04443359375, "reward_std": 0.18418854475021362, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 738.9296875, "completions/mean_terminated_length": 738.9296875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7025689169582657, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11576557917114856, "kl": 0.100341796875, "learning_rate": 4.938223850019087e-06, "loss": 0.0077, "num_tokens": 985544626.0, "reward": 2.16748046875, "reward_std": 0.20567357540130615, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39070644974708557, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 731.44140625, "completions/mean_terminated_length": 731.44140625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7029103012716565, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12366137513198235, "kl": 0.107421875, "learning_rate": 4.927952884795605e-06, "loss": 0.0315, "num_tokens": 985997828.0, "reward": 2.138671875, "reward_std": 0.20119020342826843, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 769.11328125, "completions/mean_terminated_length": 769.11328125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7032516855850474, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10822574514706149, "kl": 0.0965576171875, "learning_rate": 4.917689118415309e-06, "loss": 0.0175, "num_tokens": 986476798.0, "reward": 2.12060546875, "reward_std": 0.16070598363876343, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 774.353515625, "completions/mean_terminated_length": 774.353515625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7035930698984382, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1104524056205253, "kl": 0.10546875, "learning_rate": 4.907432565445744e-06, "loss": 0.0092, "num_tokens": 986969827.0, "reward": 2.0791015625, "reward_std": 0.15066951513290405, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 750.40625, "completions/mean_terminated_length": 750.40625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.703934454211829, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12414357204138397, "kl": 0.098388671875, "learning_rate": 4.8971832404442074e-06, "loss": 0.0021, "num_tokens": 987433283.0, "reward": 2.11669921875, "reward_std": 0.1339670866727829, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 810.884765625, "completions/mean_terminated_length": 810.884765625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7042758385252198, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10678886701420451, "kl": 0.09814453125, "learning_rate": 4.886941157957747e-06, "loss": 0.0115, "num_tokens": 987934152.0, "reward": 2.0380859375, "reward_std": 0.1307041347026825, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 784.4375, "completions/mean_terminated_length": 783.6497192382812, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7046172228386106, "frac_reward_zero_std": 0.46875, "grad_norm": 0.22928060863699024, "kl": 0.255126953125, "learning_rate": 4.87670633252312e-06, "loss": 0.0216, "num_tokens": 988428952.0, "reward": 2.06689453125, "reward_std": 0.2171318084001541, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 714.9140625, "completions/mean_terminated_length": 714.25048828125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7049586071520013, "frac_reward_zero_std": 0.5, "grad_norm": 0.43490420438884164, "kl": 0.1075439453125, "learning_rate": 4.8664787786667875e-06, "loss": 0.0238, "num_tokens": 988876332.0, "reward": 2.14111328125, "reward_std": 0.16205468773841858, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 801.38671875, "completions/mean_terminated_length": 801.38671875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.7052999914653921, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10774323258511578, "kl": 0.100341796875, "learning_rate": 4.856258510904899e-06, "loss": 0.0116, "num_tokens": 989371362.0, "reward": 2.056640625, "reward_std": 0.1453634798526764, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 779.5390625, "completions/mean_terminated_length": 779.5390625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7056413757787829, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1208318897516213, "kl": 0.0948486328125, "learning_rate": 4.846045543743247e-06, "loss": 0.0178, "num_tokens": 989852950.0, "reward": 2.03759765625, "reward_std": 0.18036985397338867, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 805.181640625, "completions/mean_terminated_length": 805.181640625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7059827600921738, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12450986052275066, "kl": 0.0965576171875, "learning_rate": 4.8358398916772785e-06, "loss": 0.0241, "num_tokens": 990356563.0, "reward": 2.08984375, "reward_std": 0.20382779836654663, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 771.36328125, "completions/mean_terminated_length": 771.36328125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7063241444055646, "frac_reward_zero_std": 0.625, "grad_norm": 0.09538280654314552, "kl": 0.098876953125, "learning_rate": 4.825641569192042e-06, "loss": 0.022, "num_tokens": 990838685.0, "reward": 2.12109375, "reward_std": 0.1475500762462616, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 760.443359375, "completions/mean_terminated_length": 760.443359375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7066655287189554, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10107810728818863, "kl": 0.095703125, "learning_rate": 4.815450590762203e-06, "loss": 0.0153, "num_tokens": 991306976.0, "reward": 2.13427734375, "reward_std": 0.15430554747581482, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 772.33203125, "completions/mean_terminated_length": 772.33203125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7070069130323462, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10584696626888425, "kl": 0.101318359375, "learning_rate": 4.805266970851975e-06, "loss": 0.017, "num_tokens": 991789322.0, "reward": 2.07470703125, "reward_std": 0.16344644129276276, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 767.6953125, "completions/mean_terminated_length": 767.6953125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.707348297345737, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0954306726603832, "kl": 0.0982666015625, "learning_rate": 4.7950907239151526e-06, "loss": 0.0144, "num_tokens": 992263118.0, "reward": 2.0615234375, "reward_std": 0.12288254499435425, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 707.587890625, "completions/mean_terminated_length": 707.587890625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7076896816591277, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12352790712452524, "kl": 0.1026611328125, "learning_rate": 4.78492186439506e-06, "loss": 0.0197, "num_tokens": 992701275.0, "reward": 2.0947265625, "reward_std": 0.17691496014595032, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 750.96484375, "completions/mean_terminated_length": 750.96484375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7080310659725185, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13062227880681337, "kl": 0.0985107421875, "learning_rate": 4.774760406724527e-06, "loss": 0.0149, "num_tokens": 993170169.0, "reward": 2.05419921875, "reward_std": 0.14873984456062317, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 684.603515625, "completions/mean_terminated_length": 684.603515625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7083724502859093, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13189837434610163, "kl": 0.10009765625, "learning_rate": 4.76460636532589e-06, "loss": 0.0079, "num_tokens": 993604046.0, "reward": 2.169921875, "reward_std": 0.21299269795417786, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 767.423828125, "completions/mean_terminated_length": 767.423828125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7087138345993002, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10244943271251239, "kl": 0.095947265625, "learning_rate": 4.754459754610952e-06, "loss": 0.0104, "num_tokens": 994084967.0, "reward": 2.07177734375, "reward_std": 0.1548658013343811, "rewards/accuracy_reward/mean": 0.0927419364452362, "rewards/accuracy_reward/std": 0.2903633117675781, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 737.21875, "completions/mean_terminated_length": 737.21875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.709055218912691, "frac_reward_zero_std": 0.75, "grad_norm": 0.09473858502606197, "kl": 0.0992431640625, "learning_rate": 4.744320588980969e-06, "loss": 0.0043, "num_tokens": 994545063.0, "reward": 2.08740234375, "reward_std": 0.08631035685539246, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 671.484375, "completions/mean_terminated_length": 671.484375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7093966032260818, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09818449547923133, "kl": 0.112060546875, "learning_rate": 4.73418888282663e-06, "loss": 0.0031, "num_tokens": 994965519.0, "reward": 2.07470703125, "reward_std": 0.1275642067193985, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 700.24609375, "completions/mean_terminated_length": 700.24609375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7097379875394726, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10435855957834034, "kl": 0.10498046875, "learning_rate": 4.724064650528042e-06, "loss": 0.0081, "num_tokens": 995409341.0, "reward": 2.04052734375, "reward_std": 0.12298581004142761, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 765.4921875, "completions/mean_terminated_length": 765.4921875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7100793718528634, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09600840909273292, "kl": 0.095947265625, "learning_rate": 4.713947906454703e-06, "loss": 0.0091, "num_tokens": 995884121.0, "reward": 2.1171875, "reward_std": 0.1521209478378296, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939760684967, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 756.26171875, "completions/mean_terminated_length": 756.26171875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7104207561662541, "frac_reward_zero_std": 0.5, "grad_norm": 0.11833709434554099, "kl": 0.095947265625, "learning_rate": 4.703838664965476e-06, "loss": -0.0007, "num_tokens": 996355423.0, "reward": 2.07080078125, "reward_std": 0.1716424524784088, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 736.435546875, "completions/mean_terminated_length": 736.435546875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.7107621404796449, "frac_reward_zero_std": 0.625, "grad_norm": 0.11110528000795776, "kl": 0.10107421875, "learning_rate": 4.693736940408591e-06, "loss": 0.0186, "num_tokens": 996814718.0, "reward": 2.048828125, "reward_std": 0.1187463253736496, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 715.30078125, "completions/mean_terminated_length": 715.30078125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7111035247930357, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09752870806168366, "kl": 0.1029052734375, "learning_rate": 4.683642747121582e-06, "loss": 0.0129, "num_tokens": 997259224.0, "reward": 2.1103515625, "reward_std": 0.12947189807891846, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 690.79296875, "completions/mean_terminated_length": 690.79296875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7114449091064265, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1104350441420909, "kl": 0.1065673828125, "learning_rate": 4.6735560994313224e-06, "loss": 0.0024, "num_tokens": 997688030.0, "reward": 2.064453125, "reward_std": 0.14352592825889587, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 766.2109375, "completions/mean_terminated_length": 766.2109375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7117862934198174, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11262670982400147, "kl": 0.1031494140625, "learning_rate": 4.663477011653955e-06, "loss": 0.0126, "num_tokens": 998162410.0, "reward": 2.03564453125, "reward_std": 0.14511658251285553, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 760.029296875, "completions/mean_terminated_length": 760.029296875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7121276777332082, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10238495914090655, "kl": 0.099365234375, "learning_rate": 4.653405498094911e-06, "loss": 0.0086, "num_tokens": 998631673.0, "reward": 2.0498046875, "reward_std": 0.1485825479030609, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 736.015625, "completions/mean_terminated_length": 736.015625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.712469062046599, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11059050249514897, "kl": 0.0966796875, "learning_rate": 4.643341573048853e-06, "loss": 0.0083, "num_tokens": 999095905.0, "reward": 2.10107421875, "reward_std": 0.19338315725326538, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 816.166015625, "completions/mean_terminated_length": 816.166015625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7128104463599898, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10950385573125082, "kl": 0.101806640625, "learning_rate": 4.633285250799686e-06, "loss": 0.0155, "num_tokens": 999602726.0, "reward": 2.046875, "reward_std": 0.16972284018993378, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 732.763671875, "completions/mean_terminated_length": 732.763671875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7131518306733805, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10818168050796399, "kl": 0.103515625, "learning_rate": 4.623236545620528e-06, "loss": 0.0091, "num_tokens": 1000060877.0, "reward": 2.08154296875, "reward_std": 0.1542920023202896, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 748.162109375, "completions/mean_terminated_length": 748.162109375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7134932149867713, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11340876634085766, "kl": 0.103515625, "learning_rate": 4.613195471773661e-06, "loss": 0.0235, "num_tokens": 1000522416.0, "reward": 2.05908203125, "reward_std": 0.16126471757888794, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 699.109375, "completions/mean_terminated_length": 699.109375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7138345993001621, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12722535021677825, "kl": 0.1036376953125, "learning_rate": 4.603162043510566e-06, "loss": 0.0139, "num_tokens": 1000961000.0, "reward": 2.18798828125, "reward_std": 0.2019350528717041, "rewards/accuracy_reward/mean": 0.20564515888690948, "rewards/accuracy_reward/std": 0.40458032488822937, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 760.9140625, "completions/mean_terminated_length": 760.9140625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.714175983613553, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11832845468952521, "kl": 0.0948486328125, "learning_rate": 4.593136275071851e-06, "loss": 0.0196, "num_tokens": 1001436284.0, "reward": 2.09228515625, "reward_std": 0.20821265876293182, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 778.53515625, "completions/mean_terminated_length": 776.0509033203125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7145173679269438, "frac_reward_zero_std": 0.625, "grad_norm": 0.08772208393278776, "kl": 0.092529296875, "learning_rate": 4.5831181806872695e-06, "loss": 0.0222, "num_tokens": 1001913310.0, "reward": 2.111328125, "reward_std": 0.14681750535964966, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 760.833984375, "completions/mean_terminated_length": 760.833984375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7148587522403346, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11469965002594809, "kl": 0.0933837890625, "learning_rate": 4.573107774575665e-06, "loss": 0.0131, "num_tokens": 1002382745.0, "reward": 2.06787109375, "reward_std": 0.16421960294246674, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 768.552734375, "completions/mean_terminated_length": 768.552734375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7152001365537254, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11163985743790847, "kl": 0.1016845703125, "learning_rate": 4.563105070944987e-06, "loss": 0.019, "num_tokens": 1002853444.0, "reward": 2.04736328125, "reward_std": 0.15100422501564026, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 823.126953125, "completions/mean_terminated_length": 823.126953125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7155415208671162, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10016776803840798, "kl": 0.0958251953125, "learning_rate": 4.553110083992237e-06, "loss": 0.0119, "num_tokens": 1003356597.0, "reward": 2.0302734375, "reward_std": 0.1558925062417984, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 706.837890625, "completions/mean_terminated_length": 704.2133178710938, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7158829051805069, "frac_reward_zero_std": 0.375, "grad_norm": 0.13140494056779023, "kl": 0.10107421875, "learning_rate": 4.54312282790347e-06, "loss": 0.0182, "num_tokens": 1003795170.0, "reward": 2.11767578125, "reward_std": 0.24807727336883545, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 698.349609375, "completions/mean_terminated_length": 696.888427734375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7162242894938977, "frac_reward_zero_std": 0.65625, "grad_norm": 2.82143411360932, "kl": 0.9134521484375, "learning_rate": 4.533143316853776e-06, "loss": 0.0375, "num_tokens": 1004242277.0, "reward": 2.07763671875, "reward_std": 0.12286057323217392, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 729.767578125, "completions/mean_terminated_length": 729.767578125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7165656738072885, "frac_reward_zero_std": 0.625, "grad_norm": 0.09764359099584555, "kl": 0.09716796875, "learning_rate": 4.52317156500724e-06, "loss": 0.0107, "num_tokens": 1004705726.0, "reward": 2.09228515625, "reward_std": 0.12315435707569122, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 752.65625, "completions/mean_terminated_length": 752.113525390625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7169070581206793, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4717622066486411, "kl": 0.1666259765625, "learning_rate": 4.513207586516943e-06, "loss": 0.0147, "num_tokens": 1005165422.0, "reward": 2.1025390625, "reward_std": 0.19573919475078583, "rewards/accuracy_reward/mean": 0.11693548411130905, "rewards/accuracy_reward/std": 0.3216678202152252, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 822.44140625, "completions/mean_terminated_length": 822.44140625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7172484424340702, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10455473305005103, "kl": 0.0985107421875, "learning_rate": 4.503251395524931e-06, "loss": 0.0199, "num_tokens": 1005675584.0, "reward": 2.09765625, "reward_std": 0.19760385155677795, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 726.484375, "completions/mean_terminated_length": 726.484375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.717589826747461, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13197603800414817, "kl": 0.099609375, "learning_rate": 4.493303006162188e-06, "loss": 0.0149, "num_tokens": 1006136728.0, "reward": 2.1376953125, "reward_std": 0.2068256139755249, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 765.486328125, "completions/mean_terminated_length": 765.486328125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7179312110608518, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11857890339266794, "kl": 0.0997314453125, "learning_rate": 4.483362432548645e-06, "loss": 0.008, "num_tokens": 1006612081.0, "reward": 2.14208984375, "reward_std": 0.20582345128059387, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 805.701171875, "completions/mean_terminated_length": 805.701171875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.7182725953742426, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09978256206631556, "kl": 0.09716796875, "learning_rate": 4.473429688793118e-06, "loss": 0.0144, "num_tokens": 1007107064.0, "reward": 2.0537109375, "reward_std": 0.16002313792705536, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 734.865234375, "completions/mean_terminated_length": 734.865234375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7186139796876334, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11161741482409486, "kl": 0.1002197265625, "learning_rate": 4.463504788993327e-06, "loss": 0.0116, "num_tokens": 1007569155.0, "reward": 2.13671875, "reward_std": 0.1707926094532013, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 791.009765625, "completions/mean_terminated_length": 791.009765625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7189553640010241, "frac_reward_zero_std": 0.625, "grad_norm": 0.1161721085047564, "kl": 0.1002197265625, "learning_rate": 4.453587747235847e-06, "loss": 0.0287, "num_tokens": 1008055448.0, "reward": 2.02294921875, "reward_std": 0.12296046316623688, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 769.45703125, "completions/mean_terminated_length": 769.45703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7192967483144149, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10103995352835363, "kl": 0.0938720703125, "learning_rate": 4.443678577596109e-06, "loss": 0.019, "num_tokens": 1008535234.0, "reward": 2.1513671875, "reward_std": 0.1677519679069519, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 800.466796875, "completions/mean_terminated_length": 799.1663208007812, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.7196381326278057, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2686746914767253, "kl": 0.178955078125, "learning_rate": 4.433777294138366e-06, "loss": 0.0104, "num_tokens": 1009025905.0, "reward": 2.0478515625, "reward_std": 0.16900154948234558, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 760.068359375, "completions/mean_terminated_length": 760.068359375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7199795169411966, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1130656438412054, "kl": 0.093505859375, "learning_rate": 4.4238839109156735e-06, "loss": 0.0118, "num_tokens": 1009493476.0, "reward": 2.09521484375, "reward_std": 0.1476316899061203, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 731.072265625, "completions/mean_terminated_length": 731.072265625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7203209012545874, "frac_reward_zero_std": 0.5, "grad_norm": 0.12943444789604278, "kl": 0.09619140625, "learning_rate": 4.413998441969889e-06, "loss": 0.0189, "num_tokens": 1009956345.0, "reward": 2.09716796875, "reward_std": 0.1811424195766449, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 695.046875, "completions/mean_terminated_length": 695.046875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7206622855679782, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11108772582024322, "kl": 0.10498046875, "learning_rate": 4.404120901331618e-06, "loss": 0.0127, "num_tokens": 1010408529.0, "reward": 2.03564453125, "reward_std": 0.1395827680826187, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 756.974609375, "completions/mean_terminated_length": 756.974609375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.721003669881369, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13351830558329708, "kl": 0.0960693359375, "learning_rate": 4.39425130302023e-06, "loss": 0.0193, "num_tokens": 1010877668.0, "reward": 2.11572265625, "reward_std": 0.21046508848667145, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 734.04296875, "completions/mean_terminated_length": 734.04296875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7213450541947598, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12046131888024755, "kl": 0.10107421875, "learning_rate": 4.384389661043813e-06, "loss": 0.0082, "num_tokens": 1011333354.0, "reward": 2.14306640625, "reward_std": 0.19633442163467407, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 778.57421875, "completions/mean_terminated_length": 778.57421875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7216864385081505, "frac_reward_zero_std": 0.625, "grad_norm": 0.09716349998935347, "kl": 0.101318359375, "learning_rate": 4.374535989399159e-06, "loss": 0.0141, "num_tokens": 1011808016.0, "reward": 2.03369140625, "reward_std": 0.1273026019334793, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 747.91015625, "completions/mean_terminated_length": 737.6732177734375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7220278228215413, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1165071163046269, "kl": 0.098876953125, "learning_rate": 4.36469030207176e-06, "loss": 0.0174, "num_tokens": 1012276610.0, "reward": 2.10986328125, "reward_std": 0.16684389114379883, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 723.033203125, "completions/mean_terminated_length": 722.0195922851562, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7223692071349321, "frac_reward_zero_std": 0.625, "grad_norm": 0.23719071377250686, "kl": 0.2083740234375, "learning_rate": 4.354852613035763e-06, "loss": 0.0125, "num_tokens": 1012724995.0, "reward": 2.0712890625, "reward_std": 0.13486310839653015, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 690.677734375, "completions/mean_terminated_length": 690.677734375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.722710591448323, "frac_reward_zero_std": 0.53125, "grad_norm": 0.13055274310379056, "kl": 0.114013671875, "learning_rate": 4.345022936253972e-06, "loss": 0.0092, "num_tokens": 1013167166.0, "reward": 2.091796875, "reward_std": 0.16054841876029968, "rewards/accuracy_reward/mean": 0.1088709682226181, "rewards/accuracy_reward/std": 0.3117917478084564, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 708.51171875, "completions/mean_terminated_length": 708.51171875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7230519757617138, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11726437115401993, "kl": 0.1075439453125, "learning_rate": 4.33520128567781e-06, "loss": 0.0196, "num_tokens": 1013616308.0, "reward": 2.14794921875, "reward_std": 0.16198712587356567, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 779.060546875, "completions/mean_terminated_length": 779.060546875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7233933600751046, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12798474754202877, "kl": 0.1025390625, "learning_rate": 4.325387675247321e-06, "loss": 0.0199, "num_tokens": 1014111667.0, "reward": 2.0322265625, "reward_std": 0.16672256588935852, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 768.3046875, "completions/mean_terminated_length": 768.3046875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7237347443884954, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10875878392985924, "kl": 0.1029052734375, "learning_rate": 4.3155821188911265e-06, "loss": 0.0234, "num_tokens": 1014588319.0, "reward": 2.04833984375, "reward_std": 0.14927825331687927, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 795.654296875, "completions/mean_terminated_length": 795.654296875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7240761287018862, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10224848182361013, "kl": 0.1026611328125, "learning_rate": 4.3057846305264164e-06, "loss": 0.0039, "num_tokens": 1015091134.0, "reward": 2.099609375, "reward_std": 0.1518208384513855, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 784.005859375, "completions/mean_terminated_length": 784.005859375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7244175130152769, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11151861210708472, "kl": 0.0980224609375, "learning_rate": 4.29599522405894e-06, "loss": 0.0082, "num_tokens": 1015576417.0, "reward": 2.14794921875, "reward_std": 0.17306667566299438, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 776.4765625, "completions/mean_terminated_length": 776.4765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.7247588973286677, "frac_reward_zero_std": 0.5, "grad_norm": 0.10968847279834042, "kl": 0.0943603515625, "learning_rate": 4.286213913382963e-06, "loss": 0.023, "num_tokens": 1016060597.0, "reward": 2.10400390625, "reward_std": 0.17833377420902252, "rewards/accuracy_reward/mean": 0.13124999403953552, "rewards/accuracy_reward/std": 0.33802586793899536, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 783.408203125, "completions/mean_terminated_length": 782.8179931640625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7251002816420585, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3840072516696254, "kl": 0.146484375, "learning_rate": 4.276440712381275e-06, "loss": 0.0121, "num_tokens": 1016543702.0, "reward": 2.06005859375, "reward_std": 0.15233442187309265, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 777.375, "completions/mean_terminated_length": 777.375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7254416659554493, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10658638551086382, "kl": 0.101806640625, "learning_rate": 4.266675634925137e-06, "loss": 0.0118, "num_tokens": 1017025126.0, "reward": 2.1279296875, "reward_std": 0.15921558439731598, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 793.59375, "completions/mean_terminated_length": 793.59375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7257830502688402, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10961270796386702, "kl": 0.0972900390625, "learning_rate": 4.2569186948743e-06, "loss": 0.0126, "num_tokens": 1017513734.0, "reward": 2.08154296875, "reward_std": 0.1762491911649704, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 765.96875, "completions/mean_terminated_length": 765.96875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.726124434582231, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11818498022047987, "kl": 0.097900390625, "learning_rate": 4.247169906076953e-06, "loss": 0.0079, "num_tokens": 1018000006.0, "reward": 2.111328125, "reward_std": 0.18563778698444366, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 786.1953125, "completions/mean_terminated_length": 786.1953125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7264658188956218, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09486330307135654, "kl": 0.087646484375, "learning_rate": 4.237429282369712e-06, "loss": 0.014, "num_tokens": 1018490826.0, "reward": 2.05615234375, "reward_std": 0.14380665123462677, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 747.1640625, "completions/mean_terminated_length": 745.5029296875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7268072032090126, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2884039074418678, "kl": 0.3419189453125, "learning_rate": 4.227696837577619e-06, "loss": 0.0349, "num_tokens": 1018953758.0, "reward": 2.015625, "reward_std": 0.14459088444709778, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 752.25, "completions/mean_terminated_length": 752.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7271485875224033, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11062779653809109, "kl": 0.09375, "learning_rate": 4.217972585514095e-06, "loss": 0.0039, "num_tokens": 1019423566.0, "reward": 2.12060546875, "reward_std": 0.17173388600349426, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 751.921875, "completions/mean_terminated_length": 751.921875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7274899718357941, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11635625701448118, "kl": 0.0947265625, "learning_rate": 4.20825653998094e-06, "loss": -0.0009, "num_tokens": 1019887062.0, "reward": 2.15625, "reward_std": 0.20984061062335968, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 882.52734375, "completions/mean_terminated_length": 882.52734375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7278313561491849, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08678506176449866, "kl": 0.08984375, "learning_rate": 4.198548714768305e-06, "loss": 0.0059, "num_tokens": 1020424004.0, "reward": 2.03466796875, "reward_std": 0.09172322601079941, "rewards/accuracy_reward/mean": 0.04233871027827263, "rewards/accuracy_reward/std": 0.2015640139579773, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 800.166015625, "completions/mean_terminated_length": 800.166015625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.7281727404625757, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1020311227401331, "kl": 0.0946044921875, "learning_rate": 4.188849123654663e-06, "loss": 0.0145, "num_tokens": 1020913897.0, "reward": 2.0458984375, "reward_std": 0.15262088179588318, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 749.060546875, "completions/mean_terminated_length": 749.060546875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.7285141247759666, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10152002179597214, "kl": 0.0982666015625, "learning_rate": 4.179157780406821e-06, "loss": 0.0022, "num_tokens": 1021398120.0, "reward": 2.08935546875, "reward_std": 0.17088833451271057, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 779.814453125, "completions/mean_terminated_length": 774.8412475585938, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7288555090893574, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11112077160828451, "kl": 0.0902099609375, "learning_rate": 4.16947469877986e-06, "loss": 0.0157, "num_tokens": 1021875353.0, "reward": 2.08056640625, "reward_std": 0.15056924521923065, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1951.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 780.068359375, "completions/mean_terminated_length": 780.068359375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7291968934027482, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08175094022844323, "kl": 0.0880126953125, "learning_rate": 4.159799892517148e-06, "loss": 0.0045, "num_tokens": 1022362764.0, "reward": 2.07080078125, "reward_std": 0.07836808264255524, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 822.833984375, "completions/mean_terminated_length": 822.833984375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.729538277716139, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10426128228490861, "kl": 0.0906982421875, "learning_rate": 4.150133375350299e-06, "loss": 0.0107, "num_tokens": 1022874407.0, "reward": 2.06689453125, "reward_std": 0.1503373682498932, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 778.55859375, "completions/mean_terminated_length": 777.9373779296875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7298796620295297, "frac_reward_zero_std": 0.59375, "grad_norm": 0.29190108355664696, "kl": 0.3553466796875, "learning_rate": 4.140475160999175e-06, "loss": 0.0228, "num_tokens": 1023357061.0, "reward": 2.0830078125, "reward_std": 0.16274067759513855, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.041276250034570694, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 767.83984375, "completions/mean_terminated_length": 767.83984375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.7302210463429205, "frac_reward_zero_std": 0.625, "grad_norm": 0.1067582593809987, "kl": 0.0972900390625, "learning_rate": 4.130825263171833e-06, "loss": 0.0124, "num_tokens": 1023831795.0, "reward": 2.052734375, "reward_std": 0.12696832418441772, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 738.36328125, "completions/mean_terminated_length": 738.36328125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7305624306563113, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09485397086097218, "kl": 0.097412109375, "learning_rate": 4.121183695564541e-06, "loss": 0.012, "num_tokens": 1024290813.0, "reward": 2.11572265625, "reward_std": 0.12013313174247742, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 804.47265625, "completions/mean_terminated_length": 804.47265625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7309038149697021, "frac_reward_zero_std": 0.625, "grad_norm": 0.09325203480056199, "kl": 0.093994140625, "learning_rate": 4.111550471861747e-06, "loss": 0.0147, "num_tokens": 1024787151.0, "reward": 2.02099609375, "reward_std": 0.10836058109998703, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17686305940151215, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 709.5546875, "completions/mean_terminated_length": 709.5546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.731245199283093, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12053965234824982, "kl": 0.100341796875, "learning_rate": 4.101925605736044e-06, "loss": 0.007, "num_tokens": 1025227243.0, "reward": 2.10595703125, "reward_std": 0.19119316339492798, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 748.25390625, "completions/mean_terminated_length": 748.25390625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7315865835964838, "frac_reward_zero_std": 0.46875, "grad_norm": 0.122090064994397, "kl": 0.0941162109375, "learning_rate": 4.092309110848173e-06, "loss": 0.0074, "num_tokens": 1025691501.0, "reward": 2.12548828125, "reward_std": 0.21427951753139496, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 744.955078125, "completions/mean_terminated_length": 744.955078125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7319279679098746, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11413935940689615, "kl": 0.0953369140625, "learning_rate": 4.082701000846988e-06, "loss": 0.0054, "num_tokens": 1026155446.0, "reward": 2.1181640625, "reward_std": 0.17440083622932434, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 752.056640625, "completions/mean_terminated_length": 752.056640625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7322693522232654, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10777680109989582, "kl": 0.09521484375, "learning_rate": 4.0731012893694435e-06, "loss": 0.0079, "num_tokens": 1026622531.0, "reward": 2.06982421875, "reward_std": 0.12234780192375183, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 741.408203125, "completions/mean_terminated_length": 741.408203125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7326107365366561, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11558391244533026, "kl": 0.0970458984375, "learning_rate": 4.0635099900405705e-06, "loss": 0.0155, "num_tokens": 1027088788.0, "reward": 2.20263671875, "reward_std": 0.18016278743743896, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4111155867576599, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 811.146484375, "completions/mean_terminated_length": 810.2133178710938, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7329521208500469, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23091914362476482, "kl": 0.308349609375, "learning_rate": 4.0539271164734685e-06, "loss": 0.029, "num_tokens": 1027604815.0, "reward": 2.02783203125, "reward_std": 0.11392994970083237, "rewards/accuracy_reward/mean": 0.04435483738780022, "rewards/accuracy_reward/std": 0.2060900777578354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 781.947265625, "completions/mean_terminated_length": 781.947265625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7332935051634377, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12508502978276978, "kl": 0.09814453125, "learning_rate": 4.044352682269276e-06, "loss": 0.0099, "num_tokens": 1028098932.0, "reward": 2.09130859375, "reward_std": 0.16936585307121277, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 797.771484375, "completions/mean_terminated_length": 797.771484375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.7336348894768285, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10494621673057285, "kl": 0.084228515625, "learning_rate": 4.034786701017145e-06, "loss": 0.0079, "num_tokens": 1028598383.0, "reward": 2.1005859375, "reward_std": 0.19521963596343994, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 724.1640625, "completions/mean_terminated_length": 724.1640625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7339762737902193, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12361735952287516, "kl": 0.0947265625, "learning_rate": 4.025229186294246e-06, "loss": 0.0209, "num_tokens": 1029047635.0, "reward": 2.12890625, "reward_std": 0.2197571098804474, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 767.201171875, "completions/mean_terminated_length": 767.201171875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7343176581036102, "frac_reward_zero_std": 0.5, "grad_norm": 0.1280688033842171, "kl": 0.095947265625, "learning_rate": 4.0156801516657095e-06, "loss": 0.0151, "num_tokens": 1029518410.0, "reward": 2.11865234375, "reward_std": 0.21322709321975708, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05028042942285538, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 755.734375, "completions/mean_terminated_length": 755.734375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.734659042417001, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11706632711635266, "kl": 0.0938720703125, "learning_rate": 4.006139610684654e-06, "loss": 0.0143, "num_tokens": 1029985650.0, "reward": 2.19287109375, "reward_std": 0.15253356099128723, "rewards/accuracy_reward/mean": 0.205078125, "rewards/accuracy_reward/std": 0.4041535556316376, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 766.1875, "completions/mean_terminated_length": 766.1875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7350004267303918, "frac_reward_zero_std": 0.5, "grad_norm": 0.11259923550107458, "kl": 0.0919189453125, "learning_rate": 3.996607576892127e-06, "loss": 0.0258, "num_tokens": 1030455346.0, "reward": 2.0888671875, "reward_std": 0.18676593899726868, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 822.904296875, "completions/mean_terminated_length": 822.904296875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7353418110437825, "frac_reward_zero_std": 0.59375, "grad_norm": 0.094854361818135, "kl": 0.0875244140625, "learning_rate": 3.987084063817107e-06, "loss": 0.0138, "num_tokens": 1030955825.0, "reward": 2.12060546875, "reward_std": 0.15779346227645874, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 781.544921875, "completions/mean_terminated_length": 781.544921875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7356831953571733, "frac_reward_zero_std": 0.625, "grad_norm": 0.0918471488007245, "kl": 0.0904541015625, "learning_rate": 3.977569084976486e-06, "loss": 0.0186, "num_tokens": 1031449848.0, "reward": 2.1064453125, "reward_std": 0.12476682662963867, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 839.021484375, "completions/mean_terminated_length": 834.2804565429688, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7360245796705641, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10139983977824489, "kl": 0.0877685546875, "learning_rate": 3.968062653875031e-06, "loss": 0.019, "num_tokens": 1031961251.0, "reward": 2.14111328125, "reward_std": 0.15402966737747192, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 801.76171875, "completions/mean_terminated_length": 801.76171875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.7363659639839549, "frac_reward_zero_std": 0.625, "grad_norm": 0.0985148846508664, "kl": 0.0936279296875, "learning_rate": 3.958564784005382e-06, "loss": 0.0098, "num_tokens": 1032447113.0, "reward": 2.04736328125, "reward_std": 0.1312134563922882, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 772.203125, "completions/mean_terminated_length": 772.203125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7367073482973457, "frac_reward_zero_std": 0.625, "grad_norm": 0.09952840636864198, "kl": 0.095947265625, "learning_rate": 3.949075488848026e-06, "loss": 0.0008, "num_tokens": 1032935233.0, "reward": 2.0673828125, "reward_std": 0.14729318022727966, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 771.771484375, "completions/mean_terminated_length": 769.2739868164062, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7370487326107366, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12928946617981896, "kl": 0.0966796875, "learning_rate": 3.939594781871287e-06, "loss": 0.0301, "num_tokens": 1033411164.0, "reward": 2.05126953125, "reward_std": 0.19118362665176392, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 765.142578125, "completions/mean_terminated_length": 765.142578125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7373901169241274, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12440239272297503, "kl": 0.0938720703125, "learning_rate": 3.930122676531287e-06, "loss": 0.0148, "num_tokens": 1033886629.0, "reward": 2.04638671875, "reward_std": 0.1716238260269165, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1566.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 722.03515625, "completions/mean_terminated_length": 722.03515625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7377315012375182, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13014394846514507, "kl": 0.0975341796875, "learning_rate": 3.920659186271953e-06, "loss": 0.0113, "num_tokens": 1034342455.0, "reward": 2.13037109375, "reward_std": 0.1947098821401596, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 720.84765625, "completions/mean_terminated_length": 720.84765625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7380728855509089, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11627744030079623, "kl": 0.10888671875, "learning_rate": 3.911204324524982e-06, "loss": 0.0052, "num_tokens": 1034789145.0, "reward": 2.07861328125, "reward_std": 0.15235085785388947, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 740.68359375, "completions/mean_terminated_length": 740.68359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7384142698642997, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1322037660085033, "kl": 0.096435546875, "learning_rate": 3.9017581047098154e-06, "loss": 0.0182, "num_tokens": 1035257063.0, "reward": 2.03466796875, "reward_std": 0.185065358877182, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 789.01953125, "completions/mean_terminated_length": 789.01953125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7387556541776905, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11062542751496518, "kl": 0.0958251953125, "learning_rate": 3.892320540233636e-06, "loss": 0.0268, "num_tokens": 1035742849.0, "reward": 2.03759765625, "reward_std": 0.14280661940574646, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 808.775390625, "completions/mean_terminated_length": 808.775390625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7390970384910813, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10396344962526156, "kl": 0.09375, "learning_rate": 3.882891644491335e-06, "loss": 0.0099, "num_tokens": 1036230734.0, "reward": 2.02197265625, "reward_std": 0.13112680613994598, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.1939331740140915, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 782.337890625, "completions/mean_terminated_length": 782.337890625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7394384228044721, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08215313930537423, "kl": 0.0908203125, "learning_rate": 3.873471430865515e-06, "loss": 0.012, "num_tokens": 1036718203.0, "reward": 2.0205078125, "reward_std": 0.09198666363954544, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 751.779296875, "completions/mean_terminated_length": 751.779296875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.739779807117863, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12926123958242888, "kl": 0.100341796875, "learning_rate": 3.864059912726438e-06, "loss": 0.0165, "num_tokens": 1037184842.0, "reward": 2.04345703125, "reward_std": 0.1797148585319519, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 739.669921875, "completions/mean_terminated_length": 739.669921875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.7401211914312538, "frac_reward_zero_std": 0.625, "grad_norm": 0.10101175241182508, "kl": 0.0968017578125, "learning_rate": 3.8546571034320356e-06, "loss": 0.0237, "num_tokens": 1037655985.0, "reward": 2.14013671875, "reward_std": 0.11882153153419495, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 791.857421875, "completions/mean_terminated_length": 791.857421875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.7404625757446446, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10008263654225552, "kl": 0.0911865234375, "learning_rate": 3.845263016327884e-06, "loss": 0.0123, "num_tokens": 1038141624.0, "reward": 2.0751953125, "reward_std": 0.18462905287742615, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 775.23046875, "completions/mean_terminated_length": 775.23046875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7408039600580353, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10872516911503742, "kl": 0.0966796875, "learning_rate": 3.835877664747156e-06, "loss": 0.0114, "num_tokens": 1038624510.0, "reward": 2.11669921875, "reward_std": 0.16393932700157166, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 794.755859375, "completions/mean_terminated_length": 794.755859375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7411453443714261, "frac_reward_zero_std": 0.375, "grad_norm": 0.12179012926627003, "kl": 0.09130859375, "learning_rate": 3.8265010620106535e-06, "loss": 0.0202, "num_tokens": 1039112081.0, "reward": 2.07958984375, "reward_std": 0.2534647583961487, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 725.373046875, "completions/mean_terminated_length": 725.373046875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7414867286848169, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11922416935921556, "kl": 0.09912109375, "learning_rate": 3.817133221426742e-06, "loss": 0.0128, "num_tokens": 1039559568.0, "reward": 2.0673828125, "reward_std": 0.19416913390159607, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 755.92578125, "completions/mean_terminated_length": 755.92578125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7418281129982077, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09508265832843887, "kl": 0.0924072265625, "learning_rate": 3.807774156291364e-06, "loss": 0.023, "num_tokens": 1040030010.0, "reward": 2.1015625, "reward_std": 0.13606475293636322, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 808.72265625, "completions/mean_terminated_length": 808.72265625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7421694973115985, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10105016391460209, "kl": 0.0921630859375, "learning_rate": 3.7984238798879936e-06, "loss": 0.0006, "num_tokens": 1040524556.0, "reward": 2.05908203125, "reward_std": 0.14789414405822754, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 814.70703125, "completions/mean_terminated_length": 814.70703125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.7425108816249893, "frac_reward_zero_std": 0.78125, "grad_norm": 0.06910284534697293, "kl": 0.089111328125, "learning_rate": 3.7890824054876452e-06, "loss": 0.0099, "num_tokens": 1041030918.0, "reward": 2.064453125, "reward_std": 0.06519509106874466, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 820.462890625, "completions/mean_terminated_length": 819.377685546875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.7428522659383802, "frac_reward_zero_std": 0.5625, "grad_norm": 1.3306504060794566, "kl": 0.522705078125, "learning_rate": 3.779749746348831e-06, "loss": 0.0346, "num_tokens": 1041536723.0, "reward": 2.02783203125, "reward_std": 0.11297836899757385, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 785.6796875, "completions/mean_terminated_length": 785.6796875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.743193650251771, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12375754945822665, "kl": 0.0977783203125, "learning_rate": 3.7704259157175483e-06, "loss": 0.0105, "num_tokens": 1042033503.0, "reward": 2.01611328125, "reward_std": 0.12908855080604553, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 784.9453125, "completions/mean_terminated_length": 784.9453125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7435350345651617, "frac_reward_zero_std": 0.625, "grad_norm": 0.10231963107192248, "kl": 0.0953369140625, "learning_rate": 3.761110926827277e-06, "loss": 0.0056, "num_tokens": 1042515139.0, "reward": 2.07177734375, "reward_std": 0.1560455560684204, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 763.974609375, "completions/mean_terminated_length": 763.974609375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7438764188785525, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10729111465539161, "kl": 0.0928955078125, "learning_rate": 3.751804792898933e-06, "loss": 0.0177, "num_tokens": 1043012774.0, "reward": 2.04345703125, "reward_std": 0.13344144821166992, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 856.587890625, "completions/mean_terminated_length": 856.587890625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7442178031919433, "frac_reward_zero_std": 0.59375, "grad_norm": 0.08595055379275723, "kl": 0.0870361328125, "learning_rate": 3.7425075271408786e-06, "loss": 0.0171, "num_tokens": 1043528147.0, "reward": 2.05419921875, "reward_std": 0.14301379024982452, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 787.048828125, "completions/mean_terminated_length": 787.048828125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7445591875053341, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11559852003034748, "kl": 0.0955810546875, "learning_rate": 3.7332191427488782e-06, "loss": 0.0146, "num_tokens": 1044031868.0, "reward": 2.0888671875, "reward_std": 0.14679157733917236, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 777.806640625, "completions/mean_terminated_length": 777.806640625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7449005718187249, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12666850241606556, "kl": 0.0980224609375, "learning_rate": 3.723939652906092e-06, "loss": 0.026, "num_tokens": 1044513497.0, "reward": 2.06640625, "reward_std": 0.17883294820785522, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 746.447265625, "completions/mean_terminated_length": 746.447265625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7452419561321157, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09714060492214062, "kl": 0.095947265625, "learning_rate": 3.7146690707830646e-06, "loss": 0.0091, "num_tokens": 1044974174.0, "reward": 2.08447265625, "reward_std": 0.11906066536903381, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 779.833984375, "completions/mean_terminated_length": 779.833984375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7455833404455066, "frac_reward_zero_std": 0.5, "grad_norm": 0.11933947981956856, "kl": 0.0933837890625, "learning_rate": 3.7054074095376845e-06, "loss": 0.0181, "num_tokens": 1045463913.0, "reward": 2.07421875, "reward_std": 0.18009814620018005, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 788.333984375, "completions/mean_terminated_length": 788.333984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7459247247588974, "frac_reward_zero_std": 0.5, "grad_norm": 0.11664102504456764, "kl": 0.09228515625, "learning_rate": 3.696154682315194e-06, "loss": 0.0111, "num_tokens": 1045946116.0, "reward": 2.0498046875, "reward_std": 0.18077170848846436, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 742.443359375, "completions/mean_terminated_length": 742.443359375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7462661090722881, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08372453530014314, "kl": 0.09716796875, "learning_rate": 3.6869109022481385e-06, "loss": 0.0036, "num_tokens": 1046412935.0, "reward": 2.11279296875, "reward_std": 0.11395696550607681, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 751.73046875, "completions/mean_terminated_length": 751.73046875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7466074933856789, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11296233091034341, "kl": 0.0914306640625, "learning_rate": 3.6776760824563816e-06, "loss": 0.003, "num_tokens": 1046880445.0, "reward": 2.18505859375, "reward_std": 0.19911840558052063, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 772.349609375, "completions/mean_terminated_length": 772.349609375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.7469488776990697, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11186173219760757, "kl": 0.09375, "learning_rate": 3.6684502360470565e-06, "loss": 0.0077, "num_tokens": 1047357888.0, "reward": 2.1025390625, "reward_std": 0.16359922289848328, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 744.53515625, "completions/mean_terminated_length": 742.9021606445312, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7472902620124605, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3193445082116644, "kl": 0.149169921875, "learning_rate": 3.6592333761145616e-06, "loss": 0.017, "num_tokens": 1047819586.0, "reward": 2.14404296875, "reward_std": 0.24827003479003906, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 764.162109375, "completions/mean_terminated_length": 764.162109375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7476316463258513, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08899046229919755, "kl": 0.0982666015625, "learning_rate": 3.6500255157405504e-06, "loss": 0.0291, "num_tokens": 1048290533.0, "reward": 2.07958984375, "reward_std": 0.13315358757972717, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 768.142578125, "completions/mean_terminated_length": 768.142578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7479730306392421, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2760437633785342, "kl": 0.096435546875, "learning_rate": 3.640826667993891e-06, "loss": -0.0035, "num_tokens": 1048769678.0, "reward": 2.04248046875, "reward_std": 0.13707776367664337, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 796.037109375, "completions/mean_terminated_length": 796.037109375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.748314414952633, "frac_reward_zero_std": 0.625, "grad_norm": 0.09162224210880952, "kl": 0.0841064453125, "learning_rate": 3.6316368459306705e-06, "loss": 0.0127, "num_tokens": 1049262737.0, "reward": 2.03515625, "reward_std": 0.1320703625679016, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 735.037109375, "completions/mean_terminated_length": 735.037109375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7486557992660238, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11603929118804113, "kl": 0.096923828125, "learning_rate": 3.622456062594154e-06, "loss": 0.0147, "num_tokens": 1049711988.0, "reward": 2.0751953125, "reward_std": 0.15591508150100708, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 753.388671875, "completions/mean_terminated_length": 753.388671875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7489971835794145, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09197195557414309, "kl": 0.098876953125, "learning_rate": 3.6132843310147915e-06, "loss": 0.0121, "num_tokens": 1050180795.0, "reward": 2.0732421875, "reward_std": 0.12002617865800858, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 687.107421875, "completions/mean_terminated_length": 687.107421875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7493385678928053, "frac_reward_zero_std": 0.5, "grad_norm": 0.12675065069914407, "kl": 0.094970703125, "learning_rate": 3.604121664210174e-06, "loss": 0.0374, "num_tokens": 1050621138.0, "reward": 2.1357421875, "reward_std": 0.1749902069568634, "rewards/accuracy_reward/mean": 0.15927419066429138, "rewards/accuracy_reward/std": 0.366301029920578, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 751.802734375, "completions/mean_terminated_length": 751.802734375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7496799522061961, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10770571356681412, "kl": 0.0941162109375, "learning_rate": 3.5949680751850303e-06, "loss": -0.0003, "num_tokens": 1051092861.0, "reward": 2.0283203125, "reward_std": 0.14444352686405182, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 695.03125, "completions/mean_terminated_length": 695.03125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7500213365195869, "frac_reward_zero_std": 0.5, "grad_norm": 0.12512921292376153, "kl": 0.101806640625, "learning_rate": 3.58582357693121e-06, "loss": 0.0179, "num_tokens": 1051535917.0, "reward": 2.14306640625, "reward_std": 0.17279815673828125, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 728.79296875, "completions/mean_terminated_length": 726.2113647460938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7503627208329777, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10140788814978542, "kl": 0.0943603515625, "learning_rate": 3.576688182427649e-06, "loss": 0.0205, "num_tokens": 1051989203.0, "reward": 2.1357421875, "reward_std": 0.16037024557590485, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 680.66796875, "completions/mean_terminated_length": 680.66796875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7507041051463685, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10415024497778352, "kl": 0.0968017578125, "learning_rate": 3.5675619046403775e-06, "loss": 0.0086, "num_tokens": 1052415481.0, "reward": 2.06396484375, "reward_std": 0.12355998158454895, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635460019111633, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 730.87890625, "completions/mean_terminated_length": 730.87890625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.7510454894597594, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1212972427309543, "kl": 0.0919189453125, "learning_rate": 3.5584447565224735e-06, "loss": 0.0164, "num_tokens": 1052867051.0, "reward": 2.12451171875, "reward_std": 0.17783388495445251, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 778.552734375, "completions/mean_terminated_length": 778.552734375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7513868737731502, "frac_reward_zero_std": 0.5, "grad_norm": 0.11486470407460654, "kl": 0.093017578125, "learning_rate": 3.5493367510140574e-06, "loss": 0.0063, "num_tokens": 1053348070.0, "reward": 2.1142578125, "reward_std": 0.20490562915802002, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 761.419921875, "completions/mean_terminated_length": 761.419921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7517282580865409, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10248464766097509, "kl": 0.0921630859375, "learning_rate": 3.540237901042285e-06, "loss": 0.012, "num_tokens": 1053812477.0, "reward": 2.03515625, "reward_std": 0.1529216170310974, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 768.892578125, "completions/mean_terminated_length": 768.892578125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7520696423999317, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11327292936181502, "kl": 0.090576171875, "learning_rate": 3.531148219521301e-06, "loss": 0.0153, "num_tokens": 1054283606.0, "reward": 2.08251953125, "reward_std": 0.16473785042762756, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 811.0859375, "completions/mean_terminated_length": 811.0859375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7524110267133225, "frac_reward_zero_std": 0.625, "grad_norm": 0.08841972032842407, "kl": 0.0911865234375, "learning_rate": 3.5220677193522543e-06, "loss": 0.0094, "num_tokens": 1054786098.0, "reward": 2.11474609375, "reward_std": 0.13928142189979553, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 800.416015625, "completions/mean_terminated_length": 797.9745483398438, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7527524110267133, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10366504444443464, "kl": 0.0914306640625, "learning_rate": 3.5129964134232464e-06, "loss": 0.0187, "num_tokens": 1055275527.0, "reward": 2.03271484375, "reward_std": 0.13645225763320923, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 739.55078125, "completions/mean_terminated_length": 739.005859375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7530937953401041, "frac_reward_zero_std": 0.5, "grad_norm": 0.4350159546840303, "kl": 0.1572265625, "learning_rate": 3.503934314609343e-06, "loss": 0.0336, "num_tokens": 1055738225.0, "reward": 2.10107421875, "reward_std": 0.18025422096252441, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 817.458984375, "completions/mean_terminated_length": 817.458984375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7534351796534949, "frac_reward_zero_std": 0.25, "grad_norm": 0.1405783763237198, "kl": 0.08984375, "learning_rate": 3.4948814357725346e-06, "loss": 0.0134, "num_tokens": 1056240300.0, "reward": 2.10302734375, "reward_std": 0.2677418291568756, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 786.763671875, "completions/mean_terminated_length": 786.763671875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7537765639668857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08563031523577637, "kl": 0.090576171875, "learning_rate": 3.4858377897617214e-06, "loss": 0.0189, "num_tokens": 1056732067.0, "reward": 2.08447265625, "reward_std": 0.13209359347820282, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 741.650390625, "completions/mean_terminated_length": 739.0939331054688, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7541179482802766, "frac_reward_zero_std": 0.5, "grad_norm": 0.12446663077076038, "kl": 0.0943603515625, "learning_rate": 3.476803389412713e-06, "loss": 0.0141, "num_tokens": 1057199424.0, "reward": 2.125, "reward_std": 0.18656659126281738, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 795.109375, "completions/mean_terminated_length": 795.109375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7544593325936674, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12150761815936931, "kl": 0.0916748046875, "learning_rate": 3.4677782475481803e-06, "loss": 0.0222, "num_tokens": 1057695848.0, "reward": 2.0849609375, "reward_std": 0.17935554683208466, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 713.279296875, "completions/mean_terminated_length": 713.279296875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7548007169070581, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09121621771820847, "kl": 0.091552734375, "learning_rate": 3.458762376977669e-06, "loss": 0.0177, "num_tokens": 1058138279.0, "reward": 2.07763671875, "reward_std": 0.10555906593799591, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 698.24609375, "completions/mean_terminated_length": 695.6046752929688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7551421012204489, "frac_reward_zero_std": 0.625, "grad_norm": 0.09314244174524261, "kl": 0.0997314453125, "learning_rate": 3.4497557904975555e-06, "loss": 0.0274, "num_tokens": 1058581125.0, "reward": 2.1005859375, "reward_std": 0.11305820941925049, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 783.052734375, "completions/mean_terminated_length": 783.052734375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7554834855338397, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08241584108686088, "kl": 0.094482421875, "learning_rate": 3.440758500891037e-06, "loss": 0.0161, "num_tokens": 1059061248.0, "reward": 2.0283203125, "reward_std": 0.11221987009048462, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 769.529296875, "completions/mean_terminated_length": 769.529296875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7558248698472305, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11469687122763542, "kl": 0.0989990234375, "learning_rate": 3.4317705209281293e-06, "loss": 0.0103, "num_tokens": 1059539391.0, "reward": 2.10693359375, "reward_std": 0.16126766800880432, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 751.84765625, "completions/mean_terminated_length": 751.84765625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7561662541606213, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10002853875702697, "kl": 0.09130859375, "learning_rate": 3.422791863365618e-06, "loss": 0.0088, "num_tokens": 1059999825.0, "reward": 2.10693359375, "reward_std": 0.1508839726448059, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 740.994140625, "completions/mean_terminated_length": 740.994140625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7565076384740121, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10805142917279817, "kl": 0.0963134765625, "learning_rate": 3.4138225409470704e-06, "loss": 0.0138, "num_tokens": 1060457454.0, "reward": 2.08154296875, "reward_std": 0.16153162717819214, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 733.35546875, "completions/mean_terminated_length": 733.35546875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.756849022787403, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11467789162139697, "kl": 0.09716796875, "learning_rate": 3.4048625664027957e-06, "loss": 0.0106, "num_tokens": 1060910772.0, "reward": 2.1474609375, "reward_std": 0.20898738503456116, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 744.7578125, "completions/mean_terminated_length": 744.7578125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7571904071007938, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10375180586265653, "kl": 0.0860595703125, "learning_rate": 3.3959119524498475e-06, "loss": 0.0128, "num_tokens": 1061369432.0, "reward": 2.1357421875, "reward_std": 0.16940021514892578, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 779.19140625, "completions/mean_terminated_length": 779.19140625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7575317914141845, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11383187114788797, "kl": 0.09814453125, "learning_rate": 3.3869707117919725e-06, "loss": 0.0227, "num_tokens": 1061849274.0, "reward": 2.05908203125, "reward_std": 0.15262478590011597, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 754.720703125, "completions/mean_terminated_length": 754.720703125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7578731757275753, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12700405531991585, "kl": 0.0933837890625, "learning_rate": 3.378038857119632e-06, "loss": 0.0084, "num_tokens": 1062322475.0, "reward": 2.0732421875, "reward_std": 0.2016068696975708, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 785.841796875, "completions/mean_terminated_length": 785.841796875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7582145600409661, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11926075585146514, "kl": 0.09765625, "learning_rate": 3.3691164011099632e-06, "loss": 0.0154, "num_tokens": 1062806922.0, "reward": 2.1533203125, "reward_std": 0.23726814985275269, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 757.0, "completions/mean_terminated_length": 757.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7585559443543569, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13274791800292077, "kl": 0.095947265625, "learning_rate": 3.3602033564267555e-06, "loss": 0.0201, "num_tokens": 1063277450.0, "reward": 2.09521484375, "reward_std": 0.2140478938817978, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 805.556640625, "completions/mean_terminated_length": 805.556640625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7588973286677477, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11128501018317417, "kl": 0.0933837890625, "learning_rate": 3.3512997357204502e-06, "loss": 0.0129, "num_tokens": 1063781543.0, "reward": 2.09326171875, "reward_std": 0.16313199698925018, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 776.255859375, "completions/mean_terminated_length": 776.255859375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7592387129811385, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1285720563069898, "kl": 0.09619140625, "learning_rate": 3.3424055516281074e-06, "loss": 0.023, "num_tokens": 1064255722.0, "reward": 2.06982421875, "reward_std": 0.18461036682128906, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 745.0703125, "completions/mean_terminated_length": 745.0703125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7595800972945294, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12960440974882065, "kl": 0.0941162109375, "learning_rate": 3.3335208167733924e-06, "loss": 0.0174, "num_tokens": 1064716446.0, "reward": 2.05908203125, "reward_std": 0.21801932156085968, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 756.525390625, "completions/mean_terminated_length": 756.525390625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7599214816079202, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09451080959551011, "kl": 0.097412109375, "learning_rate": 3.3246455437665594e-06, "loss": 0.0081, "num_tokens": 1065183163.0, "reward": 2.04052734375, "reward_std": 0.15825429558753967, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05493048578500748, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 756.24609375, "completions/mean_terminated_length": 756.24609375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7602628659213109, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12540611736805524, "kl": 0.0858154296875, "learning_rate": 3.315779745204437e-06, "loss": 0.0209, "num_tokens": 1065660025.0, "reward": 2.13818359375, "reward_std": 0.23173625767230988, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 744.5390625, "completions/mean_terminated_length": 744.5390625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7606042502347017, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10354631367399263, "kl": 0.092041015625, "learning_rate": 3.3069234336704103e-06, "loss": 0.0125, "num_tokens": 1066119629.0, "reward": 2.01416015625, "reward_std": 0.14956532418727875, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 750.87109375, "completions/mean_terminated_length": 750.87109375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7609456345480925, "frac_reward_zero_std": 0.5, "grad_norm": 0.11620870863060513, "kl": 0.09326171875, "learning_rate": 3.2980766217343852e-06, "loss": 0.0043, "num_tokens": 1066585883.0, "reward": 2.09912109375, "reward_std": 0.16045255959033966, "rewards/accuracy_reward/mean": 0.11088709533214569, "rewards/accuracy_reward/std": 0.3143092691898346, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 756.21875, "completions/mean_terminated_length": 753.6907958984375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.7612870188614833, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12642565851489485, "kl": 0.095947265625, "learning_rate": 3.2892393219528007e-06, "loss": 0.0166, "num_tokens": 1067054523.0, "reward": 2.044921875, "reward_std": 0.17819327116012573, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 685.595703125, "completions/mean_terminated_length": 685.595703125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7616284031748741, "frac_reward_zero_std": 0.625, "grad_norm": 0.10355544941844341, "kl": 0.0953369140625, "learning_rate": 3.2804115468685824e-06, "loss": 0.0161, "num_tokens": 1067483820.0, "reward": 2.03173828125, "reward_std": 0.12693026661872864, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 763.728515625, "completions/mean_terminated_length": 763.728515625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7619697874882649, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10695550386082146, "kl": 0.0931396484375, "learning_rate": 3.271593309011145e-06, "loss": 0.0164, "num_tokens": 1067959537.0, "reward": 2.06982421875, "reward_std": 0.15113702416419983, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 715.90625, "completions/mean_terminated_length": 715.90625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7623111718016558, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13223457077999376, "kl": 0.1004638671875, "learning_rate": 3.26278462089636e-06, "loss": 0.02, "num_tokens": 1068422049.0, "reward": 2.1162109375, "reward_std": 0.19845956563949585, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 785.572265625, "completions/mean_terminated_length": 785.572265625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7626525561150466, "frac_reward_zero_std": 0.5, "grad_norm": 0.11816074369153778, "kl": 0.0958251953125, "learning_rate": 3.2539854950265516e-06, "loss": 0.0264, "num_tokens": 1068911750.0, "reward": 2.07275390625, "reward_std": 0.1962774395942688, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 723.40625, "completions/mean_terminated_length": 723.40625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7629939404284373, "frac_reward_zero_std": 0.625, "grad_norm": 0.17140226062866618, "kl": 0.10546875, "learning_rate": 3.2451959438904735e-06, "loss": 0.0076, "num_tokens": 1069366118.0, "reward": 2.04345703125, "reward_std": 0.10466699302196503, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 737.990234375, "completions/mean_terminated_length": 737.990234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7633353247418281, "frac_reward_zero_std": 0.625, "grad_norm": 0.10657023684272275, "kl": 0.0947265625, "learning_rate": 3.236415979963279e-06, "loss": 0.0189, "num_tokens": 1069835281.0, "reward": 2.03369140625, "reward_std": 0.1319688856601715, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 748.828125, "completions/mean_terminated_length": 748.828125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7636767090552189, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13134946922027274, "kl": 0.09521484375, "learning_rate": 3.2276456157065307e-06, "loss": 0.0213, "num_tokens": 1070304889.0, "reward": 2.10791015625, "reward_std": 0.2003340870141983, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 753.3671875, "completions/mean_terminated_length": 753.3671875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7640180933686097, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11779301984664034, "kl": 0.0953369140625, "learning_rate": 3.218884863568145e-06, "loss": 0.0082, "num_tokens": 1070779301.0, "reward": 2.13134765625, "reward_std": 0.20217201113700867, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 787.056640625, "completions/mean_terminated_length": 787.056640625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.7643594776820005, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10850507447302106, "kl": 0.09423828125, "learning_rate": 3.2101337359824147e-06, "loss": 0.0208, "num_tokens": 1071261874.0, "reward": 2.07861328125, "reward_std": 0.18922682106494904, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 731.92578125, "completions/mean_terminated_length": 731.92578125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7647008619953913, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12968225099162412, "kl": 0.095947265625, "learning_rate": 3.20139224536996e-06, "loss": 0.0212, "num_tokens": 1071721996.0, "reward": 2.0869140625, "reward_std": 0.2072605937719345, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 817.328125, "completions/mean_terminated_length": 817.328125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7650422463087821, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12498649863272747, "kl": 0.096435546875, "learning_rate": 3.1926604041377296e-06, "loss": 0.0116, "num_tokens": 1072219140.0, "reward": 2.06103515625, "reward_std": 0.19213730096817017, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.050564687699079514, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 733.61328125, "completions/mean_terminated_length": 733.61328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.765383630622173, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13124997312681427, "kl": 0.0931396484375, "learning_rate": 3.18393822467898e-06, "loss": 0.017, "num_tokens": 1072683230.0, "reward": 2.1123046875, "reward_std": 0.22172397375106812, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 818.333984375, "completions/mean_terminated_length": 817.2211303710938, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7657250149355637, "frac_reward_zero_std": 0.46875, "grad_norm": 0.26583539086435387, "kl": 0.3360595703125, "learning_rate": 3.1752257193732463e-06, "loss": 0.0333, "num_tokens": 1073186393.0, "reward": 2.060546875, "reward_std": 0.20300067961215973, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 766.853515625, "completions/mean_terminated_length": 766.853515625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7660663992489545, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1697214156785074, "kl": 0.10498046875, "learning_rate": 3.166522900586334e-06, "loss": 0.0118, "num_tokens": 1073694302.0, "reward": 2.08935546875, "reward_std": 0.21894437074661255, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.951171875, "rewards/format_reward/std": 0.2157193273305893, "rewards/tag_count_reward/mean": 0.99169921875, "rewards/tag_count_reward/std": 0.06101268157362938, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 777.677734375, "completions/mean_terminated_length": 777.677734375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7664077835623453, "frac_reward_zero_std": 0.625, "grad_norm": 0.09608101737302661, "kl": 0.09765625, "learning_rate": 3.157829780670303e-06, "loss": 0.0185, "num_tokens": 1074178857.0, "reward": 2.0302734375, "reward_std": 0.12407618016004562, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 726.890625, "completions/mean_terminated_length": 723.5177001953125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7667491678757361, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2893309651557271, "kl": 0.137451171875, "learning_rate": 3.1491463719634507e-06, "loss": 0.021, "num_tokens": 1074647969.0, "reward": 2.15966796875, "reward_std": 0.1899544596672058, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 738.384765625, "completions/mean_terminated_length": 738.384765625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7670905521891269, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13859208873707962, "kl": 0.09423828125, "learning_rate": 3.1404726867902814e-06, "loss": 0.0062, "num_tokens": 1075109238.0, "reward": 2.015625, "reward_std": 0.16619431972503662, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 689.408203125, "completions/mean_terminated_length": 689.408203125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7674319365025177, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12727395671081185, "kl": 0.0970458984375, "learning_rate": 3.1318087374615094e-06, "loss": 0.0149, "num_tokens": 1075551639.0, "reward": 2.1220703125, "reward_std": 0.19835036993026733, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 744.267578125, "completions/mean_terminated_length": 744.267578125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7677733208159085, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11730864785690415, "kl": 0.095458984375, "learning_rate": 3.123154536274031e-06, "loss": 0.019, "num_tokens": 1076015776.0, "reward": 2.09912109375, "reward_std": 0.20016926527023315, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 749.49609375, "completions/mean_terminated_length": 749.49609375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7681147051292994, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12468642378344633, "kl": 0.0963134765625, "learning_rate": 3.11451009551089e-06, "loss": 0.014, "num_tokens": 1076481646.0, "reward": 2.09912109375, "reward_std": 0.20076537132263184, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 726.513671875, "completions/mean_terminated_length": 726.513671875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7684560894426901, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10256757454447575, "kl": 0.0982666015625, "learning_rate": 3.105875427441297e-06, "loss": 0.0142, "num_tokens": 1076936405.0, "reward": 2.04736328125, "reward_std": 0.11617570370435715, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 760.0625, "completions/mean_terminated_length": 760.0625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7687974737560809, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09224946604477648, "kl": 0.0975341796875, "learning_rate": 3.09725054432058e-06, "loss": 0.009, "num_tokens": 1077411637.0, "reward": 2.04736328125, "reward_std": 0.11644909530878067, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 715.849609375, "completions/mean_terminated_length": 715.849609375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7691388580694717, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09531759650554378, "kl": 0.0946044921875, "learning_rate": 3.088635458390189e-06, "loss": 0.0088, "num_tokens": 1077872424.0, "reward": 2.04052734375, "reward_std": 0.11063507199287415, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 745.1640625, "completions/mean_terminated_length": 745.1640625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7694802423828625, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1017456471570565, "kl": 0.0933837890625, "learning_rate": 3.0800301818776556e-06, "loss": 0.0106, "num_tokens": 1078337372.0, "reward": 2.09326171875, "reward_std": 0.16557495296001434, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 695.87890625, "completions/mean_terminated_length": 695.87890625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7698216266962533, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12412826813430801, "kl": 0.10791015625, "learning_rate": 3.0714347269966015e-06, "loss": 0.0051, "num_tokens": 1078767518.0, "reward": 2.1376953125, "reward_std": 0.1543101966381073, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 769.591796875, "completions/mean_terminated_length": 769.591796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7701630110096441, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1091604057853574, "kl": 0.09130859375, "learning_rate": 3.0628491059467015e-06, "loss": 0.0169, "num_tokens": 1079248061.0, "reward": 2.033203125, "reward_std": 0.16222605109214783, "rewards/accuracy_reward/mean": 0.05443548411130905, "rewards/accuracy_reward/std": 0.227104052901268, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 755.08984375, "completions/mean_terminated_length": 755.08984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7705043953230349, "frac_reward_zero_std": 0.5, "grad_norm": 0.13478146167610566, "kl": 0.102294921875, "learning_rate": 3.0542733309136696e-06, "loss": 0.0145, "num_tokens": 1079715819.0, "reward": 2.0751953125, "reward_std": 0.1444714516401291, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 768.712890625, "completions/mean_terminated_length": 767.9510498046875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7708457796364258, "frac_reward_zero_std": 0.59375, "grad_norm": 1.5906855264189463, "kl": 0.638671875, "learning_rate": 3.0457074140692567e-06, "loss": 0.0332, "num_tokens": 1080189160.0, "reward": 2.06787109375, "reward_std": 0.14459572732448578, "rewards/accuracy_reward/mean": 0.08266129344701767, "rewards/accuracy_reward/std": 0.2756475806236267, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 820.865234375, "completions/mean_terminated_length": 818.4638061523438, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7711871639498165, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11180214789330602, "kl": 0.092041015625, "learning_rate": 3.0371513675712096e-06, "loss": 0.0167, "num_tokens": 1080683427.0, "reward": 2.07666015625, "reward_std": 0.19447466731071472, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 745.455078125, "completions/mean_terminated_length": 744.5029296875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7715285482632073, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1482982627077477, "kl": 0.266357421875, "learning_rate": 3.028605203563276e-06, "loss": 0.0223, "num_tokens": 1081145564.0, "reward": 2.09912109375, "reward_std": 0.1974930763244629, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 769.359375, "completions/mean_terminated_length": 769.359375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7718699325765981, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11997155547649048, "kl": 0.0968017578125, "learning_rate": 3.020068934175171e-06, "loss": 0.0177, "num_tokens": 1081613268.0, "reward": 2.05908203125, "reward_std": 0.18324607610702515, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 816.32421875, "completions/mean_terminated_length": 816.32421875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7722113168899889, "frac_reward_zero_std": 0.34375, "grad_norm": 0.11425173479448099, "kl": 0.0928955078125, "learning_rate": 3.0115425715225633e-06, "loss": 0.0297, "num_tokens": 1082108426.0, "reward": 2.0078125, "reward_std": 0.21048590540885925, "rewards/accuracy_reward/mean": 0.04233871027827263, "rewards/accuracy_reward/std": 0.2015640139579773, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 850.943359375, "completions/mean_terminated_length": 850.943359375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.7725527012033797, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10276896489921941, "kl": 0.0880126953125, "learning_rate": 3.0030261277070725e-06, "loss": 0.0078, "num_tokens": 1082626477.0, "reward": 2.044921875, "reward_std": 0.1478443741798401, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 825.619140625, "completions/mean_terminated_length": 825.619140625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7728940855167705, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14239771569644277, "kl": 0.094482421875, "learning_rate": 2.994519614816227e-06, "loss": 0.0173, "num_tokens": 1083128874.0, "reward": 2.0732421875, "reward_std": 0.19900798797607422, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 753.333984375, "completions/mean_terminated_length": 753.333984375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7732354698301613, "frac_reward_zero_std": 0.625, "grad_norm": 0.11023713143695678, "kl": 0.0938720703125, "learning_rate": 2.9860230449234705e-06, "loss": 0.0258, "num_tokens": 1083590517.0, "reward": 2.1328125, "reward_std": 0.1287638396024704, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 833.8515625, "completions/mean_terminated_length": 833.8515625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7735768541435522, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10599285838627223, "kl": 0.0887451171875, "learning_rate": 2.977536430088125e-06, "loss": 0.0139, "num_tokens": 1084095801.0, "reward": 2.13916015625, "reward_std": 0.18773382902145386, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 833.796875, "completions/mean_terminated_length": 833.796875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7739182384569429, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1070636383447851, "kl": 0.0909423828125, "learning_rate": 2.9690597823553935e-06, "loss": 0.0055, "num_tokens": 1084605825.0, "reward": 2.07958984375, "reward_std": 0.14710815250873566, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 816.087890625, "completions/mean_terminated_length": 814.7847290039062, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7742596227703337, "frac_reward_zero_std": 0.5625, "grad_norm": 0.45869839827173203, "kl": 0.1729736328125, "learning_rate": 2.960593113756325e-06, "loss": 0.0219, "num_tokens": 1085104926.0, "reward": 2.1240234375, "reward_std": 0.15268336236476898, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1974.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 760.67578125, "completions/mean_terminated_length": 760.67578125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7746010070837245, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09363425216720211, "kl": 0.09619140625, "learning_rate": 2.952136436307803e-06, "loss": 0.0117, "num_tokens": 1085576552.0, "reward": 2.1142578125, "reward_std": 0.11853906512260437, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 795.791015625, "completions/mean_terminated_length": 793.3405151367188, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7749423913971153, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11320132794053935, "kl": 0.0908203125, "learning_rate": 2.9436897620125414e-06, "loss": 0.0255, "num_tokens": 1086063613.0, "reward": 2.1328125, "reward_std": 0.20537084341049194, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 835.623046875, "completions/mean_terminated_length": 835.623046875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7752837757105061, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10092623827056628, "kl": 0.086181640625, "learning_rate": 2.9352531028590426e-06, "loss": 0.0188, "num_tokens": 1086572924.0, "reward": 2.0966796875, "reward_std": 0.18982921540737152, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 757.3515625, "completions/mean_terminated_length": 757.3515625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7756251600238969, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12509950284982332, "kl": 0.090087890625, "learning_rate": 2.9268264708216065e-06, "loss": -0.0006, "num_tokens": 1087040624.0, "reward": 2.0556640625, "reward_std": 0.18428167700767517, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 836.55859375, "completions/mean_terminated_length": 834.1878662109375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7759665443372877, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1077495443719284, "kl": 0.087646484375, "learning_rate": 2.918409877860292e-06, "loss": 0.0196, "num_tokens": 1087546878.0, "reward": 2.07373046875, "reward_std": 0.1758636236190796, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 839.990234375, "completions/mean_terminated_length": 839.990234375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.7763079286506785, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09296546567733555, "kl": 0.0899658203125, "learning_rate": 2.910003335920918e-06, "loss": 0.0161, "num_tokens": 1088061929.0, "reward": 2.05810546875, "reward_std": 0.12649685144424438, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 866.154296875, "completions/mean_terminated_length": 861.5196533203125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7766493129640692, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09753350139516748, "kl": 0.082275390625, "learning_rate": 2.9016068569350297e-06, "loss": 0.0256, "num_tokens": 1088585400.0, "reward": 1.9970703125, "reward_std": 0.12034836411476135, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.1385180652141571, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 752.462890625, "completions/mean_terminated_length": 752.462890625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7769906972774601, "frac_reward_zero_std": 0.5, "grad_norm": 0.1253924059475935, "kl": 0.0982666015625, "learning_rate": 2.8932204528198925e-06, "loss": 0.0218, "num_tokens": 1089050165.0, "reward": 2.07080078125, "reward_std": 0.19340604543685913, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 853.796875, "completions/mean_terminated_length": 853.796875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7773320815908509, "frac_reward_zero_std": 0.625, "grad_norm": 0.08871768852072635, "kl": 0.087646484375, "learning_rate": 2.8848441354784775e-06, "loss": 0.008, "num_tokens": 1089563773.0, "reward": 2.08984375, "reward_std": 0.13288718461990356, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 814.591796875, "completions/mean_terminated_length": 814.591796875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7776734659042417, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10365857309678896, "kl": 0.08544921875, "learning_rate": 2.8764779167994283e-06, "loss": -0.0004, "num_tokens": 1090064524.0, "reward": 2.0341796875, "reward_std": 0.14101877808570862, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 789.091796875, "completions/mean_terminated_length": 789.091796875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7780148502176325, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1165878984795139, "kl": 0.089111328125, "learning_rate": 2.8681218086570706e-06, "loss": 0.0155, "num_tokens": 1090552011.0, "reward": 2.080078125, "reward_std": 0.16898518800735474, "rewards/accuracy_reward/mean": 0.1041666641831398, "rewards/accuracy_reward/std": 0.3057953417301178, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 766.017578125, "completions/mean_terminated_length": 766.017578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7783562345310233, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13385264901117103, "kl": 0.0933837890625, "learning_rate": 2.859775822911366e-06, "loss": 0.0118, "num_tokens": 1091030724.0, "reward": 2.1083984375, "reward_std": 0.17876088619232178, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 789.923828125, "completions/mean_terminated_length": 789.923828125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7786976188444141, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09864780493349064, "kl": 0.0849609375, "learning_rate": 2.8514399714079135e-06, "loss": 0.0157, "num_tokens": 1091512541.0, "reward": 2.0859375, "reward_std": 0.16125009953975677, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 809.99609375, "completions/mean_terminated_length": 809.99609375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7790390031578049, "frac_reward_zero_std": 0.625, "grad_norm": 0.09814362685752173, "kl": 0.086669921875, "learning_rate": 2.843114265977934e-06, "loss": 0.023, "num_tokens": 1092015163.0, "reward": 2.08544921875, "reward_std": 0.13892129063606262, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 831.345703125, "completions/mean_terminated_length": 829.9412841796875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7793803874711956, "frac_reward_zero_std": 0.625, "grad_norm": 0.2769682847227638, "kl": 0.3509521484375, "learning_rate": 2.8347987184382398e-06, "loss": 0.0225, "num_tokens": 1092525004.0, "reward": 2.05126953125, "reward_std": 0.10653991997241974, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 767.41796875, "completions/mean_terminated_length": 767.41796875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7797217717845865, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1281835009776641, "kl": 0.0889892578125, "learning_rate": 2.8264933405912344e-06, "loss": 0.0196, "num_tokens": 1092998098.0, "reward": 2.095703125, "reward_std": 0.2063148319721222, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 784.005859375, "completions/mean_terminated_length": 784.005859375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7800631560979773, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08487391397378274, "kl": 0.0906982421875, "learning_rate": 2.8181981442248762e-06, "loss": 0.0104, "num_tokens": 1093483957.0, "reward": 2.10546875, "reward_std": 0.14630156755447388, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 851.005859375, "completions/mean_terminated_length": 851.005859375, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.7804045404113681, "frac_reward_zero_std": 0.5, "grad_norm": 0.10115543371278114, "kl": 0.0806884765625, "learning_rate": 2.809913141112687e-06, "loss": 0.019, "num_tokens": 1093999368.0, "reward": 2.06591796875, "reward_std": 0.17397037148475647, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 748.1015625, "completions/mean_terminated_length": 748.1015625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7807459247247589, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1279584327679576, "kl": 0.0927734375, "learning_rate": 2.8016383430137074e-06, "loss": 0.0131, "num_tokens": 1094462172.0, "reward": 2.07373046875, "reward_std": 0.19745755195617676, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 793.212890625, "completions/mean_terminated_length": 793.212890625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.7810873090381497, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1109025547989082, "kl": 0.0921630859375, "learning_rate": 2.7933737616725e-06, "loss": 0.0079, "num_tokens": 1094947433.0, "reward": 2.056640625, "reward_std": 0.16577306389808655, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 743.353515625, "completions/mean_terminated_length": 743.353515625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7814286933515405, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10832754575904, "kl": 0.0953369140625, "learning_rate": 2.7851194088191303e-06, "loss": 0.0129, "num_tokens": 1095417326.0, "reward": 2.05810546875, "reward_std": 0.1304713934659958, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 779.708984375, "completions/mean_terminated_length": 779.708984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7817700776649313, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10458502023971321, "kl": 0.088623046875, "learning_rate": 2.776875296169137e-06, "loss": 0.0207, "num_tokens": 1095903465.0, "reward": 2.06396484375, "reward_std": 0.14722105860710144, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1624.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 728.5078125, "completions/mean_terminated_length": 728.5078125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.782111461978322, "frac_reward_zero_std": 0.5, "grad_norm": 0.11166147432314691, "kl": 0.1002197265625, "learning_rate": 2.7686414354235356e-06, "loss": 0.0142, "num_tokens": 1096352221.0, "reward": 2.1376953125, "reward_std": 0.18242405354976654, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 739.193359375, "completions/mean_terminated_length": 739.193359375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7824528462917129, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07110170868060864, "kl": 0.09228515625, "learning_rate": 2.760417838268784e-06, "loss": 0.0071, "num_tokens": 1096817712.0, "reward": 2.04052734375, "reward_std": 0.05934926122426987, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 742.294921875, "completions/mean_terminated_length": 742.294921875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7827942306051037, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10833295264191586, "kl": 0.08642578125, "learning_rate": 2.7522045163767695e-06, "loss": 0.0225, "num_tokens": 1097283911.0, "reward": 2.12353515625, "reward_std": 0.16538062691688538, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 731.810546875, "completions/mean_terminated_length": 731.810546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7831356149184945, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1142107819409254, "kl": 0.099365234375, "learning_rate": 2.7440014814048077e-06, "loss": 0.0261, "num_tokens": 1097733590.0, "reward": 2.0615234375, "reward_std": 0.13161487877368927, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 772.533203125, "completions/mean_terminated_length": 772.533203125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7834769992318853, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10802076816361773, "kl": 0.08740234375, "learning_rate": 2.7358087449955996e-06, "loss": 0.0008, "num_tokens": 1098211767.0, "reward": 2.068359375, "reward_std": 0.1645216941833496, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 744.3203125, "completions/mean_terminated_length": 744.3203125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7838183835452761, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1126908525671743, "kl": 0.0946044921875, "learning_rate": 2.7276263187772424e-06, "loss": 0.008, "num_tokens": 1098676891.0, "reward": 2.14794921875, "reward_std": 0.14454048871994019, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 743.095703125, "completions/mean_terminated_length": 743.095703125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7841597678586669, "frac_reward_zero_std": 0.59375, "grad_norm": 0.113700584575809, "kl": 0.0875244140625, "learning_rate": 2.7194542143631875e-06, "loss": 0.0173, "num_tokens": 1099141996.0, "reward": 2.1826171875, "reward_std": 0.152418315410614, "rewards/accuracy_reward/mean": 0.193359375, "rewards/accuracy_reward/std": 0.39531853795051575, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 762.8984375, "completions/mean_terminated_length": 762.8984375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7845011521720577, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12060801551028487, "kl": 0.085205078125, "learning_rate": 2.7112924433522514e-06, "loss": 0.009, "num_tokens": 1099624728.0, "reward": 2.134765625, "reward_std": 0.22227707505226135, "rewards/accuracy_reward/mean": 0.15927419066429138, "rewards/accuracy_reward/std": 0.3663010001182556, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 796.71484375, "completions/mean_terminated_length": 796.71484375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7848425364854484, "frac_reward_zero_std": 0.5, "grad_norm": 0.11183099695485031, "kl": 0.0902099609375, "learning_rate": 2.703141017328562e-06, "loss": 0.0105, "num_tokens": 1100115174.0, "reward": 2.1025390625, "reward_std": 0.164056658744812, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 799.1328125, "completions/mean_terminated_length": 799.1328125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7851839207988393, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09157848118733591, "kl": 0.09130859375, "learning_rate": 2.6949999478615853e-06, "loss": 0.0221, "num_tokens": 1100610138.0, "reward": 2.13134765625, "reward_std": 0.12845170497894287, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 761.115234375, "completions/mean_terminated_length": 761.115234375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7855253051122301, "frac_reward_zero_std": 0.375, "grad_norm": 0.1326049873149344, "kl": 0.091796875, "learning_rate": 2.6868692465060832e-06, "loss": 0.0123, "num_tokens": 1101086741.0, "reward": 2.068359375, "reward_std": 0.20508578419685364, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 788.87890625, "completions/mean_terminated_length": 788.87890625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7858666894256209, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11556993613530793, "kl": 0.0865478515625, "learning_rate": 2.6787489248020927e-06, "loss": 0.0171, "num_tokens": 1101578903.0, "reward": 2.10107421875, "reward_std": 0.16980698704719543, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 773.974609375, "completions/mean_terminated_length": 773.974609375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7862080737390117, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10384795724632528, "kl": 0.0902099609375, "learning_rate": 2.6706389942749313e-06, "loss": 0.0106, "num_tokens": 1102057770.0, "reward": 2.14990234375, "reward_std": 0.179182767868042, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 829.546875, "completions/mean_terminated_length": 829.546875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7865494580524025, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11199051565869571, "kl": 0.0850830078125, "learning_rate": 2.6625394664351557e-06, "loss": 0.0041, "num_tokens": 1102562738.0, "reward": 2.08349609375, "reward_std": 0.18289360404014587, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 705.662109375, "completions/mean_terminated_length": 705.662109375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.7868908423657933, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12247409168645083, "kl": 0.102294921875, "learning_rate": 2.6544503527785737e-06, "loss": 0.0194, "num_tokens": 1103003845.0, "reward": 2.12451171875, "reward_std": 0.1715179681777954, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.3464137017726898, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 800.90625, "completions/mean_terminated_length": 796.0157470703125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7872322266791841, "frac_reward_zero_std": 0.5, "grad_norm": 0.11353340386116376, "kl": 0.082763671875, "learning_rate": 2.6463716647861905e-06, "loss": 0.0154, "num_tokens": 1103496677.0, "reward": 2.06201171875, "reward_std": 0.17662325501441956, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 805.5234375, "completions/mean_terminated_length": 803.0919799804688, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.787573610992575, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09328668503097819, "kl": 0.0859375, "learning_rate": 2.6383034139242313e-06, "loss": 0.0291, "num_tokens": 1103992657.0, "reward": 2.07470703125, "reward_std": 0.16578909754753113, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 825.208984375, "completions/mean_terminated_length": 821.8451538085938, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7879149953059656, "frac_reward_zero_std": 0.40625, "grad_norm": 1.2373784505406258, "kl": 0.55712890625, "learning_rate": 2.6302456116441078e-06, "loss": 0.0564, "num_tokens": 1104499404.0, "reward": 2.06640625, "reward_std": 0.21445810794830322, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 769.634765625, "completions/mean_terminated_length": 769.634765625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7882563796193565, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12438249164464125, "kl": 0.0933837890625, "learning_rate": 2.6221982693823876e-06, "loss": 0.0101, "num_tokens": 1104974929.0, "reward": 2.083984375, "reward_std": 0.18250596523284912, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 779.923828125, "completions/mean_terminated_length": 779.923828125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7885977639327473, "frac_reward_zero_std": 0.46875, "grad_norm": 0.13152777180575057, "kl": 0.0914306640625, "learning_rate": 2.6141613985608093e-06, "loss": 0.0247, "num_tokens": 1105454474.0, "reward": 2.08935546875, "reward_std": 0.18824262917041779, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 798.427734375, "completions/mean_terminated_length": 798.427734375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7889391482461381, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10949997384699305, "kl": 0.088623046875, "learning_rate": 2.6061350105862384e-06, "loss": 0.005, "num_tokens": 1105945253.0, "reward": 2.16845703125, "reward_std": 0.1629614233970642, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 767.3515625, "completions/mean_terminated_length": 767.3515625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7892805325595289, "frac_reward_zero_std": 0.625, "grad_norm": 0.10896694898427914, "kl": 0.0892333984375, "learning_rate": 2.5981191168506625e-06, "loss": 0.0008, "num_tokens": 1106426425.0, "reward": 2.08056640625, "reward_std": 0.13285666704177856, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 779.14453125, "completions/mean_terminated_length": 779.14453125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7896219168729197, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12303427591291162, "kl": 0.086181640625, "learning_rate": 2.590113728731175e-06, "loss": 0.0132, "num_tokens": 1106912371.0, "reward": 2.048828125, "reward_std": 0.15632271766662598, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 769.078125, "completions/mean_terminated_length": 764.0628051757812, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7899633011863105, "frac_reward_zero_std": 0.65625, "grad_norm": 0.21486557059430716, "kl": 0.1064453125, "learning_rate": 2.5821188575899626e-06, "loss": 0.0137, "num_tokens": 1107387499.0, "reward": 2.14111328125, "reward_std": 0.15172332525253296, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.052765581756830215, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 778.537109375, "completions/mean_terminated_length": 774.1490478515625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7903046854997013, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19406292590912635, "kl": 0.315673828125, "learning_rate": 2.5741345147742856e-06, "loss": 0.0311, "num_tokens": 1107862302.0, "reward": 2.076171875, "reward_std": 0.14316029846668243, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.0695069283246994, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 899.47265625, "completions/mean_terminated_length": 899.47265625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.790646069813092, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09218007923694603, "kl": 0.080322265625, "learning_rate": 2.566160711616453e-06, "loss": 0.0012, "num_tokens": 1108406080.0, "reward": 2.083984375, "reward_std": 0.12077422440052032, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 774.95703125, "completions/mean_terminated_length": 774.95703125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7909874541264829, "frac_reward_zero_std": 0.375, "grad_norm": 0.12479198454673832, "kl": 0.0897216796875, "learning_rate": 2.5581974594338275e-06, "loss": 0.0294, "num_tokens": 1108883242.0, "reward": 2.14599609375, "reward_std": 0.22905975580215454, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 754.751953125, "completions/mean_terminated_length": 752.2211303710938, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7913288384398737, "frac_reward_zero_std": 0.46875, "grad_norm": 0.122873141925088, "kl": 0.0966796875, "learning_rate": 2.550244769528779e-06, "loss": 0.0291, "num_tokens": 1109356219.0, "reward": 2.1376953125, "reward_std": 0.19050170481204987, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 843.669921875, "completions/mean_terminated_length": 843.669921875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7916702227532645, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10880358540909654, "kl": 0.0902099609375, "learning_rate": 2.542302653188704e-06, "loss": 0.0236, "num_tokens": 1109874898.0, "reward": 2.0537109375, "reward_std": 0.17859822511672974, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 805.7890625, "completions/mean_terminated_length": 804.7005615234375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7920116070666553, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3413185132156353, "kl": 0.1854248046875, "learning_rate": 2.534371121685979e-06, "loss": 0.0248, "num_tokens": 1110358262.0, "reward": 2.10107421875, "reward_std": 0.16367079317569733, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 818.447265625, "completions/mean_terminated_length": 818.447265625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7923529913800461, "frac_reward_zero_std": 0.5, "grad_norm": 0.10749237412607585, "kl": 0.0811767578125, "learning_rate": 2.5264501862779668e-06, "loss": 0.0184, "num_tokens": 1110860715.0, "reward": 2.099609375, "reward_std": 0.17699377238750458, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 809.78515625, "completions/mean_terminated_length": 809.78515625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7926943756934369, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09654649393714926, "kl": 0.0904541015625, "learning_rate": 2.518539858206981e-06, "loss": 0.0121, "num_tokens": 1111357245.0, "reward": 2.0595703125, "reward_std": 0.12083718925714493, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 761.4921875, "completions/mean_terminated_length": 761.4921875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7930357600068277, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12352377166705748, "kl": 0.0931396484375, "learning_rate": 2.510640148700292e-06, "loss": 0.0079, "num_tokens": 1111823577.0, "reward": 2.0888671875, "reward_std": 0.1667931079864502, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 778.900390625, "completions/mean_terminated_length": 776.4168090820312, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7933771443202184, "frac_reward_zero_std": 0.625, "grad_norm": 0.0975214369882003, "kl": 0.0927734375, "learning_rate": 2.5027510689700894e-06, "loss": 0.0116, "num_tokens": 1112306326.0, "reward": 2.107421875, "reward_std": 0.13128653168678284, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 754.65234375, "completions/mean_terminated_length": 754.65234375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7937185286336093, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11427773417174623, "kl": 0.091064453125, "learning_rate": 2.494872630213476e-06, "loss": -0.0013, "num_tokens": 1112773556.0, "reward": 2.03955078125, "reward_std": 0.13331636786460876, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 774.505859375, "completions/mean_terminated_length": 772.013671875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7940599129470001, "frac_reward_zero_std": 0.625, "grad_norm": 0.09501915580460249, "kl": 0.0911865234375, "learning_rate": 2.4870048436124594e-06, "loss": 0.0209, "num_tokens": 1113253063.0, "reward": 2.1328125, "reward_std": 0.12629631161689758, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 766.0546875, "completions/mean_terminated_length": 763.5459594726562, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7944012972603909, "frac_reward_zero_std": 0.53125, "grad_norm": 0.09924602278794636, "kl": 0.083740234375, "learning_rate": 2.47914772033392e-06, "loss": 0.0299, "num_tokens": 1113724899.0, "reward": 2.197265625, "reward_std": 0.16199323534965515, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 772.5390625, "completions/mean_terminated_length": 772.5390625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7947426815737817, "frac_reward_zero_std": 0.5, "grad_norm": 0.11441459363531237, "kl": 0.087158203125, "learning_rate": 2.4713012715296116e-06, "loss": 0.006, "num_tokens": 1114212135.0, "reward": 2.1123046875, "reward_std": 0.18034686148166656, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 768.609375, "completions/mean_terminated_length": 768.609375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7950840658871725, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13024216141031922, "kl": 0.093505859375, "learning_rate": 2.4634655083361294e-06, "loss": 0.0252, "num_tokens": 1114693231.0, "reward": 2.1123046875, "reward_std": 0.20452260971069336, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 755.94140625, "completions/mean_terminated_length": 754.2720336914062, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7954254502005633, "frac_reward_zero_std": 0.375, "grad_norm": 0.369564038289317, "kl": 0.1168212890625, "learning_rate": 2.455640441874905e-06, "loss": 0.02, "num_tokens": 1115152737.0, "reward": 2.1689453125, "reward_std": 0.22425949573516846, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 754.216796875, "completions/mean_terminated_length": 754.216796875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7957668345139541, "frac_reward_zero_std": 0.625, "grad_norm": 0.09935754832251847, "kl": 0.0992431640625, "learning_rate": 2.447826083252194e-06, "loss": 0.0086, "num_tokens": 1115623760.0, "reward": 2.04296875, "reward_std": 0.15573687851428986, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 812.052734375, "completions/mean_terminated_length": 812.052734375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7961082188273448, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11376977851431459, "kl": 0.08935546875, "learning_rate": 2.4400224435590436e-06, "loss": 0.0142, "num_tokens": 1116134411.0, "reward": 2.0087890625, "reward_std": 0.14048263430595398, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 771.693359375, "completions/mean_terminated_length": 771.693359375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7964496031407357, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10155942275900176, "kl": 0.08984375, "learning_rate": 2.4322295338712975e-06, "loss": 0.0055, "num_tokens": 1116618526.0, "reward": 2.04443359375, "reward_std": 0.13394060730934143, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 805.53515625, "completions/mean_terminated_length": 805.53515625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7967909874541265, "frac_reward_zero_std": 0.53125, "grad_norm": 0.09709898124160579, "kl": 0.0870361328125, "learning_rate": 2.4244473652495624e-06, "loss": -0.0081, "num_tokens": 1117112656.0, "reward": 2.02734375, "reward_std": 0.16678684949874878, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 750.0546875, "completions/mean_terminated_length": 749.0567626953125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7971323717675173, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2106953132027507, "kl": 0.1268310546875, "learning_rate": 2.4166759487392066e-06, "loss": 0.0094, "num_tokens": 1117574236.0, "reward": 2.08984375, "reward_std": 0.19684529304504395, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05386113002896309, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 762.45703125, "completions/mean_terminated_length": 759.9412841796875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7974737560809081, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12137525211302838, "kl": 0.094482421875, "learning_rate": 2.408915295370333e-06, "loss": 0.0097, "num_tokens": 1118049798.0, "reward": 2.033203125, "reward_std": 0.16661611199378967, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 767.294921875, "completions/mean_terminated_length": 767.294921875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7978151403942989, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12196757862499816, "kl": 0.0892333984375, "learning_rate": 2.401165416157767e-06, "loss": -0.004, "num_tokens": 1118523821.0, "reward": 2.0888671875, "reward_std": 0.1870439648628235, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 724.0546875, "completions/mean_terminated_length": 724.0546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7981565247076897, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1110029904331091, "kl": 0.0885009765625, "learning_rate": 2.393426322101049e-06, "loss": 0.0164, "num_tokens": 1118977065.0, "reward": 2.15185546875, "reward_std": 0.1684597134590149, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 779.89453125, "completions/mean_terminated_length": 779.89453125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.7984979090210805, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12148818704820219, "kl": 0.094970703125, "learning_rate": 2.385698024184403e-06, "loss": 0.0152, "num_tokens": 1119461987.0, "reward": 2.07275390625, "reward_std": 0.16809310019016266, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 745.71875, "completions/mean_terminated_length": 745.71875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7988392933344712, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12985684211648094, "kl": 0.0897216796875, "learning_rate": 2.3779805333767403e-06, "loss": 0.0039, "num_tokens": 1119927667.0, "reward": 2.1796875, "reward_std": 0.23868808150291443, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3968288004398346, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1916.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 756.25390625, "completions/mean_terminated_length": 756.25390625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.799180677647862, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09862337112437256, "kl": 0.0904541015625, "learning_rate": 2.3702738606316202e-06, "loss": 0.0123, "num_tokens": 1120391045.0, "reward": 2.05029296875, "reward_std": 0.11554190516471863, "rewards/accuracy_reward/mean": 0.06854838877916336, "rewards/accuracy_reward/std": 0.25293973088264465, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 752.16796875, "completions/mean_terminated_length": 752.16796875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7995220619612529, "frac_reward_zero_std": 0.625, "grad_norm": 0.09433307152924947, "kl": 0.08544921875, "learning_rate": 2.3625780168872614e-06, "loss": 0.022, "num_tokens": 1120862571.0, "reward": 2.06640625, "reward_std": 0.1508246660232544, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 802.453125, "completions/mean_terminated_length": 802.453125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7998634462746437, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10915576347128296, "kl": 0.088623046875, "learning_rate": 2.354893013066505e-06, "loss": 0.0189, "num_tokens": 1121359395.0, "reward": 2.06005859375, "reward_std": 0.1601342260837555, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 731.541015625, "completions/mean_terminated_length": 731.541015625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.8002048305880345, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08942147472667678, "kl": 0.0902099609375, "learning_rate": 2.3472188600768043e-06, "loss": 0.0082, "num_tokens": 1121818760.0, "reward": 2.04345703125, "reward_std": 0.09234350919723511, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 782.609375, "completions/mean_terminated_length": 782.609375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.8005462149014253, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12034473577720659, "kl": 0.0870361328125, "learning_rate": 2.339555568810221e-06, "loss": 0.0053, "num_tokens": 1122307520.0, "reward": 2.1162109375, "reward_std": 0.2050863802433014, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 775.427734375, "completions/mean_terminated_length": 775.427734375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8008875992148161, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11596709651906603, "kl": 0.0904541015625, "learning_rate": 2.3319031501433907e-06, "loss": 0.0169, "num_tokens": 1122797995.0, "reward": 2.080078125, "reward_std": 0.17570778727531433, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 723.845703125, "completions/mean_terminated_length": 723.845703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8012289835282069, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08968228561016195, "kl": 0.0950927734375, "learning_rate": 2.3242616149375285e-06, "loss": 0.0015, "num_tokens": 1123254620.0, "reward": 2.14892578125, "reward_std": 0.1381072700023651, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 750.380859375, "completions/mean_terminated_length": 750.380859375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8015703678415976, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08113532137365073, "kl": 0.08837890625, "learning_rate": 2.316630974038391e-06, "loss": 0.0094, "num_tokens": 1123720223.0, "reward": 2.02880859375, "reward_std": 0.11579383909702301, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 779.8359375, "completions/mean_terminated_length": 779.8359375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8019117521549884, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09497797608728874, "kl": 0.0863037109375, "learning_rate": 2.3090112382762765e-06, "loss": 0.0102, "num_tokens": 1124204059.0, "reward": 2.09375, "reward_std": 0.11479055881500244, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 721.595703125, "completions/mean_terminated_length": 721.595703125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8022531364683793, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1410178247636979, "kl": 0.0928955078125, "learning_rate": 2.3014024184660098e-06, "loss": 0.016, "num_tokens": 1124652540.0, "reward": 2.1015625, "reward_std": 0.20596812665462494, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 728.9140625, "completions/mean_terminated_length": 728.9140625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.8025945207817701, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1299782050215643, "kl": 0.0904541015625, "learning_rate": 2.2938045254069152e-06, "loss": 0.0225, "num_tokens": 1125116672.0, "reward": 2.07861328125, "reward_std": 0.1966109722852707, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 757.212890625, "completions/mean_terminated_length": 757.212890625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8029359050951609, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11620059553236503, "kl": 0.0867919921875, "learning_rate": 2.2862175698828183e-06, "loss": 0.0204, "num_tokens": 1125591549.0, "reward": 2.13037109375, "reward_std": 0.19289112091064453, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 773.017578125, "completions/mean_terminated_length": 773.017578125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8032772894085517, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10408099226507286, "kl": 0.090087890625, "learning_rate": 2.278641562662006e-06, "loss": 0.0146, "num_tokens": 1126073190.0, "reward": 2.07421875, "reward_std": 0.1599620282649994, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 803.826171875, "completions/mean_terminated_length": 803.826171875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8036186737219425, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13449781409681538, "kl": 0.0960693359375, "learning_rate": 2.2710765144972434e-06, "loss": 0.026, "num_tokens": 1126573725.0, "reward": 2.05517578125, "reward_std": 0.20842324197292328, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 818.46875, "completions/mean_terminated_length": 818.46875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.8039600580353333, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12631489242846541, "kl": 0.087646484375, "learning_rate": 2.263522436125729e-06, "loss": 0.007, "num_tokens": 1127081373.0, "reward": 2.09130859375, "reward_std": 0.22733303904533386, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 745.26171875, "completions/mean_terminated_length": 745.26171875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.804301442348724, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12283823432813673, "kl": 0.0938720703125, "learning_rate": 2.255979338269093e-06, "loss": 0.0246, "num_tokens": 1127542067.0, "reward": 2.11376953125, "reward_std": 0.21445593237876892, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 795.9765625, "completions/mean_terminated_length": 795.9765625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8046428266621148, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10663893796969748, "kl": 0.0897216796875, "learning_rate": 2.2484472316333882e-06, "loss": 0.0118, "num_tokens": 1128047159.0, "reward": 2.06884765625, "reward_std": 0.19542229175567627, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 825.111328125, "completions/mean_terminated_length": 825.111328125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8049842109755057, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10622512442142329, "kl": 0.0816650390625, "learning_rate": 2.2409261269090566e-06, "loss": 0.0077, "num_tokens": 1128563728.0, "reward": 2.06494140625, "reward_std": 0.1422327756881714, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 804.140625, "completions/mean_terminated_length": 804.140625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.8053255952888965, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08648083481614481, "kl": 0.08837890625, "learning_rate": 2.2334160347709368e-06, "loss": 0.0342, "num_tokens": 1129062584.0, "reward": 2.03076171875, "reward_std": 0.07979878038167953, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 771.591796875, "completions/mean_terminated_length": 771.591796875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8056669796022873, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09991823599709077, "kl": 0.09423828125, "learning_rate": 2.2259169658782285e-06, "loss": 0.0125, "num_tokens": 1129533207.0, "reward": 2.05517578125, "reward_std": 0.13407176733016968, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1591.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 752.7421875, "completions/mean_terminated_length": 751.990234375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8060083639156781, "frac_reward_zero_std": 0.5, "grad_norm": 0.3167881049360454, "kl": 0.1904296875, "learning_rate": 2.2184289308744844e-06, "loss": 0.0174, "num_tokens": 1129995363.0, "reward": 2.16162109375, "reward_std": 0.1788094937801361, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 714.736328125, "completions/mean_terminated_length": 714.736328125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8063497482290689, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11438384287549501, "kl": 0.0914306640625, "learning_rate": 2.2109519403876068e-06, "loss": 0.0066, "num_tokens": 1130448828.0, "reward": 2.14306640625, "reward_std": 0.1601743996143341, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 790.625, "completions/mean_terminated_length": 788.1643676757812, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.8066911325424597, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12509871753730853, "kl": 0.086181640625, "learning_rate": 2.2034860050298114e-06, "loss": 0.0268, "num_tokens": 1130937340.0, "reward": 2.1064453125, "reward_std": 0.19787320494651794, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 747.798828125, "completions/mean_terminated_length": 747.798828125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8070325168558504, "frac_reward_zero_std": 0.375, "grad_norm": 0.12596329363951242, "kl": 0.0897216796875, "learning_rate": 2.1960311353976318e-06, "loss": 0.0258, "num_tokens": 1131396021.0, "reward": 2.13818359375, "reward_std": 0.23216594755649567, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 748.40234375, "completions/mean_terminated_length": 748.40234375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8073739011692412, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10641731942958145, "kl": 0.090576171875, "learning_rate": 2.1885873420718882e-06, "loss": 0.0098, "num_tokens": 1131863731.0, "reward": 2.05908203125, "reward_std": 0.10658155381679535, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 817.13671875, "completions/mean_terminated_length": 817.13671875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.807715285482632, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13487985430963856, "kl": 0.0887451171875, "learning_rate": 2.181154635617687e-06, "loss": 0.0195, "num_tokens": 1132369545.0, "reward": 2.12890625, "reward_std": 0.25346285104751587, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 795.029296875, "completions/mean_terminated_length": 795.029296875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.8080566697960229, "frac_reward_zero_std": 0.625, "grad_norm": 0.0967919156363253, "kl": 0.0888671875, "learning_rate": 2.1737330265843958e-06, "loss": 0.0164, "num_tokens": 1132869096.0, "reward": 2.0625, "reward_std": 0.1352614164352417, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 770.2890625, "completions/mean_terminated_length": 767.7886352539062, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8083980541094137, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09515505101552613, "kl": 0.0887451171875, "learning_rate": 2.1663225255056264e-06, "loss": 0.0251, "num_tokens": 1133352316.0, "reward": 2.09521484375, "reward_std": 0.15908372402191162, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 804.9140625, "completions/mean_terminated_length": 802.4813842773438, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8087394384228045, "frac_reward_zero_std": 0.375, "grad_norm": 0.13058523961082769, "kl": 0.080810546875, "learning_rate": 2.1589231428992352e-06, "loss": 0.038, "num_tokens": 1133845664.0, "reward": 2.197265625, "reward_std": 0.24709883332252502, "rewards/accuracy_reward/mean": 0.220703125, "rewards/accuracy_reward/std": 0.4151262938976288, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 750.931640625, "completions/mean_terminated_length": 750.931640625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8090808227361953, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09328451123675134, "kl": 0.0863037109375, "learning_rate": 2.151534889267287e-06, "loss": 0.011, "num_tokens": 1134309149.0, "reward": 2.07861328125, "reward_std": 0.14198225736618042, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 752.5859375, "completions/mean_terminated_length": 752.5859375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8094222070495861, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10052112902776077, "kl": 0.0894775390625, "learning_rate": 2.144157775096063e-06, "loss": 0.0167, "num_tokens": 1134795369.0, "reward": 2.095703125, "reward_std": 0.1582655906677246, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 771.017578125, "completions/mean_terminated_length": 771.017578125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8097635913629768, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11015849363044786, "kl": 0.089111328125, "learning_rate": 2.1367918108560206e-06, "loss": 0.0146, "num_tokens": 1135277074.0, "reward": 2.08056640625, "reward_std": 0.16521382331848145, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 788.0, "completions/mean_terminated_length": 788.0, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8101049756763676, "frac_reward_zero_std": 0.5, "grad_norm": 0.11864322784681386, "kl": 0.0865478515625, "learning_rate": 2.129437007001808e-06, "loss": 0.018, "num_tokens": 1135768258.0, "reward": 2.048828125, "reward_std": 0.18190380930900574, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04655282944440842, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 812.705078125, "completions/mean_terminated_length": 810.2876586914062, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8104463599897584, "frac_reward_zero_std": 0.34375, "grad_norm": 0.13234800719436923, "kl": 0.0830078125, "learning_rate": 2.1220933739722125e-06, "loss": 0.048, "num_tokens": 1136270139.0, "reward": 2.0654296875, "reward_std": 0.22644609212875366, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 862.650390625, "completions/mean_terminated_length": 860.3306884765625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8107877443031493, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11139982152383873, "kl": 0.0816650390625, "learning_rate": 2.1147609221901843e-06, "loss": 0.0256, "num_tokens": 1136793832.0, "reward": 2.08349609375, "reward_std": 0.19420671463012695, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 773.509765625, "completions/mean_terminated_length": 773.509765625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8111291286165401, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11751997607109672, "kl": 0.088623046875, "learning_rate": 2.1074396620628003e-06, "loss": 0.0172, "num_tokens": 1137299373.0, "reward": 2.076171875, "reward_std": 0.19162340462207794, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 750.498046875, "completions/mean_terminated_length": 750.498046875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8114705129299309, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0940093087438306, "kl": 0.08740234375, "learning_rate": 2.1001296039812436e-06, "loss": 0.0047, "num_tokens": 1137769964.0, "reward": 2.05712890625, "reward_std": 0.11105500161647797, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 778.1640625, "completions/mean_terminated_length": 778.1640625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8118118972433217, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13128547039739122, "kl": 0.087158203125, "learning_rate": 2.092830758320811e-06, "loss": 0.0126, "num_tokens": 1138248352.0, "reward": 2.0634765625, "reward_std": 0.22205492854118347, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05831611156463623, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 749.904296875, "completions/mean_terminated_length": 749.904296875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8121532815567125, "frac_reward_zero_std": 0.375, "grad_norm": 0.1294238386917774, "kl": 0.08837890625, "learning_rate": 2.085543135440876e-06, "loss": 0.0091, "num_tokens": 1138714175.0, "reward": 2.15625, "reward_std": 0.2429991066455841, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 760.630859375, "completions/mean_terminated_length": 758.9667358398438, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.8124946658701032, "frac_reward_zero_std": 0.625, "grad_norm": 0.7303041083901726, "kl": 0.3238525390625, "learning_rate": 2.078266745684886e-06, "loss": 0.0238, "num_tokens": 1139182898.0, "reward": 2.1025390625, "reward_std": 0.1447305828332901, "rewards/accuracy_reward/mean": 0.11895161122083664, "rewards/accuracy_reward/std": 0.3240584135055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.046829111874103546, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 731.28125, "completions/mean_terminated_length": 731.28125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.812836050183494, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10508203785811496, "kl": 0.084716796875, "learning_rate": 2.0710015993803425e-06, "loss": 0.0182, "num_tokens": 1139639346.0, "reward": 2.05419921875, "reward_std": 0.14472874999046326, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 807.53515625, "completions/mean_terminated_length": 807.53515625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.8131774344968848, "frac_reward_zero_std": 0.5, "grad_norm": 0.10774892872894196, "kl": 0.0789794921875, "learning_rate": 2.063747706838796e-06, "loss": 0.0069, "num_tokens": 1140141940.0, "reward": 2.07177734375, "reward_std": 0.16839070618152618, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 692.298828125, "completions/mean_terminated_length": 692.298828125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8135188188102757, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12604635418789245, "kl": 0.0870361328125, "learning_rate": 2.0565050783558217e-06, "loss": 0.0246, "num_tokens": 1140574589.0, "reward": 2.22412109375, "reward_std": 0.2093288153409958, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42402184009552, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 789.232421875, "completions/mean_terminated_length": 784.296142578125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8138602031236665, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09826495632285731, "kl": 0.085693359375, "learning_rate": 2.0492737242109993e-06, "loss": 0.0161, "num_tokens": 1141061908.0, "reward": 2.08349609375, "reward_std": 0.1431598961353302, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 821.826171875, "completions/mean_terminated_length": 821.826171875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8142015874370573, "frac_reward_zero_std": 0.5, "grad_norm": 0.10986564112898076, "kl": 0.08203125, "learning_rate": 2.042053654667925e-06, "loss": 0.0051, "num_tokens": 1141574187.0, "reward": 2.1015625, "reward_std": 0.16787907481193542, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 838.810546875, "completions/mean_terminated_length": 838.810546875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8145429717504481, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09342116962301042, "kl": 0.0826416015625, "learning_rate": 2.034844879974154e-06, "loss": 0.0156, "num_tokens": 1142090842.0, "reward": 2.03125, "reward_std": 0.10652732849121094, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 771.212890625, "completions/mean_terminated_length": 771.212890625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8148843560638389, "frac_reward_zero_std": 0.625, "grad_norm": 0.10115609792477599, "kl": 0.0853271484375, "learning_rate": 2.027647410361231e-06, "loss": -0.0017, "num_tokens": 1142571175.0, "reward": 2.09521484375, "reward_std": 0.12893670797348022, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 712.76171875, "completions/mean_terminated_length": 712.76171875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8152257403772296, "frac_reward_zero_std": 0.625, "grad_norm": 0.09986083168329661, "kl": 0.0870361328125, "learning_rate": 2.0204612560446433e-06, "loss": 0.0005, "num_tokens": 1143028797.0, "reward": 2.1435546875, "reward_std": 0.1501334011554718, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 815.322265625, "completions/mean_terminated_length": 812.9099731445312, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.8155671246906204, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1116915268333819, "kl": 0.0860595703125, "learning_rate": 2.013286427223825e-06, "loss": 0.0122, "num_tokens": 1143529314.0, "reward": 2.0615234375, "reward_std": 0.219620019197464, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06218579038977623, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 848.5234375, "completions/mean_terminated_length": 848.5234375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.8159085090040112, "frac_reward_zero_std": 0.625, "grad_norm": 0.0892762927336334, "kl": 0.0828857421875, "learning_rate": 2.0061229340821365e-06, "loss": 0.0071, "num_tokens": 1144047038.0, "reward": 2.0576171875, "reward_std": 0.14451035857200623, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 819.5625, "completions/mean_terminated_length": 819.5625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.816249893317402, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12030162910215009, "kl": 0.0799560546875, "learning_rate": 1.9989707867868423e-06, "loss": 0.0338, "num_tokens": 1144551998.0, "reward": 2.1611328125, "reward_std": 0.24572032690048218, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 775.75, "completions/mean_terminated_length": 775.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.8165912776307929, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09608652569080607, "kl": 0.0849609375, "learning_rate": 1.9918299954891084e-06, "loss": 0.0126, "num_tokens": 1145030462.0, "reward": 2.134765625, "reward_std": 0.16035668551921844, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 803.3046875, "completions/mean_terminated_length": 801.9334716796875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8169326619441837, "frac_reward_zero_std": 0.40625, "grad_norm": 0.3466625432074308, "kl": 0.1973876953125, "learning_rate": 1.9847005703239786e-06, "loss": 0.0257, "num_tokens": 1145521930.0, "reward": 2.12646484375, "reward_std": 0.22417938709259033, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 847.8359375, "completions/mean_terminated_length": 845.4872436523438, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8172740462575745, "frac_reward_zero_std": 0.5, "grad_norm": 0.09938913363672403, "kl": 0.0782470703125, "learning_rate": 1.977582521410374e-06, "loss": 0.0141, "num_tokens": 1146034390.0, "reward": 2.12548828125, "reward_std": 0.18245133757591248, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 811.33984375, "completions/mean_terminated_length": 811.33984375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8176154305709653, "frac_reward_zero_std": 0.5, "grad_norm": 0.11525157320088487, "kl": 0.0872802734375, "learning_rate": 1.9704758588510575e-06, "loss": 0.0033, "num_tokens": 1146539956.0, "reward": 2.08837890625, "reward_std": 0.15240603685379028, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 786.623046875, "completions/mean_terminated_length": 786.623046875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.817956814884356, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12414200043441444, "kl": 0.084716796875, "learning_rate": 1.9633805927326386e-06, "loss": 0.0193, "num_tokens": 1147026147.0, "reward": 2.052734375, "reward_std": 0.19405335187911987, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 843.50390625, "completions/mean_terminated_length": 841.1467895507812, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.8182981991977468, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09924854696326886, "kl": 0.0848388671875, "learning_rate": 1.9562967331255554e-06, "loss": 0.0187, "num_tokens": 1147538997.0, "reward": 2.05126953125, "reward_std": 0.13944359123706818, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 810.822265625, "completions/mean_terminated_length": 808.4011840820312, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8186395835111376, "frac_reward_zero_std": 0.625, "grad_norm": 0.10286749861783726, "kl": 0.0850830078125, "learning_rate": 1.949224290084042e-06, "loss": 0.0095, "num_tokens": 1148040218.0, "reward": 2.10302734375, "reward_std": 0.14694131910800934, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 771.2421875, "completions/mean_terminated_length": 771.2421875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8189809678245284, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10868466691906183, "kl": 0.0888671875, "learning_rate": 1.9421632736461437e-06, "loss": 0.0033, "num_tokens": 1148516038.0, "reward": 2.0791015625, "reward_std": 0.14197850227355957, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1773.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 778.96875, "completions/mean_terminated_length": 777.0234985351562, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8193223521379193, "frac_reward_zero_std": 0.375, "grad_norm": 0.13857677182313421, "kl": 0.0899658203125, "learning_rate": 1.9351136938336777e-06, "loss": 0.0123, "num_tokens": 1149001286.0, "reward": 2.05859375, "reward_std": 0.190807044506073, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 872.34765625, "completions/mean_terminated_length": 872.34765625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.8196637364513101, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08036715718094653, "kl": 0.080078125, "learning_rate": 1.9280755606522383e-06, "loss": 0.0195, "num_tokens": 1149525224.0, "reward": 2.0380859375, "reward_std": 0.11271896213293076, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 790.904296875, "completions/mean_terminated_length": 785.9745483398438, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8200051207647009, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12512242742662677, "kl": 0.087158203125, "learning_rate": 1.921048884091162e-06, "loss": 0.0108, "num_tokens": 1150009751.0, "reward": 2.12646484375, "reward_std": 0.2150183618068695, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 779.9375, "completions/mean_terminated_length": 779.9375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8203465050780917, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11297904323138414, "kl": 0.086181640625, "learning_rate": 1.914033674123538e-06, "loss": 0.0023, "num_tokens": 1150487607.0, "reward": 2.12646484375, "reward_std": 0.15410706400871277, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 779.486328125, "completions/mean_terminated_length": 779.486328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8206878893914825, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10655668828862033, "kl": 0.08447265625, "learning_rate": 1.9070299407061687e-06, "loss": 0.0113, "num_tokens": 1150966192.0, "reward": 2.109375, "reward_std": 0.18028204143047333, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 741.74609375, "completions/mean_terminated_length": 741.74609375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8210292737048732, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12184177901033195, "kl": 0.0908203125, "learning_rate": 1.9000376937795728e-06, "loss": 0.0137, "num_tokens": 1151427694.0, "reward": 2.06982421875, "reward_std": 0.20395582914352417, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 826.5234375, "completions/mean_terminated_length": 824.133056640625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.821370658018264, "frac_reward_zero_std": 0.5625, "grad_norm": 0.104601739989558, "kl": 0.081298828125, "learning_rate": 1.8930569432679692e-06, "loss": 0.0058, "num_tokens": 1151944538.0, "reward": 2.0986328125, "reward_std": 0.1444714516401291, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1368.0, "completions/max_terminated_length": 1368.0, "completions/mean_length": 740.26953125, "completions/mean_terminated_length": 740.26953125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8217120423316548, "frac_reward_zero_std": 0.5, "grad_norm": 0.11966649955050236, "kl": 0.087158203125, "learning_rate": 1.886087699079252e-06, "loss": 0.0095, "num_tokens": 1152402452.0, "reward": 2.13818359375, "reward_std": 0.19472216069698334, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 739.189453125, "completions/mean_terminated_length": 739.189453125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8220534266450457, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09534254039273896, "kl": 0.0867919921875, "learning_rate": 1.8791299711049937e-06, "loss": 0.0127, "num_tokens": 1152853797.0, "reward": 2.0390625, "reward_std": 0.13188263773918152, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 773.1796875, "completions/mean_terminated_length": 773.1796875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8223948109584365, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11295539145196437, "kl": 0.0904541015625, "learning_rate": 1.8721837692204115e-06, "loss": 0.0183, "num_tokens": 1153331089.0, "reward": 2.08154296875, "reward_std": 0.18647006154060364, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 753.859375, "completions/mean_terminated_length": 753.859375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8227361952718273, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11978760660139641, "kl": 0.08837890625, "learning_rate": 1.8652491032843723e-06, "loss": 0.0086, "num_tokens": 1153796169.0, "reward": 2.091796875, "reward_std": 0.1600000262260437, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 739.548828125, "completions/mean_terminated_length": 739.548828125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8230775795852181, "frac_reward_zero_std": 0.5, "grad_norm": 0.11796285432936664, "kl": 0.0845947265625, "learning_rate": 1.8583259831393663e-06, "loss": 0.0129, "num_tokens": 1154255682.0, "reward": 2.1796875, "reward_std": 0.1722034215927124, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 822.884765625, "completions/mean_terminated_length": 822.884765625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.8234189638986089, "frac_reward_zero_std": 0.25, "grad_norm": 0.1280951077341392, "kl": 0.0860595703125, "learning_rate": 1.8514144186114913e-06, "loss": 0.0186, "num_tokens": 1154752775.0, "reward": 2.134765625, "reward_std": 0.28431785106658936, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 787.455078125, "completions/mean_terminated_length": 787.455078125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8237603482119996, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09670621483973857, "kl": 0.0792236328125, "learning_rate": 1.8445144195104546e-06, "loss": 0.0127, "num_tokens": 1155244048.0, "reward": 2.1044921875, "reward_std": 0.15908639132976532, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 792.98046875, "completions/mean_terminated_length": 792.98046875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.8241017325253904, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11586975771032584, "kl": 0.08544921875, "learning_rate": 1.8376259956295394e-06, "loss": 0.0117, "num_tokens": 1155738390.0, "reward": 2.10205078125, "reward_std": 0.19227664172649384, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 746.275390625, "completions/mean_terminated_length": 746.275390625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8244431168387812, "frac_reward_zero_std": 0.375, "grad_norm": 0.13958530702112387, "kl": 0.0904541015625, "learning_rate": 1.830749156745607e-06, "loss": 0.0282, "num_tokens": 1156203203.0, "reward": 2.09130859375, "reward_std": 0.22615846991539001, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 785.0, "completions/mean_terminated_length": 785.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8247845011521721, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09404311516795072, "kl": 0.0888671875, "learning_rate": 1.8238839126190687e-06, "loss": 0.0104, "num_tokens": 1156687811.0, "reward": 2.10107421875, "reward_std": 0.13822388648986816, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 784.244140625, "completions/mean_terminated_length": 784.244140625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8251258854655629, "frac_reward_zero_std": 0.5, "grad_norm": 0.1146002179265566, "kl": 0.0836181640625, "learning_rate": 1.8170302729938837e-06, "loss": 0.0211, "num_tokens": 1157172704.0, "reward": 2.0986328125, "reward_std": 0.18143784999847412, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 810.9609375, "completions/mean_terminated_length": 810.0723876953125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.8254672697789537, "frac_reward_zero_std": 0.625, "grad_norm": 0.14536207017626657, "kl": 0.278564453125, "learning_rate": 1.8101882475975418e-06, "loss": 0.0203, "num_tokens": 1157670348.0, "reward": 2.09423828125, "reward_std": 0.15769995748996735, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1913.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 783.90234375, "completions/mean_terminated_length": 783.90234375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8258086540923445, "frac_reward_zero_std": 0.625, "grad_norm": 0.09511871282464178, "kl": 0.0849609375, "learning_rate": 1.8033578461410438e-06, "loss": 0.016, "num_tokens": 1158153658.0, "reward": 2.09521484375, "reward_std": 0.15130071341991425, "rewards/accuracy_reward/mean": 0.12083332985639572, "rewards/accuracy_reward/std": 0.32627353072166443, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 781.93359375, "completions/mean_terminated_length": 781.93359375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8261500384057353, "frac_reward_zero_std": 0.625, "grad_norm": 0.10894391739178573, "kl": 0.079345703125, "learning_rate": 1.7965390783188985e-06, "loss": 0.0068, "num_tokens": 1158638136.0, "reward": 2.09912109375, "reward_std": 0.1524536907672882, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 796.287109375, "completions/mean_terminated_length": 796.287109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.826491422719126, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13550336111638664, "kl": 0.09521484375, "learning_rate": 1.7897319538090962e-06, "loss": 0.0183, "num_tokens": 1159127323.0, "reward": 2.041015625, "reward_std": 0.14379215240478516, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 829.98828125, "completions/mean_terminated_length": 829.98828125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.8268328070325168, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10828175231612065, "kl": 0.088134765625, "learning_rate": 1.7829364822731087e-06, "loss": 0.0071, "num_tokens": 1159640757.0, "reward": 2.07763671875, "reward_std": 0.15954171121120453, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 733.58984375, "completions/mean_terminated_length": 733.58984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8271741913459076, "frac_reward_zero_std": 0.625, "grad_norm": 0.12098650850046486, "kl": 0.090576171875, "learning_rate": 1.7761526733558644e-06, "loss": 0.0211, "num_tokens": 1160099299.0, "reward": 2.0908203125, "reward_std": 0.1291772425174713, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 775.99609375, "completions/mean_terminated_length": 775.99609375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.8275155756592985, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12790427521561007, "kl": 0.088623046875, "learning_rate": 1.7693805366857342e-06, "loss": 0.0129, "num_tokens": 1160581105.0, "reward": 2.10498046875, "reward_std": 0.2155536413192749, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 746.40234375, "completions/mean_terminated_length": 746.40234375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8278569599726893, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1114181084487019, "kl": 0.0831298828125, "learning_rate": 1.7626200818745342e-06, "loss": 0.011, "num_tokens": 1161045855.0, "reward": 2.1201171875, "reward_std": 0.16763846576213837, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 745.109375, "completions/mean_terminated_length": 745.109375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8281983442860801, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12518383621835938, "kl": 0.085693359375, "learning_rate": 1.7558713185174881e-06, "loss": 0.0016, "num_tokens": 1161511303.0, "reward": 2.20751953125, "reward_std": 0.21120180189609528, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4190165400505066, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1586.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 783.65625, "completions/mean_terminated_length": 783.65625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8285397285994709, "frac_reward_zero_std": 0.5, "grad_norm": 0.10898387417609855, "kl": 0.0911865234375, "learning_rate": 1.7491342561932356e-06, "loss": 0.0109, "num_tokens": 1161995879.0, "reward": 2.1103515625, "reward_std": 0.19693374633789062, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 766.935546875, "completions/mean_terminated_length": 766.935546875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8288811129128617, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1319462519232919, "kl": 0.0863037109375, "learning_rate": 1.7424089044638026e-06, "loss": 0.0182, "num_tokens": 1162468806.0, "reward": 2.15087890625, "reward_std": 0.20533913373947144, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 726.6640625, "completions/mean_terminated_length": 726.6640625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8292224972262524, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10589165979788188, "kl": 0.085693359375, "learning_rate": 1.7356952728745935e-06, "loss": 0.0006, "num_tokens": 1162923114.0, "reward": 2.1904296875, "reward_std": 0.1938788890838623, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 783.517578125, "completions/mean_terminated_length": 783.517578125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.8295638815396432, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11722743304091648, "kl": 0.0858154296875, "learning_rate": 1.7289933709543848e-06, "loss": 0.0127, "num_tokens": 1163404595.0, "reward": 2.12841796875, "reward_std": 0.17202496528625488, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 713.53515625, "completions/mean_terminated_length": 713.53515625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.829905265853034, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13514907741730836, "kl": 0.08544921875, "learning_rate": 1.722303208215297e-06, "loss": 0.0309, "num_tokens": 1163853829.0, "reward": 2.1533203125, "reward_std": 0.2160234898328781, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 802.478515625, "completions/mean_terminated_length": 802.478515625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.8302466501664248, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09080809948403801, "kl": 0.0843505859375, "learning_rate": 1.7156247941527949e-06, "loss": 0.0037, "num_tokens": 1164347034.0, "reward": 2.037109375, "reward_std": 0.1241372749209404, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 776.68359375, "completions/mean_terminated_length": 776.68359375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.8305880344798157, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1199770762474887, "kl": 0.086669921875, "learning_rate": 1.708958138245662e-06, "loss": 0.0084, "num_tokens": 1164855096.0, "reward": 2.10009765625, "reward_std": 0.17131829261779785, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 782.931640625, "completions/mean_terminated_length": 782.931640625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.8309294187932065, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0847783660282704, "kl": 0.0843505859375, "learning_rate": 1.702303249956002e-06, "loss": 0.0107, "num_tokens": 1165342197.0, "reward": 2.0771484375, "reward_std": 0.09935696423053741, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 767.939453125, "completions/mean_terminated_length": 767.939453125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8312708031065973, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12218641328434163, "kl": 0.085205078125, "learning_rate": 1.6956601387292093e-06, "loss": 0.0116, "num_tokens": 1165815270.0, "reward": 2.06689453125, "reward_std": 0.19098392128944397, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 775.134765625, "completions/mean_terminated_length": 775.134765625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8316121874199881, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11219592044493872, "kl": 0.0806884765625, "learning_rate": 1.6890288139939625e-06, "loss": 0.0051, "num_tokens": 1166299275.0, "reward": 2.05126953125, "reward_std": 0.14104409515857697, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 714.01953125, "completions/mean_terminated_length": 714.01953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8319535717333788, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3324325151150574, "kl": 0.0933837890625, "learning_rate": 1.6824092851622198e-06, "loss": 0.0084, "num_tokens": 1166743637.0, "reward": 2.08984375, "reward_std": 0.10386867821216583, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 745.955078125, "completions/mean_terminated_length": 745.955078125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8322949560467696, "frac_reward_zero_std": 0.5, "grad_norm": 0.1253838399157319, "kl": 0.09375, "learning_rate": 1.6758015616291868e-06, "loss": 0.0148, "num_tokens": 1167213406.0, "reward": 2.09716796875, "reward_std": 0.18941475450992584, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 801.447265625, "completions/mean_terminated_length": 801.447265625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8326363403601604, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09655032574514752, "kl": 0.0859375, "learning_rate": 1.6692056527733214e-06, "loss": 0.0093, "num_tokens": 1167710963.0, "reward": 2.11328125, "reward_std": 0.14598602056503296, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 729.17578125, "completions/mean_terminated_length": 729.17578125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.8329777246735512, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13164977698119468, "kl": 0.0897216796875, "learning_rate": 1.6626215679563074e-06, "loss": 0.0167, "num_tokens": 1168165517.0, "reward": 2.06005859375, "reward_std": 0.20424577593803406, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 742.26953125, "completions/mean_terminated_length": 742.26953125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.8333191089869421, "frac_reward_zero_std": 0.375, "grad_norm": 0.1404942431113272, "kl": 0.090576171875, "learning_rate": 1.6560493165230518e-06, "loss": 0.0284, "num_tokens": 1168632759.0, "reward": 2.220703125, "reward_std": 0.2701179087162018, "rewards/accuracy_reward/mean": 0.2479838728904724, "rewards/accuracy_reward/std": 0.4322783946990967, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 743.56640625, "completions/mean_terminated_length": 743.56640625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8336604933003329, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11427106379064034, "kl": 0.0933837890625, "learning_rate": 1.6494889078016628e-06, "loss": 0.0118, "num_tokens": 1169095545.0, "reward": 2.05078125, "reward_std": 0.14641691744327545, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 833.7578125, "completions/mean_terminated_length": 833.7578125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8340018776137237, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0966584083872057, "kl": 0.0841064453125, "learning_rate": 1.642940351103437e-06, "loss": 0.0189, "num_tokens": 1169625725.0, "reward": 2.06640625, "reward_std": 0.15950526297092438, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 787.40625, "completions/mean_terminated_length": 787.40625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8343432619271145, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1280312679878114, "kl": 0.090576171875, "learning_rate": 1.636403655722858e-06, "loss": 0.0094, "num_tokens": 1170110861.0, "reward": 2.04345703125, "reward_std": 0.18630169332027435, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 730.240234375, "completions/mean_terminated_length": 730.240234375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8346846462405052, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1022972670149911, "kl": 0.08837890625, "learning_rate": 1.6298788309375646e-06, "loss": 0.0115, "num_tokens": 1170564424.0, "reward": 2.09326171875, "reward_std": 0.12232809513807297, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 736.40625, "completions/mean_terminated_length": 736.40625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.835026030553896, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11439600126301168, "kl": 0.09228515625, "learning_rate": 1.6233658860083567e-06, "loss": 0.0134, "num_tokens": 1171020232.0, "reward": 2.0478515625, "reward_std": 0.14860782027244568, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 797.970703125, "completions/mean_terminated_length": 797.970703125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8353674148672868, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11198128500882602, "kl": 0.084228515625, "learning_rate": 1.6168648301791668e-06, "loss": 0.0054, "num_tokens": 1171509817.0, "reward": 2.0859375, "reward_std": 0.15491604804992676, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 716.28515625, "completions/mean_terminated_length": 716.28515625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8357087991806776, "frac_reward_zero_std": 0.5, "grad_norm": 0.12900151646441338, "kl": 0.089111328125, "learning_rate": 1.6103756726770513e-06, "loss": 0.0102, "num_tokens": 1171952475.0, "reward": 2.125, "reward_std": 0.1581553965806961, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 777.208984375, "completions/mean_terminated_length": 777.208984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8360501834940685, "frac_reward_zero_std": 0.5, "grad_norm": 0.12034144498517667, "kl": 0.0880126953125, "learning_rate": 1.6038984227121878e-06, "loss": 0.0148, "num_tokens": 1172439062.0, "reward": 2.12548828125, "reward_std": 0.15121032297611237, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 780.21484375, "completions/mean_terminated_length": 780.21484375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8363915678074593, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11068704082623909, "kl": 0.0823974609375, "learning_rate": 1.5974330894778422e-06, "loss": 0.02, "num_tokens": 1172920596.0, "reward": 2.10888671875, "reward_std": 0.15625295042991638, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 767.416015625, "completions/mean_terminated_length": 767.416015625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8367329521208501, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1279872673796529, "kl": 0.0889892578125, "learning_rate": 1.5909796821503787e-06, "loss": 0.0179, "num_tokens": 1173390313.0, "reward": 2.15234375, "reward_std": 0.24542236328125, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 843.2109375, "completions/mean_terminated_length": 843.2109375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.8370743364342409, "frac_reward_zero_std": 0.53125, "grad_norm": 0.14681393071876328, "kl": 0.07861328125, "learning_rate": 1.5845382098892226e-06, "loss": 0.008, "num_tokens": 1173911445.0, "reward": 2.06591796875, "reward_std": 0.1601865440607071, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 837.25390625, "completions/mean_terminated_length": 832.5059204101562, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.8374157207476316, "frac_reward_zero_std": 0.59375, "grad_norm": 0.091152248193791, "kl": 0.0816650390625, "learning_rate": 1.578108681836874e-06, "loss": 0.0125, "num_tokens": 1174422567.0, "reward": 2.017578125, "reward_std": 0.1352192759513855, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 805.703125, "completions/mean_terminated_length": 805.703125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.8377571050610224, "frac_reward_zero_std": 0.625, "grad_norm": 0.09615402330639793, "kl": 0.0870361328125, "learning_rate": 1.5716911071188611e-06, "loss": 0.0113, "num_tokens": 1174914895.0, "reward": 2.087890625, "reward_std": 0.15244118869304657, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 810.671875, "completions/mean_terminated_length": 810.671875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.8380984893744132, "frac_reward_zero_std": 0.71875, "grad_norm": 0.07632111840573849, "kl": 0.080322265625, "learning_rate": 1.5652854948437634e-06, "loss": 0.015, "num_tokens": 1175417255.0, "reward": 2.06396484375, "reward_std": 0.10281579196453094, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 719.541015625, "completions/mean_terminated_length": 719.541015625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.838439873687804, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0914419398021738, "kl": 0.087646484375, "learning_rate": 1.5588918541031783e-06, "loss": 0.0038, "num_tokens": 1175866428.0, "reward": 2.12060546875, "reward_std": 0.15054889023303986, "rewards/accuracy_reward/mean": 0.14314515888690948, "rewards/accuracy_reward/std": 0.35057440400123596, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 712.580078125, "completions/mean_terminated_length": 712.580078125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8387812580011949, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13217283203937671, "kl": 0.091796875, "learning_rate": 1.5525101939717024e-06, "loss": 0.0018, "num_tokens": 1176306501.0, "reward": 2.15673828125, "reward_std": 0.2139018476009369, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 772.22265625, "completions/mean_terminated_length": 772.22265625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.8391226423145857, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10945926172340498, "kl": 0.0858154296875, "learning_rate": 1.5461405235069427e-06, "loss": 0.0056, "num_tokens": 1176785287.0, "reward": 2.1259765625, "reward_std": 0.16572877764701843, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 799.97265625, "completions/mean_terminated_length": 799.97265625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8394640266279765, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1117198831974148, "kl": 0.0810546875, "learning_rate": 1.5397828517494785e-06, "loss": 0.0108, "num_tokens": 1177278681.0, "reward": 2.1123046875, "reward_std": 0.17507247626781464, "rewards/accuracy_reward/mean": 0.13306452333927155, "rewards/accuracy_reward/std": 0.3399873673915863, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 758.23828125, "completions/mean_terminated_length": 758.23828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8398054109413673, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1173961862903346, "kl": 0.0986328125, "learning_rate": 1.5334371877228604e-06, "loss": 0.01, "num_tokens": 1177743987.0, "reward": 2.10205078125, "reward_std": 0.17928412556648254, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 744.548828125, "completions/mean_terminated_length": 744.548828125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.840146795254758, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08915745046986219, "kl": 0.0887451171875, "learning_rate": 1.5271035404335954e-06, "loss": 0.0115, "num_tokens": 1178199180.0, "reward": 2.11376953125, "reward_std": 0.1268950253725052, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 778.81640625, "completions/mean_terminated_length": 778.81640625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8404881795681488, "frac_reward_zero_std": 0.5, "grad_norm": 0.11034290825742211, "kl": 0.084228515625, "learning_rate": 1.5207819188711426e-06, "loss": 0.0309, "num_tokens": 1178677614.0, "reward": 2.0380859375, "reward_std": 0.17596662044525146, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 804.6171875, "completions/mean_terminated_length": 804.6171875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.8408295638815396, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10345784935179067, "kl": 0.081298828125, "learning_rate": 1.5144723320078868e-06, "loss": 0.0117, "num_tokens": 1179176298.0, "reward": 2.0439453125, "reward_std": 0.12066008895635605, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 742.599609375, "completions/mean_terminated_length": 741.117431640625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8411709481949304, "frac_reward_zero_std": 0.59375, "grad_norm": 0.162620339153362, "kl": 0.310302734375, "learning_rate": 1.5081747887991305e-06, "loss": 0.022, "num_tokens": 1179637405.0, "reward": 2.08349609375, "reward_std": 0.14498162269592285, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04538619518280029, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 789.8984375, "completions/mean_terminated_length": 789.8984375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8415123325083212, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09800966212811454, "kl": 0.0828857421875, "learning_rate": 1.501889298183089e-06, "loss": 0.0061, "num_tokens": 1180136745.0, "reward": 2.0400390625, "reward_std": 0.10392509400844574, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 825.46484375, "completions/mean_terminated_length": 825.46484375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8418537168217121, "frac_reward_zero_std": 0.5, "grad_norm": 0.11841471033567326, "kl": 0.0838623046875, "learning_rate": 1.4956158690808586e-06, "loss": 0.0149, "num_tokens": 1180644471.0, "reward": 2.0693359375, "reward_std": 0.18962278962135315, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 791.251953125, "completions/mean_terminated_length": 791.251953125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8421951011351029, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11202465753951006, "kl": 0.083740234375, "learning_rate": 1.4893545103964314e-06, "loss": 0.0067, "num_tokens": 1181130088.0, "reward": 2.09912109375, "reward_std": 0.15138356387615204, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 807.44140625, "completions/mean_terminated_length": 807.44140625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.8425364854484937, "frac_reward_zero_std": 0.5, "grad_norm": 0.11082285008592253, "kl": 0.0830078125, "learning_rate": 1.4831052310166561e-06, "loss": 0.0058, "num_tokens": 1181632602.0, "reward": 2.0517578125, "reward_std": 0.1770598590373993, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 826.822265625, "completions/mean_terminated_length": 826.822265625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8428778697618844, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11838905271698204, "kl": 0.0882568359375, "learning_rate": 1.4768680398112435e-06, "loss": 0.0103, "num_tokens": 1182139071.0, "reward": 2.0517578125, "reward_std": 0.18851588666439056, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 779.703125, "completions/mean_terminated_length": 779.703125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8432192540752752, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1107855880549987, "kl": 0.0809326171875, "learning_rate": 1.4706429456327486e-06, "loss": 0.0199, "num_tokens": 1182613847.0, "reward": 2.1396484375, "reward_std": 0.18844905495643616, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 777.744140625, "completions/mean_terminated_length": 777.744140625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.843560638388666, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10594493694656673, "kl": 0.0826416015625, "learning_rate": 1.4644299573165521e-06, "loss": 0.0168, "num_tokens": 1183092468.0, "reward": 2.07763671875, "reward_std": 0.14270474016666412, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 714.3515625, "completions/mean_terminated_length": 714.3515625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8439020227020568, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08700056236656352, "kl": 0.0928955078125, "learning_rate": 1.4582290836808544e-06, "loss": 0.013, "num_tokens": 1183538584.0, "reward": 2.09765625, "reward_std": 0.10986737906932831, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 787.642578125, "completions/mean_terminated_length": 784.305908203125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8442434070154476, "frac_reward_zero_std": 0.5, "grad_norm": 0.1626013837189049, "kl": 0.181396484375, "learning_rate": 1.4520403335266575e-06, "loss": 0.047, "num_tokens": 1184023185.0, "reward": 2.0732421875, "reward_std": 0.18069753050804138, "rewards/accuracy_reward/mean": 0.10282257944345474, "rewards/accuracy_reward/std": 0.30403366684913635, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 814.470703125, "completions/mean_terminated_length": 814.470703125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8445847913288385, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10591152900978235, "kl": 0.07373046875, "learning_rate": 1.4458637156377675e-06, "loss": 0.0097, "num_tokens": 1184518466.0, "reward": 2.19775390625, "reward_std": 0.21491894125938416, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 772.94140625, "completions/mean_terminated_length": 771.5577392578125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8449261756422293, "frac_reward_zero_std": 0.625, "grad_norm": 0.22840727142948025, "kl": 0.2042236328125, "learning_rate": 1.4396992387807574e-06, "loss": 0.0088, "num_tokens": 1185007460.0, "reward": 2.05615234375, "reward_std": 0.12663394212722778, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 741.421875, "completions/mean_terminated_length": 741.421875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8452675599556201, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09577129805128518, "kl": 0.083251953125, "learning_rate": 1.4335469117049772e-06, "loss": 0.0183, "num_tokens": 1185478780.0, "reward": 2.1142578125, "reward_std": 0.1364460438489914, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 804.62890625, "completions/mean_terminated_length": 804.62890625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.8456089442690108, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10545782425431824, "kl": 0.084228515625, "learning_rate": 1.427406743142533e-06, "loss": 0.0131, "num_tokens": 1185973838.0, "reward": 2.044921875, "reward_std": 0.15374106168746948, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 750.443359375, "completions/mean_terminated_length": 748.4481201171875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8459503285824016, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10974030433810199, "kl": 0.3094482421875, "learning_rate": 1.4212787418082673e-06, "loss": 0.0263, "num_tokens": 1186442577.0, "reward": 2.119140625, "reward_std": 0.18022498488426208, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.056256115436553955, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 763.87109375, "completions/mean_terminated_length": 760.2804565429688, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8462917128957924, "frac_reward_zero_std": 0.46875, "grad_norm": 0.4762852134659619, "kl": 0.3072509765625, "learning_rate": 1.4151629163997582e-06, "loss": 0.0284, "num_tokens": 1186905823.0, "reward": 2.13720703125, "reward_std": 0.2128443419933319, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 790.12109375, "completions/mean_terminated_length": 790.12109375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8466330972091832, "frac_reward_zero_std": 0.625, "grad_norm": 0.09258163944369334, "kl": 0.0828857421875, "learning_rate": 1.4090592755972999e-06, "loss": 0.0094, "num_tokens": 1187391565.0, "reward": 2.076171875, "reward_std": 0.15713462233543396, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 817.84375, "completions/mean_terminated_length": 817.84375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.846974481522574, "frac_reward_zero_std": 0.625, "grad_norm": 0.11276694088116938, "kl": 0.0814208984375, "learning_rate": 1.402967828063897e-06, "loss": 0.0257, "num_tokens": 1187893501.0, "reward": 2.09765625, "reward_std": 0.14513742923736572, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 782.625, "completions/mean_terminated_length": 782.625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.8473158658359649, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09289756313972859, "kl": 0.079833984375, "learning_rate": 1.3968885824452405e-06, "loss": 0.0108, "num_tokens": 1188378989.0, "reward": 2.0546875, "reward_std": 0.1553596556186676, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 850.927734375, "completions/mean_terminated_length": 850.927734375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.8476572501493557, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12038141534476167, "kl": 0.079833984375, "learning_rate": 1.3908215473697117e-06, "loss": 0.0099, "num_tokens": 1188903240.0, "reward": 2.14501953125, "reward_std": 0.2264188826084137, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 735.796875, "completions/mean_terminated_length": 735.796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8479986344627465, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12117411645609294, "kl": 0.0843505859375, "learning_rate": 1.3847667314483593e-06, "loss": 0.0231, "num_tokens": 1189364992.0, "reward": 2.14794921875, "reward_std": 0.1757628321647644, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 780.982421875, "completions/mean_terminated_length": 780.982421875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8483400187761372, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11345022588391065, "kl": 0.0870361328125, "learning_rate": 1.378724143274881e-06, "loss": 0.01, "num_tokens": 1189848423.0, "reward": 2.048828125, "reward_std": 0.16483424603939056, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 792.068359375, "completions/mean_terminated_length": 792.068359375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.848681403089528, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11590887449551131, "kl": 0.0845947265625, "learning_rate": 1.37269379142563e-06, "loss": 0.0101, "num_tokens": 1190330810.0, "reward": 2.12548828125, "reward_std": 0.2268274426460266, "rewards/accuracy_reward/mean": 0.1552419364452362, "rewards/accuracy_reward/std": 0.36250078678131104, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 739.03515625, "completions/mean_terminated_length": 739.03515625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.8490227874029188, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11266281962214837, "kl": 0.0845947265625, "learning_rate": 1.366675684459583e-06, "loss": 0.0081, "num_tokens": 1190800444.0, "reward": 2.1748046875, "reward_std": 0.19566836953163147, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 825.1484375, "completions/mean_terminated_length": 822.75537109375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.8493641717163096, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10102281052748802, "kl": 0.0760498046875, "learning_rate": 1.3606698309183487e-06, "loss": 0.0211, "num_tokens": 1191299672.0, "reward": 2.09716796875, "reward_std": 0.15511180460453033, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 744.259765625, "completions/mean_terminated_length": 744.259765625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8497055560297004, "frac_reward_zero_std": 0.625, "grad_norm": 0.10603273802073777, "kl": 0.0858154296875, "learning_rate": 1.3546762393261314e-06, "loss": 0.0094, "num_tokens": 1191764125.0, "reward": 2.12646484375, "reward_std": 0.14373424649238586, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 834.455078125, "completions/mean_terminated_length": 834.455078125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8500469403430913, "frac_reward_zero_std": 0.59375, "grad_norm": 0.08826726829897123, "kl": 0.08251953125, "learning_rate": 1.3486949181897435e-06, "loss": 0.0148, "num_tokens": 1192268294.0, "reward": 2.14453125, "reward_std": 0.17563983798027039, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 743.908203125, "completions/mean_terminated_length": 743.908203125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8503883246564821, "frac_reward_zero_std": 0.625, "grad_norm": 0.1029982722046985, "kl": 0.083984375, "learning_rate": 1.342725875998574e-06, "loss": 0.0241, "num_tokens": 1192736599.0, "reward": 2.15869140625, "reward_std": 0.1430310755968094, "rewards/accuracy_reward/mean": 0.173828125, "rewards/accuracy_reward/std": 0.3793322443962097, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 789.30078125, "completions/mean_terminated_length": 789.30078125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8507297089698729, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11915559151369885, "kl": 0.083251953125, "learning_rate": 1.3367691212245838e-06, "loss": 0.0019, "num_tokens": 1193230801.0, "reward": 2.111328125, "reward_std": 0.2037445604801178, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 755.4921875, "completions/mean_terminated_length": 755.4921875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8510710932832636, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11238913774483589, "kl": 0.08349609375, "learning_rate": 1.330824662322302e-06, "loss": 0.0137, "num_tokens": 1193697613.0, "reward": 2.01318359375, "reward_std": 0.13472287356853485, "rewards/accuracy_reward/mean": 0.033203125, "rewards/accuracy_reward/std": 0.17934183776378632, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 741.1640625, "completions/mean_terminated_length": 741.1640625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8514124775966544, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11523453122932543, "kl": 0.0833740234375, "learning_rate": 1.3248925077287954e-06, "loss": 0.0128, "num_tokens": 1194159409.0, "reward": 2.1533203125, "reward_std": 0.15662389993667603, "rewards/accuracy_reward/mean": 0.16330644488334656, "rewards/accuracy_reward/std": 0.37001824378967285, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 728.50390625, "completions/mean_terminated_length": 728.50390625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8517538619100452, "frac_reward_zero_std": 0.5, "grad_norm": 0.12395767535166766, "kl": 0.085205078125, "learning_rate": 1.318972665863678e-06, "loss": 0.0212, "num_tokens": 1194610099.0, "reward": 2.123046875, "reward_std": 0.18147119879722595, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.3546403646469116, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 729.5859375, "completions/mean_terminated_length": 729.5859375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.852095246223436, "frac_reward_zero_std": 0.5, "grad_norm": 0.11557749328747166, "kl": 0.082275390625, "learning_rate": 1.3130651451290798e-06, "loss": 0.017, "num_tokens": 1195072111.0, "reward": 2.09619140625, "reward_std": 0.16496524214744568, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 777.6796875, "completions/mean_terminated_length": 777.6796875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8524366305368268, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10176867051765355, "kl": 0.082275390625, "learning_rate": 1.3071699539096438e-06, "loss": 0.0149, "num_tokens": 1195552331.0, "reward": 2.08056640625, "reward_std": 0.14999297261238098, "rewards/accuracy_reward/mean": 0.09677419066429138, "rewards/accuracy_reward/std": 0.2959485352039337, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 738.2734375, "completions/mean_terminated_length": 738.2734375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8527780148502176, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10515233582713875, "kl": 0.0953369140625, "learning_rate": 1.3012871005725203e-06, "loss": 0.0232, "num_tokens": 1196025511.0, "reward": 2.04541015625, "reward_std": 0.1275293529033661, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 805.89453125, "completions/mean_terminated_length": 805.89453125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.8531193991636085, "frac_reward_zero_std": 0.5, "grad_norm": 0.10862035660907166, "kl": 0.08056640625, "learning_rate": 1.295416593467338e-06, "loss": 0.0066, "num_tokens": 1196525841.0, "reward": 2.06298828125, "reward_std": 0.17750775814056396, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 801.73046875, "completions/mean_terminated_length": 801.73046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.8534607834769993, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10452977720150176, "kl": 0.080810546875, "learning_rate": 1.2895584409262141e-06, "loss": 0.0192, "num_tokens": 1197029255.0, "reward": 2.05712890625, "reward_std": 0.13296949863433838, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 778.0078125, "completions/mean_terminated_length": 778.0078125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.85380216779039, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11526852326410125, "kl": 0.0889892578125, "learning_rate": 1.2837126512637198e-06, "loss": 0.0195, "num_tokens": 1197509931.0, "reward": 2.0302734375, "reward_std": 0.15043386816978455, "rewards/accuracy_reward/mean": 0.0463709682226181, "rewards/accuracy_reward/std": 0.21049949526786804, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1691.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 754.423828125, "completions/mean_terminated_length": 754.423828125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8541435521037808, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1165140710984976, "kl": 0.088623046875, "learning_rate": 1.2778792327768873e-06, "loss": 0.0167, "num_tokens": 1197981460.0, "reward": 2.06982421875, "reward_std": 0.15931977331638336, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.284611314535141, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 731.7890625, "completions/mean_terminated_length": 731.7890625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8544849364171716, "frac_reward_zero_std": 0.5, "grad_norm": 0.1306160311365418, "kl": 0.088623046875, "learning_rate": 1.2720581937451871e-06, "loss": 0.0176, "num_tokens": 1198433928.0, "reward": 2.14501953125, "reward_std": 0.19253653287887573, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.3681698739528656, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 735.8359375, "completions/mean_terminated_length": 735.0587158203125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8548263207305624, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12340770248231762, "kl": 0.12353515625, "learning_rate": 1.2662495424305165e-06, "loss": 0.0064, "num_tokens": 1198885652.0, "reward": 2.00927734375, "reward_std": 0.11875774711370468, "rewards/accuracy_reward/mean": 0.029296875, "rewards/accuracy_reward/std": 0.16880230605602264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1921.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 816.62890625, "completions/mean_terminated_length": 816.62890625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8551677050439532, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13321031293413269, "kl": 0.0816650390625, "learning_rate": 1.2604532870771968e-06, "loss": 0.0153, "num_tokens": 1199379254.0, "reward": 2.0908203125, "reward_std": 0.19126036763191223, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 762.78125, "completions/mean_terminated_length": 762.78125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.855509089357344, "frac_reward_zero_std": 0.53125, "grad_norm": 0.09702348187173121, "kl": 0.08056640625, "learning_rate": 1.2546694359119494e-06, "loss": 0.0132, "num_tokens": 1199853046.0, "reward": 2.09814453125, "reward_std": 0.18396835029125214, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 751.818359375, "completions/mean_terminated_length": 751.818359375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8558504736707349, "frac_reward_zero_std": 0.65625, "grad_norm": 4.573764411949578, "kl": 0.08984375, "learning_rate": 1.2488979971438974e-06, "loss": 0.0117, "num_tokens": 1200322729.0, "reward": 2.02001953125, "reward_std": 0.10461369156837463, "rewards/accuracy_reward/mean": 0.03427419438958168, "rewards/accuracy_reward/std": 0.18211629986763, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 708.416015625, "completions/mean_terminated_length": 708.416015625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8561918579841257, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12770385110586474, "kl": 0.0914306640625, "learning_rate": 1.2431389789645399e-06, "loss": 0.0156, "num_tokens": 1200768942.0, "reward": 2.099609375, "reward_std": 0.15124008059501648, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 744.173828125, "completions/mean_terminated_length": 744.173828125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8565332422975165, "frac_reward_zero_std": 0.5, "grad_norm": 0.11643588322796733, "kl": 0.08642578125, "learning_rate": 1.237392389547748e-06, "loss": 0.0212, "num_tokens": 1201232983.0, "reward": 2.12890625, "reward_std": 0.16618786752223969, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 792.947265625, "completions/mean_terminated_length": 792.947265625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.8568746266109072, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1109632857909066, "kl": 0.0810546875, "learning_rate": 1.2316582370497577e-06, "loss": 0.017, "num_tokens": 1201728620.0, "reward": 2.1513671875, "reward_std": 0.17429645359516144, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 745.5234375, "completions/mean_terminated_length": 744.6614379882812, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.857216010924298, "frac_reward_zero_std": 0.625, "grad_norm": 0.20598723698460747, "kl": 0.15283203125, "learning_rate": 1.2259365296091463e-06, "loss": 0.0184, "num_tokens": 1202204600.0, "reward": 2.05029296875, "reward_std": 0.12207914143800735, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 759.70703125, "completions/mean_terminated_length": 759.70703125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8575573952376888, "frac_reward_zero_std": 0.5, "grad_norm": 0.11305735274704809, "kl": 0.0858154296875, "learning_rate": 1.2202272753468358e-06, "loss": 0.0242, "num_tokens": 1202672418.0, "reward": 2.08154296875, "reward_std": 0.17393603920936584, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 766.67578125, "completions/mean_terminated_length": 766.67578125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8578987795510796, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14550408194008674, "kl": 0.0908203125, "learning_rate": 1.214530482366063e-06, "loss": 0.0161, "num_tokens": 1203151260.0, "reward": 2.119140625, "reward_std": 0.20710058510303497, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 778.263671875, "completions/mean_terminated_length": 778.263671875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8582401638644704, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10184305468607371, "kl": 0.0821533203125, "learning_rate": 1.2088461587523881e-06, "loss": 0.0076, "num_tokens": 1203636691.0, "reward": 2.08642578125, "reward_std": 0.15946167707443237, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 738.1484375, "completions/mean_terminated_length": 738.1484375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8585815481778613, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12026817946396855, "kl": 0.0845947265625, "learning_rate": 1.2031743125736672e-06, "loss": 0.0159, "num_tokens": 1204096447.0, "reward": 2.11962890625, "reward_std": 0.16265693306922913, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 766.470703125, "completions/mean_terminated_length": 766.470703125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8589229324912521, "frac_reward_zero_std": 0.5, "grad_norm": 0.11844729000646263, "kl": 0.082763671875, "learning_rate": 1.1975149518800455e-06, "loss": 0.0105, "num_tokens": 1204573120.0, "reward": 2.0361328125, "reward_std": 0.1890084594488144, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 778.634765625, "completions/mean_terminated_length": 778.634765625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8592643168046429, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09555091296299889, "kl": 0.0830078125, "learning_rate": 1.1918680847039553e-06, "loss": -0.0019, "num_tokens": 1205052837.0, "reward": 2.08203125, "reward_std": 0.10178303718566895, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 778.48828125, "completions/mean_terminated_length": 778.48828125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.8596057011180336, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12345271203958902, "kl": 0.0877685546875, "learning_rate": 1.186233719060087e-06, "loss": 0.0144, "num_tokens": 1205537359.0, "reward": 2.19140625, "reward_std": 0.2067967653274536, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 783.0703125, "completions/mean_terminated_length": 783.0703125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8599470854314244, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09962612442771822, "kl": 0.0806884765625, "learning_rate": 1.1806118629453978e-06, "loss": 0.0044, "num_tokens": 1206028003.0, "reward": 2.09912109375, "reward_std": 0.16433623433113098, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.036414988338947296, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 752.400390625, "completions/mean_terminated_length": 749.864990234375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8602884697448152, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11087670268760827, "kl": 0.0821533203125, "learning_rate": 1.1750025243390783e-06, "loss": 0.009, "num_tokens": 1206497488.0, "reward": 2.0615234375, "reward_std": 0.16430869698524475, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 747.22265625, "completions/mean_terminated_length": 747.22265625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.860629854058206, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13254329138489657, "kl": 0.0869140625, "learning_rate": 1.1694057112025635e-06, "loss": 0.0101, "num_tokens": 1206966162.0, "reward": 2.15673828125, "reward_std": 0.21332065761089325, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.37069445848464966, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 842.5546875, "completions/mean_terminated_length": 842.5546875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8609712383715968, "frac_reward_zero_std": 0.625, "grad_norm": 0.08741403121724148, "kl": 0.076171875, "learning_rate": 1.1638214314795038e-06, "loss": 0.0121, "num_tokens": 1207497854.0, "reward": 2.07763671875, "reward_std": 0.13901181519031525, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 742.36328125, "completions/mean_terminated_length": 742.36328125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8613126226849876, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11994844814770027, "kl": 0.0802001953125, "learning_rate": 1.158249693095761e-06, "loss": 0.0059, "num_tokens": 1207960120.0, "reward": 2.1533203125, "reward_std": 0.23944051563739777, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3810062110424042, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 731.515625, "completions/mean_terminated_length": 731.515625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8616540069983785, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10349524520954134, "kl": 0.0809326171875, "learning_rate": 1.1526905039594028e-06, "loss": 0.0096, "num_tokens": 1208420384.0, "reward": 2.19189453125, "reward_std": 0.14441193640232086, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 787.607421875, "completions/mean_terminated_length": 787.607421875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8619953913117693, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11361066825912138, "kl": 0.08203125, "learning_rate": 1.1471438719606764e-06, "loss": 0.0168, "num_tokens": 1208902423.0, "reward": 2.1337890625, "reward_std": 0.22238485515117645, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 778.43359375, "completions/mean_terminated_length": 778.43359375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.86233677562516, "frac_reward_zero_std": 0.625, "grad_norm": 0.09750496823530214, "kl": 0.0802001953125, "learning_rate": 1.141609804972017e-06, "loss": 0.0081, "num_tokens": 1209390085.0, "reward": 2.12548828125, "reward_std": 0.12214449048042297, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 750.517578125, "completions/mean_terminated_length": 750.517578125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.8626781599385508, "frac_reward_zero_std": 0.625, "grad_norm": 0.0900617290925569, "kl": 0.0738525390625, "learning_rate": 1.136088310848017e-06, "loss": 0.0171, "num_tokens": 1209861054.0, "reward": 2.05908203125, "reward_std": 0.12900277972221375, "rewards/accuracy_reward/mean": 0.07056451588869095, "rewards/accuracy_reward/std": 0.25635457038879395, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 762.310546875, "completions/mean_terminated_length": 762.310546875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8630195442519416, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1163321841403952, "kl": 0.0858154296875, "learning_rate": 1.1305793974254265e-06, "loss": 0.0033, "num_tokens": 1210352701.0, "reward": 2.05810546875, "reward_std": 0.14542236924171448, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 762.921875, "completions/mean_terminated_length": 762.921875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8633609285653324, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10348132920801509, "kl": 0.0826416015625, "learning_rate": 1.1250830725231443e-06, "loss": 0.0119, "num_tokens": 1210825125.0, "reward": 2.11181640625, "reward_std": 0.15628042817115784, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 729.72265625, "completions/mean_terminated_length": 729.72265625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8637023128787232, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12891524775151122, "kl": 0.0921630859375, "learning_rate": 1.1195993439421938e-06, "loss": 0.0134, "num_tokens": 1211276199.0, "reward": 2.13232421875, "reward_std": 0.16815298795700073, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3526190221309662, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 713.791015625, "completions/mean_terminated_length": 713.791015625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.864043697192114, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09971594960558698, "kl": 0.087158203125, "learning_rate": 1.1141282194657288e-06, "loss": 0.0209, "num_tokens": 1211722620.0, "reward": 2.07373046875, "reward_std": 0.09749916195869446, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1655.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 803.41015625, "completions/mean_terminated_length": 803.41015625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.8643850815055049, "frac_reward_zero_std": 0.625, "grad_norm": 0.09409003159413909, "kl": 0.087158203125, "learning_rate": 1.108669706859007e-06, "loss": 0.0084, "num_tokens": 1212220590.0, "reward": 2.12451171875, "reward_std": 0.15415123105049133, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 773.021484375, "completions/mean_terminated_length": 773.021484375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.8647264658188957, "frac_reward_zero_std": 0.5, "grad_norm": 0.11659018959668614, "kl": 0.086669921875, "learning_rate": 1.1032238138693929e-06, "loss": 0.0061, "num_tokens": 1212694249.0, "reward": 2.0751953125, "reward_std": 0.18161192536354065, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 808.576171875, "completions/mean_terminated_length": 808.576171875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8650678501322864, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10729075698408276, "kl": 0.084716796875, "learning_rate": 1.0977905482263297e-06, "loss": 0.012, "num_tokens": 1213192240.0, "reward": 2.07421875, "reward_std": 0.16694246232509613, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 774.970703125, "completions/mean_terminated_length": 774.970703125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8654092344456772, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1070491275500019, "kl": 0.082763671875, "learning_rate": 1.092369917641345e-06, "loss": 0.0066, "num_tokens": 1213681537.0, "reward": 2.0771484375, "reward_std": 0.17007192969322205, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 760.84375, "completions/mean_terminated_length": 758.3248291015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.865750618759068, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12170207528212593, "kl": 0.0853271484375, "learning_rate": 1.086961929808038e-06, "loss": 0.0186, "num_tokens": 1214154225.0, "reward": 2.07568359375, "reward_std": 0.18786102533340454, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 810.8671875, "completions/mean_terminated_length": 810.8671875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8660920030724588, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10096259769551102, "kl": 0.0772705078125, "learning_rate": 1.0815665924020513e-06, "loss": 0.0204, "num_tokens": 1214655069.0, "reward": 2.1982421875, "reward_std": 0.16022302210330963, "rewards/accuracy_reward/mean": 0.21370968222618103, "rewards/accuracy_reward/std": 0.41033804416656494, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 738.900390625, "completions/mean_terminated_length": 738.900390625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8664333873858496, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11858291337612349, "kl": 0.0826416015625, "learning_rate": 1.0761839130810858e-06, "loss": 0.0266, "num_tokens": 1215118378.0, "reward": 2.19189453125, "reward_std": 0.2264479100704193, "rewards/accuracy_reward/mean": 0.20967741310596466, "rewards/accuracy_reward/std": 0.4074893891811371, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 832.21875, "completions/mean_terminated_length": 832.21875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8667747716992404, "frac_reward_zero_std": 0.5, "grad_norm": 0.1006170548183312, "kl": 0.0787353515625, "learning_rate": 1.0708138994848672e-06, "loss": 0.0073, "num_tokens": 1215628314.0, "reward": 2.0908203125, "reward_std": 0.18482941389083862, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 760.205078125, "completions/mean_terminated_length": 760.205078125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8671161560126313, "frac_reward_zero_std": 0.625, "grad_norm": 0.09770503899441182, "kl": 0.0814208984375, "learning_rate": 1.0654565592351485e-06, "loss": 0.0114, "num_tokens": 1216098243.0, "reward": 2.123046875, "reward_std": 0.15552091598510742, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 763.27734375, "completions/mean_terminated_length": 763.27734375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8674575403260221, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08097702526294535, "kl": 0.087158203125, "learning_rate": 1.0601118999356907e-06, "loss": 0.0045, "num_tokens": 1216576481.0, "reward": 2.083984375, "reward_std": 0.09644509106874466, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 796.533203125, "completions/mean_terminated_length": 796.533203125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8677989246394128, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10610877145746242, "kl": 0.0810546875, "learning_rate": 1.0547799291722628e-06, "loss": 0.0179, "num_tokens": 1217081890.0, "reward": 2.04296875, "reward_std": 0.16628488898277283, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 782.77734375, "completions/mean_terminated_length": 782.77734375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8681403089528036, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10540833478043375, "kl": 0.07861328125, "learning_rate": 1.049460654512625e-06, "loss": 0.0136, "num_tokens": 1217575760.0, "reward": 2.1865234375, "reward_std": 0.16358016431331635, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 815.94921875, "completions/mean_terminated_length": 815.94921875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.8684816932661944, "frac_reward_zero_std": 0.625, "grad_norm": 0.09655482258423755, "kl": 0.0809326171875, "learning_rate": 1.0441540835065101e-06, "loss": 0.0018, "num_tokens": 1218081110.0, "reward": 2.0166015625, "reward_std": 0.12552374601364136, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 822.544921875, "completions/mean_terminated_length": 822.544921875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8688230775795852, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09863159380095043, "kl": 0.09033203125, "learning_rate": 1.0388602236856282e-06, "loss": 0.0224, "num_tokens": 1218579005.0, "reward": 2.0068359375, "reward_std": 0.13069820404052734, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 793.576171875, "completions/mean_terminated_length": 793.576171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.869164461892976, "frac_reward_zero_std": 0.375, "grad_norm": 0.13371970451661613, "kl": 0.082763671875, "learning_rate": 1.033579082563645e-06, "loss": 0.004, "num_tokens": 1219077524.0, "reward": 2.16357421875, "reward_std": 0.21563366055488586, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.032885149121284485, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 791.25390625, "completions/mean_terminated_length": 791.25390625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8695058462063668, "frac_reward_zero_std": 0.625, "grad_norm": 0.13530097912813197, "kl": 0.0921630859375, "learning_rate": 1.028310667636172e-06, "loss": 0.0046, "num_tokens": 1219568502.0, "reward": 2.0283203125, "reward_std": 0.13550619781017303, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 789.015625, "completions/mean_terminated_length": 784.0784912109375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.8698472305197577, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12869342319748653, "kl": 0.0853271484375, "learning_rate": 1.0230549863807614e-06, "loss": 0.0182, "num_tokens": 1220056814.0, "reward": 2.11767578125, "reward_std": 0.2093929499387741, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 792.404296875, "completions/mean_terminated_length": 792.404296875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8701886148331485, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1085705516988293, "kl": 0.08056640625, "learning_rate": 1.0178120462568908e-06, "loss": 0.0126, "num_tokens": 1220551869.0, "reward": 2.10009765625, "reward_std": 0.17892129719257355, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 834.19921875, "completions/mean_terminated_length": 831.8238525390625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.8705299991465392, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10583972837965086, "kl": 0.0765380859375, "learning_rate": 1.0125818547059584e-06, "loss": 0.0147, "num_tokens": 1221064595.0, "reward": 2.099609375, "reward_std": 0.19126150012016296, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 719.685546875, "completions/mean_terminated_length": 719.685546875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.87087138345993, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13443950705026514, "kl": 0.0965576171875, "learning_rate": 1.0073644191512598e-06, "loss": 0.005, "num_tokens": 1221503218.0, "reward": 2.09326171875, "reward_std": 0.23135003447532654, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.03936556726694107, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 736.826171875, "completions/mean_terminated_length": 736.826171875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8712127677733208, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11369272443987738, "kl": 0.0885009765625, "learning_rate": 1.002159746997996e-06, "loss": 0.0065, "num_tokens": 1221970345.0, "reward": 2.083984375, "reward_std": 0.18401885032653809, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 855.44140625, "completions/mean_terminated_length": 855.44140625, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.8715541520867116, "frac_reward_zero_std": 0.625, "grad_norm": 0.09709234021639998, "kl": 0.076171875, "learning_rate": 9.96967845633241e-07, "loss": 0.004, "num_tokens": 1222491739.0, "reward": 2.029296875, "reward_std": 0.13320392370224, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.2029850035905838, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 790.763671875, "completions/mean_terminated_length": 790.763671875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8718955364001024, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11089729071652511, "kl": 0.081298828125, "learning_rate": 9.917887224259537e-07, "loss": 0.0073, "num_tokens": 1222985922.0, "reward": 2.0927734375, "reward_std": 0.18466541171073914, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 917.134765625, "completions/mean_terminated_length": 914.9216918945312, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.8722369207134932, "frac_reward_zero_std": 0.625, "grad_norm": 0.08950382401653835, "kl": 0.07177734375, "learning_rate": 9.866223847269486e-07, "loss": 0.0167, "num_tokens": 1223543415.0, "reward": 2.07568359375, "reward_std": 0.1474985033273697, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 764.18359375, "completions/mean_terminated_length": 761.6712036132812, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.872578305026884, "frac_reward_zero_std": 0.5, "grad_norm": 0.12226394079698741, "kl": 0.0826416015625, "learning_rate": 9.814688398689e-07, "loss": 0.0226, "num_tokens": 1224026629.0, "reward": 2.1201171875, "reward_std": 0.1859002262353897, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.3464137017726898, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 759.416015625, "completions/mean_terminated_length": 759.416015625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8729196893402749, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10265294739362629, "kl": 0.08056640625, "learning_rate": 9.7632809516632e-07, "loss": 0.0026, "num_tokens": 1224494666.0, "reward": 2.08203125, "reward_std": 0.17892923951148987, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04930410906672478, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 744.712890625, "completions/mean_terminated_length": 744.712890625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8732610736536656, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10719404887085428, "kl": 0.089111328125, "learning_rate": 9.712001579155593e-07, "loss": 0.0126, "num_tokens": 1224968103.0, "reward": 2.08251953125, "reward_std": 0.15350550413131714, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 822.6171875, "completions/mean_terminated_length": 822.6171875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.8736024579670564, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11154738870605545, "kl": 0.0814208984375, "learning_rate": 9.660850353947836e-07, "loss": 0.0098, "num_tokens": 1225472035.0, "reward": 2.0859375, "reward_std": 0.1909281611442566, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 726.478515625, "completions/mean_terminated_length": 726.478515625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8739438422804472, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13043802811256938, "kl": 0.0811767578125, "learning_rate": 9.609827348639722e-07, "loss": 0.0224, "num_tokens": 1225930824.0, "reward": 2.17529296875, "reward_std": 0.2486654669046402, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.39980348944664, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 755.1484375, "completions/mean_terminated_length": 755.1484375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.874285226593838, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12992775776772603, "kl": 0.08837890625, "learning_rate": 9.558932635649132e-07, "loss": 0.0212, "num_tokens": 1226395316.0, "reward": 2.09228515625, "reward_std": 0.21472997963428497, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 755.9921875, "completions/mean_terminated_length": 755.9921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8746266109072288, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12226124887533849, "kl": 0.0775146484375, "learning_rate": 9.508166287211739e-07, "loss": 0.0141, "num_tokens": 1226866576.0, "reward": 2.138671875, "reward_std": 0.24129945039749146, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 705.580078125, "completions/mean_terminated_length": 705.580078125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8749679952206196, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12663849127394372, "kl": 0.085693359375, "learning_rate": 9.457528375381131e-07, "loss": 0.0152, "num_tokens": 1227312169.0, "reward": 2.109375, "reward_std": 0.18940851092338562, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939462661743, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 785.01953125, "completions/mean_terminated_length": 782.9080200195312, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8753093795340104, "frac_reward_zero_std": 0.625, "grad_norm": 0.19280082273908372, "kl": 0.2994384765625, "learning_rate": 9.407018972028559e-07, "loss": 0.0233, "num_tokens": 1227818611.0, "reward": 2.09912109375, "reward_std": 0.1324743628501892, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 811.29296875, "completions/mean_terminated_length": 808.872802734375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.8756507638474013, "frac_reward_zero_std": 0.5, "grad_norm": 0.10586817983425373, "kl": 0.07958984375, "learning_rate": 9.356638148842856e-07, "loss": 0.0086, "num_tokens": 1228315993.0, "reward": 2.14306640625, "reward_std": 0.21292644739151, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 773.89453125, "completions/mean_terminated_length": 771.4011840820312, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.875992148160792, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11537980941148826, "kl": 0.079833984375, "learning_rate": 9.306385977330412e-07, "loss": 0.0225, "num_tokens": 1228798435.0, "reward": 2.072265625, "reward_std": 0.1651635766029358, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 849.0078125, "completions/mean_terminated_length": 844.305908203125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.8763335324741828, "frac_reward_zero_std": 0.5, "grad_norm": 0.10140555726045397, "kl": 0.0771484375, "learning_rate": 9.256262528814975e-07, "loss": 0.0379, "num_tokens": 1229317639.0, "reward": 2.1142578125, "reward_std": 0.20872336626052856, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 795.265625, "completions/mean_terminated_length": 795.265625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8766749167875736, "frac_reward_zero_std": 0.625, "grad_norm": 0.10057911162305666, "kl": 0.0792236328125, "learning_rate": 9.206267874437635e-07, "loss": 0.0075, "num_tokens": 1229802879.0, "reward": 2.07568359375, "reward_std": 0.12497886270284653, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 787.62890625, "completions/mean_terminated_length": 787.62890625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8770163011009644, "frac_reward_zero_std": 0.625, "grad_norm": 0.1065612666623614, "kl": 0.08154296875, "learning_rate": 9.156402085156635e-07, "loss": 0.0258, "num_tokens": 1230282769.0, "reward": 2.05322265625, "reward_std": 0.13731786608695984, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 831.484375, "completions/mean_terminated_length": 831.484375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8773576854143552, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1009483679208864, "kl": 0.08251953125, "learning_rate": 9.106665231747369e-07, "loss": 0.0137, "num_tokens": 1230797785.0, "reward": 2.09033203125, "reward_std": 0.1447124481201172, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 768.1484375, "completions/mean_terminated_length": 768.1484375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.877699069727746, "frac_reward_zero_std": 0.71875, "grad_norm": 0.09380839404028062, "kl": 0.0869140625, "learning_rate": 9.057057384802182e-07, "loss": 0.0053, "num_tokens": 1231265989.0, "reward": 2.017578125, "reward_std": 0.0903809443116188, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 847.544921875, "completions/mean_terminated_length": 838.092529296875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.8780404540411368, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11740150867001295, "kl": 0.08154296875, "learning_rate": 9.007578614730328e-07, "loss": 0.0208, "num_tokens": 1231790956.0, "reward": 2.0537109375, "reward_std": 0.1585150957107544, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17416280508041382, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1898.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 801.5546875, "completions/mean_terminated_length": 801.5546875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8783818383545277, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10076633666742127, "kl": 0.08251953125, "learning_rate": 8.958228991757911e-07, "loss": 0.0092, "num_tokens": 1232284040.0, "reward": 2.06494140625, "reward_std": 0.1356079876422882, "rewards/accuracy_reward/mean": 0.0786290317773819, "rewards/accuracy_reward/std": 0.26943066716194153, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 783.84375, "completions/mean_terminated_length": 783.84375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8787232226679184, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11207553079129348, "kl": 0.0855712890625, "learning_rate": 8.909008585927659e-07, "loss": -0.0008, "num_tokens": 1232775768.0, "reward": 2.10107421875, "reward_std": 0.17193078994750977, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 772.990234375, "completions/mean_terminated_length": 772.990234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8790646069813092, "frac_reward_zero_std": 0.5, "grad_norm": 0.11544664375863208, "kl": 0.080810546875, "learning_rate": 8.859917467098955e-07, "loss": 0.0036, "num_tokens": 1233254387.0, "reward": 2.13232421875, "reward_std": 0.16388481855392456, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 798.71875, "completions/mean_terminated_length": 796.7984008789062, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8794059912947, "frac_reward_zero_std": 0.4375, "grad_norm": 0.18791349426340312, "kl": 0.106201171875, "learning_rate": 8.810955704947666e-07, "loss": 0.02, "num_tokens": 1233742659.0, "reward": 2.1005859375, "reward_std": 0.1738668531179428, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 786.123046875, "completions/mean_terminated_length": 786.123046875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8797473756080908, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11106266475435027, "kl": 0.0831298828125, "learning_rate": 8.762123368966036e-07, "loss": 0.0077, "num_tokens": 1234226802.0, "reward": 2.103515625, "reward_std": 0.17130202054977417, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 810.828125, "completions/mean_terminated_length": 810.828125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.8800887599214816, "frac_reward_zero_std": 0.5, "grad_norm": 0.11554770442641944, "kl": 0.0894775390625, "learning_rate": 8.713420528462657e-07, "loss": 0.0123, "num_tokens": 1234745674.0, "reward": 2.04345703125, "reward_std": 0.18923452496528625, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 773.462890625, "completions/mean_terminated_length": 773.462890625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8804301442348724, "frac_reward_zero_std": 0.625, "grad_norm": 0.10621918374417452, "kl": 0.0797119140625, "learning_rate": 8.66484725256228e-07, "loss": 0.0088, "num_tokens": 1235224263.0, "reward": 2.12548828125, "reward_std": 0.13624483346939087, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 791.13671875, "completions/mean_terminated_length": 791.13671875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8807715285482632, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11705420061881007, "kl": 0.080322265625, "learning_rate": 8.616403610205814e-07, "loss": 0.0272, "num_tokens": 1235705725.0, "reward": 2.16943359375, "reward_std": 0.17601878941059113, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 776.25, "completions/mean_terminated_length": 776.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.881112912861654, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10817430303668968, "kl": 0.083251953125, "learning_rate": 8.568089670150115e-07, "loss": 0.0107, "num_tokens": 1236197213.0, "reward": 2.03369140625, "reward_std": 0.12981122732162476, "rewards/accuracy_reward/mean": 0.05040322616696358, "rewards/accuracy_reward/std": 0.21899642050266266, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 805.298828125, "completions/mean_terminated_length": 802.866943359375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8814542971750448, "frac_reward_zero_std": 0.75, "grad_norm": 0.07401961963975068, "kl": 0.0775146484375, "learning_rate": 8.519905500968028e-07, "loss": 0.0117, "num_tokens": 1236697910.0, "reward": 2.005859375, "reward_std": 0.08168989419937134, "rewards/accuracy_reward/mean": 0.017578125, "rewards/accuracy_reward/std": 0.13154059648513794, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 768.55078125, "completions/mean_terminated_length": 768.55078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8817956814884356, "frac_reward_zero_std": 0.5, "grad_norm": 0.11093689216705245, "kl": 0.0826416015625, "learning_rate": 8.47185117104814e-07, "loss": 0.0226, "num_tokens": 1237184048.0, "reward": 2.09033203125, "reward_std": 0.18145334720611572, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 778.4765625, "completions/mean_terminated_length": 778.4765625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8821370658018264, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11857332469484591, "kl": 0.08642578125, "learning_rate": 8.423926748594769e-07, "loss": 0.011, "num_tokens": 1237664740.0, "reward": 2.193359375, "reward_std": 0.2042330652475357, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 818.630859375, "completions/mean_terminated_length": 818.630859375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8824784501152172, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10101887251594842, "kl": 0.078857421875, "learning_rate": 8.376132301627915e-07, "loss": 0.014, "num_tokens": 1238160759.0, "reward": 2.08642578125, "reward_std": 0.15609848499298096, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 761.71484375, "completions/mean_terminated_length": 761.71484375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.882819834428608, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1123571484444886, "kl": 0.08642578125, "learning_rate": 8.328467897982994e-07, "loss": 0.0144, "num_tokens": 1238630197.0, "reward": 2.07470703125, "reward_std": 0.16537654399871826, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 750.322265625, "completions/mean_terminated_length": 750.322265625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8831612187419988, "frac_reward_zero_std": 0.625, "grad_norm": 0.10129671576398559, "kl": 0.088623046875, "learning_rate": 8.280933605310959e-07, "loss": 0.0142, "num_tokens": 1239096554.0, "reward": 2.046875, "reward_std": 0.1432940661907196, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 843.765625, "completions/mean_terminated_length": 840.300048828125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8835026030553896, "frac_reward_zero_std": 0.5, "grad_norm": 0.4166412353719321, "kl": 0.1658935546875, "learning_rate": 8.233529491078007e-07, "loss": 0.0282, "num_tokens": 1239608306.0, "reward": 2.0361328125, "reward_std": 0.16807982325553894, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 812.01953125, "completions/mean_terminated_length": 812.01953125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8838439873687804, "frac_reward_zero_std": 0.375, "grad_norm": 0.1330165262989648, "kl": 0.0909423828125, "learning_rate": 8.186255622565642e-07, "loss": 0.0244, "num_tokens": 1240112860.0, "reward": 2.1103515625, "reward_std": 0.22902342677116394, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 770.787109375, "completions/mean_terminated_length": 770.787109375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8841853716821712, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11521856599404036, "kl": 0.088134765625, "learning_rate": 8.139112066870458e-07, "loss": 0.0108, "num_tokens": 1240597663.0, "reward": 2.05419921875, "reward_std": 0.16543196141719818, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 801.302734375, "completions/mean_terminated_length": 800.0078125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.884526755995562, "frac_reward_zero_std": 0.53125, "grad_norm": 0.31619058664326943, "kl": 0.1390380859375, "learning_rate": 8.092098890904099e-07, "loss": 0.0234, "num_tokens": 1241083034.0, "reward": 2.1240234375, "reward_std": 0.16680464148521423, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 760.390625, "completions/mean_terminated_length": 760.390625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.8848681403089528, "frac_reward_zero_std": 0.5, "grad_norm": 0.11697547524034253, "kl": 0.084228515625, "learning_rate": 8.045216161393188e-07, "loss": 0.0179, "num_tokens": 1241547266.0, "reward": 2.1337890625, "reward_std": 0.1931840479373932, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 797.279296875, "completions/mean_terminated_length": 797.279296875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8852095246223436, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10933413449985525, "kl": 0.08154296875, "learning_rate": 7.998463944879153e-07, "loss": 0.0098, "num_tokens": 1242036929.0, "reward": 2.12939453125, "reward_std": 0.16849175095558167, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 745.0703125, "completions/mean_terminated_length": 745.0703125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8855509089357344, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10732643058777763, "kl": 0.0806884765625, "learning_rate": 7.951842307718271e-07, "loss": 0.0213, "num_tokens": 1242497605.0, "reward": 2.13427734375, "reward_std": 0.13938409090042114, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 779.67578125, "completions/mean_terminated_length": 779.67578125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8858922932491252, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11748816377791581, "kl": 0.0858154296875, "learning_rate": 7.905351316081377e-07, "loss": 0.0146, "num_tokens": 1242971583.0, "reward": 2.11328125, "reward_std": 0.1884143352508545, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 812.2265625, "completions/mean_terminated_length": 812.2265625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.886233677562516, "frac_reward_zero_std": 0.5, "grad_norm": 0.11792405196273632, "kl": 0.08056640625, "learning_rate": 7.858991035953945e-07, "loss": 0.0112, "num_tokens": 1243465683.0, "reward": 2.0849609375, "reward_std": 0.1882270872592926, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034629516303539276, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 747.5390625, "completions/mean_terminated_length": 747.5390625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8865750618759068, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1147295925008588, "kl": 0.086181640625, "learning_rate": 7.812761533135926e-07, "loss": 0.0097, "num_tokens": 1243930727.0, "reward": 2.04296875, "reward_std": 0.13880911469459534, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 782.423828125, "completions/mean_terminated_length": 782.423828125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8869164461892975, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11128399824550003, "kl": 0.083251953125, "learning_rate": 7.766662873241615e-07, "loss": 0.0091, "num_tokens": 1244411680.0, "reward": 2.0673828125, "reward_std": 0.13732650876045227, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 699.4609375, "completions/mean_terminated_length": 699.4609375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8872578305026884, "frac_reward_zero_std": 0.625, "grad_norm": 0.10848399339900397, "kl": 0.084716796875, "learning_rate": 7.720695121699673e-07, "loss": 0.0105, "num_tokens": 1244853404.0, "reward": 2.08837890625, "reward_std": 0.12686923146247864, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 791.21484375, "completions/mean_terminated_length": 791.21484375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8875992148160792, "frac_reward_zero_std": 0.5, "grad_norm": 0.10325906710633044, "kl": 0.0791015625, "learning_rate": 7.674858343752867e-07, "loss": 0.0165, "num_tokens": 1245337594.0, "reward": 2.0791015625, "reward_std": 0.18225236237049103, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 799.548828125, "completions/mean_terminated_length": 799.548828125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.88794059912947, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10366066918365438, "kl": 0.079833984375, "learning_rate": 7.629152604458157e-07, "loss": 0.0099, "num_tokens": 1245826035.0, "reward": 2.0556640625, "reward_std": 0.15157338976860046, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 766.767578125, "completions/mean_terminated_length": 764.26025390625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8882819834428608, "frac_reward_zero_std": 0.5, "grad_norm": 0.11823944157436203, "kl": 0.0780029296875, "learning_rate": 7.583577968686473e-07, "loss": 0.0157, "num_tokens": 1246299292.0, "reward": 2.19677734375, "reward_std": 0.2039279043674469, "rewards/accuracy_reward/mean": 0.2177419364452362, "rewards/accuracy_reward/std": 0.41312772035598755, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 765.4140625, "completions/mean_terminated_length": 765.4140625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8886233677562516, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10438348621359789, "kl": 0.083984375, "learning_rate": 7.538134501122652e-07, "loss": 0.013, "num_tokens": 1246771440.0, "reward": 2.1015625, "reward_std": 0.1599941998720169, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 765.541015625, "completions/mean_terminated_length": 765.541015625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8889647520696424, "frac_reward_zero_std": 0.5, "grad_norm": 0.11504084360320671, "kl": 0.08544921875, "learning_rate": 7.492822266265409e-07, "loss": 0.0165, "num_tokens": 1247241157.0, "reward": 2.16748046875, "reward_std": 0.19984078407287598, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.3937928080558777, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 721.21484375, "completions/mean_terminated_length": 721.21484375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8893061363830332, "frac_reward_zero_std": 0.5, "grad_norm": 0.12067501907892475, "kl": 0.08642578125, "learning_rate": 7.447641328427146e-07, "loss": 0.0125, "num_tokens": 1247690995.0, "reward": 2.1435546875, "reward_std": 0.17955367267131805, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 788.703125, "completions/mean_terminated_length": 788.703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.889647520696424, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12006246924749613, "kl": 0.0775146484375, "learning_rate": 7.402591751733989e-07, "loss": 0.0235, "num_tokens": 1248172651.0, "reward": 2.18896484375, "reward_std": 0.2165784239768982, "rewards/accuracy_reward/mean": 0.212890625, "rewards/accuracy_reward/std": 0.409751296043396, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 772.5390625, "completions/mean_terminated_length": 772.5390625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8899889050098148, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11695887810250159, "kl": 0.0897216796875, "learning_rate": 7.357673600125526e-07, "loss": 0.0048, "num_tokens": 1248647855.0, "reward": 2.0830078125, "reward_std": 0.18357078731060028, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 833.46484375, "completions/mean_terminated_length": 833.46484375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8903302893232056, "frac_reward_zero_std": 0.59375, "grad_norm": 0.08604766645846693, "kl": 0.078125, "learning_rate": 7.31288693735489e-07, "loss": 0.0063, "num_tokens": 1249157485.0, "reward": 2.04931640625, "reward_std": 0.12523721158504486, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 765.861328125, "completions/mean_terminated_length": 765.861328125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8906716736365964, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11460876125652661, "kl": 0.08642578125, "learning_rate": 7.268231826988514e-07, "loss": 0.0182, "num_tokens": 1249628822.0, "reward": 2.1181640625, "reward_std": 0.19218982756137848, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 791.58203125, "completions/mean_terminated_length": 791.58203125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8910130579499872, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12164431971327312, "kl": 0.08154296875, "learning_rate": 7.223708332406187e-07, "loss": 0.0316, "num_tokens": 1250113456.0, "reward": 2.13232421875, "reward_std": 0.2141786515712738, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 770.71484375, "completions/mean_terminated_length": 770.71484375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.891354442263378, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09304748791271236, "kl": 0.0841064453125, "learning_rate": 7.179316516800894e-07, "loss": 0.0158, "num_tokens": 1250596382.0, "reward": 2.11865234375, "reward_std": 0.15423515439033508, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1643.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 773.087890625, "completions/mean_terminated_length": 773.087890625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8916958265767688, "frac_reward_zero_std": 0.625, "grad_norm": 0.1027426254934754, "kl": 0.07958984375, "learning_rate": 7.135056443178645e-07, "loss": 0.0112, "num_tokens": 1251070363.0, "reward": 2.0771484375, "reward_std": 0.13604073226451874, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 735.22265625, "completions/mean_terminated_length": 735.22265625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8920372108901596, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08841874534880098, "kl": 0.083251953125, "learning_rate": 7.09092817435858e-07, "loss": 0.0119, "num_tokens": 1251535981.0, "reward": 2.05517578125, "reward_std": 0.11217763274908066, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 786.44921875, "completions/mean_terminated_length": 786.44921875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8923785952035505, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09219049631489352, "kl": 0.080078125, "learning_rate": 7.046931772972632e-07, "loss": 0.0054, "num_tokens": 1252019955.0, "reward": 2.0849609375, "reward_std": 0.12536266446113586, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 756.59375, "completions/mean_terminated_length": 756.59375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8927199795169412, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11119992904582794, "kl": 0.080078125, "learning_rate": 7.003067301465704e-07, "loss": 0.0071, "num_tokens": 1252489203.0, "reward": 2.11181640625, "reward_std": 0.15916207432746887, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 802.0390625, "completions/mean_terminated_length": 802.0390625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.893061363830332, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11167234119871765, "kl": 0.08740234375, "learning_rate": 6.959334822095354e-07, "loss": 0.0133, "num_tokens": 1252988583.0, "reward": 2.04833984375, "reward_std": 0.16448864340782166, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 739.841796875, "completions/mean_terminated_length": 739.841796875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8934027481437228, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11827372331008328, "kl": 0.0863037109375, "learning_rate": 6.915734396931851e-07, "loss": 0.0139, "num_tokens": 1253447846.0, "reward": 2.04150390625, "reward_std": 0.14360681176185608, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 746.2734375, "completions/mean_terminated_length": 746.2734375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8937441324571136, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10950248612466225, "kl": 0.0814208984375, "learning_rate": 6.872266087858049e-07, "loss": 0.017, "num_tokens": 1253915362.0, "reward": 2.171875, "reward_std": 0.18406826257705688, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.3875311613082886, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 806.220703125, "completions/mean_terminated_length": 806.220703125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.8940855167705044, "frac_reward_zero_std": 0.625, "grad_norm": 0.10543261613571607, "kl": 0.0784912109375, "learning_rate": 6.828929956569219e-07, "loss": 0.0173, "num_tokens": 1254415395.0, "reward": 2.04638671875, "reward_std": 0.13450904190540314, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 835.888671875, "completions/mean_terminated_length": 833.5166015625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.8944269010838952, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1204734513292686, "kl": 0.0767822265625, "learning_rate": 6.785726064573117e-07, "loss": 0.0156, "num_tokens": 1254933418.0, "reward": 2.076171875, "reward_std": 0.24417033791542053, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 741.1328125, "completions/mean_terminated_length": 741.1328125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.894768285397286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11639885284497871, "kl": 0.0841064453125, "learning_rate": 6.742654473189725e-07, "loss": 0.0098, "num_tokens": 1255398222.0, "reward": 2.0458984375, "reward_std": 0.15146633982658386, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 704.41796875, "completions/mean_terminated_length": 704.41796875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8951096697106768, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10188869220734409, "kl": 0.0860595703125, "learning_rate": 6.699715243551319e-07, "loss": 0.0018, "num_tokens": 1255848596.0, "reward": 2.11572265625, "reward_std": 0.14330387115478516, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 832.0390625, "completions/mean_terminated_length": 832.0390625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8954510540240675, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12027618249985055, "kl": 0.08203125, "learning_rate": 6.65690843660225e-07, "loss": 0.0155, "num_tokens": 1256356360.0, "reward": 2.02587890625, "reward_std": 0.17624567449092865, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 747.845703125, "completions/mean_terminated_length": 747.845703125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8957924383374584, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08938594505133737, "kl": 0.08251953125, "learning_rate": 6.614234113098972e-07, "loss": 0.0133, "num_tokens": 1256819033.0, "reward": 2.09423828125, "reward_std": 0.09996902942657471, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 712.0546875, "completions/mean_terminated_length": 712.0546875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8961338226508492, "frac_reward_zero_std": 0.3125, "grad_norm": 0.14111418922409347, "kl": 0.0863037109375, "learning_rate": 6.571692333609892e-07, "loss": 0.0037, "num_tokens": 1257266197.0, "reward": 2.099609375, "reward_std": 0.25387945771217346, "rewards/accuracy_reward/mean": 0.13306452333927155, "rewards/accuracy_reward/std": 0.3399873375892639, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0347534641623497, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 795.5, "completions/mean_terminated_length": 795.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.89647520696424, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11926103406917325, "kl": 0.0809326171875, "learning_rate": 6.529283158515276e-07, "loss": 0.01, "num_tokens": 1257762917.0, "reward": 2.14892578125, "reward_std": 0.2445654422044754, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 794.9765625, "completions/mean_terminated_length": 794.9765625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.8968165912776308, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11267761457446974, "kl": 0.077880859375, "learning_rate": 6.487006648007188e-07, "loss": 0.0115, "num_tokens": 1258250441.0, "reward": 2.12451171875, "reward_std": 0.1749998927116394, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 770.87890625, "completions/mean_terminated_length": 770.87890625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.8971579755910216, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11358026194675275, "kl": 0.084228515625, "learning_rate": 6.444862862089385e-07, "loss": 0.0132, "num_tokens": 1258724331.0, "reward": 2.04541015625, "reward_std": 0.18280768394470215, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03300117328763008, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 767.078125, "completions/mean_terminated_length": 764.5714111328125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8974993599044124, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12239133235003671, "kl": 0.0838623046875, "learning_rate": 6.402851860577297e-07, "loss": 0.011, "num_tokens": 1259193779.0, "reward": 2.04736328125, "reward_std": 0.1868741661310196, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 746.650390625, "completions/mean_terminated_length": 746.650390625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8978407442178032, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13176165743707619, "kl": 0.082763671875, "learning_rate": 6.360973703097828e-07, "loss": 0.0156, "num_tokens": 1259665456.0, "reward": 2.08154296875, "reward_std": 0.2038860023021698, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 776.69140625, "completions/mean_terminated_length": 776.69140625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.898182128531194, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08345649442692789, "kl": 0.0802001953125, "learning_rate": 6.319228449089376e-07, "loss": 0.0183, "num_tokens": 1260144146.0, "reward": 2.083984375, "reward_std": 0.10768459737300873, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 771.119140625, "completions/mean_terminated_length": 771.119140625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8985235128445848, "frac_reward_zero_std": 0.5, "grad_norm": 0.11525617018791001, "kl": 0.0811767578125, "learning_rate": 6.277616157801724e-07, "loss": 0.034, "num_tokens": 1260623855.0, "reward": 2.10009765625, "reward_std": 0.17999513447284698, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 735.927734375, "completions/mean_terminated_length": 735.927734375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8988648971579756, "frac_reward_zero_std": 0.5, "grad_norm": 0.12084452316904694, "kl": 0.07763671875, "learning_rate": 6.23613688829584e-07, "loss": 0.0075, "num_tokens": 1261084602.0, "reward": 2.1181640625, "reward_std": 0.20483160018920898, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 752.970703125, "completions/mean_terminated_length": 752.970703125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8992062814713664, "frac_reward_zero_std": 0.3125, "grad_norm": 0.14472396348908695, "kl": 0.08349609375, "learning_rate": 6.19479069944402e-07, "loss": 0.0144, "num_tokens": 1261550955.0, "reward": 2.16064453125, "reward_std": 0.2640913426876068, "rewards/accuracy_reward/mean": 0.177734375, "rewards/accuracy_reward/std": 0.3826628625392914, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 745.15234375, "completions/mean_terminated_length": 742.6027221679688, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8995476657847572, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12489961917642108, "kl": 0.0858154296875, "learning_rate": 6.153577649929576e-07, "loss": 0.0129, "num_tokens": 1262012217.0, "reward": 2.07275390625, "reward_std": 0.18686333298683167, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 761.974609375, "completions/mean_terminated_length": 761.974609375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.899889050098148, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09692813615952063, "kl": 0.0826416015625, "learning_rate": 6.11249779824693e-07, "loss": 0.004, "num_tokens": 1262490700.0, "reward": 2.05810546875, "reward_std": 0.1225486621260643, "rewards/accuracy_reward/mean": 0.07258064299821854, "rewards/accuracy_reward/std": 0.25970885157585144, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 747.16796875, "completions/mean_terminated_length": 747.16796875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.9002304344115388, "frac_reward_zero_std": 0.5, "grad_norm": 0.11082923874072835, "kl": 0.0802001953125, "learning_rate": 6.071551202701376e-07, "loss": 0.0013, "num_tokens": 1262960450.0, "reward": 2.09228515625, "reward_std": 0.15970176458358765, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 797.3125, "completions/mean_terminated_length": 794.864990234375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.9005718187249296, "frac_reward_zero_std": 0.5, "grad_norm": 0.11825267277112021, "kl": 0.0869140625, "learning_rate": 6.030737921409169e-07, "loss": 0.0223, "num_tokens": 1263453042.0, "reward": 2.05224609375, "reward_std": 0.17613086104393005, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 795.38671875, "completions/mean_terminated_length": 795.38671875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.9009132030383203, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11140283745296055, "kl": 0.0748291015625, "learning_rate": 5.990058012297262e-07, "loss": 0.0101, "num_tokens": 1263947432.0, "reward": 2.11572265625, "reward_std": 0.18044936656951904, "rewards/accuracy_reward/mean": 0.13104838132858276, "rewards/accuracy_reward/std": 0.3377939760684967, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 766.6640625, "completions/mean_terminated_length": 766.6640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9012545873517112, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10631584853397975, "kl": 0.0802001953125, "learning_rate": 5.949511533103336e-07, "loss": 0.0245, "num_tokens": 1264434220.0, "reward": 2.09619140625, "reward_std": 0.19086916744709015, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 713.421875, "completions/mean_terminated_length": 713.421875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.901595971665102, "frac_reward_zero_std": 0.5, "grad_norm": 0.11673734018264724, "kl": 0.0860595703125, "learning_rate": 5.909098541375747e-07, "loss": 0.0127, "num_tokens": 1264880356.0, "reward": 2.099609375, "reward_std": 0.1672363430261612, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1932.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 778.212890625, "completions/mean_terminated_length": 777.0587158203125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9019373559784928, "frac_reward_zero_std": 0.5625, "grad_norm": 0.9523428832250621, "kl": 0.3260498046875, "learning_rate": 5.868819094473289e-07, "loss": 0.0147, "num_tokens": 1265363201.0, "reward": 2.07568359375, "reward_std": 0.17200887203216553, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 757.138671875, "completions/mean_terminated_length": 757.138671875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.9022787402918836, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12130670540330267, "kl": 0.0811767578125, "learning_rate": 5.82867324956532e-07, "loss": 0.0177, "num_tokens": 1265843928.0, "reward": 2.16796875, "reward_std": 0.228950634598732, "rewards/accuracy_reward/mean": 0.1895161271095276, "rewards/accuracy_reward/std": 0.39231374859809875, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 802.16796875, "completions/mean_terminated_length": 802.16796875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9026201246052744, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09899983680149924, "kl": 0.0777587890625, "learning_rate": 5.788661063631496e-07, "loss": 0.0095, "num_tokens": 1266341246.0, "reward": 2.095703125, "reward_std": 0.1659562587738037, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 686.865234375, "completions/mean_terminated_length": 686.865234375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9029615089186652, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12393141901625897, "kl": 0.0869140625, "learning_rate": 5.748782593461788e-07, "loss": 0.0124, "num_tokens": 1266772681.0, "reward": 2.12158203125, "reward_std": 0.16494515538215637, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 753.615234375, "completions/mean_terminated_length": 751.0822143554688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.903302893232056, "frac_reward_zero_std": 0.625, "grad_norm": 0.11020723886726227, "kl": 0.079833984375, "learning_rate": 5.709037895656421e-07, "loss": 0.0259, "num_tokens": 1267250964.0, "reward": 2.05419921875, "reward_std": 0.14166143536567688, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 776.888671875, "completions/mean_terminated_length": 776.888671875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9036442775454467, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10290286231169744, "kl": 0.079833984375, "learning_rate": 5.669427026625695e-07, "loss": 0.0164, "num_tokens": 1267730875.0, "reward": 2.11572265625, "reward_std": 0.16812027990818024, "rewards/accuracy_reward/mean": 0.1270161271095276, "rewards/accuracy_reward/std": 0.3333272337913513, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 779.111328125, "completions/mean_terminated_length": 779.111328125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.9039856618588376, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10981163549116227, "kl": 0.078857421875, "learning_rate": 5.629950042590027e-07, "loss": 0.0214, "num_tokens": 1268217924.0, "reward": 2.0888671875, "reward_std": 0.15885281562805176, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 805.70703125, "completions/mean_terminated_length": 805.70703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9043270461722284, "frac_reward_zero_std": 0.625, "grad_norm": 0.10298920399469395, "kl": 0.0784912109375, "learning_rate": 5.590606999579729e-07, "loss": 0.0188, "num_tokens": 1268718222.0, "reward": 2.05419921875, "reward_std": 0.14317026734352112, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 762.37890625, "completions/mean_terminated_length": 762.37890625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.9046684304856192, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10555900397556671, "kl": 0.0791015625, "learning_rate": 5.551397953435112e-07, "loss": 0.0051, "num_tokens": 1269187200.0, "reward": 2.13134765625, "reward_std": 0.17709098756313324, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 757.162109375, "completions/mean_terminated_length": 757.162109375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.90500981479901, "frac_reward_zero_std": 0.5, "grad_norm": 0.12166446558544769, "kl": 0.0845947265625, "learning_rate": 5.512322959806193e-07, "loss": 0.0202, "num_tokens": 1269651891.0, "reward": 2.1328125, "reward_std": 0.1972936987876892, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 747.064453125, "completions/mean_terminated_length": 747.064453125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.9053511991124008, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10979984802441761, "kl": 0.0863037109375, "learning_rate": 5.4733820741528e-07, "loss": 0.0302, "num_tokens": 1270117588.0, "reward": 2.05078125, "reward_std": 0.15705974400043488, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 770.837890625, "completions/mean_terminated_length": 770.837890625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9056925834257916, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10704621363930865, "kl": 0.0794677734375, "learning_rate": 5.434575351744409e-07, "loss": 0.0186, "num_tokens": 1270594593.0, "reward": 2.0712890625, "reward_std": 0.16783469915390015, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.031142795458436012, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 742.087890625, "completions/mean_terminated_length": 742.087890625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.9060339677391824, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14038122281002344, "kl": 0.09033203125, "learning_rate": 5.39590284766004e-07, "loss": 0.0213, "num_tokens": 1271047310.0, "reward": 2.10498046875, "reward_std": 0.2142137587070465, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 822.416015625, "completions/mean_terminated_length": 821.616455078125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9063753520525731, "frac_reward_zero_std": 0.53125, "grad_norm": 2.7214860505125347, "kl": 0.812255859375, "learning_rate": 5.357364616788263e-07, "loss": 0.0529, "num_tokens": 1271556099.0, "reward": 2.08447265625, "reward_std": 0.17583611607551575, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 753.748046875, "completions/mean_terminated_length": 753.748046875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.906716736365964, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08849065760167074, "kl": 0.0794677734375, "learning_rate": 5.318960713827026e-07, "loss": 0.0203, "num_tokens": 1272015506.0, "reward": 2.13818359375, "reward_std": 0.12097153812646866, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 755.57421875, "completions/mean_terminated_length": 755.57421875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.9070581206793548, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13746455458600418, "kl": 0.0860595703125, "learning_rate": 5.280691193283671e-07, "loss": 0.0266, "num_tokens": 1272489560.0, "reward": 2.10791015625, "reward_std": 0.20629993081092834, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 818.0625, "completions/mean_terminated_length": 818.0625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9073995049927456, "frac_reward_zero_std": 0.625, "grad_norm": 0.08639510187585878, "kl": 0.0728759765625, "learning_rate": 5.242556109474772e-07, "loss": 0.0142, "num_tokens": 1272990520.0, "reward": 2.13671875, "reward_std": 0.14826563000679016, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 794.7890625, "completions/mean_terminated_length": 794.7890625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.9077408893061364, "frac_reward_zero_std": 0.5, "grad_norm": 0.11617717550739047, "kl": 0.079345703125, "learning_rate": 5.204555516526077e-07, "loss": 0.0094, "num_tokens": 1273479564.0, "reward": 2.115234375, "reward_std": 0.17856335639953613, "rewards/accuracy_reward/mean": 0.130859375, "rewards/accuracy_reward/std": 0.33757632970809937, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 717.48046875, "completions/mean_terminated_length": 717.48046875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9080822736195272, "frac_reward_zero_std": 0.625, "grad_norm": 0.10635829013585218, "kl": 0.0858154296875, "learning_rate": 5.166689468372532e-07, "loss": 0.0161, "num_tokens": 1273924146.0, "reward": 2.1083984375, "reward_std": 0.1477569043636322, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 728.39453125, "completions/mean_terminated_length": 728.39453125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.908423657932918, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12691758157760274, "kl": 0.0845947265625, "learning_rate": 5.128958018758013e-07, "loss": 0.0132, "num_tokens": 1274375308.0, "reward": 2.02294921875, "reward_std": 0.1656372845172882, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 763.953125, "completions/mean_terminated_length": 763.953125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.9087650422463088, "frac_reward_zero_std": 0.5, "grad_norm": 0.10946906117084047, "kl": 0.079345703125, "learning_rate": 5.091361221235447e-07, "loss": 0.0096, "num_tokens": 1274852340.0, "reward": 2.166015625, "reward_std": 0.20414355397224426, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 810.869140625, "completions/mean_terminated_length": 808.4481201171875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.9091064265596995, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11415384683949632, "kl": 0.0848388671875, "learning_rate": 5.053899129166606e-07, "loss": 0.0233, "num_tokens": 1275352337.0, "reward": 2.09228515625, "reward_std": 0.18495003879070282, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 729.228515625, "completions/mean_terminated_length": 726.6477661132812, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.9094478108730903, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12560748632285595, "kl": 0.088623046875, "learning_rate": 5.016571795722058e-07, "loss": 0.0346, "num_tokens": 1275803398.0, "reward": 2.10498046875, "reward_std": 0.20511263608932495, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.966796875, "rewards/format_reward/std": 0.17934183776378632, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 771.724609375, "completions/mean_terminated_length": 771.724609375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9097891951864812, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0893577725073415, "kl": 0.078857421875, "learning_rate": 4.979379273881146e-07, "loss": 0.0043, "num_tokens": 1276287993.0, "reward": 2.08056640625, "reward_std": 0.1147504597902298, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 768.431640625, "completions/mean_terminated_length": 768.431640625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.910130579499872, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11261067534845, "kl": 0.081787109375, "learning_rate": 4.942321616431833e-07, "loss": 0.0111, "num_tokens": 1276761574.0, "reward": 2.07666015625, "reward_std": 0.1591288447380066, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 718.634765625, "completions/mean_terminated_length": 718.634765625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9104719638132628, "frac_reward_zero_std": 0.375, "grad_norm": 0.14963148197943496, "kl": 0.0843505859375, "learning_rate": 4.905398875970724e-07, "loss": 0.0275, "num_tokens": 1277207627.0, "reward": 2.134765625, "reward_std": 0.2038916051387787, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 758.337890625, "completions/mean_terminated_length": 758.337890625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.9108133481266536, "frac_reward_zero_std": 0.625, "grad_norm": 0.11156010702166917, "kl": 0.0869140625, "learning_rate": 4.868611104902843e-07, "loss": 0.0128, "num_tokens": 1277684216.0, "reward": 2.0224609375, "reward_std": 0.12407057732343674, "rewards/accuracy_reward/mean": 0.03629032149910927, "rewards/accuracy_reward/std": 0.1872003972530365, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1928.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 833.90234375, "completions/mean_terminated_length": 833.90234375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9111547324400444, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10270396531563855, "kl": 0.0755615234375, "learning_rate": 4.831958355441746e-07, "loss": 0.0028, "num_tokens": 1278206550.0, "reward": 2.08984375, "reward_std": 0.18543988466262817, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 795.02734375, "completions/mean_terminated_length": 795.02734375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.9114961167534352, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10248017146141315, "kl": 0.0870361328125, "learning_rate": 4.795440679609298e-07, "loss": 0.0145, "num_tokens": 1278706868.0, "reward": 2.04052734375, "reward_std": 0.1335156410932541, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 799.97265625, "completions/mean_terminated_length": 799.97265625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9118375010668259, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0892580929215683, "kl": 0.077392578125, "learning_rate": 4.759058129235627e-07, "loss": -0.0005, "num_tokens": 1279197366.0, "reward": 2.076171875, "reward_std": 0.12383072078227997, "rewards/accuracy_reward/mean": 0.08669354766607285, "rewards/accuracy_reward/std": 0.281669557094574, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.04419417306780815, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 803.240234375, "completions/mean_terminated_length": 803.240234375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9121788853802167, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12153210613519273, "kl": 0.0814208984375, "learning_rate": 4.722810755959162e-07, "loss": 0.0209, "num_tokens": 1279688353.0, "reward": 2.03759765625, "reward_std": 0.18302318453788757, "rewards/accuracy_reward/mean": 0.060483869165182114, "rewards/accuracy_reward/std": 0.2386218160390854, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 819.77734375, "completions/mean_terminated_length": 819.77734375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9125202696936076, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0936568625140828, "kl": 0.0723876953125, "learning_rate": 4.6866986112263703e-07, "loss": 0.0087, "num_tokens": 1280188271.0, "reward": 2.10009765625, "reward_std": 0.14121593534946442, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 743.48828125, "completions/mean_terminated_length": 743.48828125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9128616540069984, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08812919598473848, "kl": 0.08251953125, "learning_rate": 4.6507217462918753e-07, "loss": 0.0087, "num_tokens": 1280660825.0, "reward": 2.125, "reward_std": 0.14856559038162231, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 792.31640625, "completions/mean_terminated_length": 792.31640625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9132030383203892, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12385664948279175, "kl": 0.08056640625, "learning_rate": 4.614880212218231e-07, "loss": 0.0196, "num_tokens": 1281153979.0, "reward": 2.19873046875, "reward_std": 0.19667920470237732, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4083731174468994, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 821.865234375, "completions/mean_terminated_length": 821.865234375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.91354442263378, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0857587689291789, "kl": 0.0791015625, "learning_rate": 4.579174059875946e-07, "loss": -0.0022, "num_tokens": 1281658454.0, "reward": 2.05712890625, "reward_std": 0.11570733785629272, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 762.689453125, "completions/mean_terminated_length": 760.1741333007812, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9138858069471708, "frac_reward_zero_std": 0.5, "grad_norm": 0.11606887271852552, "kl": 0.08544921875, "learning_rate": 4.543603339943381e-07, "loss": 0.025, "num_tokens": 1282145527.0, "reward": 2.08349609375, "reward_std": 0.18142634630203247, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 713.79296875, "completions/mean_terminated_length": 713.79296875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.9142271912605616, "frac_reward_zero_std": 0.625, "grad_norm": 0.1177536383525029, "kl": 0.082275390625, "learning_rate": 4.5081681029066516e-07, "loss": 0.0153, "num_tokens": 1282598333.0, "reward": 2.10107421875, "reward_std": 0.14841589331626892, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 738.421875, "completions/mean_terminated_length": 735.8590698242188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9145685755739523, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1155405650062342, "kl": 0.0811767578125, "learning_rate": 4.4728683990596267e-07, "loss": 0.0204, "num_tokens": 1283061269.0, "reward": 2.1220703125, "reward_std": 0.20466721057891846, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 715.025390625, "completions/mean_terminated_length": 715.025390625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.9149099598873431, "frac_reward_zero_std": 0.5, "grad_norm": 0.14922844792610485, "kl": 0.0936279296875, "learning_rate": 4.4377042785037293e-07, "loss": 0.0087, "num_tokens": 1283515282.0, "reward": 2.0888671875, "reward_std": 0.19195570051670074, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05395861715078354, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 732.1640625, "completions/mean_terminated_length": 732.1640625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.915251344200734, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1332540995271479, "kl": 0.08251953125, "learning_rate": 4.402675791148059e-07, "loss": 0.0051, "num_tokens": 1283985142.0, "reward": 2.10302734375, "reward_std": 0.19514772295951843, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 856.2265625, "completions/mean_terminated_length": 856.2265625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.9155927285141248, "frac_reward_zero_std": 0.5, "grad_norm": 0.105012751637964, "kl": 0.0799560546875, "learning_rate": 4.3677829867090906e-07, "loss": 0.0279, "num_tokens": 1284509354.0, "reward": 2.06689453125, "reward_std": 0.18701159954071045, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 776.779296875, "completions/mean_terminated_length": 776.779296875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9159341128275156, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11142696767087604, "kl": 0.076416015625, "learning_rate": 4.333025914710798e-07, "loss": 0.0211, "num_tokens": 1284984249.0, "reward": 2.1484375, "reward_std": 0.1903039813041687, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 753.595703125, "completions/mean_terminated_length": 753.595703125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9162754971409064, "frac_reward_zero_std": 0.5, "grad_norm": 0.11445200516090111, "kl": 0.0830078125, "learning_rate": 4.298404624484498e-07, "loss": 0.011, "num_tokens": 1285450074.0, "reward": 2.1171875, "reward_std": 0.21688374876976013, "rewards/accuracy_reward/mean": 0.13306452333927155, "rewards/accuracy_reward/std": 0.3399873375892639, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 725.4453125, "completions/mean_terminated_length": 725.4453125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9166168814542972, "frac_reward_zero_std": 0.5, "grad_norm": 0.12186872137337222, "kl": 0.0892333984375, "learning_rate": 4.263919165168762e-07, "loss": 0.0125, "num_tokens": 1285899902.0, "reward": 2.20751953125, "reward_std": 0.17915880680084229, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41643625497817993, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 818.123046875, "completions/mean_terminated_length": 818.123046875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.916958265767688, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09835162105838646, "kl": 0.0748291015625, "learning_rate": 4.2295695857094256e-07, "loss": 0.0126, "num_tokens": 1286395197.0, "reward": 2.06884765625, "reward_std": 0.15733210742473602, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 805.037109375, "completions/mean_terminated_length": 804.1702270507812, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.9172996500810787, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3517736361117892, "kl": 0.1136474609375, "learning_rate": 4.195355934859391e-07, "loss": 0.0215, "num_tokens": 1286879520.0, "reward": 2.14697265625, "reward_std": 0.15343749523162842, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 825.58203125, "completions/mean_terminated_length": 825.58203125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9176410343944695, "frac_reward_zero_std": 0.59375, "grad_norm": 0.0933160865968272, "kl": 0.0736083984375, "learning_rate": 4.1612782611787137e-07, "loss": -0.0043, "num_tokens": 1287380858.0, "reward": 2.0302734375, "reward_std": 0.13786140084266663, "rewards/accuracy_reward/mean": 0.052734375, "rewards/accuracy_reward/std": 0.22372129559516907, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 847.64453125, "completions/mean_terminated_length": 847.64453125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9179824187078603, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08711892798693506, "kl": 0.0745849609375, "learning_rate": 4.127336613034394e-07, "loss": 0.0145, "num_tokens": 1287890020.0, "reward": 2.04150390625, "reward_std": 0.10895287990570068, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 815.951171875, "completions/mean_terminated_length": 815.951171875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.9183238030212512, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11897941525518645, "kl": 0.0782470703125, "learning_rate": 4.093531038600385e-07, "loss": 0.0219, "num_tokens": 1288397435.0, "reward": 2.09130859375, "reward_std": 0.2042258381843567, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 816.865234375, "completions/mean_terminated_length": 816.865234375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.918665187334642, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08626886829618716, "kl": 0.075439453125, "learning_rate": 4.059861585857561e-07, "loss": 0.0147, "num_tokens": 1288892646.0, "reward": 2.09130859375, "reward_std": 0.11570737510919571, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 815.77734375, "completions/mean_terminated_length": 815.77734375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9190065716480328, "frac_reward_zero_std": 0.5, "grad_norm": 0.11683840457970579, "kl": 0.078369140625, "learning_rate": 4.0263283025935074e-07, "loss": 0.0189, "num_tokens": 1289386628.0, "reward": 2.14013671875, "reward_std": 0.18314334750175476, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 764.521484375, "completions/mean_terminated_length": 764.521484375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9193479559614236, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10920522901520652, "kl": 0.0833740234375, "learning_rate": 3.9929312364026085e-07, "loss": 0.0193, "num_tokens": 1289864991.0, "reward": 2.13134765625, "reward_std": 0.15973830223083496, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 787.90625, "completions/mean_terminated_length": 787.90625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9196893402748144, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11259142278979858, "kl": 0.080078125, "learning_rate": 3.9596704346858915e-07, "loss": 0.0186, "num_tokens": 1290362671.0, "reward": 2.05712890625, "reward_std": 0.18101902306079865, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 774.43359375, "completions/mean_terminated_length": 771.9412841796875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9200307245882051, "frac_reward_zero_std": 0.5, "grad_norm": 0.11890448598514282, "kl": 0.080322265625, "learning_rate": 3.926545944650972e-07, "loss": 0.018, "num_tokens": 1290833853.0, "reward": 2.07763671875, "reward_std": 0.17022420465946198, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 844.302734375, "completions/mean_terminated_length": 844.302734375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.9203721089015959, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09662247648449836, "kl": 0.075439453125, "learning_rate": 3.89355781331201e-07, "loss": 0.0133, "num_tokens": 1291353304.0, "reward": 2.07080078125, "reward_std": 0.1376969963312149, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 791.322265625, "completions/mean_terminated_length": 791.322265625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.9207134932149867, "frac_reward_zero_std": 0.5, "grad_norm": 0.1115554287487263, "kl": 0.0831298828125, "learning_rate": 3.860706087489607e-07, "loss": 0.0274, "num_tokens": 1291851469.0, "reward": 2.08203125, "reward_std": 0.1787654459476471, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 765.7890625, "completions/mean_terminated_length": 763.2798461914062, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9210548775283776, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1196859971381338, "kl": 0.079833984375, "learning_rate": 3.8279908138108226e-07, "loss": 0.0327, "num_tokens": 1292322545.0, "reward": 2.0751953125, "reward_std": 0.18428736925125122, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 819.15625, "completions/mean_terminated_length": 819.15625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9213962618417684, "frac_reward_zero_std": 0.625, "grad_norm": 0.09212194767501768, "kl": 0.0760498046875, "learning_rate": 3.7954120387089566e-07, "loss": 0.0235, "num_tokens": 1292831489.0, "reward": 2.09375, "reward_std": 0.14015734195709229, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 738.8359375, "completions/mean_terminated_length": 738.8359375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.9217376461551592, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10360118236819629, "kl": 0.08154296875, "learning_rate": 3.7629698084236665e-07, "loss": 0.0048, "num_tokens": 1293295613.0, "reward": 2.052734375, "reward_std": 0.13275451958179474, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 781.701171875, "completions/mean_terminated_length": 781.701171875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.92207903046855, "frac_reward_zero_std": 0.625, "grad_norm": 0.09510379150630871, "kl": 0.0797119140625, "learning_rate": 3.7306641690007083e-07, "loss": 0.0092, "num_tokens": 1293774788.0, "reward": 2.1083984375, "reward_std": 0.13945432007312775, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 839.40625, "completions/mean_terminated_length": 839.40625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.9224204147819408, "frac_reward_zero_std": 0.375, "grad_norm": 0.11590442692253299, "kl": 0.0789794921875, "learning_rate": 3.698495166292082e-07, "loss": 0.0134, "num_tokens": 1294283956.0, "reward": 2.06494140625, "reward_std": 0.21298891305923462, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 818.765625, "completions/mean_terminated_length": 818.765625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.9227617990953315, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10569511913128211, "kl": 0.0723876953125, "learning_rate": 3.666462845955765e-07, "loss": 0.007, "num_tokens": 1294786444.0, "reward": 2.1083984375, "reward_std": 0.16424164175987244, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 754.244140625, "completions/mean_terminated_length": 754.244140625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.9231031834087223, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12375506720323551, "kl": 0.0823974609375, "learning_rate": 3.6345672534557894e-07, "loss": 0.019, "num_tokens": 1295260377.0, "reward": 2.08203125, "reward_std": 0.1687990427017212, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 795.298828125, "completions/mean_terminated_length": 795.298828125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9234445677221131, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09500148819685951, "kl": 0.079833984375, "learning_rate": 3.602808434062144e-07, "loss": 0.0076, "num_tokens": 1295759250.0, "reward": 2.1083984375, "reward_std": 0.14804679155349731, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 804.466796875, "completions/mean_terminated_length": 804.466796875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.923785952035504, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12041036563137947, "kl": 0.0789794921875, "learning_rate": 3.5711864328506265e-07, "loss": 0.0067, "num_tokens": 1296254161.0, "reward": 2.146484375, "reward_std": 0.23403970897197723, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 717.33984375, "completions/mean_terminated_length": 714.7357788085938, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.9241273363488948, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1077248560091165, "kl": 0.0789794921875, "learning_rate": 3.539701294702902e-07, "loss": 0.0156, "num_tokens": 1296709903.0, "reward": 2.1484375, "reward_std": 0.16120800375938416, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 732.25, "completions/mean_terminated_length": 732.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9244687206622856, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1091756700315512, "kl": 0.0831298828125, "learning_rate": 3.508353064306347e-07, "loss": 0.0107, "num_tokens": 1297163631.0, "reward": 2.08740234375, "reward_std": 0.15533745288848877, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 820.193359375, "completions/mean_terminated_length": 817.7905883789062, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9248101049756764, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10652811972392781, "kl": 0.0806884765625, "learning_rate": 3.477141786154059e-07, "loss": 0.023, "num_tokens": 1297671874.0, "reward": 2.111328125, "reward_std": 0.14221099019050598, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 804.060546875, "completions/mean_terminated_length": 801.626220703125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.9251514892890672, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0772814164648757, "kl": 0.074462890625, "learning_rate": 3.4460675045447254e-07, "loss": 0.0156, "num_tokens": 1298179985.0, "reward": 2.0478515625, "reward_std": 0.09855453670024872, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 759.341796875, "completions/mean_terminated_length": 759.341796875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.925492873602458, "frac_reward_zero_std": 0.625, "grad_norm": 0.09394438709299191, "kl": 0.0830078125, "learning_rate": 3.415130263582611e-07, "loss": 0.0074, "num_tokens": 1298661104.0, "reward": 2.15625, "reward_std": 0.15705078840255737, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 712.314453125, "completions/mean_terminated_length": 712.314453125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9258342579158487, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10614875776096482, "kl": 0.0841064453125, "learning_rate": 3.3843301071775026e-07, "loss": 0.0202, "num_tokens": 1299106593.0, "reward": 2.1513671875, "reward_std": 0.1424272656440735, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 774.37890625, "completions/mean_terminated_length": 774.37890625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9261756422292395, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10628485671014291, "kl": 0.078857421875, "learning_rate": 3.3536670790445314e-07, "loss": 0.0045, "num_tokens": 1299584579.0, "reward": 2.046875, "reward_std": 0.13722340762615204, "rewards/accuracy_reward/mean": 0.056640625, "rewards/accuracy_reward/std": 0.23138070106506348, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 796.427734375, "completions/mean_terminated_length": 796.427734375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9265170265426304, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09003245160849631, "kl": 0.0791015625, "learning_rate": 3.323141222704296e-07, "loss": 0.0149, "num_tokens": 1300072910.0, "reward": 2.0673828125, "reward_std": 0.11181704699993134, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 793.9375, "completions/mean_terminated_length": 793.9375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9268584108560212, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1420095785846164, "kl": 0.0810546875, "learning_rate": 3.29275258148265e-07, "loss": 0.0205, "num_tokens": 1300560878.0, "reward": 2.09716796875, "reward_std": 0.23428697884082794, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 795.49609375, "completions/mean_terminated_length": 795.49609375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.927199795169412, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11615841535042101, "kl": 0.0770263671875, "learning_rate": 3.2625011985107256e-07, "loss": 0.0148, "num_tokens": 1301053644.0, "reward": 2.1083984375, "reward_std": 0.19945615530014038, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.33557409048080444, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 745.87109375, "completions/mean_terminated_length": 743.3228759765625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9275411794828028, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12374712728090959, "kl": 0.0804443359375, "learning_rate": 3.232387116724811e-07, "loss": 0.0258, "num_tokens": 1301512714.0, "reward": 2.1318359375, "reward_std": 0.22333042323589325, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1566.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 757.849609375, "completions/mean_terminated_length": 757.849609375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9278825637961936, "frac_reward_zero_std": 0.5, "grad_norm": 0.11418676593937462, "kl": 0.0787353515625, "learning_rate": 3.20241037886635e-07, "loss": 0.0322, "num_tokens": 1301994573.0, "reward": 2.1279296875, "reward_std": 0.2118135392665863, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 821.1875, "completions/mean_terminated_length": 821.1875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.9282239481095844, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11432380614214815, "kl": 0.079833984375, "learning_rate": 3.172571027481841e-07, "loss": 0.028, "num_tokens": 1302498701.0, "reward": 2.05224609375, "reward_std": 0.16306258738040924, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.029158055782318115, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 781.69140625, "completions/mean_terminated_length": 781.69140625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9285653324229751, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1674505606271395, "kl": 0.0831298828125, "learning_rate": 3.1428691049227634e-07, "loss": 0.0174, "num_tokens": 1302975887.0, "reward": 2.05078125, "reward_std": 0.13922229409217834, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 859.69140625, "completions/mean_terminated_length": 859.69140625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.9289067167363659, "frac_reward_zero_std": 0.65625, "grad_norm": 0.07890869189865295, "kl": 0.072998046875, "learning_rate": 3.1133046533455945e-07, "loss": 0.0145, "num_tokens": 1303499825.0, "reward": 2.1142578125, "reward_std": 0.14257270097732544, "rewards/accuracy_reward/mean": 0.123046875, "rewards/accuracy_reward/std": 0.32881227135658264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 723.08984375, "completions/mean_terminated_length": 723.08984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9292481010497567, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12786455472489405, "kl": 0.0838623046875, "learning_rate": 3.083877714711636e-07, "loss": 0.0026, "num_tokens": 1303953375.0, "reward": 2.17822265625, "reward_std": 0.20422866940498352, "rewards/accuracy_reward/mean": 0.189453125, "rewards/accuracy_reward/std": 0.3922513723373413, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 762.412109375, "completions/mean_terminated_length": 762.412109375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9295894853631476, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10054553474182427, "kl": 0.0853271484375, "learning_rate": 3.054588330787067e-07, "loss": 0.008, "num_tokens": 1304427490.0, "reward": 2.1376953125, "reward_std": 0.14832475781440735, "rewards/accuracy_reward/mean": 0.15120968222618103, "rewards/accuracy_reward/std": 0.35861483216285706, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 752.701171875, "completions/mean_terminated_length": 752.701171875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.9299308696765384, "frac_reward_zero_std": 0.46875, "grad_norm": 0.126429882729447, "kl": 0.0821533203125, "learning_rate": 3.025436543142801e-07, "loss": 0.0113, "num_tokens": 1304893897.0, "reward": 2.09130859375, "reward_std": 0.1899968981742859, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 807.041015625, "completions/mean_terminated_length": 807.041015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9302722539899292, "frac_reward_zero_std": 0.625, "grad_norm": 0.0848042153564289, "kl": 0.0755615234375, "learning_rate": 2.9964223931544854e-07, "loss": 0.0017, "num_tokens": 1305400014.0, "reward": 2.1689453125, "reward_std": 0.16065454483032227, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 820.875, "completions/mean_terminated_length": 820.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.93061363830332, "frac_reward_zero_std": 0.625, "grad_norm": 0.0971645724658155, "kl": 0.081298828125, "learning_rate": 2.9675459220023794e-07, "loss": 0.001, "num_tokens": 1305911998.0, "reward": 2.0703125, "reward_std": 0.13730567693710327, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04406425356864929, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 795.583984375, "completions/mean_terminated_length": 785.722412109375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9309550226167108, "frac_reward_zero_std": 0.5, "grad_norm": 0.12018802766487909, "kl": 0.076904296875, "learning_rate": 2.938807170671354e-07, "loss": 0.0157, "num_tokens": 1306405769.0, "reward": 2.14794921875, "reward_std": 0.2071564942598343, "rewards/accuracy_reward/mean": 0.166015625, "rewards/accuracy_reward/std": 0.3724585771560669, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 743.064453125, "completions/mean_terminated_length": 743.064453125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.9312964069301015, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10763867231262284, "kl": 0.0823974609375, "learning_rate": 2.910206179950814e-07, "loss": 0.0212, "num_tokens": 1306869914.0, "reward": 2.0908203125, "reward_std": 0.15976479649543762, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 823.22265625, "completions/mean_terminated_length": 823.22265625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.9316377912434923, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11143679205144184, "kl": 0.07421875, "learning_rate": 2.8817429904346327e-07, "loss": 0.0247, "num_tokens": 1307377516.0, "reward": 2.16064453125, "reward_std": 0.1947622001171112, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 769.607421875, "completions/mean_terminated_length": 769.607421875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9319791755568831, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10083606322944905, "kl": 0.075927734375, "learning_rate": 2.8534176425211166e-07, "loss": 0.0125, "num_tokens": 1307853555.0, "reward": 2.1064453125, "reward_std": 0.1469949632883072, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 829.9921875, "completions/mean_terminated_length": 829.9921875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.932320559870274, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11631614856531455, "kl": 0.074951171875, "learning_rate": 2.8252301764128965e-07, "loss": 0.0148, "num_tokens": 1308359583.0, "reward": 2.10888671875, "reward_std": 0.183839812874794, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 753.693359375, "completions/mean_terminated_length": 751.1604614257812, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.9326619441836648, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12510842715776915, "kl": 0.081787109375, "learning_rate": 2.7971806321169247e-07, "loss": 0.0206, "num_tokens": 1308827058.0, "reward": 2.091796875, "reward_std": 0.13801535964012146, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 807.06640625, "completions/mean_terminated_length": 807.06640625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9330033284970556, "frac_reward_zero_std": 0.375, "grad_norm": 0.12778952156585657, "kl": 0.078369140625, "learning_rate": 2.769269049444423e-07, "loss": 0.0173, "num_tokens": 1309316964.0, "reward": 2.16748046875, "reward_std": 0.23452657461166382, "rewards/accuracy_reward/mean": 0.181640625, "rewards/accuracy_reward/std": 0.38592514395713806, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 736.533203125, "completions/mean_terminated_length": 736.533203125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9333447128104464, "frac_reward_zero_std": 0.625, "grad_norm": 0.1007399830602404, "kl": 0.083984375, "learning_rate": 2.7414954680107463e-07, "loss": 0.0118, "num_tokens": 1309783957.0, "reward": 2.078125, "reward_std": 0.13002625107765198, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 802.373046875, "completions/mean_terminated_length": 802.373046875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.9336860971238372, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10325193600045844, "kl": 0.0809326171875, "learning_rate": 2.7138599272354403e-07, "loss": 0.0087, "num_tokens": 1310274740.0, "reward": 2.09765625, "reward_std": 0.1517922431230545, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 786.7265625, "completions/mean_terminated_length": 786.7265625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.9340274814372279, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08360137735410075, "kl": 0.0723876953125, "learning_rate": 2.686362466342085e-07, "loss": 0.0123, "num_tokens": 1310757576.0, "reward": 2.14013671875, "reward_std": 0.13858428597450256, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 766.04296875, "completions/mean_terminated_length": 763.5342407226562, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.9343688657506187, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10381798862101313, "kl": 0.0816650390625, "learning_rate": 2.6590031243582946e-07, "loss": 0.0165, "num_tokens": 1311229806.0, "reward": 2.1171875, "reward_std": 0.16607214510440826, "rewards/accuracy_reward/mean": 0.1391129046678543, "rewards/accuracy_reward/std": 0.34641367197036743, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 798.072265625, "completions/mean_terminated_length": 795.626220703125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9347102500640095, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10040751504478544, "kl": 0.0767822265625, "learning_rate": 2.631781940115674e-07, "loss": 0.0317, "num_tokens": 1311731235.0, "reward": 2.06982421875, "reward_std": 0.1574288010597229, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 779.134765625, "completions/mean_terminated_length": 776.6516723632812, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9350516343774004, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11574546617615596, "kl": 0.0843505859375, "learning_rate": 2.6046989522496845e-07, "loss": 0.0139, "num_tokens": 1312210072.0, "reward": 2.0244140625, "reward_std": 0.1352521777153015, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 763.505859375, "completions/mean_terminated_length": 763.505859375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.9353930186907912, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09513618429719106, "kl": 0.083251953125, "learning_rate": 2.577754199199689e-07, "loss": 0.0255, "num_tokens": 1312688187.0, "reward": 2.08740234375, "reward_std": 0.1240454763174057, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 788.62109375, "completions/mean_terminated_length": 783.682373046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.935734403004182, "frac_reward_zero_std": 0.40625, "grad_norm": 0.25741884825226813, "kl": 0.3438720703125, "learning_rate": 2.5509477192088293e-07, "loss": 0.0281, "num_tokens": 1313170137.0, "reward": 2.1044921875, "reward_std": 0.2363472431898117, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 790.240234375, "completions/mean_terminated_length": 790.240234375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9360757873175728, "frac_reward_zero_std": 0.5, "grad_norm": 0.11422327862827089, "kl": 0.0809326171875, "learning_rate": 2.524279550324027e-07, "loss": 0.0111, "num_tokens": 1313657876.0, "reward": 2.0615234375, "reward_std": 0.1711147427558899, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 742.1953125, "completions/mean_terminated_length": 742.1953125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9364171716309636, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10651468149829711, "kl": 0.0804443359375, "learning_rate": 2.4977497303958374e-07, "loss": 0.0226, "num_tokens": 1314125096.0, "reward": 2.08740234375, "reward_std": 0.18101300299167633, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 794.8828125, "completions/mean_terminated_length": 794.8828125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.9367585559443543, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1237492984862342, "kl": 0.079345703125, "learning_rate": 2.4713582970784964e-07, "loss": 0.0228, "num_tokens": 1314620316.0, "reward": 2.171875, "reward_std": 0.23788988590240479, "rewards/accuracy_reward/mean": 0.197265625, "rewards/accuracy_reward/std": 0.3983237147331238, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.056256115436553955, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 746.837890625, "completions/mean_terminated_length": 746.837890625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9370999402577451, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11857855273791369, "kl": 0.08203125, "learning_rate": 2.44510528782983e-07, "loss": 0.0245, "num_tokens": 1315081081.0, "reward": 2.07373046875, "reward_std": 0.19785770773887634, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 764.509765625, "completions/mean_terminated_length": 764.509765625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9374413245711359, "frac_reward_zero_std": 0.5, "grad_norm": 0.11048424876004256, "kl": 0.08056640625, "learning_rate": 2.4189907399111536e-07, "loss": 0.014, "num_tokens": 1315571166.0, "reward": 2.08544921875, "reward_std": 0.1912420094013214, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 829.3671875, "completions/mean_terminated_length": 829.3671875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9377827088845268, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10768394866671531, "kl": 0.0859375, "learning_rate": 2.39301469038733e-07, "loss": 0.0039, "num_tokens": 1316084698.0, "reward": 2.0810546875, "reward_std": 0.14070288836956024, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 812.44140625, "completions/mean_terminated_length": 812.44140625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.9381240931979176, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11011610336135133, "kl": 0.0782470703125, "learning_rate": 2.3671771761265781e-07, "loss": 0.0247, "num_tokens": 1316576492.0, "reward": 2.03466796875, "reward_std": 0.14087429642677307, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 755.904296875, "completions/mean_terminated_length": 755.904296875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.9384654775113084, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0868714013167995, "kl": 0.0791015625, "learning_rate": 2.3414782338005538e-07, "loss": 0.0172, "num_tokens": 1317043867.0, "reward": 2.14306640625, "reward_std": 0.11245358735322952, "rewards/accuracy_reward/mean": 0.16458334028720856, "rewards/accuracy_reward/std": 0.37119096517562866, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 838.939453125, "completions/mean_terminated_length": 831.8134155273438, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9388068618246992, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10366966840888552, "kl": 0.0770263671875, "learning_rate": 2.3159178998841792e-07, "loss": 0.0117, "num_tokens": 1317556124.0, "reward": 2.06787109375, "reward_std": 0.18059508502483368, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 732.041015625, "completions/mean_terminated_length": 732.041015625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.93914824613809, "frac_reward_zero_std": 0.625, "grad_norm": 0.10610748787759254, "kl": 0.0767822265625, "learning_rate": 2.2904962106556795e-07, "loss": 0.0038, "num_tokens": 1318016321.0, "reward": 2.083984375, "reward_std": 0.14940284192562103, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 840.4140625, "completions/mean_terminated_length": 840.4140625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9394896304514807, "frac_reward_zero_std": 0.625, "grad_norm": 0.09200888302537841, "kl": 0.076171875, "learning_rate": 2.2652132021965035e-07, "loss": 0.009, "num_tokens": 1318542037.0, "reward": 2.08251953125, "reward_std": 0.1433972418308258, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 724.07421875, "completions/mean_terminated_length": 724.07421875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9398310147648715, "frac_reward_zero_std": 0.5, "grad_norm": 0.10693694421685443, "kl": 0.0826416015625, "learning_rate": 2.2400689103912464e-07, "loss": 0.0058, "num_tokens": 1319001035.0, "reward": 2.1708984375, "reward_std": 0.2043270319700241, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 787.923828125, "completions/mean_terminated_length": 787.923828125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9401723990782623, "frac_reward_zero_std": 0.5, "grad_norm": 0.12376348639679186, "kl": 0.080078125, "learning_rate": 2.2150633709276393e-07, "loss": 0.017, "num_tokens": 1319486324.0, "reward": 2.11279296875, "reward_std": 0.19549106061458588, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 788.935546875, "completions/mean_terminated_length": 788.935546875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.9405137833916531, "frac_reward_zero_std": 0.375, "grad_norm": 0.13404651751702548, "kl": 0.082763671875, "learning_rate": 2.1901966192964584e-07, "loss": 0.015, "num_tokens": 1319974147.0, "reward": 2.08740234375, "reward_std": 0.25369778275489807, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.970703125, "rewards/format_reward/std": 0.16880230605602264, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 822.580078125, "completions/mean_terminated_length": 820.1820068359375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.940855167705044, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10521732815018119, "kl": 0.0760498046875, "learning_rate": 2.1654686907915167e-07, "loss": 0.0302, "num_tokens": 1320482588.0, "reward": 2.18994140625, "reward_std": 0.19872784614562988, "rewards/accuracy_reward/mean": 0.208984375, "rewards/accuracy_reward/std": 0.40698084235191345, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 813.88671875, "completions/mean_terminated_length": 813.88671875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.9411965520184348, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10894648123964759, "kl": 0.0784912109375, "learning_rate": 2.140879620509606e-07, "loss": 0.0088, "num_tokens": 1320986242.0, "reward": 2.16552734375, "reward_std": 0.18165364861488342, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38430243730545044, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1927.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 812.099609375, "completions/mean_terminated_length": 812.099609375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9415379363318256, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11785565531589291, "kl": 0.0767822265625, "learning_rate": 2.1164294433503763e-07, "loss": 0.0246, "num_tokens": 1321487093.0, "reward": 2.02734375, "reward_std": 0.14987540245056152, "rewards/accuracy_reward/mean": 0.04838709533214569, "rewards/accuracy_reward/std": 0.21479946374893188, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 870.228515625, "completions/mean_terminated_length": 867.9236450195312, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9418793206452164, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08601482871062159, "kl": 0.0771484375, "learning_rate": 2.0921181940164125e-07, "loss": 0.0076, "num_tokens": 1322038586.0, "reward": 2.07177734375, "reward_std": 0.12566816806793213, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 773.58203125, "completions/mean_terminated_length": 773.58203125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9422207049586071, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09668893044833898, "kl": 0.0810546875, "learning_rate": 2.0679459070130693e-07, "loss": 0.0062, "num_tokens": 1322516644.0, "reward": 2.1064453125, "reward_std": 0.13057725131511688, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 775.095703125, "completions/mean_terminated_length": 775.095703125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.9425620892719979, "frac_reward_zero_std": 0.625, "grad_norm": 0.10427916892334307, "kl": 0.0875244140625, "learning_rate": 2.0439126166485023e-07, "loss": 0.0045, "num_tokens": 1323008437.0, "reward": 2.06103515625, "reward_std": 0.11228133738040924, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 777.984375, "completions/mean_terminated_length": 777.984375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9429034735853887, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10275584049028302, "kl": 0.0772705078125, "learning_rate": 2.0200183570335596e-07, "loss": 0.0087, "num_tokens": 1323486797.0, "reward": 2.1259765625, "reward_std": 0.16383689641952515, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 804.837890625, "completions/mean_terminated_length": 804.837890625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.9432448578987795, "frac_reward_zero_std": 0.625, "grad_norm": 0.09742974165448424, "kl": 0.0743408203125, "learning_rate": 1.9962631620817685e-07, "loss": 0.0107, "num_tokens": 1323983002.0, "reward": 2.0595703125, "reward_std": 0.15076105296611786, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1858.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 787.462890625, "completions/mean_terminated_length": 787.462890625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9435862422121704, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09120650925599111, "kl": 0.0797119140625, "learning_rate": 1.972647065509292e-07, "loss": 0.0137, "num_tokens": 1324467911.0, "reward": 2.109375, "reward_std": 0.13821622729301453, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 754.126953125, "completions/mean_terminated_length": 754.126953125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9439276265255612, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11303405384429102, "kl": 0.083251953125, "learning_rate": 1.949170100834863e-07, "loss": 0.0153, "num_tokens": 1324933240.0, "reward": 2.02001953125, "reward_std": 0.1591007262468338, "rewards/accuracy_reward/mean": 0.041015625, "rewards/accuracy_reward/std": 0.19852031767368317, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.029059575870633125, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 768.85546875, "completions/mean_terminated_length": 768.85546875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.944269010838952, "frac_reward_zero_std": 0.46875, "grad_norm": 0.17467223507333238, "kl": 0.0860595703125, "learning_rate": 1.9258323013797265e-07, "loss": 0.009, "num_tokens": 1325403886.0, "reward": 2.1357421875, "reward_std": 0.20235949754714966, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 814.869140625, "completions/mean_terminated_length": 814.869140625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9446103951523428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09800008385689796, "kl": 0.07568359375, "learning_rate": 1.902633700267631e-07, "loss": 0.0001, "num_tokens": 1325897291.0, "reward": 2.07470703125, "reward_std": 0.16869224607944489, "rewards/accuracy_reward/mean": 0.09072580933570862, "rewards/accuracy_reward/std": 0.2875087857246399, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 746.650390625, "completions/mean_terminated_length": 746.650390625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.9449517794657335, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10125182884110942, "kl": 0.078369140625, "learning_rate": 1.8795743304247606e-07, "loss": 0.0167, "num_tokens": 1326363192.0, "reward": 2.125, "reward_std": 0.14780330657958984, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 846.408203125, "completions/mean_terminated_length": 846.408203125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9452931637791243, "frac_reward_zero_std": 0.5, "grad_norm": 0.11386761531406447, "kl": 0.078857421875, "learning_rate": 1.856654224579635e-07, "loss": 0.0184, "num_tokens": 1326874201.0, "reward": 2.10888671875, "reward_std": 0.1687885820865631, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 820.8203125, "completions/mean_terminated_length": 820.8203125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9456345480925151, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09262117779790187, "kl": 0.0787353515625, "learning_rate": 1.8338734152631654e-07, "loss": 0.0125, "num_tokens": 1327385565.0, "reward": 2.029296875, "reward_std": 0.12347057461738586, "rewards/accuracy_reward/mean": 0.044921875, "rewards/accuracy_reward/std": 0.20733514428138733, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 860.39453125, "completions/mean_terminated_length": 858.0704345703125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.9459759324059059, "frac_reward_zero_std": 0.59375, "grad_norm": 0.0997623157323359, "kl": 0.079345703125, "learning_rate": 1.811231934808577e-07, "loss": 0.0183, "num_tokens": 1327909783.0, "reward": 2.0537109375, "reward_std": 0.14357587695121765, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 830.341796875, "completions/mean_terminated_length": 830.341796875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.9463173167192968, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10394101935642625, "kl": 0.0782470703125, "learning_rate": 1.7887298153512754e-07, "loss": 0.015, "num_tokens": 1328430422.0, "reward": 2.0107421875, "reward_std": 0.1411224603652954, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 727.2734375, "completions/mean_terminated_length": 727.2734375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9466587010326876, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1510546397978763, "kl": 0.0848388671875, "learning_rate": 1.7663670888289131e-07, "loss": 0.0096, "num_tokens": 1328886194.0, "reward": 2.15283203125, "reward_std": 0.20304402709007263, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 839.251953125, "completions/mean_terminated_length": 839.251953125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9470000853460784, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11254182209559307, "kl": 0.0732421875, "learning_rate": 1.7441437869813004e-07, "loss": 0.0144, "num_tokens": 1329399635.0, "reward": 2.07958984375, "reward_std": 0.18237121403217316, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 773.97265625, "completions/mean_terminated_length": 773.97265625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9473414696594692, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10186643348177696, "kl": 0.0755615234375, "learning_rate": 1.722059941350329e-07, "loss": 0.0215, "num_tokens": 1329872853.0, "reward": 2.06787109375, "reward_std": 0.14740031957626343, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 725.369140625, "completions/mean_terminated_length": 725.369140625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9476828539728599, "frac_reward_zero_std": 0.46875, "grad_norm": 0.1312278391484862, "kl": 0.085693359375, "learning_rate": 1.700115583279993e-07, "loss": 0.0173, "num_tokens": 1330325602.0, "reward": 2.15380859375, "reward_std": 0.2189202606678009, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.374204158782959, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 799.00390625, "completions/mean_terminated_length": 799.00390625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.9480242382862507, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09143607668578767, "kl": 0.07861328125, "learning_rate": 1.6783107439162782e-07, "loss": -0.002, "num_tokens": 1330822740.0, "reward": 2.04638671875, "reward_std": 0.10810358077287674, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23509246110916138, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 770.58203125, "completions/mean_terminated_length": 770.58203125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9483656225996415, "frac_reward_zero_std": 0.5, "grad_norm": 0.11189201044277117, "kl": 0.07666015625, "learning_rate": 1.6566454542071953e-07, "loss": 0.0292, "num_tokens": 1331302798.0, "reward": 2.099609375, "reward_std": 0.19043275713920593, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 738.8046875, "completions/mean_terminated_length": 738.8046875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9487070069130323, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12702230208075524, "kl": 0.080322265625, "learning_rate": 1.6351197449026468e-07, "loss": 0.0024, "num_tokens": 1331773114.0, "reward": 2.1455078125, "reward_std": 0.19943970441818237, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 803.7734375, "completions/mean_terminated_length": 801.3385620117188, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9490483912264231, "frac_reward_zero_std": 0.59375, "grad_norm": 0.0948819063833475, "kl": 0.084716796875, "learning_rate": 1.6137336465544383e-07, "loss": 0.0193, "num_tokens": 1332270790.0, "reward": 2.08642578125, "reward_std": 0.1699155569076538, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1382.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 750.80078125, "completions/mean_terminated_length": 750.80078125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.949389775539814, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1261876499666952, "kl": 0.0819091796875, "learning_rate": 1.592487189516212e-07, "loss": 0.0224, "num_tokens": 1332744592.0, "reward": 2.09716796875, "reward_std": 0.1986381709575653, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 771.8515625, "completions/mean_terminated_length": 771.8515625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9497311598532048, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10063131654179287, "kl": 0.0806884765625, "learning_rate": 1.5713804039434566e-07, "loss": 0.0048, "num_tokens": 1333219044.0, "reward": 2.05908203125, "reward_std": 0.12672413885593414, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 733.81640625, "completions/mean_terminated_length": 733.81640625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9500725441665956, "frac_reward_zero_std": 0.625, "grad_norm": 0.11630854584050278, "kl": 0.0872802734375, "learning_rate": 1.5504133197933534e-07, "loss": 0.0158, "num_tokens": 1333671222.0, "reward": 2.12548828125, "reward_std": 0.1311398297548294, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 770.890625, "completions/mean_terminated_length": 770.890625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.9504139284799863, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12042071369797791, "kl": 0.0787353515625, "learning_rate": 1.529585966824876e-07, "loss": 0.0131, "num_tokens": 1334140782.0, "reward": 2.11474609375, "reward_std": 0.21216699481010437, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 788.146484375, "completions/mean_terminated_length": 788.146484375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.9507553127933771, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10021603562561142, "kl": 0.078369140625, "learning_rate": 1.5088983745986442e-07, "loss": 0.0087, "num_tokens": 1334636089.0, "reward": 2.05859375, "reward_std": 0.11801324784755707, "rewards/accuracy_reward/mean": 0.068359375, "rewards/accuracy_reward/std": 0.25260838866233826, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 746.1875, "completions/mean_terminated_length": 746.1875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9510966971067679, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13605317678389992, "kl": 0.0816650390625, "learning_rate": 1.488350572476893e-07, "loss": 0.0101, "num_tokens": 1335100409.0, "reward": 2.095703125, "reward_std": 0.21979676187038422, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 789.404296875, "completions/mean_terminated_length": 789.404296875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9514380814201587, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11776207678042375, "kl": 0.0806884765625, "learning_rate": 1.4679425896234832e-07, "loss": 0.0163, "num_tokens": 1335588280.0, "reward": 2.14453125, "reward_std": 0.20785489678382874, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 801.11328125, "completions/mean_terminated_length": 801.11328125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.9517794657335495, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09015553670050196, "kl": 0.0787353515625, "learning_rate": 1.4476744550038002e-07, "loss": 0.0063, "num_tokens": 1336083954.0, "reward": 2.09912109375, "reward_std": 0.11735087633132935, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 727.701171875, "completions/mean_terminated_length": 727.701171875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.9521208500469404, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12694808896851176, "kl": 0.0853271484375, "learning_rate": 1.427546197384766e-07, "loss": 0.0184, "num_tokens": 1336541033.0, "reward": 2.09521484375, "reward_std": 0.2001173496246338, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 790.138671875, "completions/mean_terminated_length": 790.138671875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.9524622343603312, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12226616840386459, "kl": 0.0784912109375, "learning_rate": 1.4075578453347504e-07, "loss": 0.0243, "num_tokens": 1337037760.0, "reward": 2.0732421875, "reward_std": 0.15982362627983093, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 741.8359375, "completions/mean_terminated_length": 741.8359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.952803618673722, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11423505506943062, "kl": 0.080810546875, "learning_rate": 1.3877094272235713e-07, "loss": 0.0161, "num_tokens": 1337504476.0, "reward": 2.14501953125, "reward_std": 0.15636909008026123, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.36344730854034424, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 769.59375, "completions/mean_terminated_length": 769.59375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9531450029871127, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11574495343934747, "kl": 0.0826416015625, "learning_rate": 1.3680009712224272e-07, "loss": 0.0082, "num_tokens": 1337982620.0, "reward": 2.09326171875, "reward_std": 0.1595527082681656, "rewards/accuracy_reward/mean": 0.11491935700178146, "rewards/accuracy_reward/std": 0.3192465901374817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 758.8203125, "completions/mean_terminated_length": 758.8203125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.9534863873005035, "frac_reward_zero_std": 0.5, "grad_norm": 0.12620451347119027, "kl": 0.0831298828125, "learning_rate": 1.3484325053038648e-07, "loss": 0.0241, "num_tokens": 1338454528.0, "reward": 2.10986328125, "reward_std": 0.2100493162870407, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.024685947224497795, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 816.958984375, "completions/mean_terminated_length": 816.958984375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.9538277716138943, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08818458706716598, "kl": 0.0809326171875, "learning_rate": 1.3290040572417452e-07, "loss": 0.0135, "num_tokens": 1338957611.0, "reward": 2.05078125, "reward_std": 0.10639453679323196, "rewards/accuracy_reward/mean": 0.06653226166963577, "rewards/accuracy_reward/std": 0.2494617998600006, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 793.12890625, "completions/mean_terminated_length": 790.6731567382812, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9541691559272851, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12503584874640047, "kl": 0.080810546875, "learning_rate": 1.3097156546111878e-07, "loss": 0.02, "num_tokens": 1339447421.0, "reward": 2.11181640625, "reward_std": 0.20178961753845215, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 746.677734375, "completions/mean_terminated_length": 745.7905883789062, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9545105402406759, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3466405316876794, "kl": 0.1903076171875, "learning_rate": 1.2905673247885718e-07, "loss": 0.0215, "num_tokens": 1339908056.0, "reward": 2.09326171875, "reward_std": 0.18755626678466797, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 795.07421875, "completions/mean_terminated_length": 795.07421875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9548519245540668, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10332181524367846, "kl": 0.0830078125, "learning_rate": 1.2715590949514133e-07, "loss": 0.0243, "num_tokens": 1340426638.0, "reward": 2.17041015625, "reward_std": 0.1703740656375885, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 817.89453125, "completions/mean_terminated_length": 817.89453125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9551933088674576, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0657048881378235, "kl": 0.073486328125, "learning_rate": 1.2526909920784648e-07, "loss": 0.0068, "num_tokens": 1340923352.0, "reward": 2.02978515625, "reward_std": 0.06887180358171463, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 780.560546875, "completions/mean_terminated_length": 780.560546875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9555346931808484, "frac_reward_zero_std": 0.375, "grad_norm": 0.13027404698023082, "kl": 0.0804443359375, "learning_rate": 1.2339630429495153e-07, "loss": 0.0226, "num_tokens": 1341403927.0, "reward": 2.1220703125, "reward_std": 0.23359188437461853, "rewards/accuracy_reward/mean": 0.15322580933570862, "rewards/accuracy_reward/std": 0.36056873202323914, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1674.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 818.955078125, "completions/mean_terminated_length": 818.955078125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.9558760774942391, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11878792924841342, "kl": 0.0755615234375, "learning_rate": 1.2153752741454583e-07, "loss": 0.0092, "num_tokens": 1341902592.0, "reward": 2.0302734375, "reward_std": 0.15347233414649963, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21157780289649963, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 819.26171875, "completions/mean_terminated_length": 819.26171875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.9562174618076299, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10722275097609026, "kl": 0.080078125, "learning_rate": 1.196927712048257e-07, "loss": 0.0075, "num_tokens": 1342410150.0, "reward": 2.0439453125, "reward_std": 0.16051340103149414, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.2492343932390213, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 783.7265625, "completions/mean_terminated_length": 780.5333862304688, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.9565588461210207, "frac_reward_zero_std": 0.4375, "grad_norm": 0.742071321110017, "kl": 0.4451904296875, "learning_rate": 1.1786203828408338e-07, "loss": 0.0249, "num_tokens": 1342894026.0, "reward": 2.1240234375, "reward_std": 0.21109649538993835, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.35588082671165466, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 758.140625, "completions/mean_terminated_length": 758.140625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.9569002304344115, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1159384608889156, "kl": 0.0811767578125, "learning_rate": 1.1604533125071038e-07, "loss": 0.017, "num_tokens": 1343365266.0, "reward": 2.13427734375, "reward_std": 0.17299172282218933, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 772.427734375, "completions/mean_terminated_length": 772.427734375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9572416147478023, "frac_reward_zero_std": 0.5, "grad_norm": 0.1111109238918049, "kl": 0.0787353515625, "learning_rate": 1.1424265268318857e-07, "loss": -0.0023, "num_tokens": 1343845517.0, "reward": 2.12548828125, "reward_std": 0.19251945614814758, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 777.4453125, "completions/mean_terminated_length": 777.4453125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9575829990611932, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10334543091534153, "kl": 0.0814208984375, "learning_rate": 1.1245400514009352e-07, "loss": 0.022, "num_tokens": 1344325745.0, "reward": 2.072265625, "reward_std": 0.18131525814533234, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1909.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 792.30078125, "completions/mean_terminated_length": 792.30078125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.957924383374584, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09445969165329016, "kl": 0.0799560546875, "learning_rate": 1.1067939116008008e-07, "loss": 0.0026, "num_tokens": 1344816571.0, "reward": 2.0654296875, "reward_std": 0.13874807953834534, "rewards/accuracy_reward/mean": 0.076171875, "rewards/accuracy_reward/std": 0.26553234457969666, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 757.92578125, "completions/mean_terminated_length": 757.92578125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9582657676879748, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10881915039386184, "kl": 0.080322265625, "learning_rate": 1.0891881326188903e-07, "loss": 0.0137, "num_tokens": 1345288341.0, "reward": 2.0986328125, "reward_std": 0.10624046623706818, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 827.6015625, "completions/mean_terminated_length": 827.6015625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9586071520013656, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11565468821952082, "kl": 0.0733642578125, "learning_rate": 1.0717227394433927e-07, "loss": 0.0249, "num_tokens": 1345798585.0, "reward": 2.064453125, "reward_std": 0.1817692220211029, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.02701912261545658, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 780.931640625, "completions/mean_terminated_length": 778.4520263671875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.9589485363147563, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10143861018881031, "kl": 0.081787109375, "learning_rate": 1.0543977568632347e-07, "loss": 0.005, "num_tokens": 1346281814.0, "reward": 2.06298828125, "reward_std": 0.16354745626449585, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 870.326171875, "completions/mean_terminated_length": 868.0215454101562, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.9592899206281471, "frac_reward_zero_std": 0.625, "grad_norm": 0.09240340829736615, "kl": 0.07666015625, "learning_rate": 1.0372132094680687e-07, "loss": 0.016, "num_tokens": 1346811645.0, "reward": 2.04443359375, "reward_std": 0.12814690172672272, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 761.138671875, "completions/mean_terminated_length": 758.620361328125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9596313049415379, "frac_reward_zero_std": 0.5, "grad_norm": 0.11448995291912946, "kl": 0.082763671875, "learning_rate": 1.0201691216482179e-07, "loss": 0.0129, "num_tokens": 1347295252.0, "reward": 2.12646484375, "reward_std": 0.19142305850982666, "rewards/accuracy_reward/mean": 0.142578125, "rewards/accuracy_reward/std": 0.3499840497970581, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 805.802734375, "completions/mean_terminated_length": 805.802734375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.9599726892549287, "frac_reward_zero_std": 0.5, "grad_norm": 0.11236642209967121, "kl": 0.0804443359375, "learning_rate": 1.0032655175946204e-07, "loss": 0.0185, "num_tokens": 1347793711.0, "reward": 2.01708984375, "reward_std": 0.15178494155406952, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.1843547374010086, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 785.041015625, "completions/mean_terminated_length": 785.041015625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9603140735683195, "frac_reward_zero_std": 0.5, "grad_norm": 0.1555504538632847, "kl": 0.094482421875, "learning_rate": 9.86502421298885e-08, "loss": 0.0098, "num_tokens": 1348306788.0, "reward": 2.0673828125, "reward_std": 0.16738221049308777, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 786.484375, "completions/mean_terminated_length": 786.484375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9606554578817104, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10228983532042873, "kl": 0.0791015625, "learning_rate": 9.698798565531464e-08, "loss": 0.0155, "num_tokens": 1348790924.0, "reward": 2.07275390625, "reward_std": 0.14011117815971375, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 758.716796875, "completions/mean_terminated_length": 758.716796875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9609968421951012, "frac_reward_zero_std": 0.625, "grad_norm": 0.1064888850634116, "kl": 0.0848388671875, "learning_rate": 9.533978469500881e-08, "loss": 0.0173, "num_tokens": 1349264987.0, "reward": 2.01513671875, "reward_std": 0.11714502424001694, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17416280508041382, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 747.578125, "completions/mean_terminated_length": 747.578125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.961338226508492, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2767150388601651, "kl": 0.095458984375, "learning_rate": 9.370564158829088e-08, "loss": 0.017, "num_tokens": 1349733235.0, "reward": 2.10107421875, "reward_std": 0.15091300010681152, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 807.830078125, "completions/mean_terminated_length": 807.830078125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9616796108218827, "frac_reward_zero_std": 0.5, "grad_norm": 0.1069434908544416, "kl": 0.073486328125, "learning_rate": 9.208555865452995e-08, "loss": 0.0088, "num_tokens": 1350225356.0, "reward": 2.09375, "reward_std": 0.18120335042476654, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 820.888671875, "completions/mean_terminated_length": 820.888671875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.9620209951352735, "frac_reward_zero_std": 0.5, "grad_norm": 0.11643362557580116, "kl": 0.0751953125, "learning_rate": 9.04795381931356e-08, "loss": 0.0144, "num_tokens": 1350724803.0, "reward": 2.109375, "reward_std": 0.1775430142879486, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 735.9609375, "completions/mean_terminated_length": 735.9609375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9623623794486643, "frac_reward_zero_std": 0.625, "grad_norm": 0.11046855227599103, "kl": 0.0811767578125, "learning_rate": 8.88875824835611e-08, "loss": 0.0093, "num_tokens": 1351179295.0, "reward": 2.06298828125, "reward_std": 0.12349243462085724, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 742.228515625, "completions/mean_terminated_length": 742.228515625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9627037637620551, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1232537810301965, "kl": 0.078857421875, "learning_rate": 8.730969378529575e-08, "loss": 0.0141, "num_tokens": 1351635204.0, "reward": 2.10595703125, "reward_std": 0.18588098883628845, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3265552520751953, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 706.6484375, "completions/mean_terminated_length": 706.6484375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.963045148075446, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11802926627970436, "kl": 0.0897216796875, "learning_rate": 8.574587433786363e-08, "loss": 0.0156, "num_tokens": 1352075472.0, "reward": 2.15234375, "reward_std": 0.1581362932920456, "rewards/accuracy_reward/mean": 0.169921875, "rewards/accuracy_reward/std": 0.3759314715862274, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 769.369140625, "completions/mean_terminated_length": 769.369140625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.9633865323888368, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11452752025303657, "kl": 0.08056640625, "learning_rate": 8.419612636082153e-08, "loss": 0.0175, "num_tokens": 1352559677.0, "reward": 2.1337890625, "reward_std": 0.17749150097370148, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35197147727012634, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 801.755859375, "completions/mean_terminated_length": 801.755859375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.9637279167022276, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10387007603425472, "kl": 0.0845947265625, "learning_rate": 8.266045205375328e-08, "loss": 0.0129, "num_tokens": 1353049856.0, "reward": 2.0419921875, "reward_std": 0.08450682461261749, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.2275916188955307, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 769.6796875, "completions/mean_terminated_length": 769.6796875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9640693010156184, "frac_reward_zero_std": 0.65625, "grad_norm": 0.10246007744168276, "kl": 0.078125, "learning_rate": 8.113885359626649e-08, "loss": 0.0192, "num_tokens": 1353544652.0, "reward": 2.0703125, "reward_std": 0.11874544620513916, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 777.931640625, "completions/mean_terminated_length": 777.931640625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9644106853290091, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11571689383679548, "kl": 0.08056640625, "learning_rate": 7.963133314799253e-08, "loss": -0.0022, "num_tokens": 1354029737.0, "reward": 2.09130859375, "reward_std": 0.19611677527427673, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 768.720703125, "completions/mean_terminated_length": 768.720703125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9647520696423999, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12377008934415532, "kl": 0.0809326171875, "learning_rate": 7.813789284857987e-08, "loss": 0.0125, "num_tokens": 1354510474.0, "reward": 2.09814453125, "reward_std": 0.21138444542884827, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 766.421875, "completions/mean_terminated_length": 766.421875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.9650934539557907, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09492227977922468, "kl": 0.079833984375, "learning_rate": 7.665853481769625e-08, "loss": 0.0167, "num_tokens": 1354981186.0, "reward": 2.04736328125, "reward_std": 0.11074879765510559, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 764.91796875, "completions/mean_terminated_length": 762.4070434570312, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9654348382691815, "frac_reward_zero_std": 0.5, "grad_norm": 0.12516845955943467, "kl": 0.0819091796875, "learning_rate": 7.519326115501879e-08, "loss": 0.0085, "num_tokens": 1355478520.0, "reward": 2.07763671875, "reward_std": 0.18903864920139313, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1663.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 759.80078125, "completions/mean_terminated_length": 759.80078125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.9657762225825723, "frac_reward_zero_std": 0.5, "grad_norm": 0.1073901574892401, "kl": 0.081787109375, "learning_rate": 7.374207394023391e-08, "loss": 0.0153, "num_tokens": 1355953314.0, "reward": 2.18896484375, "reward_std": 0.20335794985294342, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 770.736328125, "completions/mean_terminated_length": 770.736328125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9661176068959632, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09325079601774694, "kl": 0.079345703125, "learning_rate": 7.230497523303847e-08, "loss": 0.0108, "num_tokens": 1356434779.0, "reward": 2.08251953125, "reward_std": 0.11480261385440826, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 769.0234375, "completions/mean_terminated_length": 769.0234375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.966458991209354, "frac_reward_zero_std": 0.5, "grad_norm": 0.11101954481303825, "kl": 0.0789794921875, "learning_rate": 7.088196707312978e-08, "loss": 0.017, "num_tokens": 1356910087.0, "reward": 2.11865234375, "reward_std": 0.1712564080953598, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 746.06640625, "completions/mean_terminated_length": 746.06640625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9668003755227448, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12128416280805214, "kl": 0.0836181640625, "learning_rate": 6.947305148020888e-08, "loss": 0.0096, "num_tokens": 1357375593.0, "reward": 2.064453125, "reward_std": 0.160628080368042, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 791.162109375, "completions/mean_terminated_length": 786.2333984375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.9671417598361355, "frac_reward_zero_std": 0.5, "grad_norm": 0.10916186152482159, "kl": 0.079833984375, "learning_rate": 6.807823045397288e-08, "loss": 0.015, "num_tokens": 1357855804.0, "reward": 2.11328125, "reward_std": 0.19796186685562134, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 791.44921875, "completions/mean_terminated_length": 791.44921875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.9674831441495263, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11275093059934573, "kl": 0.0806884765625, "learning_rate": 6.669750597411594e-08, "loss": 0.0233, "num_tokens": 1358342530.0, "reward": 2.07470703125, "reward_std": 0.1563636064529419, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 787.9296875, "completions/mean_terminated_length": 785.4638061523438, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9678245284629171, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10637006514985294, "kl": 0.082763671875, "learning_rate": 6.533088000032273e-08, "loss": 0.0166, "num_tokens": 1358819438.0, "reward": 2.0400390625, "reward_std": 0.1309126764535904, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 798.189453125, "completions/mean_terminated_length": 798.189453125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9681659127763079, "frac_reward_zero_std": 0.5, "grad_norm": 0.2962484933990216, "kl": 0.098876953125, "learning_rate": 6.39783544722694e-08, "loss": 0.0063, "num_tokens": 1359313919.0, "reward": 2.11865234375, "reward_std": 0.1876090168952942, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33542385697364807, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 825.61328125, "completions/mean_terminated_length": 825.61328125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9685072970896987, "frac_reward_zero_std": 0.65625, "grad_norm": 0.08786407969016359, "kl": 0.077880859375, "learning_rate": 6.263993130961931e-08, "loss": 0.0126, "num_tokens": 1359818937.0, "reward": 2.0380859375, "reward_std": 0.13053306937217712, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21976542472839355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 833.615234375, "completions/mean_terminated_length": 833.615234375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9688486814030896, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09484493998929291, "kl": 0.0753173828125, "learning_rate": 6.131561241201844e-08, "loss": 0.0092, "num_tokens": 1360333956.0, "reward": 2.0771484375, "reward_std": 0.101016566157341, "rewards/accuracy_reward/mean": 0.083984375, "rewards/accuracy_reward/std": 0.2776356339454651, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 807.2890625, "completions/mean_terminated_length": 807.2890625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.9691900657164804, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11603310420787072, "kl": 0.07763671875, "learning_rate": 6.000539965909547e-08, "loss": 0.0196, "num_tokens": 1360829256.0, "reward": 2.08056640625, "reward_std": 0.193034365773201, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29713961482048035, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 779.0390625, "completions/mean_terminated_length": 779.0390625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9695314500298712, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11973155023025858, "kl": 0.0831298828125, "learning_rate": 5.870929491045729e-08, "loss": 0.0064, "num_tokens": 1361316476.0, "reward": 2.076171875, "reward_std": 0.15304479002952576, "rewards/accuracy_reward/mean": 0.087890625, "rewards/accuracy_reward/std": 0.2834126651287079, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 798.41796875, "completions/mean_terminated_length": 795.9725952148438, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9698728343432619, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10552900281216986, "kl": 0.0792236328125, "learning_rate": 5.742730000568908e-08, "loss": 0.0296, "num_tokens": 1361811858.0, "reward": 2.13818359375, "reward_std": 0.15528997778892517, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.3671095669269562, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.05729745700955391, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 788.35546875, "completions/mean_terminated_length": 788.35546875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9702142186566527, "frac_reward_zero_std": 0.6875, "grad_norm": 0.096028977262835, "kl": 0.0843505859375, "learning_rate": 5.615941676434644e-08, "loss": 0.013, "num_tokens": 1362309848.0, "reward": 2.0478515625, "reward_std": 0.12481817603111267, "rewards/accuracy_reward/mean": 0.060546875, "rewards/accuracy_reward/std": 0.2387305200099945, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 760.73046875, "completions/mean_terminated_length": 760.73046875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9705556029700435, "frac_reward_zero_std": 0.375, "grad_norm": 0.1300490031511393, "kl": 0.0823974609375, "learning_rate": 5.490564698595879e-08, "loss": 0.0121, "num_tokens": 1362781086.0, "reward": 2.11669921875, "reward_std": 0.2178841382265091, "rewards/accuracy_reward/mean": 0.134765625, "rewards/accuracy_reward/std": 0.3418070077896118, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 800.150390625, "completions/mean_terminated_length": 800.150390625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9708969872834343, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10355683223157831, "kl": 0.080322265625, "learning_rate": 5.3665992450022685e-08, "loss": 0.0147, "num_tokens": 1363276523.0, "reward": 2.08251953125, "reward_std": 0.1574598252773285, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15143637359142303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 755.42578125, "completions/mean_terminated_length": 755.42578125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9712383715968251, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12428481750211559, "kl": 0.0892333984375, "learning_rate": 5.2440454915999585e-08, "loss": 0.0136, "num_tokens": 1363750661.0, "reward": 2.11474609375, "reward_std": 0.17559076845645905, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 765.296875, "completions/mean_terminated_length": 765.296875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.971579755910216, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09008177537634678, "kl": 0.08056640625, "learning_rate": 5.1229036123319196e-08, "loss": 0.013, "num_tokens": 1364226397.0, "reward": 2.06982421875, "reward_std": 0.11514122784137726, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26863065361976624, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 723.5859375, "completions/mean_terminated_length": 720.994140625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9719211402236068, "frac_reward_zero_std": 0.5, "grad_norm": 0.1138328534961306, "kl": 0.0831298828125, "learning_rate": 5.0031737791365055e-08, "loss": 0.0147, "num_tokens": 1364680777.0, "reward": 2.14208984375, "reward_std": 0.20317105948925018, "rewards/accuracy_reward/mean": 0.158203125, "rewards/accuracy_reward/std": 0.36528825759887695, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 776.662109375, "completions/mean_terminated_length": 776.662109375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.9722625245369976, "frac_reward_zero_std": 0.5, "grad_norm": 0.10656437200893537, "kl": 0.0787353515625, "learning_rate": 4.88485616194867e-08, "loss": 0.0237, "num_tokens": 1365166412.0, "reward": 2.109375, "reward_std": 0.18276527523994446, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 777.533203125, "completions/mean_terminated_length": 776.4520263671875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.9726039088503883, "frac_reward_zero_std": 0.5, "grad_norm": 0.40924554312425876, "kl": 0.09765625, "learning_rate": 4.767950928698306e-08, "loss": 0.0195, "num_tokens": 1365658397.0, "reward": 2.12939453125, "reward_std": 0.19214993715286255, "rewards/accuracy_reward/mean": 0.14717741310596466, "rewards/accuracy_reward/std": 0.3546403646469116, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 783.830078125, "completions/mean_terminated_length": 783.830078125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9729452931637791, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11907834442751615, "kl": 0.0791015625, "learning_rate": 4.652458245311242e-08, "loss": 0.0132, "num_tokens": 1366143142.0, "reward": 2.13720703125, "reward_std": 0.23103174567222595, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 781.431640625, "completions/mean_terminated_length": 781.431640625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9732866774771699, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11219295765663972, "kl": 0.0777587890625, "learning_rate": 4.538378275708133e-08, "loss": 0.0147, "num_tokens": 1366649779.0, "reward": 2.0791015625, "reward_std": 0.16843992471694946, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 794.94140625, "completions/mean_terminated_length": 794.94140625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9736280617905607, "frac_reward_zero_std": 0.5, "grad_norm": 0.12131810593058934, "kl": 0.0819091796875, "learning_rate": 4.4257111818046814e-08, "loss": 0.0116, "num_tokens": 1367150949.0, "reward": 2.06884765625, "reward_std": 0.17992445826530457, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 762.748046875, "completions/mean_terminated_length": 762.748046875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.9739694461039515, "frac_reward_zero_std": 0.5, "grad_norm": 0.1272588074619021, "kl": 0.090576171875, "learning_rate": 4.314457123511084e-08, "loss": 0.0198, "num_tokens": 1367628452.0, "reward": 2.13720703125, "reward_std": 0.18482574820518494, "rewards/accuracy_reward/mean": 0.154296875, "rewards/accuracy_reward/std": 0.36158639192581177, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 761.91796875, "completions/mean_terminated_length": 761.91796875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9743108304173423, "frac_reward_zero_std": 0.6875, "grad_norm": 0.11191308274417737, "kl": 0.081298828125, "learning_rate": 4.2046162587323636e-08, "loss": 0.0181, "num_tokens": 1368106810.0, "reward": 2.00830078125, "reward_std": 0.09329229593276978, "rewards/accuracy_reward/mean": 0.021484375, "rewards/accuracy_reward/std": 0.14513419568538666, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 742.828125, "completions/mean_terminated_length": 742.828125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9746522147307332, "frac_reward_zero_std": 0.5, "grad_norm": 0.10949071734109236, "kl": 0.077880859375, "learning_rate": 4.096188743367258e-08, "loss": 0.0185, "num_tokens": 1368569778.0, "reward": 2.13232421875, "reward_std": 0.20447321236133575, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 763.037109375, "completions/mean_terminated_length": 763.037109375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.974993599044124, "frac_reward_zero_std": 0.5, "grad_norm": 0.10955626418812267, "kl": 0.0810546875, "learning_rate": 3.989174731308998e-08, "loss": 0.0161, "num_tokens": 1369041957.0, "reward": 2.1396484375, "reward_std": 0.21840211749076843, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.35970520973205566, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 733.24609375, "completions/mean_terminated_length": 732.5107421875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9753349833575147, "frac_reward_zero_std": 0.46875, "grad_norm": 0.40726841347369097, "kl": 0.3304443359375, "learning_rate": 3.88357437444431e-08, "loss": 0.0213, "num_tokens": 1369512099.0, "reward": 2.138671875, "reward_std": 0.20921343564987183, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 738.84765625, "completions/mean_terminated_length": 738.84765625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9756763676709055, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09290935593751913, "kl": 0.0797119140625, "learning_rate": 3.779387822653635e-08, "loss": 0.0108, "num_tokens": 1369971973.0, "reward": 2.087890625, "reward_std": 0.11586494743824005, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29176566004753113, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 780.3359375, "completions/mean_terminated_length": 780.3359375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.9760177519842963, "frac_reward_zero_std": 0.53125, "grad_norm": 0.109802370352298, "kl": 0.0828857421875, "learning_rate": 3.6766152238106866e-08, "loss": 0.008, "num_tokens": 1370461153.0, "reward": 2.080078125, "reward_std": 0.1699105054140091, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 812.8203125, "completions/mean_terminated_length": 812.8203125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.9763591362976871, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11865262659170554, "kl": 0.0784912109375, "learning_rate": 3.57525672378245e-08, "loss": 0.0059, "num_tokens": 1370957877.0, "reward": 2.076171875, "reward_std": 0.20099163055419922, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 703.619140625, "completions/mean_terminated_length": 703.619140625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9767005206110779, "frac_reward_zero_std": 0.625, "grad_norm": 0.10794797317046803, "kl": 0.0838623046875, "learning_rate": 3.4753124664286264e-08, "loss": 0.0109, "num_tokens": 1371399090.0, "reward": 2.11572265625, "reward_std": 0.13899493217468262, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310423493385315, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 768.994140625, "completions/mean_terminated_length": 768.994140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9770419049244687, "frac_reward_zero_std": 0.71875, "grad_norm": 0.07194467605521221, "kl": 0.080810546875, "learning_rate": 3.376782593601968e-08, "loss": -0.0035, "num_tokens": 1371879583.0, "reward": 2.10400390625, "reward_std": 0.10936112701892853, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.3172462284564972, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 766.255859375, "completions/mean_terminated_length": 766.255859375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.9773832892378596, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10755182393210383, "kl": 0.0828857421875, "learning_rate": 3.279667245147389e-08, "loss": 0.0184, "num_tokens": 1372354994.0, "reward": 2.1181640625, "reward_std": 0.16272109746932983, "rewards/accuracy_reward/mean": 0.126953125, "rewards/accuracy_reward/std": 0.33324605226516724, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 784.978515625, "completions/mean_terminated_length": 784.978515625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.9777246735512504, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11379593691915814, "kl": 0.0792236328125, "learning_rate": 3.183966558902629e-08, "loss": 0.0141, "num_tokens": 1372840775.0, "reward": 2.1875, "reward_std": 0.19858019053936005, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40557438135147095, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 783.134765625, "completions/mean_terminated_length": 783.134765625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9780660578646411, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10811161273230888, "kl": 0.08447265625, "learning_rate": 3.089680670697148e-08, "loss": 0.0146, "num_tokens": 1373316956.0, "reward": 2.16943359375, "reward_std": 0.16540616750717163, "rewards/accuracy_reward/mean": 0.185546875, "rewards/accuracy_reward/std": 0.38912075757980347, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 769.68359375, "completions/mean_terminated_length": 769.68359375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9784074421780319, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11580517616698506, "kl": 0.08349609375, "learning_rate": 2.996809714352678e-08, "loss": 0.0151, "num_tokens": 1373828138.0, "reward": 2.1533203125, "reward_std": 0.19046896696090698, "rewards/accuracy_reward/mean": 0.16733871400356293, "rewards/accuracy_reward/std": 0.37365487217903137, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 844.1640625, "completions/mean_terminated_length": 844.1640625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9787488264914227, "frac_reward_zero_std": 0.59375, "grad_norm": 0.0907146224950526, "kl": 0.0755615234375, "learning_rate": 2.905353821682333e-08, "loss": 0.0141, "num_tokens": 1374341390.0, "reward": 2.09130859375, "reward_std": 0.13747742772102356, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 770.896484375, "completions/mean_terminated_length": 765.8883056640625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.9790902108048135, "frac_reward_zero_std": 0.5, "grad_norm": 0.1280376681735884, "kl": 0.0831298828125, "learning_rate": 2.8153131224911702e-08, "loss": 0.0236, "num_tokens": 1374828793.0, "reward": 2.11962890625, "reward_std": 0.20201072096824646, "rewards/accuracy_reward/mean": 0.14919355511665344, "rewards/accuracy_reward/std": 0.3566388487815857, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.059313252568244934, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1976.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 790.39453125, "completions/mean_terminated_length": 790.39453125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9794315951182043, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1336475481915943, "kl": 0.0816650390625, "learning_rate": 2.726687744575407e-08, "loss": 0.0227, "num_tokens": 1375312595.0, "reward": 2.10205078125, "reward_std": 0.19774317741394043, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.32195815443992615, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 733.71875, "completions/mean_terminated_length": 733.71875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.9797729794315951, "frac_reward_zero_std": 0.59375, "grad_norm": 0.121390208618322, "kl": 0.084716796875, "learning_rate": 2.6394778137226462e-08, "loss": 0.0215, "num_tokens": 1375781123.0, "reward": 2.021484375, "reward_std": 0.12141762673854828, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19691328704357147, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 803.78515625, "completions/mean_terminated_length": 803.78515625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.980114363744986, "frac_reward_zero_std": 0.46875, "grad_norm": 0.10782930985865093, "kl": 0.0721435546875, "learning_rate": 2.5536834537114308e-08, "loss": 0.01, "num_tokens": 1376277541.0, "reward": 2.06494140625, "reward_std": 0.1847134828567505, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 786.68359375, "completions/mean_terminated_length": 786.68359375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9804557480583768, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09663896761274555, "kl": 0.07373046875, "learning_rate": 2.469304786311022e-08, "loss": 0.006, "num_tokens": 1376762531.0, "reward": 2.05810546875, "reward_std": 0.14223355054855347, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2623828947544098, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 722.36328125, "completions/mean_terminated_length": 719.7691040039062, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.9807971323717675, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11627443042808593, "kl": 0.082275390625, "learning_rate": 2.3863419312817325e-08, "loss": 0.0123, "num_tokens": 1377212813.0, "reward": 2.0830078125, "reward_std": 0.17120620608329773, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 770.72265625, "completions/mean_terminated_length": 770.72265625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9811385166851583, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10815010745835144, "kl": 0.08203125, "learning_rate": 2.304795006373928e-08, "loss": 0.0258, "num_tokens": 1377687599.0, "reward": 2.0849609375, "reward_std": 0.15538455545902252, "rewards/accuracy_reward/mean": 0.103515625, "rewards/accuracy_reward/std": 0.30492907762527466, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 761.154296875, "completions/mean_terminated_length": 758.635986328125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9814799009985491, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1415143157755031, "kl": 0.082763671875, "learning_rate": 2.2246641273286907e-08, "loss": 0.0243, "num_tokens": 1378170206.0, "reward": 2.0771484375, "reward_std": 0.2614050805568695, "rewards/accuracy_reward/mean": 0.11290322244167328, "rewards/accuracy_reward/std": 0.3167939782142639, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04396656155586243, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 827.630859375, "completions/mean_terminated_length": 827.630859375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.9818212853119399, "frac_reward_zero_std": 0.46875, "grad_norm": 0.11337080920773795, "kl": 0.07763671875, "learning_rate": 2.145949407877157e-08, "loss": 0.0143, "num_tokens": 1378673585.0, "reward": 2.06689453125, "reward_std": 0.17279216647148132, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2746807038784027, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 774.001953125, "completions/mean_terminated_length": 774.001953125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9821626696253307, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1074215060566103, "kl": 0.0838623046875, "learning_rate": 2.0686509597404037e-08, "loss": 0.0124, "num_tokens": 1379151586.0, "reward": 2.1123046875, "reward_std": 0.18640004098415375, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.33970388770103455, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 762.306640625, "completions/mean_terminated_length": 762.306640625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9825040539387215, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14602603439218606, "kl": 0.0841064453125, "learning_rate": 1.9927688926295595e-08, "loss": 0.0018, "num_tokens": 1379623775.0, "reward": 2.126953125, "reward_std": 0.18988856673240662, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 704.943359375, "completions/mean_terminated_length": 702.3150634765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9828454382521123, "frac_reward_zero_std": 0.59375, "grad_norm": 0.11403122282577474, "kl": 0.0809326171875, "learning_rate": 1.91830331424514e-08, "loss": 0.0133, "num_tokens": 1380067970.0, "reward": 2.13427734375, "reward_std": 0.14482755959033966, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 770.376953125, "completions/mean_terminated_length": 770.376953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9831868225655032, "frac_reward_zero_std": 0.625, "grad_norm": 0.10103969169718693, "kl": 0.0780029296875, "learning_rate": 1.8452543302776016e-08, "loss": 0.0064, "num_tokens": 1380536355.0, "reward": 2.0546875, "reward_std": 0.13239669799804688, "rewards/accuracy_reward/mean": 0.07661290466785431, "rewards/accuracy_reward/std": 0.2662447690963745, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1736.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 761.955078125, "completions/mean_terminated_length": 761.955078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9835282068788939, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0842403246094216, "kl": 0.07666015625, "learning_rate": 1.7736220444064535e-08, "loss": 0.0053, "num_tokens": 1381014252.0, "reward": 2.10498046875, "reward_std": 0.11270871013402939, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 760.875, "completions/mean_terminated_length": 760.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9838695911922847, "frac_reward_zero_std": 0.625, "grad_norm": 0.09924389192913623, "kl": 0.08056640625, "learning_rate": 1.7034065583008132e-08, "loss": 0.0147, "num_tokens": 1381483340.0, "reward": 2.08740234375, "reward_std": 0.12057200074195862, "rewards/accuracy_reward/mean": 0.095703125, "rewards/accuracy_reward/std": 0.2944713830947876, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 787.89453125, "completions/mean_terminated_length": 785.4285888671875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9842109755056755, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08467387591184151, "kl": 0.0770263671875, "learning_rate": 1.6346079716185182e-08, "loss": 0.0104, "num_tokens": 1381974742.0, "reward": 2.11474609375, "reward_std": 0.12450353801250458, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3310528099536896, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 830.875, "completions/mean_terminated_length": 830.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.9845523598190663, "frac_reward_zero_std": 0.59375, "grad_norm": 0.09050112984877703, "kl": 0.07177734375, "learning_rate": 1.5672263820066814e-08, "loss": -0.0004, "num_tokens": 1382491798.0, "reward": 2.09228515625, "reward_std": 0.1566058248281479, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 806.625, "completions/mean_terminated_length": 804.1956787109375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9848937441324571, "frac_reward_zero_std": 0.5, "grad_norm": 0.11659702089788575, "kl": 0.0831298828125, "learning_rate": 1.501261885101246e-08, "loss": 0.0221, "num_tokens": 1382993510.0, "reward": 2.09326171875, "reward_std": 0.18465019762516022, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31241437792778015, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 784.337890625, "completions/mean_terminated_length": 783.8571166992188, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9852351284458479, "frac_reward_zero_std": 0.5, "grad_norm": 0.5924063618953188, "kl": 0.1103515625, "learning_rate": 1.4367145745265431e-08, "loss": 0.0247, "num_tokens": 1383481235.0, "reward": 2.0849609375, "reward_std": 0.20358715951442719, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 775.017578125, "completions/mean_terminated_length": 775.017578125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.9855765127592387, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1068557869425238, "kl": 0.0770263671875, "learning_rate": 1.3735845418958449e-08, "loss": 0.0161, "num_tokens": 1383958764.0, "reward": 2.0810546875, "reward_std": 0.16435372829437256, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 706.13671875, "completions/mean_terminated_length": 704.379638671875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9859178970726296, "frac_reward_zero_std": 0.53125, "grad_norm": 0.8691083096318905, "kl": 0.1837158203125, "learning_rate": 1.3118718768105887e-08, "loss": 0.0269, "num_tokens": 1384403170.0, "reward": 2.1279296875, "reward_std": 0.17970260977745056, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3479743003845215, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 776.693359375, "completions/mean_terminated_length": 776.693359375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9862592813860203, "frac_reward_zero_std": 0.625, "grad_norm": 0.10924257845032712, "kl": 0.07568359375, "learning_rate": 1.2515766668605989e-08, "loss": 0.0093, "num_tokens": 1384889141.0, "reward": 2.0810546875, "reward_std": 0.13620877265930176, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.06243881583213806, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 770.298828125, "completions/mean_terminated_length": 769.383544921875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9866006656994111, "frac_reward_zero_std": 0.59375, "grad_norm": 0.28365884544021835, "kl": 0.35595703125, "learning_rate": 1.1926989976240866e-08, "loss": 0.022, "num_tokens": 1385364958.0, "reward": 2.0830078125, "reward_std": 0.15766692161560059, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.051642172038555145, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 771.759765625, "completions/mean_terminated_length": 771.759765625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9869420500128019, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0810583654063685, "kl": 0.0765380859375, "learning_rate": 1.1352389526668728e-08, "loss": 0.0092, "num_tokens": 1385843683.0, "reward": 2.12744140625, "reward_std": 0.13209691643714905, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.04808502271771431, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 829.896484375, "completions/mean_terminated_length": 829.896484375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9872834343261927, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1005429488745976, "kl": 0.0799560546875, "learning_rate": 1.0791966135431654e-08, "loss": 0.0148, "num_tokens": 1386352654.0, "reward": 2.07958984375, "reward_std": 0.12743695080280304, "rewards/accuracy_reward/mean": 0.09879032522439957, "rewards/accuracy_reward/std": 0.2986815273761749, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 805.689453125, "completions/mean_terminated_length": 805.689453125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.9876248186395835, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08922278222561877, "kl": 0.077392578125, "learning_rate": 1.0245720597946706e-08, "loss": 0.0063, "num_tokens": 1386853951.0, "reward": 2.09375, "reward_std": 0.11568198353052139, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.994140625, "rewards/format_reward/std": 0.07639661431312561, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 763.814453125, "completions/mean_terminated_length": 763.814453125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9879662029529743, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1130999390084287, "kl": 0.086181640625, "learning_rate": 9.713653689510383e-09, "loss": 0.0169, "num_tokens": 1387326432.0, "reward": 2.14990234375, "reward_std": 0.19901999831199646, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3776407241821289, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 777.724609375, "completions/mean_terminated_length": 775.23876953125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.9883075872663651, "frac_reward_zero_std": 0.5, "grad_norm": 0.12449066019748421, "kl": 0.0865478515625, "learning_rate": 9.195766165295272e-09, "loss": 0.0189, "num_tokens": 1387809331.0, "reward": 2.0908203125, "reward_std": 0.20735862851142883, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.974609375, "rewards/format_reward/std": 0.15746226906776428, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 677.818359375, "completions/mean_terminated_length": 677.818359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.988648971579756, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12952086785787376, "kl": 0.086181640625, "learning_rate": 8.692058760345623e-09, "loss": 0.0305, "num_tokens": 1388231894.0, "reward": 2.0947265625, "reward_std": 0.20704473555088043, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 761.64453125, "completions/mean_terminated_length": 761.64453125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9889903558931467, "frac_reward_zero_std": 0.75, "grad_norm": 0.06769060004359177, "kl": 0.07568359375, "learning_rate": 8.20253218958511e-09, "loss": 0.0054, "num_tokens": 1388700880.0, "reward": 2.10791015625, "reward_std": 0.09845314174890518, "rewards/accuracy_reward/mean": 0.111328125, "rewards/accuracy_reward/std": 0.31484565138816833, "rewards/format_reward/mean": 0.998046875, "rewards/format_reward/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.03314562886953354, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 798.1875, "completions/mean_terminated_length": 798.1875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.9893317402065375, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08588665933406728, "kl": 0.0765380859375, "learning_rate": 7.727187147803516e-09, "loss": 0.0218, "num_tokens": 1389203376.0, "reward": 2.013671875, "reward_std": 0.08813370764255524, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.16324250400066376, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03491804376244545, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 787.677734375, "completions/mean_terminated_length": 787.677734375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.9896731245199283, "frac_reward_zero_std": 0.65625, "grad_norm": 0.09600173554390568, "kl": 0.0784912109375, "learning_rate": 7.26602430966894e-09, "loss": 0.012, "num_tokens": 1389682907.0, "reward": 2.08056640625, "reward_std": 0.12259502708911896, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2862374484539032, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 803.2265625, "completions/mean_terminated_length": 803.2265625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9900145088333191, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10410407384480717, "kl": 0.0791015625, "learning_rate": 6.819044329716695e-09, "loss": 0.0061, "num_tokens": 1390173855.0, "reward": 2.07080078125, "reward_std": 0.1623360514640808, "rewards/accuracy_reward/mean": 0.091796875, "rewards/accuracy_reward/std": 0.289021372795105, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99462890625, "rewards/tag_count_reward/std": 0.055034760385751724, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 856.576171875, "completions/mean_terminated_length": 856.576171875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.9903558931467099, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09051816360814072, "kl": 0.07421875, "learning_rate": 6.386247842353755e-09, "loss": 0.0128, "num_tokens": 1390698806.0, "reward": 2.02001953125, "reward_std": 0.11058424413204193, "rewards/accuracy_reward/mean": 0.037109375, "rewards/accuracy_reward/std": 0.18921469151973724, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 802.6328125, "completions/mean_terminated_length": 802.6328125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9906972774601007, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12488894096087517, "kl": 0.0802001953125, "learning_rate": 5.967635461854304e-09, "loss": 0.0146, "num_tokens": 1391193434.0, "reward": 2.07763671875, "reward_std": 0.09719854593276978, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 817.966796875, "completions/mean_terminated_length": 815.5596923828125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9910386617734915, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10402330505958786, "kl": 0.0775146484375, "learning_rate": 5.563207782363078e-09, "loss": 0.0097, "num_tokens": 1391706153.0, "reward": 2.048828125, "reward_std": 0.13361230492591858, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.2422981858253479, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04119514673948288, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 742.251953125, "completions/mean_terminated_length": 742.251953125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.9913800460868823, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12250308762985097, "kl": 0.08251953125, "learning_rate": 5.172965377890915e-09, "loss": 0.0111, "num_tokens": 1392171098.0, "reward": 2.06396484375, "reward_std": 0.16945436596870422, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28054583072662354, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 780.9609375, "completions/mean_terminated_length": 780.9609375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.9917214304002732, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09705909340956163, "kl": 0.072509765625, "learning_rate": 4.79690880231587e-09, "loss": 0.0189, "num_tokens": 1392645526.0, "reward": 2.05029296875, "reward_std": 0.15216603875160217, "rewards/accuracy_reward/mean": 0.064453125, "rewards/accuracy_reward/std": 0.24579854309558868, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1650.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 745.669921875, "completions/mean_terminated_length": 745.669921875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9920628147136639, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12680546398444922, "kl": 0.08154296875, "learning_rate": 4.435038589380991e-09, "loss": 0.0099, "num_tokens": 1393112653.0, "reward": 2.18994140625, "reward_std": 0.16484864056110382, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.4027182459831238, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 694.4453125, "completions/mean_terminated_length": 694.4453125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9924041990270547, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11603922474180073, "kl": 0.0872802734375, "learning_rate": 4.087355252694325e-09, "loss": 0.0072, "num_tokens": 1393552081.0, "reward": 2.12353515625, "reward_std": 0.18750609457492828, "rewards/accuracy_reward/mean": 0.138671875, "rewards/accuracy_reward/std": 0.34594178199768066, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 825.453125, "completions/mean_terminated_length": 818.24755859375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9927455833404455, "frac_reward_zero_std": 0.375, "grad_norm": 0.12487904460610164, "kl": 0.077392578125, "learning_rate": 3.753859285730022e-09, "loss": 0.0293, "num_tokens": 1394054505.0, "reward": 2.09375, "reward_std": 0.20864024758338928, "rewards/accuracy_reward/mean": 0.119140625, "rewards/accuracy_reward/std": 0.32427072525024414, "rewards/format_reward/mean": 0.978515625, "rewards/format_reward/std": 0.14513419568538666, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.031035220250487328, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 787.2265625, "completions/mean_terminated_length": 787.2265625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9930869676538363, "frac_reward_zero_std": 0.5, "grad_norm": 0.11516269657952301, "kl": 0.0797119140625, "learning_rate": 3.4345511618238957e-09, "loss": 0.0174, "num_tokens": 1394543709.0, "reward": 2.09228515625, "reward_std": 0.17656370997428894, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99658203125, "rewards/tag_count_reward/std": 0.03972800448536873, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 778.107421875, "completions/mean_terminated_length": 776.7182006835938, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9934283519672271, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3443649766769348, "kl": 0.0982666015625, "learning_rate": 3.129431334175648e-09, "loss": 0.0329, "num_tokens": 1395033684.0, "reward": 2.11474609375, "reward_std": 0.20991787314414978, "rewards/accuracy_reward/mean": 0.146484375, "rewards/accuracy_reward/std": 0.35393697023391724, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.16324250400066376, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.04260620102286339, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 782.6015625, "completions/mean_terminated_length": 782.6015625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9937697362806179, "frac_reward_zero_std": 0.53125, "grad_norm": 0.12301114676381447, "kl": 0.0780029296875, "learning_rate": 2.8385002358466417e-09, "loss": 0.0053, "num_tokens": 1395524152.0, "reward": 2.103515625, "reward_std": 0.171902135014534, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.32641899585723877, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 714.876953125, "completions/mean_terminated_length": 714.876953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9941111205940087, "frac_reward_zero_std": 0.53125, "grad_norm": 0.11373851773465948, "kl": 0.08544921875, "learning_rate": 2.5617582797610174e-09, "loss": 0.026, "num_tokens": 1395971001.0, "reward": 2.08349609375, "reward_std": 0.17116190493106842, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.30236753821372986, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 794.6015625, "completions/mean_terminated_length": 794.6015625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.9944525049073996, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11380928184475643, "kl": 0.0804443359375, "learning_rate": 2.299205858702358e-09, "loss": 0.0148, "num_tokens": 1396454045.0, "reward": 2.11669921875, "reward_std": 0.15615960955619812, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34429675340652466, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12414088100194931, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.011048543266952038, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 835.59765625, "completions/mean_terminated_length": 833.2250366210938, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9947938892207903, "frac_reward_zero_std": 0.5, "grad_norm": 0.10149774742284243, "kl": 0.074462890625, "learning_rate": 2.0508433453170218e-09, "loss": 0.0245, "num_tokens": 1396964895.0, "reward": 2.08154296875, "reward_std": 0.1786787509918213, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3074568510055542, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99560546875, "rewards/tag_count_reward/std": 0.050489041954278946, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 808.962890625, "completions/mean_terminated_length": 808.962890625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9951352735341811, "frac_reward_zero_std": 0.625, "grad_norm": 0.09026605662138586, "kl": 0.0748291015625, "learning_rate": 1.8166710921097008e-09, "loss": 0.0103, "num_tokens": 1397456716.0, "reward": 2.08642578125, "reward_std": 0.14233165979385376, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 838.197265625, "completions/mean_terminated_length": 835.8297119140625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.9954766578475719, "frac_reward_zero_std": 0.59375, "grad_norm": 0.0990546564513383, "kl": 0.0794677734375, "learning_rate": 1.5966894314456416e-09, "loss": 0.0194, "num_tokens": 1397977025.0, "reward": 2.06494140625, "reward_std": 0.1546308547258377, "rewards/accuracy_reward/mean": 0.080078125, "rewards/accuracy_reward/std": 0.271679550409317, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 735.02734375, "completions/mean_terminated_length": 735.02734375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.9958180421609627, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09306644198204554, "kl": 0.08642578125, "learning_rate": 1.3908986755473142e-09, "loss": 0.0144, "num_tokens": 1398435407.0, "reward": 2.0849609375, "reward_std": 0.1204758882522583, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015609703958034515, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 774.666015625, "completions/mean_terminated_length": 774.666015625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.9961594264743535, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11717990565862879, "kl": 0.08349609375, "learning_rate": 1.199299116497743e-09, "loss": 0.0232, "num_tokens": 1398909236.0, "reward": 2.056640625, "reward_std": 0.13060790300369263, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 794.30078125, "completions/mean_terminated_length": 794.30078125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9965008107877443, "frac_reward_zero_std": 0.375, "grad_norm": 0.11873912143398956, "kl": 0.0755615234375, "learning_rate": 1.0218910262371762e-09, "loss": 0.0179, "num_tokens": 1399398206.0, "reward": 2.14208984375, "reward_std": 0.23764190077781677, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.03659820929169655, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 833.984375, "completions/mean_terminated_length": 833.984375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.9968421951011351, "frac_reward_zero_std": 0.53125, "grad_norm": 0.10545064088363902, "kl": 0.0784912109375, "learning_rate": 8.586746565641957e-10, "loss": 0.0003, "num_tokens": 1399921494.0, "reward": 2.12548828125, "reward_std": 0.1864640712738037, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.3438861668109894, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 740.46484375, "completions/mean_terminated_length": 740.46484375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.997183579414526, "frac_reward_zero_std": 0.59375, "grad_norm": 0.10125262138426178, "kl": 0.0797119140625, "learning_rate": 7.096502391346072e-10, "loss": 0.0078, "num_tokens": 1400381748.0, "reward": 2.06103515625, "reward_std": 0.15187877416610718, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.25592297315597534, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08812850713729858, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 813.0234375, "completions/mean_terminated_length": 813.0234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9975249637279167, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09890654551815842, "kl": 0.0748291015625, "learning_rate": 5.748179854614399e-10, "loss": 0.0172, "num_tokens": 1400881472.0, "reward": 2.0908203125, "reward_std": 0.1693429946899414, "rewards/accuracy_reward/mean": 0.107421875, "rewards/accuracy_reward/std": 0.30995169281959534, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.038198307156562805, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 737.50390625, "completions/mean_terminated_length": 737.50390625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9978663480413075, "frac_reward_zero_std": 0.625, "grad_norm": 0.10499893723997014, "kl": 0.081787109375, "learning_rate": 4.541780869138368e-10, "loss": 0.0114, "num_tokens": 1401334930.0, "reward": 2.06103515625, "reward_std": 0.1196913868188858, "rewards/accuracy_reward/mean": 0.072265625, "rewards/accuracy_reward/std": 0.2591804563999176, "rewards/format_reward/mean": 0.990234375, "rewards/format_reward/std": 0.09843364357948303, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 797.484375, "completions/mean_terminated_length": 797.484375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9982077323546983, "frac_reward_zero_std": 0.5, "grad_norm": 0.10729524502444683, "kl": 0.080078125, "learning_rate": 3.477307147192743e-10, "loss": 0.016, "num_tokens": 1401819642.0, "reward": 2.13671875, "reward_std": 0.1730148196220398, "rewards/accuracy_reward/mean": 0.150390625, "rewards/accuracy_reward/std": 0.35780346393585205, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 811.166015625, "completions/mean_terminated_length": 811.166015625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9985491166680891, "frac_reward_zero_std": 0.5, "grad_norm": 0.10537359711068275, "kl": 0.081298828125, "learning_rate": 2.5547601995912216e-10, "loss": 0.0178, "num_tokens": 1402320623.0, "reward": 2.14892578125, "reward_std": 0.21511411666870117, "rewards/accuracy_reward/mean": 0.162109375, "rewards/accuracy_reward/std": 0.3689115643501282, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.10772226005792618, "rewards/tag_count_reward/mean": 0.99853515625, "rewards/tag_count_reward/std": 0.019099153578281403, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 703.416015625, "completions/mean_terminated_length": 703.416015625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9988905009814799, "frac_reward_zero_std": 0.5, "grad_norm": 0.1300151179262908, "kl": 0.0833740234375, "learning_rate": 1.7741413357197367e-10, "loss": 0.0125, "num_tokens": 1402760996.0, "reward": 2.080078125, "reward_std": 0.17316466569900513, "rewards/accuracy_reward/mean": 0.099609375, "rewards/accuracy_reward/std": 0.29977133870124817, "rewards/format_reward/mean": 0.982421875, "rewards/format_reward/std": 0.13154059648513794, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022032126784324646, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 840.955078125, "completions/mean_terminated_length": 840.955078125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.9992318852948707, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10865170665818756, "kl": 0.0772705078125, "learning_rate": 1.1354516635364577e-10, "loss": 0.0192, "num_tokens": 1403271117.0, "reward": 2.0263671875, "reward_std": 0.13820995390415192, "rewards/accuracy_reward/mean": 0.048828125, "rewards/accuracy_reward/std": 0.2157193273305893, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.026930565014481544, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 767.625, "completions/mean_terminated_length": 760.07861328125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9995732696082615, "frac_reward_zero_std": 0.53125, "grad_norm": 0.1098002562236282, "kl": 0.0780029296875, "learning_rate": 6.386920895384841e-11, "loss": 0.0139, "num_tokens": 1403739997.0, "reward": 2.08935546875, "reward_std": 0.16179436445236206, "rewards/accuracy_reward/mean": 0.11895161122083664, "rewards/accuracy_reward/std": 0.3240584135055542, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.1385180652141571, "rewards/tag_count_reward/mean": 0.99365234375, "rewards/tag_count_reward/std": 0.05493048578500748, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 793.716796875, "completions/mean_terminated_length": 793.716796875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9999146539216524, "frac_reward_zero_std": 0.5, "grad_norm": 0.11492651391052233, "kl": 0.0770263671875, "learning_rate": 2.838633187729478e-11, "loss": 0.0089, "num_tokens": 1404234140.0, "reward": 2.09912109375, "reward_std": 0.16686061024665833, "rewards/accuracy_reward/mean": 0.115234375, "rewards/accuracy_reward/std": 0.3196168541908264, "rewards/format_reward/mean": 0.986328125, "rewards/format_reward/std": 0.1162383034825325, "rewards/tag_count_reward/mean": 0.99755859375, "rewards/tag_count_reward/std": 0.024608410894870758, "step": 2929 }, { "epoch": 0.9999146539216524, "step": 2929, "total_flos": 0.0, "train_loss": 0.00015187089497744325, "train_runtime": 1302.9313, "train_samples_per_second": 71.94, "train_steps_per_second": 2.249 } ], "logging_steps": 1, "max_steps": 2930, "num_input_tokens_seen": 1404234140, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }