{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998109640831758, "eval_steps": 51, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15104166666666666, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.3333333333334, "completions/mean_length": 312.0329996744792, "completions/mean_terminated_length": 185.89303080240884, "completions/min_length": 28.333333333333332, "completions/min_terminated_length": 28.333333333333332, "epoch": 0.045368620037807186, "grad_norm": 0.14972379803657532, "kl": 4.560748736063639e-05, "learning_rate": 4e-07, "loss": -0.0081, "num_tokens": 942182.0, "reward": 0.37008477250734967, "reward_std": 0.11998833467562993, "rewards/get_embedding_sim/mean": 0.3440430959065755, "rewards/get_embedding_sim/std": 0.06710867583751678, "rewards/reward_num_unique_chars/mean": 0.026041666666666668, "rewards/reward_num_unique_chars/std": 0.14761295169591904, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13020833333333334, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 307.0069580078125, "completions/mean_terminated_length": 199.09521484375, "completions/min_length": 10.333333333333334, "completions/min_terminated_length": 10.333333333333334, "epoch": 0.09073724007561437, "grad_norm": 0.12008437514305115, "kl": 0.0001388813058535258, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 1882942.0, "reward": 0.4796616733074188, "reward_std": 0.214401513338089, "rewards/get_embedding_sim/mean": 0.3694185713926951, "rewards/get_embedding_sim/std": 0.07585694640874863, "rewards/reward_num_unique_chars/mean": 0.1102430559694767, "rewards/reward_num_unique_chars/std": 0.2982482860485713, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08072916666666667, "completions/max_length": 1024.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 234.67969258626303, "completions/mean_terminated_length": 166.36500040690103, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.13610586011342155, "grad_norm": 0.08606597781181335, "kl": 0.00013801626240213713, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 2735293.0, "reward": 0.39071526130040485, "reward_std": 0.1662569542725881, "rewards/get_embedding_sim/mean": 0.33168746034304303, "rewards/get_embedding_sim/std": 0.07500659177700679, "rewards/reward_num_unique_chars/mean": 0.059027779226501785, "rewards/reward_num_unique_chars/std": 0.22509141763051352, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10503472222222225, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.3333333333334, "completions/mean_length": 262.79688517252606, "completions/mean_terminated_length": 173.54302469889322, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.18147448015122875, "grad_norm": 0.11949238181114197, "kl": 0.00030877192815144855, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 3627859.0, "reward": 0.4095470607280731, "reward_std": 0.18979967882235846, "rewards/get_embedding_sim/mean": 0.33055397868156433, "rewards/get_embedding_sim/std": 0.07462155818939209, "rewards/reward_num_unique_chars/mean": 0.07899305472771327, "rewards/reward_num_unique_chars/std": 0.25569593409697217, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1362847222222222, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.6666666666666, "completions/mean_length": 316.6762288411458, "completions/mean_terminated_length": 204.85944112141928, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 0.22684310018903592, "grad_norm": 0.16435399651527405, "kl": 0.0005876521269480387, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 4554894.0, "reward": 0.4522427221139272, "reward_std": 0.205996572971344, "rewards/get_embedding_sim/mean": 0.35502047340075177, "rewards/get_embedding_sim/std": 0.076506607234478, "rewards/reward_num_unique_chars/mean": 0.09722222139437993, "rewards/reward_num_unique_chars/std": 0.27809616923332214, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11718750000000004, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 283.77171834309894, "completions/mean_terminated_length": 184.45149739583334, "completions/min_length": 24.666666666666668, "completions/min_terminated_length": 24.666666666666668, "epoch": 0.2722117202268431, "grad_norm": 0.17904439568519592, "kl": 0.0004306634267171224, "learning_rate": 1e-06, "loss": 0.036, "num_tokens": 5464567.0, "reward": 0.47324784596761066, "reward_std": 0.2480545292297999, "rewards/get_embedding_sim/mean": 0.35345616936683655, "rewards/get_embedding_sim/std": 0.08570993691682816, "rewards/reward_num_unique_chars/mean": 0.11979166915019353, "rewards/reward_num_unique_chars/std": 0.32309961318969727, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07204861111111112, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 230.54688517252603, "completions/mean_terminated_length": 169.21332804361978, "completions/min_length": 12.333333333333334, "completions/min_terminated_length": 12.333333333333334, "epoch": 0.31758034026465026, "grad_norm": 0.11123450100421906, "kl": 0.0011239051818847656, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 6313117.0, "reward": 0.4715224802494049, "reward_std": 0.2366275986035665, "rewards/get_embedding_sim/mean": 0.3491266171137492, "rewards/get_embedding_sim/std": 0.06465367351969083, "rewards/reward_num_unique_chars/mean": 0.1223958358168602, "rewards/reward_num_unique_chars/std": 0.3250391185283661, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09895833333333337, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 229.36719258626303, "completions/mean_terminated_length": 142.55723571777344, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3629489603024575, "grad_norm": 0.118320994079113, "kl": 0.0019257068634033203, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 7162132.0, "reward": 0.5189645787080129, "reward_std": 0.24159842729568481, "rewards/get_embedding_sim/mean": 0.3809437155723572, "rewards/get_embedding_sim/std": 0.0799456536769867, "rewards/reward_num_unique_chars/mean": 0.13802083084980646, "rewards/reward_num_unique_chars/std": 0.3419287900129954, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052951388888888874, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.3333333333334, "completions/mean_length": 197.44445292154947, "completions/mean_terminated_length": 151.25631205240884, "completions/min_length": 14.666666666666666, "completions/min_terminated_length": 14.666666666666666, "epoch": 0.40831758034026466, "grad_norm": 0.11851406842470169, "kl": 0.002936681111653646, "learning_rate": 1e-06, "loss": 0.0317, "num_tokens": 7973172.0, "reward": 0.569815476735433, "reward_std": 0.25512967507044476, "rewards/get_embedding_sim/mean": 0.362350195646286, "rewards/get_embedding_sim/std": 0.07909337679545085, "rewards/reward_num_unique_chars/mean": 0.2074652761220932, "rewards/reward_num_unique_chars/std": 0.4044720729192098, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056423611111111126, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.3333333333334, "completions/mean_length": 211.5963592529297, "completions/mean_terminated_length": 163.11248270670572, "completions/min_length": 16.333333333333332, "completions/min_terminated_length": 16.333333333333332, "epoch": 0.45368620037807184, "grad_norm": 0.21573348343372345, "kl": 0.008742332458496094, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 8794371.0, "reward": 0.43826034665107727, "reward_std": 0.20837691922982535, "rewards/get_embedding_sim/mean": 0.3427741924921672, "rewards/get_embedding_sim/std": 0.0719177375237147, "rewards/reward_num_unique_chars/mean": 0.09548610945542653, "rewards/reward_num_unique_chars/std": 0.2681623448928197, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056423611111111126, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 197.90365091959634, "completions/mean_terminated_length": 148.50442504882812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.499054820415879, "grad_norm": 0.08199404180049896, "kl": 0.005775133768717448, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 9601812.0, "reward": 0.45480871200561523, "reward_std": 0.2194500764211019, "rewards/get_embedding_sim/mean": 0.36192673444747925, "rewards/get_embedding_sim/std": 0.0750991627573967, "rewards/reward_num_unique_chars/mean": 0.0928819440305233, "rewards/reward_num_unique_chars/std": 0.2857237259546916, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.053819444444444454, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.3333333333334, "completions/mean_length": 201.65104166666666, "completions/mean_terminated_length": 154.5564727783203, "completions/min_length": 10.666666666666666, "completions/min_terminated_length": 10.666666666666666, "epoch": 0.5444234404536862, "grad_norm": 0.13542793691158295, "kl": 0.011366526285807291, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 10414722.0, "reward": 0.4134095311164856, "reward_std": 0.16343241184949875, "rewards/get_embedding_sim/mean": 0.3708748022715251, "rewards/get_embedding_sim/std": 0.08833041042089462, "rewards/reward_num_unique_chars/mean": 0.042534722636143364, "rewards/reward_num_unique_chars/std": 0.1979833443959554, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04340277777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.6666666666666, "completions/mean_length": 176.1076456705729, "completions/mean_terminated_length": 137.35225423177084, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.5897920604914934, "grad_norm": 1.7642544507980347, "kl": 0.151151974995931, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 11207422.0, "reward": 0.5713514387607574, "reward_std": 0.26335498690605164, "rewards/get_embedding_sim/mean": 0.36909447113672894, "rewards/get_embedding_sim/std": 0.09187572946151097, "rewards/reward_num_unique_chars/mean": 0.202256940305233, "rewards/reward_num_unique_chars/std": 0.390445997317632, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032986111111111126, "completions/max_length": 1024.0, "completions/max_terminated_length": 826.3333333333334, "completions/mean_length": 201.41754150390625, "completions/mean_terminated_length": 173.10018412272134, "completions/min_length": 18.333333333333332, "completions/min_terminated_length": 18.333333333333332, "epoch": 0.6351606805293005, "grad_norm": 0.10251538455486298, "kl": 0.014621734619140625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 12029279.0, "reward": 0.5142592787742615, "reward_std": 0.2620675365130107, "rewards/get_embedding_sim/mean": 0.3701620002587636, "rewards/get_embedding_sim/std": 0.10092929750680923, "rewards/reward_num_unique_chars/mean": 0.1440972238779068, "rewards/reward_num_unique_chars/std": 0.34582529465357464, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.6666666666666, "completions/mean_length": 183.04254150390625, "completions/mean_terminated_length": 160.71800740559897, "completions/min_length": 14.666666666666666, "completions/min_terminated_length": 14.666666666666666, "epoch": 0.6805293005671077, "grad_norm": 0.09084329754114151, "kl": 0.015349706013997396, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 12816000.0, "reward": 0.5384640991687775, "reward_std": 0.22944432497024536, "rewards/get_embedding_sim/mean": 0.39697099725405377, "rewards/get_embedding_sim/std": 0.10396929830312729, "rewards/reward_num_unique_chars/mean": 0.14149305721124014, "rewards/reward_num_unique_chars/std": 0.3254843403895696, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02777777777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 802.3333333333334, "completions/mean_length": 165.0295155843099, "completions/mean_terminated_length": 140.43072509765625, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 0.725897920604915, "grad_norm": 0.21910759806632996, "kl": 0.027149200439453125, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 13587394.0, "reward": 0.5553397635618845, "reward_std": 0.23784717917442322, "rewards/get_embedding_sim/mean": 0.4086383481820424, "rewards/get_embedding_sim/std": 0.10949051380157471, "rewards/reward_num_unique_chars/mean": 0.14670138930281004, "rewards/reward_num_unique_chars/std": 0.33698558807373047, "step": 48 }, { "epoch": 0.7712665406427222, "grad_norm": 0.09893961995840073, "learning_rate": 1e-06, "loss": 0.0047, "step": 51 }, { "epoch": 0.7712665406427222, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.12797619047619044, "eval_completions/max_length": 880.7678571428571, "eval_completions/max_terminated_length": 701.3214285714286, "eval_completions/mean_length": 258.24070589882984, "eval_completions/mean_terminated_length": 153.6624070576259, "eval_completions/min_length": 24.446428571428573, "eval_completions/min_terminated_length": 24.446428571428573, "eval_kl": 0.0542449951171875, "eval_loss": 0.026244351640343666, "eval_num_tokens": 14351398.0, "eval_reward": 0.524820977555854, "eval_reward_std": 0.22432494928528154, "eval_rewards/get_embedding_sim/mean": 0.43479119294456076, "eval_rewards/get_embedding_sim/std": 0.09110667330345937, "eval_rewards/reward_num_unique_chars/mean": 0.09002976235933602, "eval_rewards/reward_num_unique_chars/std": 0.18600706889161042, "eval_runtime": 2254.2404, "eval_samples_per_second": 0.025, "eval_steps_per_second": 0.001, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044270833333333315, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.3333333333334, "completions/mean_length": 197.77517954508463, "completions/mean_terminated_length": 159.97277196248373, "completions/min_length": 9.833333333333334, "completions/min_terminated_length": 9.833333333333334, "epoch": 0.8166351606805293, "grad_norm": 0.08635270595550537, "kl": 0.030397415161132812, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 15200636.0, "reward": 0.5215439548095068, "reward_std": 0.23126975446939468, "rewards/get_embedding_sim/mean": 0.42692585786183673, "rewards/get_embedding_sim/std": 0.11467409133911133, "rewards/reward_num_unique_chars/mean": 0.09461805845300357, "rewards/reward_num_unique_chars/std": 0.28477593511343, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04253472222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.6666666666666, "completions/mean_length": 198.4244842529297, "completions/mean_terminated_length": 161.79749043782553, "completions/min_length": 10.333333333333334, "completions/min_terminated_length": 10.333333333333334, "epoch": 0.8620037807183365, "grad_norm": 14.726771354675293, "kl": 0.21588261922200522, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 16019045.0, "reward": 0.5494122306505839, "reward_std": 0.24494746327400208, "rewards/get_embedding_sim/mean": 0.44264134764671326, "rewards/get_embedding_sim/std": 0.11085022240877151, "rewards/reward_num_unique_chars/mean": 0.10677083333333333, "rewards/reward_num_unique_chars/std": 0.30227985978126526, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04253472222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.6666666666666, "completions/mean_length": 200.72309366861978, "completions/mean_terminated_length": 164.248779296875, "completions/min_length": 11.333333333333334, "completions/min_terminated_length": 11.333333333333334, "epoch": 0.9073724007561437, "grad_norm": 0.09581304341554642, "kl": 0.33023325602213544, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 16832758.0, "reward": 0.599389910697937, "reward_std": 0.26327316959698993, "rewards/get_embedding_sim/mean": 0.45268850525220233, "rewards/get_embedding_sim/std": 0.11441038797299068, "rewards/reward_num_unique_chars/mean": 0.14670139302810034, "rewards/reward_num_unique_chars/std": 0.31440146267414093, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021701388888888878, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.3333333333334, "completions/mean_length": 186.21094258626303, "completions/mean_terminated_length": 167.5730946858724, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.9527410207939508, "grad_norm": 0.08248484879732132, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 17637097.0, "reward": 0.5855847001075745, "reward_std": 0.2750825683275859, "rewards/get_embedding_sim/mean": 0.46405691901842755, "rewards/get_embedding_sim/std": 0.11442819982767105, "rewards/reward_num_unique_chars/mean": 0.12152778108914693, "rewards/reward_num_unique_chars/std": 0.3193853000799815, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.031507423371647504, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 193.8086140950521, "completions/mean_terminated_length": 167.16290283203125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.998109640831758, "grad_norm": 0.06374574452638626, "kl": 0.03699493408203125, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 18440914.0, "reward": 0.6297420461972555, "reward_std": 0.2834969659646352, "rewards/get_embedding_sim/mean": 0.47088783979415894, "rewards/get_embedding_sim/std": 0.11324869592984517, "rewards/reward_num_unique_chars/mean": 0.1588541641831398, "rewards/reward_num_unique_chars/std": 0.36384791135787964, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032118055555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.3333333333334, "completions/mean_length": 200.6024373372396, "completions/mean_terminated_length": 173.2165069580078, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.0453686200378072, "grad_norm": 0.11849670857191086, "kl": 0.05316925048828125, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 19261832.0, "reward": 0.5802033940951029, "reward_std": 0.25838569800059, "rewards/get_embedding_sim/mean": 0.4734325309594472, "rewards/get_embedding_sim/std": 0.11253533015648524, "rewards/reward_num_unique_chars/mean": 0.10677083084980647, "rewards/reward_num_unique_chars/std": 0.30244183043638867, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.039930555555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 199.55555725097656, "completions/mean_terminated_length": 165.2902577718099, "completions/min_length": 9.333333333333334, "completions/min_terminated_length": 9.333333333333334, "epoch": 1.0907372400756143, "grad_norm": 0.10332732647657394, "kl": 0.0515289306640625, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 20067096.0, "reward": 0.625789741675059, "reward_std": 0.2765499949455261, "rewards/get_embedding_sim/mean": 0.49471331636110943, "rewards/get_embedding_sim/std": 0.11266019940376282, "rewards/reward_num_unique_chars/mean": 0.1310763880610466, "rewards/reward_num_unique_chars/std": 0.336679349342982, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036458333333333294, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.6666666666666, "completions/mean_length": 209.8359375, "completions/mean_terminated_length": 178.97360229492188, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.1361058601134215, "grad_norm": 0.11925654858350754, "kl": 0.14461263020833334, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 20893467.0, "reward": 0.5831413467725118, "reward_std": 0.2582869480053584, "rewards/get_embedding_sim/mean": 0.4919954836368561, "rewards/get_embedding_sim/std": 0.1114387462536494, "rewards/reward_num_unique_chars/mean": 0.09114583333333333, "rewards/reward_num_unique_chars/std": 0.2839343051115672, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.047743055555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.3333333333334, "completions/mean_length": 221.0104217529297, "completions/mean_terminated_length": 180.71256510416666, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 1.1814744801512287, "grad_norm": 0.09554021060466766, "kl": 0.16336822509765625, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 21730887.0, "reward": 0.6385945876439413, "reward_std": 0.2661168724298477, "rewards/get_embedding_sim/mean": 0.5127264857292175, "rewards/get_embedding_sim/std": 0.11183823893467586, "rewards/reward_num_unique_chars/mean": 0.12586805472771326, "rewards/reward_num_unique_chars/std": 0.3207412262757619, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049479166666666664, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 228.04601033528647, "completions/mean_terminated_length": 186.1550038655599, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.2268431001890359, "grad_norm": 0.07755686342716217, "kl": 0.05751800537109375, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 22583420.0, "reward": 0.6019672354062399, "reward_std": 0.26383428772290546, "rewards/get_embedding_sim/mean": 0.5021408100922903, "rewards/get_embedding_sim/std": 0.10627821832895279, "rewards/reward_num_unique_chars/mean": 0.09982638930281003, "rewards/reward_num_unique_chars/std": 0.2837299009164174, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029513888888888878, "completions/max_length": 1024.0, "completions/max_terminated_length": 931.3333333333334, "completions/mean_length": 191.74740091959634, "completions/mean_terminated_length": 166.4415028889974, "completions/min_length": 8.666666666666666, "completions/min_terminated_length": 8.666666666666666, "epoch": 1.272211720226843, "grad_norm": 0.08697984367609024, "kl": 0.057329813639322914, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 23394137.0, "reward": 0.6638144056002299, "reward_std": 0.26522762576738995, "rewards/get_embedding_sim/mean": 0.5231893658638, "rewards/get_embedding_sim/std": 0.10482257604598999, "rewards/reward_num_unique_chars/mean": 0.140625, "rewards/reward_num_unique_chars/std": 0.3480878472328186, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.3333333333334, "completions/mean_length": 185.42535400390625, "completions/mean_terminated_length": 163.15696716308594, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.3175803402646502, "grad_norm": 0.14970338344573975, "kl": 0.12465922037760417, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 24197571.0, "reward": 0.6190575559933981, "reward_std": 0.2601381540298462, "rewards/get_embedding_sim/mean": 0.5174950361251831, "rewards/get_embedding_sim/std": 0.0997606838742892, "rewards/reward_num_unique_chars/mean": 0.10156250124176343, "rewards/reward_num_unique_chars/std": 0.297150323788325, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029513888888888912, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.6666666666666, "completions/mean_length": 216.75694783528647, "completions/mean_terminated_length": 192.3217315673828, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.3629489603024574, "grad_norm": 0.11725780367851257, "kl": 0.08345540364583333, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 25034555.0, "reward": 0.5995156168937683, "reward_std": 0.22840352356433868, "rewards/get_embedding_sim/mean": 0.5118419329325358, "rewards/get_embedding_sim/std": 0.0987908939520518, "rewards/reward_num_unique_chars/mean": 0.0876736119389534, "rewards/reward_num_unique_chars/std": 0.27722589671611786, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029513888888888878, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.3333333333334, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 161.33899434407553, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.4083175803402646, "grad_norm": 0.10394510626792908, "kl": 0.07037099202473958, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 25831283.0, "reward": 0.6795124411582947, "reward_std": 0.29141750435034436, "rewards/get_embedding_sim/mean": 0.5137137969334921, "rewards/get_embedding_sim/std": 0.09767910589774449, "rewards/reward_num_unique_chars/mean": 0.16579860697189966, "rewards/reward_num_unique_chars/std": 0.3542452355225881, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05034722222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.3333333333334, "completions/mean_length": 212.23785400390625, "completions/mean_terminated_length": 169.16080729166666, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 1.4536862003780717, "grad_norm": 0.10010381788015366, "kl": 0.06960042317708333, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 26656485.0, "reward": 0.6300086975097656, "reward_std": 0.24619843065738678, "rewards/get_embedding_sim/mean": 0.5267100731531779, "rewards/get_embedding_sim/std": 0.1071697548031807, "rewards/reward_num_unique_chars/mean": 0.1032986119389534, "rewards/reward_num_unique_chars/std": 0.29826584458351135, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037326388888888874, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 211.8107655843099, "completions/mean_terminated_length": 180.1980183919271, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 1.499054820415879, "grad_norm": 0.07485458254814148, "kl": 0.061063130696614586, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 27490315.0, "reward": 0.620047926902771, "reward_std": 0.2632503807544708, "rewards/get_embedding_sim/mean": 0.5132770538330078, "rewards/get_embedding_sim/std": 0.10026986648639043, "rewards/reward_num_unique_chars/mean": 0.10677083333333333, "rewards/reward_num_unique_chars/std": 0.3043619990348816, "step": 99 }, { "epoch": 1.544423440453686, "grad_norm": 0.11106861382722855, "learning_rate": 1e-06, "loss": 0.0115, "step": 102 }, { "epoch": 1.544423440453686, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.07068452380952381, "eval_completions/max_length": 887.5892857142857, "eval_completions/max_terminated_length": 675.0892857142857, "eval_completions/mean_length": 206.3244113922119, "eval_completions/mean_terminated_length": 145.37539066587175, "eval_completions/min_length": 18.160714285714285, "eval_completions/min_terminated_length": 18.160714285714285, "eval_kl": 0.06965419224330358, "eval_loss": 0.03773626312613487, "eval_num_tokens": 28307736.0, "eval_reward": 0.6229457370936871, "eval_reward_std": 0.2839882879384926, "eval_rewards/get_embedding_sim/mean": 0.5206391582531589, "eval_rewards/get_embedding_sim/std": 0.09148550758670483, "eval_rewards/reward_num_unique_chars/mean": 0.10230654794057566, "eval_rewards/reward_num_unique_chars/std": 0.24572753932859218, "eval_runtime": 1726.6979, "eval_samples_per_second": 0.032, "eval_steps_per_second": 0.001, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03602430555555556, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.3333333333334, "completions/mean_length": 214.1545181274414, "completions/mean_terminated_length": 183.94319661458334, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 1.5897920604914932, "grad_norm": 1.0838171243667603, "kl": 0.0672899881998698, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 29138511.0, "reward": 0.6624543964862823, "reward_std": 0.26948046932617825, "rewards/get_embedding_sim/mean": 0.5296418766180674, "rewards/get_embedding_sim/std": 0.10213356713453929, "rewards/reward_num_unique_chars/mean": 0.1328124993791183, "rewards/reward_num_unique_chars/std": 0.32840434461832047, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 201.72048950195312, "completions/mean_terminated_length": 179.72496032714844, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.6351606805293004, "grad_norm": 0.0918864831328392, "kl": 0.061335245768229164, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 29960717.0, "reward": 0.6120087305704752, "reward_std": 0.250284880399704, "rewards/get_embedding_sim/mean": 0.5364878376324972, "rewards/get_embedding_sim/std": 0.0979540745417277, "rewards/reward_num_unique_chars/mean": 0.07552083457509677, "rewards/reward_num_unique_chars/std": 0.26320414741834003, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03559027777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 206.97309366861978, "completions/mean_terminated_length": 177.1915079752604, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 1.6805293005671076, "grad_norm": 0.07678642123937607, "kl": 0.06285349527994792, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 30781870.0, "reward": 0.6274827718734741, "reward_std": 0.26556732257207233, "rewards/get_embedding_sim/mean": 0.5155035257339478, "rewards/get_embedding_sim/std": 0.09278701990842819, "rewards/reward_num_unique_chars/mean": 0.11197916666666667, "rewards/reward_num_unique_chars/std": 0.30655037860075635, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032118055555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.3333333333334, "completions/mean_length": 185.73351542154947, "completions/mean_terminated_length": 157.97284952799478, "completions/min_length": 8.333333333333334, "completions/min_terminated_length": 8.333333333333334, "epoch": 1.725897920604915, "grad_norm": 0.07077532261610031, "kl": 0.07100423177083333, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 31578315.0, "reward": 0.6285200913747152, "reward_std": 0.2933768729368846, "rewards/get_embedding_sim/mean": 0.5260895093282064, "rewards/get_embedding_sim/std": 0.10419273873170216, "rewards/reward_num_unique_chars/mean": 0.10243055472771327, "rewards/reward_num_unique_chars/std": 0.30191460251808167, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.035590277777777755, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.3333333333334, "completions/mean_length": 197.5555623372396, "completions/mean_terminated_length": 167.07290649414062, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.7712665406427222, "grad_norm": 0.07132314145565033, "kl": 0.07155863444010417, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 32370091.0, "reward": 0.6605067054430643, "reward_std": 0.3198150396347046, "rewards/get_embedding_sim/mean": 0.5276941855748495, "rewards/get_embedding_sim/std": 0.09764280170202255, "rewards/reward_num_unique_chars/mean": 0.13281250248352686, "rewards/reward_num_unique_chars/std": 0.3356940845648448, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03819444444444442, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.3333333333334, "completions/mean_length": 205.5260467529297, "completions/mean_terminated_length": 172.99127197265625, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.8166351606805293, "grad_norm": 0.07695771753787994, "kl": 0.079559326171875, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 33183529.0, "reward": 0.6505021651585897, "reward_std": 0.28806476791699726, "rewards/get_embedding_sim/mean": 0.5255021254221598, "rewards/get_embedding_sim/std": 0.10448584208885829, "rewards/reward_num_unique_chars/mean": 0.12499999751647313, "rewards/reward_num_unique_chars/std": 0.3297826250394185, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 187.78907267252603, "completions/mean_terminated_length": 160.96214803059897, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 1.8620037807183365, "grad_norm": 0.10003960132598877, "kl": 0.10397847493489583, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 33979270.0, "reward": 0.702047864596049, "reward_std": 0.2998199959595998, "rewards/get_embedding_sim/mean": 0.5353811780611674, "rewards/get_embedding_sim/std": 0.1009945347905159, "rewards/reward_num_unique_chars/mean": 0.16666666915019354, "rewards/reward_num_unique_chars/std": 0.3635033369064331, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 201.6701456705729, "completions/mean_terminated_length": 168.43896484375, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 1.9073724007561437, "grad_norm": 0.1418294459581375, "kl": 0.08981831868489583, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 34790042.0, "reward": 0.6395866274833679, "reward_std": 0.278631071249644, "rewards/get_embedding_sim/mean": 0.5371560255686442, "rewards/get_embedding_sim/std": 0.10253078490495682, "rewards/reward_num_unique_chars/mean": 0.10243055721124013, "rewards/reward_num_unique_chars/std": 0.3033109207948049, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020833333333333297, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.6666666666666, "completions/mean_length": 165.38021341959634, "completions/mean_terminated_length": 147.11800384521484, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 1.9527410207939508, "grad_norm": 0.085059255361557, "kl": 0.09361775716145833, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 35555504.0, "reward": 0.7251607775688171, "reward_std": 0.3248043159643809, "rewards/get_embedding_sim/mean": 0.5359246134757996, "rewards/get_embedding_sim/std": 0.10831368962923686, "rewards/reward_num_unique_chars/mean": 0.18923610945542654, "rewards/reward_num_unique_chars/std": 0.38598161935806274, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.051843869731800774, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.6666666666666, "completions/mean_length": 218.2733408610026, "completions/mean_terminated_length": 174.35783894856772, "completions/min_length": 5.333333333333333, "completions/min_terminated_length": 5.333333333333333, "epoch": 1.998109640831758, "grad_norm": 0.15158401429653168, "kl": 0.09020487467447917, "learning_rate": 1e-06, "loss": 0.0421, "num_tokens": 36367765.0, "reward": 0.6982676188151041, "reward_std": 0.3466052810351054, "rewards/get_embedding_sim/mean": 0.5420175790786743, "rewards/get_embedding_sim/std": 0.09902476519346237, "rewards/reward_num_unique_chars/mean": 0.15625, "rewards/reward_num_unique_chars/std": 0.3612334032853444, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03472222222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 176.51909891764322, "completions/mean_terminated_length": 146.023562113444, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.045368620037807, "grad_norm": 0.08893448859453201, "kl": 0.096466064453125, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 37147067.0, "reward": 0.7173450986544291, "reward_std": 0.35685937603314716, "rewards/get_embedding_sim/mean": 0.5341853896776835, "rewards/get_embedding_sim/std": 0.1000617394844691, "rewards/reward_num_unique_chars/mean": 0.1831597238779068, "rewards/reward_num_unique_chars/std": 0.3842338224252065, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 208.60938008626303, "completions/mean_terminated_length": 173.26571655273438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.0907372400756143, "grad_norm": 0.12921324372291565, "kl": 0.09910074869791667, "learning_rate": 1e-06, "loss": 0.0475, "num_tokens": 37974473.0, "reward": 0.672684927781423, "reward_std": 0.34854390223821, "rewards/get_embedding_sim/mean": 0.5364001393318176, "rewards/get_embedding_sim/std": 0.10567483057578404, "rewards/reward_num_unique_chars/mean": 0.13628472139437994, "rewards/reward_num_unique_chars/std": 0.34236905972162884, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021701388888888878, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 167.10590616861978, "completions/mean_terminated_length": 148.0250244140625, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.1361058601134215, "grad_norm": 0.12040314823389053, "kl": 0.24815877278645834, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 38741491.0, "reward": 0.6958853205045065, "reward_std": 0.3416078786055247, "rewards/get_embedding_sim/mean": 0.5396353205045065, "rewards/get_embedding_sim/std": 0.11144034812847774, "rewards/reward_num_unique_chars/mean": 0.15625000248352686, "rewards/reward_num_unique_chars/std": 0.3600207368532817, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032986111111111084, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 182.28211975097656, "completions/mean_terminated_length": 153.65155029296875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.1814744801512287, "grad_norm": 0.08720903098583221, "kl": 0.11935933430989583, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 39531464.0, "reward": 0.7134884198506674, "reward_std": 0.36159368356068927, "rewards/get_embedding_sim/mean": 0.5424814422925314, "rewards/get_embedding_sim/std": 0.11029936373233795, "rewards/reward_num_unique_chars/mean": 0.17100694278875986, "rewards/reward_num_unique_chars/std": 0.37471526861190796, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 171.84375508626303, "completions/mean_terminated_length": 144.29528299967447, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.226843100189036, "grad_norm": 0.08887135237455368, "kl": 0.118194580078125, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 40311428.0, "reward": 0.6788019339243571, "reward_std": 0.33359630902608234, "rewards/get_embedding_sim/mean": 0.5451213518778483, "rewards/get_embedding_sim/std": 0.10193872700134914, "rewards/reward_num_unique_chars/mean": 0.133680559694767, "rewards/reward_num_unique_chars/std": 0.34027015169461566, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01909722222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.6666666666666, "completions/mean_length": 165.07205200195312, "completions/mean_terminated_length": 148.45321146647134, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.272211720226843, "grad_norm": 0.08689926564693451, "kl": 0.149566650390625, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 41091415.0, "reward": 0.6884604295094808, "reward_std": 0.3447088996569316, "rewards/get_embedding_sim/mean": 0.5495714743932089, "rewards/get_embedding_sim/std": 0.10353380193312962, "rewards/reward_num_unique_chars/mean": 0.13888888557751974, "rewards/reward_num_unique_chars/std": 0.3459552029768626, "step": 150 }, { "epoch": 2.31758034026465, "grad_norm": 0.10476606339216232, "learning_rate": 1e-06, "loss": 0.0513, "step": 153 }, { "epoch": 2.31758034026465, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.04687500000000001, "eval_completions/max_length": 856.4464285714286, "eval_completions/max_terminated_length": 614.625, "eval_completions/mean_length": 148.9296919277736, "eval_completions/mean_terminated_length": 106.54870585032872, "eval_completions/min_length": 12.107142857142858, "eval_completions/min_terminated_length": 12.107142857142858, "eval_kl": 0.15039280482700892, "eval_loss": 0.05131923779845238, "eval_num_tokens": 41858572.0, "eval_reward": 0.7319182710988181, "eval_reward_std": 0.39004063113991705, "eval_rewards/get_embedding_sim/mean": 0.5399539640971592, "eval_rewards/get_embedding_sim/std": 0.09657471527212433, "eval_rewards/reward_num_unique_chars/mean": 0.19196428627973156, "eval_rewards/reward_num_unique_chars/std": 0.34904111203338417, "eval_runtime": 1578.4274, "eval_samples_per_second": 0.035, "eval_steps_per_second": 0.001, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.8333333333334, "completions/mean_length": 154.00824991861978, "completions/mean_terminated_length": 134.3066151936849, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.3629489603024574, "grad_norm": 0.17861098051071167, "kl": 0.1991424560546875, "learning_rate": 1e-06, "loss": 0.0446, "num_tokens": 42603290.0, "reward": 0.7928757965564728, "reward_std": 0.39940689504146576, "rewards/get_embedding_sim/mean": 0.5420077045758566, "rewards/get_embedding_sim/std": 0.10503626987338066, "rewards/reward_num_unique_chars/mean": 0.2508680547277133, "rewards/reward_num_unique_chars/std": 0.43154530723889667, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333333333337, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.6666666666666, "completions/mean_length": 149.3498331705729, "completions/mean_terminated_length": 130.67583719889322, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.4083175803402646, "grad_norm": 0.0997217446565628, "kl": 0.176727294921875, "learning_rate": 1e-06, "loss": 0.0479, "num_tokens": 43362573.0, "reward": 0.7896133859952291, "reward_std": 0.37762073675791424, "rewards/get_embedding_sim/mean": 0.5604466795921326, "rewards/get_embedding_sim/std": 0.10085596889257431, "rewards/reward_num_unique_chars/mean": 0.22916666666666666, "rewards/reward_num_unique_chars/std": 0.417032649119695, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01996527777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 133.52865091959634, "completions/mean_terminated_length": 115.48833719889323, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.4536862003780717, "grad_norm": 0.08974426239728928, "kl": 0.13869730631510416, "learning_rate": 1e-06, "loss": 0.0442, "num_tokens": 44099022.0, "reward": 0.8535909652709961, "reward_std": 0.42868249615033466, "rewards/get_embedding_sim/mean": 0.5332783659299215, "rewards/get_embedding_sim/std": 0.10329846044381459, "rewards/reward_num_unique_chars/mean": 0.3203125, "rewards/reward_num_unique_chars/std": 0.45679094394048053, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 141.3715337117513, "completions/mean_terminated_length": 117.76270294189453, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.499054820415879, "grad_norm": 0.10181669145822525, "kl": 0.23414103190104166, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 44851706.0, "reward": 0.8000141382217407, "reward_std": 0.3941415250301361, "rewards/get_embedding_sim/mean": 0.5456738670667013, "rewards/get_embedding_sim/std": 0.10725356390078862, "rewards/reward_num_unique_chars/mean": 0.2543402810891469, "rewards/reward_num_unique_chars/std": 0.43460813164711, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018229166666666703, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.6666666666666, "completions/mean_length": 141.09028116861978, "completions/mean_terminated_length": 124.72643280029297, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 2.544423440453686, "grad_norm": 0.08525840193033218, "kl": 0.20921834309895834, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 45604066.0, "reward": 0.7868956923484802, "reward_std": 0.40804105003674823, "rewards/get_embedding_sim/mean": 0.5299512147903442, "rewards/get_embedding_sim/std": 0.10723193486531575, "rewards/reward_num_unique_chars/mean": 0.2569444378217061, "rewards/reward_num_unique_chars/std": 0.42565350731213886, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025173611111111122, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.3333333333334, "completions/mean_length": 138.74913533528647, "completions/mean_terminated_length": 116.1558354695638, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.5897920604914932, "grad_norm": 0.6628166437149048, "kl": 0.317626953125, "learning_rate": 1e-06, "loss": 0.0491, "num_tokens": 46345857.0, "reward": 0.8313470085461935, "reward_std": 0.4129582444826762, "rewards/get_embedding_sim/mean": 0.5509650309880575, "rewards/get_embedding_sim/std": 0.09154053280750911, "rewards/reward_num_unique_chars/mean": 0.2803819427887599, "rewards/reward_num_unique_chars/std": 0.4417712489763896, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01128472222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.6666666666666, "completions/mean_length": 111.97396087646484, "completions/mean_terminated_length": 101.59329223632812, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 2.6351606805293004, "grad_norm": 0.09945366531610489, "kl": 0.262237548828125, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 47054931.0, "reward": 0.8715664744377136, "reward_std": 0.4346109131971995, "rewards/get_embedding_sim/mean": 0.549517830212911, "rewards/get_embedding_sim/std": 0.11452717334032059, "rewards/reward_num_unique_chars/mean": 0.3220486094554265, "rewards/reward_num_unique_chars/std": 0.4634987811247508, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033854166666666685, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 155.64192962646484, "completions/mean_terminated_length": 125.11021041870117, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.6805293005671076, "grad_norm": 0.09938167780637741, "kl": 0.2503814697265625, "learning_rate": 1e-06, "loss": 0.054, "num_tokens": 47795091.0, "reward": 0.7676738500595093, "reward_std": 0.39470958709716797, "rewards/get_embedding_sim/mean": 0.5736633539199829, "rewards/get_embedding_sim/std": 0.09976038336753845, "rewards/reward_num_unique_chars/mean": 0.1940104141831398, "rewards/reward_num_unique_chars/std": 0.3958670049905777, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017361111111111122, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.6666666666666, "completions/mean_length": 116.89757283528645, "completions/mean_terminated_length": 100.84752400716145, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.7258979206049148, "grad_norm": 0.1425255984067917, "kl": 0.2775370279947917, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 48511037.0, "reward": 0.863362193107605, "reward_std": 0.4587005575497945, "rewards/get_embedding_sim/mean": 0.5473899245262146, "rewards/get_embedding_sim/std": 0.09922760476668675, "rewards/reward_num_unique_chars/mean": 0.3159722288449605, "rewards/reward_num_unique_chars/std": 0.4650394419829051, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014756944444444456, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.3333333333334, "completions/mean_length": 102.27864837646484, "completions/mean_terminated_length": 88.46848042805989, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.7712665406427224, "grad_norm": 0.09843874722719193, "kl": 0.34393310546875, "learning_rate": 1e-06, "loss": 0.0436, "num_tokens": 49204478.0, "reward": 0.8661341269810995, "reward_std": 0.46143727501233417, "rewards/get_embedding_sim/mean": 0.5727312763532003, "rewards/get_embedding_sim/std": 0.11564485480388005, "rewards/reward_num_unique_chars/mean": 0.2934027711550395, "rewards/reward_num_unique_chars/std": 0.4531017243862152, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01996527777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 754.6666666666666, "completions/mean_length": 122.11806233723958, "completions/mean_terminated_length": 103.76323954264323, "completions/min_length": 5.666666666666667, "completions/min_terminated_length": 5.666666666666667, "epoch": 2.816635160680529, "grad_norm": 0.15066391229629517, "kl": 0.3179728190104167, "learning_rate": 1e-06, "loss": 0.0582, "num_tokens": 49927110.0, "reward": 0.892190178235372, "reward_std": 0.4511215090751648, "rewards/get_embedding_sim/mean": 0.5527804295221964, "rewards/get_embedding_sim/std": 0.1083058441678683, "rewards/reward_num_unique_chars/mean": 0.3394097238779068, "rewards/reward_num_unique_chars/std": 0.46636247634887695, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022569444444444458, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.3333333333334, "completions/mean_length": 126.84115091959636, "completions/mean_terminated_length": 106.15006764729817, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 2.8620037807183367, "grad_norm": 2.990046739578247, "kl": 0.47100830078125, "learning_rate": 1e-06, "loss": 0.0551, "num_tokens": 50663055.0, "reward": 0.8623983860015869, "reward_std": 0.4631191889444987, "rewards/get_embedding_sim/mean": 0.5533705353736877, "rewards/get_embedding_sim/std": 0.11140244205792744, "rewards/reward_num_unique_chars/mean": 0.3090277810891469, "rewards/reward_num_unique_chars/std": 0.4615551829338074, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01649305555555558, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 111.4730936686198, "completions/mean_terminated_length": 96.25564575195312, "completions/min_length": 5.333333333333333, "completions/min_terminated_length": 5.333333333333333, "epoch": 2.9073724007561434, "grad_norm": 0.11676046997308731, "kl": 0.4471232096354167, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 51373952.0, "reward": 0.9203431606292725, "reward_std": 0.47053369879722595, "rewards/get_embedding_sim/mean": 0.5444750587145487, "rewards/get_embedding_sim/std": 0.1073705404996872, "rewards/reward_num_unique_chars/mean": 0.3758680522441864, "rewards/reward_num_unique_chars/std": 0.4811862111091614, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016493055555555542, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.3333333333334, "completions/mean_length": 113.12673950195312, "completions/mean_terminated_length": 97.83474731445312, "completions/min_length": 8.666666666666666, "completions/min_terminated_length": 8.666666666666666, "epoch": 2.952741020793951, "grad_norm": 0.09854816645383835, "kl": 0.285675048828125, "learning_rate": 1e-06, "loss": 0.0465, "num_tokens": 52094098.0, "reward": 0.888769249121348, "reward_std": 0.46734312176704407, "rewards/get_embedding_sim/mean": 0.5519636472066244, "rewards/get_embedding_sim/std": 0.11934416989485423, "rewards/reward_num_unique_chars/mean": 0.3368055522441864, "rewards/reward_num_unique_chars/std": 0.4713793396949768, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888876, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.3333333333334, "completions/mean_length": 112.50087229410808, "completions/mean_terminated_length": 99.5953369140625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.998109640831758, "grad_norm": 0.08575434237718582, "kl": 0.42242431640625, "learning_rate": 1e-06, "loss": 0.0494, "num_tokens": 52800707.0, "reward": 0.9310129086176554, "reward_std": 0.4663335382938385, "rewards/get_embedding_sim/mean": 0.5603530804316202, "rewards/get_embedding_sim/std": 0.10822075108687083, "rewards/reward_num_unique_chars/mean": 0.3706597288449605, "rewards/reward_num_unique_chars/std": 0.4820249378681183, "step": 198 } ], "logging_steps": 3, "max_steps": 198, "num_input_tokens_seen": 52800707, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }