{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998109640831758, "eval_steps": 51, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1519097222222222, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 318.44444783528644, "completions/mean_terminated_length": 192.69292195638022, "completions/min_length": 20.666666666666668, "completions/min_terminated_length": 20.666666666666668, "epoch": 0.045368620037807186, "grad_norm": 0.1771457940340042, "kl": 5.446871121724447e-05, "learning_rate": 4e-07, "loss": -0.0064, "num_tokens": 949568.0, "reward": 0.4211084047953288, "reward_std": 0.3868949313958486, "rewards/get_embedding_sim/mean": 0.3429833749930064, "rewards/get_embedding_sim/std": 0.06474291781584422, "rewards/reward_num_unique_chars/mean": 0.026324149842063587, "rewards/reward_num_unique_chars/std": 0.1598906268676122, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.3333333333334, "completions/mean_length": 298.11285400390625, "completions/mean_terminated_length": 200.7570597330729, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 0.09073724007561437, "grad_norm": 0.12331758439540863, "kl": 0.00014600654443105063, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 1880082.0, "reward": 0.6416476865609487, "reward_std": 0.5329093436400095, "rewards/get_embedding_sim/mean": 0.36821014682451886, "rewards/get_embedding_sim/std": 0.07475950072209041, "rewards/reward_num_unique_chars/mean": 0.09220736970504124, "rewards/reward_num_unique_chars/std": 0.2722427050272624, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08680555555555558, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 252.9539998372396, "completions/mean_terminated_length": 180.34420776367188, "completions/min_length": 24.666666666666668, "completions/min_terminated_length": 24.666666666666668, "epoch": 0.13610586011342155, "grad_norm": 0.08608454465866089, "kl": 0.0001450727383295695, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 2753485.0, "reward": 0.5257614056269327, "reward_std": 0.4465513428052266, "rewards/get_embedding_sim/mean": 0.33044888575871784, "rewards/get_embedding_sim/std": 0.07643905778725942, "rewards/reward_num_unique_chars/mean": 0.06572048738598824, "rewards/reward_num_unique_chars/std": 0.24313671390215555, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10763888888888888, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.6666666666666, "completions/mean_length": 271.1293538411458, "completions/mean_terminated_length": 180.34170532226562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.18147448015122875, "grad_norm": 0.11351985484361649, "kl": 0.000451435645421346, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 3655650.0, "reward": 0.5510341823101044, "reward_std": 0.5266622304916382, "rewards/get_embedding_sim/mean": 0.332284152507782, "rewards/get_embedding_sim/std": 0.07756081471840541, "rewards/reward_num_unique_chars/mean": 0.07334695508082707, "rewards/reward_num_unique_chars/std": 0.2513364603122075, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 322.98785400390625, "completions/mean_terminated_length": 200.65520731608072, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.22684310018903592, "grad_norm": 0.1555059850215912, "kl": 0.0004805326461791992, "learning_rate": 1e-06, "loss": 0.0487, "num_tokens": 4589956.0, "reward": 0.5868227481842041, "reward_std": 0.5241502523422241, "rewards/get_embedding_sim/mean": 0.3524477581183116, "rewards/get_embedding_sim/std": 0.07752909014622371, "rewards/reward_num_unique_chars/mean": 0.0786214725424846, "rewards/reward_num_unique_chars/std": 0.2524682929118474, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09982638888888888, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 263.34288533528644, "completions/mean_terminated_length": 178.2960662841797, "completions/min_length": 17.666666666666668, "completions/min_terminated_length": 17.666666666666668, "epoch": 0.2722117202268431, "grad_norm": 0.15412873029708862, "kl": 0.0006418625513712565, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 5476095.0, "reward": 0.7471893429756165, "reward_std": 0.6821479399998983, "rewards/get_embedding_sim/mean": 0.3487518032391866, "rewards/get_embedding_sim/std": 0.07891600827376048, "rewards/reward_num_unique_chars/mean": 0.13329477856556574, "rewards/reward_num_unique_chars/std": 0.3387155433495839, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.6666666666666, "completions/mean_length": 239.4496612548828, "completions/mean_terminated_length": 168.3182576497396, "completions/min_length": 14.666666666666666, "completions/min_terminated_length": 14.666666666666666, "epoch": 0.31758034026465026, "grad_norm": 0.19775940477848053, "kl": 0.001989444096883138, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 6334901.0, "reward": 0.6757530768712362, "reward_std": 0.5965128739674886, "rewards/get_embedding_sim/mean": 0.34762802720069885, "rewards/get_embedding_sim/std": 0.061141988883415856, "rewards/reward_num_unique_chars/mean": 0.11084798475106557, "rewards/reward_num_unique_chars/std": 0.31187912821769714, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09722222222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 223.53906758626303, "completions/mean_terminated_length": 137.0214869181315, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3629489603024575, "grad_norm": 0.159920796751976, "kl": 0.003296534220377604, "learning_rate": 1e-06, "loss": 0.0371, "num_tokens": 7177202.0, "reward": 0.7903947830200195, "reward_std": 0.7087553143501282, "rewards/get_embedding_sim/mean": 0.37893640001614887, "rewards/get_embedding_sim/std": 0.07807190467913945, "rewards/reward_num_unique_chars/mean": 0.13736129055420557, "rewards/reward_num_unique_chars/std": 0.3415720462799072, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04340277777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 189.19965616861978, "completions/mean_terminated_length": 151.40495808919272, "completions/min_length": 14.333333333333334, "completions/min_terminated_length": 14.333333333333334, "epoch": 0.40831758034026466, "grad_norm": 0.07982576638460159, "kl": 0.006541093190511067, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 7978744.0, "reward": 0.9137211839358012, "reward_std": 0.718700091044108, "rewards/get_embedding_sim/mean": 0.36163782080014545, "rewards/get_embedding_sim/std": 0.0756089190642039, "rewards/reward_num_unique_chars/mean": 0.1940170923868815, "rewards/reward_num_unique_chars/std": 0.3915421764055888, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04253472222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.3333333333334, "completions/mean_length": 195.49305725097656, "completions/mean_terminated_length": 158.83220418294272, "completions/min_length": 20.666666666666668, "completions/min_terminated_length": 20.666666666666668, "epoch": 0.45368620037807184, "grad_norm": 0.13737693428993225, "kl": 0.008511225382486979, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 8781392.0, "reward": 0.6430213848749796, "reward_std": 0.5739699502786001, "rewards/get_embedding_sim/mean": 0.3461463352044423, "rewards/get_embedding_sim/std": 0.07520903646945953, "rewards/reward_num_unique_chars/mean": 0.09946840691069762, "rewards/reward_num_unique_chars/std": 0.26744696994622547, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.045138888888888874, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.3333333333334, "completions/mean_length": 182.24132283528647, "completions/mean_terminated_length": 142.2494913736979, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.499054820415879, "grad_norm": 0.09873297065496445, "kl": 0.012536366780598959, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 9570790.0, "reward": 0.6741114258766174, "reward_std": 0.6212253371874491, "rewards/get_embedding_sim/mean": 0.35900717973709106, "rewards/get_embedding_sim/std": 0.0736292873819669, "rewards/reward_num_unique_chars/mean": 0.10573149348298709, "rewards/reward_num_unique_chars/std": 0.3019101023674011, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02864583333333337, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 177.17447916666666, "completions/mean_terminated_length": 152.01558430989584, "completions/min_length": 13.333333333333334, "completions/min_terminated_length": 13.333333333333334, "epoch": 0.5444234404536862, "grad_norm": 0.11605791002511978, "kl": 0.025735855102539062, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 10355503.0, "reward": 0.5276843309402466, "reward_std": 0.502232551574707, "rewards/get_embedding_sim/mean": 0.3662259578704834, "rewards/get_embedding_sim/std": 0.08873194952805837, "rewards/reward_num_unique_chars/mean": 0.054056490461031594, "rewards/reward_num_unique_chars/std": 0.22509411970774332, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.3333333333334, "completions/mean_length": 166.70486450195312, "completions/mean_terminated_length": 136.90866088867188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5897920604914934, "grad_norm": 4.433223724365234, "kl": 0.2714697519938151, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 11137371.0, "reward": 1.016512393951416, "reward_std": 0.7703921596209208, "rewards/get_embedding_sim/mean": 0.3654707372188568, "rewards/get_embedding_sim/std": 0.09141946583986282, "rewards/reward_num_unique_chars/mean": 0.21773314972718558, "rewards/reward_num_unique_chars/std": 0.3968968590100606, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 189.52778116861978, "completions/mean_terminated_length": 164.9041544596354, "completions/min_length": 23.333333333333332, "completions/min_terminated_length": 23.333333333333332, "epoch": 0.6351606805293005, "grad_norm": 0.09748831391334534, "kl": 0.030905405680338543, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 11945531.0, "reward": 0.7334451675415039, "reward_std": 0.6726242105166117, "rewards/get_embedding_sim/mean": 0.3584451178709666, "rewards/get_embedding_sim/std": 0.09209247678518295, "rewards/reward_num_unique_chars/mean": 0.12500000248352686, "rewards/reward_num_unique_chars/std": 0.32361265023549396, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02690972222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.6666666666666, "completions/mean_length": 168.27778116861978, "completions/mean_terminated_length": 144.8372548421224, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6805293005671077, "grad_norm": 0.0829065814614296, "kl": 0.028959910074869793, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 12715243.0, "reward": 0.77097487449646, "reward_std": 0.5438057780265808, "rewards/get_embedding_sim/mean": 0.3829539120197296, "rewards/get_embedding_sim/std": 0.09809910257657369, "rewards/reward_num_unique_chars/mean": 0.13059413681427637, "rewards/reward_num_unique_chars/std": 0.31896015008290607, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022569444444444458, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 157.07465616861978, "completions/mean_terminated_length": 136.9731216430664, "completions/min_length": 10.666666666666666, "completions/min_terminated_length": 10.666666666666666, "epoch": 0.725897920604915, "grad_norm": 0.09397952258586884, "kl": 0.044497172037760414, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 13477473.0, "reward": 0.8347963392734528, "reward_std": 0.6062483191490173, "rewards/get_embedding_sim/mean": 0.3972962299982707, "rewards/get_embedding_sim/std": 0.10549474010864894, "rewards/reward_num_unique_chars/mean": 0.14635550851623216, "rewards/reward_num_unique_chars/std": 0.3286245862642924, "step": 48 }, { "epoch": 0.7712665406427222, "grad_norm": 1.0158724784851074, "learning_rate": 1e-06, "loss": 0.0131, "step": 51 }, { "epoch": 0.7712665406427222, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.1023065476190476, "eval_completions/max_length": 850.0357142857143, "eval_completions/max_terminated_length": 649.3214285714286, "eval_completions/mean_length": 221.02716418675013, "eval_completions/mean_terminated_length": 136.32026665551322, "eval_completions/min_length": 25.892857142857142, "eval_completions/min_terminated_length": 25.892857142857142, "eval_kl": 0.05330167497907366, "eval_loss": 0.022182755172252655, "eval_num_tokens": 14225380.0, "eval_reward": 0.6993682932640825, "eval_reward_std": 0.6098802514108164, "eval_rewards/get_embedding_sim/mean": 0.408073626724737, "eval_rewards/get_embedding_sim/std": 0.08180103763671857, "eval_rewards/reward_num_unique_chars/mean": 0.09725111100955733, "eval_rewards/reward_num_unique_chars/std": 0.1932054047605821, "eval_runtime": 6593.9311, "eval_samples_per_second": 0.008, "eval_steps_per_second": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.031249999999999983, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.5, "completions/mean_length": 173.74523162841797, "completions/mean_terminated_length": 146.51427459716797, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8166351606805293, "grad_norm": 0.13796095550060272, "kl": 0.061681111653645836, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 15035350.0, "reward": 0.7754727999369303, "reward_std": 0.6537116318941116, "rewards/get_embedding_sim/mean": 0.3978685835997264, "rewards/get_embedding_sim/std": 0.10740451887249947, "rewards/reward_num_unique_chars/mean": 0.1262344146768252, "rewards/reward_num_unique_chars/std": 0.31391797463099164, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032986111111111126, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.6666666666666, "completions/mean_length": 179.25955708821616, "completions/mean_terminated_length": 150.37726338704428, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 0.8620037807183365, "grad_norm": 9.711634635925293, "kl": 0.30323028564453125, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 15831681.0, "reward": 0.8330511649449667, "reward_std": 0.7141762177149454, "rewards/get_embedding_sim/mean": 0.392946978410085, "rewards/get_embedding_sim/std": 0.10180553545554479, "rewards/reward_num_unique_chars/mean": 0.14735475679238638, "rewards/reward_num_unique_chars/std": 0.34930500388145447, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03472222222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.3333333333334, "completions/mean_length": 175.86719258626303, "completions/mean_terminated_length": 145.65855916341147, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9073724007561437, "grad_norm": 0.11200369894504547, "kl": 0.056910196940104164, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 16616760.0, "reward": 0.9081356525421143, "reward_std": 0.7194747726122538, "rewards/get_embedding_sim/mean": 0.40292728940645856, "rewards/get_embedding_sim/std": 0.11402523269255956, "rewards/reward_num_unique_chars/mean": 0.17317021762331328, "rewards/reward_num_unique_chars/std": 0.34040839473406476, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888914, "completions/max_length": 1024.0, "completions/max_terminated_length": 768.3333333333334, "completions/mean_length": 162.06250508626303, "completions/mean_terminated_length": 149.80463155110678, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9527410207939508, "grad_norm": 0.08261118829250336, "kl": 0.08898417154947917, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 17393280.0, "reward": 0.8178274830182394, "reward_std": 0.7919754783312479, "rewards/get_embedding_sim/mean": 0.4141816198825836, "rewards/get_embedding_sim/std": 0.11467475444078445, "rewards/reward_num_unique_chars/mean": 0.13454861069718996, "rewards/reward_num_unique_chars/std": 0.33077992002169293, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0385656130268199, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.3333333333334, "completions/mean_length": 183.8406778971354, "completions/mean_terminated_length": 149.68450419108072, "completions/min_length": 9.333333333333334, "completions/min_terminated_length": 9.333333333333334, "epoch": 0.998109640831758, "grad_norm": 0.07706479728221893, "kl": 0.040013631184895836, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 18173273.0, "reward": 0.9729219675064087, "reward_std": 0.8207030693689982, "rewards/get_embedding_sim/mean": 0.4208385944366455, "rewards/get_embedding_sim/std": 0.11511148760716121, "rewards/reward_num_unique_chars/mean": 0.18582184116045633, "rewards/reward_num_unique_chars/std": 0.3880065679550171, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 182.76996866861978, "completions/mean_terminated_length": 150.8812713623047, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.0453686200378072, "grad_norm": 0.12481274455785751, "kl": 0.06750742594401042, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 18973648.0, "reward": 0.7922607262929281, "reward_std": 0.778564453125, "rewards/get_embedding_sim/mean": 0.43028150995572406, "rewards/get_embedding_sim/std": 0.11360271523396175, "rewards/reward_num_unique_chars/mean": 0.12104393541812897, "rewards/reward_num_unique_chars/std": 0.3201603094736735, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.027777777777777752, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 164.88194529215494, "completions/mean_terminated_length": 140.49947357177734, "completions/min_length": 10.333333333333334, "completions/min_terminated_length": 10.333333333333334, "epoch": 1.0907372400756143, "grad_norm": 0.27184560894966125, "kl": 0.10882568359375, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 19738968.0, "reward": 0.9275963107744852, "reward_std": 0.841428816318512, "rewards/get_embedding_sim/mean": 0.4406171242396037, "rewards/get_embedding_sim/std": 0.11815810203552246, "rewards/reward_num_unique_chars/mean": 0.162567267815272, "rewards/reward_num_unique_chars/std": 0.36787914236386615, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.035590277777777755, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 176.44878641764322, "completions/mean_terminated_length": 145.19543965657553, "completions/min_length": 8.333333333333334, "completions/min_terminated_length": 8.333333333333334, "epoch": 1.1361058601134215, "grad_norm": 0.11772840470075607, "kl": 0.132965087890625, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 20526877.0, "reward": 0.7764408787091573, "reward_std": 0.7020115653673807, "rewards/get_embedding_sim/mean": 0.4378991524378459, "rewards/get_embedding_sim/std": 0.11236891647179921, "rewards/reward_num_unique_chars/mean": 0.11354367559154828, "rewards/reward_num_unique_chars/std": 0.31067532300949097, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030381944444444458, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.6666666666666, "completions/mean_length": 181.06771341959634, "completions/mean_terminated_length": 154.67694600423178, "completions/min_length": 10.333333333333334, "completions/min_terminated_length": 10.333333333333334, "epoch": 1.1814744801512287, "grad_norm": 0.21804682910442352, "kl": 0.09952545166015625, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 21325291.0, "reward": 0.8087505102157593, "reward_std": 0.7311090230941772, "rewards/get_embedding_sim/mean": 0.4597921272118886, "rewards/get_embedding_sim/std": 0.12011716266473134, "rewards/reward_num_unique_chars/mean": 0.11631944527228673, "rewards/reward_num_unique_chars/std": 0.3020235498746236, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.3333333333334, "completions/mean_length": 206.53125508626303, "completions/mean_terminated_length": 182.36050415039062, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 1.2268431001890359, "grad_norm": 0.1415005475282669, "kl": 0.170013427734375, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 22125439.0, "reward": 0.8599557876586914, "reward_std": 0.6892009973526001, "rewards/get_embedding_sim/mean": 0.45630990465482074, "rewards/get_embedding_sim/std": 0.11122701565424602, "rewards/reward_num_unique_chars/mean": 0.1345486119389534, "rewards/reward_num_unique_chars/std": 0.3128484884897868, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.039930555555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.6666666666666, "completions/mean_length": 214.3307342529297, "completions/mean_terminated_length": 180.5359090169271, "completions/min_length": 12.666666666666666, "completions/min_terminated_length": 12.666666666666666, "epoch": 1.272211720226843, "grad_norm": 0.11570374667644501, "kl": 0.06879933675130208, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 22955116.0, "reward": 0.9702663818995158, "reward_std": 0.7755107680956522, "rewards/get_embedding_sim/mean": 0.47287049889564514, "rewards/get_embedding_sim/std": 0.11713164548079173, "rewards/reward_num_unique_chars/mean": 0.16612045466899872, "rewards/reward_num_unique_chars/std": 0.3707600136597951, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032986111111111126, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 191.13976033528647, "completions/mean_terminated_length": 162.73322041829428, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 1.3175803402646502, "grad_norm": 0.19582344591617584, "kl": 0.08817799886067708, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 23758269.0, "reward": 0.8809124827384949, "reward_std": 0.7492716908454895, "rewards/get_embedding_sim/mean": 0.4720582564671834, "rewards/get_embedding_sim/std": 0.11799828956524532, "rewards/reward_num_unique_chars/mean": 0.13729924211899439, "rewards/reward_num_unique_chars/std": 0.3396035333474477, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032118055555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.6666666666666, "completions/mean_length": 155.8697967529297, "completions/mean_terminated_length": 127.03400421142578, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.3629489603024574, "grad_norm": 0.10882719606161118, "kl": 0.08435567220052083, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 24522615.0, "reward": 0.9096565643946329, "reward_std": 0.7040959596633911, "rewards/get_embedding_sim/mean": 0.4825731615225474, "rewards/get_embedding_sim/std": 0.11133117477099101, "rewards/reward_num_unique_chars/mean": 0.1426701620221138, "rewards/reward_num_unique_chars/std": 0.3476703961690267, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.3333333333334, "completions/mean_length": 171.0026092529297, "completions/mean_terminated_length": 145.80119832356772, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.4083175803402646, "grad_norm": 0.16034463047981262, "kl": 0.0664825439453125, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 25303194.0, "reward": 1.1405272086461384, "reward_std": 0.8232053716977438, "rewards/get_embedding_sim/mean": 0.4634438355763753, "rewards/get_embedding_sim/std": 0.11458807935317357, "rewards/reward_num_unique_chars/mean": 0.22588256498177847, "rewards/reward_num_unique_chars/std": 0.4179500639438629, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.038194444444444454, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.3333333333334, "completions/mean_length": 197.03039042154947, "completions/mean_terminated_length": 164.30577087402344, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.4536862003780717, "grad_norm": 1.1371827125549316, "kl": 0.16266377766927084, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 26107613.0, "reward": 0.8050010005633036, "reward_std": 0.7417031327883402, "rewards/get_embedding_sim/mean": 0.4768759409586589, "rewards/get_embedding_sim/std": 0.1163704867164294, "rewards/reward_num_unique_chars/mean": 0.10968360553185146, "rewards/reward_num_unique_chars/std": 0.29880866408348083, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037326388888888916, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.6666666666666, "completions/mean_length": 184.1883748372396, "completions/mean_terminated_length": 151.9073689778646, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 1.499054820415879, "grad_norm": 0.08430308103561401, "kl": 0.07194010416666667, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 26899254.0, "reward": 0.8056914011637369, "reward_std": 0.7580650448799133, "rewards/get_embedding_sim/mean": 0.48798303802808124, "rewards/get_embedding_sim/std": 0.11710481345653534, "rewards/reward_num_unique_chars/mean": 0.106216366092364, "rewards/reward_num_unique_chars/std": 0.3037123878796895, "step": 99 }, { "epoch": 1.544423440453686, "grad_norm": 0.11377694457769394, "learning_rate": 1e-06, "loss": 0.0254, "step": 102 }, { "epoch": 1.544423440453686, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.10007440476190474, "eval_completions/max_length": 869.5892857142857, "eval_completions/max_terminated_length": 639.0535714285714, "eval_completions/mean_length": 225.3545457976205, "eval_completions/mean_terminated_length": 140.344126360757, "eval_completions/min_length": 19.357142857142858, "eval_completions/min_terminated_length": 19.357142857142858, "eval_kl": 0.07553209577287946, "eval_loss": 0.037391725927591324, "eval_num_tokens": 27703933.0, "eval_reward": 0.7799429536930153, "eval_reward_std": 0.6953434666751751, "eval_rewards/get_embedding_sim/mean": 0.47748757898807526, "eval_rewards/get_embedding_sim/std": 0.0975222562971924, "eval_rewards/reward_num_unique_chars/mean": 0.10085803000921649, "eval_rewards/reward_num_unique_chars/std": 0.22144863993993827, "eval_runtime": 5743.3373, "eval_samples_per_second": 0.01, "eval_steps_per_second": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.3333333333334, "completions/mean_length": 192.20443216959634, "completions/mean_terminated_length": 158.34148915608725, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.5897920604914932, "grad_norm": 0.14566491544246674, "kl": 0.0706634521484375, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 28512525.0, "reward": 0.8942790528138479, "reward_std": 0.7560157477855682, "rewards/get_embedding_sim/mean": 0.47891440490881604, "rewards/get_embedding_sim/std": 0.11716391022006671, "rewards/reward_num_unique_chars/mean": 0.1388231466213862, "rewards/reward_num_unique_chars/std": 0.3291383981704712, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03472222222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 205.2352498372396, "completions/mean_terminated_length": 176.03035990397134, "completions/min_length": 5.333333333333333, "completions/min_terminated_length": 5.333333333333333, "epoch": 1.6351606805293004, "grad_norm": 0.08621126413345337, "kl": 0.08469390869140625, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 29338780.0, "reward": 0.8891541957855225, "reward_std": 0.8206586241722107, "rewards/get_embedding_sim/mean": 0.4672791560490926, "rewards/get_embedding_sim/std": 0.11891171584526698, "rewards/reward_num_unique_chars/mean": 0.14074058582385382, "rewards/reward_num_unique_chars/std": 0.34693758686383563, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.032118055555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 195.6493123372396, "completions/mean_terminated_length": 168.39810689290366, "completions/min_length": 8.333333333333334, "completions/min_terminated_length": 8.333333333333334, "epoch": 1.6805293005671076, "grad_norm": 0.09061074256896973, "kl": 0.061197916666666664, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 30140024.0, "reward": 0.8708882729212443, "reward_std": 0.6558753848075867, "rewards/get_embedding_sim/mean": 0.49067989985148114, "rewards/get_embedding_sim/std": 0.11308762182792027, "rewards/reward_num_unique_chars/mean": 0.12729256972670555, "rewards/reward_num_unique_chars/std": 0.3063565840323766, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.3333333333334, "completions/mean_length": 168.0746612548828, "completions/mean_terminated_length": 142.76580810546875, "completions/min_length": 9.666666666666666, "completions/min_terminated_length": 9.666666666666666, "epoch": 1.725897920604915, "grad_norm": 0.07783554494380951, "kl": 0.0714569091796875, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 30914926.0, "reward": 0.9490655660629272, "reward_std": 0.8103155891100565, "rewards/get_embedding_sim/mean": 0.4881279369195302, "rewards/get_embedding_sim/std": 0.1105448305606842, "rewards/reward_num_unique_chars/mean": 0.1541931927204132, "rewards/reward_num_unique_chars/std": 0.3540232678254445, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.3333333333334, "completions/mean_length": 165.41146341959634, "completions/mean_terminated_length": 132.9504165649414, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 1.7712665406427222, "grad_norm": 0.08974138647317886, "kl": 0.08209737141927083, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 31681096.0, "reward": 0.8335268894831339, "reward_std": 0.716159999370575, "rewards/get_embedding_sim/mean": 0.4897768298784892, "rewards/get_embedding_sim/std": 0.12030263990163803, "rewards/reward_num_unique_chars/mean": 0.11484397575259209, "rewards/reward_num_unique_chars/std": 0.30110697944959003, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.040798611111111084, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.6666666666666, "completions/mean_length": 202.7447967529297, "completions/mean_terminated_length": 167.81108601888022, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.8166351606805293, "grad_norm": 0.10199436545372009, "kl": 0.07517751057942708, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 32496610.0, "reward": 0.860044519106547, "reward_std": 0.7602864901224772, "rewards/get_embedding_sim/mean": 0.4876486460367839, "rewards/get_embedding_sim/std": 0.11599687735239665, "rewards/reward_num_unique_chars/mean": 0.12465803200999896, "rewards/reward_num_unique_chars/std": 0.3174656927585602, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 187.55816141764322, "completions/mean_terminated_length": 160.59460957845053, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.8620037807183365, "grad_norm": 1.411887526512146, "kl": 0.13691202799479166, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 33302501.0, "reward": 0.8890740275382996, "reward_std": 0.8625878095626831, "rewards/get_embedding_sim/mean": 0.4724073112010956, "rewards/get_embedding_sim/std": 0.11733246843020122, "rewards/reward_num_unique_chars/mean": 0.13933624823888144, "rewards/reward_num_unique_chars/std": 0.34191163380940753, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025173611111111088, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 171.25347900390625, "completions/mean_terminated_length": 149.3338419596354, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 1.9073724007561437, "grad_norm": 0.10643448680639267, "kl": 0.0879974365234375, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 34082265.0, "reward": 1.0310540199279785, "reward_std": 0.83027583360672, "rewards/get_embedding_sim/mean": 0.4763664702574412, "rewards/get_embedding_sim/std": 0.1186542958021164, "rewards/reward_num_unique_chars/mean": 0.1909722164273262, "rewards/reward_num_unique_chars/std": 0.3733387490113576, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.3333333333334, "completions/mean_length": 175.5920206705729, "completions/mean_terminated_length": 157.43394470214844, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 1.9527410207939508, "grad_norm": 0.083248071372509, "kl": 0.07168070475260417, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 34874371.0, "reward": 0.9281045397122701, "reward_std": 0.9109238783518473, "rewards/get_embedding_sim/mean": 0.4853961269060771, "rewards/get_embedding_sim/std": 0.12320189674695332, "rewards/reward_num_unique_chars/mean": 0.14780289431413016, "rewards/reward_num_unique_chars/std": 0.35039229194323224, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.6666666666666, "completions/mean_length": 177.00694783528647, "completions/mean_terminated_length": 154.32052103678384, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.998109640831758, "grad_norm": 0.0692070946097374, "kl": 0.07420603434244792, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 35655291.0, "reward": 1.0533938805262248, "reward_std": 0.9196257392565409, "rewards/get_embedding_sim/mean": 0.4934980074564616, "rewards/get_embedding_sim/std": 0.11738153547048569, "rewards/reward_num_unique_chars/mean": 0.1887365331252416, "rewards/reward_num_unique_chars/std": 0.3903753161430359, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.3333333333334, "completions/mean_length": 180.38281758626303, "completions/mean_terminated_length": 155.4217987060547, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 2.045368620037807, "grad_norm": 0.1576370894908905, "kl": 0.11295064290364583, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 36452916.0, "reward": 0.9172398447990417, "reward_std": 0.9673983256022135, "rewards/get_embedding_sim/mean": 0.48755229512850445, "rewards/get_embedding_sim/std": 0.11796744416157405, "rewards/reward_num_unique_chars/mean": 0.14384527256091437, "rewards/reward_num_unique_chars/std": 0.3497835397720337, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026909722222222248, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.3333333333334, "completions/mean_length": 170.28646341959634, "completions/mean_terminated_length": 146.62708536783853, "completions/min_length": 5.333333333333333, "completions/min_terminated_length": 5.333333333333333, "epoch": 2.0907372400756143, "grad_norm": 0.5436683893203735, "kl": 0.1591796875, "learning_rate": 1e-06, "loss": 0.0342, "num_tokens": 37224462.0, "reward": 1.0363986889521282, "reward_std": 0.9765956203142802, "rewards/get_embedding_sim/mean": 0.5051485598087311, "rewards/get_embedding_sim/std": 0.11746565749247868, "rewards/reward_num_unique_chars/mean": 0.17818759878476462, "rewards/reward_num_unique_chars/std": 0.3826761841773987, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.038194444444444454, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 174.79948933919272, "completions/mean_terminated_length": 140.99752298990884, "completions/min_length": 8.333333333333334, "completions/min_terminated_length": 8.333333333333334, "epoch": 2.1361058601134215, "grad_norm": 0.11291619390249252, "kl": 0.13877360026041666, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 38010471.0, "reward": 0.936789353688558, "reward_std": 0.8961972991625468, "rewards/get_embedding_sim/mean": 0.49408095081647235, "rewards/get_embedding_sim/std": 0.11985934029022853, "rewards/reward_num_unique_chars/mean": 0.14794171353181204, "rewards/reward_num_unique_chars/std": 0.3542039096355438, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.055555555555555546, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 189.78819783528647, "completions/mean_terminated_length": 140.69151306152344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.1814744801512287, "grad_norm": 0.12472045421600342, "kl": 0.2596638997395833, "learning_rate": 1e-06, "loss": 0.0503, "num_tokens": 38811923.0, "reward": 1.0600279172261555, "reward_std": 0.9520064989725748, "rewards/get_embedding_sim/mean": 0.5053403675556183, "rewards/get_embedding_sim/std": 0.12481692930062611, "rewards/reward_num_unique_chars/mean": 0.18512474993864694, "rewards/reward_num_unique_chars/std": 0.3818445106347402, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03559027777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.3333333333334, "completions/mean_length": 186.22656758626303, "completions/mean_terminated_length": 155.0603485107422, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.226843100189036, "grad_norm": 0.2647012174129486, "kl": 0.14057413736979166, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 39616280.0, "reward": 1.021846095720927, "reward_std": 1.002595583597819, "rewards/get_embedding_sim/mean": 0.48799189925193787, "rewards/get_embedding_sim/std": 0.12229083478450775, "rewards/reward_num_unique_chars/mean": 0.17855327824751535, "rewards/reward_num_unique_chars/std": 0.3721735179424286, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01996527777777779, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.6666666666666, "completions/mean_length": 152.78299458821616, "completions/mean_terminated_length": 135.05128479003906, "completions/min_length": 7.666666666666667, "completions/min_terminated_length": 7.666666666666667, "epoch": 2.272211720226843, "grad_norm": 0.07731039077043533, "kl": 0.15080769856770834, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 40382110.0, "reward": 1.0974433422088623, "reward_std": 0.9521243373552958, "rewards/get_embedding_sim/mean": 0.516714076201121, "rewards/get_embedding_sim/std": 0.12718145549297333, "rewards/reward_num_unique_chars/mean": 0.1947579632202784, "rewards/reward_num_unique_chars/std": 0.3964957594871521, "step": 150 }, { "epoch": 2.31758034026465, "grad_norm": 0.9586585760116577, "learning_rate": 1e-06, "loss": 0.0347, "step": 153 }, { "epoch": 2.31758034026465, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.07328869047619047, "eval_completions/max_length": 872.6071428571429, "eval_completions/max_terminated_length": 616.6071428571429, "eval_completions/mean_length": 177.7972524847303, "eval_completions/mean_terminated_length": 111.63820842334202, "eval_completions/min_length": 14.25, "eval_completions/min_terminated_length": 14.25, "eval_kl": 0.15661403111049108, "eval_loss": 0.05365554988384247, "eval_num_tokens": 41144925.0, "eval_reward": 1.0100113941090447, "eval_reward_std": 0.9766364488750696, "eval_rewards/get_embedding_sim/mean": 0.5055470722062247, "eval_rewards/get_embedding_sim/std": 0.10236791674313801, "eval_rewards/reward_num_unique_chars/mean": 0.16864551766775548, "eval_rewards/reward_num_unique_chars/std": 0.3155075231833117, "eval_runtime": 5610.1574, "eval_samples_per_second": 0.01, "eval_steps_per_second": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028211805555555563, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.1666666666666, "completions/mean_length": 165.75738271077475, "completions/mean_terminated_length": 140.97229131062826, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 2.3629489603024574, "grad_norm": 0.13255949318408966, "kl": 0.1971893310546875, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 41941119.0, "reward": 1.0157166123390198, "reward_std": 0.9526964128017426, "rewards/get_embedding_sim/mean": 0.5066019793351492, "rewards/get_embedding_sim/std": 0.11097632969419162, "rewards/reward_num_unique_chars/mean": 0.17024830107887587, "rewards/reward_num_unique_chars/std": 0.37277790407339734, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026909722222222248, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 145.26649729410806, "completions/mean_terminated_length": 121.1592280069987, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 2.4083175803402646, "grad_norm": 0.10361862182617188, "kl": 0.1372528076171875, "learning_rate": 1e-06, "loss": 0.0383, "num_tokens": 42690418.0, "reward": 1.2976791461308796, "reward_std": 1.1292773286501567, "rewards/get_embedding_sim/mean": 0.5060124099254608, "rewards/get_embedding_sim/std": 0.11317289372285207, "rewards/reward_num_unique_chars/mean": 0.26442377765973407, "rewards/reward_num_unique_chars/std": 0.43167150020599365, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026909722222222248, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.3333333333334, "completions/mean_length": 145.79688008626303, "completions/mean_terminated_length": 121.46813710530598, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.4536862003780717, "grad_norm": 0.0873284786939621, "kl": 0.17525736490885416, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 43439080.0, "reward": 1.166001319885254, "reward_std": 0.998184601465861, "rewards/get_embedding_sim/mean": 0.5045428971449534, "rewards/get_embedding_sim/std": 0.12551463643709818, "rewards/reward_num_unique_chars/mean": 0.22076034545898438, "rewards/reward_num_unique_chars/std": 0.40754825870196026, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 171.04340616861978, "completions/mean_terminated_length": 145.85011291503906, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.499054820415879, "grad_norm": 0.0893421620130539, "kl": 0.1564788818359375, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 44225946.0, "reward": 1.0508646965026855, "reward_std": 1.0311030149459839, "rewards/get_embedding_sim/mean": 0.49878130356470746, "rewards/get_embedding_sim/std": 0.12252787003914516, "rewards/reward_num_unique_chars/mean": 0.18418416877587637, "rewards/reward_num_unique_chars/std": 0.38810135920842487, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02256944444444442, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.3333333333334, "completions/mean_length": 149.57465616861978, "completions/mean_terminated_length": 129.38720703125, "completions/min_length": 5.333333333333333, "completions/min_terminated_length": 5.333333333333333, "epoch": 2.544423440453686, "grad_norm": 0.11376336216926575, "kl": 0.17649332682291666, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 44975264.0, "reward": 1.2923760414123535, "reward_std": 1.15834375222524, "rewards/get_embedding_sim/mean": 0.5293551087379456, "rewards/get_embedding_sim/std": 0.12636979669332504, "rewards/reward_num_unique_chars/mean": 0.255620613694191, "rewards/reward_num_unique_chars/std": 0.4349779784679413, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021701388888888878, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.6666666666666, "completions/mean_length": 151.2604217529297, "completions/mean_terminated_length": 131.89202372233072, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 2.5897920604914932, "grad_norm": 0.1699657291173935, "kl": 0.176025390625, "learning_rate": 1e-06, "loss": 0.0416, "num_tokens": 45727292.0, "reward": 1.1844958066940308, "reward_std": 1.0816868146260579, "rewards/get_embedding_sim/mean": 0.5048082073529562, "rewards/get_embedding_sim/std": 0.11827733864386876, "rewards/reward_num_unique_chars/mean": 0.22690473993619284, "rewards/reward_num_unique_chars/std": 0.41608301798502606, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888876, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 140.05816650390625, "completions/mean_terminated_length": 127.6211166381836, "completions/min_length": 6.333333333333333, "completions/min_terminated_length": 6.333333333333333, "epoch": 2.6351606805293004, "grad_norm": 0.08062685281038284, "kl": 0.240142822265625, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 46478463.0, "reward": 1.0848047733306885, "reward_std": 1.0851068099339802, "rewards/get_embedding_sim/mean": 0.5197005073229471, "rewards/get_embedding_sim/std": 0.11266261339187622, "rewards/reward_num_unique_chars/mean": 0.1886785626411438, "rewards/reward_num_unique_chars/std": 0.39136550823847455, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025173611111111088, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 142.6154530843099, "completions/mean_terminated_length": 119.86089833577473, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.6805293005671076, "grad_norm": 0.1051354631781578, "kl": 0.22475179036458334, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 47225476.0, "reward": 1.142371932665507, "reward_std": 1.070401946703593, "rewards/get_embedding_sim/mean": 0.49914271632830304, "rewards/get_embedding_sim/std": 0.11159212638934453, "rewards/reward_num_unique_chars/mean": 0.2148823787768682, "rewards/reward_num_unique_chars/std": 0.40812622507413227, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 128.2951431274414, "completions/mean_terminated_length": 111.62453969319661, "completions/min_length": 7.333333333333333, "completions/min_terminated_length": 7.333333333333333, "epoch": 2.7258979206049148, "grad_norm": 0.20546282827854156, "kl": 0.283447265625, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 47955752.0, "reward": 1.1857277949651082, "reward_std": 1.1706757545471191, "rewards/get_embedding_sim/mean": 0.513852725426356, "rewards/get_embedding_sim/std": 0.12434107561906178, "rewards/reward_num_unique_chars/mean": 0.23464342455069223, "rewards/reward_num_unique_chars/std": 0.42252803842226666, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 1024.0, "completions/max_terminated_length": 902.6666666666666, "completions/mean_length": 130.4401067097982, "completions/mean_terminated_length": 113.8047103881836, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.7712665406427224, "grad_norm": 0.08767585456371307, "kl": 0.4050394694010417, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 48670211.0, "reward": 1.2997503280639648, "reward_std": 1.2144495646158855, "rewards/get_embedding_sim/mean": 0.5237086117267609, "rewards/get_embedding_sim/std": 0.1108636533220609, "rewards/reward_num_unique_chars/mean": 0.2586805572112401, "rewards/reward_num_unique_chars/std": 0.4370884597301483, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.023437500000000038, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 135.30035146077475, "completions/mean_terminated_length": 113.9994608561198, "completions/min_length": 5.666666666666667, "completions/min_terminated_length": 5.666666666666667, "epoch": 2.816635160680529, "grad_norm": 0.3577604591846466, "kl": 0.2592061360677083, "learning_rate": 1e-06, "loss": 0.0498, "num_tokens": 49402749.0, "reward": 1.2939318418502808, "reward_std": 1.2120266358057659, "rewards/get_embedding_sim/mean": 0.5204942027727762, "rewards/get_embedding_sim/std": 0.11977454274892807, "rewards/reward_num_unique_chars/mean": 0.2582635283470154, "rewards/reward_num_unique_chars/std": 0.43749914566675824, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888876, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 109.67014058430989, "completions/mean_terminated_length": 96.77264912923177, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 2.8620037807183367, "grad_norm": 0.1368367075920105, "kl": 0.3499857584635417, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 50108497.0, "reward": 1.5523497263590496, "reward_std": 1.214170217514038, "rewards/get_embedding_sim/mean": 0.534120500087738, "rewards/get_embedding_sim/std": 0.10954815397659938, "rewards/reward_num_unique_chars/mean": 0.3397156894207001, "rewards/reward_num_unique_chars/std": 0.46985835830370587, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01909722222222221, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.6666666666666, "completions/mean_length": 125.33333841959636, "completions/mean_terminated_length": 107.79783884684245, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.9073724007561434, "grad_norm": 1.6821552515029907, "kl": 0.4345550537109375, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 50831329.0, "reward": 1.3425734440485637, "reward_std": 1.191293756167094, "rewards/get_embedding_sim/mean": 0.5404900709788004, "rewards/get_embedding_sim/std": 0.11695743352174759, "rewards/reward_num_unique_chars/mean": 0.26780080795288086, "rewards/reward_num_unique_chars/std": 0.44234869877497357, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009548611111111124, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 102.75347646077473, "completions/mean_terminated_length": 93.89341735839844, "completions/min_length": 8.666666666666666, "completions/min_terminated_length": 8.666666666666666, "epoch": 2.952741020793951, "grad_norm": 0.11723087728023529, "kl": 0.3047281901041667, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 51524645.0, "reward": 1.6216003100077312, "reward_std": 1.1864676475524902, "rewards/get_embedding_sim/mean": 0.5434751510620117, "rewards/get_embedding_sim/std": 0.12491280088822047, "rewards/reward_num_unique_chars/mean": 0.359375, "rewards/reward_num_unique_chars/std": 0.4740845561027527, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01848659003831421, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 131.5273691813151, "completions/mean_terminated_length": 114.69828796386719, "completions/min_length": 6.666666666666667, "completions/min_terminated_length": 6.666666666666667, "epoch": 2.998109640831758, "grad_norm": 0.10458555072546005, "kl": 0.2809855143229167, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 52251674.0, "reward": 1.4293763637542725, "reward_std": 1.2686160405476887, "rewards/get_embedding_sim/mean": 0.5335429906845093, "rewards/get_embedding_sim/std": 0.11212129394213359, "rewards/reward_num_unique_chars/mean": 0.29888081053892773, "rewards/reward_num_unique_chars/std": 0.4557340343793233, "step": 198 } ], "logging_steps": 3, "max_steps": 198, "num_input_tokens_seen": 52251674, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }