{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.012, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.033851010352373125, "clip_ratio/high_mean": 0.011871843505650759, "clip_ratio/low_mean": 0.024242424033582212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03611426735296845, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 297.675, "completions/mean_terminated_length": 297.675, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.33769991919398307, "epoch": 0.0008, "frac_reward_zero_std": 0.5, "grad_norm": 0.030647173523902893, "kl": 0.022074293252080678, "learning_rate": 8.529119999999999e-07, "loss": -0.0006066907197237014, "num_tokens": 136458.0, "reward": 0.9300000309944153, "reward_std": 0.23334523439407348, "rewards/env_goofspiel_reward/mean": 0.9300000309944153, "rewards/env_goofspiel_reward/std": 0.3451612591743469, "sampling/importance_sampling_ratio/max": 1.5456702947616576, "sampling/importance_sampling_ratio/mean": 0.32863556742668154, "sampling/importance_sampling_ratio/min": 0.00010910680049249776, "sampling/sampling_logp_difference/max": 7.469822406768799, "sampling/sampling_logp_difference/mean": 0.680775272846222, "step": 5, "step_time": 4.723534681799992 }, { "clip_ratio/high_max": 0.052361111342906955, "clip_ratio/high_mean": 0.014340277761220932, "clip_ratio/low_mean": 0.015763888787478208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030104166455566884, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 291.9125, "completions/mean_terminated_length": 291.9125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3392209455370903, "epoch": 0.0016, "frac_reward_zero_std": 0.4375, "grad_norm": 0.02777782641351223, "kl": 0.03302585552446544, "learning_rate": 1.919052e-06, "loss": -0.0005224664695560932, "num_tokens": 270583.0, "reward": 0.8775000214576721, "reward_std": 0.26516505479812624, "rewards/env_goofspiel_reward/mean": 0.8775000214576721, "rewards/env_goofspiel_reward/std": 0.3663728296756744, "sampling/importance_sampling_ratio/max": 1.54481360912323, "sampling/importance_sampling_ratio/mean": 0.3381913095712662, "sampling/importance_sampling_ratio/min": 2.4473399389535188e-05, "sampling/sampling_logp_difference/max": 8.968151187896728, "sampling/sampling_logp_difference/mean": 0.7433344721794128, "step": 10, "step_time": 4.138045286000079 }, { "clip_ratio/high_max": 0.05089646503329277, "clip_ratio/high_mean": 0.018314393889158963, "clip_ratio/low_mean": 0.026897096075117588, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04521148977801204, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 284.84375, "completions/mean_terminated_length": 284.84375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.34051789045333863, "epoch": 0.0024, "frac_reward_zero_std": 0.5875, "grad_norm": 0.013990325853228569, "kl": 0.040778578049503265, "learning_rate": 2.985192e-06, "loss": -0.0005226288456469774, "num_tokens": 403304.0, "reward": 0.9674375176429748, "reward_std": 0.19100722074508666, "rewards/env_goofspiel_reward/mean": 0.9674375176429748, "rewards/env_goofspiel_reward/std": 0.3267829120159149, "sampling/importance_sampling_ratio/max": 1.7882837533950806, "sampling/importance_sampling_ratio/mean": 0.3831939160823822, "sampling/importance_sampling_ratio/min": 0.00034590021532494576, "sampling/sampling_logp_difference/max": 6.792400264739991, "sampling/sampling_logp_difference/mean": 0.6026189684867859, "step": 15, "step_time": 4.541797615800033 }, { "clip_ratio/high_max": 0.08027777820825577, "clip_ratio/high_mean": 0.02395833358168602, "clip_ratio/low_mean": 0.019027777854353188, "clip_ratio/low_min": 0.00625, "clip_ratio/region_mean": 0.04298611143603921, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 281.60625, "completions/mean_terminated_length": 281.60625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.28771451860666275, "epoch": 0.0032, "frac_reward_zero_std": 0.5375, "grad_norm": 0.029177065938711166, "kl": 0.0942368695512414, "learning_rate": 4.051332e-06, "loss": -0.00038087132852524517, "num_tokens": 536119.0, "reward": 0.9637500643730164, "reward_std": 0.2068287342786789, "rewards/env_goofspiel_reward/mean": 0.9637500643730164, "rewards/env_goofspiel_reward/std": 0.3382142722606659, "sampling/importance_sampling_ratio/max": 1.751455307006836, "sampling/importance_sampling_ratio/mean": 0.45835237503051757, "sampling/importance_sampling_ratio/min": 0.00027077984723291595, "sampling/sampling_logp_difference/max": 8.25740842819214, "sampling/sampling_logp_difference/mean": 0.5326088547706604, "step": 20, "step_time": 4.071906338800045 }, { "clip_ratio/high_max": 0.05111111141741276, "clip_ratio/high_mean": 0.013914141431450843, "clip_ratio/low_mean": 0.02760506859049201, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04151920983567834, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 301.01875, "completions/mean_terminated_length": 301.01875, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.2572413206100464, "epoch": 0.004, "frac_reward_zero_std": 0.5375, "grad_norm": 0.04749641567468643, "kl": 0.23950345497578382, "learning_rate": 5.117472e-06, "loss": -0.00030293280724436045, "num_tokens": 674875.0, "reward": 0.9487500309944152, "reward_std": 0.20682873725891113, "rewards/env_goofspiel_reward/mean": 0.9487500309944152, "rewards/env_goofspiel_reward/std": 0.34452574253082274, "sampling/importance_sampling_ratio/max": 2.11174156665802, "sampling/importance_sampling_ratio/mean": 0.43660367727279664, "sampling/importance_sampling_ratio/min": 0.00013589896843768656, "sampling/sampling_logp_difference/max": 8.112329578399658, "sampling/sampling_logp_difference/mean": 0.5582964062690735, "step": 25, "step_time": 4.199152118200027 }, { "clip_ratio/high_max": 0.032361111417412755, "clip_ratio/high_mean": 0.009340277779847384, "clip_ratio/low_mean": 0.018115530349314214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027455807756632568, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 283.40625, "completions/mean_terminated_length": 283.40625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.273418403416872, "epoch": 0.0048, "frac_reward_zero_std": 0.5875, "grad_norm": 0.005710980389267206, "kl": 1.6874765895307065, "learning_rate": 6.183612e-06, "loss": -0.0006473449524492025, "num_tokens": 806865.0, "reward": 0.9825000166893005, "reward_std": 0.19091882407665253, "rewards/env_goofspiel_reward/mean": 0.9825000166893005, "rewards/env_goofspiel_reward/std": 0.3281997382640839, "sampling/importance_sampling_ratio/max": 2.13806095123291, "sampling/importance_sampling_ratio/mean": 0.48566290736198425, "sampling/importance_sampling_ratio/min": 1.583069079060806e-05, "sampling/sampling_logp_difference/max": 9.758009147644042, "sampling/sampling_logp_difference/mean": 0.5621577501296997, "step": 30, "step_time": 4.3287319488001685 }, { "clip_ratio/high_max": 0.02222222238779068, "clip_ratio/high_mean": 0.00555555559694767, "clip_ratio/low_mean": 0.01631944449618459, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 366.4, "completions/max_terminated_length": 366.4, "completions/mean_length": 290.65625, "completions/mean_terminated_length": 290.65625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2778049871325493, "epoch": 0.0056, "frac_reward_zero_std": 0.55, "grad_norm": 0.025574276223778725, "kl": 0.22191586308181285, "learning_rate": 7.249752e-06, "loss": -0.0005257311277091503, "num_tokens": 941915.0, "reward": 0.99000004529953, "reward_std": 0.20152543485164642, "rewards/env_goofspiel_reward/mean": 0.99000004529953, "rewards/env_goofspiel_reward/std": 0.32238503098487853, "sampling/importance_sampling_ratio/max": 1.8528586864471435, "sampling/importance_sampling_ratio/mean": 0.5378320515155792, "sampling/importance_sampling_ratio/min": 0.0016222307924181223, "sampling/sampling_logp_difference/max": 6.151937532424927, "sampling/sampling_logp_difference/mean": 0.41074748039245607, "step": 35, "step_time": 4.195248219399855 }, { "clip_ratio/high_max": 0.03625000007450581, "clip_ratio/high_mean": 0.009062500018626452, "clip_ratio/low_mean": 0.010277777817100287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019340277835726737, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 290.64375, "completions/mean_terminated_length": 290.64375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3076802439987659, "epoch": 0.0064, "frac_reward_zero_std": 0.4875, "grad_norm": 0.03911300376057625, "kl": 0.30147731937468053, "learning_rate": 7.4629793691100655e-06, "loss": -0.0009569000452756881, "num_tokens": 1076583.0, "reward": 0.9787500500679016, "reward_std": 0.23864853978157044, "rewards/env_goofspiel_reward/mean": 0.9787500500679016, "rewards/env_goofspiel_reward/std": 0.3355243980884552, "sampling/importance_sampling_ratio/max": 1.7326099634170533, "sampling/importance_sampling_ratio/mean": 0.5706644296646118, "sampling/importance_sampling_ratio/min": 0.008987322356551886, "sampling/sampling_logp_difference/max": 4.789818382263183, "sampling/sampling_logp_difference/mean": 0.32595881819725037, "step": 40, "step_time": 4.120446558600088 }, { "clip_ratio/high_max": 0.02430555559694767, "clip_ratio/high_mean": 0.006076388899236918, "clip_ratio/low_mean": 0.018705808185040952, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02478219708427787, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 298.1, "completions/mean_terminated_length": 298.1, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.30067678913474083, "epoch": 0.0072, "frac_reward_zero_std": 0.5625, "grad_norm": 0.019556289538741112, "kl": 0.4184106796979904, "learning_rate": 7.462976806120193e-06, "loss": -0.00040965699590742586, "num_tokens": 1213169.0, "reward": 0.9524999856948853, "reward_std": 0.22273863554000856, "rewards/env_goofspiel_reward/mean": 0.9524999856948853, "rewards/env_goofspiel_reward/std": 0.36751508712768555, "sampling/importance_sampling_ratio/max": 1.7973033905029296, "sampling/importance_sampling_ratio/mean": 0.6051283955574036, "sampling/importance_sampling_ratio/min": 0.0006137289259640965, "sampling/sampling_logp_difference/max": 5.231348609924316, "sampling/sampling_logp_difference/mean": 0.3217563569545746, "step": 45, "step_time": 4.313124376999985 }, { "clip_ratio/high_max": 0.02430555559694767, "clip_ratio/high_mean": 0.006076388899236918, "clip_ratio/low_mean": 0.018645833339542152, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024722222238779068, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 299.70625, "completions/mean_terminated_length": 299.70625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.26526937559247016, "epoch": 0.008, "frac_reward_zero_std": 0.6, "grad_norm": 0.09333564341068268, "kl": 0.6234624680131674, "learning_rate": 7.4629722716015665e-06, "loss": -0.0008450452238321305, "num_tokens": 1351131.0, "reward": 1.0012500405311584, "reward_std": 0.19622212946414946, "rewards/env_goofspiel_reward/mean": 1.0012500405311584, "rewards/env_goofspiel_reward/std": 0.33813255429267886, "sampling/importance_sampling_ratio/max": 1.8347083568572997, "sampling/importance_sampling_ratio/mean": 0.6785839080810547, "sampling/importance_sampling_ratio/min": 0.002685157069936395, "sampling/sampling_logp_difference/max": 5.393667411804199, "sampling/sampling_logp_difference/mean": 0.3046145349740982, "step": 50, "step_time": 4.5290676355998585 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.014444444514811038, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022256944421678783, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 283.7625, "completions/mean_terminated_length": 283.7625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.25032062605023386, "epoch": 0.0088, "frac_reward_zero_std": 0.5, "grad_norm": 0.0669218897819519, "kl": 0.47833866626024246, "learning_rate": 7.4629657655573805e-06, "loss": -0.0006249105092138052, "num_tokens": 1483536.0, "reward": 0.9561875462532043, "reward_std": 0.2599501311779022, "rewards/env_goofspiel_reward/mean": 0.9561875462532043, "rewards/env_goofspiel_reward/std": 0.3719723880290985, "sampling/importance_sampling_ratio/max": 1.9648807287216186, "sampling/importance_sampling_ratio/mean": 0.7620458364486694, "sampling/importance_sampling_ratio/min": 0.0028414088767021893, "sampling/sampling_logp_difference/max": 4.346728658676147, "sampling/sampling_logp_difference/mean": 0.2154034972190857, "step": 55, "step_time": 4.209427154400236 }, { "clip_ratio/high_max": 0.005000000074505806, "clip_ratio/high_mean": 0.0012500000186264515, "clip_ratio/low_mean": 0.015659722313284875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016909722238779068, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 294.2, "completions/mean_terminated_length": 294.2, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.25066495686769485, "epoch": 0.0096, "frac_reward_zero_std": 0.6, "grad_norm": 0.03984799236059189, "kl": 0.5187893055379391, "learning_rate": 7.462957287992218e-06, "loss": -0.001143309846520424, "num_tokens": 1618874.0, "reward": 0.993750023841858, "reward_std": 0.19622212946414946, "rewards/env_goofspiel_reward/mean": 0.993750023841858, "rewards/env_goofspiel_reward/std": 0.35103108882904055, "sampling/importance_sampling_ratio/max": 1.7096198081970215, "sampling/importance_sampling_ratio/mean": 0.6896162152290344, "sampling/importance_sampling_ratio/min": 0.0015241437591612338, "sampling/sampling_logp_difference/max": 4.673358488082886, "sampling/sampling_logp_difference/mean": 0.26624326705932616, "step": 60, "step_time": 4.191420737200042 }, { "clip_ratio/high_max": 0.025, "clip_ratio/high_mean": 0.00625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0140625, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 273.53125, "completions/mean_terminated_length": 273.53125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2336573876440525, "epoch": 0.0104, "frac_reward_zero_std": 0.7125, "grad_norm": 0.009051427245140076, "kl": 0.6452277667820454, "learning_rate": 7.462946838912051e-06, "loss": -0.0009178260341286659, "num_tokens": 1748056.0, "reward": 1.0800000190734864, "reward_std": 0.1484924226999283, "rewards/env_goofspiel_reward/mean": 1.0800000190734864, "rewards/env_goofspiel_reward/std": 0.28696190714836123, "sampling/importance_sampling_ratio/max": 1.254857563972473, "sampling/importance_sampling_ratio/mean": 0.7130017876625061, "sampling/importance_sampling_ratio/min": 0.003967047110199929, "sampling/sampling_logp_difference/max": 3.8618431091308594, "sampling/sampling_logp_difference/mean": 0.20099806785583496, "step": 65, "step_time": 4.494912574199771 }, { "clip_ratio/high_max": 0.00555555559694767, "clip_ratio/high_mean": 0.0013888888992369176, "clip_ratio/low_mean": 0.010104166716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011493055615574121, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 281.1125, "completions/mean_terminated_length": 281.1125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.19834314305335282, "epoch": 0.0112, "frac_reward_zero_std": 0.7125, "grad_norm": 0.018667038530111313, "kl": 0.4677882671356201, "learning_rate": 7.462934418324241e-06, "loss": -0.0008302273228764534, "num_tokens": 1879948.0, "reward": 1.0912500381469727, "reward_std": 0.14318912029266356, "rewards/env_goofspiel_reward/mean": 1.0912500381469727, "rewards/env_goofspiel_reward/std": 0.2670675128698349, "sampling/importance_sampling_ratio/max": 1.8250314712524414, "sampling/importance_sampling_ratio/mean": 0.8290145397186279, "sampling/importance_sampling_ratio/min": 0.05519633814692497, "sampling/sampling_logp_difference/max": 2.754664158821106, "sampling/sampling_logp_difference/mean": 0.12412183284759522, "step": 70, "step_time": 4.16713200400036 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.01158775258809328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315025258809328, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 292.4125, "completions/mean_terminated_length": 292.4125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.20496653467416764, "epoch": 0.012, "frac_reward_zero_std": 0.6625, "grad_norm": 0.03426237776875496, "kl": 0.4435719080269337, "learning_rate": 7.4629200262375374e-06, "loss": -0.000939619354903698, "num_tokens": 2015567.0, "reward": 1.0687500715255738, "reward_std": 0.1644023284316063, "rewards/env_goofspiel_reward/mean": 1.0687500715255738, "rewards/env_goofspiel_reward/std": 0.29003112614154813, "sampling/importance_sampling_ratio/max": 1.699583315849304, "sampling/importance_sampling_ratio/mean": 0.7789812088012695, "sampling/importance_sampling_ratio/min": 0.0047567693516612055, "sampling/sampling_logp_difference/max": 4.406649398803711, "sampling/sampling_logp_difference/mean": 0.18859796077013016, "step": 75, "step_time": 4.104378796799847 }, { "epoch": 0.012, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.0, "eval_completions/max_terminated_length": 373.0, "eval_completions/mean_length": 286.4583333333333, "eval_completions/mean_terminated_length": 286.4583333333333, "eval_completions/min_length": 235.33333333333334, "eval_completions/min_terminated_length": 235.33333333333334, "eval_entropy": 0.1776730790734291, "eval_frac_reward_zero_std": 0.9166666666666666, "eval_kl": 0.3452555288871129, "eval_loss": 6.334867521218257e-06, "eval_num_tokens": 2015567.0, "eval_reward": 1.1750000317891438, "eval_reward_std": 0.035355339447657265, "eval_rewards/env_goofspiel_reward/mean": 1.1750000317891438, "eval_rewards/env_goofspiel_reward/std": 0.07071068386236827, "eval_runtime": 2.2216, "eval_samples_per_second": 4.501, "eval_sampling/importance_sampling_ratio/max": 1.2363848288853962, "eval_sampling/importance_sampling_ratio/mean": 0.8729836543401083, "eval_sampling/importance_sampling_ratio/min": 0.3416567128151655, "eval_sampling/sampling_logp_difference/max": 1.6411640246709187, "eval_sampling/sampling_logp_difference/mean": 0.12927521020174026, "eval_steps_per_second": 0.9, "step": 75 } ], "logging_steps": 5, "max_steps": 18750, "num_input_tokens_seen": 2015567, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }