{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.024, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0375, "clip_ratio/high_mean": 0.01215277761220932, "clip_ratio/low_mean": 0.008333333488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020486111007630824, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.46410001516342164, "epoch": 0.0008, "frac_reward_zero_std": 0.4, "grad_norm": 0.03925486281514168, "kl": 0.0062119101290591065, "learning_rate": 8.529119999999999e-07, "loss": 2.4803768610581757e-06, "num_tokens": 136410.0, "reward": 0.6898125052452088, "reward_std": 0.297073221206665, "rewards/env_goofspiel_reward/mean": 0.6898125052452088, "rewards/env_goofspiel_reward/std": 0.44231168627738954, "sampling/importance_sampling_ratio/max": 2.004665732383728, "sampling/importance_sampling_ratio/mean": 0.3851233392953873, "sampling/importance_sampling_ratio/min": 2.0644982578232883e-05, "sampling/sampling_logp_difference/max": 7.081162071228027, "sampling/sampling_logp_difference/mean": 0.6444696307182312, "step": 5, "step_time": 4.716039226800239 }, { "clip_ratio/high_max": 0.07736111134290695, "clip_ratio/high_mean": 0.023428030125796796, "clip_ratio/low_mean": 0.01346275256946683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03689078260213137, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 291.68125, "completions/mean_terminated_length": 291.68125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.4338525801897049, "epoch": 0.0016, "frac_reward_zero_std": 0.35, "grad_norm": 0.10885529965162277, "kl": 0.012050760450074449, "learning_rate": 1.919052e-06, "loss": 0.00018108433578163386, "num_tokens": 270498.0, "reward": 0.7646875262260437, "reward_std": 0.32924659848213195, "rewards/env_goofspiel_reward/mean": 0.7646875262260437, "rewards/env_goofspiel_reward/std": 0.44543521404266356, "sampling/importance_sampling_ratio/max": 2.0047454595565797, "sampling/importance_sampling_ratio/mean": 0.39372612833976744, "sampling/importance_sampling_ratio/min": 1.5255385369528085e-05, "sampling/sampling_logp_difference/max": 7.504581451416016, "sampling/sampling_logp_difference/mean": 0.7591853618621827, "step": 10, "step_time": 4.234808461799912 }, { "clip_ratio/high_max": 0.06954545490443706, "clip_ratio/high_mean": 0.023011363483965395, "clip_ratio/low_mean": 0.022291666734963654, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04530302993953228, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.4219451487064362, "epoch": 0.0024, "frac_reward_zero_std": 0.425, "grad_norm": 0.06103011220693588, "kl": 0.04142252554884181, "learning_rate": 2.985192e-06, "loss": 0.0005043631885200739, "num_tokens": 403004.0, "reward": 0.7495625257492066, "reward_std": 0.286996965110302, "rewards/env_goofspiel_reward/mean": 0.7495625257492066, "rewards/env_goofspiel_reward/std": 0.43196319937705996, "sampling/importance_sampling_ratio/max": 2.247171688079834, "sampling/importance_sampling_ratio/mean": 0.4383248031139374, "sampling/importance_sampling_ratio/min": 2.4881603894755245e-05, "sampling/sampling_logp_difference/max": 7.91492919921875, "sampling/sampling_logp_difference/mean": 0.7775012850761414, "step": 15, "step_time": 4.656600760800211 }, { "clip_ratio/high_max": 0.05416666679084301, "clip_ratio/high_mean": 0.013541666697710753, "clip_ratio/low_mean": 0.014444444514811038, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027986111119389534, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 281.39375, "completions/mean_terminated_length": 281.39375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.38834676444530486, "epoch": 0.0032, "frac_reward_zero_std": 0.425, "grad_norm": 0.06636141240596771, "kl": 0.24065511422231795, "learning_rate": 4.051332e-06, "loss": -0.00033730233553797007, "num_tokens": 535785.0, "reward": 0.8736250162124634, "reward_std": 0.29185832142829893, "rewards/env_goofspiel_reward/mean": 0.8736250162124634, "rewards/env_goofspiel_reward/std": 0.4162306308746338, "sampling/importance_sampling_ratio/max": 2.5457701206207277, "sampling/importance_sampling_ratio/mean": 0.43321084380149844, "sampling/importance_sampling_ratio/min": 3.2646653380652424e-07, "sampling/sampling_logp_difference/max": 8.767316246032715, "sampling/sampling_logp_difference/mean": 0.715557587146759, "step": 20, "step_time": 4.1054079593999635 }, { "clip_ratio/high_max": 0.05347222238779068, "clip_ratio/high_mean": 0.01631944440305233, "clip_ratio/low_mean": 0.027108585741370917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043428029771894215, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 300.24375, "completions/mean_terminated_length": 300.24375, "completions/min_length": 212.6, "completions/min_terminated_length": 212.6, "entropy": 0.3262263901531696, "epoch": 0.004, "frac_reward_zero_std": 0.4125, "grad_norm": 0.04734628647565842, "kl": 0.5914028726518155, "learning_rate": 5.117472e-06, "loss": -0.0011067342013120652, "num_tokens": 674417.0, "reward": 0.7871875405311585, "reward_std": 0.31828643679618834, "rewards/env_goofspiel_reward/mean": 0.7871875405311585, "rewards/env_goofspiel_reward/std": 0.42372027039527893, "sampling/importance_sampling_ratio/max": 1.7681864023208618, "sampling/importance_sampling_ratio/mean": 0.33697319626808164, "sampling/importance_sampling_ratio/min": 5.1189550868002696e-05, "sampling/sampling_logp_difference/max": 8.77573595046997, "sampling/sampling_logp_difference/mean": 0.7286012172698975, "step": 25, "step_time": 4.131656074401144 }, { "clip_ratio/high_max": 0.03625000007450581, "clip_ratio/high_mean": 0.009062500018626452, "clip_ratio/low_mean": 0.02396990731358528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03303240723907948, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 283.6, "completions/mean_terminated_length": 283.6, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2808014929294586, "epoch": 0.0048, "frac_reward_zero_std": 0.425, "grad_norm": 0.10528500378131866, "kl": 1.774285883270204, "learning_rate": 6.183612e-06, "loss": -0.0006869177334010601, "num_tokens": 806438.0, "reward": 0.8661875247955322, "reward_std": 0.3341963529586792, "rewards/env_goofspiel_reward/mean": 0.8661875247955322, "rewards/env_goofspiel_reward/std": 0.4470097303390503, "sampling/importance_sampling_ratio/max": 1.7913244009017943, "sampling/importance_sampling_ratio/mean": 0.46525874733924866, "sampling/importance_sampling_ratio/min": 5.821734957862645e-05, "sampling/sampling_logp_difference/max": 7.922852897644043, "sampling/sampling_logp_difference/mean": 0.6293025732040405, "step": 30, "step_time": 4.5387513689995105 }, { "clip_ratio/high_max": 0.06666666679084302, "clip_ratio/high_mean": 0.02274305550381541, "clip_ratio/low_mean": 0.02211805563420057, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04486111104488373, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 289.15625, "completions/mean_terminated_length": 289.15625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.37011782824993134, "epoch": 0.0056, "frac_reward_zero_std": 0.375, "grad_norm": 0.1105622872710228, "kl": 0.8004692852497101, "learning_rate": 7.249752e-06, "loss": -0.0004383428022265434, "num_tokens": 941248.0, "reward": 0.8023125410079956, "reward_std": 0.350283020734787, "rewards/env_goofspiel_reward/mean": 0.8023125410079956, "rewards/env_goofspiel_reward/std": 0.4369929492473602, "sampling/importance_sampling_ratio/max": 1.9201649188995362, "sampling/importance_sampling_ratio/mean": 0.4281450629234314, "sampling/importance_sampling_ratio/min": 0.0001282373646972701, "sampling/sampling_logp_difference/max": 7.672937965393066, "sampling/sampling_logp_difference/mean": 0.6862899184226989, "step": 35, "step_time": 3.9215637991999754 }, { "clip_ratio/high_max": 0.052916666865348815, "clip_ratio/high_mean": 0.013229166716337204, "clip_ratio/low_mean": 0.01652777772396803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029756944440305234, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 290.475, "completions/mean_terminated_length": 290.475, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.45398128926754, "epoch": 0.0064, "frac_reward_zero_std": 0.2625, "grad_norm": 0.05981823429465294, "kl": 0.35742565132677556, "learning_rate": 7.4629793691100655e-06, "loss": -0.00020290752872824668, "num_tokens": 1075889.0, "reward": 0.7535625219345092, "reward_std": 0.4086193323135376, "rewards/env_goofspiel_reward/mean": 0.7535625219345092, "rewards/env_goofspiel_reward/std": 0.48414809107780454, "sampling/importance_sampling_ratio/max": 2.2908178567886353, "sampling/importance_sampling_ratio/mean": 0.4616763234138489, "sampling/importance_sampling_ratio/min": 0.00017085522413253784, "sampling/sampling_logp_difference/max": 6.5542881965637205, "sampling/sampling_logp_difference/mean": 0.6057502806186676, "step": 40, "step_time": 3.9750062072005674 }, { "clip_ratio/high_max": 0.03541666679084301, "clip_ratio/high_mean": 0.010416666697710752, "clip_ratio/low_mean": 0.021631944458931684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03204861097037792, "completions/clipped_ratio": 0.0, "completions/max_length": 375.4, "completions/max_terminated_length": 375.4, "completions/mean_length": 298.0375, "completions/mean_terminated_length": 298.0375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.4585177510976791, "epoch": 0.0072, "frac_reward_zero_std": 0.4125, "grad_norm": 0.023925846442580223, "kl": 0.2792994946241379, "learning_rate": 7.462976806120193e-06, "loss": -0.0006944169290363789, "num_tokens": 1212465.0, "reward": 0.8323750257492065, "reward_std": 0.30776822566986084, "rewards/env_goofspiel_reward/mean": 0.8323750257492065, "rewards/env_goofspiel_reward/std": 0.45529221296310424, "sampling/importance_sampling_ratio/max": 2.1405240535736083, "sampling/importance_sampling_ratio/mean": 0.47331286072731016, "sampling/importance_sampling_ratio/min": 0.00027849085163325074, "sampling/sampling_logp_difference/max": 6.536230754852295, "sampling/sampling_logp_difference/mean": 0.6047793924808502, "step": 45, "step_time": 4.178844353599925 }, { "clip_ratio/high_max": 0.03611111119389534, "clip_ratio/high_mean": 0.01152777774259448, "clip_ratio/low_mean": 0.014851641468703746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026379418931901454, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 299.5375, "completions/mean_terminated_length": 299.5375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.32481125816702844, "epoch": 0.008, "frac_reward_zero_std": 0.45, "grad_norm": 0.04731212928891182, "kl": 0.9624257631599903, "learning_rate": 7.4629722716015665e-06, "loss": -0.000700222747400403, "num_tokens": 1350400.0, "reward": 0.9035625338554383, "reward_std": 0.2705567300319672, "rewards/env_goofspiel_reward/mean": 0.9035625338554383, "rewards/env_goofspiel_reward/std": 0.3961623251438141, "sampling/importance_sampling_ratio/max": 2.3192288398742678, "sampling/importance_sampling_ratio/mean": 0.5576886355876922, "sampling/importance_sampling_ratio/min": 0.0003088584111537784, "sampling/sampling_logp_difference/max": 5.365625381469727, "sampling/sampling_logp_difference/mean": 0.5096056282520294, "step": 50, "step_time": 4.218161174600027 }, { "clip_ratio/high_max": 0.0125, "clip_ratio/high_mean": 0.003125, "clip_ratio/low_mean": 0.012881944794207812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016006944794207813, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 284.55625, "completions/mean_terminated_length": 284.55625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.25873188972473143, "epoch": 0.0088, "frac_reward_zero_std": 0.575, "grad_norm": 0.03882903605699539, "kl": 1.3922856956720353, "learning_rate": 7.4629657655573805e-06, "loss": -0.00034891823306679723, "num_tokens": 1482932.0, "reward": 0.9674375414848327, "reward_std": 0.22282702028751372, "rewards/env_goofspiel_reward/mean": 0.9674375414848327, "rewards/env_goofspiel_reward/std": 0.3725932538509369, "sampling/importance_sampling_ratio/max": 2.143031930923462, "sampling/importance_sampling_ratio/mean": 0.6658945798873901, "sampling/importance_sampling_ratio/min": 0.0010862916285987012, "sampling/sampling_logp_difference/max": 7.023790454864502, "sampling/sampling_logp_difference/mean": 0.4928452789783478, "step": 55, "step_time": 3.999122467400048 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.006613005138933659, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00817550513893366, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 294.025, "completions/mean_terminated_length": 294.025, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.25592469796538353, "epoch": 0.0096, "frac_reward_zero_std": 0.6875, "grad_norm": 0.011030570603907108, "kl": 0.5781544581055641, "learning_rate": 7.462957287992218e-06, "loss": -0.0010543103329837323, "num_tokens": 1618242.0, "reward": 1.0612500429153442, "reward_std": 0.15379572063684463, "rewards/env_goofspiel_reward/mean": 1.0612500429153442, "rewards/env_goofspiel_reward/std": 0.3027446687221527, "sampling/importance_sampling_ratio/max": 2.451231026649475, "sampling/importance_sampling_ratio/mean": 0.6636472702026367, "sampling/importance_sampling_ratio/min": 0.0002636277698911726, "sampling/sampling_logp_difference/max": 5.692580604553223, "sampling/sampling_logp_difference/mean": 0.41186076402664185, "step": 60, "step_time": 4.0995516278006106 }, { "clip_ratio/high_max": 0.02361111119389534, "clip_ratio/high_mean": 0.005902777798473835, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013715277798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 273.5375, "completions/mean_terminated_length": 273.5375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2160419549793005, "epoch": 0.0104, "frac_reward_zero_std": 0.6375, "grad_norm": 0.03689345344901085, "kl": 0.7355392906814814, "learning_rate": 7.462946838912051e-06, "loss": -0.0004133625887334347, "num_tokens": 1747425.0, "reward": 1.0386249899864197, "reward_std": 0.207005512714386, "rewards/env_goofspiel_reward/mean": 1.0386249899864197, "rewards/env_goofspiel_reward/std": 0.34287082552909853, "sampling/importance_sampling_ratio/max": 1.7854061126708984, "sampling/importance_sampling_ratio/mean": 0.7041799902915955, "sampling/importance_sampling_ratio/min": 0.002913042064756155, "sampling/sampling_logp_difference/max": 5.417265224456787, "sampling/sampling_logp_difference/mean": 0.3267530858516693, "step": 65, "step_time": 4.27056002979989 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.010138888843357563, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011701388843357563, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 281.0875, "completions/mean_terminated_length": 281.0875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.20821231752634048, "epoch": 0.0112, "frac_reward_zero_std": 0.7125, "grad_norm": 0.09448564797639847, "kl": 1.5728149417787791, "learning_rate": 7.462934418324241e-06, "loss": -0.0004269219934940338, "num_tokens": 1879313.0, "reward": 1.0425000429153441, "reward_std": 0.1484924241900444, "rewards/env_goofspiel_reward/mean": 1.0425000429153441, "rewards/env_goofspiel_reward/std": 0.3160957217216492, "sampling/importance_sampling_ratio/max": 2.062333583831787, "sampling/importance_sampling_ratio/mean": 0.6789644956588745, "sampling/importance_sampling_ratio/min": 0.000793453273945488, "sampling/sampling_logp_difference/max": 5.028042125701904, "sampling/sampling_logp_difference/mean": 0.3625839054584503, "step": 70, "step_time": 4.002103836400238 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.008715277817100287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010277777817100287, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 292.48125, "completions/mean_terminated_length": 292.48125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.23503879755735396, "epoch": 0.012, "frac_reward_zero_std": 0.675, "grad_norm": 0.008777249604463577, "kl": 1.0266067795455456, "learning_rate": 7.4629200262375374e-06, "loss": -0.000634206272661686, "num_tokens": 2014943.0, "reward": 1.038687562942505, "reward_std": 0.16449071615934371, "rewards/env_goofspiel_reward/mean": 1.038687562942505, "rewards/env_goofspiel_reward/std": 0.3346868008375168, "sampling/importance_sampling_ratio/max": 1.5970592021942138, "sampling/importance_sampling_ratio/mean": 0.6036460041999817, "sampling/importance_sampling_ratio/min": 0.00030632600537501277, "sampling/sampling_logp_difference/max": 5.986368083953858, "sampling/sampling_logp_difference/mean": 0.4311356723308563, "step": 75, "step_time": 4.108657225800198 }, { "epoch": 0.012, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.0, "eval_completions/max_terminated_length": 373.0, "eval_completions/mean_length": 314.2916666666667, "eval_completions/mean_terminated_length": 314.2916666666667, "eval_completions/min_length": 263.3333333333333, "eval_completions/min_terminated_length": 263.3333333333333, "eval_entropy": 0.2657604416211446, "eval_frac_reward_zero_std": 0.4166666666666667, "eval_kl": 0.5168450077374777, "eval_loss": -0.00045055957161821425, "eval_num_tokens": 2014943.0, "eval_reward": 0.99958336353302, "eval_reward_std": 0.21272129813830057, "eval_rewards/env_goofspiel_reward/mean": 0.99958336353302, "eval_rewards/env_goofspiel_reward/std": 0.2815760125716527, "eval_runtime": 2.0876, "eval_samples_per_second": 4.79, "eval_sampling/importance_sampling_ratio/max": 1.69784019390742, "eval_sampling/importance_sampling_ratio/mean": 0.624309907356898, "eval_sampling/importance_sampling_ratio/min": 0.014207058896621069, "eval_sampling/sampling_logp_difference/max": 4.317745526631673, "eval_sampling/sampling_logp_difference/mean": 0.3868154088656108, "eval_steps_per_second": 0.958, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012099116202443838, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012099116202443838, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.270469605922699, "epoch": 0.0128, "frac_reward_zero_std": 0.6625, "grad_norm": 0.05691204220056534, "kl": 0.7552504394203424, "learning_rate": 7.462903662662079e-06, "loss": -0.0006706514395773411, "num_tokens": 2148759.0, "reward": 1.0424375176429748, "reward_std": 0.18040061742067337, "rewards/env_goofspiel_reward/mean": 1.0424375176429748, "rewards/env_goofspiel_reward/std": 0.3348836898803711, "sampling/importance_sampling_ratio/max": 1.9503651142120362, "sampling/importance_sampling_ratio/mean": 0.6592345595359802, "sampling/importance_sampling_ratio/min": 0.0007974941050633788, "sampling/sampling_logp_difference/max": 5.421667098999023, "sampling/sampling_logp_difference/mean": 0.37761002480983735, "step": 80, "step_time": 4.0779971672005555 }, { "clip_ratio/high_max": 0.01704545468091965, "clip_ratio/high_mean": 0.004261363670229912, "clip_ratio/low_mean": 0.01110164150595665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015363005176186561, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 289.10625, "completions/mean_terminated_length": 289.10625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3144980549812317, "epoch": 0.0136, "frac_reward_zero_std": 0.55, "grad_norm": 0.02907728962600231, "kl": 0.603056262433529, "learning_rate": 7.462885327609394e-06, "loss": -0.00035388271789997815, "num_tokens": 2282680.0, "reward": 0.9262500286102295, "reward_std": 0.25986174046993255, "rewards/env_goofspiel_reward/mean": 0.9262500286102295, "rewards/env_goofspiel_reward/std": 0.40943323373794555, "sampling/importance_sampling_ratio/max": 2.144289803504944, "sampling/importance_sampling_ratio/mean": 0.5303596138954163, "sampling/importance_sampling_ratio/min": 0.0002621762232593028, "sampling/sampling_logp_difference/max": 6.042155361175537, "sampling/sampling_logp_difference/mean": 0.5484204053878784, "step": 85, "step_time": 4.207774694199543 }, { "clip_ratio/high_max": 0.00555555559694767, "clip_ratio/high_mean": 0.0013888888992369176, "clip_ratio/low_mean": 0.011493055615574121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012881944514811039, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 298.5875, "completions/mean_terminated_length": 298.5875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.25913119316101074, "epoch": 0.0144, "frac_reward_zero_std": 0.475, "grad_norm": 0.021151067689061165, "kl": 0.6603276126086712, "learning_rate": 7.462865021092397e-06, "loss": -0.0006772585213184357, "num_tokens": 2420580.0, "reward": 0.9559375405311584, "reward_std": 0.28134011626243594, "rewards/env_goofspiel_reward/mean": 0.9559375405311584, "rewards/env_goofspiel_reward/std": 0.3986021220684052, "sampling/importance_sampling_ratio/max": 2.0312726736068725, "sampling/importance_sampling_ratio/mean": 0.5839850544929505, "sampling/importance_sampling_ratio/min": 0.0004560710280202329, "sampling/sampling_logp_difference/max": 6.067973613739014, "sampling/sampling_logp_difference/mean": 0.4282756567001343, "step": 90, "step_time": 4.005906563800636 }, { "clip_ratio/high_max": 0.01111111119389534, "clip_ratio/high_mean": 0.002777777798473835, "clip_ratio/low_mean": 0.013939394056797028, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016717171855270864, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 283.1375, "completions/mean_terminated_length": 283.1375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2632964253425598, "epoch": 0.0152, "frac_reward_zero_std": 0.6125, "grad_norm": 0.0911855697631836, "kl": 1.6807550355792045, "learning_rate": 7.462842743125395e-06, "loss": -0.0009113414213061333, "num_tokens": 2552657.0, "reward": 1.0198750019073486, "reward_std": 0.2229154199361801, "rewards/env_goofspiel_reward/mean": 1.0198750019073486, "rewards/env_goofspiel_reward/std": 0.3695839524269104, "sampling/importance_sampling_ratio/max": 1.79537193775177, "sampling/importance_sampling_ratio/mean": 0.5864414393901825, "sampling/importance_sampling_ratio/min": 0.0028222970955539494, "sampling/sampling_logp_difference/max": 6.276762676239014, "sampling/sampling_logp_difference/mean": 0.4131439089775085, "step": 95, "step_time": 4.100510867999764 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.0072916666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854166697710752, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 285.8375, "completions/mean_terminated_length": 285.8375, "completions/min_length": 218.6, "completions/min_terminated_length": 218.6, "entropy": 0.289502215385437, "epoch": 0.016, "frac_reward_zero_std": 0.6, "grad_norm": 0.004398128483444452, "kl": 0.6524220421910286, "learning_rate": 7.4628184937240836e-06, "loss": -0.000950614083558321, "num_tokens": 2685325.0, "reward": 0.9823125600814819, "reward_std": 0.22282702326774598, "rewards/env_goofspiel_reward/mean": 0.9823125600814819, "rewards/env_goofspiel_reward/std": 0.37007221579551697, "sampling/importance_sampling_ratio/max": 1.9957563161849976, "sampling/importance_sampling_ratio/mean": 0.5997037708759307, "sampling/importance_sampling_ratio/min": 0.0030981259667896667, "sampling/sampling_logp_difference/max": 5.579097270965576, "sampling/sampling_logp_difference/mean": 0.4109388738870621, "step": 100, "step_time": 4.055971824800144 }, { "clip_ratio/high_max": 0.022361111268401145, "clip_ratio/high_mean": 0.006979166716337204, "clip_ratio/low_mean": 0.005763888917863369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012743055541068315, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 288.5625, "completions/mean_terminated_length": 288.5625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3037287026643753, "epoch": 0.0168, "frac_reward_zero_std": 0.6375, "grad_norm": 0.017863936722278595, "kl": 0.7445758618414402, "learning_rate": 7.4627922729055425e-06, "loss": -0.0005706328898668289, "num_tokens": 2819431.0, "reward": 1.046250009536743, "reward_std": 0.18561552762985228, "rewards/env_goofspiel_reward/mean": 1.046250009536743, "rewards/env_goofspiel_reward/std": 0.31610003411769866, "sampling/importance_sampling_ratio/max": 1.9027514457702637, "sampling/importance_sampling_ratio/mean": 0.6652593612670898, "sampling/importance_sampling_ratio/min": 0.0008113190764561295, "sampling/sampling_logp_difference/max": 6.060176849365234, "sampling/sampling_logp_difference/mean": 0.3919778883457184, "step": 105, "step_time": 4.1537415953993335 }, { "clip_ratio/high_max": 0.02986111119389534, "clip_ratio/high_mean": 0.007465277798473835, "clip_ratio/low_mean": 0.010277777817100287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01774305561557412, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 287.5875, "completions/mean_terminated_length": 287.5875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.31742958873510363, "epoch": 0.0176, "frac_reward_zero_std": 0.5375, "grad_norm": 0.027483796700835228, "kl": 0.7258106715977192, "learning_rate": 7.462764080688243e-06, "loss": -0.0009170899167656899, "num_tokens": 2953131.0, "reward": 0.982312548160553, "reward_std": 0.24404022991657257, "rewards/env_goofspiel_reward/mean": 0.982312548160553, "rewards/env_goofspiel_reward/std": 0.36874093413352965, "sampling/importance_sampling_ratio/max": 1.8940089225769043, "sampling/importance_sampling_ratio/mean": 0.6627432525157928, "sampling/importance_sampling_ratio/min": 0.0007015861105173826, "sampling/sampling_logp_difference/max": 6.205275201797486, "sampling/sampling_logp_difference/mean": 0.37100034952163696, "step": 110, "step_time": 4.001762911599871 }, { "clip_ratio/high_max": 0.023055555671453475, "clip_ratio/high_mean": 0.005763888917863369, "clip_ratio/low_mean": 0.0029513888992369177, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008715277723968028, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 293.1875, "completions/mean_terminated_length": 293.1875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.30907001420855523, "epoch": 0.0184, "frac_reward_zero_std": 0.5625, "grad_norm": 0.015677573159337044, "kl": 0.9665988653898239, "learning_rate": 7.4627339170920494e-06, "loss": -0.0009083808399736881, "num_tokens": 3088531.0, "reward": 0.9374375104904175, "reward_std": 0.2228270262479782, "rewards/env_goofspiel_reward/mean": 0.9374375104904175, "rewards/env_goofspiel_reward/std": 0.4014770984649658, "sampling/importance_sampling_ratio/max": 1.731867289543152, "sampling/importance_sampling_ratio/mean": 0.6140377283096313, "sampling/importance_sampling_ratio/min": 0.00030801825923845174, "sampling/sampling_logp_difference/max": 5.473852968215942, "sampling/sampling_logp_difference/mean": 0.3702615320682526, "step": 115, "step_time": 4.100767185799486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004340277798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004340277798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 285.58125, "completions/mean_terminated_length": 285.58125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2561140716075897, "epoch": 0.0192, "frac_reward_zero_std": 0.725, "grad_norm": 0.0040421271696686745, "kl": 0.5731004536151886, "learning_rate": 7.462701782138208e-06, "loss": -0.0007791116368025541, "num_tokens": 3221502.0, "reward": 1.0274375200271606, "reward_std": 0.1485808130353689, "rewards/env_goofspiel_reward/mean": 1.0274375200271606, "rewards/env_goofspiel_reward/std": 0.3382692337036133, "sampling/importance_sampling_ratio/max": 1.8715962648391724, "sampling/importance_sampling_ratio/mean": 0.7390327334403992, "sampling/importance_sampling_ratio/min": 0.0010936856037005783, "sampling/sampling_logp_difference/max": 5.8203360080719, "sampling/sampling_logp_difference/mean": 0.29723324775695803, "step": 120, "step_time": 4.116316759600522 }, { "clip_ratio/high_max": 0.020656565949320794, "clip_ratio/high_mean": 0.0051641414873301985, "clip_ratio/low_mean": 0.009687500074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01485164137557149, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 292.74375, "completions/mean_terminated_length": 292.74375, "completions/min_length": 199.6, "completions/min_terminated_length": 199.6, "entropy": 0.2419313907623291, "epoch": 0.02, "frac_reward_zero_std": 0.725, "grad_norm": 0.007091797888278961, "kl": 0.6101688414812088, "learning_rate": 7.462667675849357e-06, "loss": -0.0007256286218762398, "num_tokens": 3355893.0, "reward": 1.0649374961853026, "reward_std": 0.13797421008348465, "rewards/env_goofspiel_reward/mean": 1.0649374961853026, "rewards/env_goofspiel_reward/std": 0.29806646406650544, "sampling/importance_sampling_ratio/max": 1.4559056520462037, "sampling/importance_sampling_ratio/mean": 0.7336877107620239, "sampling/importance_sampling_ratio/min": 0.001045782444998622, "sampling/sampling_logp_difference/max": 5.669601249694824, "sampling/sampling_logp_difference/mean": 0.32060971260070803, "step": 125, "step_time": 4.16715560520006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007297979947179556, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007297979947179556, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 293.225, "completions/mean_terminated_length": 293.225, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.21281768046319485, "epoch": 0.0208, "frac_reward_zero_std": 0.725, "grad_norm": 0.012143277563154697, "kl": 0.9402982890605927, "learning_rate": 7.462631598249523e-06, "loss": -0.0005932614207267761, "num_tokens": 3491703.0, "reward": 1.0874375343322753, "reward_std": 0.15918741673231124, "rewards/env_goofspiel_reward/mean": 1.0874375343322753, "rewards/env_goofspiel_reward/std": 0.2946491539478302, "sampling/importance_sampling_ratio/max": 1.9441105842590332, "sampling/importance_sampling_ratio/mean": 0.7611136674880982, "sampling/importance_sampling_ratio/min": 0.0005575922084972262, "sampling/sampling_logp_difference/max": 6.110091686248779, "sampling/sampling_logp_difference/mean": 0.27749234437942505, "step": 130, "step_time": 4.052483657999801 }, { "clip_ratio/high_max": 0.00625, "clip_ratio/high_mean": 0.0015625, "clip_ratio/low_mean": 0.01180555559694767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01336805559694767, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 282.3375, "completions/mean_terminated_length": 282.3375, "completions/min_length": 194.6, "completions/min_terminated_length": 194.6, "entropy": 0.21125009432435035, "epoch": 0.0216, "frac_reward_zero_std": 0.65, "grad_norm": 0.017155468463897705, "kl": 0.7436703704297543, "learning_rate": 7.462593549364123e-06, "loss": -0.000642262538895011, "num_tokens": 3624164.0, "reward": 1.0160000562667846, "reward_std": 0.1965756893157959, "rewards/env_goofspiel_reward/mean": 1.0160000562667846, "rewards/env_goofspiel_reward/std": 0.35902883410453795, "sampling/importance_sampling_ratio/max": 1.8083943367004394, "sampling/importance_sampling_ratio/mean": 0.6931412100791932, "sampling/importance_sampling_ratio/min": 0.001236545197753003, "sampling/sampling_logp_difference/max": 6.242915344238281, "sampling/sampling_logp_difference/mean": 0.30908069014549255, "step": 135, "step_time": 4.004675274600231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010798611212521791, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010798611212521791, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 300.39375, "completions/mean_terminated_length": 300.39375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.19219726845622062, "epoch": 0.0224, "frac_reward_zero_std": 0.6, "grad_norm": 0.010142410174012184, "kl": 1.7957286298274995, "learning_rate": 7.462553529219961e-06, "loss": -0.0008514182642102242, "num_tokens": 3762831.0, "reward": 1.0236875414848328, "reward_std": 0.2069171190261841, "rewards/env_goofspiel_reward/mean": 1.0236875414848328, "rewards/env_goofspiel_reward/std": 0.3530817925930023, "sampling/importance_sampling_ratio/max": 1.7995745182037353, "sampling/importance_sampling_ratio/mean": 0.7829151630401612, "sampling/importance_sampling_ratio/min": 0.005923272194922902, "sampling/sampling_logp_difference/max": 5.419459342956543, "sampling/sampling_logp_difference/mean": 0.23471600711345672, "step": 140, "step_time": 4.171908260800047 }, { "clip_ratio/high_max": 0.01041666679084301, "clip_ratio/high_mean": 0.0026041666977107527, "clip_ratio/low_mean": 0.005590277723968029, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00819444442167878, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 289.4625, "completions/mean_terminated_length": 289.4625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.22108654975891112, "epoch": 0.0232, "frac_reward_zero_std": 0.7375, "grad_norm": 0.04653310030698776, "kl": 0.5376328125596046, "learning_rate": 7.462511537845228e-06, "loss": -0.00045080664567649366, "num_tokens": 3896833.0, "reward": 1.0724375009536744, "reward_std": 0.13797421008348465, "rewards/env_goofspiel_reward/mean": 1.0724375009536744, "rewards/env_goofspiel_reward/std": 0.30171733498573305, "sampling/importance_sampling_ratio/max": 1.8748072385787964, "sampling/importance_sampling_ratio/mean": 0.7784299254417419, "sampling/importance_sampling_ratio/min": 0.0016431780066341161, "sampling/sampling_logp_difference/max": 6.234890079498291, "sampling/sampling_logp_difference/mean": 0.2888496518135071, "step": 145, "step_time": 4.1071294096005655 }, { "clip_ratio/high_max": 0.00555555559694767, "clip_ratio/high_mean": 0.0013888888992369176, "clip_ratio/low_mean": 0.0015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029513888992369177, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 283.29375, "completions/mean_terminated_length": 283.29375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2212754048407078, "epoch": 0.024, "frac_reward_zero_std": 0.7125, "grad_norm": 0.026271872222423553, "kl": 0.6011221908032894, "learning_rate": 7.4624675752695055e-06, "loss": -0.0005275519099086523, "num_tokens": 4029909.0, "reward": 1.0724999904632568, "reward_std": 0.14849241971969604, "rewards/env_goofspiel_reward/mean": 1.0724999904632568, "rewards/env_goofspiel_reward/std": 0.29245385825634, "sampling/importance_sampling_ratio/max": 1.610938024520874, "sampling/importance_sampling_ratio/mean": 0.7898086547851563, "sampling/importance_sampling_ratio/min": 0.0010933216894045473, "sampling/sampling_logp_difference/max": 4.982583332061767, "sampling/sampling_logp_difference/mean": 0.22510133385658265, "step": 150, "step_time": 4.046049839799889 }, { "epoch": 0.024, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 373.3333333333333, "eval_completions/max_terminated_length": 373.3333333333333, "eval_completions/mean_length": 314.2083333333333, "eval_completions/mean_terminated_length": 314.2083333333333, "eval_completions/min_length": 263.3333333333333, "eval_completions/min_terminated_length": 263.3333333333333, "eval_entropy": 0.2316575994094213, "eval_frac_reward_zero_std": 0.9166666666666666, "eval_kl": 0.5032360255718231, "eval_loss": -0.00011960561823798344, "eval_num_tokens": 4029909.0, "eval_reward": 1.1500000556310017, "eval_reward_std": 0.07071067889531453, "eval_rewards/env_goofspiel_reward/mean": 1.1500000556310017, "eval_rewards/env_goofspiel_reward/std": 0.14142136772473654, "eval_runtime": 2.0552, "eval_samples_per_second": 4.866, "eval_sampling/importance_sampling_ratio/max": 1.122262716293335, "eval_sampling/importance_sampling_ratio/mean": 0.7364152868588766, "eval_sampling/importance_sampling_ratio/min": 0.09143149045606454, "eval_sampling/sampling_logp_difference/max": 3.0512802600860596, "eval_sampling/sampling_logp_difference/mean": 0.20749726643164954, "eval_steps_per_second": 0.973, "step": 150 } ], "logging_steps": 5, "max_steps": 18750, "num_input_tokens_seen": 4029909, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }