{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009000360014400577, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 292.2166809082031, "completions/mean_terminated_length": 292.2166809082031, "completions/min_length": 174.8, "completions/min_terminated_length": 174.8, "entropy": 0.7345801413059234, "epoch": 0.0006000240009600384, "frac_reward_zero_std": 0.650000023841858, "grad_norm": 0.578125, "kl": 0.014322867337614297, "learning_rate": 1.137216e-06, "loss": 0.00023176579270511866, "num_tokens": 101638.0, "reward": 0.023166669206693767, "reward_std": 0.03771236310712993, "rewards/env_goofspiel_reward/mean": 0.023166668484918773, "rewards/env_goofspiel_reward/std": 0.10488454704172909, "sampling/importance_sampling_ratio/max": 2.230660581588745, "sampling/importance_sampling_ratio/mean": 1.0512551069259644, "sampling/importance_sampling_ratio/min": 0.45719883143901824, "sampling/sampling_logp_difference/max": 0.9388755321502685, "sampling/sampling_logp_difference/mean": 0.08014384806156158, "step": 5, "step_time": 2.8670244561999425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 284.3750061035156, "completions/mean_terminated_length": 284.3750061035156, "completions/min_length": 197.4, "completions/min_terminated_length": 197.4, "entropy": 0.7396668076515198, "epoch": 0.0012000480019200767, "frac_reward_zero_std": 0.5333333492279053, "grad_norm": 0.6484375, "kl": 0.008018274139612914, "learning_rate": 2.5587359999999995e-06, "loss": 0.0010880917310714723, "num_tokens": 201985.0, "reward": 0.1135000076610595, "reward_std": 0.1355288046877831, "rewards/env_goofspiel_reward/mean": 0.11350000470411033, "rewards/env_goofspiel_reward/std": 0.24339603506959975, "sampling/importance_sampling_ratio/max": 1.9386430501937866, "sampling/importance_sampling_ratio/mean": 1.0104641795158387, "sampling/importance_sampling_ratio/min": 0.48966296911239626, "sampling/sampling_logp_difference/max": 0.7527793884277344, "sampling/sampling_logp_difference/mean": 0.06743223667144775, "step": 10, "step_time": 2.562607514999763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 292.29168090820315, "completions/mean_terminated_length": 292.29168090820315, "completions/min_length": 205.4, "completions/min_terminated_length": 205.4, "entropy": 0.6868447959423065, "epoch": 0.0018000720028801152, "frac_reward_zero_std": 0.7666666865348816, "grad_norm": 0.248046875, "kl": 0.015538515662774444, "learning_rate": 3.9802559999999995e-06, "loss": -0.0002336445264518261, "num_tokens": 303718.0, "reward": 0.02416666953358799, "reward_std": 0.03653385282959789, "rewards/env_goofspiel_reward/mean": 0.024166667682584374, "rewards/env_goofspiel_reward/std": 0.10811053770594299, "sampling/importance_sampling_ratio/max": 1.5562421321868896, "sampling/importance_sampling_ratio/mean": 0.9962499618530274, "sampling/importance_sampling_ratio/min": 0.4864930033683777, "sampling/sampling_logp_difference/max": 0.7846987843513489, "sampling/sampling_logp_difference/mean": 0.05959557741880417, "step": 15, "step_time": 2.4648701715999777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 286.4166748046875, "completions/mean_terminated_length": 286.4166748046875, "completions/min_length": 199.6, "completions/min_terminated_length": 199.6, "entropy": 0.6791316926479339, "epoch": 0.0024000960038401535, "frac_reward_zero_std": 0.7833333492279053, "grad_norm": 0.419921875, "kl": 0.03971561994403601, "learning_rate": 5.401775999999999e-06, "loss": 0.00017259303713217377, "num_tokens": 403266.0, "reward": 0.05466667115688324, "reward_std": 0.07825315818190574, "rewards/env_goofspiel_reward/mean": 0.054666668176651, "rewards/env_goofspiel_reward/std": 0.1692157879471779, "sampling/importance_sampling_ratio/max": 1.7059913635253907, "sampling/importance_sampling_ratio/mean": 1.0254743337631225, "sampling/importance_sampling_ratio/min": 0.5575755715370179, "sampling/sampling_logp_difference/max": 0.5765037894248962, "sampling/sampling_logp_difference/mean": 0.06591257303953171, "step": 20, "step_time": 2.4083536974001616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 279.07501831054685, "completions/mean_terminated_length": 279.07501831054685, "completions/min_length": 206.8, "completions/min_terminated_length": 206.8, "entropy": 0.5845985025167465, "epoch": 0.003000120004800192, "frac_reward_zero_std": 0.8666666865348815, "grad_norm": 0.2080078125, "kl": 0.07888290733098983, "learning_rate": 6.8232959999999994e-06, "loss": 0.00014582456788048148, "num_tokens": 501949.0, "reward": 0.029750002920627593, "reward_std": 0.042544259876012805, "rewards/env_goofspiel_reward/mean": 0.029750000685453415, "rewards/env_goofspiel_reward/std": 0.11651719957590104, "sampling/importance_sampling_ratio/max": 1.5395583629608154, "sampling/importance_sampling_ratio/mean": 0.9871565222740173, "sampling/importance_sampling_ratio/min": 0.6314660668373108, "sampling/sampling_logp_difference/max": 0.43876824378967283, "sampling/sampling_logp_difference/mean": 0.04884573593735695, "step": 25, "step_time": 2.4319384599999467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 294.0166748046875, "completions/mean_terminated_length": 294.0166748046875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.5196643978357315, "epoch": 0.0036001440057602304, "frac_reward_zero_std": 0.850000011920929, "grad_norm": 0.62109375, "kl": 0.1175543449819088, "learning_rate": 8.244816e-06, "loss": 0.0001264215330593288, "num_tokens": 604591.0, "reward": 0.044750004261732104, "reward_std": 0.04985102787613869, "rewards/env_goofspiel_reward/mean": 0.044750002399086955, "rewards/env_goofspiel_reward/std": 0.125959412753582, "sampling/importance_sampling_ratio/max": 1.7827884435653687, "sampling/importance_sampling_ratio/mean": 0.9959828734397889, "sampling/importance_sampling_ratio/min": 0.6090725898742676, "sampling/sampling_logp_difference/max": 0.5527279376983643, "sampling/sampling_logp_difference/mean": 0.05205402001738548, "step": 30, "step_time": 2.3975842315998306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 291.40834350585936, "completions/mean_terminated_length": 291.40834350585936, "completions/min_length": 219.2, "completions/min_terminated_length": 219.2, "entropy": 0.4317367374897003, "epoch": 0.004200168006720269, "frac_reward_zero_std": 0.7500000119209289, "grad_norm": 0.00531005859375, "kl": 0.12076274678111076, "learning_rate": 9.666336e-06, "loss": 0.0003318458097055554, "num_tokens": 706062.0, "reward": 0.09500000476837159, "reward_std": 0.12020815908908844, "rewards/env_goofspiel_reward/mean": 0.09500000327825546, "rewards/env_goofspiel_reward/std": 0.21089642345905305, "sampling/importance_sampling_ratio/max": 1.4750993490219115, "sampling/importance_sampling_ratio/mean": 0.9911892533302307, "sampling/importance_sampling_ratio/min": 0.5305798888206482, "sampling/sampling_logp_difference/max": 0.7321011543273925, "sampling/sampling_logp_difference/mean": 0.043305123969912526, "step": 35, "step_time": 2.43143495660006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 288.1250061035156, "completions/mean_terminated_length": 288.1250061035156, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.351773801445961, "epoch": 0.004800192007680307, "frac_reward_zero_std": 0.8666666865348815, "grad_norm": 0.33203125, "kl": 0.05839128475636244, "learning_rate": 9.950639527236806e-06, "loss": 4.926343681290746e-05, "num_tokens": 806862.0, "reward": 0.040000003576278684, "reward_std": 0.05656854510307312, "rewards/env_goofspiel_reward/mean": 0.04000000059604645, "rewards/env_goofspiel_reward/std": 0.11344237923622132, "sampling/importance_sampling_ratio/max": 1.5833612918853759, "sampling/importance_sampling_ratio/mean": 1.0104422450065613, "sampling/importance_sampling_ratio/min": 0.6002241253852845, "sampling/sampling_logp_difference/max": 0.5244450092315673, "sampling/sampling_logp_difference/mean": 0.03759892582893372, "step": 40, "step_time": 2.415808950400242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 291.74168090820314, "completions/mean_terminated_length": 291.74168090820314, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3240418329834938, "epoch": 0.005400216008640346, "frac_reward_zero_std": 0.9166666865348816, "grad_norm": 0.0439453125, "kl": 0.13300706073641777, "learning_rate": 9.950637606636539e-06, "loss": 0.0001355916727334261, "num_tokens": 907008.0, "reward": 0.034833335876464845, "reward_std": 0.03535534143447876, "rewards/env_goofspiel_reward/mean": 0.03483333364129067, "rewards/env_goofspiel_reward/std": 0.14086373001337052, "sampling/importance_sampling_ratio/max": 1.4610427141189575, "sampling/importance_sampling_ratio/mean": 0.9817873358726501, "sampling/importance_sampling_ratio/min": 0.6326651930809021, "sampling/sampling_logp_difference/max": 0.46174774169921873, "sampling/sampling_logp_difference/mean": 0.040712539479136466, "step": 45, "step_time": 2.417739940000138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 279.3333435058594, "completions/mean_terminated_length": 279.3333435058594, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.3186733976006508, "epoch": 0.006000240009600384, "frac_reward_zero_std": 0.8833333611488342, "grad_norm": 0.353515625, "kl": 0.06966875828802585, "learning_rate": 9.950634208652256e-06, "loss": 0.00012671776348724962, "num_tokens": 1005578.0, "reward": 0.034916669048834593, "reward_std": 0.049615327350329606, "rewards/env_goofspiel_reward/mean": 0.03491666756453924, "rewards/env_goofspiel_reward/std": 0.13708889302797617, "sampling/importance_sampling_ratio/max": 1.5069894075393677, "sampling/importance_sampling_ratio/mean": 1.0077889800071715, "sampling/importance_sampling_ratio/min": 0.7747546076774597, "sampling/sampling_logp_difference/max": 0.3697906732559204, "sampling/sampling_logp_difference/mean": 0.03059108220040798, "step": 50, "step_time": 2.3773288868003872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 305.49168090820314, "completions/mean_terminated_length": 305.49168090820314, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.29847966730594633, "epoch": 0.006600264010560422, "frac_reward_zero_std": 0.900000023841858, "grad_norm": 0.30078125, "kl": 0.08750866688787937, "learning_rate": 9.950629333285305e-06, "loss": -2.145505277439952e-06, "num_tokens": 1110455.0, "reward": 0.035000003129243853, "reward_std": 0.04949747696518898, "rewards/env_goofspiel_reward/mean": 0.03500000052154064, "rewards/env_goofspiel_reward/std": 0.1412438616156578, "sampling/importance_sampling_ratio/max": 1.3705053567886352, "sampling/importance_sampling_ratio/mean": 1.0030481100082398, "sampling/importance_sampling_ratio/min": 0.6937733888626099, "sampling/sampling_logp_difference/max": 0.4317422866821289, "sampling/sampling_logp_difference/mean": 0.03450411073863506, "step": 55, "step_time": 2.4346962132000045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 294.62501220703126, "completions/mean_terminated_length": 294.62501220703126, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2199392184615135, "epoch": 0.007200288011520461, "frac_reward_zero_std": 0.7666666746139527, "grad_norm": 0.1328125, "kl": 0.20852462351322174, "learning_rate": 9.950622980537618e-06, "loss": -1.4243402983993291e-05, "num_tokens": 1211770.0, "reward": 0.08483333513140678, "reward_std": 0.09215958416461945, "rewards/env_goofspiel_reward/mean": 0.08483333475887775, "rewards/env_goofspiel_reward/std": 0.2297523573040962, "sampling/importance_sampling_ratio/max": 1.3913918256759643, "sampling/importance_sampling_ratio/mean": 0.9923399925231934, "sampling/importance_sampling_ratio/min": 0.7218815922737122, "sampling/sampling_logp_difference/max": 0.3741787910461426, "sampling/sampling_logp_difference/mean": 0.023421294614672662, "step": 60, "step_time": 2.4339242404003016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 299.108349609375, "completions/mean_terminated_length": 299.108349609375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.14547686353325845, "epoch": 0.0078003120124804995, "frac_reward_zero_std": 0.8666666865348815, "grad_norm": 0.01318359375, "kl": 1.5776881486177445, "learning_rate": 9.950615150411705e-06, "loss": 0.00020953675266355276, "num_tokens": 1315125.0, "reward": 0.03991667032241821, "reward_std": 0.05668639615178108, "rewards/env_goofspiel_reward/mean": 0.03991666734218598, "rewards/env_goofspiel_reward/std": 0.15578595399856568, "sampling/importance_sampling_ratio/max": 1.5614466190338134, "sampling/importance_sampling_ratio/mean": 1.0267313480377198, "sampling/importance_sampling_ratio/min": 0.8062139034271241, "sampling/sampling_logp_difference/max": 0.38132710456848146, "sampling/sampling_logp_difference/mean": 0.017732756957411767, "step": 65, "step_time": 2.3855804428001646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 288.8666748046875, "completions/mean_terminated_length": 288.8666748046875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.36806915551424024, "epoch": 0.008400336013440538, "frac_reward_zero_std": 0.8333333730697632, "grad_norm": 0.103515625, "kl": 0.154165069013834, "learning_rate": 9.950605842910668e-06, "loss": 3.9057480171322824e-05, "num_tokens": 1415706.0, "reward": 0.0650000050663948, "reward_std": 0.07778174877166748, "rewards/env_goofspiel_reward/mean": 0.06500000134110451, "rewards/env_goofspiel_reward/std": 0.20113323032855987, "sampling/importance_sampling_ratio/max": 1.3685919523239136, "sampling/importance_sampling_ratio/mean": 0.9923707365989685, "sampling/importance_sampling_ratio/min": 0.6977368593215942, "sampling/sampling_logp_difference/max": 0.37837958335876465, "sampling/sampling_logp_difference/mean": 0.023688069358468056, "step": 70, "step_time": 2.419219413600149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 296.4750122070312, "completions/mean_terminated_length": 296.4750122070312, "completions/min_length": 218.8, "completions/min_terminated_length": 218.8, "entropy": 0.44495113492012023, "epoch": 0.009000360014400577, "frac_reward_zero_std": 0.7666666984558106, "grad_norm": 0.13671875, "kl": 0.308458948135376, "learning_rate": 9.950595058038197e-06, "loss": 0.00013219380052760245, "num_tokens": 1517756.0, "reward": 0.054750004410743715, "reward_std": 0.07813530415296555, "rewards/env_goofspiel_reward/mean": 0.05475000143051147, "rewards/env_goofspiel_reward/std": 0.17614837288856505, "sampling/importance_sampling_ratio/max": 1.4038194417953491, "sampling/importance_sampling_ratio/mean": 0.9863661766052246, "sampling/importance_sampling_ratio/min": 0.6411556363105774, "sampling/sampling_logp_difference/max": 0.5436622142791748, "sampling/sampling_logp_difference/mean": 0.029225154593586922, "step": 75, "step_time": 2.370857671000158 }, { "epoch": 0.009000360014400577, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 274.2, "eval_completions/max_terminated_length": 274.2, "eval_completions/mean_length": 254.7, "eval_completions/mean_terminated_length": 254.7, "eval_completions/min_length": 235.4, "eval_completions/min_terminated_length": 235.4, "eval_entropy": 0.4275285005569458, "eval_frac_reward_zero_std": 0.6, "eval_kl": 0.317794269323349, "eval_loss": 9.086851787287742e-05, "eval_num_tokens": 1517756.0, "eval_reward": 0.12000000476837158, "eval_reward_std": 0.16970562934875488, "eval_rewards/env_goofspiel_reward/mean": 0.12000000476837158, "eval_rewards/env_goofspiel_reward/std": 0.24000003337860107, "eval_runtime": 1.6574, "eval_samples_per_second": 6.034, "eval_sampling/importance_sampling_ratio/max": 1.1251073837280274, "eval_sampling/importance_sampling_ratio/mean": 0.9802677392959595, "eval_sampling/importance_sampling_ratio/min": 0.8105595707893372, "eval_sampling/sampling_logp_difference/max": 0.2066459536552429, "eval_sampling/sampling_logp_difference/mean": 0.03130748393014073, "eval_steps_per_second": 1.81, "step": 75 } ], "logging_steps": 5, "max_steps": 24999, "num_input_tokens_seen": 1517756, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }