Instructions to use Gege24/test_leduc_poker with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/test_leduc_poker with PEFT:
Base model is not found.
- Transformers
How to use Gege24/test_leduc_poker with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/test_leduc_poker")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/test_leduc_poker", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/test_leduc_poker with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/test_leduc_poker"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/test_leduc_poker",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Gege24/test_leduc_poker
- SGLang
How to use Gege24/test_leduc_poker with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Gege24/test_leduc_poker" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/test_leduc_poker",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Gege24/test_leduc_poker" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/test_leduc_poker",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Gege24/test_leduc_poker with Docker Model Runner:
docker model run hf.co/Gege24/test_leduc_poker
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.00068, | |
| "eval_steps": 500, | |
| "global_step": 68, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 309.0, | |
| "completions/max_terminated_length": 309.0, | |
| "completions/mean_length": 160.90625, | |
| "completions/mean_terminated_length": 160.90625, | |
| "completions/min_length": 6.0, | |
| "completions/min_terminated_length": 6.0, | |
| "entropy": 8.38103711605072, | |
| "epoch": 1e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0028616045601665974, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0002, | |
| "num_tokens": 18085.0, | |
| "reward": 0.03125002980232239, | |
| "reward_std": 0.6862481236457825, | |
| "rewards/rollout_reward_func/mean": 0.03125002980232239, | |
| "rewards/rollout_reward_func/std": 1.011366844177246, | |
| "sampling/importance_sampling_ratio/max": 0.010557955130934715, | |
| "sampling/importance_sampling_ratio/mean": 0.003285687882453203, | |
| "sampling/importance_sampling_ratio/min": 7.365059625542847e-13, | |
| "sampling/sampling_logp_difference/max": 10.706700325012207, | |
| "sampling/sampling_logp_difference/mean": 1.4828054904937744, | |
| "step": 1, | |
| "step_time": 5.417499241000769 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.38103711605072, | |
| "epoch": 2e-05, | |
| "grad_norm": 0.002821348374709487, | |
| "kl": 0.0, | |
| "learning_rate": 2.8571428571428575e-07, | |
| "loss": -0.0002, | |
| "step": 2, | |
| "step_time": 2.1234856260016386 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 297.0, | |
| "completions/max_terminated_length": 297.0, | |
| "completions/mean_length": 80.875, | |
| "completions/mean_terminated_length": 80.875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.941936016082764, | |
| "epoch": 3e-05, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.041607797145843506, | |
| "kl": 0.0004363941061455989, | |
| "learning_rate": 5.714285714285715e-07, | |
| "loss": 0.0023, | |
| "num_tokens": 33737.0, | |
| "reward": -0.13124999403953552, | |
| "reward_std": 0.7598094344139099, | |
| "rewards/rollout_reward_func/mean": -0.13124999403953552, | |
| "rewards/rollout_reward_func/std": 1.0014303922653198, | |
| "sampling/importance_sampling_ratio/max": 0.11680073291063309, | |
| "sampling/importance_sampling_ratio/mean": 0.033671747893095016, | |
| "sampling/importance_sampling_ratio/min": 1.0370337122367346e-06, | |
| "sampling/sampling_logp_difference/max": 4.6255574226379395, | |
| "sampling/sampling_logp_difference/mean": 1.339035987854004, | |
| "step": 3, | |
| "step_time": 3.9910208220007917 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.934386253356934, | |
| "epoch": 4e-05, | |
| "grad_norm": 0.04307129234075546, | |
| "kl": 0.0004773353211930953, | |
| "learning_rate": 8.571428571428572e-07, | |
| "loss": 0.0023, | |
| "step": 4, | |
| "step_time": 2.6708831029991416 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 297.0, | |
| "completions/max_terminated_length": 297.0, | |
| "completions/mean_length": 129.0625, | |
| "completions/mean_terminated_length": 129.0625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.268575668334961, | |
| "epoch": 5e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03974386304616928, | |
| "kl": 0.0007551452181360219, | |
| "learning_rate": 1.142857142857143e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 51251.0, | |
| "reward": -0.53125, | |
| "reward_std": 0.5484436750411987, | |
| "rewards/rollout_reward_func/mean": -0.53125, | |
| "rewards/rollout_reward_func/std": 0.8789427280426025, | |
| "sampling/importance_sampling_ratio/max": 0.11807744950056076, | |
| "sampling/importance_sampling_ratio/mean": 0.024541109800338745, | |
| "sampling/importance_sampling_ratio/min": 1.4743216955430405e-13, | |
| "sampling/sampling_logp_difference/max": 10.85576057434082, | |
| "sampling/sampling_logp_difference/mean": 1.503469467163086, | |
| "step": 5, | |
| "step_time": 4.814693420999902 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.273618221282959, | |
| "epoch": 6e-05, | |
| "grad_norm": 0.04039419814944267, | |
| "kl": 0.0003924804532289272, | |
| "learning_rate": 1.4285714285714286e-06, | |
| "loss": 0.0022, | |
| "step": 6, | |
| "step_time": 2.0722021359997598 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0052083334885537624, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 369.0, | |
| "completions/max_terminated_length": 369.0, | |
| "completions/mean_length": 159.3125, | |
| "completions/mean_terminated_length": 157.77418518066406, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.400971293449402, | |
| "epoch": 7e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02198818139731884, | |
| "kl": 0.0007719468412688002, | |
| "learning_rate": 1.7142857142857145e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 69453.0, | |
| "reward": -0.6624999642372131, | |
| "reward_std": 0.48922622203826904, | |
| "rewards/rollout_reward_func/mean": -0.6624999642372131, | |
| "rewards/rollout_reward_func/std": 0.8071255087852478, | |
| "sampling/importance_sampling_ratio/max": 0.10809381306171417, | |
| "sampling/importance_sampling_ratio/mean": 0.011609362438321114, | |
| "sampling/importance_sampling_ratio/min": 1.4364835228825295e-22, | |
| "sampling/sampling_logp_difference/max": 11.3631591796875, | |
| "sampling/sampling_logp_difference/mean": 1.7411997318267822, | |
| "step": 7, | |
| "step_time": 4.814280139999937 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014583333861082792, | |
| "clip_ratio/high_mean": 0.007291666930541396, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007291666930541396, | |
| "entropy": 8.400816917419434, | |
| "epoch": 8e-05, | |
| "grad_norm": 0.021912436932325363, | |
| "kl": 0.000808713368314784, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0013, | |
| "step": 8, | |
| "step_time": 2.084449717998723 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0052083334885537624, | |
| "clip_ratio/high_mean": 0.0026041667442768812, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0026041667442768812, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 394.0, | |
| "completions/max_terminated_length": 394.0, | |
| "completions/mean_length": 160.40625, | |
| "completions/mean_terminated_length": 156.1666717529297, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.220785737037659, | |
| "epoch": 9e-05, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00841094646602869, | |
| "kl": 0.0007819603888492566, | |
| "learning_rate": 2.285714285714286e-06, | |
| "loss": -0.0006, | |
| "num_tokens": 88018.0, | |
| "reward": -0.6656249761581421, | |
| "reward_std": 0.30112773180007935, | |
| "rewards/rollout_reward_func/mean": -0.6656249761581421, | |
| "rewards/rollout_reward_func/std": 0.8090652823448181, | |
| "sampling/importance_sampling_ratio/max": 0.10453330725431442, | |
| "sampling/importance_sampling_ratio/mean": 0.009016389958560467, | |
| "sampling/importance_sampling_ratio/min": 3.77039369235492e-14, | |
| "sampling/sampling_logp_difference/max": 11.177996635437012, | |
| "sampling/sampling_logp_difference/mean": 1.46906578540802, | |
| "step": 9, | |
| "step_time": 5.767028449001373 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010416666977107525, | |
| "clip_ratio/high_mean": 0.0078125, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0078125, | |
| "entropy": 8.221423089504242, | |
| "epoch": 0.0001, | |
| "grad_norm": 0.00875311903655529, | |
| "kl": 0.0009726146636239719, | |
| "learning_rate": 2.571428571428571e-06, | |
| "loss": -0.0006, | |
| "step": 10, | |
| "step_time": 3.8331103400005304 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 476.0, | |
| "completions/max_terminated_length": 476.0, | |
| "completions/mean_length": 146.875, | |
| "completions/mean_terminated_length": 146.875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.224214434623718, | |
| "epoch": 0.00011, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005308263469487429, | |
| "kl": 0.0006370486844389234, | |
| "learning_rate": 2.8571428571428573e-06, | |
| "loss": -0.0005, | |
| "num_tokens": 105118.0, | |
| "reward": -0.6531249284744263, | |
| "reward_std": 0.27538806200027466, | |
| "rewards/rollout_reward_func/mean": -0.6531249284744263, | |
| "rewards/rollout_reward_func/std": 0.7935158014297485, | |
| "sampling/importance_sampling_ratio/max": 0.09377396106719971, | |
| "sampling/importance_sampling_ratio/mean": 0.007022843696177006, | |
| "sampling/importance_sampling_ratio/min": 1.1420680792263728e-27, | |
| "sampling/sampling_logp_difference/max": 4.9811320304870605, | |
| "sampling/sampling_logp_difference/mean": 1.3924936056137085, | |
| "step": 11, | |
| "step_time": 5.839474645000337 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.226899027824402, | |
| "epoch": 0.00012, | |
| "grad_norm": 0.005312995053827763, | |
| "kl": 0.0006140958357718773, | |
| "learning_rate": 3.142857142857143e-06, | |
| "loss": -0.0005, | |
| "step": 12, | |
| "step_time": 2.6753236439999455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 294.0, | |
| "completions/max_terminated_length": 294.0, | |
| "completions/mean_length": 188.25, | |
| "completions/mean_terminated_length": 188.25, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "entropy": 8.331215858459473, | |
| "epoch": 0.00013, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016523015219718218, | |
| "kl": 0.0006200866155268159, | |
| "learning_rate": 3.428571428571429e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 124622.0, | |
| "reward": -1.009374976158142, | |
| "reward_std": 0.0265165027230978, | |
| "rewards/rollout_reward_func/mean": -1.009374976158142, | |
| "rewards/rollout_reward_func/std": 0.03901509940624237, | |
| "sampling/importance_sampling_ratio/max": 0.0077222432009875774, | |
| "sampling/importance_sampling_ratio/mean": 0.0027147922664880753, | |
| "sampling/importance_sampling_ratio/min": 3.714021278022894e-14, | |
| "sampling/sampling_logp_difference/max": 11.583379745483398, | |
| "sampling/sampling_logp_difference/mean": 1.4247815608978271, | |
| "step": 13, | |
| "step_time": 4.811962196999048 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.332964897155762, | |
| "epoch": 0.00014, | |
| "grad_norm": 0.0017943360144272447, | |
| "kl": 0.0005818934332637582, | |
| "learning_rate": 3.7142857142857146e-06, | |
| "loss": -0.0001, | |
| "step": 14, | |
| "step_time": 2.0559995119983796 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 314.0, | |
| "completions/max_terminated_length": 314.0, | |
| "completions/mean_length": 134.34375, | |
| "completions/mean_terminated_length": 134.34375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.289089798927307, | |
| "epoch": 0.00015, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006275709718465805, | |
| "kl": 0.0006801459057896864, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": -0.0006, | |
| "num_tokens": 142137.0, | |
| "reward": -1.0187499523162842, | |
| "reward_std": 0.04082316905260086, | |
| "rewards/rollout_reward_func/mean": -1.0187499523162842, | |
| "rewards/rollout_reward_func/std": 0.04709291458129883, | |
| "sampling/importance_sampling_ratio/max": 0.1032625362277031, | |
| "sampling/importance_sampling_ratio/mean": 0.010007976554334164, | |
| "sampling/importance_sampling_ratio/min": 1.6740179376029118e-07, | |
| "sampling/sampling_logp_difference/max": 4.918362617492676, | |
| "sampling/sampling_logp_difference/mean": 1.347560167312622, | |
| "step": 15, | |
| "step_time": 4.26574305899976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.295971155166626, | |
| "epoch": 0.00016, | |
| "grad_norm": 0.006583907175809145, | |
| "kl": 0.0010946946458716411, | |
| "learning_rate": 4.2857142857142855e-06, | |
| "loss": -0.0006, | |
| "step": 16, | |
| "step_time": 3.0452185289996123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 342.0, | |
| "completions/max_terminated_length": 342.0, | |
| "completions/mean_length": 176.59375, | |
| "completions/mean_terminated_length": 176.59375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.386258959770203, | |
| "epoch": 0.00017, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.019049938768148422, | |
| "kl": 0.0009286534441343974, | |
| "learning_rate": 4.571428571428572e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 161764.0, | |
| "reward": 0.34687501192092896, | |
| "reward_std": 0.3751429617404938, | |
| "rewards/rollout_reward_func/mean": 0.34687501192092896, | |
| "rewards/rollout_reward_func/std": 0.9510552287101746, | |
| "sampling/importance_sampling_ratio/max": 0.10295701771974564, | |
| "sampling/importance_sampling_ratio/mean": 0.00530514121055603, | |
| "sampling/importance_sampling_ratio/min": 1.1150266265858022e-09, | |
| "sampling/sampling_logp_difference/max": 8.94926643371582, | |
| "sampling/sampling_logp_difference/mean": 1.3967115879058838, | |
| "step": 17, | |
| "step_time": 4.39325438600099 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.376275658607483, | |
| "epoch": 0.00018, | |
| "grad_norm": 0.017463266849517822, | |
| "kl": 0.0013417988302535377, | |
| "learning_rate": 4.857142857142858e-06, | |
| "loss": 0.0007, | |
| "step": 18, | |
| "step_time": 2.0553056610006024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 405.0, | |
| "completions/max_terminated_length": 405.0, | |
| "completions/mean_length": 163.53125, | |
| "completions/mean_terminated_length": 163.53125, | |
| "completions/min_length": 62.0, | |
| "completions/min_terminated_length": 62.0, | |
| "entropy": 8.383512496948242, | |
| "epoch": 0.00019, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0029027618002146482, | |
| "kl": 0.0011069331085309386, | |
| "learning_rate": 5.142857142857142e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 180341.0, | |
| "reward": -0.840624988079071, | |
| "reward_std": 0.2795426845550537, | |
| "rewards/rollout_reward_func/mean": -0.840624988079071, | |
| "rewards/rollout_reward_func/std": 0.570715069770813, | |
| "sampling/importance_sampling_ratio/max": 0.009223885834217072, | |
| "sampling/importance_sampling_ratio/mean": 0.002876629587262869, | |
| "sampling/importance_sampling_ratio/min": 3.4141774769962514e-21, | |
| "sampling/sampling_logp_difference/max": 11.647392272949219, | |
| "sampling/sampling_logp_difference/mean": 1.74857759475708, | |
| "step": 19, | |
| "step_time": 5.028274022000915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.370469331741333, | |
| "epoch": 0.0002, | |
| "grad_norm": 0.002040296094492078, | |
| "kl": 0.0010489341875654645, | |
| "learning_rate": 5.428571428571429e-06, | |
| "loss": -0.0001, | |
| "step": 20, | |
| "step_time": 2.1169578549997823 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 373.0, | |
| "completions/max_terminated_length": 373.0, | |
| "completions/mean_length": 155.15625, | |
| "completions/mean_terminated_length": 155.15625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.354791045188904, | |
| "epoch": 0.00021, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014582234434783459, | |
| "kl": 0.0016839846794027835, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 0.0006, | |
| "num_tokens": 198770.0, | |
| "reward": 0.46875, | |
| "reward_std": 0.6740255355834961, | |
| "rewards/rollout_reward_func/mean": 0.46875, | |
| "rewards/rollout_reward_func/std": 0.9006941318511963, | |
| "sampling/importance_sampling_ratio/max": 0.08340345323085785, | |
| "sampling/importance_sampling_ratio/mean": 0.006231832783669233, | |
| "sampling/importance_sampling_ratio/min": 2.269645449359814e-08, | |
| "sampling/sampling_logp_difference/max": 2.1503682136535645, | |
| "sampling/sampling_logp_difference/mean": 1.321890115737915, | |
| "step": 21, | |
| "step_time": 4.95798639899931 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.337073802947998, | |
| "epoch": 0.00022, | |
| "grad_norm": 0.014086912386119366, | |
| "kl": 0.0020258916774764657, | |
| "learning_rate": 6e-06, | |
| "loss": 0.0006, | |
| "step": 22, | |
| "step_time": 2.598393530000976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 417.0, | |
| "completions/max_terminated_length": 417.0, | |
| "completions/mean_length": 152.59375, | |
| "completions/mean_terminated_length": 152.59375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.294317483901978, | |
| "epoch": 0.00023, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.019008534029126167, | |
| "kl": 0.004680573721998371, | |
| "learning_rate": 6.285714285714286e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 217253.0, | |
| "reward": -0.53125, | |
| "reward_std": 0.3699263334274292, | |
| "rewards/rollout_reward_func/mean": -0.53125, | |
| "rewards/rollout_reward_func/std": 0.8785756230354309, | |
| "sampling/importance_sampling_ratio/max": 0.09325665980577469, | |
| "sampling/importance_sampling_ratio/mean": 0.009642375633120537, | |
| "sampling/importance_sampling_ratio/min": 1.0861621381728576e-20, | |
| "sampling/sampling_logp_difference/max": 10.062246322631836, | |
| "sampling/sampling_logp_difference/mean": 1.4383571147918701, | |
| "step": 23, | |
| "step_time": 4.488339367000663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0020833334419876337, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020833334419876337, | |
| "entropy": 8.280492901802063, | |
| "epoch": 0.00024, | |
| "grad_norm": 0.01694045588374138, | |
| "kl": 0.007116928434697911, | |
| "learning_rate": 6.571428571428572e-06, | |
| "loss": 0.0007, | |
| "step": 24, | |
| "step_time": 2.144616393000433 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 262.0, | |
| "completions/max_terminated_length": 262.0, | |
| "completions/mean_length": 141.1875, | |
| "completions/mean_terminated_length": 141.1875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.207048416137695, | |
| "epoch": 0.00025, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008826951496303082, | |
| "kl": 0.011807448376202956, | |
| "learning_rate": 6.857142857142858e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 235259.0, | |
| "reward": -0.7625000476837158, | |
| "reward_std": 0.2917833626270294, | |
| "rewards/rollout_reward_func/mean": -0.7625000476837158, | |
| "rewards/rollout_reward_func/std": 0.6776382327079773, | |
| "sampling/importance_sampling_ratio/max": 0.06867893040180206, | |
| "sampling/importance_sampling_ratio/mean": 0.007195095531642437, | |
| "sampling/importance_sampling_ratio/min": 8.830933421045788e-15, | |
| "sampling/sampling_logp_difference/max": 4.469038486480713, | |
| "sampling/sampling_logp_difference/mean": 1.387784719467163, | |
| "step": 25, | |
| "step_time": 5.051108140999531 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.201621413230896, | |
| "epoch": 0.00026, | |
| "grad_norm": 0.009136058390140533, | |
| "kl": 0.014795138500630856, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": -0.0003, | |
| "step": 26, | |
| "step_time": 2.5399822110020978 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 290.0, | |
| "completions/max_terminated_length": 290.0, | |
| "completions/mean_length": 136.9375, | |
| "completions/mean_terminated_length": 136.9375, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "entropy": 8.213225603103638, | |
| "epoch": 0.00027, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002163404831662774, | |
| "kl": 0.0031390516232931986, | |
| "learning_rate": 7.428571428571429e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 252577.0, | |
| "reward": -0.3937499523162842, | |
| "reward_std": 0.2508378326892853, | |
| "rewards/rollout_reward_func/mean": -0.3937499523162842, | |
| "rewards/rollout_reward_func/std": 0.9557761549949646, | |
| "sampling/importance_sampling_ratio/max": 0.010245459154248238, | |
| "sampling/importance_sampling_ratio/mean": 0.004281196743249893, | |
| "sampling/importance_sampling_ratio/min": 9.54090864979662e-07, | |
| "sampling/sampling_logp_difference/max": 3.967635154724121, | |
| "sampling/sampling_logp_difference/mean": 1.3100483417510986, | |
| "step": 27, | |
| "step_time": 6.014637065999523 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.203812718391418, | |
| "epoch": 0.00028, | |
| "grad_norm": 0.002143328543752432, | |
| "kl": 0.003412064048461616, | |
| "learning_rate": 7.714285714285716e-06, | |
| "loss": -0.0001, | |
| "step": 28, | |
| "step_time": 2.5550204580013087 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 402.0, | |
| "completions/max_terminated_length": 402.0, | |
| "completions/mean_length": 146.15625, | |
| "completions/mean_terminated_length": 146.15625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.294090986251831, | |
| "epoch": 0.00029, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.014697319827973843, | |
| "kl": 0.025721593061462045, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 270222.0, | |
| "reward": -0.703125, | |
| "reward_std": 0.2893909811973572, | |
| "rewards/rollout_reward_func/mean": -0.703125, | |
| "rewards/rollout_reward_func/std": 0.7459500432014465, | |
| "sampling/importance_sampling_ratio/max": 0.05455424264073372, | |
| "sampling/importance_sampling_ratio/mean": 0.006639046128839254, | |
| "sampling/importance_sampling_ratio/min": 5.018679106327676e-15, | |
| "sampling/sampling_logp_difference/max": 9.214568138122559, | |
| "sampling/sampling_logp_difference/mean": 1.469120979309082, | |
| "step": 29, | |
| "step_time": 5.175338321999334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.281907558441162, | |
| "epoch": 0.0003, | |
| "grad_norm": 0.01356051117181778, | |
| "kl": 0.02538611611817032, | |
| "learning_rate": 8.285714285714287e-06, | |
| "loss": -0.0002, | |
| "step": 30, | |
| "step_time": 2.570307294000486 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 217.0, | |
| "completions/max_terminated_length": 217.0, | |
| "completions/mean_length": 122.40625, | |
| "completions/mean_terminated_length": 122.40625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.991218149662018, | |
| "epoch": 0.00031, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0034507170785218477, | |
| "kl": 0.012121076317271218, | |
| "learning_rate": 8.571428571428571e-06, | |
| "loss": 0.0, | |
| "num_tokens": 287003.0, | |
| "reward": -1.0, | |
| "reward_std": 0.0, | |
| "rewards/rollout_reward_func/mean": -1.0, | |
| "rewards/rollout_reward_func/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 0.06423835456371307, | |
| "sampling/importance_sampling_ratio/mean": 0.006418607663363218, | |
| "sampling/importance_sampling_ratio/min": 2.267779519726787e-09, | |
| "sampling/sampling_logp_difference/max": 7.289473533630371, | |
| "sampling/sampling_logp_difference/mean": 1.3377501964569092, | |
| "step": 31, | |
| "step_time": 4.991982824997649 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.985842347145081, | |
| "epoch": 0.00032, | |
| "grad_norm": 0.0031677871011197567, | |
| "kl": 0.01152854437532369, | |
| "learning_rate": 8.857142857142858e-06, | |
| "loss": 0.0, | |
| "step": 32, | |
| "step_time": 2.551680210001905 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 290.0, | |
| "completions/max_terminated_length": 290.0, | |
| "completions/mean_length": 141.4375, | |
| "completions/mean_terminated_length": 141.4375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.251873970031738, | |
| "epoch": 0.00033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011402878910303116, | |
| "kl": 0.015730057610198855, | |
| "learning_rate": 9.142857142857144e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 305185.0, | |
| "reward": -0.3999999761581421, | |
| "reward_std": 0.5431233644485474, | |
| "rewards/rollout_reward_func/mean": -0.3999999761581421, | |
| "rewards/rollout_reward_func/std": 0.9466408491134644, | |
| "sampling/importance_sampling_ratio/max": 0.05344817042350769, | |
| "sampling/importance_sampling_ratio/mean": 0.005628373473882675, | |
| "sampling/importance_sampling_ratio/min": 1.8508519615559533e-23, | |
| "sampling/sampling_logp_difference/max": 13.587614059448242, | |
| "sampling/sampling_logp_difference/mean": 1.4650746583938599, | |
| "step": 33, | |
| "step_time": 6.090412677998756 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.246023058891296, | |
| "epoch": 0.00034, | |
| "grad_norm": 0.008096246048808098, | |
| "kl": 0.014423061889829114, | |
| "learning_rate": 9.42857142857143e-06, | |
| "loss": -0.0004, | |
| "step": 34, | |
| "step_time": 2.5453261089996886 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 282.0, | |
| "completions/max_terminated_length": 282.0, | |
| "completions/mean_length": 132.625, | |
| "completions/mean_terminated_length": 132.625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.162899851799011, | |
| "epoch": 0.00035, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.004175754263997078, | |
| "kl": 0.018283673271071166, | |
| "learning_rate": 9.714285714285715e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 323325.0, | |
| "reward": -0.5718749761581421, | |
| "reward_std": 0.18343815207481384, | |
| "rewards/rollout_reward_func/mean": -0.5718749761581421, | |
| "rewards/rollout_reward_func/std": 0.8301706910133362, | |
| "sampling/importance_sampling_ratio/max": 0.062451351433992386, | |
| "sampling/importance_sampling_ratio/mean": 0.007372735999524593, | |
| "sampling/importance_sampling_ratio/min": 1.2278145562505762e-20, | |
| "sampling/sampling_logp_difference/max": 9.404362678527832, | |
| "sampling/sampling_logp_difference/mean": 1.5296763181686401, | |
| "step": 35, | |
| "step_time": 5.175936893000653 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.150159418582916, | |
| "epoch": 0.00036, | |
| "grad_norm": 0.004381407983601093, | |
| "kl": 0.017532640282297507, | |
| "learning_rate": 1e-05, | |
| "loss": -0.0001, | |
| "step": 36, | |
| "step_time": 2.551337270998374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 350.0, | |
| "completions/max_terminated_length": 350.0, | |
| "completions/mean_length": 193.5625, | |
| "completions/mean_terminated_length": 193.5625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.269986033439636, | |
| "epoch": 0.00037, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008302879519760609, | |
| "kl": 0.009193298814352602, | |
| "learning_rate": 9.9999999995372e-06, | |
| "loss": 0.0005, | |
| "num_tokens": 342711.0, | |
| "reward": -0.03749999403953552, | |
| "reward_std": 0.7515531778335571, | |
| "rewards/rollout_reward_func/mean": -0.03749999403953552, | |
| "rewards/rollout_reward_func/std": 1.0247737169265747, | |
| "sampling/importance_sampling_ratio/max": 0.06419403851032257, | |
| "sampling/importance_sampling_ratio/mean": 0.003890752326697111, | |
| "sampling/importance_sampling_ratio/min": 1.164774526829504e-10, | |
| "sampling/sampling_logp_difference/max": 3.794468402862549, | |
| "sampling/sampling_logp_difference/mean": 1.2929356098175049, | |
| "step": 37, | |
| "step_time": 4.535253317000752 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.270979046821594, | |
| "epoch": 0.00038, | |
| "grad_norm": 0.00880998931825161, | |
| "kl": 0.007889840024290606, | |
| "learning_rate": 9.999999998148802e-06, | |
| "loss": 0.0005, | |
| "step": 38, | |
| "step_time": 2.492936824000026 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 347.0, | |
| "completions/max_terminated_length": 347.0, | |
| "completions/mean_length": 171.09375, | |
| "completions/mean_terminated_length": 172.93548583984375, | |
| "completions/min_length": 89.0, | |
| "completions/min_terminated_length": 89.0, | |
| "entropy": 8.237572193145752, | |
| "epoch": 0.00039, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003999819979071617, | |
| "kl": 0.003984437556937337, | |
| "learning_rate": 9.999999995834804e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 361754.0, | |
| "reward": -0.2750000059604645, | |
| "reward_std": 0.29003918170928955, | |
| "rewards/rollout_reward_func/mean": -0.2750000059604645, | |
| "rewards/rollout_reward_func/std": 0.9721940755844116, | |
| "sampling/importance_sampling_ratio/max": 0.00961074884980917, | |
| "sampling/importance_sampling_ratio/mean": 0.0027457564137876034, | |
| "sampling/importance_sampling_ratio/min": 1.3412947045145382e-17, | |
| "sampling/sampling_logp_difference/max": 10.32085132598877, | |
| "sampling/sampling_logp_difference/mean": 1.3388471603393555, | |
| "step": 39, | |
| "step_time": 4.8369908450004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.241283535957336, | |
| "epoch": 0.0004, | |
| "grad_norm": 0.003973928280174732, | |
| "kl": 0.004160504468018189, | |
| "learning_rate": 9.999999992595207e-06, | |
| "loss": -0.0003, | |
| "step": 40, | |
| "step_time": 2.064312154999243 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 408.0, | |
| "completions/max_terminated_length": 408.0, | |
| "completions/mean_length": 149.6875, | |
| "completions/mean_terminated_length": 150.7096710205078, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "entropy": 8.331173419952393, | |
| "epoch": 0.00041, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0036302730441093445, | |
| "kl": 0.002889991897973232, | |
| "learning_rate": 9.999999988430008e-06, | |
| "loss": -0.0, | |
| "num_tokens": 380000.0, | |
| "reward": -0.40312498807907104, | |
| "reward_std": 0.5359131097793579, | |
| "rewards/rollout_reward_func/mean": -0.40312498807907104, | |
| "rewards/rollout_reward_func/std": 0.9361845850944519, | |
| "sampling/importance_sampling_ratio/max": 0.009129444137215614, | |
| "sampling/importance_sampling_ratio/mean": 0.003908202983438969, | |
| "sampling/importance_sampling_ratio/min": 3.258544557229945e-14, | |
| "sampling/sampling_logp_difference/max": 4.673151016235352, | |
| "sampling/sampling_logp_difference/mean": 1.426315426826477, | |
| "step": 41, | |
| "step_time": 4.397045022999009 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.317348182201385, | |
| "epoch": 0.00042, | |
| "grad_norm": 0.002291926182806492, | |
| "kl": 0.002586090617114678, | |
| "learning_rate": 9.999999983339212e-06, | |
| "loss": -0.0, | |
| "step": 42, | |
| "step_time": 2.1006949159991564 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 408.0, | |
| "completions/max_terminated_length": 408.0, | |
| "completions/mean_length": 168.1875, | |
| "completions/mean_terminated_length": 168.1875, | |
| "completions/min_length": 89.0, | |
| "completions/min_terminated_length": 89.0, | |
| "entropy": 8.245356917381287, | |
| "epoch": 0.00043, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004080729093402624, | |
| "kl": 0.003670690639410168, | |
| "learning_rate": 9.999999977322818e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 398190.0, | |
| "reward": -0.5843750238418579, | |
| "reward_std": 0.22265997529029846, | |
| "rewards/rollout_reward_func/mean": -0.5843750238418579, | |
| "rewards/rollout_reward_func/std": 0.8462857007980347, | |
| "sampling/importance_sampling_ratio/max": 0.010589290410280228, | |
| "sampling/importance_sampling_ratio/mean": 0.003948138561099768, | |
| "sampling/importance_sampling_ratio/min": 1.0654254561925924e-10, | |
| "sampling/sampling_logp_difference/max": 9.166659355163574, | |
| "sampling/sampling_logp_difference/mean": 1.3605971336364746, | |
| "step": 43, | |
| "step_time": 4.460575121997863 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.227819621562958, | |
| "epoch": 0.00044, | |
| "grad_norm": 0.004100060556083918, | |
| "kl": 0.003359506925335154, | |
| "learning_rate": 9.999999970380822e-06, | |
| "loss": -0.0004, | |
| "step": 44, | |
| "step_time": 3.0909940020010254 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01785714365541935, | |
| "clip_ratio/high_mean": 0.008928571827709675, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008928571827709675, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 297.0, | |
| "completions/max_terminated_length": 297.0, | |
| "completions/mean_length": 160.40625, | |
| "completions/mean_terminated_length": 160.40625, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "entropy": 8.268202066421509, | |
| "epoch": 0.00045, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0023337905295193195, | |
| "kl": 0.0028031190449837595, | |
| "learning_rate": 9.999999962513228e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 416179.0, | |
| "reward": -0.5843750238418579, | |
| "reward_std": 0.20319493114948273, | |
| "rewards/rollout_reward_func/mean": -0.5843750238418579, | |
| "rewards/rollout_reward_func/std": 0.830510675907135, | |
| "sampling/importance_sampling_ratio/max": 0.010615027509629726, | |
| "sampling/importance_sampling_ratio/mean": 0.003981100395321846, | |
| "sampling/importance_sampling_ratio/min": 1.0911494689562484e-13, | |
| "sampling/sampling_logp_difference/max": 11.518604278564453, | |
| "sampling/sampling_logp_difference/mean": 1.4530669450759888, | |
| "step": 45, | |
| "step_time": 4.283326912999655 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01785714365541935, | |
| "clip_ratio/high_mean": 0.008928571827709675, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008928571827709675, | |
| "entropy": 8.239543914794922, | |
| "epoch": 0.00046, | |
| "grad_norm": 0.0035347214434295893, | |
| "kl": 0.0033496549731353298, | |
| "learning_rate": 9.999999953720035e-06, | |
| "loss": -0.0001, | |
| "step": 46, | |
| "step_time": 2.0328167930001655 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 346.0, | |
| "completions/max_terminated_length": 346.0, | |
| "completions/mean_length": 154.96875, | |
| "completions/mean_terminated_length": 154.96875, | |
| "completions/min_length": 6.0, | |
| "completions/min_terminated_length": 6.0, | |
| "entropy": 8.14276933670044, | |
| "epoch": 0.00047, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003259505843743682, | |
| "kl": 0.005610214080661535, | |
| "learning_rate": 9.99999994400124e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 434762.0, | |
| "reward": -1.0218749046325684, | |
| "reward_std": 0.04966200143098831, | |
| "rewards/rollout_reward_func/mean": -1.0218749046325684, | |
| "rewards/rollout_reward_func/std": 0.0490843690931797, | |
| "sampling/importance_sampling_ratio/max": 0.011610783636569977, | |
| "sampling/importance_sampling_ratio/mean": 0.003400737652555108, | |
| "sampling/importance_sampling_ratio/min": 9.674043361674659e-17, | |
| "sampling/sampling_logp_difference/max": 10.514336585998535, | |
| "sampling/sampling_logp_difference/mean": 1.5065617561340332, | |
| "step": 47, | |
| "step_time": 4.387635872000828 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004807692486792803, | |
| "clip_ratio/high_mean": 0.0024038462433964014, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0024038462433964014, | |
| "entropy": 8.125671744346619, | |
| "epoch": 0.00048, | |
| "grad_norm": 0.0028330760542303324, | |
| "kl": 0.005603832833003253, | |
| "learning_rate": 9.999999933356848e-06, | |
| "loss": -0.0004, | |
| "step": 48, | |
| "step_time": 2.031070074998752 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 265.0, | |
| "completions/max_terminated_length": 265.0, | |
| "completions/mean_length": 153.46875, | |
| "completions/mean_terminated_length": 154.61289978027344, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.048729181289673, | |
| "epoch": 0.00049, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005494742188602686, | |
| "kl": 0.03256297600455582, | |
| "learning_rate": 9.999999921786855e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 453105.0, | |
| "reward": -0.078125, | |
| "reward_std": 0.5431658029556274, | |
| "rewards/rollout_reward_func/mean": -0.078125, | |
| "rewards/rollout_reward_func/std": 1.0044207572937012, | |
| "sampling/importance_sampling_ratio/max": 0.05582950636744499, | |
| "sampling/importance_sampling_ratio/mean": 0.004962150938808918, | |
| "sampling/importance_sampling_ratio/min": 2.203801160657881e-12, | |
| "sampling/sampling_logp_difference/max": 9.083192825317383, | |
| "sampling/sampling_logp_difference/mean": 1.358087182044983, | |
| "step": 49, | |
| "step_time": 4.150341610999931 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.026359498500824, | |
| "epoch": 0.0005, | |
| "grad_norm": 0.005025926977396011, | |
| "kl": 0.02888420899398625, | |
| "learning_rate": 9.999999909291265e-06, | |
| "loss": -0.0001, | |
| "step": 50, | |
| "step_time": 2.9815445680005723 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 290.0, | |
| "completions/max_terminated_length": 290.0, | |
| "completions/mean_length": 143.65625, | |
| "completions/mean_terminated_length": 143.65625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.10213428735733, | |
| "epoch": 0.00051, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.001717552193440497, | |
| "kl": 0.011197627696674317, | |
| "learning_rate": 9.999999895870075e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 470670.0, | |
| "reward": -1.015625, | |
| "reward_std": 0.036278266459703445, | |
| "rewards/rollout_reward_func/mean": -1.015625, | |
| "rewards/rollout_reward_func/std": 0.05148990824818611, | |
| "sampling/importance_sampling_ratio/max": 0.06585416942834854, | |
| "sampling/importance_sampling_ratio/mean": 0.006765150930732489, | |
| "sampling/importance_sampling_ratio/min": 1.496036702519632e-06, | |
| "sampling/sampling_logp_difference/max": 4.3300042152404785, | |
| "sampling/sampling_logp_difference/mean": 1.3094828128814697, | |
| "step": 51, | |
| "step_time": 4.126140448999649 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.08252465724945, | |
| "epoch": 0.00052, | |
| "grad_norm": 0.0015695166075602174, | |
| "kl": 0.0106147377518937, | |
| "learning_rate": 9.999999881523285e-06, | |
| "loss": -0.0002, | |
| "step": 52, | |
| "step_time": 2.0502745620005953 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 314.0, | |
| "completions/max_terminated_length": 314.0, | |
| "completions/mean_length": 173.5, | |
| "completions/mean_terminated_length": 175.29031372070312, | |
| "completions/min_length": 91.0, | |
| "completions/min_terminated_length": 91.0, | |
| "entropy": 7.971049547195435, | |
| "epoch": 0.00053, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002233212348073721, | |
| "kl": 0.004628648399375379, | |
| "learning_rate": 9.999999866250896e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 489622.0, | |
| "reward": 0.04375000298023224, | |
| "reward_std": 0.6950876712799072, | |
| "rewards/rollout_reward_func/mean": 0.04375000298023224, | |
| "rewards/rollout_reward_func/std": 1.0162986516952515, | |
| "sampling/importance_sampling_ratio/max": 0.014256482943892479, | |
| "sampling/importance_sampling_ratio/mean": 0.0037497361190617085, | |
| "sampling/importance_sampling_ratio/min": 3.703341886623912e-13, | |
| "sampling/sampling_logp_difference/max": 10.441887855529785, | |
| "sampling/sampling_logp_difference/mean": 1.3701958656311035, | |
| "step": 53, | |
| "step_time": 4.342945265999333 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.953936517238617, | |
| "epoch": 0.00054, | |
| "grad_norm": 0.0023060261737555265, | |
| "kl": 0.004763354663737118, | |
| "learning_rate": 9.999999850052909e-06, | |
| "loss": -0.0001, | |
| "step": 54, | |
| "step_time": 2.074805829001889 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 469.0, | |
| "completions/max_terminated_length": 469.0, | |
| "completions/mean_length": 133.125, | |
| "completions/mean_terminated_length": 133.61289978027344, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 7.858851313591003, | |
| "epoch": 0.00055, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.010865299962460995, | |
| "kl": 0.05283547495491803, | |
| "learning_rate": 9.99999983292932e-06, | |
| "loss": 0.0005, | |
| "num_tokens": 507130.0, | |
| "reward": -0.08750000596046448, | |
| "reward_std": 0.2218937873840332, | |
| "rewards/rollout_reward_func/mean": -0.08750000596046448, | |
| "rewards/rollout_reward_func/std": 1.0082721710205078, | |
| "sampling/importance_sampling_ratio/max": 0.0794573649764061, | |
| "sampling/importance_sampling_ratio/mean": 0.01186932623386383, | |
| "sampling/importance_sampling_ratio/min": 2.8194972302655857e-18, | |
| "sampling/sampling_logp_difference/max": 11.174360275268555, | |
| "sampling/sampling_logp_difference/mean": 1.4186177253723145, | |
| "step": 55, | |
| "step_time": 4.7503355889994054 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005681818351149559, | |
| "clip_ratio/high_mean": 0.0028409091755747795, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0028409091755747795, | |
| "entropy": 7.84703141450882, | |
| "epoch": 0.00056, | |
| "grad_norm": 0.009876980446279049, | |
| "kl": 0.04835269978502765, | |
| "learning_rate": 9.999999814880132e-06, | |
| "loss": 0.0005, | |
| "step": 56, | |
| "step_time": 2.697479030000977 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 309.0, | |
| "completions/max_terminated_length": 309.0, | |
| "completions/mean_length": 161.03125, | |
| "completions/mean_terminated_length": 161.03125, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "entropy": 7.946120798587799, | |
| "epoch": 0.00057, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00416550925001502, | |
| "kl": 0.005225592787610367, | |
| "learning_rate": 9.999999795905347e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 524811.0, | |
| "reward": -0.328125, | |
| "reward_std": 0.7972557544708252, | |
| "rewards/rollout_reward_func/mean": -0.328125, | |
| "rewards/rollout_reward_func/std": 0.9642762541770935, | |
| "sampling/importance_sampling_ratio/max": 0.011825657449662685, | |
| "sampling/importance_sampling_ratio/mean": 0.004940683953464031, | |
| "sampling/importance_sampling_ratio/min": 7.442185295759504e-14, | |
| "sampling/sampling_logp_difference/max": 10.764989852905273, | |
| "sampling/sampling_logp_difference/mean": 1.3635454177856445, | |
| "step": 57, | |
| "step_time": 4.399483632999363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.954219579696655, | |
| "epoch": 0.00058, | |
| "grad_norm": 0.004001974128186703, | |
| "kl": 0.005270991328870878, | |
| "learning_rate": 9.999999776004962e-06, | |
| "loss": 0.0003, | |
| "step": 58, | |
| "step_time": 2.068970861998423 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 305.0, | |
| "completions/max_terminated_length": 305.0, | |
| "completions/mean_length": 137.78125, | |
| "completions/mean_terminated_length": 137.78125, | |
| "completions/min_length": 100.0, | |
| "completions/min_terminated_length": 100.0, | |
| "entropy": 7.7305819392204285, | |
| "epoch": 0.00059, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002240613801404834, | |
| "kl": 0.007570188608951867, | |
| "learning_rate": 9.999999755178978e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 542324.0, | |
| "reward": -0.565625011920929, | |
| "reward_std": 0.18561552464962006, | |
| "rewards/rollout_reward_func/mean": -0.565625011920929, | |
| "rewards/rollout_reward_func/std": 0.841890811920166, | |
| "sampling/importance_sampling_ratio/max": 0.014670869335532188, | |
| "sampling/importance_sampling_ratio/mean": 0.006734498776495457, | |
| "sampling/importance_sampling_ratio/min": 9.527041698945138e-13, | |
| "sampling/sampling_logp_difference/max": 10.319395065307617, | |
| "sampling/sampling_logp_difference/mean": 1.346876621246338, | |
| "step": 59, | |
| "step_time": 4.076921171999857 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.7493292689323425, | |
| "epoch": 0.0006, | |
| "grad_norm": 0.0021323147229850292, | |
| "kl": 0.007373731641564518, | |
| "learning_rate": 9.999999733427394e-06, | |
| "loss": -0.0002, | |
| "step": 60, | |
| "step_time": 2.065178036998077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 305.0, | |
| "completions/max_terminated_length": 305.0, | |
| "completions/mean_length": 131.15625, | |
| "completions/mean_terminated_length": 132.53334045410156, | |
| "completions/min_length": 100.0, | |
| "completions/min_terminated_length": 100.0, | |
| "entropy": 7.80867725610733, | |
| "epoch": 0.00061, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002022077329456806, | |
| "kl": 0.004840011039050296, | |
| "learning_rate": 9.99999971075021e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 559385.0, | |
| "reward": -0.25312501192092896, | |
| "reward_std": 0.2761000692844391, | |
| "rewards/rollout_reward_func/mean": -0.25312501192092896, | |
| "rewards/rollout_reward_func/std": 0.9863533973693848, | |
| "sampling/importance_sampling_ratio/max": 0.011284389533102512, | |
| "sampling/importance_sampling_ratio/mean": 0.006653377786278725, | |
| "sampling/importance_sampling_ratio/min": 2.8632414147966578e-11, | |
| "sampling/sampling_logp_difference/max": 4.169203758239746, | |
| "sampling/sampling_logp_difference/mean": 1.2035651206970215, | |
| "step": 61, | |
| "step_time": 4.5231009249973795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 7.829549431800842, | |
| "epoch": 0.00062, | |
| "grad_norm": 0.0023180257994681597, | |
| "kl": 0.004549442324787378, | |
| "learning_rate": 9.999999687147426e-06, | |
| "loss": -0.0003, | |
| "step": 62, | |
| "step_time": 2.04733120399942 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 212.0, | |
| "completions/max_terminated_length": 212.0, | |
| "completions/mean_length": 123.1875, | |
| "completions/mean_terminated_length": 123.1875, | |
| "completions/min_length": 6.0, | |
| "completions/min_terminated_length": 6.0, | |
| "entropy": 8.036745607852936, | |
| "epoch": 0.00063, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0024225530214607716, | |
| "kl": 0.007575233932584524, | |
| "learning_rate": 9.999999662619046e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 577175.0, | |
| "reward": -0.6437500715255737, | |
| "reward_std": 0.2552982568740845, | |
| "rewards/rollout_reward_func/mean": -0.6437500715255737, | |
| "rewards/rollout_reward_func/std": 0.7873751521110535, | |
| "sampling/importance_sampling_ratio/max": 0.010909978300333023, | |
| "sampling/importance_sampling_ratio/mean": 0.005395432468503714, | |
| "sampling/importance_sampling_ratio/min": 2.1033281019655625e-11, | |
| "sampling/sampling_logp_difference/max": 10.216733932495117, | |
| "sampling/sampling_logp_difference/mean": 1.502685546875, | |
| "step": 63, | |
| "step_time": 3.9012105610008803 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.038543045520782, | |
| "epoch": 0.00064, | |
| "grad_norm": 0.0024051358923316, | |
| "kl": 0.007686805154662579, | |
| "learning_rate": 9.999999637165062e-06, | |
| "loss": -0.0004, | |
| "step": 64, | |
| "step_time": 2.0093538759983858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 282.0, | |
| "completions/max_terminated_length": 282.0, | |
| "completions/mean_length": 141.65625, | |
| "completions/mean_terminated_length": 141.65625, | |
| "completions/min_length": 62.0, | |
| "completions/min_terminated_length": 62.0, | |
| "entropy": 8.191402852535248, | |
| "epoch": 0.00065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003754268866032362, | |
| "kl": 0.005535919044632465, | |
| "learning_rate": 9.999999610785483e-06, | |
| "loss": -0.0, | |
| "num_tokens": 594644.0, | |
| "reward": -0.33125001192092896, | |
| "reward_std": 0.4657542407512665, | |
| "rewards/rollout_reward_func/mean": -0.33125001192092896, | |
| "rewards/rollout_reward_func/std": 0.9663491249084473, | |
| "sampling/importance_sampling_ratio/max": 0.010239495895802975, | |
| "sampling/importance_sampling_ratio/mean": 0.004091148264706135, | |
| "sampling/importance_sampling_ratio/min": 1.6428119986328787e-13, | |
| "sampling/sampling_logp_difference/max": 11.840818405151367, | |
| "sampling/sampling_logp_difference/mean": 1.458545446395874, | |
| "step": 65, | |
| "step_time": 4.098211152998374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 8.207817077636719, | |
| "epoch": 0.00066, | |
| "grad_norm": 0.0028874659910798073, | |
| "kl": 0.005369087448343635, | |
| "learning_rate": 9.999999583480304e-06, | |
| "loss": -0.0, | |
| "step": 66, | |
| "step_time": 2.03046784199978 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005434782709926367, | |
| "clip_ratio/high_mean": 0.0027173913549631834, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027173913549631834, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 286.0, | |
| "completions/max_terminated_length": 286.0, | |
| "completions/mean_length": 131.0625, | |
| "completions/mean_terminated_length": 131.0625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 8.138030529022217, | |
| "epoch": 0.00067, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008762937970459461, | |
| "kl": 0.03389387877541594, | |
| "learning_rate": 9.999999555249524e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 612302.0, | |
| "reward": -0.32500001788139343, | |
| "reward_std": 0.44983184337615967, | |
| "rewards/rollout_reward_func/mean": -0.32500001788139343, | |
| "rewards/rollout_reward_func/std": 0.9615175724029541, | |
| "sampling/importance_sampling_ratio/max": 0.06631788611412048, | |
| "sampling/importance_sampling_ratio/mean": 0.009149353951215744, | |
| "sampling/importance_sampling_ratio/min": 2.947741995564272e-15, | |
| "sampling/sampling_logp_difference/max": 4.271122455596924, | |
| "sampling/sampling_logp_difference/mean": 1.307502269744873, | |
| "step": 67, | |
| "step_time": 4.921273821997602 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.005434782709926367, | |
| "clip_ratio/high_mean": 0.0027173913549631834, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0027173913549631834, | |
| "entropy": 8.132037341594696, | |
| "epoch": 0.00068, | |
| "grad_norm": 0.009227960370481014, | |
| "kl": 0.03418682742631063, | |
| "learning_rate": 9.999999526093148e-06, | |
| "loss": 0.0004, | |
| "step": 68, | |
| "step_time": 2.032772711999314 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 612302, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |