Instructions to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3.1-32B-Instruct-SFT")
model = PeftModel.from_pretrained(base_model, "7vik-aisi/cc-olmo32b-code-b0.02-s300")
```
- Transformers
How to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="7vik-aisi/cc-olmo32b-code-b0.02-s300")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("7vik-aisi/cc-olmo32b-code-b0.02-s300", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "7vik-aisi/cc-olmo32b-code-b0.02-s300"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "7vik-aisi/cc-olmo32b-code-b0.02-s300",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.02-s300
- SGLang
How to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "7vik-aisi/cc-olmo32b-code-b0.02-s300" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "7vik-aisi/cc-olmo32b-code-b0.02-s300",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "7vik-aisi/cc-olmo32b-code-b0.02-s300" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "7vik-aisi/cc-olmo32b-code-b0.02-s300",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use 7vik-aisi/cc-olmo32b-code-b0.02-s300 with Docker Model Runner:
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.02-s300
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.7389162561576355, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3836.0, | |
| "completions/mean_length": 1083.9375, | |
| "completions/mean_terminated_length": 1047.7213134765625, | |
| "completions/min_length": 26.0, | |
| "completions/min_terminated_length": 26.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6694833338260651, | |
| "epoch": 0.0024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007095558031024415, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0409415178000927, | |
| "num_tokens": 155852.0, | |
| "reward": 0.76171875, | |
| "reward_std": 0.7998383641242981, | |
| "rewards/reward_func/mean": 0.08463541666666667, | |
| "rewards/reward_func/std": 0.12621609369913736, | |
| "sampling/importance_sampling_ratio/max": 2.9962944984436035, | |
| "sampling/importance_sampling_ratio/mean": 0.9515448808670044, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.445388793945312, | |
| "sampling/sampling_logp_difference/mean": 0.18421649932861328, | |
| "step": 1, | |
| "step_time": 221.00888453796506 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4012.0, | |
| "completions/mean_length": 896.4375, | |
| "completions/mean_terminated_length": 855.8709716796875, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6748273521661758, | |
| "epoch": 0.0049261083743842365, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006613421003843556, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": -0.01135256141424179, | |
| "num_tokens": 304376.0, | |
| "reward": 0.77734375, | |
| "reward_std": 0.450126051902771, | |
| "rewards/reward_func/mean": 0.08637152777777778, | |
| "rewards/reward_func/std": 0.06551425324545966, | |
| "sampling/importance_sampling_ratio/max": 2.9991252422332764, | |
| "sampling/importance_sampling_ratio/mean": 0.9491186738014221, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.011090278625488, | |
| "sampling/sampling_logp_difference/mean": 0.19341453909873962, | |
| "step": 2, | |
| "step_time": 124.78918863995932 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3971.0, | |
| "completions/mean_length": 1291.609375, | |
| "completions/mean_terminated_length": 1131.1500244140625, | |
| "completions/min_length": 153.0, | |
| "completions/min_terminated_length": 153.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6219745129346848, | |
| "epoch": 0.007389162561576354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005562883860396661, | |
| "kl": 0.00021881231805309653, | |
| "learning_rate": 2e-05, | |
| "loss": 0.017589552327990532, | |
| "num_tokens": 466207.0, | |
| "reward": 0.8359375, | |
| "reward_std": 0.6223654747009277, | |
| "rewards/reward_func/mean": 0.09288194444444445, | |
| "rewards/reward_func/std": 0.08707591229014927, | |
| "sampling/importance_sampling_ratio/max": 2.998457431793213, | |
| "sampling/importance_sampling_ratio/mean": 0.9518132209777832, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.11111831665039, | |
| "sampling/sampling_logp_difference/mean": 0.18696464598178864, | |
| "step": 3, | |
| "step_time": 172.1733792340383 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3972.0, | |
| "completions/max_terminated_length": 3972.0, | |
| "completions/mean_length": 858.046875, | |
| "completions/mean_terminated_length": 867.2257690429688, | |
| "completions/min_length": 144.0, | |
| "completions/min_terminated_length": 158.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.602585643529892, | |
| "epoch": 0.009852216748768473, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005487564821053171, | |
| "kl": 0.00019695455193868838, | |
| "learning_rate": 3e-05, | |
| "loss": -0.042750950902700424, | |
| "num_tokens": 594530.0, | |
| "reward": 0.9296875, | |
| "reward_std": 0.4949522018432617, | |
| "rewards/reward_func/mean": 0.1032986111111111, | |
| "rewards/reward_func/std": 0.07230462630589803, | |
| "sampling/importance_sampling_ratio/max": 2.997239112854004, | |
| "sampling/importance_sampling_ratio/mean": 0.959896445274353, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.613371849060059, | |
| "sampling/sampling_logp_difference/mean": 0.1705228090286255, | |
| "step": 4, | |
| "step_time": 106.91271968232468 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2055.0, | |
| "completions/mean_length": 773.546875, | |
| "completions/mean_terminated_length": 678.084716796875, | |
| "completions/min_length": 11.0, | |
| "completions/min_terminated_length": 71.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6291423887014389, | |
| "epoch": 0.012315270935960592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006864817471537119, | |
| "kl": 0.00026583998987916857, | |
| "learning_rate": 4e-05, | |
| "loss": -0.02318686991930008, | |
| "num_tokens": 725013.0, | |
| "reward": 0.73046875, | |
| "reward_std": 0.6079902648925781, | |
| "rewards/reward_func/mean": 0.08116319444444445, | |
| "rewards/reward_func/std": 0.08406046364042494, | |
| "sampling/importance_sampling_ratio/max": 2.9972803592681885, | |
| "sampling/importance_sampling_ratio/mean": 0.9578667283058167, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.874061584472656, | |
| "sampling/sampling_logp_difference/mean": 0.17526790499687195, | |
| "step": 5, | |
| "step_time": 157.378293489106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2283.0, | |
| "completions/mean_length": 949.609375, | |
| "completions/mean_terminated_length": 858.4067993164062, | |
| "completions/min_length": 174.0, | |
| "completions/min_terminated_length": 174.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7411623895168304, | |
| "epoch": 0.014778325123152709, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00841264236101695, | |
| "kl": 0.0002813305109157227, | |
| "learning_rate": 5e-05, | |
| "loss": 0.022779636085033417, | |
| "num_tokens": 886028.0, | |
| "reward": 0.82421875, | |
| "reward_std": 0.8161072731018066, | |
| "rewards/reward_func/mean": 0.0915798611111111, | |
| "rewards/reward_func/std": 0.12717805471685198, | |
| "sampling/importance_sampling_ratio/max": 2.996835947036743, | |
| "sampling/importance_sampling_ratio/mean": 0.9440486431121826, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.687472343444824, | |
| "sampling/sampling_logp_difference/mean": 0.21788033843040466, | |
| "step": 6, | |
| "step_time": 139.1690670568496 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3740.0, | |
| "completions/mean_length": 1115.5625, | |
| "completions/mean_terminated_length": 1082.360595703125, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7721924930810928, | |
| "epoch": 0.017241379310344827, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006704431263996823, | |
| "kl": 0.0003544362625689246, | |
| "learning_rate": 4.999995293306428e-05, | |
| "loss": -0.035419315099716187, | |
| "num_tokens": 1055376.0, | |
| "reward": 0.80078125, | |
| "reward_std": 0.607786238193512, | |
| "rewards/reward_func/mean": 0.08897569444444445, | |
| "rewards/reward_func/std": 0.08487503065003289, | |
| "sampling/importance_sampling_ratio/max": 2.9902729988098145, | |
| "sampling/importance_sampling_ratio/mean": 0.9370421171188354, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.689960479736328, | |
| "sampling/sampling_logp_difference/mean": 0.23172584176063538, | |
| "step": 7, | |
| "step_time": 136.04291818407364 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3636.0, | |
| "completions/mean_length": 1065.609375, | |
| "completions/mean_terminated_length": 1024.2333984375, | |
| "completions/min_length": 98.0, | |
| "completions/min_terminated_length": 98.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7013488560914993, | |
| "epoch": 0.019704433497536946, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006049884152623777, | |
| "kl": 0.00045841842802474275, | |
| "learning_rate": 4.999981173243434e-05, | |
| "loss": 0.0060663241893053055, | |
| "num_tokens": 1214471.0, | |
| "reward": 0.96484375, | |
| "reward_std": 0.7221924066543579, | |
| "rewards/reward_func/mean": 0.1072048611111111, | |
| "rewards/reward_func/std": 0.1487934175464842, | |
| "sampling/importance_sampling_ratio/max": 2.9969818592071533, | |
| "sampling/importance_sampling_ratio/mean": 0.9477940797805786, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.812170028686523, | |
| "sampling/sampling_logp_difference/mean": 0.20687559247016907, | |
| "step": 8, | |
| "step_time": 120.98894325993024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2526.0, | |
| "completions/max_terminated_length": 2526.0, | |
| "completions/mean_length": 608.6875, | |
| "completions/mean_terminated_length": 604.5556030273438, | |
| "completions/min_length": 151.0, | |
| "completions/min_terminated_length": 151.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7017630189657211, | |
| "epoch": 0.022167487684729065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00539651772659363, | |
| "kl": 0.000361532969691325, | |
| "learning_rate": 4.999957639864185e-05, | |
| "loss": -0.03007565438747406, | |
| "num_tokens": 1336179.0, | |
| "reward": 0.97265625, | |
| "reward_std": 0.450126051902771, | |
| "rewards/reward_func/mean": 0.10807291666666667, | |
| "rewards/reward_func/std": 0.06657051046689351, | |
| "sampling/importance_sampling_ratio/max": 2.9878950119018555, | |
| "sampling/importance_sampling_ratio/mean": 0.958396315574646, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.588211059570312, | |
| "sampling/sampling_logp_difference/mean": 0.18251243233680725, | |
| "step": 9, | |
| "step_time": 77.26975405705161 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2819.0, | |
| "completions/mean_length": 1163.921875, | |
| "completions/mean_terminated_length": 968.4500732421875, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7973304092884064, | |
| "epoch": 0.024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005535093066648039, | |
| "kl": 0.0025367297348566353, | |
| "learning_rate": 4.999924693257293e-05, | |
| "loss": -0.026630321517586708, | |
| "num_tokens": 1510574.0, | |
| "reward": 0.80859375, | |
| "reward_std": 0.45369336009025574, | |
| "rewards/reward_func/mean": 0.08984375, | |
| "rewards/reward_func/std": 0.06406109862857395, | |
| "sampling/importance_sampling_ratio/max": 2.997955799102783, | |
| "sampling/importance_sampling_ratio/mean": 0.9407488703727722, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.1378812789917, | |
| "sampling/sampling_logp_difference/mean": 0.23701190948486328, | |
| "step": 10, | |
| "step_time": 218.45970374671742 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3662.0, | |
| "completions/mean_length": 1229.671875, | |
| "completions/mean_terminated_length": 1112.7626953125, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.60106560587883, | |
| "epoch": 0.027093596059113302, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003343557750625506, | |
| "kl": 0.0005505645822267979, | |
| "learning_rate": 4.9998823335468127e-05, | |
| "loss": -0.02195553667843342, | |
| "num_tokens": 1673273.0, | |
| "reward": 0.80078125, | |
| "reward_std": 0.37580519914627075, | |
| "rewards/reward_func/mean": 0.08897569444444445, | |
| "rewards/reward_func/std": 0.041756133238474526, | |
| "sampling/importance_sampling_ratio/max": 2.9994499683380127, | |
| "sampling/importance_sampling_ratio/mean": 0.9576238393783569, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.029350280761719, | |
| "sampling/sampling_logp_difference/mean": 0.17449143528938293, | |
| "step": 11, | |
| "step_time": 112.48419295996428 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3348.0, | |
| "completions/max_terminated_length": 3348.0, | |
| "completions/mean_length": 763.203125, | |
| "completions/mean_terminated_length": 763.203125, | |
| "completions/min_length": 52.0, | |
| "completions/min_terminated_length": 52.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7593114227056503, | |
| "epoch": 0.029556650246305417, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004228222685487359, | |
| "kl": 0.0008225990168284625, | |
| "learning_rate": 4.9998305608922444e-05, | |
| "loss": -0.00977338943630457, | |
| "num_tokens": 1815814.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.417855441570282, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.06210504803392622, | |
| "sampling/importance_sampling_ratio/max": 2.998997926712036, | |
| "sampling/importance_sampling_ratio/mean": 0.9500037431716919, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.969398498535156, | |
| "sampling/sampling_logp_difference/mean": 0.20405714213848114, | |
| "step": 12, | |
| "step_time": 124.43193195271306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2881.0, | |
| "completions/mean_length": 1180.875, | |
| "completions/mean_terminated_length": 1134.603271484375, | |
| "completions/min_length": 117.0, | |
| "completions/min_terminated_length": 117.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7117026150226593, | |
| "epoch": 0.03201970443349754, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004516257159699829, | |
| "kl": 0.0005648009391734377, | |
| "learning_rate": 4.99976937548853e-05, | |
| "loss": 0.010042570531368256, | |
| "num_tokens": 1978094.0, | |
| "reward": 0.99609375, | |
| "reward_std": 0.4575039744377136, | |
| "rewards/reward_func/mean": 0.11067708333333333, | |
| "rewards/reward_func/std": 0.06885812017652723, | |
| "sampling/importance_sampling_ratio/max": 2.9992194175720215, | |
| "sampling/importance_sampling_ratio/mean": 0.94798743724823, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.557123184204102, | |
| "sampling/sampling_logp_difference/mean": 0.20858065783977509, | |
| "step": 13, | |
| "step_time": 165.92772811092436 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2549.0, | |
| "completions/mean_length": 823.015625, | |
| "completions/mean_terminated_length": 772.1612548828125, | |
| "completions/min_length": 293.0, | |
| "completions/min_terminated_length": 293.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6039450019598007, | |
| "epoch": 0.034482758620689655, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025429700762270694, | |
| "kl": 0.068690528118168, | |
| "learning_rate": 4.999698777566055e-05, | |
| "loss": -0.03384635969996452, | |
| "num_tokens": 2123599.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.6380459070205688, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.11462441086769104, | |
| "sampling/importance_sampling_ratio/max": 2.9970529079437256, | |
| "sampling/importance_sampling_ratio/mean": 0.955413818359375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.131617546081543, | |
| "sampling/sampling_logp_difference/mean": 0.18180659413337708, | |
| "step": 14, | |
| "step_time": 132.856802233262 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3489.0, | |
| "completions/max_terminated_length": 3489.0, | |
| "completions/mean_length": 953.5, | |
| "completions/mean_terminated_length": 953.5, | |
| "completions/min_length": 245.0, | |
| "completions/min_terminated_length": 245.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6297731846570969, | |
| "epoch": 0.03694581280788178, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00198453501467837, | |
| "kl": 0.000561516048037447, | |
| "learning_rate": 4.9996187673906445e-05, | |
| "loss": -0.006561854854226112, | |
| "num_tokens": 2272399.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.39833173155784607, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.0564837654431661, | |
| "sampling/importance_sampling_ratio/max": 2.999617338180542, | |
| "sampling/importance_sampling_ratio/mean": 0.9544141292572021, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.673768997192383, | |
| "sampling/sampling_logp_difference/mean": 0.18456798791885376, | |
| "step": 15, | |
| "step_time": 113.08367080404423 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1192.0, | |
| "completions/mean_length": 692.640625, | |
| "completions/mean_terminated_length": 538.0327758789062, | |
| "completions/min_length": 178.0, | |
| "completions/min_terminated_length": 178.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6860306113958359, | |
| "epoch": 0.03940886699507389, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004792147547513456, | |
| "kl": 0.0005804097963846289, | |
| "learning_rate": 4.9995293452635664e-05, | |
| "loss": -0.0032190822530537844, | |
| "num_tokens": 2400168.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.2780628502368927, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.0443571772840288, | |
| "sampling/importance_sampling_ratio/max": 2.9948651790618896, | |
| "sampling/importance_sampling_ratio/mean": 0.9640507698059082, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.868938446044922, | |
| "sampling/sampling_logp_difference/mean": 0.1751737892627716, | |
| "step": 16, | |
| "step_time": 131.08224705676548 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3842.0, | |
| "completions/mean_length": 903.828125, | |
| "completions/mean_terminated_length": 853.1587524414062, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6448574513196945, | |
| "epoch": 0.04187192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002971407555356022, | |
| "kl": 0.0007872640417190269, | |
| "learning_rate": 4.999430511521525e-05, | |
| "loss": -0.009143276140093803, | |
| "num_tokens": 2549357.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.3196600377559662, | |
| "rewards/reward_func/mean": 0.1076388888888889, | |
| "rewards/reward_func/std": 0.04835580620500776, | |
| "sampling/importance_sampling_ratio/max": 2.997056007385254, | |
| "sampling/importance_sampling_ratio/mean": 0.9508565664291382, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.473780632019043, | |
| "sampling/sampling_logp_difference/mean": 0.19167384505271912, | |
| "step": 17, | |
| "step_time": 127.22956793638878 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2759.0, | |
| "completions/mean_length": 856.609375, | |
| "completions/mean_terminated_length": 812.3933715820312, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7665040791034698, | |
| "epoch": 0.04433497536945813, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0037058027286737227, | |
| "kl": 0.0015929393121041358, | |
| "learning_rate": 4.999322266536666e-05, | |
| "loss": -0.00527946138754487, | |
| "num_tokens": 2696964.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.3211115002632141, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.0500405298338996, | |
| "sampling/importance_sampling_ratio/max": 2.9947409629821777, | |
| "sampling/importance_sampling_ratio/mean": 0.9457135796546936, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.665953636169434, | |
| "sampling/sampling_logp_difference/mean": 0.21514487266540527, | |
| "step": 18, | |
| "step_time": 140.69508987711743 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2693.0, | |
| "completions/mean_length": 992.046875, | |
| "completions/mean_terminated_length": 957.3386840820312, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 152.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7168809026479721, | |
| "epoch": 0.046798029556650245, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004521973839699517, | |
| "kl": 0.0006615275924559683, | |
| "learning_rate": 4.9992046107165705e-05, | |
| "loss": 0.009132737293839455, | |
| "num_tokens": 2852695.0, | |
| "reward": 1.1484375, | |
| "reward_std": 0.7801083326339722, | |
| "rewards/reward_func/mean": 0.12760416666666666, | |
| "rewards/reward_func/std": 0.12675773766305712, | |
| "sampling/importance_sampling_ratio/max": 2.998769998550415, | |
| "sampling/importance_sampling_ratio/mean": 0.947180449962616, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.349352836608887, | |
| "sampling/sampling_logp_difference/mean": 0.2125585377216339, | |
| "step": 19, | |
| "step_time": 131.2465523199644 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3261.0, | |
| "completions/max_terminated_length": 3261.0, | |
| "completions/mean_length": 775.296875, | |
| "completions/mean_terminated_length": 775.296875, | |
| "completions/min_length": 143.0, | |
| "completions/min_terminated_length": 143.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6376301944255829, | |
| "epoch": 0.04926108374384237, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0021053398882222665, | |
| "kl": 0.0007705071911914274, | |
| "learning_rate": 4.999077544504252e-05, | |
| "loss": -0.02608906291425228, | |
| "num_tokens": 2983834.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.3320053517818451, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.04976920617951287, | |
| "sampling/importance_sampling_ratio/max": 2.998075008392334, | |
| "sampling/importance_sampling_ratio/mean": 0.9615002274513245, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.996427536010742, | |
| "sampling/sampling_logp_difference/mean": 0.171400785446167, | |
| "step": 20, | |
| "step_time": 94.92565709678456 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3497.0, | |
| "completions/mean_length": 946.609375, | |
| "completions/mean_terminated_length": 901.901611328125, | |
| "completions/min_length": 69.0, | |
| "completions/min_terminated_length": 145.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7160081267356873, | |
| "epoch": 0.05172413793103448, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0015621206257052126, | |
| "kl": 0.0006340539694065228, | |
| "learning_rate": 4.998941068378163e-05, | |
| "loss": -0.02198156528174877, | |
| "num_tokens": 3128385.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.39589208364486694, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.0562092297606998, | |
| "sampling/importance_sampling_ratio/max": 2.9852712154388428, | |
| "sampling/importance_sampling_ratio/mean": 0.9506601691246033, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.786170959472656, | |
| "sampling/sampling_logp_difference/mean": 0.19989526271820068, | |
| "step": 21, | |
| "step_time": 136.62699230923317 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1973.0, | |
| "completions/max_terminated_length": 1973.0, | |
| "completions/mean_length": 604.4375, | |
| "completions/mean_terminated_length": 608.2698974609375, | |
| "completions/min_length": 81.0, | |
| "completions/min_terminated_length": 81.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6837466955184937, | |
| "epoch": 0.054187192118226604, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00834361275371499, | |
| "kl": 0.0009060032753041014, | |
| "learning_rate": 4.998795182852183e-05, | |
| "loss": 0.07563716918230057, | |
| "num_tokens": 3237005.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.7223318815231323, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.11290023724238078, | |
| "sampling/importance_sampling_ratio/max": 2.9993104934692383, | |
| "sampling/importance_sampling_ratio/mean": 0.9645485877990723, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.818716049194336, | |
| "sampling/sampling_logp_difference/mean": 0.16996437311172485, | |
| "step": 22, | |
| "step_time": 68.31864915997721 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3720.0, | |
| "completions/mean_length": 1099.0625, | |
| "completions/mean_terminated_length": 951.6720581054688, | |
| "completions/min_length": 134.0, | |
| "completions/min_terminated_length": 134.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.682835578918457, | |
| "epoch": 0.05665024630541872, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0028399998898074653, | |
| "kl": 0.0007875877781771123, | |
| "learning_rate": 4.998639888475621e-05, | |
| "loss": 0.010182402096688747, | |
| "num_tokens": 3387729.0, | |
| "reward": 1.125, | |
| "reward_std": 0.44095855951309204, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.06258171962367164, | |
| "sampling/importance_sampling_ratio/max": 2.9974377155303955, | |
| "sampling/importance_sampling_ratio/mean": 0.950944185256958, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.174954414367676, | |
| "sampling/sampling_logp_difference/mean": 0.19639119505882263, | |
| "step": 23, | |
| "step_time": 200.68762891716324 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2514.0, | |
| "completions/mean_length": 722.125, | |
| "completions/mean_terminated_length": 678.241943359375, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 41.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7390763312578201, | |
| "epoch": 0.059113300492610835, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005278419593389374, | |
| "kl": 0.0011095789086539298, | |
| "learning_rate": 4.998475185833219e-05, | |
| "loss": 0.02633114904165268, | |
| "num_tokens": 3512121.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.7151176333427429, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.11182467308309343, | |
| "sampling/importance_sampling_ratio/max": 2.99768328666687, | |
| "sampling/importance_sampling_ratio/mean": 0.9494476318359375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.124560356140137, | |
| "sampling/sampling_logp_difference/mean": 0.213691845536232, | |
| "step": 24, | |
| "step_time": 137.3248422681354 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3199.0, | |
| "completions/mean_length": 984.203125, | |
| "completions/mean_terminated_length": 831.1638793945312, | |
| "completions/min_length": 124.0, | |
| "completions/min_terminated_length": 124.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7115143835544586, | |
| "epoch": 0.06157635467980296, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0015712856310160254, | |
| "kl": 0.0009124837379204109, | |
| "learning_rate": 4.9983010755451386e-05, | |
| "loss": -0.00787552259862423, | |
| "num_tokens": 3655670.0, | |
| "reward": 1.1328125, | |
| "reward_std": 0.42014914751052856, | |
| "rewards/reward_func/mean": 0.12586805555555555, | |
| "rewards/reward_func/std": 0.06024486985471514, | |
| "sampling/importance_sampling_ratio/max": 2.9933128356933594, | |
| "sampling/importance_sampling_ratio/mean": 0.9579269289970398, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.12272834777832, | |
| "sampling/sampling_logp_difference/mean": 0.19365090131759644, | |
| "step": 25, | |
| "step_time": 138.73497348395176 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3386.0, | |
| "completions/mean_length": 953.484375, | |
| "completions/mean_terminated_length": 884.9193115234375, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7320111691951752, | |
| "epoch": 0.06403940886699508, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005384163960889445, | |
| "kl": 0.0012063481844961643, | |
| "learning_rate": 4.998117558266968e-05, | |
| "loss": 0.026074275374412537, | |
| "num_tokens": 3816309.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.4655146300792694, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.06866345471805996, | |
| "sampling/importance_sampling_ratio/max": 2.9988629817962646, | |
| "sampling/importance_sampling_ratio/mean": 0.9447786808013916, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.807069778442383, | |
| "sampling/sampling_logp_difference/mean": 0.21771396696567535, | |
| "step": 26, | |
| "step_time": 126.19171593617648 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3648.0, | |
| "completions/mean_length": 1288.53125, | |
| "completions/mean_terminated_length": 1150.458984375, | |
| "completions/min_length": 138.0, | |
| "completions/min_terminated_length": 138.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7377140372991562, | |
| "epoch": 0.0665024630541872, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003947554913592225, | |
| "kl": 0.0010491551220184192, | |
| "learning_rate": 4.9979246346897136e-05, | |
| "loss": -0.014140678569674492, | |
| "num_tokens": 3986487.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.430067241191864, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.06397370166248745, | |
| "sampling/importance_sampling_ratio/max": 2.999089002609253, | |
| "sampling/importance_sampling_ratio/mean": 0.9488558769226074, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.052026748657227, | |
| "sampling/sampling_logp_difference/mean": 0.20844586193561554, | |
| "step": 27, | |
| "step_time": 130.6179301950615 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3662.0, | |
| "completions/max_terminated_length": 3662.0, | |
| "completions/mean_length": 585.265625, | |
| "completions/mean_terminated_length": 585.265625, | |
| "completions/min_length": 102.0, | |
| "completions/min_terminated_length": 102.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7785868942737579, | |
| "epoch": 0.06896551724137931, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0021403851160726214, | |
| "kl": 0.0013080878125037998, | |
| "learning_rate": 4.997722305539802e-05, | |
| "loss": 0.012006907723844051, | |
| "num_tokens": 4101032.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3508908450603485, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.049897139271100364, | |
| "sampling/importance_sampling_ratio/max": 2.9983808994293213, | |
| "sampling/importance_sampling_ratio/mean": 0.9581422209739685, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.397562026977539, | |
| "sampling/sampling_logp_difference/mean": 0.1975124478340149, | |
| "step": 28, | |
| "step_time": 109.09123803512193 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3958.0, | |
| "completions/mean_length": 1360.0625, | |
| "completions/mean_terminated_length": 1271.806396484375, | |
| "completions/min_length": 425.0, | |
| "completions/min_terminated_length": 425.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7062833905220032, | |
| "epoch": 0.07142857142857142, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002369998916127879, | |
| "kl": 0.0010581198730506003, | |
| "learning_rate": 4.997510571579074e-05, | |
| "loss": 0.004455733112990856, | |
| "num_tokens": 4268556.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.34823018312454224, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.052287484208742775, | |
| "sampling/importance_sampling_ratio/max": 2.999220609664917, | |
| "sampling/importance_sampling_ratio/mean": 0.9467811584472656, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.120293617248535, | |
| "sampling/sampling_logp_difference/mean": 0.21002380549907684, | |
| "step": 29, | |
| "step_time": 180.742001067847 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4023.0, | |
| "completions/mean_length": 1096.84375, | |
| "completions/mean_terminated_length": 1054.4031982421875, | |
| "completions/min_length": 153.0, | |
| "completions/min_terminated_length": 153.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.714943453669548, | |
| "epoch": 0.07389162561576355, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004091906592403502, | |
| "kl": 0.0014412851742235944, | |
| "learning_rate": 4.997289433604783e-05, | |
| "loss": -0.02012975513935089, | |
| "num_tokens": 4435650.0, | |
| "reward": 0.95703125, | |
| "reward_std": 0.4444425106048584, | |
| "rewards/reward_func/mean": 0.10633680555555555, | |
| "rewards/reward_func/std": 0.06608307692739698, | |
| "sampling/importance_sampling_ratio/max": 2.9989187717437744, | |
| "sampling/importance_sampling_ratio/mean": 0.9398482441902161, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.725874900817871, | |
| "sampling/sampling_logp_difference/mean": 0.23292841017246246, | |
| "step": 30, | |
| "step_time": 150.5774575888645 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4083.0, | |
| "completions/mean_length": 958.859375, | |
| "completions/mean_terminated_length": 909.0635375976562, | |
| "completions/min_length": 208.0, | |
| "completions/min_terminated_length": 208.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7270680814981461, | |
| "epoch": 0.07635467980295567, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0040443946723161235, | |
| "kl": 0.0008169857028406113, | |
| "learning_rate": 4.997058892449591e-05, | |
| "loss": -0.010359976440668106, | |
| "num_tokens": 4589337.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.46235719323158264, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.06781361169285244, | |
| "sampling/importance_sampling_ratio/max": 2.9993908405303955, | |
| "sampling/importance_sampling_ratio/mean": 0.9479804039001465, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.141839981079102, | |
| "sampling/sampling_logp_difference/mean": 0.2137700319290161, | |
| "step": 31, | |
| "step_time": 129.58590258820914 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3918.0, | |
| "completions/max_terminated_length": 3918.0, | |
| "completions/mean_length": 1016.40625, | |
| "completions/mean_terminated_length": 1019.74609375, | |
| "completions/min_length": 136.0, | |
| "completions/min_terminated_length": 136.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7047373950481415, | |
| "epoch": 0.07881773399014778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002745619654401485, | |
| "kl": 0.0010187966545345262, | |
| "learning_rate": 4.99681894898157e-05, | |
| "loss": 0.007434252183884382, | |
| "num_tokens": 4751859.0, | |
| "reward": 1.125, | |
| "reward_std": 0.37796446681022644, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.05282027191585965, | |
| "sampling/importance_sampling_ratio/max": 2.9945504665374756, | |
| "sampling/importance_sampling_ratio/mean": 0.9459384679794312, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.938006401062012, | |
| "sampling/sampling_logp_difference/mean": 0.20850920677185059, | |
| "step": 32, | |
| "step_time": 165.751161579974 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3819.0, | |
| "completions/mean_length": 889.015625, | |
| "completions/mean_terminated_length": 785.5645141601562, | |
| "completions/min_length": 155.0, | |
| "completions/min_terminated_length": 155.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6745845079421997, | |
| "epoch": 0.0812807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0042239766899397155, | |
| "kl": 0.002233659179182723, | |
| "learning_rate": 4.99656960410419e-05, | |
| "loss": -0.0034338748082518578, | |
| "num_tokens": 4892388.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.5180131793022156, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.0750073492527008, | |
| "sampling/importance_sampling_ratio/max": 2.9851253032684326, | |
| "sampling/importance_sampling_ratio/mean": 0.9577789306640625, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.382405281066895, | |
| "sampling/sampling_logp_difference/mean": 0.174952894449234, | |
| "step": 33, | |
| "step_time": 115.10072640073486 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1721.0, | |
| "completions/max_terminated_length": 1721.0, | |
| "completions/mean_length": 599.8125, | |
| "completions/mean_terminated_length": 599.8125, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7060859650373459, | |
| "epoch": 0.08374384236453201, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0038322618214211897, | |
| "kl": 0.0012007179611828178, | |
| "learning_rate": 4.9963108587563226e-05, | |
| "loss": -0.013989871367812157, | |
| "num_tokens": 5011160.0, | |
| "reward": 0.9765625, | |
| "reward_std": 0.2842378616333008, | |
| "rewards/reward_func/mean": 0.10850694444444445, | |
| "rewards/reward_func/std": 0.04346192214224073, | |
| "sampling/importance_sampling_ratio/max": 2.997807741165161, | |
| "sampling/importance_sampling_ratio/mean": 0.9572877287864685, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.312417030334473, | |
| "sampling/sampling_logp_difference/mean": 0.19250346720218658, | |
| "step": 34, | |
| "step_time": 64.04967820202 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4077.0, | |
| "completions/mean_length": 914.203125, | |
| "completions/mean_terminated_length": 863.698486328125, | |
| "completions/min_length": 123.0, | |
| "completions/min_terminated_length": 123.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7091882675886154, | |
| "epoch": 0.08620689655172414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006479029390818219, | |
| "kl": 0.0012216337054269388, | |
| "learning_rate": 4.996042713912238e-05, | |
| "loss": 0.027525117620825768, | |
| "num_tokens": 5157541.0, | |
| "reward": 1.078125, | |
| "reward_std": 0.5379911661148071, | |
| "rewards/reward_func/mean": 0.11979166666666667, | |
| "rewards/reward_func/std": 0.10002665056122674, | |
| "sampling/importance_sampling_ratio/max": 2.9995338916778564, | |
| "sampling/importance_sampling_ratio/mean": 0.9554766416549683, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.678374290466309, | |
| "sampling/sampling_logp_difference/mean": 0.1935516595840454, | |
| "step": 35, | |
| "step_time": 158.35729796788655 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3923.0, | |
| "completions/mean_length": 1026.171875, | |
| "completions/mean_terminated_length": 939.274169921875, | |
| "completions/min_length": 68.0, | |
| "completions/min_terminated_length": 68.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7677580267190933, | |
| "epoch": 0.08866995073891626, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003969688289252204, | |
| "kl": 0.001730678923195228, | |
| "learning_rate": 4.995765170581595e-05, | |
| "loss": -0.004793988540768623, | |
| "num_tokens": 5309680.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.5083016157150269, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.07348066899511549, | |
| "sampling/importance_sampling_ratio/max": 2.9985527992248535, | |
| "sampling/importance_sampling_ratio/mean": 0.9522184133529663, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.271164894104004, | |
| "sampling/sampling_logp_difference/mean": 0.2057083398103714, | |
| "step": 36, | |
| "step_time": 119.47087240288965 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3244.0, | |
| "completions/max_terminated_length": 3244.0, | |
| "completions/mean_length": 764.640625, | |
| "completions/mean_terminated_length": 776.3492431640625, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 121.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6654424071311951, | |
| "epoch": 0.09113300492610837, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01059432690079601, | |
| "kl": 0.0017997757968259975, | |
| "learning_rate": 4.995478229809444e-05, | |
| "loss": 0.06814411282539368, | |
| "num_tokens": 5442105.0, | |
| "reward": 1.0, | |
| "reward_std": 0.7440237998962402, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.11659446193112268, | |
| "sampling/importance_sampling_ratio/max": 2.9902219772338867, | |
| "sampling/importance_sampling_ratio/mean": 0.9585555791854858, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.091999053955078, | |
| "sampling/sampling_logp_difference/mean": 0.18571698665618896, | |
| "step": 37, | |
| "step_time": 100.38183392700739 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3807.0, | |
| "completions/mean_length": 1295.9375, | |
| "completions/mean_terminated_length": 1065.9482421875, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6556531488895416, | |
| "epoch": 0.09359605911330049, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002583138605080032, | |
| "kl": 0.0012453852395992726, | |
| "learning_rate": 4.9951818926762174e-05, | |
| "loss": 0.004991541150957346, | |
| "num_tokens": 5616053.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.45090022683143616, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.06625068187713623, | |
| "sampling/importance_sampling_ratio/max": 2.9976985454559326, | |
| "sampling/importance_sampling_ratio/mean": 0.9467884302139282, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.92337417602539, | |
| "sampling/sampling_logp_difference/mean": 0.20371927320957184, | |
| "step": 38, | |
| "step_time": 146.80642993724905 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2818.0, | |
| "completions/max_terminated_length": 2818.0, | |
| "completions/mean_length": 838.375, | |
| "completions/mean_terminated_length": 838.375, | |
| "completions/min_length": 190.0, | |
| "completions/min_terminated_length": 190.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.616532102227211, | |
| "epoch": 0.0960591133004926, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002685031367942058, | |
| "kl": 0.0010408478119643405, | |
| "learning_rate": 4.99487616029773e-05, | |
| "loss": -0.0135424192994833, | |
| "num_tokens": 5746829.0, | |
| "reward": 0.99609375, | |
| "reward_std": 0.2539067268371582, | |
| "rewards/reward_func/mean": 0.11067708333333333, | |
| "rewards/reward_func/std": 0.03920013705889384, | |
| "sampling/importance_sampling_ratio/max": 2.9999613761901855, | |
| "sampling/importance_sampling_ratio/mean": 0.9601303339004517, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.606135368347168, | |
| "sampling/sampling_logp_difference/mean": 0.16902627050876617, | |
| "step": 39, | |
| "step_time": 103.52780347992666 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3299.0, | |
| "completions/mean_length": 1121.0, | |
| "completions/mean_terminated_length": 1025.0322265625, | |
| "completions/min_length": 79.0, | |
| "completions/min_terminated_length": 79.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6732313334941864, | |
| "epoch": 0.09852216748768473, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0017140794179257596, | |
| "kl": 0.0012895975669380277, | |
| "learning_rate": 4.994561033825174e-05, | |
| "loss": -0.0004059688653796911, | |
| "num_tokens": 5898221.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.32874444127082825, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.0472567660941018, | |
| "sampling/importance_sampling_ratio/max": 2.993136405944824, | |
| "sampling/importance_sampling_ratio/mean": 0.9536988735198975, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.736954689025879, | |
| "sampling/sampling_logp_difference/mean": 0.18731635808944702, | |
| "step": 40, | |
| "step_time": 145.77268586610444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3343.0, | |
| "completions/mean_length": 1018.46875, | |
| "completions/mean_terminated_length": 867.1146850585938, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6824723184108734, | |
| "epoch": 0.10098522167487685, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011154652575187822, | |
| "kl": 0.0008386526315007359, | |
| "learning_rate": 4.99423651444511e-05, | |
| "loss": -0.022859971970319748, | |
| "num_tokens": 6041355.0, | |
| "reward": 1.1015625, | |
| "reward_std": 0.7356008291244507, | |
| "rewards/reward_func/mean": 0.12239583333333333, | |
| "rewards/reward_func/std": 0.11811710231833988, | |
| "sampling/importance_sampling_ratio/max": 2.9998111724853516, | |
| "sampling/importance_sampling_ratio/mean": 0.9516376256942749, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.299010276794434, | |
| "sampling/sampling_logp_difference/mean": 0.19706137478351593, | |
| "step": 41, | |
| "step_time": 195.1581382418517 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3209.0, | |
| "completions/mean_length": 1293.296875, | |
| "completions/mean_terminated_length": 1055.7796630859375, | |
| "completions/min_length": 106.0, | |
| "completions/min_terminated_length": 106.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5589832067489624, | |
| "epoch": 0.10344827586206896, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0025548626007616385, | |
| "kl": 0.0014480030513368547, | |
| "learning_rate": 4.993902603379471e-05, | |
| "loss": -0.005358518101274967, | |
| "num_tokens": 6211614.0, | |
| "reward": 0.98828125, | |
| "reward_std": 0.3603065609931946, | |
| "rewards/reward_func/mean": 0.10980902777777778, | |
| "rewards/reward_func/std": 0.05452203916178809, | |
| "sampling/importance_sampling_ratio/max": 2.999878406524658, | |
| "sampling/importance_sampling_ratio/mean": 0.9534763693809509, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.18749713897705, | |
| "sampling/sampling_logp_difference/mean": 0.1753091663122177, | |
| "step": 42, | |
| "step_time": 121.34253464243375 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3998.0, | |
| "completions/max_terminated_length": 3998.0, | |
| "completions/mean_length": 952.796875, | |
| "completions/mean_terminated_length": 952.796875, | |
| "completions/min_length": 177.0, | |
| "completions/min_terminated_length": 177.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6009114980697632, | |
| "epoch": 0.10591133004926108, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0015214392359086822, | |
| "kl": 0.001146472553955391, | |
| "learning_rate": 4.99355930188555e-05, | |
| "loss": 0.0028980104252696037, | |
| "num_tokens": 6353121.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.32635459303855896, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.04690552916791704, | |
| "sampling/importance_sampling_ratio/max": 2.999147653579712, | |
| "sampling/importance_sampling_ratio/mean": 0.9571791887283325, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.376380920410156, | |
| "sampling/sampling_logp_difference/mean": 0.17375709116458893, | |
| "step": 43, | |
| "step_time": 124.00942197302356 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2824.0, | |
| "completions/mean_length": 849.8125, | |
| "completions/mean_terminated_length": 798.2857666015625, | |
| "completions/min_length": 88.0, | |
| "completions/min_terminated_length": 88.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7530709803104401, | |
| "epoch": 0.10837438423645321, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0013039957282542235, | |
| "kl": 0.001282885583350435, | |
| "learning_rate": 4.9932066112559975e-05, | |
| "loss": -0.009485810995101929, | |
| "num_tokens": 6499989.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.39322054386138916, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.05615971154636807, | |
| "sampling/importance_sampling_ratio/max": 2.9940052032470703, | |
| "sampling/importance_sampling_ratio/mean": 0.950169563293457, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.745084762573242, | |
| "sampling/sampling_logp_difference/mean": 0.20593324303627014, | |
| "step": 44, | |
| "step_time": 131.83942927396856 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2651.0, | |
| "completions/mean_length": 989.75, | |
| "completions/mean_terminated_length": 940.4445190429688, | |
| "completions/min_length": 109.0, | |
| "completions/min_terminated_length": 109.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7023928314447403, | |
| "epoch": 0.11083743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002663078811979883, | |
| "kl": 0.0010599846136756241, | |
| "learning_rate": 4.992844532818821e-05, | |
| "loss": -0.017974235117435455, | |
| "num_tokens": 6656725.0, | |
| "reward": 0.984375, | |
| "reward_std": 0.29504841566085815, | |
| "rewards/reward_func/mean": 0.109375, | |
| "rewards/reward_func/std": 0.04466936323377821, | |
| "sampling/importance_sampling_ratio/max": 2.9983725547790527, | |
| "sampling/importance_sampling_ratio/mean": 0.9500045776367188, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.430854797363281, | |
| "sampling/sampling_logp_difference/mean": 0.20036830008029938, | |
| "step": 45, | |
| "step_time": 132.1835569611285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3980.0, | |
| "completions/mean_length": 1233.3125, | |
| "completions/mean_terminated_length": 1188.6773681640625, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7167646884918213, | |
| "epoch": 0.11330049261083744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0015582099411917988, | |
| "kl": 0.0030349711887538433, | |
| "learning_rate": 4.9924730679373735e-05, | |
| "loss": -0.0024594487622380257, | |
| "num_tokens": 6831081.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.3258078992366791, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.050385665562417775, | |
| "sampling/importance_sampling_ratio/max": 2.998929977416992, | |
| "sampling/importance_sampling_ratio/mean": 0.9458686113357544, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.93498420715332, | |
| "sampling/sampling_logp_difference/mean": 0.20973166823387146, | |
| "step": 46, | |
| "step_time": 182.55769080412574 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3671.0, | |
| "completions/mean_length": 931.59375, | |
| "completions/mean_terminated_length": 881.3651123046875, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6520289331674576, | |
| "epoch": 0.11576354679802955, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005328116211785687, | |
| "kl": 0.0031528117979178205, | |
| "learning_rate": 4.992092218010351e-05, | |
| "loss": 0.02094169706106186, | |
| "num_tokens": 6989151.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.6950790882110596, | |
| "rewards/reward_func/mean": 0.1284722222222222, | |
| "rewards/reward_func/std": 0.10439738300111559, | |
| "sampling/importance_sampling_ratio/max": 2.9967551231384277, | |
| "sampling/importance_sampling_ratio/mean": 0.9473298788070679, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.523643493652344, | |
| "sampling/sampling_logp_difference/mean": 0.2068655788898468, | |
| "step": 47, | |
| "step_time": 140.63169015408494 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3065.0, | |
| "completions/mean_length": 846.890625, | |
| "completions/mean_terminated_length": 769.3770141601562, | |
| "completions/min_length": 218.0, | |
| "completions/min_terminated_length": 218.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5718538910150528, | |
| "epoch": 0.11822660098522167, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0007047123566561605, | |
| "kl": 0.0011549281189218163, | |
| "learning_rate": 4.991701984471789e-05, | |
| "loss": 0.01098698377609253, | |
| "num_tokens": 7113624.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.2734251022338867, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.03352663583225674, | |
| "sampling/importance_sampling_ratio/max": 2.99415922164917, | |
| "sampling/importance_sampling_ratio/mean": 0.9616100192070007, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.49928092956543, | |
| "sampling/sampling_logp_difference/mean": 0.16297030448913574, | |
| "step": 48, | |
| "step_time": 119.48140913574025 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3726.0, | |
| "completions/mean_length": 1220.875, | |
| "completions/mean_terminated_length": 1079.475341796875, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6797353476285934, | |
| "epoch": 0.1206896551724138, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0020895404039787254, | |
| "kl": 0.0014812646841164678, | |
| "learning_rate": 4.9913023687910575e-05, | |
| "loss": 0.0024993023835122585, | |
| "num_tokens": 7288064.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.37796446681022644, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.05528419050905439, | |
| "sampling/importance_sampling_ratio/max": 2.993243455886841, | |
| "sampling/importance_sampling_ratio/mean": 0.9490103125572205, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.937416076660156, | |
| "sampling/sampling_logp_difference/mean": 0.20558039844036102, | |
| "step": 49, | |
| "step_time": 167.97329160943627 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3760.0, | |
| "completions/mean_length": 993.40625, | |
| "completions/mean_terminated_length": 944.1588134765625, | |
| "completions/min_length": 90.0, | |
| "completions/min_terminated_length": 90.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7639325112104416, | |
| "epoch": 0.12315270935960591, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0012238713070192111, | |
| "kl": 0.0012491706293076277, | |
| "learning_rate": 4.990893372472849e-05, | |
| "loss": 0.01330800261348486, | |
| "num_tokens": 7441626.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.2790367007255554, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.035972247935003705, | |
| "sampling/importance_sampling_ratio/max": 2.9993691444396973, | |
| "sampling/importance_sampling_ratio/mean": 0.9422820806503296, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.27017879486084, | |
| "sampling/sampling_logp_difference/mean": 0.2233036458492279, | |
| "step": 50, | |
| "step_time": 180.49298912403174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3854.0, | |
| "completions/mean_length": 1377.78125, | |
| "completions/mean_terminated_length": 1220.7626953125, | |
| "completions/min_length": 357.0, | |
| "completions/min_terminated_length": 357.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7340656220912933, | |
| "epoch": 0.12561576354679804, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016767950961260724, | |
| "kl": 0.002402618178166449, | |
| "learning_rate": 4.99047499705718e-05, | |
| "loss": -0.00485864607617259, | |
| "num_tokens": 7610044.0, | |
| "reward": 1.10546875, | |
| "reward_std": 0.3640727698802948, | |
| "rewards/reward_func/mean": 0.1228298611111111, | |
| "rewards/reward_func/std": 0.05129980875386132, | |
| "sampling/importance_sampling_ratio/max": 2.9986684322357178, | |
| "sampling/importance_sampling_ratio/mean": 0.9486904144287109, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.795515060424805, | |
| "sampling/sampling_logp_difference/mean": 0.2021857649087906, | |
| "step": 51, | |
| "step_time": 124.5531223397702 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2975.0, | |
| "completions/mean_length": 1016.59375, | |
| "completions/mean_terminated_length": 865.1475219726562, | |
| "completions/min_length": 207.0, | |
| "completions/min_terminated_length": 207.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6252937316894531, | |
| "epoch": 0.12807881773399016, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.00038234811897031116, | |
| "kl": 0.000800235866336152, | |
| "learning_rate": 4.990047244119383e-05, | |
| "loss": -0.010930902324616909, | |
| "num_tokens": 7758786.0, | |
| "reward": 1.125, | |
| "reward_std": 0.3700064420700073, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.049859102401468486, | |
| "sampling/importance_sampling_ratio/max": 2.9963314533233643, | |
| "sampling/importance_sampling_ratio/mean": 0.9571857452392578, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.874775886535645, | |
| "sampling/sampling_logp_difference/mean": 0.1737726330757141, | |
| "step": 52, | |
| "step_time": 173.18570705433376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3552.0, | |
| "completions/mean_length": 1157.84375, | |
| "completions/mean_terminated_length": 1111.2064208984375, | |
| "completions/min_length": 244.0, | |
| "completions/min_terminated_length": 244.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6483455449342728, | |
| "epoch": 0.13054187192118227, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.007614191151348716, | |
| "kl": 0.0013027136301388964, | |
| "learning_rate": 4.9896101152701e-05, | |
| "loss": 0.021573293954133987, | |
| "num_tokens": 7917912.0, | |
| "reward": 1.14453125, | |
| "reward_std": 0.6280555129051208, | |
| "rewards/reward_func/mean": 0.1271701388888889, | |
| "rewards/reward_func/std": 0.11123616165584987, | |
| "sampling/importance_sampling_ratio/max": 2.9958243370056152, | |
| "sampling/importance_sampling_ratio/mean": 0.955902099609375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.749975204467773, | |
| "sampling/sampling_logp_difference/mean": 0.18389251828193665, | |
| "step": 53, | |
| "step_time": 128.83531658397987 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3641.0, | |
| "completions/mean_length": 1074.78125, | |
| "completions/mean_terminated_length": 977.3225708007812, | |
| "completions/min_length": 220.0, | |
| "completions/min_terminated_length": 220.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7957058399915695, | |
| "epoch": 0.1330049261083744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0040000183115382806, | |
| "kl": 0.0013368913641897961, | |
| "learning_rate": 4.9891636121552745e-05, | |
| "loss": 0.026843538507819176, | |
| "num_tokens": 8075082.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.37059247493743896, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.055034501685036555, | |
| "sampling/importance_sampling_ratio/max": 2.994692325592041, | |
| "sampling/importance_sampling_ratio/mean": 0.9410925507545471, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.202977180480957, | |
| "sampling/sampling_logp_difference/mean": 0.2365683615207672, | |
| "step": 54, | |
| "step_time": 130.15844374126755 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3922.0, | |
| "completions/mean_length": 1037.890625, | |
| "completions/mean_terminated_length": 939.2418823242188, | |
| "completions/min_length": 112.0, | |
| "completions/min_terminated_length": 112.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6936090290546417, | |
| "epoch": 0.1354679802955665, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0026880063656234054, | |
| "kl": 0.0012836718233302236, | |
| "learning_rate": 4.988707736456151e-05, | |
| "loss": 0.020552635192871094, | |
| "num_tokens": 8223907.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.2897203266620636, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.045329956544770136, | |
| "sampling/importance_sampling_ratio/max": 2.9959287643432617, | |
| "sampling/importance_sampling_ratio/mean": 0.9536117315292358, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.170980453491211, | |
| "sampling/sampling_logp_difference/mean": 0.1910027265548706, | |
| "step": 55, | |
| "step_time": 131.79171376302838 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2972.0, | |
| "completions/mean_length": 911.46875, | |
| "completions/mean_terminated_length": 808.7418823242188, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7065327614545822, | |
| "epoch": 0.13793103448275862, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0012146386659164553, | |
| "kl": 0.0009278918150812387, | |
| "learning_rate": 4.9882424898892635e-05, | |
| "loss": 0.0029757781885564327, | |
| "num_tokens": 8360993.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.2920915186405182, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.04098213298453225, | |
| "sampling/importance_sampling_ratio/max": 2.9910311698913574, | |
| "sampling/importance_sampling_ratio/mean": 0.9575042724609375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.249921798706055, | |
| "sampling/sampling_logp_difference/mean": 0.19161823391914368, | |
| "step": 56, | |
| "step_time": 113.03869129787199 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3818.0, | |
| "completions/mean_length": 1330.109375, | |
| "completions/mean_terminated_length": 1145.7166748046875, | |
| "completions/min_length": 237.0, | |
| "completions/min_terminated_length": 237.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7262924760580063, | |
| "epoch": 0.14039408866995073, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 9.350445461896138e-05, | |
| "kl": 0.0008153790549840778, | |
| "learning_rate": 4.987767874206428e-05, | |
| "loss": 1.2529770174296573e-05, | |
| "num_tokens": 8527992.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.36596253514289856, | |
| "rewards/reward_func/mean": 0.1284722222222222, | |
| "rewards/reward_func/std": 0.04066250390476651, | |
| "sampling/importance_sampling_ratio/max": 2.998924493789673, | |
| "sampling/importance_sampling_ratio/mean": 0.9534813165664673, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.857730865478516, | |
| "sampling/sampling_logp_difference/mean": 0.19994306564331055, | |
| "step": 57, | |
| "step_time": 175.88998585077934 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2244.0, | |
| "completions/mean_length": 745.0625, | |
| "completions/mean_terminated_length": 691.873046875, | |
| "completions/min_length": 188.0, | |
| "completions/min_terminated_length": 188.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7556774914264679, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.06262985175890678, | |
| "kl": 0.1303897971083643, | |
| "learning_rate": 4.987283891194743e-05, | |
| "loss": -0.03529379516839981, | |
| "num_tokens": 8660748.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.47761133313179016, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.06965038345919715, | |
| "sampling/importance_sampling_ratio/max": 2.997659206390381, | |
| "sampling/importance_sampling_ratio/mean": 0.956047773361206, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.499494552612305, | |
| "sampling/sampling_logp_difference/mean": 0.19947615265846252, | |
| "step": 58, | |
| "step_time": 164.91047239373438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2787.0, | |
| "completions/mean_length": 877.609375, | |
| "completions/mean_terminated_length": 826.5238647460938, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6654725968837738, | |
| "epoch": 0.14532019704433496, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0013078254591733234, | |
| "kl": 0.0010809154191520065, | |
| "learning_rate": 4.986790542676576e-05, | |
| "loss": -0.005135550629347563, | |
| "num_tokens": 8806419.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.2667968273162842, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.038036055862903595, | |
| "sampling/importance_sampling_ratio/max": 2.996380090713501, | |
| "sampling/importance_sampling_ratio/mean": 0.9533485174179077, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.1231632232666, | |
| "sampling/sampling_logp_difference/mean": 0.19166377186775208, | |
| "step": 59, | |
| "step_time": 118.16056217276491 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1737.0, | |
| "completions/mean_length": 954.046875, | |
| "completions/mean_terminated_length": 792.5000610351562, | |
| "completions/min_length": 199.0, | |
| "completions/min_terminated_length": 199.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7431278079748154, | |
| "epoch": 0.1477832512315271, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0009579017109342759, | |
| "kl": 0.0010434478608658537, | |
| "learning_rate": 4.986287830509558e-05, | |
| "loss": -0.007221859414130449, | |
| "num_tokens": 8964854.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.3219551742076874, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.04488379342688455, | |
| "sampling/importance_sampling_ratio/max": 2.999706745147705, | |
| "sampling/importance_sampling_ratio/mean": 0.9473176598548889, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.110328674316406, | |
| "sampling/sampling_logp_difference/mean": 0.21286305785179138, | |
| "step": 60, | |
| "step_time": 138.2910026947502 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1647.0, | |
| "completions/max_terminated_length": 1647.0, | |
| "completions/mean_length": 617.0, | |
| "completions/mean_terminated_length": 612.84130859375, | |
| "completions/min_length": 251.0, | |
| "completions/min_terminated_length": 251.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6504525691270828, | |
| "epoch": 0.15024630541871922, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0026108009768812056, | |
| "kl": 0.001221363214426674, | |
| "learning_rate": 4.985775756586581e-05, | |
| "loss": -0.018695060163736343, | |
| "num_tokens": 9094054.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.28692469000816345, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.04380870693259769, | |
| "sampling/importance_sampling_ratio/max": 2.9986069202423096, | |
| "sampling/importance_sampling_ratio/mean": 0.9576082229614258, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.987771987915039, | |
| "sampling/sampling_logp_difference/mean": 0.17746970057487488, | |
| "step": 61, | |
| "step_time": 64.66796927712858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3212.0, | |
| "completions/mean_length": 1179.359375, | |
| "completions/mean_terminated_length": 1085.274169921875, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6409039795398712, | |
| "epoch": 0.15270935960591134, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0034371950102880864, | |
| "kl": 0.0011749721161322668, | |
| "learning_rate": 4.9852543228357835e-05, | |
| "loss": 0.029548635706305504, | |
| "num_tokens": 9254781.0, | |
| "reward": 1.0, | |
| "reward_std": 0.29880714416503906, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.04573592378033532, | |
| "sampling/importance_sampling_ratio/max": 2.998082160949707, | |
| "sampling/importance_sampling_ratio/mean": 0.9520100355148315, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.498339653015137, | |
| "sampling/sampling_logp_difference/mean": 0.193486750125885, | |
| "step": 62, | |
| "step_time": 167.41590802790597 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3880.0, | |
| "completions/mean_length": 791.21875, | |
| "completions/mean_terminated_length": 745.0967407226562, | |
| "completions/min_length": 178.0, | |
| "completions/min_terminated_length": 178.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7853821069002151, | |
| "epoch": 0.15517241379310345, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004548122568676759, | |
| "kl": 0.0013982994423713535, | |
| "learning_rate": 4.9847235312205484e-05, | |
| "loss": -0.033901821821928024, | |
| "num_tokens": 9385451.0, | |
| "reward": 0.93359375, | |
| "reward_std": 0.38638070225715637, | |
| "rewards/reward_func/mean": 0.1037326388888889, | |
| "rewards/reward_func/std": 0.05757651891973284, | |
| "sampling/importance_sampling_ratio/max": 2.99320650100708, | |
| "sampling/importance_sampling_ratio/mean": 0.9517427682876587, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.464980125427246, | |
| "sampling/sampling_logp_difference/mean": 0.20454317331314087, | |
| "step": 63, | |
| "step_time": 117.68562039383687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3050.0, | |
| "completions/mean_length": 1021.453125, | |
| "completions/mean_terminated_length": 921.9671630859375, | |
| "completions/min_length": 289.0, | |
| "completions/min_terminated_length": 289.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5929539352655411, | |
| "epoch": 0.15763546798029557, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0027889445903394447, | |
| "kl": 0.0008723176724743098, | |
| "learning_rate": 4.984183383739496e-05, | |
| "loss": -0.021047594025731087, | |
| "num_tokens": 9532632.0, | |
| "reward": 1.01171875, | |
| "reward_std": 0.36847415566444397, | |
| "rewards/reward_func/mean": 0.11241319444444445, | |
| "rewards/reward_func/std": 0.05727195077472263, | |
| "sampling/importance_sampling_ratio/max": 2.9981136322021484, | |
| "sampling/importance_sampling_ratio/mean": 0.9593861699104309, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.235153198242188, | |
| "sampling/sampling_logp_difference/mean": 0.16657081246376038, | |
| "step": 64, | |
| "step_time": 131.22120441007428 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3649.0, | |
| "completions/mean_length": 1245.90625, | |
| "completions/mean_terminated_length": 1168.475341796875, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6179927885532379, | |
| "epoch": 0.16009852216748768, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004497392045849199, | |
| "kl": 0.0008117440593196079, | |
| "learning_rate": 4.983633882426471e-05, | |
| "loss": -0.004598885774612427, | |
| "num_tokens": 9694866.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.4569106698036194, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.06866345471805996, | |
| "sampling/importance_sampling_ratio/max": 2.9998435974121094, | |
| "sampling/importance_sampling_ratio/mean": 0.9481761455535889, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.188133239746094, | |
| "sampling/sampling_logp_difference/mean": 0.19589340686798096, | |
| "step": 65, | |
| "step_time": 133.8538687042892 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3100.0, | |
| "completions/mean_length": 750.578125, | |
| "completions/mean_terminated_length": 685.9835815429688, | |
| "completions/min_length": 128.0, | |
| "completions/min_terminated_length": 128.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7141001224517822, | |
| "epoch": 0.1625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006127008410444587, | |
| "kl": 0.001061671442585066, | |
| "learning_rate": 4.983075029350542e-05, | |
| "loss": -0.015230651013553143, | |
| "num_tokens": 9823479.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.5820431113243103, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.12548058893945482, | |
| "sampling/importance_sampling_ratio/max": 2.998790740966797, | |
| "sampling/importance_sampling_ratio/mean": 0.9558759331703186, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.232525825500488, | |
| "sampling/sampling_logp_difference/mean": 0.18798446655273438, | |
| "step": 66, | |
| "step_time": 142.64381241961382 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4061.0, | |
| "completions/mean_length": 1055.546875, | |
| "completions/mean_terminated_length": 959.5409545898438, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5656583607196808, | |
| "epoch": 0.16502463054187191, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006161927067566899, | |
| "kl": 0.0006807406462030485, | |
| "learning_rate": 4.9825068266159894e-05, | |
| "loss": -0.014742434024810791, | |
| "num_tokens": 9974714.0, | |
| "reward": 1.0, | |
| "reward_std": 0.6666666865348816, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.09259259700775146, | |
| "sampling/importance_sampling_ratio/max": 2.999696969985962, | |
| "sampling/importance_sampling_ratio/mean": 0.9569911956787109, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.746885299682617, | |
| "sampling/sampling_logp_difference/mean": 0.17171627283096313, | |
| "step": 67, | |
| "step_time": 173.7841714611277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4043.0, | |
| "completions/max_terminated_length": 4043.0, | |
| "completions/mean_length": 888.375, | |
| "completions/mean_terminated_length": 895.730224609375, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6102766692638397, | |
| "epoch": 0.16748768472906403, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007500581448964243, | |
| "kl": 0.0015853389049880207, | |
| "learning_rate": 4.981929276362298e-05, | |
| "loss": 0.08089423179626465, | |
| "num_tokens": 10117634.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.8643053770065308, | |
| "rewards/reward_func/mean": 0.10069444444444445, | |
| "rewards/reward_func/std": 0.12619537777370876, | |
| "sampling/importance_sampling_ratio/max": 2.989180564880371, | |
| "sampling/importance_sampling_ratio/mean": 0.9547425508499146, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.812376976013184, | |
| "sampling/sampling_logp_difference/mean": 0.17781385779380798, | |
| "step": 68, | |
| "step_time": 125.53745425422676 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3739.0, | |
| "completions/mean_length": 1150.171875, | |
| "completions/mean_terminated_length": 1053.2930908203125, | |
| "completions/min_length": 158.0, | |
| "completions/min_terminated_length": 158.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6405633389949799, | |
| "epoch": 0.16995073891625614, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005291742203731279, | |
| "kl": 0.0008613676036475226, | |
| "learning_rate": 4.981342380764149e-05, | |
| "loss": -0.03940670192241669, | |
| "num_tokens": 10279517.0, | |
| "reward": 0.88671875, | |
| "reward_std": 0.7584571838378906, | |
| "rewards/reward_func/mean": 0.09852430555555555, | |
| "rewards/reward_func/std": 0.1488193174203237, | |
| "sampling/importance_sampling_ratio/max": 2.999359369277954, | |
| "sampling/importance_sampling_ratio/mean": 0.9512237310409546, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.12166976928711, | |
| "sampling/sampling_logp_difference/mean": 0.18591460585594177, | |
| "step": 69, | |
| "step_time": 149.70758228283376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3832.0, | |
| "completions/mean_length": 879.140625, | |
| "completions/mean_terminated_length": 661.4827270507812, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8025156259536743, | |
| "epoch": 0.1724137931034483, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00684584761013416, | |
| "kl": 0.001387007927405648, | |
| "learning_rate": 4.980746142031414e-05, | |
| "loss": 0.01730230078101158, | |
| "num_tokens": 10415078.0, | |
| "reward": 0.91015625, | |
| "reward_std": 0.6977389454841614, | |
| "rewards/reward_func/mean": 0.10112847222222222, | |
| "rewards/reward_func/std": 0.13860311441951328, | |
| "sampling/importance_sampling_ratio/max": 2.9970333576202393, | |
| "sampling/importance_sampling_ratio/mean": 0.9527570009231567, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.19293212890625, | |
| "sampling/sampling_logp_difference/mean": 0.20155400037765503, | |
| "step": 70, | |
| "step_time": 183.71196515997872 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3417.0, | |
| "completions/mean_length": 1053.296875, | |
| "completions/mean_terminated_length": 849.9649047851562, | |
| "completions/min_length": 140.0, | |
| "completions/min_terminated_length": 140.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6875295341014862, | |
| "epoch": 0.1748768472906404, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003532877881341452, | |
| "kl": 0.0006982934210100211, | |
| "learning_rate": 4.980140562409141e-05, | |
| "loss": -0.023352203890681267, | |
| "num_tokens": 10571929.0, | |
| "reward": 0.94140625, | |
| "reward_std": 0.5487037301063538, | |
| "rewards/reward_func/mean": 0.10460069444444445, | |
| "rewards/reward_func/std": 0.07910366521941291, | |
| "sampling/importance_sampling_ratio/max": 2.9906861782073975, | |
| "sampling/importance_sampling_ratio/mean": 0.9550516605377197, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.016915321350098, | |
| "sampling/sampling_logp_difference/mean": 0.18335659801959991, | |
| "step": 71, | |
| "step_time": 124.68412435986102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2762.0, | |
| "completions/mean_length": 913.6875, | |
| "completions/mean_terminated_length": 787.57373046875, | |
| "completions/min_length": 64.0, | |
| "completions/min_terminated_length": 64.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.752479761838913, | |
| "epoch": 0.17733990147783252, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005311994696832263, | |
| "kl": 0.0015363168495241553, | |
| "learning_rate": 4.979525644177554e-05, | |
| "loss": 0.015766549855470657, | |
| "num_tokens": 10717333.0, | |
| "reward": 0.91796875, | |
| "reward_std": 0.5001084804534912, | |
| "rewards/reward_func/mean": 0.10199652777777778, | |
| "rewards/reward_func/std": 0.0739565756585863, | |
| "sampling/importance_sampling_ratio/max": 2.9991776943206787, | |
| "sampling/importance_sampling_ratio/mean": 0.955678403377533, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.945234298706055, | |
| "sampling/sampling_logp_difference/mean": 0.19813895225524902, | |
| "step": 72, | |
| "step_time": 143.27259426680394 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3845.0, | |
| "completions/mean_length": 1054.390625, | |
| "completions/mean_terminated_length": 957.360595703125, | |
| "completions/min_length": 14.0, | |
| "completions/min_terminated_length": 14.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6393098831176758, | |
| "epoch": 0.17980295566502463, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0011923495379162847, | |
| "kl": 0.0012014579260721803, | |
| "learning_rate": 4.978901389652039e-05, | |
| "loss": -0.019676342606544495, | |
| "num_tokens": 10870078.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.41230979561805725, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.06086550156275431, | |
| "sampling/importance_sampling_ratio/max": 2.9990031719207764, | |
| "sampling/importance_sampling_ratio/mean": 0.9514544010162354, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.248948097229004, | |
| "sampling/sampling_logp_difference/mean": 0.19227707386016846, | |
| "step": 73, | |
| "step_time": 188.85755535191856 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4092.0, | |
| "completions/max_terminated_length": 4092.0, | |
| "completions/mean_length": 838.75, | |
| "completions/mean_terminated_length": 840.5573120117188, | |
| "completions/min_length": 182.0, | |
| "completions/min_terminated_length": 182.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.653815358877182, | |
| "epoch": 0.18226600985221675, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009966630550985659, | |
| "kl": 0.0005712288402719423, | |
| "learning_rate": 4.978267801183133e-05, | |
| "loss": -0.03280792012810707, | |
| "num_tokens": 11001822.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.7278474569320679, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.1253587090306812, | |
| "sampling/importance_sampling_ratio/max": 2.995110273361206, | |
| "sampling/importance_sampling_ratio/mean": 0.9579760432243347, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.561847686767578, | |
| "sampling/sampling_logp_difference/mean": 0.16925372183322906, | |
| "step": 74, | |
| "step_time": 126.91733171069063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3472.0, | |
| "completions/mean_length": 1195.765625, | |
| "completions/mean_terminated_length": 1053.131103515625, | |
| "completions/min_length": 360.0, | |
| "completions/min_terminated_length": 360.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7155862152576447, | |
| "epoch": 0.18472906403940886, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0036799305426663165, | |
| "kl": 0.001092532867914997, | |
| "learning_rate": 4.977624881156524e-05, | |
| "loss": 0.02602587826550007, | |
| "num_tokens": 11167903.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.378763347864151, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.05723588830894894, | |
| "sampling/importance_sampling_ratio/max": 2.997964382171631, | |
| "sampling/importance_sampling_ratio/mean": 0.9472610950469971, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.248061180114746, | |
| "sampling/sampling_logp_difference/mean": 0.20667724311351776, | |
| "step": 75, | |
| "step_time": 129.2886537532322 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2813.0, | |
| "completions/mean_length": 1064.53125, | |
| "completions/mean_terminated_length": 966.7418823242188, | |
| "completions/min_length": 4.0, | |
| "completions/min_terminated_length": 4.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.721578374505043, | |
| "epoch": 0.18719211822660098, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004605488257337456, | |
| "kl": 0.002601891625090502, | |
| "learning_rate": 4.976972631993033e-05, | |
| "loss": -0.03658334165811539, | |
| "num_tokens": 11324945.0, | |
| "reward": 0.921875, | |
| "reward_std": 0.4753340482711792, | |
| "rewards/reward_func/mean": 0.10243055555555555, | |
| "rewards/reward_func/std": 0.06980303592152065, | |
| "sampling/importance_sampling_ratio/max": 2.995720148086548, | |
| "sampling/importance_sampling_ratio/mean": 0.9485733509063721, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.749515533447266, | |
| "sampling/sampling_logp_difference/mean": 0.21007245779037476, | |
| "step": 76, | |
| "step_time": 135.25108521990478 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3425.0, | |
| "completions/mean_length": 1197.34375, | |
| "completions/mean_terminated_length": 1135.91796875, | |
| "completions/min_length": 393.0, | |
| "completions/min_terminated_length": 393.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6039802730083466, | |
| "epoch": 0.1896551724137931, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004743221131144578, | |
| "kl": 0.0006622753426199779, | |
| "learning_rate": 4.976311056148609e-05, | |
| "loss": 0.03427097201347351, | |
| "num_tokens": 11489447.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.7042511701583862, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.10813031593958537, | |
| "sampling/importance_sampling_ratio/max": 2.9994306564331055, | |
| "sampling/importance_sampling_ratio/mean": 0.9550964832305908, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.74364471435547, | |
| "sampling/sampling_logp_difference/mean": 0.17511072754859924, | |
| "step": 77, | |
| "step_time": 129.62398432497866 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3571.0, | |
| "completions/mean_length": 1217.953125, | |
| "completions/mean_terminated_length": 1170.725830078125, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 16.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5881828814744949, | |
| "epoch": 0.1921182266009852, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004901360594166145, | |
| "kl": 0.0007936922629596666, | |
| "learning_rate": 4.975640156114322e-05, | |
| "loss": -0.009430614300072193, | |
| "num_tokens": 11659988.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.6163589358329773, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.10998319089412689, | |
| "sampling/importance_sampling_ratio/max": 2.995478391647339, | |
| "sampling/importance_sampling_ratio/mean": 0.9541982412338257, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.298104286193848, | |
| "sampling/sampling_logp_difference/mean": 0.177594393491745, | |
| "step": 78, | |
| "step_time": 132.9943831146229 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2884.0, | |
| "completions/max_terminated_length": 2884.0, | |
| "completions/mean_length": 1001.3125, | |
| "completions/mean_terminated_length": 1001.3125, | |
| "completions/min_length": 238.0, | |
| "completions/min_terminated_length": 238.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6656395643949509, | |
| "epoch": 0.19458128078817735, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006107805006785518, | |
| "kl": 0.0009958412119885907, | |
| "learning_rate": 4.974959934416346e-05, | |
| "loss": 0.006846698001027107, | |
| "num_tokens": 11805816.0, | |
| "reward": 1.23828125, | |
| "reward_std": 0.5426816344261169, | |
| "rewards/reward_func/mean": 0.13758680555555555, | |
| "rewards/reward_func/std": 0.09264064538809988, | |
| "sampling/importance_sampling_ratio/max": 2.9980006217956543, | |
| "sampling/importance_sampling_ratio/mean": 0.9504839181900024, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.141356468200684, | |
| "sampling/sampling_logp_difference/mean": 0.1968192458152771, | |
| "step": 79, | |
| "step_time": 88.39881527097896 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3718.0, | |
| "completions/mean_length": 962.09375, | |
| "completions/mean_terminated_length": 929.6500244140625, | |
| "completions/min_length": 34.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.612749919295311, | |
| "epoch": 0.19704433497536947, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009568223063933647, | |
| "kl": 0.00103502553247381, | |
| "learning_rate": 4.9742703936159586e-05, | |
| "loss": -0.10045486688613892, | |
| "num_tokens": 11939118.0, | |
| "reward": 1.125, | |
| "reward_std": 0.6251983642578125, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.11257308059268528, | |
| "sampling/importance_sampling_ratio/max": 2.9995410442352295, | |
| "sampling/importance_sampling_ratio/mean": 0.9614801406860352, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.320719718933105, | |
| "sampling/sampling_logp_difference/mean": 0.168918639421463, | |
| "step": 80, | |
| "step_time": 138.59337026928551 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2942.0, | |
| "completions/max_terminated_length": 2942.0, | |
| "completions/mean_length": 777.546875, | |
| "completions/mean_terminated_length": 777.546875, | |
| "completions/min_length": 159.0, | |
| "completions/min_terminated_length": 159.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.716793105006218, | |
| "epoch": 0.19950738916256158, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004065946912247252, | |
| "kl": 0.0015465420437976718, | |
| "learning_rate": 4.973571536309525e-05, | |
| "loss": 0.03413772210478783, | |
| "num_tokens": 12091009.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.40959399938583374, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.060785247219933405, | |
| "sampling/importance_sampling_ratio/max": 2.9997410774230957, | |
| "sampling/importance_sampling_ratio/mean": 0.9524143934249878, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.888782501220703, | |
| "sampling/sampling_logp_difference/mean": 0.20447459816932678, | |
| "step": 81, | |
| "step_time": 88.59409447899088 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3562.0, | |
| "completions/mean_length": 1248.171875, | |
| "completions/mean_terminated_length": 1137.1802978515625, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.71332086622715, | |
| "epoch": 0.2019704433497537, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00811562576764538, | |
| "kl": 0.001496812139521353, | |
| "learning_rate": 4.9728633651284914e-05, | |
| "loss": 0.02764129638671875, | |
| "num_tokens": 12264156.0, | |
| "reward": 1.22265625, | |
| "reward_std": 0.8730047345161438, | |
| "rewards/reward_func/mean": 0.13585069444444445, | |
| "rewards/reward_func/std": 0.14530256390571594, | |
| "sampling/importance_sampling_ratio/max": 2.997504711151123, | |
| "sampling/importance_sampling_ratio/mean": 0.9448159337043762, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.87446403503418, | |
| "sampling/sampling_logp_difference/mean": 0.21683049201965332, | |
| "step": 82, | |
| "step_time": 166.05172082805075 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3426.0, | |
| "completions/max_terminated_length": 3426.0, | |
| "completions/mean_length": 822.71875, | |
| "completions/mean_terminated_length": 822.71875, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6466097682714462, | |
| "epoch": 0.2044334975369458, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002471536066340347, | |
| "kl": 0.0014289210666902363, | |
| "learning_rate": 4.972145882739374e-05, | |
| "loss": -0.0035694753751158714, | |
| "num_tokens": 12403786.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.23776483535766602, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.0367136730088128, | |
| "sampling/importance_sampling_ratio/max": 2.9996731281280518, | |
| "sampling/importance_sampling_ratio/mean": 0.9583299160003662, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.391860961914062, | |
| "sampling/sampling_logp_difference/mean": 0.17760473489761353, | |
| "step": 83, | |
| "step_time": 102.87069191993214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3689.0, | |
| "completions/mean_length": 1022.359375, | |
| "completions/mean_terminated_length": 973.5714721679688, | |
| "completions/min_length": 257.0, | |
| "completions/min_terminated_length": 257.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7003387361764908, | |
| "epoch": 0.20689655172413793, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0011806928817891115, | |
| "kl": 0.0011969898623647168, | |
| "learning_rate": 4.971419091843748e-05, | |
| "loss": 0.006968214176595211, | |
| "num_tokens": 12547905.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.2895062267780304, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.04047108110454348, | |
| "sampling/importance_sampling_ratio/max": 2.9965357780456543, | |
| "sampling/importance_sampling_ratio/mean": 0.9557406902313232, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.999181747436523, | |
| "sampling/sampling_logp_difference/mean": 0.18569059669971466, | |
| "step": 84, | |
| "step_time": 134.5772271042224 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2939.0, | |
| "completions/max_terminated_length": 2939.0, | |
| "completions/mean_length": 605.40625, | |
| "completions/mean_terminated_length": 605.40625, | |
| "completions/min_length": 66.0, | |
| "completions/min_terminated_length": 66.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7465554922819138, | |
| "epoch": 0.20935960591133004, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004734667031571098, | |
| "kl": 0.004191852669464424, | |
| "learning_rate": 4.970682995178238e-05, | |
| "loss": 0.019574182108044624, | |
| "num_tokens": 12666683.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.3583437204360962, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.05403099126285977, | |
| "sampling/importance_sampling_ratio/max": 2.9961087703704834, | |
| "sampling/importance_sampling_ratio/mean": 0.959001362323761, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.754371643066406, | |
| "sampling/sampling_logp_difference/mean": 0.18795375525951385, | |
| "step": 85, | |
| "step_time": 83.09169630077668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3817.0, | |
| "completions/max_terminated_length": 3817.0, | |
| "completions/mean_length": 956.046875, | |
| "completions/mean_terminated_length": 923.4127807617188, | |
| "completions/min_length": 152.0, | |
| "completions/min_terminated_length": 152.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7266160845756531, | |
| "epoch": 0.21182266009852216, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0032079723243338284, | |
| "kl": 0.0021485694451257586, | |
| "learning_rate": 4.9699375955145114e-05, | |
| "loss": 0.008961433544754982, | |
| "num_tokens": 12820238.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.40548616647720337, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.0595403081840939, | |
| "sampling/importance_sampling_ratio/max": 2.998619794845581, | |
| "sampling/importance_sampling_ratio/mean": 0.9531738758087158, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.032673835754395, | |
| "sampling/sampling_logp_difference/mean": 0.19796302914619446, | |
| "step": 86, | |
| "step_time": 137.24125836323947 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1638.0, | |
| "completions/max_terminated_length": 1638.0, | |
| "completions/mean_length": 514.96875, | |
| "completions/mean_terminated_length": 514.96875, | |
| "completions/min_length": 161.0, | |
| "completions/min_terminated_length": 161.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7525362968444824, | |
| "epoch": 0.21428571428571427, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005333796839874563, | |
| "kl": 0.0048426192661281675, | |
| "learning_rate": 4.96918289565926e-05, | |
| "loss": 0.01707097887992859, | |
| "num_tokens": 12935980.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.2869516909122467, | |
| "rewards/reward_func/mean": 0.1076388888888889, | |
| "rewards/reward_func/std": 0.04373177720440759, | |
| "sampling/importance_sampling_ratio/max": 2.9986844062805176, | |
| "sampling/importance_sampling_ratio/mean": 0.9631032943725586, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 8.843316078186035, | |
| "sampling/sampling_logp_difference/mean": 0.18259428441524506, | |
| "step": 87, | |
| "step_time": 66.68644723505713 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3148.0, | |
| "completions/mean_length": 958.21875, | |
| "completions/mean_terminated_length": 857.0, | |
| "completions/min_length": 117.0, | |
| "completions/min_terminated_length": 117.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7587482780218124, | |
| "epoch": 0.21674876847290642, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0011093462895115053, | |
| "kl": 0.0011742105707526207, | |
| "learning_rate": 4.968418898454199e-05, | |
| "loss": -0.019757457077503204, | |
| "num_tokens": 13079658.0, | |
| "reward": 1.203125, | |
| "reward_std": 0.47114139795303345, | |
| "rewards/reward_func/mean": 0.13368055555555555, | |
| "rewards/reward_func/std": 0.0649379442135493, | |
| "sampling/importance_sampling_ratio/max": 2.9928460121154785, | |
| "sampling/importance_sampling_ratio/mean": 0.951276421546936, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.812447547912598, | |
| "sampling/sampling_logp_difference/mean": 0.2016531229019165, | |
| "step": 88, | |
| "step_time": 175.04604559391737 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3305.0, | |
| "completions/mean_length": 1278.84375, | |
| "completions/mean_terminated_length": 1195.0982666015625, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6959878653287888, | |
| "epoch": 0.21921182266009853, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00018656617621445748, | |
| "kl": 0.00109332193096634, | |
| "learning_rate": 4.967645606776047e-05, | |
| "loss": -0.002903047716245055, | |
| "num_tokens": 13240544.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.33627331256866455, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.04050926036304898, | |
| "sampling/importance_sampling_ratio/max": 2.995325803756714, | |
| "sampling/importance_sampling_ratio/mean": 0.9519744515419006, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.622486114501953, | |
| "sampling/sampling_logp_difference/mean": 0.199794203042984, | |
| "step": 89, | |
| "step_time": 172.4871400659904 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3005.0, | |
| "completions/mean_length": 948.015625, | |
| "completions/mean_terminated_length": 832.91796875, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7445888072252274, | |
| "epoch": 0.22167487684729065, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002568082327493114, | |
| "kl": 0.0015613814030075446, | |
| "learning_rate": 4.966863023536523e-05, | |
| "loss": -0.0222585778683424, | |
| "num_tokens": 13385345.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.34429070353507996, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.050396261943711176, | |
| "sampling/importance_sampling_ratio/max": 2.9980051517486572, | |
| "sampling/importance_sampling_ratio/mean": 0.95062255859375, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.059782028198242, | |
| "sampling/sampling_logp_difference/mean": 0.20940154790878296, | |
| "step": 90, | |
| "step_time": 131.1308980169706 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3719.0, | |
| "completions/mean_length": 1014.3125, | |
| "completions/mean_terminated_length": 924.6557006835938, | |
| "completions/min_length": 165.0, | |
| "completions/min_terminated_length": 165.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7877677083015442, | |
| "epoch": 0.22413793103448276, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0007955641288236894, | |
| "kl": 0.001454152661608532, | |
| "learning_rate": 4.96607115168233e-05, | |
| "loss": -0.02363387495279312, | |
| "num_tokens": 13535109.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.32874444127082825, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.0472567660941018, | |
| "sampling/importance_sampling_ratio/max": 2.998769998550415, | |
| "sampling/importance_sampling_ratio/mean": 0.9478331804275513, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.211028099060059, | |
| "sampling/sampling_logp_difference/mean": 0.21998655796051025, | |
| "step": 91, | |
| "step_time": 114.45941002899781 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3213.0, | |
| "completions/mean_length": 892.671875, | |
| "completions/mean_terminated_length": 841.825439453125, | |
| "completions/min_length": 214.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6217218488454819, | |
| "epoch": 0.22660098522167488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.001966924586813133, | |
| "kl": 0.001591385604115203, | |
| "learning_rate": 4.965269994195146e-05, | |
| "loss": -0.009086966514587402, | |
| "num_tokens": 13672560.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.41187721490859985, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.05926263497935401, | |
| "sampling/importance_sampling_ratio/max": 2.9972190856933594, | |
| "sampling/importance_sampling_ratio/mean": 0.9610211849212646, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.113224983215332, | |
| "sampling/sampling_logp_difference/mean": 0.16949692368507385, | |
| "step": 92, | |
| "step_time": 160.50747915613465 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3949.0, | |
| "completions/mean_length": 1133.765625, | |
| "completions/mean_terminated_length": 1086.74609375, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6935625523328781, | |
| "epoch": 0.229064039408867, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0010926697027660048, | |
| "kl": 0.0019639305828604847, | |
| "learning_rate": 4.964459554091615e-05, | |
| "loss": 0.007769447285681963, | |
| "num_tokens": 13826593.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3022885024547577, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.03856059287985166, | |
| "sampling/importance_sampling_ratio/max": 2.9993722438812256, | |
| "sampling/importance_sampling_ratio/mean": 0.9566208720207214, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.104926109313965, | |
| "sampling/sampling_logp_difference/mean": 0.18871435523033142, | |
| "step": 93, | |
| "step_time": 126.3685281840153 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2425.0, | |
| "completions/max_terminated_length": 2425.0, | |
| "completions/mean_length": 994.125, | |
| "completions/mean_terminated_length": 994.125, | |
| "completions/min_length": 294.0, | |
| "completions/min_terminated_length": 294.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6710119396448135, | |
| "epoch": 0.2315270935960591, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00010828470047238532, | |
| "kl": 0.0010478518815943971, | |
| "learning_rate": 4.9636398344233294e-05, | |
| "loss": 2.0540661353152245e-05, | |
| "num_tokens": 13978137.0, | |
| "reward": 1.078125, | |
| "reward_std": 0.2704896926879883, | |
| "rewards/reward_func/mean": 0.11979166666666667, | |
| "rewards/reward_func/std": 0.030054413610034518, | |
| "sampling/importance_sampling_ratio/max": 2.997786521911621, | |
| "sampling/importance_sampling_ratio/mean": 0.9555579423904419, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.089488983154297, | |
| "sampling/sampling_logp_difference/mean": 0.18803386390209198, | |
| "step": 94, | |
| "step_time": 78.18217662116513 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3947.0, | |
| "completions/mean_length": 1246.109375, | |
| "completions/mean_terminated_length": 1088.4482421875, | |
| "completions/min_length": 211.0, | |
| "completions/min_terminated_length": 211.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6725437641143799, | |
| "epoch": 0.23399014778325122, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002324607148485349, | |
| "kl": 0.0010944021196337417, | |
| "learning_rate": 4.9628108382768255e-05, | |
| "loss": 0.025269202888011932, | |
| "num_tokens": 14144960.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.3905505836009979, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.058287974860933095, | |
| "sampling/importance_sampling_ratio/max": 2.9950029850006104, | |
| "sampling/importance_sampling_ratio/mean": 0.9547150135040283, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.426436424255371, | |
| "sampling/sampling_logp_difference/mean": 0.1817963868379593, | |
| "step": 95, | |
| "step_time": 214.41529780323617 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3043.0, | |
| "completions/mean_length": 975.09375, | |
| "completions/mean_terminated_length": 821.6065063476562, | |
| "completions/min_length": 132.0, | |
| "completions/min_terminated_length": 132.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7443526685237885, | |
| "epoch": 0.23645320197044334, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0023251836424105837, | |
| "kl": 0.0014684945635963231, | |
| "learning_rate": 4.9619725687735686e-05, | |
| "loss": -0.007826524786651134, | |
| "num_tokens": 14304006.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.36247265338897705, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.05503144032425351, | |
| "sampling/importance_sampling_ratio/max": 2.998788595199585, | |
| "sampling/importance_sampling_ratio/mean": 0.9454492926597595, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.376626014709473, | |
| "sampling/sampling_logp_difference/mean": 0.2146020233631134, | |
| "step": 96, | |
| "step_time": 132.66262619709596 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3153.0, | |
| "completions/mean_length": 769.90625, | |
| "completions/mean_terminated_length": 717.1111450195312, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8366009593009949, | |
| "epoch": 0.23891625615763548, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002077514765880351, | |
| "kl": 0.0015410964551847428, | |
| "learning_rate": 4.96112502906994e-05, | |
| "loss": -0.012058844789862633, | |
| "num_tokens": 14438768.0, | |
| "reward": 1.09765625, | |
| "reward_std": 0.3716575503349304, | |
| "rewards/reward_func/mean": 0.12196180555555555, | |
| "rewards/reward_func/std": 0.05277948081493378, | |
| "sampling/importance_sampling_ratio/max": 2.99769926071167, | |
| "sampling/importance_sampling_ratio/mean": 0.9524117708206177, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.555198669433594, | |
| "sampling/sampling_logp_difference/mean": 0.212304025888443, | |
| "step": 97, | |
| "step_time": 117.63319608126767 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2758.0, | |
| "completions/max_terminated_length": 2758.0, | |
| "completions/mean_length": 769.34375, | |
| "completions/mean_terminated_length": 778.2698974609375, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6486907005310059, | |
| "epoch": 0.2413793103448276, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.01006651140897198, | |
| "kl": 0.0011220714950468391, | |
| "learning_rate": 4.960268222357227e-05, | |
| "loss": -0.02731485851109028, | |
| "num_tokens": 14588486.0, | |
| "reward": 1.17578125, | |
| "reward_std": 0.5130822062492371, | |
| "rewards/reward_func/mean": 0.1306423611111111, | |
| "rewards/reward_func/std": 0.09101471718814638, | |
| "sampling/importance_sampling_ratio/max": 2.9995951652526855, | |
| "sampling/importance_sampling_ratio/mean": 0.9494529366493225, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.874653816223145, | |
| "sampling/sampling_logp_difference/mean": 0.20342113077640533, | |
| "step": 98, | |
| "step_time": 86.39957803068683 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4018.0, | |
| "completions/mean_length": 1201.5625, | |
| "completions/mean_terminated_length": 1148.806396484375, | |
| "completions/min_length": 324.0, | |
| "completions/min_terminated_length": 324.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7736449986696243, | |
| "epoch": 0.2438423645320197, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000667562733610488, | |
| "kl": 0.001313261323957704, | |
| "learning_rate": 4.959402151861613e-05, | |
| "loss": -0.014927120879292488, | |
| "num_tokens": 14754538.0, | |
| "reward": 1.15234375, | |
| "reward_std": 0.40974533557891846, | |
| "rewards/reward_func/mean": 0.12803819444444445, | |
| "rewards/reward_func/std": 0.05651323828432295, | |
| "sampling/importance_sampling_ratio/max": 2.9992804527282715, | |
| "sampling/importance_sampling_ratio/mean": 0.9442236423492432, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.86728286743164, | |
| "sampling/sampling_logp_difference/mean": 0.22248251736164093, | |
| "step": 99, | |
| "step_time": 155.09293641196564 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3318.0, | |
| "completions/max_terminated_length": 3318.0, | |
| "completions/mean_length": 1113.453125, | |
| "completions/mean_terminated_length": 1121.0318603515625, | |
| "completions/min_length": 294.0, | |
| "completions/min_terminated_length": 294.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6962731033563614, | |
| "epoch": 0.24630541871921183, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0009398742824687371, | |
| "kl": 0.0014017132634762675, | |
| "learning_rate": 4.958526820844158e-05, | |
| "loss": 0.009164243005216122, | |
| "num_tokens": 14919655.0, | |
| "reward": 1.23828125, | |
| "reward_std": 0.44639137387275696, | |
| "rewards/reward_func/mean": 0.13758680555555555, | |
| "rewards/reward_func/std": 0.054410699754953384, | |
| "sampling/importance_sampling_ratio/max": 2.998028039932251, | |
| "sampling/importance_sampling_ratio/mean": 0.950190544128418, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.881841659545898, | |
| "sampling/sampling_logp_difference/mean": 0.19850248098373413, | |
| "step": 100, | |
| "step_time": 105.25422466010787 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2808.0, | |
| "completions/max_terminated_length": 2808.0, | |
| "completions/mean_length": 960.515625, | |
| "completions/mean_terminated_length": 950.0819091796875, | |
| "completions/min_length": 188.0, | |
| "completions/min_terminated_length": 188.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.723781481385231, | |
| "epoch": 0.24876847290640394, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002647153784592124, | |
| "kl": 0.001565072947414592, | |
| "learning_rate": 4.957642232600797e-05, | |
| "loss": 0.021879084408283234, | |
| "num_tokens": 15068664.0, | |
| "reward": 1.0, | |
| "reward_std": 0.24800793826580048, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.04210699929131402, | |
| "sampling/importance_sampling_ratio/max": 2.9967308044433594, | |
| "sampling/importance_sampling_ratio/mean": 0.9538641571998596, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.660208702087402, | |
| "sampling/sampling_logp_difference/mean": 0.19491901993751526, | |
| "step": 101, | |
| "step_time": 92.62138540088199 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2175.0, | |
| "completions/max_terminated_length": 2175.0, | |
| "completions/mean_length": 833.9375, | |
| "completions/mean_terminated_length": 827.9683227539062, | |
| "completions/min_length": 365.0, | |
| "completions/min_terminated_length": 365.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6343219578266144, | |
| "epoch": 0.2512315270935961, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0007505687183929501, | |
| "kl": 0.0012052875244989991, | |
| "learning_rate": 4.956748390462316e-05, | |
| "loss": 0.00302310474216938, | |
| "num_tokens": 15217684.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.2966987192630768, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.0361149807771047, | |
| "sampling/importance_sampling_ratio/max": 2.995220184326172, | |
| "sampling/importance_sampling_ratio/mean": 0.9567204713821411, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.999521255493164, | |
| "sampling/sampling_logp_difference/mean": 0.187168151140213, | |
| "step": 102, | |
| "step_time": 94.69526713481173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3928.0, | |
| "completions/mean_length": 1041.28125, | |
| "completions/mean_terminated_length": 887.9166870117188, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7419612556695938, | |
| "epoch": 0.2536945812807882, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002342374375304448, | |
| "kl": 0.0018976301944348961, | |
| "learning_rate": 4.955845297794348e-05, | |
| "loss": -0.013131720013916492, | |
| "num_tokens": 15373478.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.3288387358188629, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.048490075601471797, | |
| "sampling/importance_sampling_ratio/max": 2.998152256011963, | |
| "sampling/importance_sampling_ratio/mean": 0.9477394819259644, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.104028701782227, | |
| "sampling/sampling_logp_difference/mean": 0.20725718140602112, | |
| "step": 103, | |
| "step_time": 205.12117066117935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3203.0, | |
| "completions/mean_length": 1104.546875, | |
| "completions/mean_terminated_length": 1008.04833984375, | |
| "completions/min_length": 172.0, | |
| "completions/min_terminated_length": 172.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6666458696126938, | |
| "epoch": 0.2561576354679803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.001985874134046432, | |
| "kl": 0.001993619982386008, | |
| "learning_rate": 4.954932957997359e-05, | |
| "loss": -0.012176139280200005, | |
| "num_tokens": 15524409.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.3799075484275818, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.05546691517035166, | |
| "sampling/importance_sampling_ratio/max": 2.996752977371216, | |
| "sampling/importance_sampling_ratio/mean": 0.9528929591178894, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.815652847290039, | |
| "sampling/sampling_logp_difference/mean": 0.192731574177742, | |
| "step": 104, | |
| "step_time": 122.30552988126874 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3829.0, | |
| "completions/mean_length": 894.734375, | |
| "completions/mean_terminated_length": 843.920654296875, | |
| "completions/min_length": 163.0, | |
| "completions/min_terminated_length": 163.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7775083035230637, | |
| "epoch": 0.25862068965517243, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0032020832600996134, | |
| "kl": 0.001513187715318054, | |
| "learning_rate": 4.954011374506632e-05, | |
| "loss": 0.0022926977835595608, | |
| "num_tokens": 15670424.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.35486623644828796, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.052947340740097895, | |
| "sampling/importance_sampling_ratio/max": 2.9996097087860107, | |
| "sampling/importance_sampling_ratio/mean": 0.9513598680496216, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.919264793395996, | |
| "sampling/sampling_logp_difference/mean": 0.21757398545742035, | |
| "step": 105, | |
| "step_time": 133.78325367206708 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3971.0, | |
| "completions/mean_length": 1172.4375, | |
| "completions/mean_terminated_length": 1028.6556396484375, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7460175901651382, | |
| "epoch": 0.26108374384236455, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0018893770374146888, | |
| "kl": 0.0018231416470371187, | |
| "learning_rate": 4.953080550792254e-05, | |
| "loss": -0.010393545962870121, | |
| "num_tokens": 15823796.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.3952651023864746, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.0587814019785987, | |
| "sampling/importance_sampling_ratio/max": 2.9988386631011963, | |
| "sampling/importance_sampling_ratio/mean": 0.9498333930969238, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.434999465942383, | |
| "sampling/sampling_logp_difference/mean": 0.205765038728714, | |
| "step": 106, | |
| "step_time": 131.6286746961996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3139.0, | |
| "completions/max_terminated_length": 3139.0, | |
| "completions/mean_length": 883.109375, | |
| "completions/mean_terminated_length": 883.109375, | |
| "completions/min_length": 109.0, | |
| "completions/min_terminated_length": 109.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.735322967171669, | |
| "epoch": 0.26354679802955666, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00035116229639673005, | |
| "kl": 0.0015636045136488974, | |
| "learning_rate": 4.952140490359108e-05, | |
| "loss": 0.00011009792069671676, | |
| "num_tokens": 15962427.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.3965180516242981, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.047183099720213145, | |
| "sampling/importance_sampling_ratio/max": 2.998013496398926, | |
| "sampling/importance_sampling_ratio/mean": 0.9541240930557251, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.514302253723145, | |
| "sampling/sampling_logp_difference/mean": 0.20346251130104065, | |
| "step": 107, | |
| "step_time": 105.58308988134377 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3001.0, | |
| "completions/max_terminated_length": 3001.0, | |
| "completions/mean_length": 946.59375, | |
| "completions/mean_terminated_length": 946.59375, | |
| "completions/min_length": 97.0, | |
| "completions/min_terminated_length": 97.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.66323322057724, | |
| "epoch": 0.2660098522167488, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000932031147343889, | |
| "kl": 0.0015872836229391396, | |
| "learning_rate": 4.951191196746855e-05, | |
| "loss": -0.0009894074173644185, | |
| "num_tokens": 16113601.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.3419414758682251, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.04295487246579594, | |
| "sampling/importance_sampling_ratio/max": 2.993446111679077, | |
| "sampling/importance_sampling_ratio/mean": 0.9554958343505859, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.5941743850708, | |
| "sampling/sampling_logp_difference/mean": 0.1888352930545807, | |
| "step": 108, | |
| "step_time": 97.49898341693915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3759.0, | |
| "completions/mean_length": 842.390625, | |
| "completions/mean_terminated_length": 623.4745483398438, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.739134818315506, | |
| "epoch": 0.2684729064039409, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0018781326949399867, | |
| "kl": 0.0020018375653307885, | |
| "learning_rate": 4.950232673529922e-05, | |
| "loss": 0.0005048960447311401, | |
| "num_tokens": 16257498.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.29113471508026123, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.04292959802680545, | |
| "sampling/importance_sampling_ratio/max": 2.996737241744995, | |
| "sampling/importance_sampling_ratio/mean": 0.956794261932373, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.061861038208008, | |
| "sampling/sampling_logp_difference/mean": 0.19844233989715576, | |
| "step": 109, | |
| "step_time": 139.45302382390946 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3361.0, | |
| "completions/mean_length": 930.125, | |
| "completions/mean_terminated_length": 894.04833984375, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 103.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7441273629665375, | |
| "epoch": 0.270935960591133, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0015235700232656716, | |
| "kl": 0.0016930506099015474, | |
| "learning_rate": 4.9492649243174894e-05, | |
| "loss": -0.03158733621239662, | |
| "num_tokens": 16415858.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.3631562292575836, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.05488494038581848, | |
| "sampling/importance_sampling_ratio/max": 2.996971607208252, | |
| "sampling/importance_sampling_ratio/mean": 0.9444047212600708, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.474295616149902, | |
| "sampling/sampling_logp_difference/mean": 0.2202835977077484, | |
| "step": 110, | |
| "step_time": 132.466704050079 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2257.0, | |
| "completions/mean_length": 1196.625, | |
| "completions/mean_terminated_length": 1071.4833984375, | |
| "completions/min_length": 7.0, | |
| "completions/min_terminated_length": 291.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6919787973165512, | |
| "epoch": 0.2733990147783251, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.001542486832013034, | |
| "kl": 0.0014645264309365302, | |
| "learning_rate": 4.948287952753475e-05, | |
| "loss": -0.008886125870049, | |
| "num_tokens": 16595290.0, | |
| "reward": 1.12890625, | |
| "reward_std": 0.4082292914390564, | |
| "rewards/reward_func/mean": 0.1254340277777778, | |
| "rewards/reward_func/std": 0.05789083242416382, | |
| "sampling/importance_sampling_ratio/max": 2.987086057662964, | |
| "sampling/importance_sampling_ratio/mean": 0.9435651302337646, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.362783432006836, | |
| "sampling/sampling_logp_difference/mean": 0.21983519196510315, | |
| "step": 111, | |
| "step_time": 138.48055132851005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3345.0, | |
| "completions/mean_length": 994.453125, | |
| "completions/mean_terminated_length": 841.91796875, | |
| "completions/min_length": 219.0, | |
| "completions/min_terminated_length": 219.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7784600406885147, | |
| "epoch": 0.27586206896551724, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.00044156116092231446, | |
| "kl": 0.002170788327930495, | |
| "learning_rate": 4.947301762516526e-05, | |
| "loss": -0.003606098936870694, | |
| "num_tokens": 16751047.0, | |
| "reward": 1.1484375, | |
| "reward_std": 0.37192854285240173, | |
| "rewards/reward_func/mean": 0.12760416666666666, | |
| "rewards/reward_func/std": 0.04553384002712038, | |
| "sampling/importance_sampling_ratio/max": 2.9941134452819824, | |
| "sampling/importance_sampling_ratio/mean": 0.9548653960227966, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.049118041992188, | |
| "sampling/sampling_logp_difference/mean": 0.20217272639274597, | |
| "step": 112, | |
| "step_time": 131.37046331982128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4091.0, | |
| "completions/mean_length": 1262.1875, | |
| "completions/mean_terminated_length": 1175.0509033203125, | |
| "completions/min_length": 10.0, | |
| "completions/min_terminated_length": 313.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7166846841573715, | |
| "epoch": 0.27832512315270935, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0022857984519741247, | |
| "kl": 0.0017328996036667377, | |
| "learning_rate": 4.946306357319997e-05, | |
| "loss": -0.003905626479536295, | |
| "num_tokens": 16924419.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.47245559096336365, | |
| "rewards/reward_func/mean": 0.1284722222222222, | |
| "rewards/reward_func/std": 0.06712073087692261, | |
| "sampling/importance_sampling_ratio/max": 2.9988887310028076, | |
| "sampling/importance_sampling_ratio/mean": 0.9520525336265564, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.55725383758545, | |
| "sampling/sampling_logp_difference/mean": 0.1951729655265808, | |
| "step": 113, | |
| "step_time": 127.90999798686244 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3223.0, | |
| "completions/mean_length": 950.78125, | |
| "completions/mean_terminated_length": 796.0983276367188, | |
| "completions/min_length": 180.0, | |
| "completions/min_terminated_length": 180.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7335464358329773, | |
| "epoch": 0.28078817733990147, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005655948261911067, | |
| "kl": 0.007333489367738366, | |
| "learning_rate": 4.9453017409119416e-05, | |
| "loss": -0.016323737800121307, | |
| "num_tokens": 17077733.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.40959399938583374, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.060785247219933405, | |
| "sampling/importance_sampling_ratio/max": 2.999847650527954, | |
| "sampling/importance_sampling_ratio/mean": 0.9522716999053955, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.624942779541016, | |
| "sampling/sampling_logp_difference/mean": 0.2057677060365677, | |
| "step": 114, | |
| "step_time": 179.57750754477456 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2737.0, | |
| "completions/max_terminated_length": 2737.0, | |
| "completions/mean_length": 725.703125, | |
| "completions/mean_terminated_length": 725.703125, | |
| "completions/min_length": 141.0, | |
| "completions/min_terminated_length": 141.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6960620880126953, | |
| "epoch": 0.2832512315270936, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00011722383855332473, | |
| "kl": 0.0012825021694879979, | |
| "learning_rate": 4.9442879170750976e-05, | |
| "loss": 2.3127111489884555e-05, | |
| "num_tokens": 17196610.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.24397501349449158, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.027108336488405865, | |
| "sampling/importance_sampling_ratio/max": 2.9984169006347656, | |
| "sampling/importance_sampling_ratio/mean": 0.9587997198104858, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.809239387512207, | |
| "sampling/sampling_logp_difference/mean": 0.17770594358444214, | |
| "step": 115, | |
| "step_time": 86.19620743719861 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3126.0, | |
| "completions/mean_length": 824.265625, | |
| "completions/mean_terminated_length": 729.34423828125, | |
| "completions/min_length": 240.0, | |
| "completions/min_terminated_length": 240.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7041952311992645, | |
| "epoch": 0.2857142857142857, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0009556624442624107, | |
| "kl": 0.002167180966353044, | |
| "learning_rate": 4.943264889626871e-05, | |
| "loss": 0.009133076295256615, | |
| "num_tokens": 17336243.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.1788254678249359, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.02295756671163771, | |
| "sampling/importance_sampling_ratio/max": 2.990734577178955, | |
| "sampling/importance_sampling_ratio/mean": 0.958708643913269, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.196749687194824, | |
| "sampling/sampling_logp_difference/mean": 0.18489937484264374, | |
| "step": 116, | |
| "step_time": 175.12587342062034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3266.0, | |
| "completions/mean_length": 867.921875, | |
| "completions/mean_terminated_length": 763.790283203125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7263264954090118, | |
| "epoch": 0.2881773399014778, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016397730317110865, | |
| "kl": 0.002325467416085303, | |
| "learning_rate": 4.942232662419324e-05, | |
| "loss": -0.049474526196718216, | |
| "num_tokens": 17484766.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.5453031659126282, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.08766606450080872, | |
| "sampling/importance_sampling_ratio/max": 2.9991261959075928, | |
| "sampling/importance_sampling_ratio/mean": 0.951899528503418, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.624442100524902, | |
| "sampling/sampling_logp_difference/mean": 0.1993023008108139, | |
| "step": 117, | |
| "step_time": 189.81664845184423 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3429.0, | |
| "completions/mean_length": 854.703125, | |
| "completions/mean_terminated_length": 803.2540283203125, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6976221352815628, | |
| "epoch": 0.29064039408866993, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004327473408132298, | |
| "kl": 0.0014568110054824501, | |
| "learning_rate": 4.941191239339158e-05, | |
| "loss": 0.019404802471399307, | |
| "num_tokens": 17624443.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.4130047559738159, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.060594505733913846, | |
| "sampling/importance_sampling_ratio/max": 2.9980294704437256, | |
| "sampling/importance_sampling_ratio/mean": 0.9605697989463806, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.723148345947266, | |
| "sampling/sampling_logp_difference/mean": 0.18460223078727722, | |
| "step": 118, | |
| "step_time": 159.6808209202718 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2725.0, | |
| "completions/mean_length": 1359.78125, | |
| "completions/mean_terminated_length": 1177.36669921875, | |
| "completions/min_length": 365.0, | |
| "completions/min_terminated_length": 365.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.62087182700634, | |
| "epoch": 0.29310344827586204, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00276605032209502, | |
| "kl": 0.0018352215993218124, | |
| "learning_rate": 4.9401406243077e-05, | |
| "loss": 0.0054818070493638515, | |
| "num_tokens": 17806925.0, | |
| "reward": 1.21484375, | |
| "reward_std": 0.5134446024894714, | |
| "rewards/reward_func/mean": 0.1349826388888889, | |
| "rewards/reward_func/std": 0.07329034474160936, | |
| "sampling/importance_sampling_ratio/max": 2.9989259243011475, | |
| "sampling/importance_sampling_ratio/mean": 0.9490303993225098, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.999980926513672, | |
| "sampling/sampling_logp_difference/mean": 0.19710107147693634, | |
| "step": 119, | |
| "step_time": 167.16362278792076 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3621.0, | |
| "completions/mean_length": 1254.90625, | |
| "completions/mean_terminated_length": 1209.8095703125, | |
| "completions/min_length": 371.0, | |
| "completions/min_terminated_length": 371.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7221937924623489, | |
| "epoch": 0.2955665024630542, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00043815358357884285, | |
| "kl": 0.0011200245935469866, | |
| "learning_rate": 4.939080821280889e-05, | |
| "loss": 0.0008629357325844467, | |
| "num_tokens": 17982919.0, | |
| "reward": 1.16796875, | |
| "reward_std": 0.38331958651542664, | |
| "rewards/reward_func/mean": 0.12977430555555555, | |
| "rewards/reward_func/std": 0.04572268989351061, | |
| "sampling/importance_sampling_ratio/max": 2.9984426498413086, | |
| "sampling/importance_sampling_ratio/mean": 0.9429291486740112, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.374656677246094, | |
| "sampling/sampling_logp_difference/mean": 0.21816638112068176, | |
| "step": 120, | |
| "step_time": 134.45339812664315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2754.0, | |
| "completions/max_terminated_length": 2754.0, | |
| "completions/mean_length": 913.25, | |
| "completions/mean_terminated_length": 913.25, | |
| "completions/min_length": 320.0, | |
| "completions/min_terminated_length": 320.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.709711492061615, | |
| "epoch": 0.29802955665024633, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0036506817753482997, | |
| "kl": 0.0010524207900743932, | |
| "learning_rate": 4.9380118342492596e-05, | |
| "loss": 0.029409902170300484, | |
| "num_tokens": 18122679.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.28692469000816345, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.04380870693259769, | |
| "sampling/importance_sampling_ratio/max": 2.9977598190307617, | |
| "sampling/importance_sampling_ratio/mean": 0.9520186185836792, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.124889373779297, | |
| "sampling/sampling_logp_difference/mean": 0.20045886933803558, | |
| "step": 121, | |
| "step_time": 87.02172928815708 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3203.0, | |
| "completions/max_terminated_length": 3203.0, | |
| "completions/mean_length": 836.3125, | |
| "completions/mean_terminated_length": 838.857177734375, | |
| "completions/min_length": 128.0, | |
| "completions/min_terminated_length": 128.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.693224161863327, | |
| "epoch": 0.30049261083743845, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0030154542151378852, | |
| "kl": 0.001927026896737516, | |
| "learning_rate": 4.936933667237926e-05, | |
| "loss": 0.014132981188595295, | |
| "num_tokens": 18259051.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.34840819239616394, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.05086437861124674, | |
| "sampling/importance_sampling_ratio/max": 2.9976906776428223, | |
| "sampling/importance_sampling_ratio/mean": 0.9550529718399048, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.559123992919922, | |
| "sampling/sampling_logp_difference/mean": 0.18455424904823303, | |
| "step": 122, | |
| "step_time": 106.16042562597431 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2646.0, | |
| "completions/mean_length": 979.375, | |
| "completions/mean_terminated_length": 826.0983276367188, | |
| "completions/min_length": 182.0, | |
| "completions/min_terminated_length": 182.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6227066367864609, | |
| "epoch": 0.30295566502463056, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.003676565230234005, | |
| "kl": 0.00129530526464805, | |
| "learning_rate": 4.935846324306571e-05, | |
| "loss": 0.023714786395430565, | |
| "num_tokens": 18408915.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.2557619512081146, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.03941734631856283, | |
| "sampling/importance_sampling_ratio/max": 2.996018648147583, | |
| "sampling/importance_sampling_ratio/mean": 0.9630005359649658, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 23.409278869628906, | |
| "sampling/sampling_logp_difference/mean": 0.1686955690383911, | |
| "step": 123, | |
| "step_time": 137.6973474638071 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3606.0, | |
| "completions/mean_length": 865.578125, | |
| "completions/mean_terminated_length": 814.3016357421875, | |
| "completions/min_length": 161.0, | |
| "completions/min_terminated_length": 161.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7077045142650604, | |
| "epoch": 0.3054187192118227, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0019030142687087924, | |
| "kl": 0.0012345675058895722, | |
| "learning_rate": 4.934749809549427e-05, | |
| "loss": -8.280670590465888e-05, | |
| "num_tokens": 18543704.0, | |
| "reward": 1.015625, | |
| "reward_std": 0.21764887869358063, | |
| "rewards/reward_func/mean": 0.11284722222222222, | |
| "rewards/reward_func/std": 0.03337423337830438, | |
| "sampling/importance_sampling_ratio/max": 2.9979424476623535, | |
| "sampling/importance_sampling_ratio/mean": 0.9609706997871399, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.68747329711914, | |
| "sampling/sampling_logp_difference/mean": 0.1729896515607834, | |
| "step": 124, | |
| "step_time": 119.92689338512719 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3934.0, | |
| "completions/mean_length": 1081.171875, | |
| "completions/mean_terminated_length": 989.0491333007812, | |
| "completions/min_length": 224.0, | |
| "completions/min_terminated_length": 224.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8372175991535187, | |
| "epoch": 0.3078817733990148, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0045321344703967556, | |
| "kl": 0.0017608296184334904, | |
| "learning_rate": 4.9336441270952595e-05, | |
| "loss": -0.025809479877352715, | |
| "num_tokens": 18719891.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.4821818768978119, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.07227780090437995, | |
| "sampling/importance_sampling_ratio/max": 2.9974734783172607, | |
| "sampling/importance_sampling_ratio/mean": 0.9361357688903809, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.645075798034668, | |
| "sampling/sampling_logp_difference/mean": 0.24367718398571014, | |
| "step": 125, | |
| "step_time": 135.73762777121738 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1891.0, | |
| "completions/mean_length": 677.125, | |
| "completions/mean_terminated_length": 622.857177734375, | |
| "completions/min_length": 147.0, | |
| "completions/min_terminated_length": 147.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7165095210075378, | |
| "epoch": 0.3103448275862069, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004739101792306716, | |
| "kl": 0.001693987287580967, | |
| "learning_rate": 4.932529281107355e-05, | |
| "loss": 0.0007048757979646325, | |
| "num_tokens": 18847387.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.3830971121788025, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.053545390566190086, | |
| "sampling/importance_sampling_ratio/max": 2.9993577003479004, | |
| "sampling/importance_sampling_ratio/mean": 0.9612326622009277, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.749794006347656, | |
| "sampling/sampling_logp_difference/mean": 0.18317507207393646, | |
| "step": 126, | |
| "step_time": 112.82818150427192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3864.0, | |
| "completions/mean_length": 1438.4375, | |
| "completions/mean_terminated_length": 1213.2203369140625, | |
| "completions/min_length": 269.0, | |
| "completions/min_terminated_length": 269.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6391656994819641, | |
| "epoch": 0.312807881773399, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.007431714168948687, | |
| "kl": 0.0014243643818190321, | |
| "learning_rate": 4.931405275783507e-05, | |
| "loss": 0.019251395016908646, | |
| "num_tokens": 19034471.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.3566744029521942, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.053882877031962075, | |
| "sampling/importance_sampling_ratio/max": 2.9972875118255615, | |
| "sampling/importance_sampling_ratio/mean": 0.9524862766265869, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.582551002502441, | |
| "sampling/sampling_logp_difference/mean": 0.18535523116588593, | |
| "step": 127, | |
| "step_time": 152.86888752900995 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3641.0, | |
| "completions/mean_length": 992.078125, | |
| "completions/mean_terminated_length": 942.8095703125, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7497861087322235, | |
| "epoch": 0.31527093596059114, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0030525830028834324, | |
| "kl": 0.0023082339903339744, | |
| "learning_rate": 4.930272115355992e-05, | |
| "loss": -0.015048885717988014, | |
| "num_tokens": 19181260.0, | |
| "reward": 1.28515625, | |
| "reward_std": 0.556111752986908, | |
| "rewards/reward_func/mean": 0.1427951388888889, | |
| "rewards/reward_func/std": 0.07730624907546574, | |
| "sampling/importance_sampling_ratio/max": 2.9983325004577637, | |
| "sampling/importance_sampling_ratio/mean": 0.9502414464950562, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.081714630126953, | |
| "sampling/sampling_logp_difference/mean": 0.21059447526931763, | |
| "step": 128, | |
| "step_time": 121.74499881407246 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3749.0, | |
| "completions/max_terminated_length": 3749.0, | |
| "completions/mean_length": 1172.21875, | |
| "completions/mean_terminated_length": 1165.761962890625, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7381231337785721, | |
| "epoch": 0.31773399014778325, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.007650712047425518, | |
| "kl": 0.001185288158012554, | |
| "learning_rate": 4.929129804091562e-05, | |
| "loss": 0.06440460681915283, | |
| "num_tokens": 19359098.0, | |
| "reward": 1.2890625, | |
| "reward_std": 0.7046032547950745, | |
| "rewards/reward_func/mean": 0.14322916666666666, | |
| "rewards/reward_func/std": 0.12921956926584244, | |
| "sampling/importance_sampling_ratio/max": 2.998009204864502, | |
| "sampling/importance_sampling_ratio/mean": 0.944693386554718, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.996122360229492, | |
| "sampling/sampling_logp_difference/mean": 0.2195536196231842, | |
| "step": 129, | |
| "step_time": 115.53860899922438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3714.0, | |
| "completions/max_terminated_length": 3714.0, | |
| "completions/mean_length": 464.375, | |
| "completions/mean_terminated_length": 471.7301940917969, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 62.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6270382106304169, | |
| "epoch": 0.32019704433497537, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004284817811709977, | |
| "kl": 0.00273809939972125, | |
| "learning_rate": 4.927978346291424e-05, | |
| "loss": -0.030350536108016968, | |
| "num_tokens": 19451570.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.37679383158683777, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.05647122197681003, | |
| "sampling/importance_sampling_ratio/max": 2.9892261028289795, | |
| "sampling/importance_sampling_ratio/mean": 0.9704676270484924, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.899042129516602, | |
| "sampling/sampling_logp_difference/mean": 0.15999342501163483, | |
| "step": 130, | |
| "step_time": 99.42007652996108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2876.0, | |
| "completions/mean_length": 1104.359375, | |
| "completions/mean_terminated_length": 957.2294311523438, | |
| "completions/min_length": 303.0, | |
| "completions/min_terminated_length": 303.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6907539814710617, | |
| "epoch": 0.3226600985221675, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0029258177695383855, | |
| "kl": 0.001306317833950743, | |
| "learning_rate": 4.9268177462912255e-05, | |
| "loss": -0.0022134785540401936, | |
| "num_tokens": 19621513.0, | |
| "reward": 0.984375, | |
| "reward_std": 0.3238992393016815, | |
| "rewards/reward_func/mean": 0.109375, | |
| "rewards/reward_func/std": 0.049388562639554344, | |
| "sampling/importance_sampling_ratio/max": 2.9986183643341064, | |
| "sampling/importance_sampling_ratio/mean": 0.9501324892044067, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.441189765930176, | |
| "sampling/sampling_logp_difference/mean": 0.20469427108764648, | |
| "step": 131, | |
| "step_time": 151.08437192393467 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3990.0, | |
| "completions/mean_length": 1114.28125, | |
| "completions/mean_terminated_length": 970.050048828125, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7603419423103333, | |
| "epoch": 0.3251231527093596, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002124359777457952, | |
| "kl": 0.0012819624680560082, | |
| "learning_rate": 4.9256480084610376e-05, | |
| "loss": -0.002804091200232506, | |
| "num_tokens": 19780075.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.34947434067726135, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.053156735168562994, | |
| "sampling/importance_sampling_ratio/max": 2.9991204738616943, | |
| "sampling/importance_sampling_ratio/mean": 0.9497889876365662, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.665460586547852, | |
| "sampling/sampling_logp_difference/mean": 0.20453506708145142, | |
| "step": 132, | |
| "step_time": 129.8263506561052 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3803.0, | |
| "completions/mean_length": 986.703125, | |
| "completions/mean_terminated_length": 914.131103515625, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6631338149309158, | |
| "epoch": 0.3275862068965517, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.001555458462487788, | |
| "kl": 0.0020409352728165686, | |
| "learning_rate": 4.9244691372053376e-05, | |
| "loss": -0.024803729727864265, | |
| "num_tokens": 19921752.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.3359043300151825, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.050191783242755465, | |
| "sampling/importance_sampling_ratio/max": 2.9955570697784424, | |
| "sampling/importance_sampling_ratio/mean": 0.9567161202430725, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.687455177307129, | |
| "sampling/sampling_logp_difference/mean": 0.18763144314289093, | |
| "step": 133, | |
| "step_time": 125.27077388390899 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3702.0, | |
| "completions/mean_length": 1057.578125, | |
| "completions/mean_terminated_length": 1009.3492431640625, | |
| "completions/min_length": 166.0, | |
| "completions/min_terminated_length": 166.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7682660669088364, | |
| "epoch": 0.33004926108374383, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.009635775481836006, | |
| "kl": 0.0018410422198940068, | |
| "learning_rate": 4.9232811369629936e-05, | |
| "loss": 0.06454427540302277, | |
| "num_tokens": 20080669.0, | |
| "reward": 1.328125, | |
| "reward_std": 0.8139105439186096, | |
| "rewards/reward_func/mean": 0.14756944444444445, | |
| "rewards/reward_func/std": 0.12541627056068844, | |
| "sampling/importance_sampling_ratio/max": 2.997227430343628, | |
| "sampling/importance_sampling_ratio/mean": 0.9470077157020569, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.934889793395996, | |
| "sampling/sampling_logp_difference/mean": 0.21396198868751526, | |
| "step": 134, | |
| "step_time": 129.73883415712044 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3652.0, | |
| "completions/mean_length": 1508.015625, | |
| "completions/mean_terminated_length": 1259.2982177734375, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6884257197380066, | |
| "epoch": 0.33251231527093594, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0030280563055380292, | |
| "kl": 0.002304157940670848, | |
| "learning_rate": 4.9220840122072495e-05, | |
| "loss": 0.007324616890400648, | |
| "num_tokens": 20260622.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.3068941533565521, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.044668421149253845, | |
| "sampling/importance_sampling_ratio/max": 2.999495029449463, | |
| "sampling/importance_sampling_ratio/mean": 0.9472457766532898, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.176748275756836, | |
| "sampling/sampling_logp_difference/mean": 0.20310401916503906, | |
| "step": 135, | |
| "step_time": 131.45040617790073 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3621.0, | |
| "completions/mean_length": 1437.515625, | |
| "completions/mean_terminated_length": 1059.5, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7024633884429932, | |
| "epoch": 0.33497536945812806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0020426303305261725, | |
| "kl": 0.0019914144941139966, | |
| "learning_rate": 4.920877767445705e-05, | |
| "loss": -0.009508827701210976, | |
| "num_tokens": 20449311.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.4858488440513611, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.06965038345919715, | |
| "sampling/importance_sampling_ratio/max": 2.997471809387207, | |
| "sampling/importance_sampling_ratio/mean": 0.9443151950836182, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.433490753173828, | |
| "sampling/sampling_logp_difference/mean": 0.21754810214042664, | |
| "step": 136, | |
| "step_time": 137.67270130012184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3679.0, | |
| "completions/mean_length": 1181.625, | |
| "completions/mean_terminated_length": 987.3333740234375, | |
| "completions/min_length": 339.0, | |
| "completions/min_terminated_length": 339.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7879326194524765, | |
| "epoch": 0.3374384236453202, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0020936069842209834, | |
| "kl": 0.001667340548010543, | |
| "learning_rate": 4.919662407220299e-05, | |
| "loss": 0.006686965469270945, | |
| "num_tokens": 20614039.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.3435921370983124, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.04884182744556003, | |
| "sampling/importance_sampling_ratio/max": 2.9936232566833496, | |
| "sampling/importance_sampling_ratio/mean": 0.9455877542495728, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.05931282043457, | |
| "sampling/sampling_logp_difference/mean": 0.21757498383522034, | |
| "step": 137, | |
| "step_time": 129.50531403324567 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2448.0, | |
| "completions/mean_length": 1257.078125, | |
| "completions/mean_terminated_length": 1152.0655517578125, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.666120707988739, | |
| "epoch": 0.3399014778325123, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0015588500520508758, | |
| "kl": 0.001390039746183902, | |
| "learning_rate": 4.918437936107293e-05, | |
| "loss": -0.0016076350584626198, | |
| "num_tokens": 20793404.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.41755858063697815, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.05750518043835958, | |
| "sampling/importance_sampling_ratio/max": 2.994096517562866, | |
| "sampling/importance_sampling_ratio/mean": 0.9516454339027405, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.36985206604004, | |
| "sampling/sampling_logp_difference/mean": 0.19270411133766174, | |
| "step": 138, | |
| "step_time": 128.2147407066077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2123.0, | |
| "completions/mean_length": 1000.9375, | |
| "completions/mean_terminated_length": 810.559326171875, | |
| "completions/min_length": 153.0, | |
| "completions/min_terminated_length": 153.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7744076550006866, | |
| "epoch": 0.34236453201970446, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.001767253547338134, | |
| "kl": 0.0015895857068244368, | |
| "learning_rate": 4.9172043587172564e-05, | |
| "loss": -0.018749739974737167, | |
| "num_tokens": 20938440.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.2991959750652313, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.04486183987723456, | |
| "sampling/importance_sampling_ratio/max": 2.9982097148895264, | |
| "sampling/importance_sampling_ratio/mean": 0.9536800384521484, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.281064987182617, | |
| "sampling/sampling_logp_difference/mean": 0.20259705185890198, | |
| "step": 139, | |
| "step_time": 182.23772311606444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3620.0, | |
| "completions/mean_length": 1145.375, | |
| "completions/mean_terminated_length": 1098.539794921875, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.670438826084137, | |
| "epoch": 0.3448275862068966, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0004879941470283451, | |
| "kl": 0.0015858457772992551, | |
| "learning_rate": 4.915961679695046e-05, | |
| "loss": -0.0004922771477140486, | |
| "num_tokens": 21103968.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.33627331256866455, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.04050926036304898, | |
| "sampling/importance_sampling_ratio/max": 2.9990146160125732, | |
| "sampling/importance_sampling_ratio/mean": 0.9465623497962952, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.31145477294922, | |
| "sampling/sampling_logp_difference/mean": 0.20568417012691498, | |
| "step": 140, | |
| "step_time": 119.32564870314673 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3744.0, | |
| "completions/max_terminated_length": 3744.0, | |
| "completions/mean_length": 932.421875, | |
| "completions/mean_terminated_length": 932.825439453125, | |
| "completions/min_length": 255.0, | |
| "completions/min_terminated_length": 255.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6941824108362198, | |
| "epoch": 0.3472906403940887, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0012234709661190007, | |
| "kl": 0.0026189969503320754, | |
| "learning_rate": 4.914709903719788e-05, | |
| "loss": 0.015446648001670837, | |
| "num_tokens": 21242299.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.3645833432674408, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.04644498642947939, | |
| "sampling/importance_sampling_ratio/max": 2.997051954269409, | |
| "sampling/importance_sampling_ratio/mean": 0.9586159586906433, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.005863189697266, | |
| "sampling/sampling_logp_difference/mean": 0.18264621496200562, | |
| "step": 141, | |
| "step_time": 141.0135326378513 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3652.0, | |
| "completions/max_terminated_length": 3652.0, | |
| "completions/mean_length": 636.125, | |
| "completions/mean_terminated_length": 636.125, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7068347334861755, | |
| "epoch": 0.3497536945812808, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0007979245291869645, | |
| "kl": 0.0020473987387958914, | |
| "learning_rate": 4.913449035504865e-05, | |
| "loss": -0.0025128277484327555, | |
| "num_tokens": 21377475.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.25268277525901794, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.03302617081337505, | |
| "sampling/importance_sampling_ratio/max": 2.999530553817749, | |
| "sampling/importance_sampling_ratio/mean": 0.9572858214378357, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.087381362915039, | |
| "sampling/sampling_logp_difference/mean": 0.18261964619159698, | |
| "step": 142, | |
| "step_time": 107.95403501321562 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3583.0, | |
| "completions/mean_length": 839.75, | |
| "completions/mean_terminated_length": 788.758056640625, | |
| "completions/min_length": 159.0, | |
| "completions/min_terminated_length": 159.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7018862813711166, | |
| "epoch": 0.3522167487684729, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002889781139543682, | |
| "kl": 0.0021695691102650017, | |
| "learning_rate": 4.912179079797892e-05, | |
| "loss": 0.0009456706466153264, | |
| "num_tokens": 21512947.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.2998170256614685, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.04724570612112681, | |
| "sampling/importance_sampling_ratio/max": 2.992983102798462, | |
| "sampling/importance_sampling_ratio/mean": 0.9585220217704773, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.02216911315918, | |
| "sampling/sampling_logp_difference/mean": 0.18149858713150024, | |
| "step": 143, | |
| "step_time": 129.82384162512608 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2424.0, | |
| "completions/mean_length": 869.0625, | |
| "completions/mean_terminated_length": 782.1451416015625, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6957240998744965, | |
| "epoch": 0.35467980295566504, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.001834099632040615, | |
| "kl": 0.0014171720249578357, | |
| "learning_rate": 4.910900041380703e-05, | |
| "loss": -0.017629800364375114, | |
| "num_tokens": 21655623.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.3249503970146179, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.04804881579346127, | |
| "sampling/importance_sampling_ratio/max": 2.988936185836792, | |
| "sampling/importance_sampling_ratio/mean": 0.9553591012954712, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.53939437866211, | |
| "sampling/sampling_logp_difference/mean": 0.18938414752483368, | |
| "step": 144, | |
| "step_time": 164.86258218903095 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3388.0, | |
| "completions/mean_length": 1086.5625, | |
| "completions/mean_terminated_length": 975.6229248046875, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7083845734596252, | |
| "epoch": 0.35714285714285715, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.007941490291629926, | |
| "kl": 0.0013565148983616382, | |
| "learning_rate": 4.909611925069332e-05, | |
| "loss": 0.014829241670668125, | |
| "num_tokens": 21809019.0, | |
| "reward": 1.2734375, | |
| "reward_std": 0.8246195912361145, | |
| "rewards/reward_func/mean": 0.14149305555555555, | |
| "rewards/reward_func/std": 0.11937192496326235, | |
| "sampling/importance_sampling_ratio/max": 2.999521017074585, | |
| "sampling/importance_sampling_ratio/mean": 0.9506230354309082, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.645586013793945, | |
| "sampling/sampling_logp_difference/mean": 0.20339885354042053, | |
| "step": 145, | |
| "step_time": 168.08624598686583 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3746.0, | |
| "completions/mean_length": 1037.734375, | |
| "completions/mean_terminated_length": 891.016357421875, | |
| "completions/min_length": 159.0, | |
| "completions/min_terminated_length": 159.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7873726338148117, | |
| "epoch": 0.35960591133004927, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0023097908581655286, | |
| "kl": 0.003050926752621308, | |
| "learning_rate": 4.9083147357139936e-05, | |
| "loss": 0.007973194122314453, | |
| "num_tokens": 21970106.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.3709896206855774, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.05479054152965546, | |
| "sampling/importance_sampling_ratio/max": 2.9961843490600586, | |
| "sampling/importance_sampling_ratio/mean": 0.9499616622924805, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.426872253417969, | |
| "sampling/sampling_logp_difference/mean": 0.20822127163410187, | |
| "step": 146, | |
| "step_time": 171.15286494023167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3201.0, | |
| "completions/max_terminated_length": 3201.0, | |
| "completions/mean_length": 1070.609375, | |
| "completions/mean_terminated_length": 1070.609375, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.639614000916481, | |
| "epoch": 0.3620689655172414, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0035868637404805775, | |
| "kl": 0.0013124027755111456, | |
| "learning_rate": 4.9070084781990655e-05, | |
| "loss": 0.0779402107000351, | |
| "num_tokens": 22122833.0, | |
| "reward": 1.27734375, | |
| "reward_std": 0.7252766489982605, | |
| "rewards/reward_func/mean": 0.14192708333333334, | |
| "rewards/reward_func/std": 0.10532407628165351, | |
| "sampling/importance_sampling_ratio/max": 2.9976370334625244, | |
| "sampling/importance_sampling_ratio/mean": 0.9555914402008057, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.999216079711914, | |
| "sampling/sampling_logp_difference/mean": 0.18233312666416168, | |
| "step": 147, | |
| "step_time": 92.5367358867079 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3682.0, | |
| "completions/mean_length": 1106.71875, | |
| "completions/mean_terminated_length": 1010.290283203125, | |
| "completions/min_length": 309.0, | |
| "completions/min_terminated_length": 309.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7598123103380203, | |
| "epoch": 0.3645320197044335, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0006033462364919338, | |
| "kl": 0.0015440615534316748, | |
| "learning_rate": 4.905693157443072e-05, | |
| "loss": -6.69892760924995e-05, | |
| "num_tokens": 22281695.0, | |
| "reward": 1.1015625, | |
| "reward_std": 0.3203382194042206, | |
| "rewards/reward_func/mean": 0.12239583333333333, | |
| "rewards/reward_func/std": 0.03982427467902502, | |
| "sampling/importance_sampling_ratio/max": 2.9990968704223633, | |
| "sampling/importance_sampling_ratio/mean": 0.9494574666023254, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.545462608337402, | |
| "sampling/sampling_logp_difference/mean": 0.21209058165550232, | |
| "step": 148, | |
| "step_time": 171.81504101189785 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3808.0, | |
| "completions/mean_length": 1134.984375, | |
| "completions/mean_terminated_length": 1027.7540283203125, | |
| "completions/min_length": 219.0, | |
| "completions/min_terminated_length": 219.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6977289170026779, | |
| "epoch": 0.3669950738916256, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.001889963426633302, | |
| "kl": 0.0013170290039852262, | |
| "learning_rate": 4.904368778398662e-05, | |
| "loss": 0.007757263723760843, | |
| "num_tokens": 22433710.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.3463779389858246, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.05063716073830923, | |
| "sampling/importance_sampling_ratio/max": 2.9935598373413086, | |
| "sampling/importance_sampling_ratio/mean": 0.9530324935913086, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.762185096740723, | |
| "sampling/sampling_logp_difference/mean": 0.18600577116012573, | |
| "step": 149, | |
| "step_time": 124.45693185203709 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3848.0, | |
| "completions/max_terminated_length": 3848.0, | |
| "completions/mean_length": 987.8125, | |
| "completions/mean_terminated_length": 977.4917602539062, | |
| "completions/min_length": 153.0, | |
| "completions/min_terminated_length": 153.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7574534267187119, | |
| "epoch": 0.3694581280788177, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002926058083550387, | |
| "kl": 0.001403219037456438, | |
| "learning_rate": 4.903035346052593e-05, | |
| "loss": 0.0012572875712066889, | |
| "num_tokens": 22592034.0, | |
| "reward": 1.01171875, | |
| "reward_std": 0.27251651883125305, | |
| "rewards/reward_func/mean": 0.11241319444444445, | |
| "rewards/reward_func/std": 0.041424840688705444, | |
| "sampling/importance_sampling_ratio/max": 2.9942948818206787, | |
| "sampling/importance_sampling_ratio/mean": 0.9399828910827637, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.540233612060547, | |
| "sampling/sampling_logp_difference/mean": 0.23285743594169617, | |
| "step": 150, | |
| "step_time": 125.10149595234543 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2618.0, | |
| "completions/mean_length": 1055.484375, | |
| "completions/mean_terminated_length": 965.016357421875, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6921143233776093, | |
| "epoch": 0.37192118226600984, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0003847117777456799, | |
| "kl": 0.001394979510223493, | |
| "learning_rate": 4.9016928654257096e-05, | |
| "loss": -0.0016071898862719536, | |
| "num_tokens": 22765713.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.2734251022338867, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.03352663583225674, | |
| "sampling/importance_sampling_ratio/max": 2.999713182449341, | |
| "sampling/importance_sampling_ratio/mean": 0.9436339139938354, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.68712329864502, | |
| "sampling/sampling_logp_difference/mean": 0.21123811602592468, | |
| "step": 151, | |
| "step_time": 180.94982343283482 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2617.0, | |
| "completions/mean_length": 864.890625, | |
| "completions/mean_terminated_length": 813.6032104492188, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7735969722270966, | |
| "epoch": 0.37438423645320196, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0005504224354699446, | |
| "kl": 0.001623089803615585, | |
| "learning_rate": 4.9003413415729295e-05, | |
| "loss": -0.0025445527862757444, | |
| "num_tokens": 22903450.0, | |
| "reward": 1.1328125, | |
| "reward_std": 0.35626131296157837, | |
| "rewards/reward_func/mean": 0.12586805555555555, | |
| "rewards/reward_func/std": 0.043802719149324626, | |
| "sampling/importance_sampling_ratio/max": 2.998650312423706, | |
| "sampling/importance_sampling_ratio/mean": 0.9493921995162964, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.77414608001709, | |
| "sampling/sampling_logp_difference/mean": 0.21335291862487793, | |
| "step": 152, | |
| "step_time": 125.38044445589185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2847.0, | |
| "completions/mean_length": 1196.296875, | |
| "completions/mean_terminated_length": 1143.4031982421875, | |
| "completions/min_length": 56.0, | |
| "completions/min_terminated_length": 56.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.741541400551796, | |
| "epoch": 0.3768472906403941, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0024813468622562054, | |
| "kl": 0.0012266744452062994, | |
| "learning_rate": 4.898980779583218e-05, | |
| "loss": -0.013216846622526646, | |
| "num_tokens": 23078717.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.4151759147644043, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.05974882344404856, | |
| "sampling/importance_sampling_ratio/max": 2.998521327972412, | |
| "sampling/importance_sampling_ratio/mean": 0.9421178102493286, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.311586380004883, | |
| "sampling/sampling_logp_difference/mean": 0.2195492386817932, | |
| "step": 153, | |
| "step_time": 141.04792174184695 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3065.0, | |
| "completions/max_terminated_length": 3065.0, | |
| "completions/mean_length": 1088.75, | |
| "completions/mean_terminated_length": 1098.508056640625, | |
| "completions/min_length": 218.0, | |
| "completions/min_terminated_length": 218.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7633115500211716, | |
| "epoch": 0.3793103448275862, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004701491570482404, | |
| "kl": 0.0012460256984923035, | |
| "learning_rate": 4.897611184579575e-05, | |
| "loss": 0.05346252769231796, | |
| "num_tokens": 23238861.0, | |
| "reward": 1.29296875, | |
| "reward_std": 0.7299634218215942, | |
| "rewards/reward_func/mean": 0.14366319444444445, | |
| "rewards/reward_func/std": 0.1064673662185669, | |
| "sampling/importance_sampling_ratio/max": 2.9990603923797607, | |
| "sampling/importance_sampling_ratio/mean": 0.9426702857017517, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.55482292175293, | |
| "sampling/sampling_logp_difference/mean": 0.22415803372859955, | |
| "step": 154, | |
| "step_time": 100.15680134599097 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2150.0, | |
| "completions/max_terminated_length": 2150.0, | |
| "completions/mean_length": 798.84375, | |
| "completions/mean_terminated_length": 798.84375, | |
| "completions/min_length": 127.0, | |
| "completions/min_terminated_length": 127.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7754421979188919, | |
| "epoch": 0.3817733990147783, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0007325242776395226, | |
| "kl": 0.00122224964434281, | |
| "learning_rate": 4.896232561719011e-05, | |
| "loss": 0.0007550335139967501, | |
| "num_tokens": 23365331.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.33627331256866455, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.04050926036304898, | |
| "sampling/importance_sampling_ratio/max": 2.996500015258789, | |
| "sampling/importance_sampling_ratio/mean": 0.9564509391784668, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.966944694519043, | |
| "sampling/sampling_logp_difference/mean": 0.1989564299583435, | |
| "step": 155, | |
| "step_time": 84.85235846112482 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3084.0, | |
| "completions/mean_length": 1392.203125, | |
| "completions/mean_terminated_length": 1212.5762939453125, | |
| "completions/min_length": 111.0, | |
| "completions/min_terminated_length": 111.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6894625276327133, | |
| "epoch": 0.3842364532019704, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0007756937710466033, | |
| "kl": 0.0011820744257420301, | |
| "learning_rate": 4.8948449161925304e-05, | |
| "loss": -0.0026572286151349545, | |
| "num_tokens": 23533808.0, | |
| "reward": 1.16015625, | |
| "reward_std": 0.3892585337162018, | |
| "rewards/reward_func/mean": 0.12890625, | |
| "rewards/reward_func/std": 0.04816830199625757, | |
| "sampling/importance_sampling_ratio/max": 2.997208833694458, | |
| "sampling/importance_sampling_ratio/mean": 0.9468032121658325, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.121822357177734, | |
| "sampling/sampling_logp_difference/mean": 0.20983976125717163, | |
| "step": 156, | |
| "step_time": 161.84404824208468 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3994.0, | |
| "completions/mean_length": 1185.328125, | |
| "completions/mean_terminated_length": 1042.1802978515625, | |
| "completions/min_length": 168.0, | |
| "completions/min_terminated_length": 168.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6832298785448074, | |
| "epoch": 0.3866995073891626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0017762955728373934, | |
| "kl": 0.0010904025693889707, | |
| "learning_rate": 4.893448253225111e-05, | |
| "loss": 0.003249376080930233, | |
| "num_tokens": 23689877.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.31140682101249695, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.04530912637710571, | |
| "sampling/importance_sampling_ratio/max": 2.9998233318328857, | |
| "sampling/importance_sampling_ratio/mean": 0.95380699634552, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.548775672912598, | |
| "sampling/sampling_logp_difference/mean": 0.18981406092643738, | |
| "step": 157, | |
| "step_time": 126.51651544007473 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3857.0, | |
| "completions/mean_length": 1334.03125, | |
| "completions/mean_terminated_length": 1290.1905517578125, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.8318304419517517, | |
| "epoch": 0.3891625615763547, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0006476170158019754, | |
| "kl": 0.0011377888440620154, | |
| "learning_rate": 4.892042578075685e-05, | |
| "loss": 0.002958628349006176, | |
| "num_tokens": 23868615.0, | |
| "reward": 1.1796875, | |
| "reward_std": 0.38951727747917175, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.048582213620344795, | |
| "sampling/importance_sampling_ratio/max": 2.9970788955688477, | |
| "sampling/importance_sampling_ratio/mean": 0.9396538734436035, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.113051414489746, | |
| "sampling/sampling_logp_difference/mean": 0.2308649867773056, | |
| "step": 158, | |
| "step_time": 133.94302169582807 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3591.0, | |
| "completions/mean_length": 1060.359375, | |
| "completions/mean_terminated_length": 941.6834106445312, | |
| "completions/min_length": 270.0, | |
| "completions/min_terminated_length": 270.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6752626597881317, | |
| "epoch": 0.3916256157635468, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.001742199637816057, | |
| "kl": 0.0013848040252923965, | |
| "learning_rate": 4.8906278960371176e-05, | |
| "loss": 0.004247048869729042, | |
| "num_tokens": 24020910.0, | |
| "reward": 1.109375, | |
| "reward_std": 0.3417827785015106, | |
| "rewards/reward_func/mean": 0.1232638888888889, | |
| "rewards/reward_func/std": 0.04796475751532449, | |
| "sampling/importance_sampling_ratio/max": 2.994645357131958, | |
| "sampling/importance_sampling_ratio/mean": 0.9534465074539185, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.727479934692383, | |
| "sampling/sampling_logp_difference/mean": 0.196367084980011, | |
| "step": 159, | |
| "step_time": 114.65341229783371 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3619.0, | |
| "completions/mean_length": 1137.125, | |
| "completions/mean_terminated_length": 1041.6773681640625, | |
| "completions/min_length": 235.0, | |
| "completions/min_terminated_length": 235.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7267605811357498, | |
| "epoch": 0.39408866995073893, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016509462080566294, | |
| "kl": 0.0013555605837609619, | |
| "learning_rate": 4.889204212436189e-05, | |
| "loss": 0.003579859621822834, | |
| "num_tokens": 24188486.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.24035847187042236, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.03507047891616821, | |
| "sampling/importance_sampling_ratio/max": 2.99871563911438, | |
| "sampling/importance_sampling_ratio/mean": 0.9451822638511658, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.253337860107422, | |
| "sampling/sampling_logp_difference/mean": 0.21290577948093414, | |
| "step": 160, | |
| "step_time": 127.22938701603562 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3007.0, | |
| "completions/mean_length": 1078.546875, | |
| "completions/mean_terminated_length": 989.4425659179688, | |
| "completions/min_length": 123.0, | |
| "completions/min_terminated_length": 123.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7306022793054581, | |
| "epoch": 0.39655172413793105, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.004243209978439369, | |
| "kl": 0.001163211651146412, | |
| "learning_rate": 4.8877715326335735e-05, | |
| "loss": 0.04088365659117699, | |
| "num_tokens": 24345497.0, | |
| "reward": 1.1875, | |
| "reward_std": 0.6871842741966248, | |
| "rewards/reward_func/mean": 0.13194444444444445, | |
| "rewards/reward_func/std": 0.10439738300111559, | |
| "sampling/importance_sampling_ratio/max": 2.997098922729492, | |
| "sampling/importance_sampling_ratio/mean": 0.9458619356155396, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.110089302062988, | |
| "sampling/sampling_logp_difference/mean": 0.20933616161346436, | |
| "step": 161, | |
| "step_time": 176.95277623389848 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2682.0, | |
| "completions/mean_length": 613.265625, | |
| "completions/mean_terminated_length": 557.984130859375, | |
| "completions/min_length": 100.0, | |
| "completions/min_terminated_length": 100.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6799998581409454, | |
| "epoch": 0.39901477832512317, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0033757850857375773, | |
| "kl": 0.002622713363962248, | |
| "learning_rate": 4.886329862023818e-05, | |
| "loss": -0.01957223378121853, | |
| "num_tokens": 24457466.0, | |
| "reward": 1.015625, | |
| "reward_std": 0.3532024621963501, | |
| "rewards/reward_func/mean": 0.11284722222222222, | |
| "rewards/reward_func/std": 0.05273487501674228, | |
| "sampling/importance_sampling_ratio/max": 2.9935693740844727, | |
| "sampling/importance_sampling_ratio/mean": 0.9666866064071655, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.023531913757324, | |
| "sampling/sampling_logp_difference/mean": 0.16631880402565002, | |
| "step": 162, | |
| "step_time": 128.8812639042735 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3618.0, | |
| "completions/mean_length": 785.078125, | |
| "completions/mean_terminated_length": 678.274169921875, | |
| "completions/min_length": 237.0, | |
| "completions/min_terminated_length": 237.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6376421004533768, | |
| "epoch": 0.4014778325123153, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0005021024269253694, | |
| "kl": 0.001497179502621293, | |
| "learning_rate": 4.884879206035324e-05, | |
| "loss": -0.000561786990147084, | |
| "num_tokens": 24586111.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.18213215470314026, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.02435668061176936, | |
| "sampling/importance_sampling_ratio/max": 2.993292808532715, | |
| "sampling/importance_sampling_ratio/mean": 0.9647999405860901, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.675369262695312, | |
| "sampling/sampling_logp_difference/mean": 0.15786173939704895, | |
| "step": 163, | |
| "step_time": 180.46688992506824 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3760.0, | |
| "completions/max_terminated_length": 3760.0, | |
| "completions/mean_length": 726.25, | |
| "completions/mean_terminated_length": 731.6032104492188, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6761015057563782, | |
| "epoch": 0.4039408866995074, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0021193524478002454, | |
| "kl": 0.0015740302333142608, | |
| "learning_rate": 4.883419570130327e-05, | |
| "loss": -0.017717270180583, | |
| "num_tokens": 24721503.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.4247541129589081, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.060799873537487455, | |
| "sampling/importance_sampling_ratio/max": 2.9982118606567383, | |
| "sampling/importance_sampling_ratio/mean": 0.9577165842056274, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.686821937561035, | |
| "sampling/sampling_logp_difference/mean": 0.17808575928211212, | |
| "step": 164, | |
| "step_time": 120.17617022292688 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3677.0, | |
| "completions/mean_length": 1376.09375, | |
| "completions/mean_terminated_length": 1288.3548583984375, | |
| "completions/min_length": 313.0, | |
| "completions/min_terminated_length": 313.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7354206442832947, | |
| "epoch": 0.4064039408866995, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0008399744445777652, | |
| "kl": 0.0009593560826033354, | |
| "learning_rate": 4.881950959804874e-05, | |
| "loss": -0.0010941341752186418, | |
| "num_tokens": 24899525.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.29703810811042786, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.04189008225997289, | |
| "sampling/importance_sampling_ratio/max": 2.9984233379364014, | |
| "sampling/importance_sampling_ratio/mean": 0.9466732740402222, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.314602851867676, | |
| "sampling/sampling_logp_difference/mean": 0.2059394121170044, | |
| "step": 165, | |
| "step_time": 174.922209485434 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2007.0, | |
| "completions/mean_length": 842.625, | |
| "completions/mean_terminated_length": 737.6773681640625, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7169460654258728, | |
| "epoch": 0.4088669950738916, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0066154200174927385, | |
| "kl": 0.0014329378900583833, | |
| "learning_rate": 4.8804733805888024e-05, | |
| "loss": 0.002477045636624098, | |
| "num_tokens": 25029053.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.4606105387210846, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.07763891460167037, | |
| "sampling/importance_sampling_ratio/max": 2.997968912124634, | |
| "sampling/importance_sampling_ratio/mean": 0.9603346586227417, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.809807777404785, | |
| "sampling/sampling_logp_difference/mean": 0.18966203927993774, | |
| "step": 166, | |
| "step_time": 116.80163077195175 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3102.0, | |
| "completions/mean_length": 1133.796875, | |
| "completions/mean_terminated_length": 988.1146850585938, | |
| "completions/min_length": 241.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7492803931236267, | |
| "epoch": 0.41133004926108374, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0032851832185571083, | |
| "kl": 0.001638269837712869, | |
| "learning_rate": 4.8789868380457246e-05, | |
| "loss": 0.040263283997774124, | |
| "num_tokens": 25182896.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.6889752745628357, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.09606481591860454, | |
| "sampling/importance_sampling_ratio/max": 2.99403715133667, | |
| "sampling/importance_sampling_ratio/mean": 0.9468588829040527, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.79487133026123, | |
| "sampling/sampling_logp_difference/mean": 0.20979991555213928, | |
| "step": 167, | |
| "step_time": 133.14356829575263 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4063.0, | |
| "completions/mean_length": 911.078125, | |
| "completions/mean_terminated_length": 824.11669921875, | |
| "completions/min_length": 84.0, | |
| "completions/min_terminated_length": 84.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7148732990026474, | |
| "epoch": 0.41379310344827586, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0007135211241037537, | |
| "kl": 0.0016217580414377153, | |
| "learning_rate": 4.8774913377729994e-05, | |
| "loss": -0.01211149524897337, | |
| "num_tokens": 25323061.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.27174752950668335, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.038944005138344236, | |
| "sampling/importance_sampling_ratio/max": 2.9920668601989746, | |
| "sampling/importance_sampling_ratio/mean": 0.9543583393096924, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.914791107177734, | |
| "sampling/sampling_logp_difference/mean": 0.19774362444877625, | |
| "step": 168, | |
| "step_time": 131.8124701383058 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3150.0, | |
| "completions/mean_length": 903.328125, | |
| "completions/mean_terminated_length": 858.4031982421875, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7233236730098724, | |
| "epoch": 0.41625615763546797, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008541288810641963, | |
| "kl": 0.001499377453001216, | |
| "learning_rate": 4.875986885401717e-05, | |
| "loss": 0.11034619063138962, | |
| "num_tokens": 25464666.0, | |
| "reward": 1.234375, | |
| "reward_std": 0.8795234560966492, | |
| "rewards/reward_func/mean": 0.1371527777777778, | |
| "rewards/reward_func/std": 0.1389351338148117, | |
| "sampling/importance_sampling_ratio/max": 2.998958110809326, | |
| "sampling/importance_sampling_ratio/mean": 0.9543308615684509, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.080719947814941, | |
| "sampling/sampling_logp_difference/mean": 0.19305600225925446, | |
| "step": 169, | |
| "step_time": 124.94637063192204 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3017.0, | |
| "completions/mean_length": 1323.953125, | |
| "completions/mean_terminated_length": 1151.1016845703125, | |
| "completions/min_length": 146.0, | |
| "completions/min_terminated_length": 146.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7630322575569153, | |
| "epoch": 0.4187192118226601, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01083484242329808, | |
| "kl": 0.0010297257540514693, | |
| "learning_rate": 4.874473486596672e-05, | |
| "loss": -0.028323577716946602, | |
| "num_tokens": 25652839.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.6040127277374268, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.1057632068792979, | |
| "sampling/importance_sampling_ratio/max": 2.9976863861083984, | |
| "sampling/importance_sampling_ratio/mean": 0.9415475726127625, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.453205108642578, | |
| "sampling/sampling_logp_difference/mean": 0.2272610068321228, | |
| "step": 170, | |
| "step_time": 133.1720853582956 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2148.0, | |
| "completions/mean_length": 935.453125, | |
| "completions/mean_terminated_length": 885.2857666015625, | |
| "completions/min_length": 233.0, | |
| "completions/min_terminated_length": 233.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7636792063713074, | |
| "epoch": 0.4211822660098522, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0028875010309662576, | |
| "kl": 0.0013721904251724482, | |
| "learning_rate": 4.8729511470563514e-05, | |
| "loss": 0.0062779695726931095, | |
| "num_tokens": 25809700.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.33481812477111816, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.045880657931168876, | |
| "sampling/importance_sampling_ratio/max": 2.9988179206848145, | |
| "sampling/importance_sampling_ratio/mean": 0.9457703828811646, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.243955612182617, | |
| "sampling/sampling_logp_difference/mean": 0.22251084446907043, | |
| "step": 171, | |
| "step_time": 123.20627958187833 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3317.0, | |
| "completions/max_terminated_length": 3317.0, | |
| "completions/mean_length": 950.453125, | |
| "completions/mean_terminated_length": 956.4286499023438, | |
| "completions/min_length": 262.0, | |
| "completions/min_terminated_length": 262.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6344201117753983, | |
| "epoch": 0.4236453201970443, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0010320451728027538, | |
| "kl": 0.0010279046691721305, | |
| "learning_rate": 4.871419872512901e-05, | |
| "loss": 0.010371055454015732, | |
| "num_tokens": 25955393.0, | |
| "reward": 1.12890625, | |
| "reward_std": 0.34787389636039734, | |
| "rewards/reward_func/mean": 0.1254340277777778, | |
| "rewards/reward_func/std": 0.044849217351939946, | |
| "sampling/importance_sampling_ratio/max": 2.9972174167633057, | |
| "sampling/importance_sampling_ratio/mean": 0.9531108140945435, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.62364387512207, | |
| "sampling/sampling_logp_difference/mean": 0.1808897852897644, | |
| "step": 172, | |
| "step_time": 121.73163252789527 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3917.0, | |
| "completions/max_terminated_length": 3917.0, | |
| "completions/mean_length": 753.5625, | |
| "completions/mean_terminated_length": 761.4603881835938, | |
| "completions/min_length": 106.0, | |
| "completions/min_terminated_length": 106.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7194567173719406, | |
| "epoch": 0.42610837438423643, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009213834449335897, | |
| "kl": 0.0021812406193930656, | |
| "learning_rate": 4.869879668732115e-05, | |
| "loss": -0.011229299008846283, | |
| "num_tokens": 26083653.0, | |
| "reward": 0.98828125, | |
| "reward_std": 0.3131689429283142, | |
| "rewards/reward_func/mean": 0.10980902777777778, | |
| "rewards/reward_func/std": 0.04778718948364258, | |
| "sampling/importance_sampling_ratio/max": 2.999462127685547, | |
| "sampling/importance_sampling_ratio/mean": 0.9515129327774048, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.498061180114746, | |
| "sampling/sampling_logp_difference/mean": 0.19364528357982635, | |
| "step": 173, | |
| "step_time": 141.7643264059443 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3441.0, | |
| "completions/mean_length": 1269.9375, | |
| "completions/mean_terminated_length": 977.586181640625, | |
| "completions/min_length": 269.0, | |
| "completions/min_terminated_length": 269.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7887077778577805, | |
| "epoch": 0.42857142857142855, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0010582864632042994, | |
| "kl": 0.0011124948505312204, | |
| "learning_rate": 4.868330541513405e-05, | |
| "loss": -0.012901953421533108, | |
| "num_tokens": 26256273.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3592725396156311, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.052181267076068454, | |
| "sampling/importance_sampling_ratio/max": 2.9990017414093018, | |
| "sampling/importance_sampling_ratio/mean": 0.9508888721466064, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.885161399841309, | |
| "sampling/sampling_logp_difference/mean": 0.21054230630397797, | |
| "step": 174, | |
| "step_time": 198.68682994507253 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3267.0, | |
| "completions/max_terminated_length": 3267.0, | |
| "completions/mean_length": 818.140625, | |
| "completions/mean_terminated_length": 818.140625, | |
| "completions/min_length": 103.0, | |
| "completions/min_terminated_length": 103.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6518329381942749, | |
| "epoch": 0.43103448275862066, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002988795659873789, | |
| "kl": 0.00412775349104777, | |
| "learning_rate": 4.866772496689787e-05, | |
| "loss": 0.003858533687889576, | |
| "num_tokens": 26388170.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.3288387358188629, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.05097940398587121, | |
| "sampling/importance_sampling_ratio/max": 2.997185230255127, | |
| "sampling/importance_sampling_ratio/mean": 0.9601922035217285, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.480610847473145, | |
| "sampling/sampling_logp_difference/mean": 0.1721549779176712, | |
| "step": 175, | |
| "step_time": 114.63128752307966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3185.0, | |
| "completions/mean_length": 1045.515625, | |
| "completions/mean_terminated_length": 895.4917602539062, | |
| "completions/min_length": 92.0, | |
| "completions/min_terminated_length": 92.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7153588682413101, | |
| "epoch": 0.43349753694581283, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004204537629622419, | |
| "kl": 0.001135994476499036, | |
| "learning_rate": 4.865205540127851e-05, | |
| "loss": 0.0316590741276741, | |
| "num_tokens": 26534491.0, | |
| "reward": 1.21484375, | |
| "reward_std": 0.6970276832580566, | |
| "rewards/reward_func/mean": 0.1349826388888889, | |
| "rewards/reward_func/std": 0.09969028168254429, | |
| "sampling/importance_sampling_ratio/max": 2.9958765506744385, | |
| "sampling/importance_sampling_ratio/mean": 0.9529693126678467, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.622081756591797, | |
| "sampling/sampling_logp_difference/mean": 0.19626040756702423, | |
| "step": 176, | |
| "step_time": 188.66510831192136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3936.0, | |
| "completions/mean_length": 1035.765625, | |
| "completions/mean_terminated_length": 937.04833984375, | |
| "completions/min_length": 151.0, | |
| "completions/min_terminated_length": 151.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7687556147575378, | |
| "epoch": 0.43596059113300495, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005146910057299965, | |
| "kl": 0.0013427632511593401, | |
| "learning_rate": 4.863629677727745e-05, | |
| "loss": 0.013107987120747566, | |
| "num_tokens": 26678796.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.5730383992195129, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.09791860481103261, | |
| "sampling/importance_sampling_ratio/max": 2.995704412460327, | |
| "sampling/importance_sampling_ratio/mean": 0.9536083340644836, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.321060180664062, | |
| "sampling/sampling_logp_difference/mean": 0.1994103193283081, | |
| "step": 177, | |
| "step_time": 122.20711262966506 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2110.0, | |
| "completions/mean_length": 889.65625, | |
| "completions/mean_terminated_length": 786.2294921875, | |
| "completions/min_length": 271.0, | |
| "completions/min_terminated_length": 271.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7085517942905426, | |
| "epoch": 0.43842364532019706, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00044274036412802093, | |
| "kl": 0.001734752906486392, | |
| "learning_rate": 4.862044915423149e-05, | |
| "loss": -0.007765084970742464, | |
| "num_tokens": 26822902.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.39308255910873413, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.054551392793655396, | |
| "sampling/importance_sampling_ratio/max": 2.999730348587036, | |
| "sampling/importance_sampling_ratio/mean": 0.9586660861968994, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.441228866577148, | |
| "sampling/sampling_logp_difference/mean": 0.18852296471595764, | |
| "step": 178, | |
| "step_time": 135.35360544803552 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3859.0, | |
| "completions/mean_length": 951.46875, | |
| "completions/mean_terminated_length": 844.7212524414062, | |
| "completions/min_length": 176.0, | |
| "completions/min_terminated_length": 176.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6623150259256363, | |
| "epoch": 0.4408866995073892, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0013933160245554325, | |
| "kl": 0.0018012767832260579, | |
| "learning_rate": 4.860451259181259e-05, | |
| "loss": -0.001757708378136158, | |
| "num_tokens": 26970756.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.33694103360176086, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.04956694609589047, | |
| "sampling/importance_sampling_ratio/max": 2.996303081512451, | |
| "sampling/importance_sampling_ratio/mean": 0.9572619795799255, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.23563003540039, | |
| "sampling/sampling_logp_difference/mean": 0.1772717833518982, | |
| "step": 179, | |
| "step_time": 128.08412980427966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1835.0, | |
| "completions/mean_length": 792.875, | |
| "completions/mean_terminated_length": 685.7166748046875, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.688085749745369, | |
| "epoch": 0.4433497536945813, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0007279197620227103, | |
| "kl": 0.0020535228250082582, | |
| "learning_rate": 4.8588487150027514e-05, | |
| "loss": 0.0036929009947925806, | |
| "num_tokens": 27099212.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.2762732207775116, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.03492574973238839, | |
| "sampling/importance_sampling_ratio/max": 2.986546516418457, | |
| "sampling/importance_sampling_ratio/mean": 0.9620383977890015, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.313373565673828, | |
| "sampling/sampling_logp_difference/mean": 0.17329855263233185, | |
| "step": 180, | |
| "step_time": 117.17333939834498 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3657.0, | |
| "completions/max_terminated_length": 3657.0, | |
| "completions/mean_length": 810.515625, | |
| "completions/mean_terminated_length": 794.3809814453125, | |
| "completions/min_length": 177.0, | |
| "completions/min_terminated_length": 177.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7161764353513718, | |
| "epoch": 0.4458128078817734, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011804777204104219, | |
| "kl": 0.0012608054676093161, | |
| "learning_rate": 4.8572372889217776e-05, | |
| "loss": 0.04105086624622345, | |
| "num_tokens": 27219901.0, | |
| "reward": 1.21875, | |
| "reward_std": 0.6066758036613464, | |
| "rewards/reward_func/mean": 0.13541666666666666, | |
| "rewards/reward_func/std": 0.10416450061731869, | |
| "sampling/importance_sampling_ratio/max": 2.998037815093994, | |
| "sampling/importance_sampling_ratio/mean": 0.9585379362106323, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.048479080200195, | |
| "sampling/sampling_logp_difference/mean": 0.17843914031982422, | |
| "step": 181, | |
| "step_time": 104.87256655702367 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3713.0, | |
| "completions/max_terminated_length": 3713.0, | |
| "completions/mean_length": 837.078125, | |
| "completions/mean_terminated_length": 837.078125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7794601023197174, | |
| "epoch": 0.4482758620689655, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0038115502353250493, | |
| "kl": 0.0025799469731282443, | |
| "learning_rate": 4.855616987005926e-05, | |
| "loss": 0.002493778243660927, | |
| "num_tokens": 27355138.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.40272432565689087, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.059847907887564764, | |
| "sampling/importance_sampling_ratio/max": 2.9982709884643555, | |
| "sampling/importance_sampling_ratio/mean": 0.9533101320266724, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.990303039550781, | |
| "sampling/sampling_logp_difference/mean": 0.1960030198097229, | |
| "step": 182, | |
| "step_time": 122.55533957784064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3363.0, | |
| "completions/mean_length": 765.6875, | |
| "completions/mean_terminated_length": 712.825439453125, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 95.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6534744799137115, | |
| "epoch": 0.45073891625615764, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002555384720048418, | |
| "kl": 0.00150683059473522, | |
| "learning_rate": 4.853987815356211e-05, | |
| "loss": 0.015543782152235508, | |
| "num_tokens": 27483294.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.21347814798355103, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.03231415732039346, | |
| "sampling/importance_sampling_ratio/max": 2.996135950088501, | |
| "sampling/importance_sampling_ratio/mean": 0.9614719748497009, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.065330505371094, | |
| "sampling/sampling_logp_difference/mean": 0.1773977279663086, | |
| "step": 183, | |
| "step_time": 123.16616046987474 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2184.0, | |
| "completions/mean_length": 860.078125, | |
| "completions/mean_terminated_length": 704.550048828125, | |
| "completions/min_length": 138.0, | |
| "completions/min_terminated_length": 138.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6979289501905441, | |
| "epoch": 0.45320197044334976, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016726985274926945, | |
| "kl": 0.0025047478557098657, | |
| "learning_rate": 4.8523497801070394e-05, | |
| "loss": -0.020084768533706665, | |
| "num_tokens": 27626163.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.33994102478027344, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.049871087074279785, | |
| "sampling/importance_sampling_ratio/max": 2.9824655055999756, | |
| "sampling/importance_sampling_ratio/mean": 0.9549860954284668, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.605437278747559, | |
| "sampling/sampling_logp_difference/mean": 0.18550416827201843, | |
| "step": 184, | |
| "step_time": 176.04412785521708 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3536.0, | |
| "completions/mean_length": 1211.46875, | |
| "completions/mean_terminated_length": 1118.4193115234375, | |
| "completions/min_length": 272.0, | |
| "completions/min_terminated_length": 272.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6525581926107407, | |
| "epoch": 0.45566502463054187, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01064758291666748, | |
| "kl": 0.0012874963285867125, | |
| "learning_rate": 4.8507028874261965e-05, | |
| "loss": 0.012987809255719185, | |
| "num_tokens": 27781713.0, | |
| "reward": 1.16796875, | |
| "reward_std": 0.5960429906845093, | |
| "rewards/reward_func/mean": 0.12977430555555555, | |
| "rewards/reward_func/std": 0.10399173531267378, | |
| "sampling/importance_sampling_ratio/max": 2.9994893074035645, | |
| "sampling/importance_sampling_ratio/mean": 0.9531520009040833, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.717105865478516, | |
| "sampling/sampling_logp_difference/mean": 0.18427221477031708, | |
| "step": 185, | |
| "step_time": 123.03224639990367 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3674.0, | |
| "completions/mean_length": 973.265625, | |
| "completions/mean_terminated_length": 929.7257690429688, | |
| "completions/min_length": 146.0, | |
| "completions/min_terminated_length": 146.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7046706974506378, | |
| "epoch": 0.458128078817734, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00037532162180780795, | |
| "kl": 0.002055374119663611, | |
| "learning_rate": 4.8490471435148174e-05, | |
| "loss": 0.0002820357622113079, | |
| "num_tokens": 27919314.0, | |
| "reward": 1.13671875, | |
| "reward_std": 0.3419414758682251, | |
| "rewards/reward_func/mean": 0.12630208333333334, | |
| "rewards/reward_func/std": 0.04240360524919298, | |
| "sampling/importance_sampling_ratio/max": 2.9972267150878906, | |
| "sampling/importance_sampling_ratio/mean": 0.9548791646957397, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.075159072875977, | |
| "sampling/sampling_logp_difference/mean": 0.19040821492671967, | |
| "step": 186, | |
| "step_time": 131.1666200091131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3434.0, | |
| "completions/mean_length": 1098.90625, | |
| "completions/mean_terminated_length": 993.4500732421875, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6857569068670273, | |
| "epoch": 0.4605911330049261, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004633843502208573, | |
| "kl": 0.003258775017457083, | |
| "learning_rate": 4.8473825546073656e-05, | |
| "loss": -0.012959948740899563, | |
| "num_tokens": 28092956.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.4284060597419739, | |
| "rewards/reward_func/mean": 0.1076388888888889, | |
| "rewards/reward_func/std": 0.06393983297877842, | |
| "sampling/importance_sampling_ratio/max": 2.9971022605895996, | |
| "sampling/importance_sampling_ratio/mean": 0.9417064785957336, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.373357772827148, | |
| "sampling/sampling_logp_difference/mean": 0.2150428593158722, | |
| "step": 187, | |
| "step_time": 147.29947473318316 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2978.0, | |
| "completions/mean_length": 1049.4375, | |
| "completions/mean_terminated_length": 951.1612548828125, | |
| "completions/min_length": 105.0, | |
| "completions/min_terminated_length": 105.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6931847035884857, | |
| "epoch": 0.4630541871921182, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005104650264393571, | |
| "kl": 0.001937075809109956, | |
| "learning_rate": 4.845709126971609e-05, | |
| "loss": 0.052426815032958984, | |
| "num_tokens": 28238888.0, | |
| "reward": 1.12890625, | |
| "reward_std": 0.6681414842605591, | |
| "rewards/reward_func/mean": 0.1254340277777778, | |
| "rewards/reward_func/std": 0.09152780349055926, | |
| "sampling/importance_sampling_ratio/max": 2.999912738800049, | |
| "sampling/importance_sampling_ratio/mean": 0.9497792720794678, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.748735427856445, | |
| "sampling/sampling_logp_difference/mean": 0.19750040769577026, | |
| "step": 188, | |
| "step_time": 117.82483472116292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3593.0, | |
| "completions/mean_length": 1013.9375, | |
| "completions/mean_terminated_length": 863.1000366210938, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7308681607246399, | |
| "epoch": 0.46551724137931033, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0006965873116807276, | |
| "kl": 0.002661357371835038, | |
| "learning_rate": 4.844026866908595e-05, | |
| "loss": 0.0032661345321685076, | |
| "num_tokens": 28395908.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.2734251022338867, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.03352663583225674, | |
| "sampling/importance_sampling_ratio/max": 2.9960858821868896, | |
| "sampling/importance_sampling_ratio/mean": 0.9529147148132324, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.626492500305176, | |
| "sampling/sampling_logp_difference/mean": 0.19693899154663086, | |
| "step": 189, | |
| "step_time": 134.75431111990474 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2363.0, | |
| "completions/max_terminated_length": 2363.0, | |
| "completions/mean_length": 820.734375, | |
| "completions/mean_terminated_length": 796.3386840820312, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.720972329378128, | |
| "epoch": 0.46798029556650245, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0023207172647921556, | |
| "kl": 0.002978406089823693, | |
| "learning_rate": 4.8423357807526325e-05, | |
| "loss": -0.020184550434350967, | |
| "num_tokens": 28532787.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.31038472056388855, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.04780791699886322, | |
| "sampling/importance_sampling_ratio/max": 2.9988293647766113, | |
| "sampling/importance_sampling_ratio/mean": 0.957587480545044, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.802742958068848, | |
| "sampling/sampling_logp_difference/mean": 0.18637000024318695, | |
| "step": 190, | |
| "step_time": 77.44645439600572 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4033.0, | |
| "completions/mean_length": 1296.359375, | |
| "completions/mean_terminated_length": 1109.7166748046875, | |
| "completions/min_length": 176.0, | |
| "completions/min_terminated_length": 176.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6860456019639969, | |
| "epoch": 0.47044334975369456, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0019827677735223988, | |
| "kl": 0.0025322052533738315, | |
| "learning_rate": 4.840635874871259e-05, | |
| "loss": -0.015175355598330498, | |
| "num_tokens": 28700730.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.3818812966346741, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.05688919126987457, | |
| "sampling/importance_sampling_ratio/max": 2.9983668327331543, | |
| "sampling/importance_sampling_ratio/mean": 0.9488115906715393, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.628395080566406, | |
| "sampling/sampling_logp_difference/mean": 0.19605514407157898, | |
| "step": 191, | |
| "step_time": 153.48830994497985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2290.0, | |
| "completions/max_terminated_length": 2290.0, | |
| "completions/mean_length": 766.453125, | |
| "completions/mean_terminated_length": 766.453125, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7697373479604721, | |
| "epoch": 0.4729064039408867, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0014556993322577337, | |
| "kl": 0.002503739349776879, | |
| "learning_rate": 4.838927155665225e-05, | |
| "loss": -0.0015111538814380765, | |
| "num_tokens": 28825287.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.23779743909835815, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.034599056674374476, | |
| "sampling/importance_sampling_ratio/max": 2.9957094192504883, | |
| "sampling/importance_sampling_ratio/mean": 0.9514071345329285, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.246959686279297, | |
| "sampling/sampling_logp_difference/mean": 0.20502641797065735, | |
| "step": 192, | |
| "step_time": 74.68089395412244 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1873.0, | |
| "completions/mean_length": 547.671875, | |
| "completions/mean_terminated_length": 487.34423828125, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.77506323158741, | |
| "epoch": 0.4753694581280788, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003785555393182919, | |
| "kl": 0.006008016353007406, | |
| "learning_rate": 4.837209629568462e-05, | |
| "loss": 0.001519299577921629, | |
| "num_tokens": 28937410.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.3637320101261139, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.052706441945499845, | |
| "sampling/importance_sampling_ratio/max": 2.9997804164886475, | |
| "sampling/importance_sampling_ratio/mean": 0.956741213798523, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.686527252197266, | |
| "sampling/sampling_logp_difference/mean": 0.19631287455558777, | |
| "step": 193, | |
| "step_time": 138.0256498081144 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3606.0, | |
| "completions/mean_length": 976.28125, | |
| "completions/mean_terminated_length": 875.6451416015625, | |
| "completions/min_length": 152.0, | |
| "completions/min_terminated_length": 152.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8006569296121597, | |
| "epoch": 0.47783251231527096, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0010860159991696286, | |
| "kl": 0.0022052104177419096, | |
| "learning_rate": 4.8354833030480674e-05, | |
| "loss": -0.01513269916176796, | |
| "num_tokens": 29080052.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.31140682101249695, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.04530912637710571, | |
| "sampling/importance_sampling_ratio/max": 2.997826337814331, | |
| "sampling/importance_sampling_ratio/mean": 0.9526568651199341, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.61501693725586, | |
| "sampling/sampling_logp_difference/mean": 0.20253780484199524, | |
| "step": 194, | |
| "step_time": 141.632570264861 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1685.0, | |
| "completions/mean_length": 889.46875, | |
| "completions/mean_terminated_length": 786.0322265625, | |
| "completions/min_length": 199.0, | |
| "completions/min_terminated_length": 199.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6076531410217285, | |
| "epoch": 0.4802955665024631, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006373856461449096, | |
| "kl": 0.0018063990282826126, | |
| "learning_rate": 4.833748182604273e-05, | |
| "loss": -0.0016068057157099247, | |
| "num_tokens": 29222882.0, | |
| "reward": 1.13671875, | |
| "reward_std": 0.4691466689109802, | |
| "rewards/reward_func/mean": 0.12630208333333334, | |
| "rewards/reward_func/std": 0.07778164744377136, | |
| "sampling/importance_sampling_ratio/max": 2.9992918968200684, | |
| "sampling/importance_sampling_ratio/mean": 0.9589847326278687, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.804981231689453, | |
| "sampling/sampling_logp_difference/mean": 0.17736108601093292, | |
| "step": 195, | |
| "step_time": 119.6795916960109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3320.0, | |
| "completions/mean_length": 1108.359375, | |
| "completions/mean_terminated_length": 961.4261474609375, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7933976054191589, | |
| "epoch": 0.4827586206896552, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014627021708944242, | |
| "kl": 0.002043090295046568, | |
| "learning_rate": 4.832004274770422e-05, | |
| "loss": -0.04575319588184357, | |
| "num_tokens": 29384265.0, | |
| "reward": 1.484375, | |
| "reward_std": 1.0136713981628418, | |
| "rewards/reward_func/mean": 0.16493055555555555, | |
| "rewards/reward_func/std": 0.17790989412201774, | |
| "sampling/importance_sampling_ratio/max": 2.998352527618408, | |
| "sampling/importance_sampling_ratio/mean": 0.9412397742271423, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.949785232543945, | |
| "sampling/sampling_logp_difference/mean": 0.22926440834999084, | |
| "step": 196, | |
| "step_time": 129.6598045709543 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2945.0, | |
| "completions/max_terminated_length": 2945.0, | |
| "completions/mean_length": 1006.734375, | |
| "completions/mean_terminated_length": 1006.734375, | |
| "completions/min_length": 84.0, | |
| "completions/min_terminated_length": 84.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.732940286397934, | |
| "epoch": 0.4852216748768473, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004308614389485858, | |
| "kl": 0.001794125506421551, | |
| "learning_rate": 4.8302515861129474e-05, | |
| "loss": 0.03310343995690346, | |
| "num_tokens": 29536408.0, | |
| "reward": 1.33984375, | |
| "reward_std": 0.9315535426139832, | |
| "rewards/reward_func/mean": 0.1488715277777778, | |
| "rewards/reward_func/std": 0.1277098986837599, | |
| "sampling/importance_sampling_ratio/max": 2.995858669281006, | |
| "sampling/importance_sampling_ratio/mean": 0.9469768404960632, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.225045204162598, | |
| "sampling/sampling_logp_difference/mean": 0.20466913282871246, | |
| "step": 197, | |
| "step_time": 101.47293852618895 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3559.0, | |
| "completions/mean_length": 1037.3125, | |
| "completions/mean_terminated_length": 938.6451416015625, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7546299546957016, | |
| "epoch": 0.4876847290640394, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0023699039274578254, | |
| "kl": 0.0021884184097871184, | |
| "learning_rate": 4.828490123231342e-05, | |
| "loss": 0.009993281215429306, | |
| "num_tokens": 29680908.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.33107027411460876, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.047586959269311696, | |
| "sampling/importance_sampling_ratio/max": 2.991670846939087, | |
| "sampling/importance_sampling_ratio/mean": 0.9526986479759216, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.980742454528809, | |
| "sampling/sampling_logp_difference/mean": 0.20294223725795746, | |
| "step": 198, | |
| "step_time": 187.45196510385722 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3956.0, | |
| "completions/mean_length": 1104.125, | |
| "completions/mean_terminated_length": 904.6666870117188, | |
| "completions/min_length": 125.0, | |
| "completions/min_terminated_length": 125.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7587642967700958, | |
| "epoch": 0.49014778325123154, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0005524746547252956, | |
| "kl": 0.0027338668005540967, | |
| "learning_rate": 4.8267198927581415e-05, | |
| "loss": -0.0013192156329751015, | |
| "num_tokens": 29842468.0, | |
| "reward": 1.171875, | |
| "reward_std": 0.39559829235076904, | |
| "rewards/reward_func/mean": 0.13020833333333334, | |
| "rewards/reward_func/std": 0.05048796162009239, | |
| "sampling/importance_sampling_ratio/max": 2.996952533721924, | |
| "sampling/importance_sampling_ratio/mean": 0.950311541557312, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.986047744750977, | |
| "sampling/sampling_logp_difference/mean": 0.20628628134727478, | |
| "step": 199, | |
| "step_time": 132.54166667349637 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4023.0, | |
| "completions/mean_length": 965.59375, | |
| "completions/mean_terminated_length": 915.90478515625, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.686070442199707, | |
| "epoch": 0.49261083743842365, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.007126499731189531, | |
| "kl": 0.001430476550012827, | |
| "learning_rate": 4.824940901358889e-05, | |
| "loss": 0.036235012114048004, | |
| "num_tokens": 29997386.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.663765549659729, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.10387398964828914, | |
| "sampling/importance_sampling_ratio/max": 2.998133659362793, | |
| "sampling/importance_sampling_ratio/mean": 0.9536129832267761, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.036049842834473, | |
| "sampling/sampling_logp_difference/mean": 0.19290143251419067, | |
| "step": 200, | |
| "step_time": 177.09617541497573 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3732.0, | |
| "completions/mean_length": 891.75, | |
| "completions/mean_terminated_length": 734.1638793945312, | |
| "completions/min_length": 108.0, | |
| "completions/min_terminated_length": 108.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7913601100444794, | |
| "epoch": 0.49507389162561577, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006751393097530581, | |
| "kl": 0.0017078560777008533, | |
| "learning_rate": 4.82315315573212e-05, | |
| "loss": 0.0010975832119584084, | |
| "num_tokens": 30134826.0, | |
| "reward": 1.1953125, | |
| "reward_std": 0.6921738982200623, | |
| "rewards/reward_func/mean": 0.1328125, | |
| "rewards/reward_func/std": 0.11135281870762508, | |
| "sampling/importance_sampling_ratio/max": 2.9937379360198975, | |
| "sampling/importance_sampling_ratio/mean": 0.9510517120361328, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.499581336975098, | |
| "sampling/sampling_logp_difference/mean": 0.2039942890405655, | |
| "step": 201, | |
| "step_time": 125.6223954288289 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3002.0, | |
| "completions/mean_length": 916.328125, | |
| "completions/mean_terminated_length": 864.5806274414062, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 95.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8502542078495026, | |
| "epoch": 0.4975369458128079, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010512409373633538, | |
| "kl": 0.0041085208067670465, | |
| "learning_rate": 4.8213566626093316e-05, | |
| "loss": 0.04938031733036041, | |
| "num_tokens": 30283039.0, | |
| "reward": 1.19140625, | |
| "reward_std": 0.872578501701355, | |
| "rewards/reward_func/mean": 0.1323784722222222, | |
| "rewards/reward_func/std": 0.1458116587665346, | |
| "sampling/importance_sampling_ratio/max": 2.9969401359558105, | |
| "sampling/importance_sampling_ratio/mean": 0.9425897598266602, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.473159790039062, | |
| "sampling/sampling_logp_difference/mean": 0.22600840032100677, | |
| "step": 202, | |
| "step_time": 131.78623059717938 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3201.0, | |
| "completions/mean_length": 1062.703125, | |
| "completions/mean_terminated_length": 978.88134765625, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 157.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6913492828607559, | |
| "epoch": 0.5, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0015925848185912536, | |
| "kl": 0.001410837227012962, | |
| "learning_rate": 4.819551428754957e-05, | |
| "loss": -0.047750379890203476, | |
| "num_tokens": 30434060.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.34646743535995483, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.0521190000904931, | |
| "sampling/importance_sampling_ratio/max": 2.9992289543151855, | |
| "sampling/importance_sampling_ratio/mean": 0.9494022727012634, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.937480926513672, | |
| "sampling/sampling_logp_difference/mean": 0.19053122401237488, | |
| "step": 203, | |
| "step_time": 122.92247278685682 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3885.0, | |
| "completions/mean_length": 1074.046875, | |
| "completions/mean_terminated_length": 976.5645141601562, | |
| "completions/min_length": 213.0, | |
| "completions/min_terminated_length": 213.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.654021367430687, | |
| "epoch": 0.5024630541871922, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0051969291675550665, | |
| "kl": 0.002041646250290796, | |
| "learning_rate": 4.8177374609663415e-05, | |
| "loss": 0.015631053596735, | |
| "num_tokens": 30597503.0, | |
| "reward": 1.1875, | |
| "reward_std": 0.634647786617279, | |
| "rewards/reward_func/mean": 0.13194444444444445, | |
| "rewards/reward_func/std": 0.11371641523308224, | |
| "sampling/importance_sampling_ratio/max": 2.996506690979004, | |
| "sampling/importance_sampling_ratio/mean": 0.9503429532051086, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.598240852355957, | |
| "sampling/sampling_logp_difference/mean": 0.1922573447227478, | |
| "step": 204, | |
| "step_time": 163.13331845588982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3323.0, | |
| "completions/mean_length": 1194.078125, | |
| "completions/mean_terminated_length": 1100.4676513671875, | |
| "completions/min_length": 231.0, | |
| "completions/min_terminated_length": 231.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6860644668340683, | |
| "epoch": 0.5049261083743842, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0017076158671597208, | |
| "kl": 0.004838668974116445, | |
| "learning_rate": 4.815914766073719e-05, | |
| "loss": 0.00643888721242547, | |
| "num_tokens": 30764340.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.3965180516242981, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.047183099720213145, | |
| "sampling/importance_sampling_ratio/max": 2.9985768795013428, | |
| "sampling/importance_sampling_ratio/mean": 0.9507815837860107, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.72639274597168, | |
| "sampling/sampling_logp_difference/mean": 0.19618508219718933, | |
| "step": 205, | |
| "step_time": 133.87398360297084 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3245.0, | |
| "completions/mean_length": 952.671875, | |
| "completions/mean_terminated_length": 855.6720581054688, | |
| "completions/min_length": 65.0, | |
| "completions/min_terminated_length": 65.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7687894254922867, | |
| "epoch": 0.5073891625615764, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002176552438569466, | |
| "kl": 0.0018984676571562886, | |
| "learning_rate": 4.8140833509401815e-05, | |
| "loss": 0.006664093118160963, | |
| "num_tokens": 30918895.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3119787275791168, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.043059426049391426, | |
| "sampling/importance_sampling_ratio/max": 2.9998011589050293, | |
| "sampling/importance_sampling_ratio/mean": 0.9475848078727722, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.37496280670166, | |
| "sampling/sampling_logp_difference/mean": 0.22000627219676971, | |
| "step": 206, | |
| "step_time": 122.86152748181485 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3039.0, | |
| "completions/mean_length": 886.515625, | |
| "completions/mean_terminated_length": 828.9677124023438, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6943669766187668, | |
| "epoch": 0.5098522167487685, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004818923673628531, | |
| "kl": 0.0026133455976378173, | |
| "learning_rate": 4.812243222461658e-05, | |
| "loss": 0.020810682326555252, | |
| "num_tokens": 31068336.0, | |
| "reward": 1.41796875, | |
| "reward_std": 0.7546873092651367, | |
| "rewards/reward_func/mean": 0.15755208333333334, | |
| "rewards/reward_func/std": 0.1127622624238332, | |
| "sampling/importance_sampling_ratio/max": 2.9922666549682617, | |
| "sampling/importance_sampling_ratio/mean": 0.9491963386535645, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.743640899658203, | |
| "sampling/sampling_logp_difference/mean": 0.19928023219108582, | |
| "step": 207, | |
| "step_time": 126.38416155404411 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3327.0, | |
| "completions/max_terminated_length": 3327.0, | |
| "completions/mean_length": 980.890625, | |
| "completions/mean_terminated_length": 975.0806274414062, | |
| "completions/min_length": 288.0, | |
| "completions/min_terminated_length": 288.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7900367081165314, | |
| "epoch": 0.5123152709359606, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.007988081424164765, | |
| "kl": 0.0018078716529998928, | |
| "learning_rate": 4.8103943875668844e-05, | |
| "loss": 0.017407327890396118, | |
| "num_tokens": 31224729.0, | |
| "reward": 1.24609375, | |
| "reward_std": 0.613169252872467, | |
| "rewards/reward_func/mean": 0.1384548611111111, | |
| "rewards/reward_func/std": 0.1027386552757687, | |
| "sampling/importance_sampling_ratio/max": 2.998058795928955, | |
| "sampling/importance_sampling_ratio/mean": 0.9472312927246094, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.543306350708008, | |
| "sampling/sampling_logp_difference/mean": 0.21559563279151917, | |
| "step": 208, | |
| "step_time": 107.44903364474885 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3869.0, | |
| "completions/mean_length": 965.921875, | |
| "completions/mean_terminated_length": 914.9515991210938, | |
| "completions/min_length": 144.0, | |
| "completions/min_terminated_length": 144.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7459463179111481, | |
| "epoch": 0.5147783251231527, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.010340537458138942, | |
| "kl": 0.0023739843745715916, | |
| "learning_rate": 4.8085368532173804e-05, | |
| "loss": -0.028774894773960114, | |
| "num_tokens": 31359492.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.5998796224594116, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.10784303976429833, | |
| "sampling/importance_sampling_ratio/max": 2.9987199306488037, | |
| "sampling/importance_sampling_ratio/mean": 0.9521645903587341, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.040218353271484, | |
| "sampling/sampling_logp_difference/mean": 0.20044687390327454, | |
| "step": 209, | |
| "step_time": 184.2058972257655 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3810.0, | |
| "completions/mean_length": 1076.546875, | |
| "completions/mean_terminated_length": 1007.9000244140625, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 75.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7519822269678116, | |
| "epoch": 0.5172413793103449, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008282575286698903, | |
| "kl": 0.0027343417750671506, | |
| "learning_rate": 4.806670626407422e-05, | |
| "loss": -0.029934097081422806, | |
| "num_tokens": 31517863.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.6612386107444763, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.13787324395444658, | |
| "sampling/importance_sampling_ratio/max": 2.9987173080444336, | |
| "sampling/importance_sampling_ratio/mean": 0.9486758708953857, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.930124282836914, | |
| "sampling/sampling_logp_difference/mean": 0.20651331543922424, | |
| "step": 210, | |
| "step_time": 130.95431489613838 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3584.0, | |
| "completions/max_terminated_length": 3584.0, | |
| "completions/mean_length": 991.34375, | |
| "completions/mean_terminated_length": 991.34375, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7369837611913681, | |
| "epoch": 0.5197044334975369, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005166807938091367, | |
| "kl": 0.003004661004524678, | |
| "learning_rate": 4.804795714164015e-05, | |
| "loss": 0.04796065762639046, | |
| "num_tokens": 31662573.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.7046032547950745, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.10800171229574415, | |
| "sampling/importance_sampling_ratio/max": 2.9918057918548584, | |
| "sampling/importance_sampling_ratio/mean": 0.9489620327949524, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.318878173828125, | |
| "sampling/sampling_logp_difference/mean": 0.19743148982524872, | |
| "step": 211, | |
| "step_time": 103.4146853510756 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1794.0, | |
| "completions/mean_length": 828.921875, | |
| "completions/mean_terminated_length": 769.5645141601562, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7094843238592148, | |
| "epoch": 0.5221674876847291, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.010080593679367157, | |
| "kl": 0.0021559473825618625, | |
| "learning_rate": 4.8029121235468696e-05, | |
| "loss": 0.07708187401294708, | |
| "num_tokens": 31799144.0, | |
| "reward": 1.32421875, | |
| "reward_std": 0.9622638821601868, | |
| "rewards/reward_func/mean": 0.14713541666666666, | |
| "rewards/reward_func/std": 0.13708895444869995, | |
| "sampling/importance_sampling_ratio/max": 2.9905505180358887, | |
| "sampling/importance_sampling_ratio/mean": 0.9552954435348511, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.449729919433594, | |
| "sampling/sampling_logp_difference/mean": 0.1984173059463501, | |
| "step": 212, | |
| "step_time": 126.7395717408508 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3203.0, | |
| "completions/mean_length": 1188.765625, | |
| "completions/mean_terminated_length": 1112.57373046875, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8555341958999634, | |
| "epoch": 0.5246305418719212, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007695774946068362, | |
| "kl": 0.00277452525915578, | |
| "learning_rate": 4.8010198616483736e-05, | |
| "loss": 0.0049737924709916115, | |
| "num_tokens": 31960265.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.7067449688911438, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.11894809040758345, | |
| "sampling/importance_sampling_ratio/max": 2.9988372325897217, | |
| "sampling/importance_sampling_ratio/mean": 0.9447873830795288, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.881756782531738, | |
| "sampling/sampling_logp_difference/mean": 0.22210858762264252, | |
| "step": 213, | |
| "step_time": 136.29223776236176 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3721.0, | |
| "completions/mean_length": 1243.328125, | |
| "completions/mean_terminated_length": 1053.1500244140625, | |
| "completions/min_length": 306.0, | |
| "completions/min_terminated_length": 306.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7398181259632111, | |
| "epoch": 0.5270935960591133, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006997510070903853, | |
| "kl": 0.0023148250475060195, | |
| "learning_rate": 4.799118935593563e-05, | |
| "loss": 0.01846671663224697, | |
| "num_tokens": 32123518.0, | |
| "reward": 1.13671875, | |
| "reward_std": 0.6885251402854919, | |
| "rewards/reward_func/mean": 0.12630208333333334, | |
| "rewards/reward_func/std": 0.1024610847234726, | |
| "sampling/importance_sampling_ratio/max": 2.9988057613372803, | |
| "sampling/importance_sampling_ratio/mean": 0.9522387385368347, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.218445777893066, | |
| "sampling/sampling_logp_difference/mean": 0.20401646196842194, | |
| "step": 214, | |
| "step_time": 133.83199871121906 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3474.0, | |
| "completions/mean_length": 984.734375, | |
| "completions/mean_terminated_length": 884.3709716796875, | |
| "completions/min_length": 70.0, | |
| "completions/min_terminated_length": 70.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7282632738351822, | |
| "epoch": 0.5295566502463054, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006373666257538889, | |
| "kl": 0.0025452679255977273, | |
| "learning_rate": 4.797209352540101e-05, | |
| "loss": 0.06603739410638809, | |
| "num_tokens": 32279373.0, | |
| "reward": 1.22265625, | |
| "reward_std": 0.9429323673248291, | |
| "rewards/reward_func/mean": 0.13585069444444445, | |
| "rewards/reward_func/std": 0.14530256390571594, | |
| "sampling/importance_sampling_ratio/max": 2.9987716674804688, | |
| "sampling/importance_sampling_ratio/mean": 0.9473279118537903, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.93663501739502, | |
| "sampling/sampling_logp_difference/mean": 0.20334208011627197, | |
| "step": 215, | |
| "step_time": 171.04516539885662 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3199.0, | |
| "completions/mean_length": 960.265625, | |
| "completions/mean_terminated_length": 872.3933715820312, | |
| "completions/min_length": 191.0, | |
| "completions/min_terminated_length": 191.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6828020513057709, | |
| "epoch": 0.5320197044334976, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00906798233704518, | |
| "kl": 0.002584115689387545, | |
| "learning_rate": 4.7952911196782426e-05, | |
| "loss": 0.053639333695173264, | |
| "num_tokens": 32419022.0, | |
| "reward": 1.2265625, | |
| "reward_std": 0.8860904574394226, | |
| "rewards/reward_func/mean": 0.1362847222222222, | |
| "rewards/reward_func/std": 0.14273416499296823, | |
| "sampling/importance_sampling_ratio/max": 2.9965453147888184, | |
| "sampling/importance_sampling_ratio/mean": 0.9599682092666626, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.712519645690918, | |
| "sampling/sampling_logp_difference/mean": 0.17587494850158691, | |
| "step": 216, | |
| "step_time": 117.92797983251512 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3881.0, | |
| "completions/max_terminated_length": 3881.0, | |
| "completions/mean_length": 994.046875, | |
| "completions/mean_terminated_length": 988.5806274414062, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8513800352811813, | |
| "epoch": 0.5344827586206896, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017120938841582653, | |
| "kl": 0.004447525599971414, | |
| "learning_rate": 4.793364244230818e-05, | |
| "loss": 0.09198958426713943, | |
| "num_tokens": 32584177.0, | |
| "reward": 1.375, | |
| "reward_std": 1.273976445198059, | |
| "rewards/reward_func/mean": 0.1527777777777778, | |
| "rewards/reward_func/std": 0.18192580342292786, | |
| "sampling/importance_sampling_ratio/max": 2.997528314590454, | |
| "sampling/importance_sampling_ratio/mean": 0.9468756914138794, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.249998092651367, | |
| "sampling/sampling_logp_difference/mean": 0.2235032320022583, | |
| "step": 217, | |
| "step_time": 127.43058389122598 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3963.0, | |
| "completions/max_terminated_length": 3963.0, | |
| "completions/mean_length": 814.8125, | |
| "completions/mean_terminated_length": 818.3333740234375, | |
| "completions/min_length": 146.0, | |
| "completions/min_terminated_length": 146.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7400757819414139, | |
| "epoch": 0.5369458128078818, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02154058014893653, | |
| "kl": 0.004508270707447082, | |
| "learning_rate": 4.791428733453195e-05, | |
| "loss": -0.06291753053665161, | |
| "num_tokens": 32717189.0, | |
| "reward": 1.4296875, | |
| "reward_std": 1.1822670698165894, | |
| "rewards/reward_func/mean": 0.15885416666666666, | |
| "rewards/reward_func/std": 0.1904856049352222, | |
| "sampling/importance_sampling_ratio/max": 2.9951188564300537, | |
| "sampling/importance_sampling_ratio/mean": 0.9537706971168518, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.874495506286621, | |
| "sampling/sampling_logp_difference/mean": 0.18929260969161987, | |
| "step": 218, | |
| "step_time": 130.0812120535411 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2717.0, | |
| "completions/mean_length": 1357.421875, | |
| "completions/mean_terminated_length": 1252.7540283203125, | |
| "completions/min_length": 229.0, | |
| "completions/min_terminated_length": 229.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7078727632761002, | |
| "epoch": 0.5394088669950738, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015914954426622312, | |
| "kl": 0.003292074310593307, | |
| "learning_rate": 4.78948459463326e-05, | |
| "loss": 0.054672300815582275, | |
| "num_tokens": 32895600.0, | |
| "reward": 1.6640625, | |
| "reward_std": 1.4741278886795044, | |
| "rewards/reward_func/mean": 0.18489583333333334, | |
| "rewards/reward_func/std": 0.2328646464480294, | |
| "sampling/importance_sampling_ratio/max": 2.997037410736084, | |
| "sampling/importance_sampling_ratio/mean": 0.943734347820282, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.8037109375, | |
| "sampling/sampling_logp_difference/mean": 0.20783142745494843, | |
| "step": 219, | |
| "step_time": 193.00463135214522 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2831.0, | |
| "completions/mean_length": 1199.609375, | |
| "completions/mean_terminated_length": 1019.6101684570312, | |
| "completions/min_length": 188.0, | |
| "completions/min_terminated_length": 188.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7149243652820587, | |
| "epoch": 0.541871921182266, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014175892165371774, | |
| "kl": 0.005893846391700208, | |
| "learning_rate": 4.7875318350913846e-05, | |
| "loss": -0.022984549403190613, | |
| "num_tokens": 33065783.0, | |
| "reward": 1.546875, | |
| "reward_std": 1.3008506298065186, | |
| "rewards/reward_func/mean": 0.171875, | |
| "rewards/reward_func/std": 0.198716941393084, | |
| "sampling/importance_sampling_ratio/max": 2.997241735458374, | |
| "sampling/importance_sampling_ratio/mean": 0.9502947330474854, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.1736421585083, | |
| "sampling/sampling_logp_difference/mean": 0.19458694756031036, | |
| "step": 220, | |
| "step_time": 184.47228501015343 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2686.0, | |
| "completions/mean_length": 1140.203125, | |
| "completions/mean_terminated_length": 994.8359985351562, | |
| "completions/min_length": 112.0, | |
| "completions/min_terminated_length": 112.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.708164781332016, | |
| "epoch": 0.5443349753694581, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.01514246486059271, | |
| "kl": 0.005792527925223112, | |
| "learning_rate": 4.785570462180402e-05, | |
| "loss": 0.04811932519078255, | |
| "num_tokens": 33233828.0, | |
| "reward": 1.4375, | |
| "reward_std": 1.2328184843063354, | |
| "rewards/reward_func/mean": 0.1597222222222222, | |
| "rewards/reward_func/std": 0.1681312604082955, | |
| "sampling/importance_sampling_ratio/max": 2.9987242221832275, | |
| "sampling/importance_sampling_ratio/mean": 0.9442251920700073, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.4033203125, | |
| "sampling/sampling_logp_difference/mean": 0.21833211183547974, | |
| "step": 221, | |
| "step_time": 153.67408644291572 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3886.0, | |
| "completions/mean_length": 1162.671875, | |
| "completions/mean_terminated_length": 1068.04833984375, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7060115039348602, | |
| "epoch": 0.5467980295566502, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018054004424587895, | |
| "kl": 0.00600872898939997, | |
| "learning_rate": 4.7836004832855776e-05, | |
| "loss": 0.15034234523773193, | |
| "num_tokens": 33395023.0, | |
| "reward": 1.703125, | |
| "reward_std": 1.6084320545196533, | |
| "rewards/reward_func/mean": 0.1892361111111111, | |
| "rewards/reward_func/std": 0.22175164024035135, | |
| "sampling/importance_sampling_ratio/max": 2.999399185180664, | |
| "sampling/importance_sampling_ratio/mean": 0.9499921798706055, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.749910354614258, | |
| "sampling/sampling_logp_difference/mean": 0.19778406620025635, | |
| "step": 222, | |
| "step_time": 133.30245491000824 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3866.0, | |
| "completions/mean_length": 1109.5, | |
| "completions/mean_terminated_length": 949.2542114257812, | |
| "completions/min_length": 262.0, | |
| "completions/min_terminated_length": 262.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7275642454624176, | |
| "epoch": 0.5492610837438424, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.021749090980709224, | |
| "kl": 0.012067305855453014, | |
| "learning_rate": 4.781621905824579e-05, | |
| "loss": 0.06272266805171967, | |
| "num_tokens": 33556591.0, | |
| "reward": 1.71875, | |
| "reward_std": 1.5042927265167236, | |
| "rewards/reward_func/mean": 0.1909722222222222, | |
| "rewards/reward_func/std": 0.22010938243733513, | |
| "sampling/importance_sampling_ratio/max": 2.992997169494629, | |
| "sampling/importance_sampling_ratio/mean": 0.9458306431770325, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.075740814208984, | |
| "sampling/sampling_logp_difference/mean": 0.20269128680229187, | |
| "step": 223, | |
| "step_time": 133.02171413018368 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3887.0, | |
| "completions/mean_length": 1109.21875, | |
| "completions/mean_terminated_length": 972.7368774414062, | |
| "completions/min_length": 141.0, | |
| "completions/min_terminated_length": 141.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7367782890796661, | |
| "epoch": 0.5517241379310345, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02082368102789195, | |
| "kl": 0.01568816788494587, | |
| "learning_rate": 4.779634737247455e-05, | |
| "loss": 0.18037329614162445, | |
| "num_tokens": 33709229.0, | |
| "reward": 1.96484375, | |
| "reward_std": 1.7749732732772827, | |
| "rewards/reward_func/mean": 0.2183159722222222, | |
| "rewards/reward_func/std": 0.26773934563000995, | |
| "sampling/importance_sampling_ratio/max": 2.999913454055786, | |
| "sampling/importance_sampling_ratio/mean": 0.9544388651847839, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.428032875061035, | |
| "sampling/sampling_logp_difference/mean": 0.18823330104351044, | |
| "step": 224, | |
| "step_time": 120.03369585401379 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2172.0, | |
| "completions/mean_length": 880.921875, | |
| "completions/mean_terminated_length": 819.0322265625, | |
| "completions/min_length": 97.0, | |
| "completions/min_terminated_length": 97.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6620640158653259, | |
| "epoch": 0.5541871921182266, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0348405270348598, | |
| "kl": 0.021940368227660656, | |
| "learning_rate": 4.777638985036599e-05, | |
| "loss": 0.03423825651407242, | |
| "num_tokens": 33854472.0, | |
| "reward": 2.578125, | |
| "reward_std": 1.9969595670700073, | |
| "rewards/reward_func/mean": 0.2864583333333333, | |
| "rewards/reward_func/std": 0.2823880405889617, | |
| "sampling/importance_sampling_ratio/max": 2.9987571239471436, | |
| "sampling/importance_sampling_ratio/mean": 0.9559164643287659, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.310111045837402, | |
| "sampling/sampling_logp_difference/mean": 0.17907507717609406, | |
| "step": 225, | |
| "step_time": 127.27977883489802 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3468.0, | |
| "completions/mean_length": 1223.015625, | |
| "completions/mean_terminated_length": 1161.61669921875, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6341233402490616, | |
| "epoch": 0.5566502463054187, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030284450173885293, | |
| "kl": 0.0230104043148458, | |
| "learning_rate": 4.7756346567067255e-05, | |
| "loss": -0.21584941446781158, | |
| "num_tokens": 34022121.0, | |
| "reward": 3.34375, | |
| "reward_std": 2.269754648208618, | |
| "rewards/reward_func/mean": 0.3715277777777778, | |
| "rewards/reward_func/std": 0.3541766901810964, | |
| "sampling/importance_sampling_ratio/max": 2.9997711181640625, | |
| "sampling/importance_sampling_ratio/mean": 0.9506696462631226, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.499972343444824, | |
| "sampling/sampling_logp_difference/mean": 0.18650886416435242, | |
| "step": 226, | |
| "step_time": 124.42867265990935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3246.0, | |
| "completions/mean_length": 1110.9375, | |
| "completions/mean_terminated_length": 1049.2950439453125, | |
| "completions/min_length": 371.0, | |
| "completions/min_terminated_length": 371.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5698187798261642, | |
| "epoch": 0.5591133004926109, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03229629907628574, | |
| "kl": 0.015496046748012304, | |
| "learning_rate": 4.773621759804844e-05, | |
| "loss": -0.08266813308000565, | |
| "num_tokens": 34176565.0, | |
| "reward": 2.4140625, | |
| "reward_std": 1.8820469379425049, | |
| "rewards/reward_func/mean": 0.2682291666666667, | |
| "rewards/reward_func/std": 0.25015421791209114, | |
| "sampling/importance_sampling_ratio/max": 2.9965968132019043, | |
| "sampling/importance_sampling_ratio/mean": 0.9610643982887268, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.3748140335083, | |
| "sampling/sampling_logp_difference/mean": 0.16262675821781158, | |
| "step": 227, | |
| "step_time": 112.16787964990363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4086.0, | |
| "completions/max_terminated_length": 4086.0, | |
| "completions/mean_length": 1114.328125, | |
| "completions/mean_terminated_length": 1115.245849609375, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6151544153690338, | |
| "epoch": 0.5615763546798029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.041034398035143124, | |
| "kl": 0.031581724528223276, | |
| "learning_rate": 4.771600301910224e-05, | |
| "loss": 0.06149850785732269, | |
| "num_tokens": 34339738.0, | |
| "reward": 3.2578125, | |
| "reward_std": 2.1524112224578857, | |
| "rewards/reward_func/mean": 0.3619791666666667, | |
| "rewards/reward_func/std": 0.3190525414215194, | |
| "sampling/importance_sampling_ratio/max": 2.9979159832000732, | |
| "sampling/importance_sampling_ratio/mean": 0.9513490796089172, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.436819076538086, | |
| "sampling/sampling_logp_difference/mean": 0.18058064579963684, | |
| "step": 228, | |
| "step_time": 127.38730531884357 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3669.0, | |
| "completions/mean_length": 1181.671875, | |
| "completions/mean_terminated_length": 965.132080078125, | |
| "completions/min_length": 201.0, | |
| "completions/min_terminated_length": 201.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6493166834115982, | |
| "epoch": 0.5640394088669951, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02913287172531979, | |
| "kl": 0.02364873979240656, | |
| "learning_rate": 4.769570290634373e-05, | |
| "loss": -0.0989631861448288, | |
| "num_tokens": 34506581.0, | |
| "reward": 3.60546875, | |
| "reward_std": 2.1647448539733887, | |
| "rewards/reward_func/mean": 0.4006076388888889, | |
| "rewards/reward_func/std": 0.3339156011740367, | |
| "sampling/importance_sampling_ratio/max": 2.9985804557800293, | |
| "sampling/importance_sampling_ratio/mean": 0.9563257694244385, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.810356140136719, | |
| "sampling/sampling_logp_difference/mean": 0.17818036675453186, | |
| "step": 229, | |
| "step_time": 133.75939935678616 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2632.0, | |
| "completions/mean_length": 767.6875, | |
| "completions/mean_terminated_length": 718.5, | |
| "completions/min_length": 176.0, | |
| "completions/min_terminated_length": 176.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5988651365041733, | |
| "epoch": 0.5665024630541872, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.05832143190503159, | |
| "kl": 0.0540700601413846, | |
| "learning_rate": 4.767531733621004e-05, | |
| "loss": -0.026409372687339783, | |
| "num_tokens": 34635841.0, | |
| "reward": 4.26171875, | |
| "reward_std": 1.9197334051132202, | |
| "rewards/reward_func/mean": 0.4735243055555556, | |
| "rewards/reward_func/std": 0.30134472085369957, | |
| "sampling/importance_sampling_ratio/max": 2.9955201148986816, | |
| "sampling/importance_sampling_ratio/mean": 0.9635403752326965, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.575170516967773, | |
| "sampling/sampling_logp_difference/mean": 0.16176164150238037, | |
| "step": 230, | |
| "step_time": 118.32405733293854 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2725.0, | |
| "completions/mean_length": 1067.234375, | |
| "completions/mean_terminated_length": 954.6610107421875, | |
| "completions/min_length": 142.0, | |
| "completions/min_terminated_length": 142.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5734339356422424, | |
| "epoch": 0.5689655172413793, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03233225495323699, | |
| "kl": 0.024783702800050378, | |
| "learning_rate": 4.765484638546005e-05, | |
| "loss": 0.08640626072883606, | |
| "num_tokens": 34781952.0, | |
| "reward": 3.7734375, | |
| "reward_std": 2.134708881378174, | |
| "rewards/reward_func/mean": 0.4192708333333333, | |
| "rewards/reward_func/std": 0.32498767226934433, | |
| "sampling/importance_sampling_ratio/max": 2.9994750022888184, | |
| "sampling/importance_sampling_ratio/mean": 0.9611262679100037, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.97213363647461, | |
| "sampling/sampling_logp_difference/mean": 0.16367268562316895, | |
| "step": 231, | |
| "step_time": 114.58008486474864 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3762.0, | |
| "completions/mean_length": 1316.703125, | |
| "completions/mean_terminated_length": 995.4509887695312, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 277.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6045290231704712, | |
| "epoch": 0.5714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025361740347373922, | |
| "kl": 0.025120028294622898, | |
| "learning_rate": 4.7634290131174184e-05, | |
| "loss": -0.23532718420028687, | |
| "num_tokens": 34954989.0, | |
| "reward": 3.52734375, | |
| "reward_std": 2.2471840381622314, | |
| "rewards/reward_func/mean": 0.3919270833333333, | |
| "rewards/reward_func/std": 0.3482829729715983, | |
| "sampling/importance_sampling_ratio/max": 2.9946084022521973, | |
| "sampling/importance_sampling_ratio/mean": 0.9581983685493469, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.114594459533691, | |
| "sampling/sampling_logp_difference/mean": 0.1707635223865509, | |
| "step": 232, | |
| "step_time": 186.30969157605432 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3270.0, | |
| "completions/mean_length": 1146.046875, | |
| "completions/mean_terminated_length": 1056.482177734375, | |
| "completions/min_length": 47.0, | |
| "completions/min_terminated_length": 47.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6580791026353836, | |
| "epoch": 0.5738916256157636, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02936261451483538, | |
| "kl": 0.01962541602551937, | |
| "learning_rate": 4.761364865075402e-05, | |
| "loss": -0.09561189264059067, | |
| "num_tokens": 35105408.0, | |
| "reward": 3.94140625, | |
| "reward_std": 2.1033525466918945, | |
| "rewards/reward_func/mean": 0.4379340277777778, | |
| "rewards/reward_func/std": 0.3491085684961743, | |
| "sampling/importance_sampling_ratio/max": 2.9998505115509033, | |
| "sampling/importance_sampling_ratio/mean": 0.9548863768577576, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.624435424804688, | |
| "sampling/sampling_logp_difference/mean": 0.1729675531387329, | |
| "step": 233, | |
| "step_time": 127.6355293749366 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2387.0, | |
| "completions/mean_length": 1181.0, | |
| "completions/mean_terminated_length": 965.26318359375, | |
| "completions/min_length": 209.0, | |
| "completions/min_terminated_length": 209.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6062022000551224, | |
| "epoch": 0.5763546798029556, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030717672720896462, | |
| "kl": 0.03280913829803467, | |
| "learning_rate": 4.7592922021922056e-05, | |
| "loss": -0.04321448132395744, | |
| "num_tokens": 35276032.0, | |
| "reward": 3.12109375, | |
| "reward_std": 2.1648309230804443, | |
| "rewards/reward_func/mean": 0.3467881944444444, | |
| "rewards/reward_func/std": 0.306332517001364, | |
| "sampling/importance_sampling_ratio/max": 2.9995665550231934, | |
| "sampling/importance_sampling_ratio/mean": 0.9498552083969116, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.928382873535156, | |
| "sampling/sampling_logp_difference/mean": 0.18584509193897247, | |
| "step": 234, | |
| "step_time": 145.97126659261994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2905.0, | |
| "completions/mean_length": 1276.90625, | |
| "completions/mean_terminated_length": 1030.436279296875, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6295378506183624, | |
| "epoch": 0.5788177339901478, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025004651439172708, | |
| "kl": 0.036536975763738155, | |
| "learning_rate": 4.757211032272141e-05, | |
| "loss": 0.016434896737337112, | |
| "num_tokens": 35441466.0, | |
| "reward": 3.7109375, | |
| "reward_std": 2.195873975753784, | |
| "rewards/reward_func/mean": 0.4123263888888889, | |
| "rewards/reward_func/std": 0.33889370991124046, | |
| "sampling/importance_sampling_ratio/max": 2.9977970123291016, | |
| "sampling/importance_sampling_ratio/mean": 0.9575643539428711, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.089038848876953, | |
| "sampling/sampling_logp_difference/mean": 0.166203573346138, | |
| "step": 235, | |
| "step_time": 124.31631009606645 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2577.0, | |
| "completions/mean_length": 1064.4375, | |
| "completions/mean_terminated_length": 977.6392822265625, | |
| "completions/min_length": 271.0, | |
| "completions/min_terminated_length": 271.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5670924335718155, | |
| "epoch": 0.5812807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03182262502657222, | |
| "kl": 0.03349051624536514, | |
| "learning_rate": 4.75512136315155e-05, | |
| "loss": -0.07744449377059937, | |
| "num_tokens": 35599350.0, | |
| "reward": 4.21484375, | |
| "reward_std": 1.93026602268219, | |
| "rewards/reward_func/mean": 0.4683159722222222, | |
| "rewards/reward_func/std": 0.33124545713265735, | |
| "sampling/importance_sampling_ratio/max": 2.99531888961792, | |
| "sampling/importance_sampling_ratio/mean": 0.9617635011672974, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.312065124511719, | |
| "sampling/sampling_logp_difference/mean": 0.161124125123024, | |
| "step": 236, | |
| "step_time": 161.99643060285598 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3437.0, | |
| "completions/mean_length": 1482.609375, | |
| "completions/mean_terminated_length": 1240.6207275390625, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5801850706338882, | |
| "epoch": 0.583743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025051146875974366, | |
| "kl": 0.021959123201668262, | |
| "learning_rate": 4.7530232026987807e-05, | |
| "loss": 0.03501718491315842, | |
| "num_tokens": 35774285.0, | |
| "reward": 4.34765625, | |
| "reward_std": 1.9968314170837402, | |
| "rewards/reward_func/mean": 0.4830729166666667, | |
| "rewards/reward_func/std": 0.3292863344152768, | |
| "sampling/importance_sampling_ratio/max": 2.999497890472412, | |
| "sampling/importance_sampling_ratio/mean": 0.9603710174560547, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.829028129577637, | |
| "sampling/sampling_logp_difference/mean": 0.1585114598274231, | |
| "step": 237, | |
| "step_time": 135.68384832888842 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3597.0, | |
| "completions/mean_length": 1484.640625, | |
| "completions/mean_terminated_length": 1111.58935546875, | |
| "completions/min_length": 211.0, | |
| "completions/min_terminated_length": 211.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6366446763277054, | |
| "epoch": 0.5862068965517241, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01741330264462691, | |
| "kl": 0.022137976717203856, | |
| "learning_rate": 4.75091655881415e-05, | |
| "loss": 0.06264235824346542, | |
| "num_tokens": 35944198.0, | |
| "reward": 4.78515625, | |
| "reward_std": 1.6170392036437988, | |
| "rewards/reward_func/mean": 0.5316840277777778, | |
| "rewards/reward_func/std": 0.260915905651119, | |
| "sampling/importance_sampling_ratio/max": 2.9989659786224365, | |
| "sampling/importance_sampling_ratio/mean": 0.9526357650756836, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.190343856811523, | |
| "sampling/sampling_logp_difference/mean": 0.18196536600589752, | |
| "step": 238, | |
| "step_time": 166.6468972011935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2563.0, | |
| "completions/mean_length": 1084.28125, | |
| "completions/mean_terminated_length": 889.0167236328125, | |
| "completions/min_length": 323.0, | |
| "completions/min_terminated_length": 323.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6271532773971558, | |
| "epoch": 0.5886699507389163, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04962193586561231, | |
| "kl": 0.10968296322971582, | |
| "learning_rate": 4.7488014394299205e-05, | |
| "loss": -0.17306064069271088, | |
| "num_tokens": 36116088.0, | |
| "reward": 4.125, | |
| "reward_std": 1.8126540184020996, | |
| "rewards/reward_func/mean": 0.4583333333333333, | |
| "rewards/reward_func/std": 0.2737976892126931, | |
| "sampling/importance_sampling_ratio/max": 2.998816967010498, | |
| "sampling/importance_sampling_ratio/mean": 0.9479783177375793, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 20.247161865234375, | |
| "sampling/sampling_logp_difference/mean": 0.19447281956672668, | |
| "step": 239, | |
| "step_time": 173.26991621381603 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3430.0, | |
| "completions/mean_length": 1172.03125, | |
| "completions/mean_terminated_length": 1077.7095947265625, | |
| "completions/min_length": 256.0, | |
| "completions/min_terminated_length": 256.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5948723554611206, | |
| "epoch": 0.5911330049261084, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026709746483499667, | |
| "kl": 0.03804912185296416, | |
| "learning_rate": 4.746677852510267e-05, | |
| "loss": -0.038154736161231995, | |
| "num_tokens": 36276522.0, | |
| "reward": 4.55859375, | |
| "reward_std": 1.704768180847168, | |
| "rewards/reward_func/mean": 0.5065104166666666, | |
| "rewards/reward_func/std": 0.268581575817532, | |
| "sampling/importance_sampling_ratio/max": 2.9988701343536377, | |
| "sampling/importance_sampling_ratio/mean": 0.9554129242897034, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.894377708435059, | |
| "sampling/sampling_logp_difference/mean": 0.17530812323093414, | |
| "step": 240, | |
| "step_time": 152.09504526108503 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3526.0, | |
| "completions/max_terminated_length": 3526.0, | |
| "completions/mean_length": 1202.0, | |
| "completions/mean_terminated_length": 1201.6826171875, | |
| "completions/min_length": 308.0, | |
| "completions/min_terminated_length": 308.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6431476175785065, | |
| "epoch": 0.5935960591133005, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025089827367739004, | |
| "kl": 0.018822600599378347, | |
| "learning_rate": 4.7445458060512484e-05, | |
| "loss": -0.08312501013278961, | |
| "num_tokens": 36449114.0, | |
| "reward": 4.50390625, | |
| "reward_std": 1.6970399618148804, | |
| "rewards/reward_func/mean": 0.5004340277777778, | |
| "rewards/reward_func/std": 0.3032427848213249, | |
| "sampling/importance_sampling_ratio/max": 2.9981064796447754, | |
| "sampling/importance_sampling_ratio/mean": 0.9516088366508484, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.683661460876465, | |
| "sampling/sampling_logp_difference/mean": 0.18970529735088348, | |
| "step": 241, | |
| "step_time": 157.17970843007788 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2980.0, | |
| "completions/max_terminated_length": 2980.0, | |
| "completions/mean_length": 851.984375, | |
| "completions/mean_terminated_length": 851.984375, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5084992349147797, | |
| "epoch": 0.5960591133004927, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02559628508870623, | |
| "kl": 0.019516848493367434, | |
| "learning_rate": 4.742405308080775e-05, | |
| "loss": 0.002505837008357048, | |
| "num_tokens": 36587897.0, | |
| "reward": 4.5390625, | |
| "reward_std": 1.5686782598495483, | |
| "rewards/reward_func/mean": 0.5043402777777778, | |
| "rewards/reward_func/std": 0.24406109833055073, | |
| "sampling/importance_sampling_ratio/max": 2.998159408569336, | |
| "sampling/importance_sampling_ratio/mean": 0.9689394235610962, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.643729209899902, | |
| "sampling/sampling_logp_difference/mean": 0.1409374475479126, | |
| "step": 242, | |
| "step_time": 91.55293344007805 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3363.0, | |
| "completions/max_terminated_length": 3333.0, | |
| "completions/mean_length": 819.078125, | |
| "completions/mean_terminated_length": 778.698486328125, | |
| "completions/min_length": 264.0, | |
| "completions/min_terminated_length": 264.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5793705433607101, | |
| "epoch": 0.5985221674876847, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.06361807348388746, | |
| "kl": 0.153956466820091, | |
| "learning_rate": 4.7402563666585817e-05, | |
| "loss": 0.19218356907367706, | |
| "num_tokens": 36724926.0, | |
| "reward": 4.33203125, | |
| "reward_std": 1.7366580963134766, | |
| "rewards/reward_func/mean": 0.4813368055555556, | |
| "rewards/reward_func/std": 0.24346151699622473, | |
| "sampling/importance_sampling_ratio/max": 2.9986538887023926, | |
| "sampling/importance_sampling_ratio/mean": 0.9620853662490845, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.622957229614258, | |
| "sampling/sampling_logp_difference/mean": 0.16694733500480652, | |
| "step": 243, | |
| "step_time": 98.90079542505555 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2506.0, | |
| "completions/mean_length": 1158.15625, | |
| "completions/mean_terminated_length": 962.300048828125, | |
| "completions/min_length": 282.0, | |
| "completions/min_terminated_length": 282.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6488623917102814, | |
| "epoch": 0.6009852216748769, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024661429253724953, | |
| "kl": 0.019309990806505084, | |
| "learning_rate": 4.7380989898761957e-05, | |
| "loss": -0.28670600056648254, | |
| "num_tokens": 36882968.0, | |
| "reward": 4.1796875, | |
| "reward_std": 1.8900684118270874, | |
| "rewards/reward_func/mean": 0.4644097222222222, | |
| "rewards/reward_func/std": 0.27070868843131596, | |
| "sampling/importance_sampling_ratio/max": 2.983285427093506, | |
| "sampling/importance_sampling_ratio/mean": 0.9530576467514038, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.78699779510498, | |
| "sampling/sampling_logp_difference/mean": 0.17854541540145874, | |
| "step": 244, | |
| "step_time": 139.11421747365966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2858.0, | |
| "completions/mean_length": 876.515625, | |
| "completions/mean_terminated_length": 815.5, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6686416566371918, | |
| "epoch": 0.603448275862069, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03471602827656467, | |
| "kl": 0.02017925539985299, | |
| "learning_rate": 4.735933185856906e-05, | |
| "loss": -0.05181242153048515, | |
| "num_tokens": 37020313.0, | |
| "reward": 3.859375, | |
| "reward_std": 1.9779914617538452, | |
| "rewards/reward_func/mean": 0.4288194444444444, | |
| "rewards/reward_func/std": 0.3127517153819402, | |
| "sampling/importance_sampling_ratio/max": 2.994431972503662, | |
| "sampling/importance_sampling_ratio/mean": 0.9652891159057617, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.530597686767578, | |
| "sampling/sampling_logp_difference/mean": 0.16316168010234833, | |
| "step": 245, | |
| "step_time": 118.19300652644597 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2223.0, | |
| "completions/mean_length": 1040.265625, | |
| "completions/mean_terminated_length": 939.4261474609375, | |
| "completions/min_length": 270.0, | |
| "completions/min_terminated_length": 270.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.645111545920372, | |
| "epoch": 0.6059113300492611, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.036281618178556546, | |
| "kl": 0.022860198747366667, | |
| "learning_rate": 4.733758962755734e-05, | |
| "loss": 0.048598241060972214, | |
| "num_tokens": 37174378.0, | |
| "reward": 3.98046875, | |
| "reward_std": 1.978206992149353, | |
| "rewards/reward_func/mean": 0.4422743055555556, | |
| "rewards/reward_func/std": 0.29514625171820325, | |
| "sampling/importance_sampling_ratio/max": 2.9986579418182373, | |
| "sampling/importance_sampling_ratio/mean": 0.9547132849693298, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.925469398498535, | |
| "sampling/sampling_logp_difference/mean": 0.18641415238380432, | |
| "step": 246, | |
| "step_time": 123.81210570293479 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3950.0, | |
| "completions/max_terminated_length": 3950.0, | |
| "completions/mean_length": 723.46875, | |
| "completions/mean_terminated_length": 723.46875, | |
| "completions/min_length": 210.0, | |
| "completions/min_terminated_length": 210.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6742394268512726, | |
| "epoch": 0.6083743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04995747395315036, | |
| "kl": 0.026606498286128044, | |
| "learning_rate": 4.7315763287594e-05, | |
| "loss": 0.2653157711029053, | |
| "num_tokens": 37301944.0, | |
| "reward": 3.33984375, | |
| "reward_std": 2.105532646179199, | |
| "rewards/reward_func/mean": 0.37109375, | |
| "rewards/reward_func/std": 0.2963992158571879, | |
| "sampling/importance_sampling_ratio/max": 2.9936087131500244, | |
| "sampling/importance_sampling_ratio/mean": 0.9594826102256775, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.18620491027832, | |
| "sampling/sampling_logp_difference/mean": 0.182135671377182, | |
| "step": 247, | |
| "step_time": 103.0024021465797 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4061.0, | |
| "completions/mean_length": 1108.421875, | |
| "completions/mean_terminated_length": 974.11669921875, | |
| "completions/min_length": 204.0, | |
| "completions/min_terminated_length": 274.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6455037593841553, | |
| "epoch": 0.6108374384236454, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03148143550121695, | |
| "kl": 0.01934328768402338, | |
| "learning_rate": 4.729385292086297e-05, | |
| "loss": -0.24869614839553833, | |
| "num_tokens": 37464915.0, | |
| "reward": 3.55078125, | |
| "reward_std": 2.086036205291748, | |
| "rewards/reward_func/mean": 0.39453125, | |
| "rewards/reward_func/std": 0.2936388701200485, | |
| "sampling/importance_sampling_ratio/max": 2.9959442615509033, | |
| "sampling/importance_sampling_ratio/mean": 0.9551997780799866, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.284041404724121, | |
| "sampling/sampling_logp_difference/mean": 0.17975515127182007, | |
| "step": 248, | |
| "step_time": 145.99462776235305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2113.0, | |
| "completions/mean_length": 939.1875, | |
| "completions/mean_terminated_length": 837.3547973632812, | |
| "completions/min_length": 228.0, | |
| "completions/min_terminated_length": 228.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6875911951065063, | |
| "epoch": 0.6133004926108374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03540754558977053, | |
| "kl": 0.026569989509880543, | |
| "learning_rate": 4.727185860986454e-05, | |
| "loss": 0.04270695894956589, | |
| "num_tokens": 37604927.0, | |
| "reward": 2.984375, | |
| "reward_std": 2.1193904876708984, | |
| "rewards/reward_func/mean": 0.3315972222222222, | |
| "rewards/reward_func/std": 0.2987919400135676, | |
| "sampling/importance_sampling_ratio/max": 2.998774290084839, | |
| "sampling/importance_sampling_ratio/mean": 0.957014799118042, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.659464836120605, | |
| "sampling/sampling_logp_difference/mean": 0.1871645301580429, | |
| "step": 249, | |
| "step_time": 121.0595855594147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3092.0, | |
| "completions/max_terminated_length": 3092.0, | |
| "completions/mean_length": 817.203125, | |
| "completions/mean_terminated_length": 817.203125, | |
| "completions/min_length": 226.0, | |
| "completions/min_terminated_length": 226.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6271493583917618, | |
| "epoch": 0.6157635467980296, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.040183803438341846, | |
| "kl": 0.022904008626937866, | |
| "learning_rate": 4.72497804374151e-05, | |
| "loss": 0.10319769382476807, | |
| "num_tokens": 37735276.0, | |
| "reward": 3.65625, | |
| "reward_std": 1.9535783529281616, | |
| "rewards/reward_func/mean": 0.40625, | |
| "rewards/reward_func/std": 0.27815084324942696, | |
| "sampling/importance_sampling_ratio/max": 2.9912922382354736, | |
| "sampling/importance_sampling_ratio/mean": 0.9657875299453735, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.99195671081543, | |
| "sampling/sampling_logp_difference/mean": 0.15472693741321564, | |
| "step": 250, | |
| "step_time": 92.37789764790796 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3455.0, | |
| "completions/mean_length": 893.828125, | |
| "completions/mean_terminated_length": 843.0000610351562, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6401469260454178, | |
| "epoch": 0.6182266009852216, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03978805998415212, | |
| "kl": 0.03360833553597331, | |
| "learning_rate": 4.722761848664681e-05, | |
| "loss": 0.2213575392961502, | |
| "num_tokens": 37874577.0, | |
| "reward": 3.01171875, | |
| "reward_std": 2.0607969760894775, | |
| "rewards/reward_func/mean": 0.3346354166666667, | |
| "rewards/reward_func/std": 0.2820756352610058, | |
| "sampling/importance_sampling_ratio/max": 2.979861259460449, | |
| "sampling/importance_sampling_ratio/mean": 0.9564794301986694, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 8.897414207458496, | |
| "sampling/sampling_logp_difference/mean": 0.17922547459602356, | |
| "step": 251, | |
| "step_time": 146.73478505690582 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2845.0, | |
| "completions/max_terminated_length": 2845.0, | |
| "completions/mean_length": 720.3125, | |
| "completions/mean_terminated_length": 712.6290283203125, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6507443785667419, | |
| "epoch": 0.6206896551724138, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.039921392666201184, | |
| "kl": 0.031218301504850388, | |
| "learning_rate": 4.720537284100728e-05, | |
| "loss": -0.04249424859881401, | |
| "num_tokens": 38001621.0, | |
| "reward": 3.25390625, | |
| "reward_std": 1.9757920503616333, | |
| "rewards/reward_func/mean": 0.3615451388888889, | |
| "rewards/reward_func/std": 0.2661890693836742, | |
| "sampling/importance_sampling_ratio/max": 2.9958386421203613, | |
| "sampling/importance_sampling_ratio/mean": 0.9622673392295837, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.243165969848633, | |
| "sampling/sampling_logp_difference/mean": 0.1693202704191208, | |
| "step": 252, | |
| "step_time": 101.49904671194963 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2814.0, | |
| "completions/mean_length": 1095.40625, | |
| "completions/mean_terminated_length": 947.8359985351562, | |
| "completions/min_length": 198.0, | |
| "completions/min_terminated_length": 198.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5974663645029068, | |
| "epoch": 0.6231527093596059, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022696110143372545, | |
| "kl": 0.019222368020564318, | |
| "learning_rate": 4.7183043584259254e-05, | |
| "loss": -0.07331643998622894, | |
| "num_tokens": 38154367.0, | |
| "reward": 4.56640625, | |
| "reward_std": 1.6361746788024902, | |
| "rewards/reward_func/mean": 0.5073784722222222, | |
| "rewards/reward_func/std": 0.27994223973817295, | |
| "sampling/importance_sampling_ratio/max": 2.9847824573516846, | |
| "sampling/importance_sampling_ratio/mean": 0.9601361751556396, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.678767204284668, | |
| "sampling/sampling_logp_difference/mean": 0.1662071794271469, | |
| "step": 253, | |
| "step_time": 172.382303963881 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1340.0, | |
| "completions/max_terminated_length": 1340.0, | |
| "completions/mean_length": 812.78125, | |
| "completions/mean_terminated_length": 812.78125, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6090056896209717, | |
| "epoch": 0.625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.05873182180638249, | |
| "kl": 0.022160206688567996, | |
| "learning_rate": 4.716063080048031e-05, | |
| "loss": 0.07988754659891129, | |
| "num_tokens": 38288849.0, | |
| "reward": 3.99609375, | |
| "reward_std": 1.828723669052124, | |
| "rewards/reward_func/mean": 0.4440104166666667, | |
| "rewards/reward_func/std": 0.2540795885854297, | |
| "sampling/importance_sampling_ratio/max": 2.9953503608703613, | |
| "sampling/importance_sampling_ratio/mean": 0.9602220058441162, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.241506576538086, | |
| "sampling/sampling_logp_difference/mean": 0.1675276756286621, | |
| "step": 254, | |
| "step_time": 66.64111198205501 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2275.0, | |
| "completions/max_terminated_length": 2275.0, | |
| "completions/mean_length": 746.984375, | |
| "completions/mean_terminated_length": 746.984375, | |
| "completions/min_length": 261.0, | |
| "completions/min_terminated_length": 261.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5543633997440338, | |
| "epoch": 0.6280788177339901, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03127580001868004, | |
| "kl": 0.029117131140083075, | |
| "learning_rate": 4.713813457406253e-05, | |
| "loss": 0.04796233028173447, | |
| "num_tokens": 38418288.0, | |
| "reward": 4.15625, | |
| "reward_std": 1.6923614740371704, | |
| "rewards/reward_func/mean": 0.4618055555555556, | |
| "rewards/reward_func/std": 0.20467053850491843, | |
| "sampling/importance_sampling_ratio/max": 2.996886730194092, | |
| "sampling/importance_sampling_ratio/mean": 0.9699411392211914, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.242613792419434, | |
| "sampling/sampling_logp_difference/mean": 0.13953782618045807, | |
| "step": 255, | |
| "step_time": 103.97602935507894 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2907.0, | |
| "completions/mean_length": 979.46875, | |
| "completions/mean_terminated_length": 878.9354858398438, | |
| "completions/min_length": 317.0, | |
| "completions/min_terminated_length": 317.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5546245723962784, | |
| "epoch": 0.6305418719211823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029271437720684072, | |
| "kl": 0.02258498244918883, | |
| "learning_rate": 4.7115554989712185e-05, | |
| "loss": 0.043127041310071945, | |
| "num_tokens": 38572830.0, | |
| "reward": 4.13671875, | |
| "reward_std": 1.8316712379455566, | |
| "rewards/reward_func/mean": 0.4596354166666667, | |
| "rewards/reward_func/std": 0.2634606758753459, | |
| "sampling/importance_sampling_ratio/max": 2.999389171600342, | |
| "sampling/importance_sampling_ratio/mean": 0.9619561433792114, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.921653747558594, | |
| "sampling/sampling_logp_difference/mean": 0.15635137259960175, | |
| "step": 256, | |
| "step_time": 217.4995983429253 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3510.0, | |
| "completions/mean_length": 893.671875, | |
| "completions/mean_terminated_length": 845.8524169921875, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.665133610367775, | |
| "epoch": 0.6330049261083743, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03984192918620892, | |
| "kl": 0.046457535587251186, | |
| "learning_rate": 4.709289213244943e-05, | |
| "loss": 0.10279978066682816, | |
| "num_tokens": 38707145.0, | |
| "reward": 3.765625, | |
| "reward_std": 1.9659174680709839, | |
| "rewards/reward_func/mean": 0.4184027777777778, | |
| "rewards/reward_func/std": 0.2853093130720986, | |
| "sampling/importance_sampling_ratio/max": 2.995044708251953, | |
| "sampling/importance_sampling_ratio/mean": 0.9582663774490356, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.383708953857422, | |
| "sampling/sampling_logp_difference/mean": 0.17232593894004822, | |
| "step": 257, | |
| "step_time": 118.28496656008065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1689.0, | |
| "completions/mean_length": 835.921875, | |
| "completions/mean_terminated_length": 784.1746215820312, | |
| "completions/min_length": 359.0, | |
| "completions/min_terminated_length": 359.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6838351041078568, | |
| "epoch": 0.6354679802955665, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.03628155122006123, | |
| "kl": 0.04887444619089365, | |
| "learning_rate": 4.707014608760797e-05, | |
| "loss": -0.004470369778573513, | |
| "num_tokens": 38837156.0, | |
| "reward": 4.25390625, | |
| "reward_std": 1.6496105194091797, | |
| "rewards/reward_func/mean": 0.47265625, | |
| "rewards/reward_func/std": 0.21708844270971087, | |
| "sampling/importance_sampling_ratio/max": 2.9985907077789307, | |
| "sampling/importance_sampling_ratio/mean": 0.9622358083724976, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.900800704956055, | |
| "sampling/sampling_logp_difference/mean": 0.1803233027458191, | |
| "step": 258, | |
| "step_time": 123.32645462709479 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3112.0, | |
| "completions/mean_length": 1010.75, | |
| "completions/mean_terminated_length": 957.8709106445312, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6722955405712128, | |
| "epoch": 0.6379310344827587, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028878722694508367, | |
| "kl": 0.02964442828670144, | |
| "learning_rate": 4.704731694083472e-05, | |
| "loss": -0.09307853877544403, | |
| "num_tokens": 38994644.0, | |
| "reward": 4.13671875, | |
| "reward_std": 1.8365392684936523, | |
| "rewards/reward_func/mean": 0.4596354166666667, | |
| "rewards/reward_func/std": 0.2944721562994851, | |
| "sampling/importance_sampling_ratio/max": 2.999586582183838, | |
| "sampling/importance_sampling_ratio/mean": 0.9555143117904663, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.124902725219727, | |
| "sampling/sampling_logp_difference/mean": 0.18143421411514282, | |
| "step": 259, | |
| "step_time": 159.91136281075887 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3131.0, | |
| "completions/max_terminated_length": 3131.0, | |
| "completions/mean_length": 962.84375, | |
| "completions/mean_terminated_length": 968.6032104492188, | |
| "completions/min_length": 214.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5780832916498184, | |
| "epoch": 0.6403940886699507, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13977232949753662, | |
| "kl": 0.02337419893592596, | |
| "learning_rate": 4.7024404778089535e-05, | |
| "loss": 0.2663920521736145, | |
| "num_tokens": 39152794.0, | |
| "reward": 4.54296875, | |
| "reward_std": 1.6020538806915283, | |
| "rewards/reward_func/mean": 0.5047743055555556, | |
| "rewards/reward_func/std": 0.25629327860143447, | |
| "sampling/importance_sampling_ratio/max": 2.9913201332092285, | |
| "sampling/importance_sampling_ratio/mean": 0.9610067009925842, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.624191284179688, | |
| "sampling/sampling_logp_difference/mean": 0.15946471691131592, | |
| "step": 260, | |
| "step_time": 93.53231204720214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2421.0, | |
| "completions/mean_length": 1025.125, | |
| "completions/mean_terminated_length": 974.4677124023438, | |
| "completions/min_length": 188.0, | |
| "completions/min_terminated_length": 188.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7002881020307541, | |
| "epoch": 0.6428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02946249950049306, | |
| "kl": 0.02944745123386383, | |
| "learning_rate": 4.7001409685644824e-05, | |
| "loss": -0.18909524381160736, | |
| "num_tokens": 39310626.0, | |
| "reward": 4.3828125, | |
| "reward_std": 1.728592872619629, | |
| "rewards/reward_func/mean": 0.4869791666666667, | |
| "rewards/reward_func/std": 0.280566586388482, | |
| "sampling/importance_sampling_ratio/max": 2.998537540435791, | |
| "sampling/importance_sampling_ratio/mean": 0.952286958694458, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.772732734680176, | |
| "sampling/sampling_logp_difference/mean": 0.18932656943798065, | |
| "step": 261, | |
| "step_time": 139.19076150492765 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3891.0, | |
| "completions/mean_length": 1219.21875, | |
| "completions/mean_terminated_length": 1049.6949462890625, | |
| "completions/min_length": 317.0, | |
| "completions/min_terminated_length": 317.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6386135667562485, | |
| "epoch": 0.645320197044335, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03738181468253387, | |
| "kl": 0.038119449745863676, | |
| "learning_rate": 4.697833175008528e-05, | |
| "loss": 0.21018671989440918, | |
| "num_tokens": 39465136.0, | |
| "reward": 4.01171875, | |
| "reward_std": 1.8694652318954468, | |
| "rewards/reward_func/mean": 0.4457465277777778, | |
| "rewards/reward_func/std": 0.29815065529611373, | |
| "sampling/importance_sampling_ratio/max": 2.9996824264526367, | |
| "sampling/importance_sampling_ratio/mean": 0.9597668647766113, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.762887954711914, | |
| "sampling/sampling_logp_difference/mean": 0.16622015833854675, | |
| "step": 262, | |
| "step_time": 129.9858180533629 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2786.0, | |
| "completions/mean_length": 1279.234375, | |
| "completions/mean_terminated_length": 1076.4827880859375, | |
| "completions/min_length": 295.0, | |
| "completions/min_terminated_length": 295.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5774033218622208, | |
| "epoch": 0.6477832512315271, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02127924275094592, | |
| "kl": 0.020256227115169168, | |
| "learning_rate": 4.695517105830752e-05, | |
| "loss": -0.029037898406386375, | |
| "num_tokens": 39632367.0, | |
| "reward": 4.48046875, | |
| "reward_std": 1.5725224018096924, | |
| "rewards/reward_func/mean": 0.4978298611111111, | |
| "rewards/reward_func/std": 0.24498725765281254, | |
| "sampling/importance_sampling_ratio/max": 2.997105360031128, | |
| "sampling/importance_sampling_ratio/mean": 0.959444522857666, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.374399185180664, | |
| "sampling/sampling_logp_difference/mean": 0.16105125844478607, | |
| "step": 263, | |
| "step_time": 134.9195441652555 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3584.0, | |
| "completions/mean_length": 1325.46875, | |
| "completions/mean_terminated_length": 1227.8333740234375, | |
| "completions/min_length": 421.0, | |
| "completions/min_terminated_length": 421.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6147435158491135, | |
| "epoch": 0.6502463054187192, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01633085128646359, | |
| "kl": 0.022814700147137046, | |
| "learning_rate": 4.6931927697519764e-05, | |
| "loss": -0.1795727014541626, | |
| "num_tokens": 39799197.0, | |
| "reward": 4.76171875, | |
| "reward_std": 1.5297510623931885, | |
| "rewards/reward_func/mean": 0.5290798611111112, | |
| "rewards/reward_func/std": 0.24876019855340323, | |
| "sampling/importance_sampling_ratio/max": 2.994166374206543, | |
| "sampling/importance_sampling_ratio/mean": 0.9519761204719543, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 21.247499465942383, | |
| "sampling/sampling_logp_difference/mean": 0.17912611365318298, | |
| "step": 264, | |
| "step_time": 176.04406498302706 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2367.0, | |
| "completions/mean_length": 1012.0625, | |
| "completions/mean_terminated_length": 776.8643798828125, | |
| "completions/min_length": 174.0, | |
| "completions/min_terminated_length": 174.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8938929438591003, | |
| "epoch": 0.6527093596059114, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02507752291342934, | |
| "kl": 0.026316776871681213, | |
| "learning_rate": 4.690860175524151e-05, | |
| "loss": -0.04639795422554016, | |
| "num_tokens": 39950705.0, | |
| "reward": 4.5, | |
| "reward_std": 1.4993385076522827, | |
| "rewards/reward_func/mean": 0.5, | |
| "rewards/reward_func/std": 0.21232767485909992, | |
| "sampling/importance_sampling_ratio/max": 2.997582197189331, | |
| "sampling/importance_sampling_ratio/mean": 0.95402991771698, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.430258750915527, | |
| "sampling/sampling_logp_difference/mean": 0.18939216434955597, | |
| "step": 265, | |
| "step_time": 193.00349966436625 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2397.0, | |
| "completions/mean_length": 1005.28125, | |
| "completions/mean_terminated_length": 781.5689697265625, | |
| "completions/min_length": 306.0, | |
| "completions/min_terminated_length": 306.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6178922802209854, | |
| "epoch": 0.6551724137931034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02221771680669154, | |
| "kl": 0.030900841113179922, | |
| "learning_rate": 4.688519331930321e-05, | |
| "loss": -0.07329220324754715, | |
| "num_tokens": 40096659.0, | |
| "reward": 4.40625, | |
| "reward_std": 1.6284303665161133, | |
| "rewards/reward_func/mean": 0.4895833333333333, | |
| "rewards/reward_func/std": 0.25941121329863864, | |
| "sampling/importance_sampling_ratio/max": 2.98852801322937, | |
| "sampling/importance_sampling_ratio/mean": 0.9561002254486084, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.55964469909668, | |
| "sampling/sampling_logp_difference/mean": 0.16870857775211334, | |
| "step": 266, | |
| "step_time": 172.4602910319809 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3033.0, | |
| "completions/max_terminated_length": 3033.0, | |
| "completions/mean_length": 964.5625, | |
| "completions/mean_terminated_length": 949.6826171875, | |
| "completions/min_length": 244.0, | |
| "completions/min_terminated_length": 244.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5864181816577911, | |
| "epoch": 0.6576354679802956, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.027350824819978775, | |
| "kl": 0.021911869291216135, | |
| "learning_rate": 4.6861702477845924e-05, | |
| "loss": 0.0393938347697258, | |
| "num_tokens": 40235127.0, | |
| "reward": 4.68359375, | |
| "reward_std": 1.304176926612854, | |
| "rewards/reward_func/mean": 0.5203993055555556, | |
| "rewards/reward_func/std": 0.1879904866218567, | |
| "sampling/importance_sampling_ratio/max": 2.9985761642456055, | |
| "sampling/importance_sampling_ratio/mean": 0.9627262949943542, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.747222900390625, | |
| "sampling/sampling_logp_difference/mean": 0.14875806868076324, | |
| "step": 267, | |
| "step_time": 91.70279070711695 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3110.0, | |
| "completions/mean_length": 951.640625, | |
| "completions/mean_terminated_length": 796.9999389648438, | |
| "completions/min_length": 147.0, | |
| "completions/min_terminated_length": 147.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6354534775018692, | |
| "epoch": 0.6600985221674877, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026974131553898818, | |
| "kl": 0.03585304832085967, | |
| "learning_rate": 4.683812931932103e-05, | |
| "loss": -0.014013536274433136, | |
| "num_tokens": 40384928.0, | |
| "reward": 4.3828125, | |
| "reward_std": 1.5157694816589355, | |
| "rewards/reward_func/mean": 0.4869791666666667, | |
| "rewards/reward_func/std": 0.21941063967016008, | |
| "sampling/importance_sampling_ratio/max": 2.994793176651001, | |
| "sampling/importance_sampling_ratio/mean": 0.9575902223587036, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.459250450134277, | |
| "sampling/sampling_logp_difference/mean": 0.1708674430847168, | |
| "step": 268, | |
| "step_time": 159.46288973209448 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2444.0, | |
| "completions/mean_length": 1185.3125, | |
| "completions/mean_terminated_length": 934.8275756835938, | |
| "completions/min_length": 334.0, | |
| "completions/min_terminated_length": 334.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6692711859941483, | |
| "epoch": 0.6625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022866791282923456, | |
| "kl": 0.02396966377273202, | |
| "learning_rate": 4.681447393248981e-05, | |
| "loss": 0.06689761579036713, | |
| "num_tokens": 40535876.0, | |
| "reward": 4.7734375, | |
| "reward_std": 1.4189189672470093, | |
| "rewards/reward_func/mean": 0.5303819444444444, | |
| "rewards/reward_func/std": 0.2291757870051596, | |
| "sampling/importance_sampling_ratio/max": 2.9907138347625732, | |
| "sampling/importance_sampling_ratio/mean": 0.9582573175430298, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.398053169250488, | |
| "sampling/sampling_logp_difference/mean": 0.1735519915819168, | |
| "step": 269, | |
| "step_time": 120.66145277698524 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3855.0, | |
| "completions/mean_length": 1729.953125, | |
| "completions/mean_terminated_length": 1403.132080078125, | |
| "completions/min_length": 333.0, | |
| "completions/min_terminated_length": 333.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7276947647333145, | |
| "epoch": 0.6650246305418719, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014130147226078223, | |
| "kl": 0.023325514513999224, | |
| "learning_rate": 4.679073640642321e-05, | |
| "loss": -0.03754594177007675, | |
| "num_tokens": 40748321.0, | |
| "reward": 4.91796875, | |
| "reward_std": 1.3837597370147705, | |
| "rewards/reward_func/mean": 0.5464409722222222, | |
| "rewards/reward_func/std": 0.23101985620127785, | |
| "sampling/importance_sampling_ratio/max": 2.9986205101013184, | |
| "sampling/importance_sampling_ratio/mean": 0.9422482252120972, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.669647216796875, | |
| "sampling/sampling_logp_difference/mean": 0.20420311391353607, | |
| "step": 270, | |
| "step_time": 199.377461778 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4010.0, | |
| "completions/mean_length": 1326.546875, | |
| "completions/mean_terminated_length": 1076.350830078125, | |
| "completions/min_length": 402.0, | |
| "completions/min_terminated_length": 402.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5661157891154289, | |
| "epoch": 0.6674876847290641, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025973586434166324, | |
| "kl": 0.01932303886860609, | |
| "learning_rate": 4.676691683050142e-05, | |
| "loss": 0.06925665587186813, | |
| "num_tokens": 40915540.0, | |
| "reward": 4.37109375, | |
| "reward_std": 1.922089695930481, | |
| "rewards/reward_func/mean": 0.4856770833333333, | |
| "rewards/reward_func/std": 0.2948591311772664, | |
| "sampling/importance_sampling_ratio/max": 2.9921767711639404, | |
| "sampling/importance_sampling_ratio/mean": 0.9636217355728149, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 22.38996696472168, | |
| "sampling/sampling_logp_difference/mean": 0.14760522544384003, | |
| "step": 271, | |
| "step_time": 133.28347809705883 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3370.0, | |
| "completions/mean_length": 1276.25, | |
| "completions/mean_terminated_length": 1048.17236328125, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.808114305138588, | |
| "epoch": 0.6699507389162561, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024293892150334624, | |
| "kl": 0.03092290833592415, | |
| "learning_rate": 4.6743015294413606e-05, | |
| "loss": -0.10823698341846466, | |
| "num_tokens": 41080932.0, | |
| "reward": 4.44921875, | |
| "reward_std": 1.7689896821975708, | |
| "rewards/reward_func/mean": 0.4943576388888889, | |
| "rewards/reward_func/std": 0.2751389775011275, | |
| "sampling/importance_sampling_ratio/max": 2.9939756393432617, | |
| "sampling/importance_sampling_ratio/mean": 0.9468337893486023, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.56399917602539, | |
| "sampling/sampling_logp_difference/mean": 0.1977367103099823, | |
| "step": 272, | |
| "step_time": 132.46389366080984 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3550.0, | |
| "completions/mean_length": 1582.265625, | |
| "completions/mean_terminated_length": 1304.0726318359375, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6703736484050751, | |
| "epoch": 0.6724137931034483, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022956701309354365, | |
| "kl": 0.02878654282540083, | |
| "learning_rate": 4.671903188815754e-05, | |
| "loss": -0.15021948516368866, | |
| "num_tokens": 41272757.0, | |
| "reward": 4.14453125, | |
| "reward_std": 1.8876187801361084, | |
| "rewards/reward_func/mean": 0.4605034722222222, | |
| "rewards/reward_func/std": 0.28286059117979473, | |
| "sampling/importance_sampling_ratio/max": 2.9985098838806152, | |
| "sampling/importance_sampling_ratio/mean": 0.9500502943992615, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.494309425354004, | |
| "sampling/sampling_logp_difference/mean": 0.1767929345369339, | |
| "step": 273, | |
| "step_time": 130.30867488658987 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2626.0, | |
| "completions/mean_length": 1213.53125, | |
| "completions/mean_terminated_length": 966.8070068359375, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.627654418349266, | |
| "epoch": 0.6748768472906403, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018783698500290817, | |
| "kl": 0.027171143796294928, | |
| "learning_rate": 4.6694966702039236e-05, | |
| "loss": -0.1029772013425827, | |
| "num_tokens": 41432935.0, | |
| "reward": 4.57421875, | |
| "reward_std": 1.6303664445877075, | |
| "rewards/reward_func/mean": 0.5082465277777778, | |
| "rewards/reward_func/std": 0.24990240236123404, | |
| "sampling/importance_sampling_ratio/max": 2.991697311401367, | |
| "sampling/importance_sampling_ratio/mean": 0.954232931137085, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.955520629882812, | |
| "sampling/sampling_logp_difference/mean": 0.16991396248340607, | |
| "step": 274, | |
| "step_time": 132.13556932890788 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3856.0, | |
| "completions/mean_length": 1401.796875, | |
| "completions/mean_terminated_length": 1125.388916015625, | |
| "completions/min_length": 341.0, | |
| "completions/min_terminated_length": 341.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7419492900371552, | |
| "epoch": 0.6773399014778325, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009325814947082865, | |
| "kl": 0.01900778664276004, | |
| "learning_rate": 4.667081982667269e-05, | |
| "loss": -0.07872651517391205, | |
| "num_tokens": 41599946.0, | |
| "reward": 4.84375, | |
| "reward_std": 1.2476167678833008, | |
| "rewards/reward_func/mean": 0.5381944444444444, | |
| "rewards/reward_func/std": 0.20155664533376694, | |
| "sampling/importance_sampling_ratio/max": 2.9985573291778564, | |
| "sampling/importance_sampling_ratio/mean": 0.957183837890625, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.758742332458496, | |
| "sampling/sampling_logp_difference/mean": 0.17596955597400665, | |
| "step": 275, | |
| "step_time": 126.71464370447211 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3458.0, | |
| "completions/mean_length": 995.3125, | |
| "completions/mean_terminated_length": 788.6000366210938, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.675742506980896, | |
| "epoch": 0.6798029556650246, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024999889994407377, | |
| "kl": 0.03196124825626612, | |
| "learning_rate": 4.6646591352979416e-05, | |
| "loss": 0.16251075267791748, | |
| "num_tokens": 41744126.0, | |
| "reward": 4.71484375, | |
| "reward_std": 1.3450874090194702, | |
| "rewards/reward_func/mean": 0.5238715277777778, | |
| "rewards/reward_func/std": 0.22701033618715075, | |
| "sampling/importance_sampling_ratio/max": 2.9886231422424316, | |
| "sampling/importance_sampling_ratio/mean": 0.9616168141365051, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.233985900878906, | |
| "sampling/sampling_logp_difference/mean": 0.1758139729499817, | |
| "step": 276, | |
| "step_time": 126.53751606796868 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2902.0, | |
| "completions/mean_length": 1149.875, | |
| "completions/mean_terminated_length": 889.0178833007812, | |
| "completions/min_length": 201.0, | |
| "completions/min_terminated_length": 201.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6683462113142014, | |
| "epoch": 0.6822660098522167, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.032914011113616616, | |
| "kl": 0.04408724885433912, | |
| "learning_rate": 4.6622281372188246e-05, | |
| "loss": 0.1322198510169983, | |
| "num_tokens": 41903958.0, | |
| "reward": 4.390625, | |
| "reward_std": 1.5898206233978271, | |
| "rewards/reward_func/mean": 0.4878472222222222, | |
| "rewards/reward_func/std": 0.2218614485528734, | |
| "sampling/importance_sampling_ratio/max": 2.99843168258667, | |
| "sampling/importance_sampling_ratio/mean": 0.9556390643119812, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.684946060180664, | |
| "sampling/sampling_logp_difference/mean": 0.17735861241817474, | |
| "step": 277, | |
| "step_time": 135.2150932047516 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3271.0, | |
| "completions/mean_length": 1111.453125, | |
| "completions/mean_terminated_length": 1061.3709716796875, | |
| "completions/min_length": 346.0, | |
| "completions/min_terminated_length": 346.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.56987564265728, | |
| "epoch": 0.6847290640394089, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.021262043963326146, | |
| "kl": 0.03181112464517355, | |
| "learning_rate": 4.6597889975834884e-05, | |
| "loss": 0.01715017482638359, | |
| "num_tokens": 42067075.0, | |
| "reward": 4.859375, | |
| "reward_std": 1.3553794622421265, | |
| "rewards/reward_func/mean": 0.5399305555555556, | |
| "rewards/reward_func/std": 0.22037654287285274, | |
| "sampling/importance_sampling_ratio/max": 2.99056077003479, | |
| "sampling/importance_sampling_ratio/mean": 0.9574146270751953, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.489864349365234, | |
| "sampling/sampling_logp_difference/mean": 0.1564047932624817, | |
| "step": 278, | |
| "step_time": 141.53353557991795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3224.0, | |
| "completions/mean_length": 1250.09375, | |
| "completions/mean_terminated_length": 1062.586181640625, | |
| "completions/min_length": 329.0, | |
| "completions/min_terminated_length": 329.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8913148045539856, | |
| "epoch": 0.687192118226601, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.019608777025347725, | |
| "kl": 0.02684358460828662, | |
| "learning_rate": 4.657341725576159e-05, | |
| "loss": -0.09490776062011719, | |
| "num_tokens": 42239385.0, | |
| "reward": 4.86328125, | |
| "reward_std": 1.4522353410720825, | |
| "rewards/reward_func/mean": 0.5403645833333334, | |
| "rewards/reward_func/std": 0.25109441661172444, | |
| "sampling/importance_sampling_ratio/max": 2.9914705753326416, | |
| "sampling/importance_sampling_ratio/mean": 0.9516316652297974, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.771811485290527, | |
| "sampling/sampling_logp_difference/mean": 0.18974481523036957, | |
| "step": 279, | |
| "step_time": 131.01170259085484 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2789.0, | |
| "completions/mean_length": 1550.015625, | |
| "completions/mean_terminated_length": 1175.8431396484375, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6808439493179321, | |
| "epoch": 0.6896551724137931, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024437911228958144, | |
| "kl": 0.025552792474627495, | |
| "learning_rate": 4.654886330411682e-05, | |
| "loss": -0.004699625074863434, | |
| "num_tokens": 42423594.0, | |
| "reward": 4.0078125, | |
| "reward_std": 2.062980890274048, | |
| "rewards/reward_func/mean": 0.4453125, | |
| "rewards/reward_func/std": 0.3020985300342242, | |
| "sampling/importance_sampling_ratio/max": 2.998654365539551, | |
| "sampling/importance_sampling_ratio/mean": 0.9476783275604248, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.08974266052246, | |
| "sampling/sampling_logp_difference/mean": 0.18657398223876953, | |
| "step": 280, | |
| "step_time": 137.77152940188535 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3953.0, | |
| "completions/mean_length": 1437.078125, | |
| "completions/mean_terminated_length": 1263.9830322265625, | |
| "completions/min_length": 458.0, | |
| "completions/min_terminated_length": 458.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7709734290838242, | |
| "epoch": 0.6921182266009852, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02035823015500469, | |
| "kl": 0.023151292465627193, | |
| "learning_rate": 4.6524228213354935e-05, | |
| "loss": -0.13394278287887573, | |
| "num_tokens": 42607135.0, | |
| "reward": 4.65234375, | |
| "reward_std": 1.4790985584259033, | |
| "rewards/reward_func/mean": 0.5169270833333334, | |
| "rewards/reward_func/std": 0.2436904509862264, | |
| "sampling/importance_sampling_ratio/max": 2.998265266418457, | |
| "sampling/importance_sampling_ratio/mean": 0.9457845687866211, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.194257736206055, | |
| "sampling/sampling_logp_difference/mean": 0.20592188835144043, | |
| "step": 281, | |
| "step_time": 127.69270811229944 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4053.0, | |
| "completions/mean_length": 1238.421875, | |
| "completions/mean_terminated_length": 1022.4035034179688, | |
| "completions/min_length": 368.0, | |
| "completions/min_terminated_length": 368.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6019101142883301, | |
| "epoch": 0.6945812807881774, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025745355691416272, | |
| "kl": 0.034187129233032465, | |
| "learning_rate": 4.649951207623579e-05, | |
| "loss": 0.05697305127978325, | |
| "num_tokens": 42769034.0, | |
| "reward": 4.66796875, | |
| "reward_std": 1.4967256784439087, | |
| "rewards/reward_func/mean": 0.5186631944444444, | |
| "rewards/reward_func/std": 0.23547837634881338, | |
| "sampling/importance_sampling_ratio/max": 2.9987599849700928, | |
| "sampling/importance_sampling_ratio/mean": 0.9599939584732056, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.365039825439453, | |
| "sampling/sampling_logp_difference/mean": 0.1549825668334961, | |
| "step": 282, | |
| "step_time": 139.72987941768952 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3680.0, | |
| "completions/mean_length": 1312.171875, | |
| "completions/mean_terminated_length": 1049.9285888671875, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7229396849870682, | |
| "epoch": 0.6970443349753694, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02292364974505966, | |
| "kl": 0.02232949109748006, | |
| "learning_rate": 4.647471498582441e-05, | |
| "loss": 0.04625914245843887, | |
| "num_tokens": 42932885.0, | |
| "reward": 4.8671875, | |
| "reward_std": 1.4029226303100586, | |
| "rewards/reward_func/mean": 0.5407986111111112, | |
| "rewards/reward_func/std": 0.23832206345266765, | |
| "sampling/importance_sampling_ratio/max": 2.9951465129852295, | |
| "sampling/importance_sampling_ratio/mean": 0.9536824226379395, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.12485408782959, | |
| "sampling/sampling_logp_difference/mean": 0.18273432552814484, | |
| "step": 283, | |
| "step_time": 134.87993472395465 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2364.0, | |
| "completions/mean_length": 1056.125, | |
| "completions/mean_terminated_length": 828.7118530273438, | |
| "completions/min_length": 302.0, | |
| "completions/min_terminated_length": 302.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7263010889291763, | |
| "epoch": 0.6995073891625616, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.020764056886669293, | |
| "kl": 0.03459874168038368, | |
| "learning_rate": 4.644983703549063e-05, | |
| "loss": 0.014433782547712326, | |
| "num_tokens": 43095389.0, | |
| "reward": 4.80859375, | |
| "reward_std": 1.465030312538147, | |
| "rewards/reward_func/mean": 0.5342881944444444, | |
| "rewards/reward_func/std": 0.2360355622238583, | |
| "sampling/importance_sampling_ratio/max": 2.9997096061706543, | |
| "sampling/importance_sampling_ratio/mean": 0.9541752934455872, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.002429008483887, | |
| "sampling/sampling_logp_difference/mean": 0.18615329265594482, | |
| "step": 284, | |
| "step_time": 130.1994199147448 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3178.0, | |
| "completions/mean_length": 1032.71875, | |
| "completions/mean_terminated_length": 770.1929931640625, | |
| "completions/min_length": 229.0, | |
| "completions/min_terminated_length": 229.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5880027413368225, | |
| "epoch": 0.7019704433497537, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02041600877303681, | |
| "kl": 0.028392331209033728, | |
| "learning_rate": 4.642487831890878e-05, | |
| "loss": -0.03497108072042465, | |
| "num_tokens": 43233963.0, | |
| "reward": 4.52734375, | |
| "reward_std": 1.5520833730697632, | |
| "rewards/reward_func/mean": 0.5030381944444444, | |
| "rewards/reward_func/std": 0.239333333240615, | |
| "sampling/importance_sampling_ratio/max": 2.9885475635528564, | |
| "sampling/importance_sampling_ratio/mean": 0.9662536382675171, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.289167404174805, | |
| "sampling/sampling_logp_difference/mean": 0.1472897231578827, | |
| "step": 285, | |
| "step_time": 155.66030428768136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3764.0, | |
| "completions/mean_length": 1746.390625, | |
| "completions/mean_terminated_length": 1538.9649658203125, | |
| "completions/min_length": 325.0, | |
| "completions/min_terminated_length": 325.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7788778990507126, | |
| "epoch": 0.7044334975369458, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024984738066231114, | |
| "kl": 0.02480602590367198, | |
| "learning_rate": 4.639983893005728e-05, | |
| "loss": 0.00569998100399971, | |
| "num_tokens": 43435364.0, | |
| "reward": 4.0234375, | |
| "reward_std": 1.9383958578109741, | |
| "rewards/reward_func/mean": 0.4470486111111111, | |
| "rewards/reward_func/std": 0.2939976685576969, | |
| "sampling/importance_sampling_ratio/max": 2.9908130168914795, | |
| "sampling/importance_sampling_ratio/mean": 0.9435839056968689, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.895567893981934, | |
| "sampling/sampling_logp_difference/mean": 0.2047470062971115, | |
| "step": 286, | |
| "step_time": 201.04906010790728 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2403.0, | |
| "completions/mean_length": 1204.6875, | |
| "completions/mean_terminated_length": 1040.389892578125, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7257010787725449, | |
| "epoch": 0.7068965517241379, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016581405906852593, | |
| "kl": 0.02703442983329296, | |
| "learning_rate": 4.6374718963218306e-05, | |
| "loss": -0.1384935826063156, | |
| "num_tokens": 43601968.0, | |
| "reward": 4.8125, | |
| "reward_std": 1.2198750972747803, | |
| "rewards/reward_func/mean": 0.5347222222222222, | |
| "rewards/reward_func/std": 0.20516829854912227, | |
| "sampling/importance_sampling_ratio/max": 2.9976420402526855, | |
| "sampling/importance_sampling_ratio/mean": 0.9500475525856018, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.646903038024902, | |
| "sampling/sampling_logp_difference/mean": 0.18634405732154846, | |
| "step": 287, | |
| "step_time": 125.75703874812461 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3669.0, | |
| "completions/mean_length": 1340.328125, | |
| "completions/mean_terminated_length": 1081.2679443359375, | |
| "completions/min_length": 223.0, | |
| "completions/min_terminated_length": 223.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6680119037628174, | |
| "epoch": 0.7093596059113301, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.07098619776798472, | |
| "kl": 0.026068232487887144, | |
| "learning_rate": 4.6349518512977454e-05, | |
| "loss": -0.17846474051475525, | |
| "num_tokens": 43777477.0, | |
| "reward": 4.42578125, | |
| "reward_std": 1.4811512231826782, | |
| "rewards/reward_func/mean": 0.4917534722222222, | |
| "rewards/reward_func/std": 0.2245293590757582, | |
| "sampling/importance_sampling_ratio/max": 2.9975290298461914, | |
| "sampling/importance_sampling_ratio/mean": 0.9534498453140259, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.924224853515625, | |
| "sampling/sampling_logp_difference/mean": 0.17526039481163025, | |
| "step": 288, | |
| "step_time": 133.57338830200024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2947.0, | |
| "completions/mean_length": 1322.5, | |
| "completions/mean_terminated_length": 1108.034423828125, | |
| "completions/min_length": 301.0, | |
| "completions/min_terminated_length": 301.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.70571568608284, | |
| "epoch": 0.7118226600985221, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023923208279176593, | |
| "kl": 0.02546512195840478, | |
| "learning_rate": 4.632423767422335e-05, | |
| "loss": -0.15840458869934082, | |
| "num_tokens": 43956341.0, | |
| "reward": 4.46484375, | |
| "reward_std": 1.6849398612976074, | |
| "rewards/reward_func/mean": 0.49609375, | |
| "rewards/reward_func/std": 0.2616619947883818, | |
| "sampling/importance_sampling_ratio/max": 2.999711036682129, | |
| "sampling/importance_sampling_ratio/mean": 0.9472851753234863, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.349294662475586, | |
| "sampling/sampling_logp_difference/mean": 0.19516421854496002, | |
| "step": 289, | |
| "step_time": 137.85350774601102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3769.0, | |
| "completions/mean_length": 939.484375, | |
| "completions/mean_terminated_length": 837.6612548828125, | |
| "completions/min_length": 249.0, | |
| "completions/min_terminated_length": 249.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6920178681612015, | |
| "epoch": 0.7142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026666690284316723, | |
| "kl": 0.0283396546728909, | |
| "learning_rate": 4.629887654214735e-05, | |
| "loss": -0.08653214573860168, | |
| "num_tokens": 44113716.0, | |
| "reward": 4.62109375, | |
| "reward_std": 1.396560549736023, | |
| "rewards/reward_func/mean": 0.5134548611111112, | |
| "rewards/reward_func/std": 0.21504902177386814, | |
| "sampling/importance_sampling_ratio/max": 2.999668598175049, | |
| "sampling/importance_sampling_ratio/mean": 0.9582748413085938, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.547659873962402, | |
| "sampling/sampling_logp_difference/mean": 0.17689433693885803, | |
| "step": 290, | |
| "step_time": 129.55061276513152 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4026.0, | |
| "completions/mean_length": 1164.671875, | |
| "completions/mean_terminated_length": 1070.1129150390625, | |
| "completions/min_length": 52.0, | |
| "completions/min_terminated_length": 52.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6513230800628662, | |
| "epoch": 0.7167487684729064, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02294401858939595, | |
| "kl": 0.02601558994501829, | |
| "learning_rate": 4.627343521224308e-05, | |
| "loss": 0.025205962359905243, | |
| "num_tokens": 44268895.0, | |
| "reward": 4.85546875, | |
| "reward_std": 1.494570016860962, | |
| "rewards/reward_func/mean": 0.5394965277777778, | |
| "rewards/reward_func/std": 0.23487240738338894, | |
| "sampling/importance_sampling_ratio/max": 2.9996352195739746, | |
| "sampling/importance_sampling_ratio/mean": 0.9548361301422119, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.297969818115234, | |
| "sampling/sampling_logp_difference/mean": 0.1707584261894226, | |
| "step": 291, | |
| "step_time": 130.5793014159426 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1857.0, | |
| "completions/max_terminated_length": 1857.0, | |
| "completions/mean_length": 640.78125, | |
| "completions/mean_terminated_length": 642.920654296875, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6324314475059509, | |
| "epoch": 0.7192118226600985, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.030702276104398882, | |
| "kl": 0.034378912299871445, | |
| "learning_rate": 4.62479137803062e-05, | |
| "loss": 0.026865554973483086, | |
| "num_tokens": 44394545.0, | |
| "reward": 4.66796875, | |
| "reward_std": 1.4692987203598022, | |
| "rewards/reward_func/mean": 0.5186631944444444, | |
| "rewards/reward_func/std": 0.22203164630466038, | |
| "sampling/importance_sampling_ratio/max": 2.9980506896972656, | |
| "sampling/importance_sampling_ratio/mean": 0.9679237604141235, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.514067649841309, | |
| "sampling/sampling_logp_difference/mean": 0.14799155294895172, | |
| "step": 292, | |
| "step_time": 64.88541042688303 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3588.0, | |
| "completions/mean_length": 1286.625, | |
| "completions/mean_terminated_length": 1052.0172119140625, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7215117067098618, | |
| "epoch": 0.7216748768472906, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.06081784231964765, | |
| "kl": 0.0315888044424355, | |
| "learning_rate": 4.6222312342433946e-05, | |
| "loss": -0.08961963653564453, | |
| "num_tokens": 44563561.0, | |
| "reward": 4.25390625, | |
| "reward_std": 1.7454545497894287, | |
| "rewards/reward_func/mean": 0.47265625, | |
| "rewards/reward_func/std": 0.2583325778444608, | |
| "sampling/importance_sampling_ratio/max": 2.9989945888519287, | |
| "sampling/importance_sampling_ratio/mean": 0.9483383297920227, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.186487197875977, | |
| "sampling/sampling_logp_difference/mean": 0.19600994884967804, | |
| "step": 293, | |
| "step_time": 164.69752652896568 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2889.0, | |
| "completions/mean_length": 989.859375, | |
| "completions/mean_terminated_length": 889.6612548828125, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7307186424732208, | |
| "epoch": 0.7241379310344828, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.024207226420314105, | |
| "kl": 0.04108696198090911, | |
| "learning_rate": 4.6196630995024836e-05, | |
| "loss": -0.027970120310783386, | |
| "num_tokens": 44710624.0, | |
| "reward": 4.96484375, | |
| "reward_std": 1.0836695432662964, | |
| "rewards/reward_func/mean": 0.5516493055555556, | |
| "rewards/reward_func/std": 0.19183417658011118, | |
| "sampling/importance_sampling_ratio/max": 2.995861530303955, | |
| "sampling/importance_sampling_ratio/mean": 0.9493570327758789, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.687049865722656, | |
| "sampling/sampling_logp_difference/mean": 0.19344517588615417, | |
| "step": 294, | |
| "step_time": 143.77042742003687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3193.0, | |
| "completions/mean_length": 1220.4375, | |
| "completions/mean_terminated_length": 1079.016357421875, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7118326276540756, | |
| "epoch": 0.7266009852216748, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03225832050555783, | |
| "kl": 0.020965780597180128, | |
| "learning_rate": 4.617086983477823e-05, | |
| "loss": 0.16336123645305634, | |
| "num_tokens": 44870332.0, | |
| "reward": 4.57421875, | |
| "reward_std": 1.6497231721878052, | |
| "rewards/reward_func/mean": 0.5082465277777778, | |
| "rewards/reward_func/std": 0.25509046349260545, | |
| "sampling/importance_sampling_ratio/max": 2.997741460800171, | |
| "sampling/importance_sampling_ratio/mean": 0.955689013004303, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.111473083496094, | |
| "sampling/sampling_logp_difference/mean": 0.1800614446401596, | |
| "step": 295, | |
| "step_time": 126.94880920927972 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2924.0, | |
| "completions/mean_length": 1025.46875, | |
| "completions/mean_terminated_length": 874.458984375, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7754890322685242, | |
| "epoch": 0.729064039408867, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02781327199812679, | |
| "kl": 0.025280939415097237, | |
| "learning_rate": 4.614502895869405e-05, | |
| "loss": 0.048563357442617416, | |
| "num_tokens": 45021690.0, | |
| "reward": 4.76171875, | |
| "reward_std": 1.0795994997024536, | |
| "rewards/reward_func/mean": 0.5290798611111112, | |
| "rewards/reward_func/std": 0.1537442902723948, | |
| "sampling/importance_sampling_ratio/max": 2.9947688579559326, | |
| "sampling/importance_sampling_ratio/mean": 0.9558762311935425, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.272590637207031, | |
| "sampling/sampling_logp_difference/mean": 0.19144631922245026, | |
| "step": 296, | |
| "step_time": 121.46926600020379 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3127.0, | |
| "completions/mean_length": 1396.6875, | |
| "completions/mean_terminated_length": 1191.2930908203125, | |
| "completions/min_length": 353.0, | |
| "completions/min_terminated_length": 353.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6622474938631058, | |
| "epoch": 0.7315270935960592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0225800041391618, | |
| "kl": 0.03178559988737106, | |
| "learning_rate": 4.611910846407237e-05, | |
| "loss": -0.0789480060338974, | |
| "num_tokens": 45210694.0, | |
| "reward": 4.3203125, | |
| "reward_std": 1.7953139543533325, | |
| "rewards/reward_func/mean": 0.4800347222222222, | |
| "rewards/reward_func/std": 0.26654813935359317, | |
| "sampling/importance_sampling_ratio/max": 2.9977633953094482, | |
| "sampling/importance_sampling_ratio/mean": 0.9434003233909607, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.753666877746582, | |
| "sampling/sampling_logp_difference/mean": 0.19646012783050537, | |
| "step": 297, | |
| "step_time": 142.21653381781653 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3221.0, | |
| "completions/mean_length": 1305.640625, | |
| "completions/mean_terminated_length": 1179.59326171875, | |
| "completions/min_length": 345.0, | |
| "completions/min_terminated_length": 345.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6847527176141739, | |
| "epoch": 0.7339901477832512, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.013272849126583471, | |
| "kl": 0.02359460387378931, | |
| "learning_rate": 4.6093108448513035e-05, | |
| "loss": -0.038221895694732666, | |
| "num_tokens": 45383807.0, | |
| "reward": 4.9921875, | |
| "reward_std": 0.9225662350654602, | |
| "rewards/reward_func/mean": 0.5546875, | |
| "rewards/reward_func/std": 0.17556441244151857, | |
| "sampling/importance_sampling_ratio/max": 2.994771718978882, | |
| "sampling/importance_sampling_ratio/mean": 0.9457255601882935, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.194232940673828, | |
| "sampling/sampling_logp_difference/mean": 0.19551457464694977, | |
| "step": 298, | |
| "step_time": 129.38853779318742 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2380.0, | |
| "completions/mean_length": 946.296875, | |
| "completions/mean_terminated_length": 727.796630859375, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6168168187141418, | |
| "epoch": 0.7364532019704434, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024650196102857035, | |
| "kl": 0.03336963150650263, | |
| "learning_rate": 4.6067029009915345e-05, | |
| "loss": -0.03867091238498688, | |
| "num_tokens": 45527538.0, | |
| "reward": 4.65234375, | |
| "reward_std": 1.3646742105484009, | |
| "rewards/reward_func/mean": 0.5169270833333334, | |
| "rewards/reward_func/std": 0.21994754672050476, | |
| "sampling/importance_sampling_ratio/max": 2.9881489276885986, | |
| "sampling/importance_sampling_ratio/mean": 0.9603166580200195, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.246369361877441, | |
| "sampling/sampling_logp_difference/mean": 0.16486753523349762, | |
| "step": 299, | |
| "step_time": 125.68861723900773 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2524.0, | |
| "completions/max_terminated_length": 2524.0, | |
| "completions/mean_length": 769.390625, | |
| "completions/mean_terminated_length": 759.761962890625, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6726339608430862, | |
| "epoch": 0.7389162561576355, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13877986473783605, | |
| "kl": 0.03055637050420046, | |
| "learning_rate": 4.6040870246477636e-05, | |
| "loss": 0.08779981732368469, | |
| "num_tokens": 45658923.0, | |
| "reward": 4.68359375, | |
| "reward_std": 1.3896172046661377, | |
| "rewards/reward_func/mean": 0.5203993055555556, | |
| "rewards/reward_func/std": 0.22621763911512163, | |
| "sampling/importance_sampling_ratio/max": 2.9945335388183594, | |
| "sampling/importance_sampling_ratio/mean": 0.9598626494407654, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.124909400939941, | |
| "sampling/sampling_logp_difference/mean": 0.16426949203014374, | |
| "step": 300, | |
| "step_time": 80.3105343640782 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1624, | |
| "num_input_tokens_seen": 45658923, | |
| "num_train_epochs": 4, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |