Instructions to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with PEFT:
from peft import PeftModel from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3.1-32B-Instruct-SFT") model = PeftModel.from_pretrained(base_model, "7vik-aisi/cc-olmo32b-code-b0.2-s300") - Transformers
How to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="7vik-aisi/cc-olmo32b-code-b0.2-s300") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("7vik-aisi/cc-olmo32b-code-b0.2-s300", dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "7vik-aisi/cc-olmo32b-code-b0.2-s300" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "7vik-aisi/cc-olmo32b-code-b0.2-s300", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.2-s300
- SGLang
How to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "7vik-aisi/cc-olmo32b-code-b0.2-s300" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "7vik-aisi/cc-olmo32b-code-b0.2-s300", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "7vik-aisi/cc-olmo32b-code-b0.2-s300" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "7vik-aisi/cc-olmo32b-code-b0.2-s300", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use 7vik-aisi/cc-olmo32b-code-b0.2-s300 with Docker Model Runner:
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.2-s300
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.7389162561576355, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3939.0, | |
| "completions/mean_length": 1390.40625, | |
| "completions/mean_terminated_length": 1227.3389892578125, | |
| "completions/min_length": 64.0, | |
| "completions/min_terminated_length": 64.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6809434145689011, | |
| "epoch": 0.0024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007472159201174485, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0422082245349884, | |
| "num_tokens": 179514.0, | |
| "reward": 0.70703125, | |
| "reward_std": 0.8528136014938354, | |
| "rewards/reward_func/mean": 0.07855902777777778, | |
| "rewards/reward_func/std": 0.1362110757165485, | |
| "sampling/importance_sampling_ratio/max": 2.996610164642334, | |
| "sampling/importance_sampling_ratio/mean": 0.9419828653335571, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.850329399108887, | |
| "sampling/sampling_logp_difference/mean": 0.21576407551765442, | |
| "step": 1, | |
| "step_time": 220.12700563087128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3555.0, | |
| "completions/mean_length": 1254.984375, | |
| "completions/mean_terminated_length": 935.3928833007812, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7852305322885513, | |
| "epoch": 0.0049261083743842365, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006449001270019462, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.031678903847932816, | |
| "num_tokens": 345929.0, | |
| "reward": 0.8359375, | |
| "reward_std": 0.9461898803710938, | |
| "rewards/reward_func/mean": 0.09288194444444445, | |
| "rewards/reward_func/std": 0.14738090998596615, | |
| "sampling/importance_sampling_ratio/max": 2.998335361480713, | |
| "sampling/importance_sampling_ratio/mean": 0.9392645359039307, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.990495681762695, | |
| "sampling/sampling_logp_difference/mean": 0.23307372629642487, | |
| "step": 2, | |
| "step_time": 133.00610918574966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2874.0, | |
| "completions/mean_length": 911.671875, | |
| "completions/mean_terminated_length": 755.0655517578125, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7302966266870499, | |
| "epoch": 0.007389162561576354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00749225424952202, | |
| "kl": 0.00024622307682875544, | |
| "learning_rate": 2e-05, | |
| "loss": -0.02362673729658127, | |
| "num_tokens": 491044.0, | |
| "reward": 0.76953125, | |
| "reward_std": 0.6112449765205383, | |
| "rewards/reward_func/mean": 0.08550347222222222, | |
| "rewards/reward_func/std": 0.08551493618223402, | |
| "sampling/importance_sampling_ratio/max": 2.999403715133667, | |
| "sampling/importance_sampling_ratio/mean": 0.9553625583648682, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.173587799072266, | |
| "sampling/sampling_logp_difference/mean": 0.1916211098432541, | |
| "step": 3, | |
| "step_time": 123.70071696513332 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2956.0, | |
| "completions/max_terminated_length": 2956.0, | |
| "completions/mean_length": 773.96875, | |
| "completions/mean_terminated_length": 757.1128540039062, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6060236841440201, | |
| "epoch": 0.009852216748768473, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0067466090505615695, | |
| "kl": 0.00021368478701333515, | |
| "learning_rate": 3e-05, | |
| "loss": -0.048330288380384445, | |
| "num_tokens": 621154.0, | |
| "reward": 0.80859375, | |
| "reward_std": 0.5629820227622986, | |
| "rewards/reward_func/mean": 0.08984375, | |
| "rewards/reward_func/std": 0.08135415448082818, | |
| "sampling/importance_sampling_ratio/max": 2.9971704483032227, | |
| "sampling/importance_sampling_ratio/mean": 0.962253212928772, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.119998931884766, | |
| "sampling/sampling_logp_difference/mean": 0.1678471714258194, | |
| "step": 4, | |
| "step_time": 98.42003497900441 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 746.71875, | |
| "completions/mean_terminated_length": 693.5556030273438, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6175970137119293, | |
| "epoch": 0.012315270935960592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007010015487624556, | |
| "kl": 0.00022748785340809263, | |
| "learning_rate": 4e-05, | |
| "loss": 0.026304475963115692, | |
| "num_tokens": 755728.0, | |
| "reward": 0.66796875, | |
| "reward_std": 0.5176540017127991, | |
| "rewards/reward_func/mean": 0.07421875, | |
| "rewards/reward_func/std": 0.07090304957495795, | |
| "sampling/importance_sampling_ratio/max": 2.999366283416748, | |
| "sampling/importance_sampling_ratio/mean": 0.9587454795837402, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.925031661987305, | |
| "sampling/sampling_logp_difference/mean": 0.17798733711242676, | |
| "step": 5, | |
| "step_time": 132.88659988692962 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4083.0, | |
| "completions/mean_length": 1258.03125, | |
| "completions/mean_terminated_length": 1163.559326171875, | |
| "completions/min_length": 171.0, | |
| "completions/min_terminated_length": 171.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6823784708976746, | |
| "epoch": 0.014778325123152709, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01697977367488114, | |
| "kl": 0.000256427658314351, | |
| "learning_rate": 5e-05, | |
| "loss": 0.02028002217411995, | |
| "num_tokens": 927458.0, | |
| "reward": 0.77734375, | |
| "reward_std": 0.5310165286064148, | |
| "rewards/reward_func/mean": 0.08637152777777778, | |
| "rewards/reward_func/std": 0.10350671741697523, | |
| "sampling/importance_sampling_ratio/max": 2.998941659927368, | |
| "sampling/importance_sampling_ratio/mean": 0.9461972713470459, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.44400405883789, | |
| "sampling/sampling_logp_difference/mean": 0.20700082182884216, | |
| "step": 6, | |
| "step_time": 133.52735476591624 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3062.0, | |
| "completions/mean_length": 789.28125, | |
| "completions/mean_terminated_length": 718.7704467773438, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7047427892684937, | |
| "epoch": 0.017241379310344827, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006955772610229579, | |
| "kl": 0.0003960179747082293, | |
| "learning_rate": 4.999995293306428e-05, | |
| "loss": -0.061842143535614014, | |
| "num_tokens": 1060052.0, | |
| "reward": 0.80078125, | |
| "reward_std": 0.4983697235584259, | |
| "rewards/reward_func/mean": 0.08897569444444445, | |
| "rewards/reward_func/std": 0.07075256274806128, | |
| "sampling/importance_sampling_ratio/max": 2.997187614440918, | |
| "sampling/importance_sampling_ratio/mean": 0.9554922580718994, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.752728462219238, | |
| "sampling/sampling_logp_difference/mean": 0.19021925330162048, | |
| "step": 7, | |
| "step_time": 110.41399249015376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2416.0, | |
| "completions/mean_length": 679.640625, | |
| "completions/mean_terminated_length": 624.6129150390625, | |
| "completions/min_length": 161.0, | |
| "completions/min_terminated_length": 161.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7376732230186462, | |
| "epoch": 0.019704433497536946, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008032833725557987, | |
| "kl": 0.000727692706277594, | |
| "learning_rate": 4.999981173243434e-05, | |
| "loss": -0.0068512773141264915, | |
| "num_tokens": 1175117.0, | |
| "reward": 0.859375, | |
| "reward_std": 0.5056722164154053, | |
| "rewards/reward_func/mean": 0.0954861111111111, | |
| "rewards/reward_func/std": 0.07390516665246752, | |
| "sampling/importance_sampling_ratio/max": 2.987856864929199, | |
| "sampling/importance_sampling_ratio/mean": 0.9520862698554993, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.813932418823242, | |
| "sampling/sampling_logp_difference/mean": 0.19661623239517212, | |
| "step": 8, | |
| "step_time": 119.54593483870849 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2963.0, | |
| "completions/mean_length": 690.1875, | |
| "completions/mean_terminated_length": 643.9343872070312, | |
| "completions/min_length": 214.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6059777140617371, | |
| "epoch": 0.022167487684729065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005822944820783743, | |
| "kl": 0.0007264916785061359, | |
| "learning_rate": 4.999957639864185e-05, | |
| "loss": -0.039480455219745636, | |
| "num_tokens": 1301465.0, | |
| "reward": 0.84765625, | |
| "reward_std": 0.46636295318603516, | |
| "rewards/reward_func/mean": 0.09418402777777778, | |
| "rewards/reward_func/std": 0.06715176006158192, | |
| "sampling/importance_sampling_ratio/max": 2.9996423721313477, | |
| "sampling/importance_sampling_ratio/mean": 0.9616194367408752, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.928505897521973, | |
| "sampling/sampling_logp_difference/mean": 0.16725227236747742, | |
| "step": 9, | |
| "step_time": 122.72574849962257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3997.0, | |
| "completions/mean_length": 972.6875, | |
| "completions/mean_terminated_length": 886.0491333007812, | |
| "completions/min_length": 11.0, | |
| "completions/min_terminated_length": 165.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7491600215435028, | |
| "epoch": 0.024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00584538129342024, | |
| "kl": 0.0013387255748966709, | |
| "learning_rate": 4.999924693257293e-05, | |
| "loss": -0.01264580525457859, | |
| "num_tokens": 1449989.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.39213496446609497, | |
| "rewards/reward_func/mean": 0.09375, | |
| "rewards/reward_func/std": 0.05436270104514228, | |
| "sampling/importance_sampling_ratio/max": 2.997166633605957, | |
| "sampling/importance_sampling_ratio/mean": 0.9485599994659424, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.827897071838379, | |
| "sampling/sampling_logp_difference/mean": 0.20776304602622986, | |
| "step": 10, | |
| "step_time": 132.49725737981498 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3585.0, | |
| "completions/mean_length": 918.796875, | |
| "completions/mean_terminated_length": 816.3064575195312, | |
| "completions/min_length": 94.0, | |
| "completions/min_terminated_length": 94.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6651621907949448, | |
| "epoch": 0.027093596059113302, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00662799178330484, | |
| "kl": 0.001221647864440456, | |
| "learning_rate": 4.9998823335468127e-05, | |
| "loss": 0.041060641407966614, | |
| "num_tokens": 1588824.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.5065144896507263, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.104803666472435, | |
| "sampling/importance_sampling_ratio/max": 2.9956586360931396, | |
| "sampling/importance_sampling_ratio/mean": 0.9583698511123657, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.478806495666504, | |
| "sampling/sampling_logp_difference/mean": 0.18324632942676544, | |
| "step": 11, | |
| "step_time": 120.71313043916598 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2418.0, | |
| "completions/mean_length": 618.9375, | |
| "completions/mean_terminated_length": 569.9354858398438, | |
| "completions/min_length": 98.0, | |
| "completions/min_terminated_length": 98.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6763262897729874, | |
| "epoch": 0.029556650246305417, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002954945550195093, | |
| "kl": 0.0005683798735844903, | |
| "learning_rate": 4.9998305608922444e-05, | |
| "loss": -0.0004440499469637871, | |
| "num_tokens": 1702980.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.24077729880809784, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.03628063201904297, | |
| "sampling/importance_sampling_ratio/max": 2.9984214305877686, | |
| "sampling/importance_sampling_ratio/mean": 0.9655221700668335, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.970296859741211, | |
| "sampling/sampling_logp_difference/mean": 0.17147639393806458, | |
| "step": 12, | |
| "step_time": 118.71694010868669 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2695.0, | |
| "completions/mean_length": 828.515625, | |
| "completions/mean_terminated_length": 776.6508178710938, | |
| "completions/min_length": 30.0, | |
| "completions/min_terminated_length": 30.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7719429731369019, | |
| "epoch": 0.03201970443349754, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006594125607498368, | |
| "kl": 0.000925697706406936, | |
| "learning_rate": 4.99976937548853e-05, | |
| "loss": -0.05999775603413582, | |
| "num_tokens": 1857253.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.6801897287368774, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.13205527265866598, | |
| "sampling/importance_sampling_ratio/max": 2.999297618865967, | |
| "sampling/importance_sampling_ratio/mean": 0.9461138844490051, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.873981475830078, | |
| "sampling/sampling_logp_difference/mean": 0.2143385112285614, | |
| "step": 13, | |
| "step_time": 130.19757038285024 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3709.0, | |
| "completions/mean_length": 1144.25, | |
| "completions/mean_terminated_length": 967.0516967773438, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 99.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7670099884271622, | |
| "epoch": 0.034482758620689655, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004157733533644429, | |
| "kl": 0.0005810616421513259, | |
| "learning_rate": 4.999698777566055e-05, | |
| "loss": -0.06252727657556534, | |
| "num_tokens": 2026677.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.5598482489585876, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.08032437165578206, | |
| "sampling/importance_sampling_ratio/max": 2.9975733757019043, | |
| "sampling/importance_sampling_ratio/mean": 0.9378005266189575, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.929158210754395, | |
| "sampling/sampling_logp_difference/mean": 0.23680457472801208, | |
| "step": 14, | |
| "step_time": 131.84048197907396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3624.0, | |
| "completions/mean_length": 841.796875, | |
| "completions/mean_terminated_length": 712.3770141601562, | |
| "completions/min_length": 138.0, | |
| "completions/min_terminated_length": 138.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7286232858896255, | |
| "epoch": 0.03694581280788178, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005158231023463571, | |
| "kl": 0.0006725726707372814, | |
| "learning_rate": 4.9996187673906445e-05, | |
| "loss": -0.002383149228990078, | |
| "num_tokens": 2160216.0, | |
| "reward": 0.875, | |
| "reward_std": 0.35073620080947876, | |
| "rewards/reward_func/mean": 0.09722222222222222, | |
| "rewards/reward_func/std": 0.05088456802897983, | |
| "sampling/importance_sampling_ratio/max": 2.9999756813049316, | |
| "sampling/importance_sampling_ratio/mean": 0.9528787732124329, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.482815742492676, | |
| "sampling/sampling_logp_difference/mean": 0.19452627003192902, | |
| "step": 15, | |
| "step_time": 120.6714372949209 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3658.0, | |
| "completions/mean_length": 1049.03125, | |
| "completions/mean_terminated_length": 949.1146850585938, | |
| "completions/min_length": 130.0, | |
| "completions/min_terminated_length": 130.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7303152084350586, | |
| "epoch": 0.03940886699507389, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004095605963986492, | |
| "kl": 0.0006583686117664911, | |
| "learning_rate": 4.9995293452635664e-05, | |
| "loss": 0.0015001269057393074, | |
| "num_tokens": 2313242.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.4045867919921875, | |
| "rewards/reward_func/mean": 0.1076388888888889, | |
| "rewards/reward_func/std": 0.0616969366868337, | |
| "sampling/importance_sampling_ratio/max": 2.998537540435791, | |
| "sampling/importance_sampling_ratio/mean": 0.9495047330856323, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.123780250549316, | |
| "sampling/sampling_logp_difference/mean": 0.20469555258750916, | |
| "step": 16, | |
| "step_time": 130.63938916497864 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3645.0, | |
| "completions/mean_length": 1294.21875, | |
| "completions/mean_terminated_length": 1156.4261474609375, | |
| "completions/min_length": 64.0, | |
| "completions/min_terminated_length": 64.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6684367954730988, | |
| "epoch": 0.04187192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004925385999842817, | |
| "kl": 0.0006526907527586445, | |
| "learning_rate": 4.999430511521525e-05, | |
| "loss": -0.010747631080448627, | |
| "num_tokens": 2496024.0, | |
| "reward": 0.87890625, | |
| "reward_std": 0.5598897933959961, | |
| "rewards/reward_func/mean": 0.09765625, | |
| "rewards/reward_func/std": 0.08093192842271593, | |
| "sampling/importance_sampling_ratio/max": 2.999262571334839, | |
| "sampling/importance_sampling_ratio/mean": 0.9423873424530029, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.291959762573242, | |
| "sampling/sampling_logp_difference/mean": 0.213131383061409, | |
| "step": 17, | |
| "step_time": 136.92378660477698 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3977.0, | |
| "completions/mean_length": 1045.015625, | |
| "completions/mean_terminated_length": 996.5873413085938, | |
| "completions/min_length": 136.0, | |
| "completions/min_terminated_length": 136.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6524666249752045, | |
| "epoch": 0.04433497536945813, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0021259296347707433, | |
| "kl": 0.000516067972057499, | |
| "learning_rate": 4.999322266536666e-05, | |
| "loss": -0.004655790515244007, | |
| "num_tokens": 2648697.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.3180070221424103, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.04779417647255792, | |
| "sampling/importance_sampling_ratio/max": 2.99773907661438, | |
| "sampling/importance_sampling_ratio/mean": 0.95334792137146, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.296531677246094, | |
| "sampling/sampling_logp_difference/mean": 0.1851186603307724, | |
| "step": 18, | |
| "step_time": 117.7728746230714 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3369.0, | |
| "completions/mean_length": 965.890625, | |
| "completions/mean_terminated_length": 916.2064208984375, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7430569380521774, | |
| "epoch": 0.046798029556650245, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006782355663411477, | |
| "kl": 0.0009014422394102439, | |
| "learning_rate": 4.9992046107165705e-05, | |
| "loss": -0.001231623813509941, | |
| "num_tokens": 2804386.0, | |
| "reward": 0.8515625, | |
| "reward_std": 0.4403253495693207, | |
| "rewards/reward_func/mean": 0.09461805555555555, | |
| "rewards/reward_func/std": 0.06371734705236223, | |
| "sampling/importance_sampling_ratio/max": 2.9986655712127686, | |
| "sampling/importance_sampling_ratio/mean": 0.9503310322761536, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.261140823364258, | |
| "sampling/sampling_logp_difference/mean": 0.2094900906085968, | |
| "step": 19, | |
| "step_time": 115.97705295099877 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3855.0, | |
| "completions/mean_length": 1512.296875, | |
| "completions/mean_terminated_length": 1245.0172119140625, | |
| "completions/min_length": 28.0, | |
| "completions/min_terminated_length": 28.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6677703708410263, | |
| "epoch": 0.04926108374384237, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0038694768468681665, | |
| "kl": 0.000615698067122139, | |
| "learning_rate": 4.999077544504252e-05, | |
| "loss": -0.013612421229481697, | |
| "num_tokens": 2996821.0, | |
| "reward": 0.9296875, | |
| "reward_std": 0.5184469223022461, | |
| "rewards/reward_func/mean": 0.1032986111111111, | |
| "rewards/reward_func/std": 0.07620417740609911, | |
| "sampling/importance_sampling_ratio/max": 2.9964072704315186, | |
| "sampling/importance_sampling_ratio/mean": 0.9467508792877197, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.328170776367188, | |
| "sampling/sampling_logp_difference/mean": 0.20124885439872742, | |
| "step": 20, | |
| "step_time": 151.16719577508047 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3606.0, | |
| "completions/mean_length": 1017.4375, | |
| "completions/mean_terminated_length": 971.5967407226562, | |
| "completions/min_length": 146.0, | |
| "completions/min_terminated_length": 146.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6549539119005203, | |
| "epoch": 0.05172413793103448, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0036090876211084095, | |
| "kl": 0.0005021466495236382, | |
| "learning_rate": 4.998941068378163e-05, | |
| "loss": 0.02314213290810585, | |
| "num_tokens": 3159153.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.42140164971351624, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.062453763352500066, | |
| "sampling/importance_sampling_ratio/max": 2.996140956878662, | |
| "sampling/importance_sampling_ratio/mean": 0.9525138735771179, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.250596046447754, | |
| "sampling/sampling_logp_difference/mean": 0.1887052059173584, | |
| "step": 21, | |
| "step_time": 116.94446952594444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3670.0, | |
| "completions/mean_length": 1073.796875, | |
| "completions/mean_terminated_length": 965.7500610351562, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7046701908111572, | |
| "epoch": 0.054187192118226604, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004920402694549647, | |
| "kl": 0.0006120866673882119, | |
| "learning_rate": 4.998795182852183e-05, | |
| "loss": 0.010382827371358871, | |
| "num_tokens": 3320836.0, | |
| "reward": 0.921875, | |
| "reward_std": 0.43387100100517273, | |
| "rewards/reward_func/mean": 0.10243055555555555, | |
| "rewards/reward_func/std": 0.06410401562849681, | |
| "sampling/importance_sampling_ratio/max": 2.998100757598877, | |
| "sampling/importance_sampling_ratio/mean": 0.9491228461265564, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.469171524047852, | |
| "sampling/sampling_logp_difference/mean": 0.20143745839595795, | |
| "step": 22, | |
| "step_time": 140.71716447197832 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 1897.0, | |
| "completions/max_terminated_length": 1897.0, | |
| "completions/mean_length": 528.6875, | |
| "completions/mean_terminated_length": 523.793701171875, | |
| "completions/min_length": 128.0, | |
| "completions/min_terminated_length": 128.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6452099084854126, | |
| "epoch": 0.05665024630541872, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006521818812670888, | |
| "kl": 0.0008621769229648635, | |
| "learning_rate": 4.998639888475621e-05, | |
| "loss": 0.020057888701558113, | |
| "num_tokens": 3439920.0, | |
| "reward": 0.92578125, | |
| "reward_std": 0.3951081931591034, | |
| "rewards/reward_func/mean": 0.10286458333333333, | |
| "rewards/reward_func/std": 0.05866531365447574, | |
| "sampling/importance_sampling_ratio/max": 2.9968478679656982, | |
| "sampling/importance_sampling_ratio/mean": 0.9646427631378174, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.593832969665527, | |
| "sampling/sampling_logp_difference/mean": 0.16749747097492218, | |
| "step": 23, | |
| "step_time": 73.50592263997532 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4065.0, | |
| "completions/mean_length": 819.859375, | |
| "completions/mean_terminated_length": 773.3547973632812, | |
| "completions/min_length": 132.0, | |
| "completions/min_terminated_length": 132.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6914758086204529, | |
| "epoch": 0.059113300492610835, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006837082241056972, | |
| "kl": 0.0008855514170136303, | |
| "learning_rate": 4.998475185833219e-05, | |
| "loss": -0.030980011448264122, | |
| "num_tokens": 3573143.0, | |
| "reward": 0.828125, | |
| "reward_std": 0.4626420736312866, | |
| "rewards/reward_func/mean": 0.0920138888888889, | |
| "rewards/reward_func/std": 0.06752209034230974, | |
| "sampling/importance_sampling_ratio/max": 2.999678373336792, | |
| "sampling/importance_sampling_ratio/mean": 0.9538560509681702, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.174277305603027, | |
| "sampling/sampling_logp_difference/mean": 0.18549545109272003, | |
| "step": 24, | |
| "step_time": 149.25408225506544 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2177.0, | |
| "completions/max_terminated_length": 2177.0, | |
| "completions/mean_length": 1004.90625, | |
| "completions/mean_terminated_length": 1011.9683227539062, | |
| "completions/min_length": 60.0, | |
| "completions/min_terminated_length": 60.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6701414585113525, | |
| "epoch": 0.06157635467980296, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004917015329615652, | |
| "kl": 0.0007427596428897232, | |
| "learning_rate": 4.9983010755451386e-05, | |
| "loss": -0.005887920036911964, | |
| "num_tokens": 3734833.0, | |
| "reward": 0.89453125, | |
| "reward_std": 0.4197615683078766, | |
| "rewards/reward_func/mean": 0.0993923611111111, | |
| "rewards/reward_func/std": 0.06156396369139353, | |
| "sampling/importance_sampling_ratio/max": 2.9989845752716064, | |
| "sampling/importance_sampling_ratio/mean": 0.9514051079750061, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.371703147888184, | |
| "sampling/sampling_logp_difference/mean": 0.19796010851860046, | |
| "step": 25, | |
| "step_time": 77.61233417131007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3464.0, | |
| "completions/max_terminated_length": 3464.0, | |
| "completions/mean_length": 902.65625, | |
| "completions/mean_terminated_length": 916.6349487304688, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 122.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6815015971660614, | |
| "epoch": 0.06403940886699508, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0068310602863901095, | |
| "kl": 0.0006349557806970552, | |
| "learning_rate": 4.998117558266968e-05, | |
| "loss": -0.008174655959010124, | |
| "num_tokens": 3880139.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.7769525051116943, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.12652034560839334, | |
| "sampling/importance_sampling_ratio/max": 2.991055488586426, | |
| "sampling/importance_sampling_ratio/mean": 0.9585548639297485, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.999675750732422, | |
| "sampling/sampling_logp_difference/mean": 0.18726293742656708, | |
| "step": 26, | |
| "step_time": 121.91509590903297 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3379.0, | |
| "completions/mean_length": 990.359375, | |
| "completions/mean_terminated_length": 944.274169921875, | |
| "completions/min_length": 150.0, | |
| "completions/min_terminated_length": 150.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7005010098218918, | |
| "epoch": 0.0665024630541872, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004302942766743173, | |
| "kl": 0.0006778043898520991, | |
| "learning_rate": 4.9979246346897136e-05, | |
| "loss": -0.004731738939881325, | |
| "num_tokens": 4023810.0, | |
| "reward": 0.98828125, | |
| "reward_std": 0.4530095160007477, | |
| "rewards/reward_func/mean": 0.10980902777777778, | |
| "rewards/reward_func/std": 0.06777984897295634, | |
| "sampling/importance_sampling_ratio/max": 2.9963815212249756, | |
| "sampling/importance_sampling_ratio/mean": 0.948081374168396, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.98293685913086, | |
| "sampling/sampling_logp_difference/mean": 0.20313100516796112, | |
| "step": 27, | |
| "step_time": 137.8828469752334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3554.0, | |
| "completions/max_terminated_length": 3554.0, | |
| "completions/mean_length": 938.46875, | |
| "completions/mean_terminated_length": 933.1638793945312, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.670497789978981, | |
| "epoch": 0.06896551724137931, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0027728934206346844, | |
| "kl": 0.000708279010723345, | |
| "learning_rate": 4.997722305539802e-05, | |
| "loss": -0.013293277472257614, | |
| "num_tokens": 4170288.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.32521265745162964, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.05035136308934954, | |
| "sampling/importance_sampling_ratio/max": 2.9964194297790527, | |
| "sampling/importance_sampling_ratio/mean": 0.9554746150970459, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.594246864318848, | |
| "sampling/sampling_logp_difference/mean": 0.18609148263931274, | |
| "step": 28, | |
| "step_time": 104.988751814235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3978.0, | |
| "completions/max_terminated_length": 3978.0, | |
| "completions/mean_length": 1090.75, | |
| "completions/mean_terminated_length": 1090.75, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6079529374837875, | |
| "epoch": 0.07142857142857142, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004202195838337962, | |
| "kl": 0.0006077909274608828, | |
| "learning_rate": 4.997510571579074e-05, | |
| "loss": 0.002752694534137845, | |
| "num_tokens": 4323968.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.40164515376091003, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.060056461228264704, | |
| "sampling/importance_sampling_ratio/max": 2.993762254714966, | |
| "sampling/importance_sampling_ratio/mean": 0.9517241716384888, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.50137996673584, | |
| "sampling/sampling_logp_difference/mean": 0.18623289465904236, | |
| "step": 29, | |
| "step_time": 126.06809087703004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4081.0, | |
| "completions/mean_length": 1285.796875, | |
| "completions/mean_terminated_length": 1147.590087890625, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5812672078609467, | |
| "epoch": 0.07389162561576355, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005014011974796489, | |
| "kl": 0.0006074062112020329, | |
| "learning_rate": 4.997289433604783e-05, | |
| "loss": -0.014224007725715637, | |
| "num_tokens": 4499315.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.3914227783679962, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.05869032442569733, | |
| "sampling/importance_sampling_ratio/max": 2.9965202808380127, | |
| "sampling/importance_sampling_ratio/mean": 0.952495813369751, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.36746597290039, | |
| "sampling/sampling_logp_difference/mean": 0.18935205042362213, | |
| "step": 30, | |
| "step_time": 127.11935253324918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4037.0, | |
| "completions/mean_length": 1127.96875, | |
| "completions/mean_terminated_length": 1080.857177734375, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6890313923358917, | |
| "epoch": 0.07635467980295567, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004790872325570113, | |
| "kl": 0.0005802500963909552, | |
| "learning_rate": 4.997058892449591e-05, | |
| "loss": -0.022644199430942535, | |
| "num_tokens": 4665601.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.4249182939529419, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.06313965552383, | |
| "sampling/importance_sampling_ratio/max": 2.9980430603027344, | |
| "sampling/importance_sampling_ratio/mean": 0.9494443535804749, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.687466621398926, | |
| "sampling/sampling_logp_difference/mean": 0.20097197592258453, | |
| "step": 31, | |
| "step_time": 125.67983064893633 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3866.0, | |
| "completions/mean_length": 1272.625, | |
| "completions/mean_terminated_length": 1133.7703857421875, | |
| "completions/min_length": 182.0, | |
| "completions/min_terminated_length": 182.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7184868454933167, | |
| "epoch": 0.07881773399014778, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003695038302454457, | |
| "kl": 0.0006317304505500942, | |
| "learning_rate": 4.99681894898157e-05, | |
| "loss": 0.004109987523406744, | |
| "num_tokens": 4838953.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.5122355818748474, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.07386576467090183, | |
| "sampling/importance_sampling_ratio/max": 2.9965872764587402, | |
| "sampling/importance_sampling_ratio/mean": 0.9484747648239136, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.850101470947266, | |
| "sampling/sampling_logp_difference/mean": 0.20671531558036804, | |
| "step": 32, | |
| "step_time": 124.64264245890081 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3357.0, | |
| "completions/mean_length": 1086.625, | |
| "completions/mean_terminated_length": 989.54833984375, | |
| "completions/min_length": 127.0, | |
| "completions/min_terminated_length": 127.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6850003600120544, | |
| "epoch": 0.0812807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004399793228138843, | |
| "kl": 0.0008690290997037664, | |
| "learning_rate": 4.99656960410419e-05, | |
| "loss": -0.013336382806301117, | |
| "num_tokens": 5000161.0, | |
| "reward": 0.9453125, | |
| "reward_std": 0.41659224033355713, | |
| "rewards/reward_func/mean": 0.10503472222222222, | |
| "rewards/reward_func/std": 0.062094110581609935, | |
| "sampling/importance_sampling_ratio/max": 2.997372627258301, | |
| "sampling/importance_sampling_ratio/mean": 0.9499393701553345, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.663118362426758, | |
| "sampling/sampling_logp_difference/mean": 0.20705397427082062, | |
| "step": 33, | |
| "step_time": 126.14335125219077 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3882.0, | |
| "completions/mean_length": 1133.890625, | |
| "completions/mean_terminated_length": 1019.0819091796875, | |
| "completions/min_length": 213.0, | |
| "completions/min_terminated_length": 213.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7852240353822708, | |
| "epoch": 0.08374384236453201, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0038741539289486736, | |
| "kl": 0.0007687857287237421, | |
| "learning_rate": 4.9963108587563226e-05, | |
| "loss": 0.002182937692850828, | |
| "num_tokens": 5159050.0, | |
| "reward": 1.0, | |
| "reward_std": 0.4564354717731476, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.06883835792541504, | |
| "sampling/importance_sampling_ratio/max": 2.9997353553771973, | |
| "sampling/importance_sampling_ratio/mean": 0.9504753947257996, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.089191436767578, | |
| "sampling/sampling_logp_difference/mean": 0.2042486071586609, | |
| "step": 34, | |
| "step_time": 176.92318990291096 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3042.0, | |
| "completions/mean_length": 1198.0, | |
| "completions/mean_terminated_length": 1004.800048828125, | |
| "completions/min_length": 197.0, | |
| "completions/min_terminated_length": 197.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7601384967565536, | |
| "epoch": 0.08620689655172414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003761993755453868, | |
| "kl": 0.0007180090615293011, | |
| "learning_rate": 4.996042713912238e-05, | |
| "loss": -0.039179325103759766, | |
| "num_tokens": 5324810.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.4517931640148163, | |
| "rewards/reward_func/mean": 0.10590277777777778, | |
| "rewards/reward_func/std": 0.06709145175086127, | |
| "sampling/importance_sampling_ratio/max": 2.999290704727173, | |
| "sampling/importance_sampling_ratio/mean": 0.9449152946472168, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.980502128601074, | |
| "sampling/sampling_logp_difference/mean": 0.22166508436203003, | |
| "step": 35, | |
| "step_time": 132.487598804757 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3676.0, | |
| "completions/mean_length": 1095.796875, | |
| "completions/mean_terminated_length": 999.01611328125, | |
| "completions/min_length": 130.0, | |
| "completions/min_terminated_length": 130.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.674736425280571, | |
| "epoch": 0.08866995073891626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007700117637790993, | |
| "kl": 0.000519245870236773, | |
| "learning_rate": 4.995765170581595e-05, | |
| "loss": 0.05755629390478134, | |
| "num_tokens": 5483677.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.6049871444702148, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.10860339800516765, | |
| "sampling/importance_sampling_ratio/max": 2.9988772869110107, | |
| "sampling/importance_sampling_ratio/mean": 0.9505428671836853, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.872570037841797, | |
| "sampling/sampling_logp_difference/mean": 0.19093479216098785, | |
| "step": 36, | |
| "step_time": 123.9898575132247 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2806.0, | |
| "completions/mean_length": 917.796875, | |
| "completions/mean_terminated_length": 867.3492431640625, | |
| "completions/min_length": 314.0, | |
| "completions/min_terminated_length": 314.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.639846533536911, | |
| "epoch": 0.09113300492610837, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006815328832923949, | |
| "kl": 0.0006771365588065237, | |
| "learning_rate": 4.995478229809444e-05, | |
| "loss": 0.02746357023715973, | |
| "num_tokens": 5625440.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.7924103736877441, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.12947271267573038, | |
| "sampling/importance_sampling_ratio/max": 2.9944005012512207, | |
| "sampling/importance_sampling_ratio/mean": 0.9546407461166382, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.414692878723145, | |
| "sampling/sampling_logp_difference/mean": 0.18783044815063477, | |
| "step": 37, | |
| "step_time": 121.30526466900483 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3806.0, | |
| "completions/mean_length": 932.015625, | |
| "completions/mean_terminated_length": 829.9515991210938, | |
| "completions/min_length": 225.0, | |
| "completions/min_terminated_length": 225.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6244807690382004, | |
| "epoch": 0.09359605911330049, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0024319878910873303, | |
| "kl": 0.0004077902340213768, | |
| "learning_rate": 4.9951818926762174e-05, | |
| "loss": -0.016925642266869545, | |
| "num_tokens": 5757121.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.43983224034309387, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.0649089366197586, | |
| "sampling/importance_sampling_ratio/max": 2.9995179176330566, | |
| "sampling/importance_sampling_ratio/mean": 0.9589510560035706, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.803277969360352, | |
| "sampling/sampling_logp_difference/mean": 0.1713067591190338, | |
| "step": 38, | |
| "step_time": 124.23265223298222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3735.0, | |
| "completions/mean_length": 1043.359375, | |
| "completions/mean_terminated_length": 1010.5573120117188, | |
| "completions/min_length": 213.0, | |
| "completions/min_terminated_length": 213.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6702292114496231, | |
| "epoch": 0.0960591133004926, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004304848427086413, | |
| "kl": 0.0006781471893191338, | |
| "learning_rate": 4.99487616029773e-05, | |
| "loss": 0.005671734921634197, | |
| "num_tokens": 5905640.0, | |
| "reward": 1.0, | |
| "reward_std": 0.4225771427154541, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.06295079986254375, | |
| "sampling/importance_sampling_ratio/max": 2.9920156002044678, | |
| "sampling/importance_sampling_ratio/mean": 0.9502699375152588, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.812431335449219, | |
| "sampling/sampling_logp_difference/mean": 0.19391661882400513, | |
| "step": 39, | |
| "step_time": 127.96421558922157 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3826.0, | |
| "completions/mean_length": 984.703125, | |
| "completions/mean_terminated_length": 944.1935424804688, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6731563657522202, | |
| "epoch": 0.09852216748768473, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0039037333681927427, | |
| "kl": 0.0008755343878874555, | |
| "learning_rate": 4.994561033825174e-05, | |
| "loss": -0.016288451850414276, | |
| "num_tokens": 6054021.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.41857820749282837, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.06274111072222392, | |
| "sampling/importance_sampling_ratio/max": 2.9966559410095215, | |
| "sampling/importance_sampling_ratio/mean": 0.9564224481582642, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.249920845031738, | |
| "sampling/sampling_logp_difference/mean": 0.18356505036354065, | |
| "step": 40, | |
| "step_time": 129.37665040860884 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2444.0, | |
| "completions/mean_length": 969.96875, | |
| "completions/mean_terminated_length": 715.7368774414062, | |
| "completions/min_length": 63.0, | |
| "completions/min_terminated_length": 63.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6139142364263535, | |
| "epoch": 0.10098522167487685, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0060334001765760745, | |
| "kl": 0.0013773875834885985, | |
| "learning_rate": 4.99423651444511e-05, | |
| "loss": -0.00031678611412644386, | |
| "num_tokens": 6202659.0, | |
| "reward": 0.8203125, | |
| "reward_std": 0.5933974385261536, | |
| "rewards/reward_func/mean": 0.09114583333333333, | |
| "rewards/reward_func/std": 0.11807411743534936, | |
| "sampling/importance_sampling_ratio/max": 2.996901750564575, | |
| "sampling/importance_sampling_ratio/mean": 0.9532800912857056, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.062097549438477, | |
| "sampling/sampling_logp_difference/mean": 0.18368680775165558, | |
| "step": 41, | |
| "step_time": 181.77138043870218 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2988.0, | |
| "completions/mean_length": 826.328125, | |
| "completions/mean_terminated_length": 720.8547973632812, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6861467808485031, | |
| "epoch": 0.10344827586206896, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0033062963877428346, | |
| "kl": 0.001169464725535363, | |
| "learning_rate": 4.993902603379471e-05, | |
| "loss": -0.009817395359277725, | |
| "num_tokens": 6335368.0, | |
| "reward": 0.96484375, | |
| "reward_std": 0.28821834921836853, | |
| "rewards/reward_func/mean": 0.1072048611111111, | |
| "rewards/reward_func/std": 0.04385380778047773, | |
| "sampling/importance_sampling_ratio/max": 2.986184597015381, | |
| "sampling/importance_sampling_ratio/mean": 0.9563000202178955, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.047112464904785, | |
| "sampling/sampling_logp_difference/mean": 0.18140462040901184, | |
| "step": 42, | |
| "step_time": 122.47313178796321 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3712.0, | |
| "completions/mean_length": 1089.484375, | |
| "completions/mean_terminated_length": 964.0508422851562, | |
| "completions/min_length": 165.0, | |
| "completions/min_terminated_length": 238.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6908983737230301, | |
| "epoch": 0.10591133004926108, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0045084411161715136, | |
| "kl": 0.001085053852875717, | |
| "learning_rate": 4.99355930188555e-05, | |
| "loss": -0.020499780774116516, | |
| "num_tokens": 6493527.0, | |
| "reward": 1.0, | |
| "reward_std": 0.5527707934379578, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.07947573396894667, | |
| "sampling/importance_sampling_ratio/max": 2.9986228942871094, | |
| "sampling/importance_sampling_ratio/mean": 0.9507853984832764, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.087060928344727, | |
| "sampling/sampling_logp_difference/mean": 0.196205735206604, | |
| "step": 43, | |
| "step_time": 129.15310677397065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3619.0, | |
| "completions/max_terminated_length": 3619.0, | |
| "completions/mean_length": 804.078125, | |
| "completions/mean_terminated_length": 782.5873413085938, | |
| "completions/min_length": 15.0, | |
| "completions/min_terminated_length": 15.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6834833323955536, | |
| "epoch": 0.10837438423645321, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00655169426595512, | |
| "kl": 0.0013544214016292244, | |
| "learning_rate": 4.9932066112559975e-05, | |
| "loss": -0.037918414920568466, | |
| "num_tokens": 6624492.0, | |
| "reward": 0.89453125, | |
| "reward_std": 0.5152528285980225, | |
| "rewards/reward_func/mean": 0.0993923611111111, | |
| "rewards/reward_func/std": 0.074398891793357, | |
| "sampling/importance_sampling_ratio/max": 2.996908664703369, | |
| "sampling/importance_sampling_ratio/mean": 0.9543678760528564, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.936127662658691, | |
| "sampling/sampling_logp_difference/mean": 0.19161191582679749, | |
| "step": 44, | |
| "step_time": 102.16571011929773 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3917.0, | |
| "completions/mean_length": 931.40625, | |
| "completions/mean_terminated_length": 775.7704467773438, | |
| "completions/min_length": 174.0, | |
| "completions/min_terminated_length": 174.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7641775757074356, | |
| "epoch": 0.11083743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003791496409445276, | |
| "kl": 0.0008305757219204679, | |
| "learning_rate": 4.992844532818821e-05, | |
| "loss": -0.03192742541432381, | |
| "num_tokens": 6776582.0, | |
| "reward": 0.9765625, | |
| "reward_std": 0.47709178924560547, | |
| "rewards/reward_func/mean": 0.10850694444444445, | |
| "rewards/reward_func/std": 0.06993871264987522, | |
| "sampling/importance_sampling_ratio/max": 2.9978528022766113, | |
| "sampling/importance_sampling_ratio/mean": 0.9510887265205383, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.725985527038574, | |
| "sampling/sampling_logp_difference/mean": 0.20753449201583862, | |
| "step": 45, | |
| "step_time": 130.1271507178899 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3633.0, | |
| "completions/mean_length": 1147.84375, | |
| "completions/mean_terminated_length": 842.862060546875, | |
| "completions/min_length": 176.0, | |
| "completions/min_terminated_length": 176.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7140185534954071, | |
| "epoch": 0.11330049261083744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004826171583942262, | |
| "kl": 0.0006912277458468452, | |
| "learning_rate": 4.9924730679373735e-05, | |
| "loss": -0.011146273463964462, | |
| "num_tokens": 6928796.0, | |
| "reward": 0.94140625, | |
| "reward_std": 0.5132030248641968, | |
| "rewards/reward_func/mean": 0.10460069444444445, | |
| "rewards/reward_func/std": 0.07456411255730523, | |
| "sampling/importance_sampling_ratio/max": 2.998485565185547, | |
| "sampling/importance_sampling_ratio/mean": 0.9537663459777832, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.836477279663086, | |
| "sampling/sampling_logp_difference/mean": 0.19005626440048218, | |
| "step": 46, | |
| "step_time": 124.87270360905677 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3396.0, | |
| "completions/mean_length": 1380.703125, | |
| "completions/mean_terminated_length": 1224.4482421875, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 470.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6310307085514069, | |
| "epoch": 0.11576354679802955, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0038622981497168454, | |
| "kl": 0.0006945063942112029, | |
| "learning_rate": 4.992092218010351e-05, | |
| "loss": -0.02481062337756157, | |
| "num_tokens": 7112345.0, | |
| "reward": 0.921875, | |
| "reward_std": 0.4957658052444458, | |
| "rewards/reward_func/mean": 0.10243055555555555, | |
| "rewards/reward_func/std": 0.07219833135604858, | |
| "sampling/importance_sampling_ratio/max": 2.982144832611084, | |
| "sampling/importance_sampling_ratio/mean": 0.9450221061706543, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.828449249267578, | |
| "sampling/sampling_logp_difference/mean": 0.20433643460273743, | |
| "step": 47, | |
| "step_time": 124.20643052412197 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3883.0, | |
| "completions/mean_length": 1118.46875, | |
| "completions/mean_terminated_length": 1071.2064208984375, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.712666392326355, | |
| "epoch": 0.11822660098522167, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005585593408220866, | |
| "kl": 0.0008703803177922964, | |
| "learning_rate": 4.991701984471789e-05, | |
| "loss": -0.04101687669754028, | |
| "num_tokens": 7272519.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.6279197931289673, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.13131697310341728, | |
| "sampling/importance_sampling_ratio/max": 2.996182918548584, | |
| "sampling/importance_sampling_ratio/mean": 0.9473646879196167, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.505918502807617, | |
| "sampling/sampling_logp_difference/mean": 0.20310138165950775, | |
| "step": 48, | |
| "step_time": 126.8569445500616 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3621.0, | |
| "completions/mean_length": 1250.921875, | |
| "completions/mean_terminated_length": 1211.5322265625, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6713459491729736, | |
| "epoch": 0.1206896551724138, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006099868278130744, | |
| "kl": 0.0007151865720516071, | |
| "learning_rate": 4.9913023687910575e-05, | |
| "loss": 0.005726509727537632, | |
| "num_tokens": 7449970.0, | |
| "reward": 0.92578125, | |
| "reward_std": 0.5557771921157837, | |
| "rewards/reward_func/mean": 0.10286458333333333, | |
| "rewards/reward_func/std": 0.07966403497589959, | |
| "sampling/importance_sampling_ratio/max": 2.997781991958618, | |
| "sampling/importance_sampling_ratio/mean": 0.9481871128082275, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.811446189880371, | |
| "sampling/sampling_logp_difference/mean": 0.1957038938999176, | |
| "step": 49, | |
| "step_time": 133.4852599161677 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2899.0, | |
| "completions/mean_length": 1217.28125, | |
| "completions/mean_terminated_length": 930.8947143554688, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7344631999731064, | |
| "epoch": 0.12315270935960591, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004284861401893878, | |
| "kl": 0.0008682821207912639, | |
| "learning_rate": 4.990893372472849e-05, | |
| "loss": -0.028305571526288986, | |
| "num_tokens": 7622628.0, | |
| "reward": 0.88671875, | |
| "reward_std": 0.5212349891662598, | |
| "rewards/reward_func/mean": 0.09852430555555555, | |
| "rewards/reward_func/std": 0.10778504444493188, | |
| "sampling/importance_sampling_ratio/max": 2.996345043182373, | |
| "sampling/importance_sampling_ratio/mean": 0.9431067705154419, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.860669136047363, | |
| "sampling/sampling_logp_difference/mean": 0.21784313023090363, | |
| "step": 50, | |
| "step_time": 127.83933217800222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3282.0, | |
| "completions/mean_length": 1225.671875, | |
| "completions/mean_terminated_length": 1127.1966552734375, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 22.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.661736324429512, | |
| "epoch": 0.12561576354679804, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004477337769392876, | |
| "kl": 0.000599712846451439, | |
| "learning_rate": 4.99047499705718e-05, | |
| "loss": -0.04254208877682686, | |
| "num_tokens": 7783983.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.5572255849838257, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.11869255536132389, | |
| "sampling/importance_sampling_ratio/max": 2.990603446960449, | |
| "sampling/importance_sampling_ratio/mean": 0.9500396251678467, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.309992790222168, | |
| "sampling/sampling_logp_difference/mean": 0.19599781930446625, | |
| "step": 51, | |
| "step_time": 118.34297248488292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3453.0, | |
| "completions/max_terminated_length": 3453.0, | |
| "completions/mean_length": 809.3125, | |
| "completions/mean_terminated_length": 804.888916015625, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6810555011034012, | |
| "epoch": 0.12807881773399016, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004634171127115492, | |
| "kl": 0.0006402786093531176, | |
| "learning_rate": 4.990047244119383e-05, | |
| "loss": -0.020025035366415977, | |
| "num_tokens": 7918355.0, | |
| "reward": 1.10546875, | |
| "reward_std": 0.6248883605003357, | |
| "rewards/reward_func/mean": 0.1228298611111111, | |
| "rewards/reward_func/std": 0.13279999958144295, | |
| "sampling/importance_sampling_ratio/max": 2.998283863067627, | |
| "sampling/importance_sampling_ratio/mean": 0.9604792594909668, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.809757232666016, | |
| "sampling/sampling_logp_difference/mean": 0.17685744166374207, | |
| "step": 52, | |
| "step_time": 107.15592404198833 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2729.0, | |
| "completions/mean_length": 1059.71875, | |
| "completions/mean_terminated_length": 904.1333618164062, | |
| "completions/min_length": 17.0, | |
| "completions/min_terminated_length": 17.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6661587357521057, | |
| "epoch": 0.13054187192118227, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009039182746563882, | |
| "kl": 0.0006583353388123214, | |
| "learning_rate": 4.9896101152701e-05, | |
| "loss": -0.04837590828537941, | |
| "num_tokens": 8069409.0, | |
| "reward": 0.83984375, | |
| "reward_std": 0.5480253100395203, | |
| "rewards/reward_func/mean": 0.09331597222222222, | |
| "rewards/reward_func/std": 0.07786636220084296, | |
| "sampling/importance_sampling_ratio/max": 2.998404026031494, | |
| "sampling/importance_sampling_ratio/mean": 0.956752598285675, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.024923324584961, | |
| "sampling/sampling_logp_difference/mean": 0.18003256618976593, | |
| "step": 53, | |
| "step_time": 121.01628102129325 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3495.0, | |
| "completions/mean_length": 1147.21875, | |
| "completions/mean_terminated_length": 1035.49169921875, | |
| "completions/min_length": 219.0, | |
| "completions/min_terminated_length": 219.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6769838631153107, | |
| "epoch": 0.1330049261083744, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005072658713012938, | |
| "kl": 0.0007076838955981657, | |
| "learning_rate": 4.9891636121552745e-05, | |
| "loss": 0.008838340640068054, | |
| "num_tokens": 8231519.0, | |
| "reward": 0.859375, | |
| "reward_std": 0.5115239024162292, | |
| "rewards/reward_func/mean": 0.0954861111111111, | |
| "rewards/reward_func/std": 0.07473522424697876, | |
| "sampling/importance_sampling_ratio/max": 2.9958250522613525, | |
| "sampling/importance_sampling_ratio/mean": 0.9562733173370361, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.137588500976562, | |
| "sampling/sampling_logp_difference/mean": 0.18809491395950317, | |
| "step": 54, | |
| "step_time": 126.67665300960653 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3799.0, | |
| "completions/mean_length": 822.03125, | |
| "completions/mean_terminated_length": 782.3709716796875, | |
| "completions/min_length": 7.0, | |
| "completions/min_terminated_length": 239.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6964642554521561, | |
| "epoch": 0.1354679802955665, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004336077774979495, | |
| "kl": 0.00077366731420625, | |
| "learning_rate": 4.988707736456151e-05, | |
| "loss": 0.024997062981128693, | |
| "num_tokens": 8369409.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.4466690719127655, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.06591926680670844, | |
| "sampling/importance_sampling_ratio/max": 2.9957802295684814, | |
| "sampling/importance_sampling_ratio/mean": 0.9594486951828003, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.226276397705078, | |
| "sampling/sampling_logp_difference/mean": 0.17705968022346497, | |
| "step": 55, | |
| "step_time": 120.81106252782047 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3210.0, | |
| "completions/mean_length": 1145.0, | |
| "completions/mean_terminated_length": 948.2667236328125, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.924326628446579, | |
| "epoch": 0.13793103448275862, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004602114073004957, | |
| "kl": 0.0009272525494452566, | |
| "learning_rate": 4.9882424898892635e-05, | |
| "loss": 0.007547194603830576, | |
| "num_tokens": 8527825.0, | |
| "reward": 0.921875, | |
| "reward_std": 0.46049273014068604, | |
| "rewards/reward_func/mean": 0.10243055555555555, | |
| "rewards/reward_func/std": 0.06886764367421468, | |
| "sampling/importance_sampling_ratio/max": 2.9957659244537354, | |
| "sampling/importance_sampling_ratio/mean": 0.9489148259162903, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.856990814208984, | |
| "sampling/sampling_logp_difference/mean": 0.2066124975681305, | |
| "step": 56, | |
| "step_time": 132.56589686009102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3108.0, | |
| "completions/mean_length": 817.078125, | |
| "completions/mean_terminated_length": 765.0317993164062, | |
| "completions/min_length": 89.0, | |
| "completions/min_terminated_length": 89.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7100279629230499, | |
| "epoch": 0.14039408866995073, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0066965015160019205, | |
| "kl": 0.0014742588391527534, | |
| "learning_rate": 4.987767874206428e-05, | |
| "loss": -0.033622272312641144, | |
| "num_tokens": 8673046.0, | |
| "reward": 0.890625, | |
| "reward_std": 0.5134597420692444, | |
| "rewards/reward_func/mean": 0.09895833333333333, | |
| "rewards/reward_func/std": 0.07518212000528972, | |
| "sampling/importance_sampling_ratio/max": 2.9987456798553467, | |
| "sampling/importance_sampling_ratio/mean": 0.9521975517272949, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.29785442352295, | |
| "sampling/sampling_logp_difference/mean": 0.19809746742248535, | |
| "step": 57, | |
| "step_time": 119.24784157378599 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3990.0, | |
| "completions/mean_length": 1423.890625, | |
| "completions/mean_terminated_length": 1225.3729248046875, | |
| "completions/min_length": 222.0, | |
| "completions/min_terminated_length": 222.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6659361720085144, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004336898403114928, | |
| "kl": 0.0008447931904811412, | |
| "learning_rate": 4.987283891194743e-05, | |
| "loss": 0.0002889493480324745, | |
| "num_tokens": 8861519.0, | |
| "reward": 0.875, | |
| "reward_std": 0.5509731769561768, | |
| "rewards/reward_func/mean": 0.09722222222222222, | |
| "rewards/reward_func/std": 0.07834745115704006, | |
| "sampling/importance_sampling_ratio/max": 2.9992592334747314, | |
| "sampling/importance_sampling_ratio/mean": 0.9453926086425781, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.000340461730957, | |
| "sampling/sampling_logp_difference/mean": 0.2015022337436676, | |
| "step": 58, | |
| "step_time": 131.44689005077817 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3963.0, | |
| "completions/mean_length": 1537.296875, | |
| "completions/mean_terminated_length": 1390.3792724609375, | |
| "completions/min_length": 13.0, | |
| "completions/min_terminated_length": 318.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7286873459815979, | |
| "epoch": 0.14532019704433496, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0036566899418926735, | |
| "kl": 0.0008002854156075045, | |
| "learning_rate": 4.986790542676576e-05, | |
| "loss": -0.03378288820385933, | |
| "num_tokens": 9050914.0, | |
| "reward": 0.95703125, | |
| "reward_std": 0.47257041931152344, | |
| "rewards/reward_func/mean": 0.10633680555555555, | |
| "rewards/reward_func/std": 0.06965653763877021, | |
| "sampling/importance_sampling_ratio/max": 2.997560501098633, | |
| "sampling/importance_sampling_ratio/mean": 0.9403575658798218, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.749882698059082, | |
| "sampling/sampling_logp_difference/mean": 0.2278328537940979, | |
| "step": 59, | |
| "step_time": 137.2060365586076 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2930.0, | |
| "completions/mean_length": 988.796875, | |
| "completions/mean_terminated_length": 888.5645141601562, | |
| "completions/min_length": 117.0, | |
| "completions/min_terminated_length": 117.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.71505106985569, | |
| "epoch": 0.1477832512315271, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005548976099805918, | |
| "kl": 0.001017661954392679, | |
| "learning_rate": 4.986287830509558e-05, | |
| "loss": 0.008952794596552849, | |
| "num_tokens": 9206245.0, | |
| "reward": 0.8984375, | |
| "reward_std": 0.4623069167137146, | |
| "rewards/reward_func/mean": 0.0998263888888889, | |
| "rewards/reward_func/std": 0.06768603954050276, | |
| "sampling/importance_sampling_ratio/max": 2.9915103912353516, | |
| "sampling/importance_sampling_ratio/mean": 0.9487502574920654, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.22498893737793, | |
| "sampling/sampling_logp_difference/mean": 0.20820589363574982, | |
| "step": 60, | |
| "step_time": 124.10505168675445 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4091.0, | |
| "completions/mean_length": 1294.859375, | |
| "completions/mean_terminated_length": 1108.11669921875, | |
| "completions/min_length": 254.0, | |
| "completions/min_terminated_length": 254.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7779558748006821, | |
| "epoch": 0.15024630541871922, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004369721876072753, | |
| "kl": 0.0011822088854387403, | |
| "learning_rate": 4.985775756586581e-05, | |
| "loss": -0.016528453677892685, | |
| "num_tokens": 9385436.0, | |
| "reward": 0.99609375, | |
| "reward_std": 0.5643021464347839, | |
| "rewards/reward_func/mean": 0.11067708333333333, | |
| "rewards/reward_func/std": 0.08128498329056634, | |
| "sampling/importance_sampling_ratio/max": 2.9969987869262695, | |
| "sampling/importance_sampling_ratio/mean": 0.9418332576751709, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.551093101501465, | |
| "sampling/sampling_logp_difference/mean": 0.2260904610157013, | |
| "step": 61, | |
| "step_time": 127.24007483315654 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2781.0, | |
| "completions/mean_length": 654.90625, | |
| "completions/mean_terminated_length": 543.9031982421875, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5641279071569443, | |
| "epoch": 0.15270935960591134, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.010293875912976197, | |
| "kl": 0.0012169272813480347, | |
| "learning_rate": 4.9852543228357835e-05, | |
| "loss": 0.004283260554075241, | |
| "num_tokens": 9507270.0, | |
| "reward": 1.2265625, | |
| "reward_std": 0.8860904574394226, | |
| "rewards/reward_func/mean": 0.1362847222222222, | |
| "rewards/reward_func/std": 0.17288169264793396, | |
| "sampling/importance_sampling_ratio/max": 2.997990846633911, | |
| "sampling/importance_sampling_ratio/mean": 0.9731359481811523, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.856095314025879, | |
| "sampling/sampling_logp_difference/mean": 0.14170248806476593, | |
| "step": 62, | |
| "step_time": 131.49094270309433 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3911.0, | |
| "completions/max_terminated_length": 3911.0, | |
| "completions/mean_length": 1028.59375, | |
| "completions/mean_terminated_length": 1028.59375, | |
| "completions/min_length": 168.0, | |
| "completions/min_terminated_length": 168.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7280330806970596, | |
| "epoch": 0.15517241379310345, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005106972664854136, | |
| "kl": 0.001104547263821587, | |
| "learning_rate": 4.9847235312205484e-05, | |
| "loss": -0.028649836778640747, | |
| "num_tokens": 9644188.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.618768572807312, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.12877601716253492, | |
| "sampling/importance_sampling_ratio/max": 2.9986512660980225, | |
| "sampling/importance_sampling_ratio/mean": 0.9545284509658813, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.098888397216797, | |
| "sampling/sampling_logp_difference/mean": 0.19114258885383606, | |
| "step": 63, | |
| "step_time": 106.71372815198265 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3139.0, | |
| "completions/mean_length": 1194.53125, | |
| "completions/mean_terminated_length": 991.5084838867188, | |
| "completions/min_length": 270.0, | |
| "completions/min_terminated_length": 270.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.688900962471962, | |
| "epoch": 0.15763546798029557, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0037696714939516734, | |
| "kl": 0.0010499156342120841, | |
| "learning_rate": 4.984183383739496e-05, | |
| "loss": -0.012204478494822979, | |
| "num_tokens": 9815470.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.507003903388977, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.0734660890367296, | |
| "sampling/importance_sampling_ratio/max": 2.999692440032959, | |
| "sampling/importance_sampling_ratio/mean": 0.9450367093086243, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.381852149963379, | |
| "sampling/sampling_logp_difference/mean": 0.20991432666778564, | |
| "step": 64, | |
| "step_time": 137.48744579590857 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3326.0, | |
| "completions/max_terminated_length": 3326.0, | |
| "completions/mean_length": 801.546875, | |
| "completions/mean_terminated_length": 801.546875, | |
| "completions/min_length": 221.0, | |
| "completions/min_terminated_length": 221.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6562062352895737, | |
| "epoch": 0.16009852216748768, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004939764234466071, | |
| "kl": 0.0008211284002754837, | |
| "learning_rate": 4.983633882426471e-05, | |
| "loss": 0.03823887184262276, | |
| "num_tokens": 9946017.0, | |
| "reward": 1.03125, | |
| "reward_std": 0.306995153427124, | |
| "rewards/reward_func/mean": 0.11458333333333333, | |
| "rewards/reward_func/std": 0.04659368097782135, | |
| "sampling/importance_sampling_ratio/max": 2.995638608932495, | |
| "sampling/importance_sampling_ratio/mean": 0.9580010175704956, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.03026008605957, | |
| "sampling/sampling_logp_difference/mean": 0.17757360637187958, | |
| "step": 65, | |
| "step_time": 92.6048651910387 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3525.0, | |
| "completions/mean_length": 1163.859375, | |
| "completions/mean_terminated_length": 1019.6557006835938, | |
| "completions/min_length": 175.0, | |
| "completions/min_terminated_length": 175.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7296376675367355, | |
| "epoch": 0.1625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003653082653066837, | |
| "kl": 0.0007672712963540107, | |
| "learning_rate": 4.983075029350542e-05, | |
| "loss": -0.035717546939849854, | |
| "num_tokens": 10100424.0, | |
| "reward": 0.94921875, | |
| "reward_std": 0.40868473052978516, | |
| "rewards/reward_func/mean": 0.10546875, | |
| "rewards/reward_func/std": 0.06101351810826196, | |
| "sampling/importance_sampling_ratio/max": 2.9951772689819336, | |
| "sampling/importance_sampling_ratio/mean": 0.9508060812950134, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.666208267211914, | |
| "sampling/sampling_logp_difference/mean": 0.20355165004730225, | |
| "step": 66, | |
| "step_time": 147.45380871812813 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3794.0, | |
| "completions/mean_length": 1260.1875, | |
| "completions/mean_terminated_length": 1120.72119140625, | |
| "completions/min_length": 209.0, | |
| "completions/min_terminated_length": 209.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6744803041219711, | |
| "epoch": 0.16502463054187191, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007845307744964685, | |
| "kl": 0.0007850325491745025, | |
| "learning_rate": 4.9825068266159894e-05, | |
| "loss": 0.07681119441986084, | |
| "num_tokens": 10266036.0, | |
| "reward": 1.16796875, | |
| "reward_std": 0.8165630102157593, | |
| "rewards/reward_func/mean": 0.12977430555555555, | |
| "rewards/reward_func/std": 0.13601407905419668, | |
| "sampling/importance_sampling_ratio/max": 2.9994595050811768, | |
| "sampling/importance_sampling_ratio/mean": 0.9440293312072754, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.185983657836914, | |
| "sampling/sampling_logp_difference/mean": 0.21098700165748596, | |
| "step": 67, | |
| "step_time": 127.28534332639538 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1945.0, | |
| "completions/mean_length": 866.609375, | |
| "completions/mean_terminated_length": 722.8135375976562, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7389185130596161, | |
| "epoch": 0.16748768472906403, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005251337630497652, | |
| "kl": 0.000870908988872543, | |
| "learning_rate": 4.981929276362298e-05, | |
| "loss": 0.030912771821022034, | |
| "num_tokens": 10399355.0, | |
| "reward": 0.89453125, | |
| "reward_std": 0.4053332209587097, | |
| "rewards/reward_func/mean": 0.0993923611111111, | |
| "rewards/reward_func/std": 0.08497350083457099, | |
| "sampling/importance_sampling_ratio/max": 2.9986069202423096, | |
| "sampling/importance_sampling_ratio/mean": 0.9526036977767944, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.291192054748535, | |
| "sampling/sampling_logp_difference/mean": 0.19692301750183105, | |
| "step": 68, | |
| "step_time": 131.86357428110205 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3656.0, | |
| "completions/mean_length": 1072.796875, | |
| "completions/mean_terminated_length": 936.4833984375, | |
| "completions/min_length": 184.0, | |
| "completions/min_terminated_length": 184.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7273115962743759, | |
| "epoch": 0.16995073891625614, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003302635291869116, | |
| "kl": 0.0010801725584315136, | |
| "learning_rate": 4.981342380764149e-05, | |
| "loss": -0.024595141410827637, | |
| "num_tokens": 10566158.0, | |
| "reward": 0.94921875, | |
| "reward_std": 0.34840819239616394, | |
| "rewards/reward_func/mean": 0.10546875, | |
| "rewards/reward_func/std": 0.054026698072751365, | |
| "sampling/importance_sampling_ratio/max": 2.998894453048706, | |
| "sampling/importance_sampling_ratio/mean": 0.9404545426368713, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.374916076660156, | |
| "sampling/sampling_logp_difference/mean": 0.22012995183467865, | |
| "step": 69, | |
| "step_time": 117.17583943810314 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4056.0, | |
| "completions/mean_length": 1020.984375, | |
| "completions/mean_terminated_length": 921.790283203125, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7125604450702667, | |
| "epoch": 0.1724137931034483, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012211166320763836, | |
| "kl": 0.0008212087268475443, | |
| "learning_rate": 4.980746142031414e-05, | |
| "loss": -0.03852664679288864, | |
| "num_tokens": 10718861.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.6835201978683472, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.12549426820543078, | |
| "sampling/importance_sampling_ratio/max": 2.9977447986602783, | |
| "sampling/importance_sampling_ratio/mean": 0.9521719217300415, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.026714324951172, | |
| "sampling/sampling_logp_difference/mean": 0.19808495044708252, | |
| "step": 70, | |
| "step_time": 132.49208030593581 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2987.0, | |
| "completions/mean_length": 1077.109375, | |
| "completions/mean_terminated_length": 995.3933715820312, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 261.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5986730307340622, | |
| "epoch": 0.1748768472906404, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007337189387788424, | |
| "kl": 0.0006397934484994039, | |
| "learning_rate": 4.980140562409141e-05, | |
| "loss": -0.03012331947684288, | |
| "num_tokens": 10857156.0, | |
| "reward": 0.93359375, | |
| "reward_std": 0.41605237126350403, | |
| "rewards/reward_func/mean": 0.1037326388888889, | |
| "rewards/reward_func/std": 0.06313699980576833, | |
| "sampling/importance_sampling_ratio/max": 2.9992947578430176, | |
| "sampling/importance_sampling_ratio/mean": 0.953962504863739, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.96396541595459, | |
| "sampling/sampling_logp_difference/mean": 0.1823788285255432, | |
| "step": 71, | |
| "step_time": 162.1669030210469 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3893.0, | |
| "completions/mean_length": 1130.734375, | |
| "completions/mean_terminated_length": 1090.51611328125, | |
| "completions/min_length": 125.0, | |
| "completions/min_terminated_length": 125.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.658635139465332, | |
| "epoch": 0.17733990147783252, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0077832425322214745, | |
| "kl": 0.0007886915263952687, | |
| "learning_rate": 4.979525644177554e-05, | |
| "loss": 0.03975846618413925, | |
| "num_tokens": 11016371.0, | |
| "reward": 1.14453125, | |
| "reward_std": 0.7802175283432007, | |
| "rewards/reward_func/mean": 0.1271701388888889, | |
| "rewards/reward_func/std": 0.1387238320377138, | |
| "sampling/importance_sampling_ratio/max": 2.998619794845581, | |
| "sampling/importance_sampling_ratio/mean": 0.9450385570526123, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.331713676452637, | |
| "sampling/sampling_logp_difference/mean": 0.19837921857833862, | |
| "step": 72, | |
| "step_time": 125.53122378420085 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3461.0, | |
| "completions/mean_length": 878.890625, | |
| "completions/mean_terminated_length": 806.786865234375, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7126602828502655, | |
| "epoch": 0.17980295566502463, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009708859119011157, | |
| "kl": 0.000986951738013886, | |
| "learning_rate": 4.978901389652039e-05, | |
| "loss": 0.045913953334093094, | |
| "num_tokens": 11164348.0, | |
| "reward": 1.01953125, | |
| "reward_std": 1.0849847793579102, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.16929766370190513, | |
| "sampling/importance_sampling_ratio/max": 2.9910213947296143, | |
| "sampling/importance_sampling_ratio/mean": 0.9483171701431274, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.62489128112793, | |
| "sampling/sampling_logp_difference/mean": 0.20636305212974548, | |
| "step": 73, | |
| "step_time": 129.03768724016845 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3590.0, | |
| "completions/mean_length": 1483.046875, | |
| "completions/mean_terminated_length": 1275.086181640625, | |
| "completions/min_length": 209.0, | |
| "completions/min_terminated_length": 209.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.67229163646698, | |
| "epoch": 0.18226600985221675, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01704922977426874, | |
| "kl": 0.0007648792961845174, | |
| "learning_rate": 4.978267801183133e-05, | |
| "loss": 0.011364220641553402, | |
| "num_tokens": 11349727.0, | |
| "reward": 1.2109375, | |
| "reward_std": 0.6519487500190735, | |
| "rewards/reward_func/mean": 0.1345486111111111, | |
| "rewards/reward_func/std": 0.12985923224025303, | |
| "sampling/importance_sampling_ratio/max": 2.9996538162231445, | |
| "sampling/importance_sampling_ratio/mean": 0.9436983466148376, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.866939544677734, | |
| "sampling/sampling_logp_difference/mean": 0.20118892192840576, | |
| "step": 74, | |
| "step_time": 137.89118567178957 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3703.0, | |
| "completions/mean_length": 1226.40625, | |
| "completions/mean_terminated_length": 1085.2786865234375, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7562486529350281, | |
| "epoch": 0.18472906403940886, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0047505098072179695, | |
| "kl": 0.0008432121394434944, | |
| "learning_rate": 4.977624881156524e-05, | |
| "loss": -0.00926197599619627, | |
| "num_tokens": 11511657.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.4628430902957916, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.0687158273326026, | |
| "sampling/importance_sampling_ratio/max": 2.996015787124634, | |
| "sampling/importance_sampling_ratio/mean": 0.9437671899795532, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.199777603149414, | |
| "sampling/sampling_logp_difference/mean": 0.21507258713245392, | |
| "step": 75, | |
| "step_time": 119.91632703808136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3823.0, | |
| "completions/mean_length": 1037.953125, | |
| "completions/mean_terminated_length": 990.3386840820312, | |
| "completions/min_length": 340.0, | |
| "completions/min_terminated_length": 340.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6312072277069092, | |
| "epoch": 0.18719211822660098, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006440735421087148, | |
| "kl": 0.0009448294003959745, | |
| "learning_rate": 4.976972631993033e-05, | |
| "loss": -0.025233589112758636, | |
| "num_tokens": 11665798.0, | |
| "reward": 0.86328125, | |
| "reward_std": 0.5135653614997864, | |
| "rewards/reward_func/mean": 0.0959201388888889, | |
| "rewards/reward_func/std": 0.07400760385725233, | |
| "sampling/importance_sampling_ratio/max": 2.9997024536132812, | |
| "sampling/importance_sampling_ratio/mean": 0.9479377269744873, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.914000511169434, | |
| "sampling/sampling_logp_difference/mean": 0.1916263997554779, | |
| "step": 76, | |
| "step_time": 126.66293425206095 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2478.0, | |
| "completions/mean_length": 808.015625, | |
| "completions/mean_terminated_length": 737.475341796875, | |
| "completions/min_length": 124.0, | |
| "completions/min_terminated_length": 124.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.75564044713974, | |
| "epoch": 0.1896551724137931, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006801059909191471, | |
| "kl": 0.0012827419268433005, | |
| "learning_rate": 4.976311056148609e-05, | |
| "loss": -0.026884566992521286, | |
| "num_tokens": 11790983.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.5137012004852295, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.07450851135783726, | |
| "sampling/importance_sampling_ratio/max": 2.9969658851623535, | |
| "sampling/importance_sampling_ratio/mean": 0.9571335315704346, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.874902725219727, | |
| "sampling/sampling_logp_difference/mean": 0.18946215510368347, | |
| "step": 77, | |
| "step_time": 120.88332917005755 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3836.0, | |
| "completions/mean_length": 1026.78125, | |
| "completions/mean_terminated_length": 927.774169921875, | |
| "completions/min_length": 105.0, | |
| "completions/min_terminated_length": 105.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6949244886636734, | |
| "epoch": 0.1921182266009852, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009162823895835595, | |
| "kl": 0.0008584359893575311, | |
| "learning_rate": 4.975640156114322e-05, | |
| "loss": 0.004000354558229446, | |
| "num_tokens": 11933161.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.7367273569107056, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.12690814170572493, | |
| "sampling/importance_sampling_ratio/max": 2.9982340335845947, | |
| "sampling/importance_sampling_ratio/mean": 0.9532779455184937, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.17520523071289, | |
| "sampling/sampling_logp_difference/mean": 0.19447797536849976, | |
| "step": 78, | |
| "step_time": 125.43066487205215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2179.0, | |
| "completions/max_terminated_length": 2179.0, | |
| "completions/mean_length": 624.765625, | |
| "completions/mean_terminated_length": 631.4515991210938, | |
| "completions/min_length": 51.0, | |
| "completions/min_terminated_length": 125.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.709212139248848, | |
| "epoch": 0.19458128078817735, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0063953100167225795, | |
| "kl": 0.0009358888928545639, | |
| "learning_rate": 4.974959934416346e-05, | |
| "loss": 0.03648173436522484, | |
| "num_tokens": 12061082.0, | |
| "reward": 0.94921875, | |
| "reward_std": 0.4299771189689636, | |
| "rewards/reward_func/mean": 0.10546875, | |
| "rewards/reward_func/std": 0.06504838996463352, | |
| "sampling/importance_sampling_ratio/max": 2.9928903579711914, | |
| "sampling/importance_sampling_ratio/mean": 0.9550417065620422, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.374837875366211, | |
| "sampling/sampling_logp_difference/mean": 0.19429925084114075, | |
| "step": 79, | |
| "step_time": 75.71170671563596 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2969.0, | |
| "completions/mean_length": 786.9375, | |
| "completions/mean_terminated_length": 721.5, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6359275728464127, | |
| "epoch": 0.19704433497536947, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005510590683439618, | |
| "kl": 0.0013039757614023983, | |
| "learning_rate": 4.9742703936159586e-05, | |
| "loss": -0.01529807597398758, | |
| "num_tokens": 12196694.0, | |
| "reward": 0.91796875, | |
| "reward_std": 0.35370680689811707, | |
| "rewards/reward_func/mean": 0.10199652777777778, | |
| "rewards/reward_func/std": 0.05228892962137858, | |
| "sampling/importance_sampling_ratio/max": 2.9959685802459717, | |
| "sampling/importance_sampling_ratio/mean": 0.9564290642738342, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.529621124267578, | |
| "sampling/sampling_logp_difference/mean": 0.18233630061149597, | |
| "step": 80, | |
| "step_time": 143.74423436028883 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3342.0, | |
| "completions/mean_length": 1421.53125, | |
| "completions/mean_terminated_length": 1329.7703857421875, | |
| "completions/min_length": 316.0, | |
| "completions/min_terminated_length": 316.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6616629511117935, | |
| "epoch": 0.19950738916256158, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005486852713694094, | |
| "kl": 0.000617337238509208, | |
| "learning_rate": 4.973571536309525e-05, | |
| "loss": 0.036342501640319824, | |
| "num_tokens": 12369864.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.6267337203025818, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.11080888079272376, | |
| "sampling/importance_sampling_ratio/max": 2.9995808601379395, | |
| "sampling/importance_sampling_ratio/mean": 0.9480350613594055, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.685368537902832, | |
| "sampling/sampling_logp_difference/mean": 0.19612553715705872, | |
| "step": 81, | |
| "step_time": 127.6738231680356 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3796.0, | |
| "completions/mean_length": 1419.578125, | |
| "completions/mean_terminated_length": 1254.0169677734375, | |
| "completions/min_length": 131.0, | |
| "completions/min_terminated_length": 131.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8103909641504288, | |
| "epoch": 0.2019704433497537, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004435665783129135, | |
| "kl": 0.0008063071873039007, | |
| "learning_rate": 4.9728633651284914e-05, | |
| "loss": -0.0042844414710998535, | |
| "num_tokens": 12548685.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.7439821362495422, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.11853209965758854, | |
| "sampling/importance_sampling_ratio/max": 2.999321460723877, | |
| "sampling/importance_sampling_ratio/mean": 0.9406133890151978, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.160935401916504, | |
| "sampling/sampling_logp_difference/mean": 0.22386467456817627, | |
| "step": 82, | |
| "step_time": 136.5100540383719 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3796.0, | |
| "completions/max_terminated_length": 3796.0, | |
| "completions/mean_length": 941.171875, | |
| "completions/mean_terminated_length": 946.01611328125, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 225.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6475118100643158, | |
| "epoch": 0.2044334975369458, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005268346256156498, | |
| "kl": 0.0009101934847421944, | |
| "learning_rate": 4.972145882739374e-05, | |
| "loss": -0.02767626754939556, | |
| "num_tokens": 12698024.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.43121910095214844, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.06390465299288432, | |
| "sampling/importance_sampling_ratio/max": 2.9928174018859863, | |
| "sampling/importance_sampling_ratio/mean": 0.9558981657028198, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.236062049865723, | |
| "sampling/sampling_logp_difference/mean": 0.1847381889820099, | |
| "step": 83, | |
| "step_time": 112.00504079344682 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2627.0, | |
| "completions/max_terminated_length": 2627.0, | |
| "completions/mean_length": 1038.828125, | |
| "completions/mean_terminated_length": 1018.4425659179688, | |
| "completions/min_length": 240.0, | |
| "completions/min_terminated_length": 240.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6957644075155258, | |
| "epoch": 0.20689655172413793, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004293307295253765, | |
| "kl": 0.001004656864097342, | |
| "learning_rate": 4.971419091843748e-05, | |
| "loss": -0.024233508855104446, | |
| "num_tokens": 12854877.0, | |
| "reward": 1.2109375, | |
| "reward_std": 0.5777260065078735, | |
| "rewards/reward_func/mean": 0.1345486111111111, | |
| "rewards/reward_func/std": 0.08230482538541158, | |
| "sampling/importance_sampling_ratio/max": 2.998769998550415, | |
| "sampling/importance_sampling_ratio/mean": 0.9453675150871277, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.116597175598145, | |
| "sampling/sampling_logp_difference/mean": 0.21188262104988098, | |
| "step": 84, | |
| "step_time": 84.51651998935267 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2771.0, | |
| "completions/mean_length": 715.484375, | |
| "completions/mean_terminated_length": 661.825439453125, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7366594225168228, | |
| "epoch": 0.20935960591133004, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0037554999149393494, | |
| "kl": 0.001163585257017985, | |
| "learning_rate": 4.970682995178238e-05, | |
| "loss": -0.002081345533952117, | |
| "num_tokens": 12984828.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.3900541663169861, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.055674018131362066, | |
| "sampling/importance_sampling_ratio/max": 2.998495578765869, | |
| "sampling/importance_sampling_ratio/mean": 0.9560437202453613, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.31248664855957, | |
| "sampling/sampling_logp_difference/mean": 0.1962520182132721, | |
| "step": 85, | |
| "step_time": 139.01101576304063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4018.0, | |
| "completions/mean_length": 1098.484375, | |
| "completions/mean_terminated_length": 960.86669921875, | |
| "completions/min_length": 103.0, | |
| "completions/min_terminated_length": 103.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6849155128002167, | |
| "epoch": 0.21182266009852216, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005711301934492953, | |
| "kl": 0.0016330961079802364, | |
| "learning_rate": 4.9699375955145114e-05, | |
| "loss": -0.031073298305273056, | |
| "num_tokens": 13144571.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.6751574873924255, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.09190892179807027, | |
| "sampling/importance_sampling_ratio/max": 2.9948723316192627, | |
| "sampling/importance_sampling_ratio/mean": 0.9550687074661255, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.623679161071777, | |
| "sampling/sampling_logp_difference/mean": 0.18543776869773865, | |
| "step": 86, | |
| "step_time": 132.33628671360202 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3577.0, | |
| "completions/mean_length": 1125.25, | |
| "completions/mean_terminated_length": 989.9491577148438, | |
| "completions/min_length": 47.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7316817492246628, | |
| "epoch": 0.21428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01102958555478108, | |
| "kl": 0.0011147368932142854, | |
| "learning_rate": 4.96918289565926e-05, | |
| "loss": -0.069314144551754, | |
| "num_tokens": 13300539.0, | |
| "reward": 0.9453125, | |
| "reward_std": 0.7656680345535278, | |
| "rewards/reward_func/mean": 0.10503472222222222, | |
| "rewards/reward_func/std": 0.12950749198595682, | |
| "sampling/importance_sampling_ratio/max": 2.9998490810394287, | |
| "sampling/importance_sampling_ratio/mean": 0.9460821151733398, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.81115436553955, | |
| "sampling/sampling_logp_difference/mean": 0.20998844504356384, | |
| "step": 87, | |
| "step_time": 122.99315962707624 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3992.0, | |
| "completions/mean_length": 1200.171875, | |
| "completions/mean_terminated_length": 954.7626953125, | |
| "completions/min_length": 151.0, | |
| "completions/min_terminated_length": 151.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5681051015853882, | |
| "epoch": 0.21674876847290642, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.001914027338477591, | |
| "kl": 0.0006996607990004122, | |
| "learning_rate": 4.968418898454199e-05, | |
| "loss": -0.011678352952003479, | |
| "num_tokens": 13454422.0, | |
| "reward": 1.0703125, | |
| "reward_std": 0.37392371892929077, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.05488494038581848, | |
| "sampling/importance_sampling_ratio/max": 2.999253511428833, | |
| "sampling/importance_sampling_ratio/mean": 0.9604488611221313, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.74988842010498, | |
| "sampling/sampling_logp_difference/mean": 0.1624874323606491, | |
| "step": 88, | |
| "step_time": 130.86213548691012 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2659.0, | |
| "completions/max_terminated_length": 2659.0, | |
| "completions/mean_length": 979.9375, | |
| "completions/mean_terminated_length": 971.825439453125, | |
| "completions/min_length": 220.0, | |
| "completions/min_terminated_length": 220.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7630282044410706, | |
| "epoch": 0.21921182266009853, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003157074850135714, | |
| "kl": 0.0007990372541826218, | |
| "learning_rate": 4.967645606776047e-05, | |
| "loss": -0.017293542623519897, | |
| "num_tokens": 13601730.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.36451956629753113, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.05453648335403866, | |
| "sampling/importance_sampling_ratio/max": 2.999340057373047, | |
| "sampling/importance_sampling_ratio/mean": 0.9504854679107666, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.437399864196777, | |
| "sampling/sampling_logp_difference/mean": 0.20192095637321472, | |
| "step": 89, | |
| "step_time": 83.2202613428235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3322.0, | |
| "completions/mean_length": 1119.65625, | |
| "completions/mean_terminated_length": 973.2786254882812, | |
| "completions/min_length": 125.0, | |
| "completions/min_terminated_length": 125.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.664212629199028, | |
| "epoch": 0.22167487684729065, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006293474969162976, | |
| "kl": 0.0009798562823561952, | |
| "learning_rate": 4.966863023536523e-05, | |
| "loss": 0.059450846165418625, | |
| "num_tokens": 13754572.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.7627471685409546, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.14111600981818306, | |
| "sampling/importance_sampling_ratio/max": 2.9967024326324463, | |
| "sampling/importance_sampling_ratio/mean": 0.9532588720321655, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.812085151672363, | |
| "sampling/sampling_logp_difference/mean": 0.19800463318824768, | |
| "step": 90, | |
| "step_time": 198.11143169808201 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3196.0, | |
| "completions/mean_length": 1016.890625, | |
| "completions/mean_terminated_length": 917.5645141601562, | |
| "completions/min_length": 255.0, | |
| "completions/min_terminated_length": 255.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6830228269100189, | |
| "epoch": 0.22413793103448276, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005031840669452568, | |
| "kl": 0.0010694598604459316, | |
| "learning_rate": 4.96607115168233e-05, | |
| "loss": 0.0734919086098671, | |
| "num_tokens": 13895253.0, | |
| "reward": 0.96484375, | |
| "reward_std": 0.40792542695999146, | |
| "rewards/reward_func/mean": 0.1072048611111111, | |
| "rewards/reward_func/std": 0.06053559978802999, | |
| "sampling/importance_sampling_ratio/max": 2.9970076084136963, | |
| "sampling/importance_sampling_ratio/mean": 0.9569849967956543, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.747546195983887, | |
| "sampling/sampling_logp_difference/mean": 0.17965635657310486, | |
| "step": 91, | |
| "step_time": 118.30531685194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4063.0, | |
| "completions/max_terminated_length": 4063.0, | |
| "completions/mean_length": 1106.8125, | |
| "completions/mean_terminated_length": 1109.920654296875, | |
| "completions/min_length": 284.0, | |
| "completions/min_terminated_length": 284.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7050769776105881, | |
| "epoch": 0.22660098522167488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004941963891636721, | |
| "kl": 0.0008765287348069251, | |
| "learning_rate": 4.965269994195146e-05, | |
| "loss": -0.013432206586003304, | |
| "num_tokens": 14055561.0, | |
| "reward": 0.9453125, | |
| "reward_std": 0.4069552719593048, | |
| "rewards/reward_func/mean": 0.10503472222222222, | |
| "rewards/reward_func/std": 0.062094110581609935, | |
| "sampling/importance_sampling_ratio/max": 2.9970788955688477, | |
| "sampling/importance_sampling_ratio/mean": 0.9504680633544922, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.066850662231445, | |
| "sampling/sampling_logp_difference/mean": 0.19342385232448578, | |
| "step": 92, | |
| "step_time": 124.41021094494499 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2304.0, | |
| "completions/mean_length": 958.171875, | |
| "completions/mean_terminated_length": 869.1638793945312, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 22.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6579289436340332, | |
| "epoch": 0.229064039408867, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005850634507404222, | |
| "kl": 0.0009105180070037022, | |
| "learning_rate": 4.964459554091615e-05, | |
| "loss": 0.0003941170871257782, | |
| "num_tokens": 14196180.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.6431877613067627, | |
| "rewards/reward_func/mean": 0.1284722222222222, | |
| "rewards/reward_func/std": 0.14959995283020866, | |
| "sampling/importance_sampling_ratio/max": 2.998795747756958, | |
| "sampling/importance_sampling_ratio/mean": 0.9569849967956543, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.074917793273926, | |
| "sampling/sampling_logp_difference/mean": 0.18225592374801636, | |
| "step": 93, | |
| "step_time": 128.9647894760128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3898.0, | |
| "completions/mean_length": 1015.65625, | |
| "completions/mean_terminated_length": 926.7868041992188, | |
| "completions/min_length": 120.0, | |
| "completions/min_terminated_length": 120.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6771551072597504, | |
| "epoch": 0.2315270935960591, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006173881670119516, | |
| "kl": 0.0010466481326147914, | |
| "learning_rate": 4.9636398344233294e-05, | |
| "loss": 0.06990549713373184, | |
| "num_tokens": 14340030.0, | |
| "reward": 1.05078125, | |
| "reward_std": 0.7348734736442566, | |
| "rewards/reward_func/mean": 0.11675347222222222, | |
| "rewards/reward_func/std": 0.1158642934428321, | |
| "sampling/importance_sampling_ratio/max": 2.9963386058807373, | |
| "sampling/importance_sampling_ratio/mean": 0.9552966356277466, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.304615020751953, | |
| "sampling/sampling_logp_difference/mean": 0.18898534774780273, | |
| "step": 94, | |
| "step_time": 120.63474641623907 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3279.0, | |
| "completions/mean_length": 670.046875, | |
| "completions/mean_terminated_length": 615.6666870117188, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7438563853502274, | |
| "epoch": 0.23399014778325122, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007035370635253527, | |
| "kl": 0.0012569701357278973, | |
| "learning_rate": 4.9628108382768255e-05, | |
| "loss": -0.003529743291437626, | |
| "num_tokens": 14478481.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.4794900715351105, | |
| "rewards/reward_func/mean": 0.10590277777777778, | |
| "rewards/reward_func/std": 0.07061862614419726, | |
| "sampling/importance_sampling_ratio/max": 2.9977564811706543, | |
| "sampling/importance_sampling_ratio/mean": 0.9566473960876465, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.311806678771973, | |
| "sampling/sampling_logp_difference/mean": 0.18983232975006104, | |
| "step": 95, | |
| "step_time": 136.43330346024595 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3520.0, | |
| "completions/mean_length": 1320.671875, | |
| "completions/mean_terminated_length": 1151.413818359375, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7065405398607254, | |
| "epoch": 0.23645320197044334, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008568856880559883, | |
| "kl": 0.0011838628561235964, | |
| "learning_rate": 4.9619725687735686e-05, | |
| "loss": -0.011811807751655579, | |
| "num_tokens": 14657916.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.8539034724235535, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.16876447200775146, | |
| "sampling/importance_sampling_ratio/max": 2.9951701164245605, | |
| "sampling/importance_sampling_ratio/mean": 0.9421722888946533, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.175261497497559, | |
| "sampling/sampling_logp_difference/mean": 0.2128932625055313, | |
| "step": 96, | |
| "step_time": 121.57325341110118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3340.0, | |
| "completions/mean_length": 1076.6875, | |
| "completions/mean_terminated_length": 995.9835205078125, | |
| "completions/min_length": 97.0, | |
| "completions/min_terminated_length": 97.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6008987575769424, | |
| "epoch": 0.23891625615763548, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006735602377105294, | |
| "kl": 0.001147125702118501, | |
| "learning_rate": 4.96112502906994e-05, | |
| "loss": -0.014930861070752144, | |
| "num_tokens": 14817160.0, | |
| "reward": 0.97265625, | |
| "reward_std": 0.6931025981903076, | |
| "rewards/reward_func/mean": 0.10807291666666667, | |
| "rewards/reward_func/std": 0.12522114316622415, | |
| "sampling/importance_sampling_ratio/max": 2.9980320930480957, | |
| "sampling/importance_sampling_ratio/mean": 0.9482098817825317, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.042277336120605, | |
| "sampling/sampling_logp_difference/mean": 0.1913895308971405, | |
| "step": 97, | |
| "step_time": 121.60319680999964 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2776.0, | |
| "completions/mean_length": 930.390625, | |
| "completions/mean_terminated_length": 891.1966552734375, | |
| "completions/min_length": 19.0, | |
| "completions/min_terminated_length": 19.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7253329902887344, | |
| "epoch": 0.2413793103448276, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0057644207764060195, | |
| "kl": 0.0011922722042072564, | |
| "learning_rate": 4.960268222357227e-05, | |
| "loss": -0.0594397634267807, | |
| "num_tokens": 14962129.0, | |
| "reward": 0.91796875, | |
| "reward_std": 0.5383224487304688, | |
| "rewards/reward_func/mean": 0.10199652777777778, | |
| "rewards/reward_func/std": 0.07774124874009027, | |
| "sampling/importance_sampling_ratio/max": 2.9961249828338623, | |
| "sampling/importance_sampling_ratio/mean": 0.9478025436401367, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.086261749267578, | |
| "sampling/sampling_logp_difference/mean": 0.20908650755882263, | |
| "step": 98, | |
| "step_time": 144.51559121510945 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3467.0, | |
| "completions/mean_length": 836.453125, | |
| "completions/mean_terminated_length": 784.7540283203125, | |
| "completions/min_length": 4.0, | |
| "completions/min_terminated_length": 4.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7885630130767822, | |
| "epoch": 0.2438423645320197, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005133721823283165, | |
| "kl": 0.0010307101329090074, | |
| "learning_rate": 4.959402151861613e-05, | |
| "loss": -0.026363378390669823, | |
| "num_tokens": 15104718.0, | |
| "reward": 0.91796875, | |
| "reward_std": 0.36202332377433777, | |
| "rewards/reward_func/mean": 0.10199652777777778, | |
| "rewards/reward_func/std": 0.05339052610927158, | |
| "sampling/importance_sampling_ratio/max": 2.9989402294158936, | |
| "sampling/importance_sampling_ratio/mean": 0.9489303827285767, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.430939674377441, | |
| "sampling/sampling_logp_difference/mean": 0.21791726350784302, | |
| "step": 99, | |
| "step_time": 128.17962785507552 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2746.0, | |
| "completions/mean_length": 817.984375, | |
| "completions/mean_terminated_length": 769.3547973632812, | |
| "completions/min_length": 180.0, | |
| "completions/min_terminated_length": 180.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.668343186378479, | |
| "epoch": 0.24630541871921183, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004204010827777613, | |
| "kl": 0.0011549554765224457, | |
| "learning_rate": 4.958526820844158e-05, | |
| "loss": -0.028108961880207062, | |
| "num_tokens": 15247933.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.3857583701610565, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.05755675666862064, | |
| "sampling/importance_sampling_ratio/max": 2.9985768795013428, | |
| "sampling/importance_sampling_ratio/mean": 0.9531916975975037, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.425323486328125, | |
| "sampling/sampling_logp_difference/mean": 0.1958668977022171, | |
| "step": 100, | |
| "step_time": 111.34808640205301 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2958.0, | |
| "completions/mean_length": 859.625, | |
| "completions/mean_terminated_length": 741.4667358398438, | |
| "completions/min_length": 140.0, | |
| "completions/min_terminated_length": 140.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7310220897197723, | |
| "epoch": 0.24876847290640394, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006198253337158266, | |
| "kl": 0.00121245181071572, | |
| "learning_rate": 4.957642232600797e-05, | |
| "loss": -0.02447034977376461, | |
| "num_tokens": 15383989.0, | |
| "reward": 1.01171875, | |
| "reward_std": 0.5729166865348816, | |
| "rewards/reward_func/mean": 0.11241319444444445, | |
| "rewards/reward_func/std": 0.08195814821455213, | |
| "sampling/importance_sampling_ratio/max": 2.996098518371582, | |
| "sampling/importance_sampling_ratio/mean": 0.9529974460601807, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.62481689453125, | |
| "sampling/sampling_logp_difference/mean": 0.20314571261405945, | |
| "step": 101, | |
| "step_time": 125.92396034114063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3345.0, | |
| "completions/mean_length": 1127.828125, | |
| "completions/mean_terminated_length": 1048.49169921875, | |
| "completions/min_length": 31.0, | |
| "completions/min_terminated_length": 249.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6528142094612122, | |
| "epoch": 0.2512315270935961, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0034161702757803136, | |
| "kl": 0.0009927775390679017, | |
| "learning_rate": 4.956748390462316e-05, | |
| "loss": -0.04475435987114906, | |
| "num_tokens": 15537162.0, | |
| "reward": 0.9609375, | |
| "reward_std": 0.3914227783679962, | |
| "rewards/reward_func/mean": 0.10677083333333333, | |
| "rewards/reward_func/std": 0.05869032442569733, | |
| "sampling/importance_sampling_ratio/max": 2.9991648197174072, | |
| "sampling/importance_sampling_ratio/mean": 0.9523589015007019, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.799592971801758, | |
| "sampling/sampling_logp_difference/mean": 0.1954439878463745, | |
| "step": 102, | |
| "step_time": 122.85386259504594 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3015.0, | |
| "completions/max_terminated_length": 3015.0, | |
| "completions/mean_length": 729.453125, | |
| "completions/mean_terminated_length": 742.2257690429688, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.616577073931694, | |
| "epoch": 0.2536945812807882, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005423463270168187, | |
| "kl": 0.0010207885934505612, | |
| "learning_rate": 4.955845297794348e-05, | |
| "loss": 0.0010399030288681388, | |
| "num_tokens": 15676775.0, | |
| "reward": 1.015625, | |
| "reward_std": 0.4473799467086792, | |
| "rewards/reward_func/mean": 0.11284722222222222, | |
| "rewards/reward_func/std": 0.06644997994105022, | |
| "sampling/importance_sampling_ratio/max": 2.9987363815307617, | |
| "sampling/importance_sampling_ratio/mean": 0.9537383317947388, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.999309539794922, | |
| "sampling/sampling_logp_difference/mean": 0.18572832643985748, | |
| "step": 103, | |
| "step_time": 91.29225635388866 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1277.0, | |
| "completions/mean_length": 596.328125, | |
| "completions/mean_terminated_length": 540.77783203125, | |
| "completions/min_length": 175.0, | |
| "completions/min_terminated_length": 175.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7377785593271255, | |
| "epoch": 0.2561576354679803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0057071374591335855, | |
| "kl": 0.0016648432065267116, | |
| "learning_rate": 4.954932957997359e-05, | |
| "loss": -0.011287668719887733, | |
| "num_tokens": 15805308.0, | |
| "reward": 0.91015625, | |
| "reward_std": 0.4115571975708008, | |
| "rewards/reward_func/mean": 0.10112847222222222, | |
| "rewards/reward_func/std": 0.060685116383764476, | |
| "sampling/importance_sampling_ratio/max": 2.9926609992980957, | |
| "sampling/importance_sampling_ratio/mean": 0.9554417133331299, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.185643196105957, | |
| "sampling/sampling_logp_difference/mean": 0.1904008984565735, | |
| "step": 104, | |
| "step_time": 142.36614196095616 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3505.0, | |
| "completions/max_terminated_length": 3505.0, | |
| "completions/mean_length": 803.0, | |
| "completions/mean_terminated_length": 803.0, | |
| "completions/min_length": 150.0, | |
| "completions/min_terminated_length": 150.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6270923018455505, | |
| "epoch": 0.25862068965517243, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0038009793433285183, | |
| "kl": 0.001456006633816287, | |
| "learning_rate": 4.954011374506632e-05, | |
| "loss": 0.003311900421977043, | |
| "num_tokens": 15948636.0, | |
| "reward": 0.87890625, | |
| "reward_std": 0.3507140874862671, | |
| "rewards/reward_func/mean": 0.09765625, | |
| "rewards/reward_func/std": 0.05090266797277662, | |
| "sampling/importance_sampling_ratio/max": 2.9969429969787598, | |
| "sampling/importance_sampling_ratio/mean": 0.9581711292266846, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.685257911682129, | |
| "sampling/sampling_logp_difference/mean": 0.17381532490253448, | |
| "step": 105, | |
| "step_time": 110.55873239389621 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3710.0, | |
| "completions/mean_length": 1168.421875, | |
| "completions/mean_terminated_length": 1033.2000732421875, | |
| "completions/min_length": 319.0, | |
| "completions/min_terminated_length": 319.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5577425956726074, | |
| "epoch": 0.26108374384236455, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004559121558826178, | |
| "kl": 0.001145464033470489, | |
| "learning_rate": 4.953080550792254e-05, | |
| "loss": -0.02663501352071762, | |
| "num_tokens": 16098423.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.49794965982437134, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.0740637414985233, | |
| "sampling/importance_sampling_ratio/max": 2.994739055633545, | |
| "sampling/importance_sampling_ratio/mean": 0.9601489305496216, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.123044967651367, | |
| "sampling/sampling_logp_difference/mean": 0.16482222080230713, | |
| "step": 106, | |
| "step_time": 127.26529746688902 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3451.0, | |
| "completions/max_terminated_length": 3451.0, | |
| "completions/mean_length": 908.515625, | |
| "completions/mean_terminated_length": 908.515625, | |
| "completions/min_length": 89.0, | |
| "completions/min_terminated_length": 89.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7097785025835037, | |
| "epoch": 0.26354679802955666, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006498712261232801, | |
| "kl": 0.001028874219628051, | |
| "learning_rate": 4.952140490359108e-05, | |
| "loss": 0.01925332471728325, | |
| "num_tokens": 16239080.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.4969992935657501, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.07306570145818922, | |
| "sampling/importance_sampling_ratio/max": 2.996018648147583, | |
| "sampling/importance_sampling_ratio/mean": 0.9512661695480347, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.228954315185547, | |
| "sampling/sampling_logp_difference/mean": 0.20210078358650208, | |
| "step": 107, | |
| "step_time": 111.68506760639139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2785.0, | |
| "completions/max_terminated_length": 2785.0, | |
| "completions/mean_length": 654.171875, | |
| "completions/mean_terminated_length": 654.171875, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6160021126270294, | |
| "epoch": 0.2660098522167488, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003462118141168412, | |
| "kl": 0.0017048600275302306, | |
| "learning_rate": 4.951191196746855e-05, | |
| "loss": -0.002917511621490121, | |
| "num_tokens": 16370051.0, | |
| "reward": 0.96484375, | |
| "reward_std": 0.28821834921836853, | |
| "rewards/reward_func/mean": 0.1072048611111111, | |
| "rewards/reward_func/std": 0.04385380778047773, | |
| "sampling/importance_sampling_ratio/max": 2.9970760345458984, | |
| "sampling/importance_sampling_ratio/mean": 0.9633020758628845, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.874786376953125, | |
| "sampling/sampling_logp_difference/mean": 0.16299639642238617, | |
| "step": 108, | |
| "step_time": 80.04466179292649 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3072.0, | |
| "completions/mean_length": 1216.625, | |
| "completions/mean_terminated_length": 1011.6551513671875, | |
| "completions/min_length": 21.0, | |
| "completions/min_terminated_length": 318.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6527849286794662, | |
| "epoch": 0.2684729064039409, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0024810418704967474, | |
| "kl": 0.0012183433573227376, | |
| "learning_rate": 4.950232673529922e-05, | |
| "loss": -0.02829243801534176, | |
| "num_tokens": 16532315.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.3708224594593048, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.05599056515428755, | |
| "sampling/importance_sampling_ratio/max": 2.9940707683563232, | |
| "sampling/importance_sampling_ratio/mean": 0.9564605355262756, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.400651931762695, | |
| "sampling/sampling_logp_difference/mean": 0.1813342571258545, | |
| "step": 109, | |
| "step_time": 152.9524313802831 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2671.0, | |
| "completions/max_terminated_length": 2671.0, | |
| "completions/mean_length": 894.421875, | |
| "completions/mean_terminated_length": 926.7212524414062, | |
| "completions/min_length": 46.0, | |
| "completions/min_terminated_length": 127.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7501505464315414, | |
| "epoch": 0.270935960591133, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004039668650228865, | |
| "kl": 0.0010712686198530719, | |
| "learning_rate": 4.9492649243174894e-05, | |
| "loss": -0.03601264953613281, | |
| "num_tokens": 16674150.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.421383261680603, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.0628970828321245, | |
| "sampling/importance_sampling_ratio/max": 2.9961328506469727, | |
| "sampling/importance_sampling_ratio/mean": 0.9449620842933655, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.124836921691895, | |
| "sampling/sampling_logp_difference/mean": 0.2186630368232727, | |
| "step": 110, | |
| "step_time": 88.27275016298518 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3556.0, | |
| "completions/mean_length": 1397.828125, | |
| "completions/mean_terminated_length": 1248.5167236328125, | |
| "completions/min_length": 131.0, | |
| "completions/min_terminated_length": 131.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.728972002863884, | |
| "epoch": 0.2733990147783251, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003406792708563326, | |
| "kl": 0.0008771911088842899, | |
| "learning_rate": 4.948287952753475e-05, | |
| "loss": -0.017109831795096397, | |
| "num_tokens": 16856187.0, | |
| "reward": 0.94140625, | |
| "reward_std": 0.407621294260025, | |
| "rewards/reward_func/mean": 0.10460069444444445, | |
| "rewards/reward_func/std": 0.06210231284300486, | |
| "sampling/importance_sampling_ratio/max": 2.9997353553771973, | |
| "sampling/importance_sampling_ratio/mean": 0.946989893913269, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.114840507507324, | |
| "sampling/sampling_logp_difference/mean": 0.20500458776950836, | |
| "step": 111, | |
| "step_time": 154.77054870105349 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3783.0, | |
| "completions/mean_length": 1305.265625, | |
| "completions/mean_terminated_length": 1062.2930908203125, | |
| "completions/min_length": 143.0, | |
| "completions/min_terminated_length": 143.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6425753086805344, | |
| "epoch": 0.27586206896551724, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006779910373498959, | |
| "kl": 0.0009470495278947055, | |
| "learning_rate": 4.947301762516526e-05, | |
| "loss": -0.021853812038898468, | |
| "num_tokens": 17023532.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.5935410857200623, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.10359535614649455, | |
| "sampling/importance_sampling_ratio/max": 2.996713161468506, | |
| "sampling/importance_sampling_ratio/mean": 0.9557031989097595, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.808320999145508, | |
| "sampling/sampling_logp_difference/mean": 0.1847507208585739, | |
| "step": 112, | |
| "step_time": 125.85401763604023 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3423.0, | |
| "completions/max_terminated_length": 3423.0, | |
| "completions/mean_length": 829.296875, | |
| "completions/mean_terminated_length": 829.296875, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.665191113948822, | |
| "epoch": 0.27832512315270935, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0049727812963450396, | |
| "kl": 0.0011728792305802926, | |
| "learning_rate": 4.946306357319997e-05, | |
| "loss": 0.0018310556188225746, | |
| "num_tokens": 17174735.0, | |
| "reward": 0.9296875, | |
| "reward_std": 0.352059543132782, | |
| "rewards/reward_func/mean": 0.1032986111111111, | |
| "rewards/reward_func/std": 0.052210075987709895, | |
| "sampling/importance_sampling_ratio/max": 2.999053955078125, | |
| "sampling/importance_sampling_ratio/mean": 0.9518330693244934, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.943872451782227, | |
| "sampling/sampling_logp_difference/mean": 0.19780156016349792, | |
| "step": 113, | |
| "step_time": 104.48315672390163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3505.0, | |
| "completions/max_terminated_length": 3505.0, | |
| "completions/mean_length": 872.828125, | |
| "completions/mean_terminated_length": 872.828125, | |
| "completions/min_length": 65.0, | |
| "completions/min_terminated_length": 65.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6913283914327621, | |
| "epoch": 0.28078817733990147, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0064203528593588695, | |
| "kl": 0.0009301105164922774, | |
| "learning_rate": 4.9453017409119416e-05, | |
| "loss": 0.06472717970609665, | |
| "num_tokens": 17311444.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.727751612663269, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.11462434629599254, | |
| "sampling/importance_sampling_ratio/max": 2.9989895820617676, | |
| "sampling/importance_sampling_ratio/mean": 0.9566206932067871, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.315587043762207, | |
| "sampling/sampling_logp_difference/mean": 0.1795286387205124, | |
| "step": 114, | |
| "step_time": 114.97647374519147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2426.0, | |
| "completions/max_terminated_length": 2426.0, | |
| "completions/mean_length": 776.515625, | |
| "completions/mean_terminated_length": 780.5423583984375, | |
| "completions/min_length": 228.0, | |
| "completions/min_terminated_length": 228.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7222933769226074, | |
| "epoch": 0.2832512315270936, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004808982424799973, | |
| "kl": 0.0010381730098742992, | |
| "learning_rate": 4.9442879170750976e-05, | |
| "loss": -0.04378526657819748, | |
| "num_tokens": 17443477.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.5238454341888428, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.10823717713356018, | |
| "sampling/importance_sampling_ratio/max": 2.997987747192383, | |
| "sampling/importance_sampling_ratio/mean": 0.9515382051467896, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.357341766357422, | |
| "sampling/sampling_logp_difference/mean": 0.19661840796470642, | |
| "step": 115, | |
| "step_time": 80.79798079002649 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3933.0, | |
| "completions/mean_length": 708.015625, | |
| "completions/mean_terminated_length": 654.2381591796875, | |
| "completions/min_length": 208.0, | |
| "completions/min_terminated_length": 208.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6727284938097, | |
| "epoch": 0.2857142857142857, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0020403117969845282, | |
| "kl": 0.001005473401164636, | |
| "learning_rate": 4.943264889626871e-05, | |
| "loss": -0.0025690970942378044, | |
| "num_tokens": 17572774.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.2524372637271881, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.03793410791291131, | |
| "sampling/importance_sampling_ratio/max": 2.9996719360351562, | |
| "sampling/importance_sampling_ratio/mean": 0.9610835313796997, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.65058708190918, | |
| "sampling/sampling_logp_difference/mean": 0.1769585758447647, | |
| "step": 116, | |
| "step_time": 140.72442844510078 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3501.0, | |
| "completions/mean_length": 1290.484375, | |
| "completions/mean_terminated_length": 1152.5081787109375, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 95.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6598100662231445, | |
| "epoch": 0.2881773399014778, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005709928483840225, | |
| "kl": 0.0009271236776839942, | |
| "learning_rate": 4.942232662419324e-05, | |
| "loss": 0.02777143567800522, | |
| "num_tokens": 17757765.0, | |
| "reward": 1.01171875, | |
| "reward_std": 0.6507469415664673, | |
| "rewards/reward_func/mean": 0.11241319444444445, | |
| "rewards/reward_func/std": 0.10562626189655727, | |
| "sampling/importance_sampling_ratio/max": 2.9967916011810303, | |
| "sampling/importance_sampling_ratio/mean": 0.9467035531997681, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.749887466430664, | |
| "sampling/sampling_logp_difference/mean": 0.20036232471466064, | |
| "step": 117, | |
| "step_time": 137.15964733506553 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2417.0, | |
| "completions/mean_length": 863.890625, | |
| "completions/mean_terminated_length": 828.34423828125, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6135236918926239, | |
| "epoch": 0.29064039408866993, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005015909482580426, | |
| "kl": 0.0009673014137661085, | |
| "learning_rate": 4.941191239339158e-05, | |
| "loss": -0.03219921514391899, | |
| "num_tokens": 17892414.0, | |
| "reward": 0.921875, | |
| "reward_std": 0.39559829235076904, | |
| "rewards/reward_func/mean": 0.10243055555555555, | |
| "rewards/reward_func/std": 0.0586680488453971, | |
| "sampling/importance_sampling_ratio/max": 2.992807388305664, | |
| "sampling/importance_sampling_ratio/mean": 0.9618625640869141, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.102208137512207, | |
| "sampling/sampling_logp_difference/mean": 0.169651061296463, | |
| "step": 118, | |
| "step_time": 116.0549735748209 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2490.0, | |
| "completions/max_terminated_length": 2490.0, | |
| "completions/mean_length": 547.140625, | |
| "completions/mean_terminated_length": 544.8870849609375, | |
| "completions/min_length": 232.0, | |
| "completions/min_terminated_length": 232.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5702391713857651, | |
| "epoch": 0.29310344827586204, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005584388944937819, | |
| "kl": 0.0014928276068530977, | |
| "learning_rate": 4.9401406243077e-05, | |
| "loss": 0.0032053731847554445, | |
| "num_tokens": 18006743.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.4380665123462677, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.09896917310025957, | |
| "sampling/importance_sampling_ratio/max": 2.988774299621582, | |
| "sampling/importance_sampling_ratio/mean": 0.9679361581802368, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.677656173706055, | |
| "sampling/sampling_logp_difference/mean": 0.1493430733680725, | |
| "step": 119, | |
| "step_time": 75.99547935905866 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2589.0, | |
| "completions/mean_length": 1029.71875, | |
| "completions/mean_terminated_length": 995.8524169921875, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7400919646024704, | |
| "epoch": 0.2955665024630542, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006443843541173749, | |
| "kl": 0.001080518588423729, | |
| "learning_rate": 4.939080821280889e-05, | |
| "loss": 0.002451773267239332, | |
| "num_tokens": 18154629.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.7208173871040344, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.1304852200878991, | |
| "sampling/importance_sampling_ratio/max": 2.9992618560791016, | |
| "sampling/importance_sampling_ratio/mean": 0.9474197030067444, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 21.595125198364258, | |
| "sampling/sampling_logp_difference/mean": 0.21167021989822388, | |
| "step": 120, | |
| "step_time": 143.630502771819 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3303.0, | |
| "completions/mean_length": 1061.234375, | |
| "completions/mean_terminated_length": 1023.9677124023438, | |
| "completions/min_length": 190.0, | |
| "completions/min_terminated_length": 190.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7333497107028961, | |
| "epoch": 0.29802955665024633, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00924462717955816, | |
| "kl": 0.0010052941361209378, | |
| "learning_rate": 4.9380118342492596e-05, | |
| "loss": 0.05594870075583458, | |
| "num_tokens": 18319604.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.8487323522567749, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.13110164470142788, | |
| "sampling/importance_sampling_ratio/max": 2.99625825881958, | |
| "sampling/importance_sampling_ratio/mean": 0.9394384622573853, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.43202018737793, | |
| "sampling/sampling_logp_difference/mean": 0.22769640386104584, | |
| "step": 121, | |
| "step_time": 144.13920049113221 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3333.0, | |
| "completions/max_terminated_length": 3333.0, | |
| "completions/mean_length": 1141.796875, | |
| "completions/mean_terminated_length": 1141.796875, | |
| "completions/min_length": 307.0, | |
| "completions/min_terminated_length": 307.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6576246023178101, | |
| "epoch": 0.30049261083743845, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004997437097530932, | |
| "kl": 0.001344296833849512, | |
| "learning_rate": 4.936933667237926e-05, | |
| "loss": -3.2415613532066345e-05, | |
| "num_tokens": 18492823.0, | |
| "reward": 0.984375, | |
| "reward_std": 0.4876958429813385, | |
| "rewards/reward_func/mean": 0.109375, | |
| "rewards/reward_func/std": 0.0719899766974979, | |
| "sampling/importance_sampling_ratio/max": 2.9998884201049805, | |
| "sampling/importance_sampling_ratio/mean": 0.9471359252929688, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.76190185546875, | |
| "sampling/sampling_logp_difference/mean": 0.19633889198303223, | |
| "step": 122, | |
| "step_time": 107.010423976928 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2052.0, | |
| "completions/mean_length": 939.9375, | |
| "completions/mean_terminated_length": 846.3933715820312, | |
| "completions/min_length": 145.0, | |
| "completions/min_terminated_length": 145.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7716793268918991, | |
| "epoch": 0.30295566502463056, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005142065502705506, | |
| "kl": 0.0012641588837141171, | |
| "learning_rate": 4.935846324306571e-05, | |
| "loss": -0.04138103872537613, | |
| "num_tokens": 18636195.0, | |
| "reward": 0.875, | |
| "reward_std": 0.3700064420700073, | |
| "rewards/reward_func/mean": 0.09722222222222222, | |
| "rewards/reward_func/std": 0.05186475647820367, | |
| "sampling/importance_sampling_ratio/max": 2.9990382194519043, | |
| "sampling/importance_sampling_ratio/mean": 0.9490943551063538, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.95599365234375, | |
| "sampling/sampling_logp_difference/mean": 0.21140214800834656, | |
| "step": 123, | |
| "step_time": 158.32471957895905 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2607.0, | |
| "completions/max_terminated_length": 2607.0, | |
| "completions/mean_length": 784.765625, | |
| "completions/mean_terminated_length": 784.765625, | |
| "completions/min_length": 46.0, | |
| "completions/min_terminated_length": 46.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6249927580356598, | |
| "epoch": 0.3054187192118227, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0321336624260346, | |
| "kl": 0.0011989549384452403, | |
| "learning_rate": 4.934749809549427e-05, | |
| "loss": -0.01217272412031889, | |
| "num_tokens": 18771556.0, | |
| "reward": 0.95703125, | |
| "reward_std": 0.6386286616325378, | |
| "rewards/reward_func/mean": 0.10633680555555555, | |
| "rewards/reward_func/std": 0.11104831099510193, | |
| "sampling/importance_sampling_ratio/max": 2.99676775932312, | |
| "sampling/importance_sampling_ratio/mean": 0.9598660469055176, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.24995231628418, | |
| "sampling/sampling_logp_difference/mean": 0.17498475313186646, | |
| "step": 124, | |
| "step_time": 77.80229415814392 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3424.0, | |
| "completions/mean_length": 895.421875, | |
| "completions/mean_terminated_length": 800.7069091796875, | |
| "completions/min_length": 7.0, | |
| "completions/min_terminated_length": 81.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6748597323894501, | |
| "epoch": 0.3078817733990148, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006046409516086105, | |
| "kl": 0.0010081128857564181, | |
| "learning_rate": 4.9336441270952595e-05, | |
| "loss": 0.09001608937978745, | |
| "num_tokens": 18905151.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.7896963953971863, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.1456383185254203, | |
| "sampling/importance_sampling_ratio/max": 2.9961564540863037, | |
| "sampling/importance_sampling_ratio/mean": 0.955715537071228, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.451773643493652, | |
| "sampling/sampling_logp_difference/mean": 0.19001328945159912, | |
| "step": 125, | |
| "step_time": 143.38146781618707 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2875.0, | |
| "completions/mean_length": 1264.296875, | |
| "completions/mean_terminated_length": 1024.322021484375, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7080798596143723, | |
| "epoch": 0.3103448275862069, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008165326775365848, | |
| "kl": 0.0014844062679912895, | |
| "learning_rate": 4.932529281107355e-05, | |
| "loss": 0.006085427477955818, | |
| "num_tokens": 19074754.0, | |
| "reward": 1.1796875, | |
| "reward_std": 1.0345890522003174, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.17465003662639195, | |
| "sampling/importance_sampling_ratio/max": 2.999812602996826, | |
| "sampling/importance_sampling_ratio/mean": 0.9442422389984131, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.812485694885254, | |
| "sampling/sampling_logp_difference/mean": 0.2114749252796173, | |
| "step": 126, | |
| "step_time": 136.04279370303266 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2906.0, | |
| "completions/max_terminated_length": 2906.0, | |
| "completions/mean_length": 730.3125, | |
| "completions/mean_terminated_length": 729.5322265625, | |
| "completions/min_length": 25.0, | |
| "completions/min_terminated_length": 25.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8340341448783875, | |
| "epoch": 0.312807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01630669112288163, | |
| "kl": 0.002556009974796325, | |
| "learning_rate": 4.931405275783507e-05, | |
| "loss": -0.009080227464437485, | |
| "num_tokens": 19230038.0, | |
| "reward": 0.92578125, | |
| "reward_std": 0.9581311345100403, | |
| "rewards/reward_func/mean": 0.10286458333333333, | |
| "rewards/reward_func/std": 0.15839683678415087, | |
| "sampling/importance_sampling_ratio/max": 2.9990289211273193, | |
| "sampling/importance_sampling_ratio/mean": 0.9346531629562378, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.898031234741211, | |
| "sampling/sampling_logp_difference/mean": 0.24698056280612946, | |
| "step": 127, | |
| "step_time": 100.47396804229356 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3481.0, | |
| "completions/mean_length": 988.28125, | |
| "completions/mean_terminated_length": 888.0322265625, | |
| "completions/min_length": 217.0, | |
| "completions/min_terminated_length": 217.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.718447744846344, | |
| "epoch": 0.31527093596059114, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006766736760887471, | |
| "kl": 0.0016968499112408608, | |
| "learning_rate": 4.930272115355992e-05, | |
| "loss": -0.017601270228624344, | |
| "num_tokens": 19374488.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.5992202758789062, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.10523506999015808, | |
| "sampling/importance_sampling_ratio/max": 2.9975998401641846, | |
| "sampling/importance_sampling_ratio/mean": 0.9509903192520142, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.434347152709961, | |
| "sampling/sampling_logp_difference/mean": 0.20046307146549225, | |
| "step": 128, | |
| "step_time": 131.1536720271688 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 879.46875, | |
| "completions/mean_terminated_length": 775.2000122070312, | |
| "completions/min_length": 15.0, | |
| "completions/min_terminated_length": 137.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7035564482212067, | |
| "epoch": 0.31773399014778325, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008916657281684769, | |
| "kl": 0.0014704163477290422, | |
| "learning_rate": 4.929129804091562e-05, | |
| "loss": -0.0401824489235878, | |
| "num_tokens": 19516246.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.6580073833465576, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.11946020854843988, | |
| "sampling/importance_sampling_ratio/max": 2.984513998031616, | |
| "sampling/importance_sampling_ratio/mean": 0.9554660320281982, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.787663459777832, | |
| "sampling/sampling_logp_difference/mean": 0.1841893047094345, | |
| "step": 129, | |
| "step_time": 133.5201920508407 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3938.0, | |
| "completions/max_terminated_length": 3938.0, | |
| "completions/mean_length": 829.765625, | |
| "completions/mean_terminated_length": 829.765625, | |
| "completions/min_length": 141.0, | |
| "completions/min_terminated_length": 141.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7256664037704468, | |
| "epoch": 0.32019704433497537, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0036159006370788662, | |
| "kl": 0.0014483562554232776, | |
| "learning_rate": 4.927978346291424e-05, | |
| "loss": -0.00416420167312026, | |
| "num_tokens": 19648583.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.2827889919281006, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.043313807911343045, | |
| "sampling/importance_sampling_ratio/max": 2.996049404144287, | |
| "sampling/importance_sampling_ratio/mean": 0.9527163505554199, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.433141708374023, | |
| "sampling/sampling_logp_difference/mean": 0.19274213910102844, | |
| "step": 130, | |
| "step_time": 108.99631480290554 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3723.0, | |
| "completions/mean_length": 989.65625, | |
| "completions/mean_terminated_length": 903.7930908203125, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.812716618180275, | |
| "epoch": 0.3226600985221675, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013116335520916896, | |
| "kl": 0.0016703194414731115, | |
| "learning_rate": 4.9268177462912255e-05, | |
| "loss": -0.05333615094423294, | |
| "num_tokens": 19797857.0, | |
| "reward": 1.0703125, | |
| "reward_std": 1.1246416568756104, | |
| "rewards/reward_func/mean": 0.1189236111111111, | |
| "rewards/reward_func/std": 0.19068154527081382, | |
| "sampling/importance_sampling_ratio/max": 2.9927353858947754, | |
| "sampling/importance_sampling_ratio/mean": 0.9444986581802368, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.257890701293945, | |
| "sampling/sampling_logp_difference/mean": 0.21850484609603882, | |
| "step": 131, | |
| "step_time": 118.88544421107508 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3857.0, | |
| "completions/max_terminated_length": 3857.0, | |
| "completions/mean_length": 816.703125, | |
| "completions/mean_terminated_length": 817.84130859375, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6633237600326538, | |
| "epoch": 0.3251231527093596, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0031921393393205934, | |
| "kl": 0.0013503513764590025, | |
| "learning_rate": 4.9256480084610376e-05, | |
| "loss": -0.00991955865174532, | |
| "num_tokens": 19926286.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.40089187026023865, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.05971749954753452, | |
| "sampling/importance_sampling_ratio/max": 2.9954562187194824, | |
| "sampling/importance_sampling_ratio/mean": 0.9606219530105591, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.18454647064209, | |
| "sampling/sampling_logp_difference/mean": 0.17874056100845337, | |
| "step": 132, | |
| "step_time": 110.05460423021577 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3840.0, | |
| "completions/mean_length": 1232.65625, | |
| "completions/mean_terminated_length": 1169.283447265625, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 131.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.737604409456253, | |
| "epoch": 0.3275862068965517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005410025456930972, | |
| "kl": 0.0011540141276782379, | |
| "learning_rate": 4.9244691372053376e-05, | |
| "loss": 0.0003698645159602165, | |
| "num_tokens": 20093112.0, | |
| "reward": 1.0234375, | |
| "reward_std": 0.6824308037757874, | |
| "rewards/reward_func/mean": 0.11371527777777778, | |
| "rewards/reward_func/std": 0.1231840882036421, | |
| "sampling/importance_sampling_ratio/max": 2.99904465675354, | |
| "sampling/importance_sampling_ratio/mean": 0.9456319212913513, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.690722465515137, | |
| "sampling/sampling_logp_difference/mean": 0.2077077031135559, | |
| "step": 133, | |
| "step_time": 136.14795652101748 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3575.0, | |
| "completions/mean_length": 1119.234375, | |
| "completions/mean_terminated_length": 982.9491577148438, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8452684134244919, | |
| "epoch": 0.33004926108374383, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009834879984087863, | |
| "kl": 0.0014841850788798183, | |
| "learning_rate": 4.9232811369629936e-05, | |
| "loss": -0.09593109786510468, | |
| "num_tokens": 20252807.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.8177770376205444, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.16325969994068146, | |
| "sampling/importance_sampling_ratio/max": 2.999234914779663, | |
| "sampling/importance_sampling_ratio/mean": 0.9442738890647888, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.05754280090332, | |
| "sampling/sampling_logp_difference/mean": 0.22508864104747772, | |
| "step": 134, | |
| "step_time": 130.63481510290876 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3492.0, | |
| "completions/mean_length": 780.5, | |
| "completions/mean_terminated_length": 668.9500122070312, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 182.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7695089131593704, | |
| "epoch": 0.33251231527093594, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0050137511344250825, | |
| "kl": 0.001503633859101683, | |
| "learning_rate": 4.9220840122072495e-05, | |
| "loss": -0.044059619307518005, | |
| "num_tokens": 20393879.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.5713995099067688, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.12210349573029412, | |
| "sampling/importance_sampling_ratio/max": 2.998356819152832, | |
| "sampling/importance_sampling_ratio/mean": 0.9478631615638733, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.743224143981934, | |
| "sampling/sampling_logp_difference/mean": 0.21353685855865479, | |
| "step": 135, | |
| "step_time": 128.1372858658433 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2537.0, | |
| "completions/mean_length": 827.328125, | |
| "completions/mean_terminated_length": 756.8275756835938, | |
| "completions/min_length": 125.0, | |
| "completions/min_terminated_length": 185.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7343933880329132, | |
| "epoch": 0.33497536945812806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009246592184747104, | |
| "kl": 0.0019113961607217789, | |
| "learning_rate": 4.920877767445705e-05, | |
| "loss": -0.002634907141327858, | |
| "num_tokens": 20544364.0, | |
| "reward": 0.9296875, | |
| "reward_std": 0.7063610553741455, | |
| "rewards/reward_func/mean": 0.1032986111111111, | |
| "rewards/reward_func/std": 0.12537386185593075, | |
| "sampling/importance_sampling_ratio/max": 2.9994935989379883, | |
| "sampling/importance_sampling_ratio/mean": 0.9448176026344299, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.562491416931152, | |
| "sampling/sampling_logp_difference/mean": 0.21808388829231262, | |
| "step": 136, | |
| "step_time": 122.9346935309004 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2280.0, | |
| "completions/mean_length": 826.921875, | |
| "completions/mean_terminated_length": 623.5614013671875, | |
| "completions/min_length": 75.0, | |
| "completions/min_terminated_length": 85.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6908468455076218, | |
| "epoch": 0.3374384236453202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005994547580292389, | |
| "kl": 0.0016178832156583667, | |
| "learning_rate": 4.919662407220299e-05, | |
| "loss": 0.017274843528866768, | |
| "num_tokens": 20670935.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.42374932765960693, | |
| "rewards/reward_func/mean": 0.09375, | |
| "rewards/reward_func/std": 0.06059717138608297, | |
| "sampling/importance_sampling_ratio/max": 2.9996566772460938, | |
| "sampling/importance_sampling_ratio/mean": 0.9579602479934692, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.689961433410645, | |
| "sampling/sampling_logp_difference/mean": 0.18010303378105164, | |
| "step": 137, | |
| "step_time": 111.31847057538107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3975.0, | |
| "completions/mean_length": 1044.9375, | |
| "completions/mean_terminated_length": 996.5079956054688, | |
| "completions/min_length": 102.0, | |
| "completions/min_terminated_length": 102.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7295639663934708, | |
| "epoch": 0.3399014778325123, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014028663492014952, | |
| "kl": 0.0016514419403392822, | |
| "learning_rate": 4.918437936107293e-05, | |
| "loss": -0.002566501498222351, | |
| "num_tokens": 20827923.0, | |
| "reward": 1.171875, | |
| "reward_std": 1.023411512374878, | |
| "rewards/reward_func/mean": 0.13020833333333334, | |
| "rewards/reward_func/std": 0.173271753721767, | |
| "sampling/importance_sampling_ratio/max": 2.998654365539551, | |
| "sampling/importance_sampling_ratio/mean": 0.9453516006469727, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.36987018585205, | |
| "sampling/sampling_logp_difference/mean": 0.21218480169773102, | |
| "step": 138, | |
| "step_time": 127.0127622236032 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3486.0, | |
| "completions/mean_length": 969.0, | |
| "completions/mean_terminated_length": 928.57373046875, | |
| "completions/min_length": 131.0, | |
| "completions/min_terminated_length": 131.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7234471142292023, | |
| "epoch": 0.34236453201970446, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005342188170689295, | |
| "kl": 0.0016450581315439194, | |
| "learning_rate": 4.9172043587172564e-05, | |
| "loss": -0.045484960079193115, | |
| "num_tokens": 21004531.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.6477864980697632, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.1517499718401167, | |
| "sampling/importance_sampling_ratio/max": 2.9931271076202393, | |
| "sampling/importance_sampling_ratio/mean": 0.9399889707565308, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.982999801635742, | |
| "sampling/sampling_logp_difference/mean": 0.22505322098731995, | |
| "step": 139, | |
| "step_time": 145.2830160120502 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3734.0, | |
| "completions/mean_length": 1036.578125, | |
| "completions/mean_terminated_length": 883.137939453125, | |
| "completions/min_length": 52.0, | |
| "completions/min_terminated_length": 52.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0906709432601929, | |
| "epoch": 0.3448275862068966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006354945561563261, | |
| "kl": 0.0026675731933210045, | |
| "learning_rate": 4.915961679695046e-05, | |
| "loss": 0.002837133128196001, | |
| "num_tokens": 21157112.0, | |
| "reward": 1.1953125, | |
| "reward_std": 0.8035989999771118, | |
| "rewards/reward_func/mean": 0.1328125, | |
| "rewards/reward_func/std": 0.18738024102316964, | |
| "sampling/importance_sampling_ratio/max": 2.9998462200164795, | |
| "sampling/importance_sampling_ratio/mean": 0.9394787549972534, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.561127662658691, | |
| "sampling/sampling_logp_difference/mean": 0.23769940435886383, | |
| "step": 140, | |
| "step_time": 117.01026117592119 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3236.0, | |
| "completions/mean_length": 1042.078125, | |
| "completions/mean_terminated_length": 988.9031982421875, | |
| "completions/min_length": 252.0, | |
| "completions/min_terminated_length": 252.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7437245845794678, | |
| "epoch": 0.3472906403940887, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013162011043027224, | |
| "kl": 0.002167004276998341, | |
| "learning_rate": 4.914709903719788e-05, | |
| "loss": -0.0010647885501384735, | |
| "num_tokens": 21309389.0, | |
| "reward": 1.0078125, | |
| "reward_std": 0.8392276763916016, | |
| "rewards/reward_func/mean": 0.11197916666666667, | |
| "rewards/reward_func/std": 0.14033709466457367, | |
| "sampling/importance_sampling_ratio/max": 2.9996509552001953, | |
| "sampling/importance_sampling_ratio/mean": 0.9429901838302612, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.79409122467041, | |
| "sampling/sampling_logp_difference/mean": 0.22013148665428162, | |
| "step": 141, | |
| "step_time": 114.42773477407172 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3518.0, | |
| "completions/max_terminated_length": 3464.0, | |
| "completions/mean_length": 945.75, | |
| "completions/mean_terminated_length": 904.9207153320312, | |
| "completions/min_length": 84.0, | |
| "completions/min_terminated_length": 84.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.821587085723877, | |
| "epoch": 0.3497536945812808, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011903686956581254, | |
| "kl": 0.0028415362467058003, | |
| "learning_rate": 4.913449035504865e-05, | |
| "loss": 0.01681530475616455, | |
| "num_tokens": 21445789.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.9407598972320557, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.16332321696811253, | |
| "sampling/importance_sampling_ratio/max": 2.9959158897399902, | |
| "sampling/importance_sampling_ratio/mean": 0.9527857303619385, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.999815940856934, | |
| "sampling/sampling_logp_difference/mean": 0.20200073719024658, | |
| "step": 142, | |
| "step_time": 101.91747950715944 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2125.0, | |
| "completions/mean_length": 741.765625, | |
| "completions/mean_terminated_length": 688.5238647460938, | |
| "completions/min_length": 61.0, | |
| "completions/min_terminated_length": 61.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7046773880720139, | |
| "epoch": 0.3522167487684729, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.010794463183350216, | |
| "kl": 0.0018471547809895128, | |
| "learning_rate": 4.912179079797892e-05, | |
| "loss": -0.030231602489948273, | |
| "num_tokens": 21573950.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.6516039371490479, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.11842490235964458, | |
| "sampling/importance_sampling_ratio/max": 2.997932195663452, | |
| "sampling/importance_sampling_ratio/mean": 0.9545925855636597, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.498371124267578, | |
| "sampling/sampling_logp_difference/mean": 0.19408845901489258, | |
| "step": 143, | |
| "step_time": 138.89513726904988 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4020.0, | |
| "completions/mean_length": 825.359375, | |
| "completions/mean_terminated_length": 719.8547973632812, | |
| "completions/min_length": 21.0, | |
| "completions/min_terminated_length": 21.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6682114452123642, | |
| "epoch": 0.35467980295566504, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009881096093500623, | |
| "kl": 0.0020337939204182476, | |
| "learning_rate": 4.910900041380703e-05, | |
| "loss": -0.003868427127599716, | |
| "num_tokens": 21710037.0, | |
| "reward": 0.98046875, | |
| "reward_std": 0.6256816983222961, | |
| "rewards/reward_func/mean": 0.10894097222222222, | |
| "rewards/reward_func/std": 0.10894608166482714, | |
| "sampling/importance_sampling_ratio/max": 2.9985740184783936, | |
| "sampling/importance_sampling_ratio/mean": 0.9586971402168274, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.846827507019043, | |
| "sampling/sampling_logp_difference/mean": 0.17317776381969452, | |
| "step": 144, | |
| "step_time": 119.17530405288562 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3935.0, | |
| "completions/mean_length": 1660.03125, | |
| "completions/mean_terminated_length": 1629.4576416015625, | |
| "completions/min_length": 111.0, | |
| "completions/min_terminated_length": 159.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5887557715177536, | |
| "epoch": 0.35714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.005607486150752285, | |
| "kl": 0.001655459520407021, | |
| "learning_rate": 4.909611925069332e-05, | |
| "loss": 0.037647098302841187, | |
| "num_tokens": 21913143.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.8141295313835144, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.1332318385442098, | |
| "sampling/importance_sampling_ratio/max": 2.9985463619232178, | |
| "sampling/importance_sampling_ratio/mean": 0.9469821453094482, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.999975204467773, | |
| "sampling/sampling_logp_difference/mean": 0.1916055977344513, | |
| "step": 145, | |
| "step_time": 134.20607422688045 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3889.0, | |
| "completions/mean_length": 896.34375, | |
| "completions/mean_terminated_length": 754.322021484375, | |
| "completions/min_length": 22.0, | |
| "completions/min_terminated_length": 101.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7062448561191559, | |
| "epoch": 0.35960591133004927, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0031982862153996645, | |
| "kl": 0.0019544282113201916, | |
| "learning_rate": 4.9083147357139936e-05, | |
| "loss": -0.02738259732723236, | |
| "num_tokens": 22054397.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.5544368028640747, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.12000629636976454, | |
| "sampling/importance_sampling_ratio/max": 2.9959394931793213, | |
| "sampling/importance_sampling_ratio/mean": 0.9592767357826233, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.54010581970215, | |
| "sampling/sampling_logp_difference/mean": 0.18619805574417114, | |
| "step": 146, | |
| "step_time": 121.56179263815284 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3576.0, | |
| "completions/mean_length": 991.390625, | |
| "completions/mean_terminated_length": 879.7000732421875, | |
| "completions/min_length": 132.0, | |
| "completions/min_terminated_length": 132.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7062280178070068, | |
| "epoch": 0.3620689655172414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008489732725130188, | |
| "kl": 0.002335024008061737, | |
| "learning_rate": 4.9070084781990655e-05, | |
| "loss": 0.10014194250106812, | |
| "num_tokens": 22192422.0, | |
| "reward": 1.27734375, | |
| "reward_std": 0.9042636752128601, | |
| "rewards/reward_func/mean": 0.14192708333333334, | |
| "rewards/reward_func/std": 0.14803790383868748, | |
| "sampling/importance_sampling_ratio/max": 2.998023748397827, | |
| "sampling/importance_sampling_ratio/mean": 0.953277587890625, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.429122924804688, | |
| "sampling/sampling_logp_difference/mean": 0.1882876753807068, | |
| "step": 147, | |
| "step_time": 120.45774253108539 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3667.0, | |
| "completions/mean_length": 1032.34375, | |
| "completions/mean_terminated_length": 983.71435546875, | |
| "completions/min_length": 113.0, | |
| "completions/min_terminated_length": 113.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7528516948223114, | |
| "epoch": 0.3645320197044335, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009651669871581751, | |
| "kl": 0.0020805590029340237, | |
| "learning_rate": 4.905693157443072e-05, | |
| "loss": 0.05875544250011444, | |
| "num_tokens": 22341868.0, | |
| "reward": 1.203125, | |
| "reward_std": 0.8496906757354736, | |
| "rewards/reward_func/mean": 0.13368055555555555, | |
| "rewards/reward_func/std": 0.13039267228709328, | |
| "sampling/importance_sampling_ratio/max": 2.9999241828918457, | |
| "sampling/importance_sampling_ratio/mean": 0.9552860856056213, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.058977127075195, | |
| "sampling/sampling_logp_difference/mean": 0.18934276700019836, | |
| "step": 148, | |
| "step_time": 132.01033597579226 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3052.0, | |
| "completions/mean_length": 1092.125, | |
| "completions/mean_terminated_length": 1031.04833984375, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6683467030525208, | |
| "epoch": 0.3669950738916256, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006696869355189105, | |
| "kl": 0.002086179330945015, | |
| "learning_rate": 4.904368778398662e-05, | |
| "loss": 0.08531684428453445, | |
| "num_tokens": 22501812.0, | |
| "reward": 1.109375, | |
| "reward_std": 0.7713559865951538, | |
| "rewards/reward_func/mean": 0.1232638888888889, | |
| "rewards/reward_func/std": 0.12479497989018758, | |
| "sampling/importance_sampling_ratio/max": 2.9943675994873047, | |
| "sampling/importance_sampling_ratio/mean": 0.9468928575515747, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.937206268310547, | |
| "sampling/sampling_logp_difference/mean": 0.20146730542182922, | |
| "step": 149, | |
| "step_time": 116.60984491393901 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2867.0, | |
| "completions/mean_length": 1074.625, | |
| "completions/mean_terminated_length": 923.2069091796875, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7355668991804123, | |
| "epoch": 0.3694581280788177, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.006190045620303314, | |
| "kl": 0.001839842356275767, | |
| "learning_rate": 4.903035346052593e-05, | |
| "loss": -0.04892116039991379, | |
| "num_tokens": 22656732.0, | |
| "reward": 1.0, | |
| "reward_std": 0.6267831921577454, | |
| "rewards/reward_func/mean": 0.1111111111111111, | |
| "rewards/reward_func/std": 0.1120100501510832, | |
| "sampling/importance_sampling_ratio/max": 2.99916672706604, | |
| "sampling/importance_sampling_ratio/mean": 0.9446475505828857, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.62105655670166, | |
| "sampling/sampling_logp_difference/mean": 0.21084731817245483, | |
| "step": 150, | |
| "step_time": 134.81604075315408 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2905.0, | |
| "completions/max_terminated_length": 2905.0, | |
| "completions/mean_length": 933.0, | |
| "completions/mean_terminated_length": 929.2857666015625, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6313983350992203, | |
| "epoch": 0.37192118226600984, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004448968547173259, | |
| "kl": 0.001612977299373597, | |
| "learning_rate": 4.9016928654257096e-05, | |
| "loss": 0.009620461612939835, | |
| "num_tokens": 22809196.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.4225037693977356, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.060594505733913846, | |
| "sampling/importance_sampling_ratio/max": 2.997783899307251, | |
| "sampling/importance_sampling_ratio/mean": 0.9538916945457458, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.496200561523438, | |
| "sampling/sampling_logp_difference/mean": 0.18262873589992523, | |
| "step": 151, | |
| "step_time": 89.70710955327377 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3902.0, | |
| "completions/mean_length": 1262.1875, | |
| "completions/mean_terminated_length": 1127.300048828125, | |
| "completions/min_length": 98.0, | |
| "completions/min_terminated_length": 98.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8315682709217072, | |
| "epoch": 0.37438423645320196, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011580758122952332, | |
| "kl": 0.0028582040686160326, | |
| "learning_rate": 4.9003413415729295e-05, | |
| "loss": -0.015760261565446854, | |
| "num_tokens": 22980728.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.6984494924545288, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.12651590506235758, | |
| "sampling/importance_sampling_ratio/max": 2.9961771965026855, | |
| "sampling/importance_sampling_ratio/mean": 0.928211510181427, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.83804702758789, | |
| "sampling/sampling_logp_difference/mean": 0.26203879714012146, | |
| "step": 152, | |
| "step_time": 137.6754217641428 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1802.0, | |
| "completions/mean_length": 832.359375, | |
| "completions/mean_terminated_length": 730.8643798828125, | |
| "completions/min_length": 187.0, | |
| "completions/min_terminated_length": 187.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7106350511312485, | |
| "epoch": 0.3768472906403941, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01284240306421068, | |
| "kl": 0.002594699908513576, | |
| "learning_rate": 4.898980779583218e-05, | |
| "loss": 0.03899255394935608, | |
| "num_tokens": 23108255.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.7483032941818237, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.14227375719282362, | |
| "sampling/importance_sampling_ratio/max": 2.999643087387085, | |
| "sampling/importance_sampling_ratio/mean": 0.9569455981254578, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.268088340759277, | |
| "sampling/sampling_logp_difference/mean": 0.18348127603530884, | |
| "step": 153, | |
| "step_time": 114.66229340620339 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3680.0, | |
| "completions/max_terminated_length": 3680.0, | |
| "completions/mean_length": 886.609375, | |
| "completions/mean_terminated_length": 865.2222900390625, | |
| "completions/min_length": 75.0, | |
| "completions/min_terminated_length": 75.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6759176105260849, | |
| "epoch": 0.3793103448275862, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.009819383443114064, | |
| "kl": 0.0015356106159742922, | |
| "learning_rate": 4.897611184579575e-05, | |
| "loss": 0.04377323016524315, | |
| "num_tokens": 23256022.0, | |
| "reward": 1.125, | |
| "reward_std": 0.5721721649169922, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.0965376885400878, | |
| "sampling/importance_sampling_ratio/max": 2.991487741470337, | |
| "sampling/importance_sampling_ratio/mean": 0.9485093951225281, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.249042510986328, | |
| "sampling/sampling_logp_difference/mean": 0.1910654902458191, | |
| "step": 154, | |
| "step_time": 107.22631569998339 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2607.0, | |
| "completions/mean_length": 1154.984375, | |
| "completions/mean_terminated_length": 958.9166870117188, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6965581178665161, | |
| "epoch": 0.3817733990147783, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009454443083222703, | |
| "kl": 0.0019190027960576117, | |
| "learning_rate": 4.896232561719011e-05, | |
| "loss": -0.031025201082229614, | |
| "num_tokens": 23414421.0, | |
| "reward": 1.1484375, | |
| "reward_std": 0.8576439023017883, | |
| "rewards/reward_func/mean": 0.12760416666666666, | |
| "rewards/reward_func/std": 0.16676429907480875, | |
| "sampling/importance_sampling_ratio/max": 2.9999499320983887, | |
| "sampling/importance_sampling_ratio/mean": 0.9523643851280212, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.081517219543457, | |
| "sampling/sampling_logp_difference/mean": 0.1926102340221405, | |
| "step": 155, | |
| "step_time": 137.01870591682382 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2384.0, | |
| "completions/max_terminated_length": 2384.0, | |
| "completions/mean_length": 650.34375, | |
| "completions/mean_terminated_length": 646.9677124023438, | |
| "completions/min_length": 76.0, | |
| "completions/min_terminated_length": 76.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6660396009683609, | |
| "epoch": 0.3842364532019704, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012736599143800455, | |
| "kl": 0.0020014344772789627, | |
| "learning_rate": 4.8948449161925304e-05, | |
| "loss": -0.031402587890625, | |
| "num_tokens": 23538395.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.75, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.12056290772226122, | |
| "sampling/importance_sampling_ratio/max": 2.99924635887146, | |
| "sampling/importance_sampling_ratio/mean": 0.9617007374763489, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.430522918701172, | |
| "sampling/sampling_logp_difference/mean": 0.16973595321178436, | |
| "step": 156, | |
| "step_time": 74.7309046103619 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3604.0, | |
| "completions/mean_length": 832.75, | |
| "completions/mean_terminated_length": 672.2622680664062, | |
| "completions/min_length": 123.0, | |
| "completions/min_terminated_length": 123.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6982234865427017, | |
| "epoch": 0.3866995073891626, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01619216229721582, | |
| "kl": 0.002053438453003764, | |
| "learning_rate": 4.893448253225111e-05, | |
| "loss": -0.09056590497493744, | |
| "num_tokens": 23664155.0, | |
| "reward": 1.2421875, | |
| "reward_std": 0.9041008353233337, | |
| "rewards/reward_func/mean": 0.13802083333333334, | |
| "rewards/reward_func/std": 0.1453225099378162, | |
| "sampling/importance_sampling_ratio/max": 2.998002290725708, | |
| "sampling/importance_sampling_ratio/mean": 0.9607492089271545, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.44827938079834, | |
| "sampling/sampling_logp_difference/mean": 0.17780989408493042, | |
| "step": 157, | |
| "step_time": 145.718919953797 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2861.0, | |
| "completions/mean_length": 953.4375, | |
| "completions/mean_terminated_length": 873.1034545898438, | |
| "completions/min_length": 139.0, | |
| "completions/min_terminated_length": 139.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7421602457761765, | |
| "epoch": 0.3891625615763547, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01641507777336797, | |
| "kl": 0.0034244628623127937, | |
| "learning_rate": 4.892042578075685e-05, | |
| "loss": -0.14567188918590546, | |
| "num_tokens": 23815959.0, | |
| "reward": 1.02734375, | |
| "reward_std": 0.7044602632522583, | |
| "rewards/reward_func/mean": 0.11414930555555555, | |
| "rewards/reward_func/std": 0.1274808877044254, | |
| "sampling/importance_sampling_ratio/max": 2.9998927116394043, | |
| "sampling/importance_sampling_ratio/mean": 0.9466952085494995, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.74369239807129, | |
| "sampling/sampling_logp_difference/mean": 0.21085965633392334, | |
| "step": 158, | |
| "step_time": 156.47580430214293 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2804.0, | |
| "completions/mean_length": 1197.21875, | |
| "completions/mean_terminated_length": 1047.27587890625, | |
| "completions/min_length": 23.0, | |
| "completions/min_terminated_length": 101.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6783465445041656, | |
| "epoch": 0.3916256157635468, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009919495589656527, | |
| "kl": 0.0024570394889451563, | |
| "learning_rate": 4.8906278960371176e-05, | |
| "loss": -0.07994688302278519, | |
| "num_tokens": 23980373.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.783943772315979, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.15668830606672499, | |
| "sampling/importance_sampling_ratio/max": 2.998675584793091, | |
| "sampling/importance_sampling_ratio/mean": 0.9550249576568604, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.630230903625488, | |
| "sampling/sampling_logp_difference/mean": 0.18413835763931274, | |
| "step": 159, | |
| "step_time": 127.59175217780285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4006.0, | |
| "completions/mean_length": 1115.921875, | |
| "completions/mean_terminated_length": 1032.8834228515625, | |
| "completions/min_length": 165.0, | |
| "completions/min_terminated_length": 165.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6052042990922928, | |
| "epoch": 0.39408866995073893, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.008819283784252606, | |
| "kl": 0.0022560503566637635, | |
| "learning_rate": 4.889204212436189e-05, | |
| "loss": 0.017924707382917404, | |
| "num_tokens": 24143456.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.8104656338691711, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.13666501144568124, | |
| "sampling/importance_sampling_ratio/max": 2.9991002082824707, | |
| "sampling/importance_sampling_ratio/mean": 0.9468730688095093, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.852204322814941, | |
| "sampling/sampling_logp_difference/mean": 0.19665727019309998, | |
| "step": 160, | |
| "step_time": 138.74941563908942 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3037.0, | |
| "completions/mean_length": 1053.40625, | |
| "completions/mean_terminated_length": 1011.666748046875, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6857775151729584, | |
| "epoch": 0.39655172413793105, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011535143162337612, | |
| "kl": 0.0021167479280848056, | |
| "learning_rate": 4.8877715326335735e-05, | |
| "loss": 0.00892894808202982, | |
| "num_tokens": 24309706.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.6591842174530029, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.11478952235645717, | |
| "sampling/importance_sampling_ratio/max": 2.9970932006835938, | |
| "sampling/importance_sampling_ratio/mean": 0.9446910619735718, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.179174423217773, | |
| "sampling/sampling_logp_difference/mean": 0.21110796928405762, | |
| "step": 161, | |
| "step_time": 121.95381436008029 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1793.0, | |
| "completions/mean_length": 797.953125, | |
| "completions/mean_terminated_length": 697.1000366210938, | |
| "completions/min_length": 4.0, | |
| "completions/min_terminated_length": 254.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6954528391361237, | |
| "epoch": 0.39901477832512317, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.003989546208282082, | |
| "kl": 0.002669573645107448, | |
| "learning_rate": 4.886329862023818e-05, | |
| "loss": -0.021821074187755585, | |
| "num_tokens": 24441415.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.4466690719127655, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.06591926680670844, | |
| "sampling/importance_sampling_ratio/max": 2.9920241832733154, | |
| "sampling/importance_sampling_ratio/mean": 0.9608190059661865, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.830057144165039, | |
| "sampling/sampling_logp_difference/mean": 0.17658987641334534, | |
| "step": 162, | |
| "step_time": 123.86876834277064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3498.0, | |
| "completions/mean_length": 1097.328125, | |
| "completions/mean_terminated_length": 980.2105102539062, | |
| "completions/min_length": 135.0, | |
| "completions/min_terminated_length": 135.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6942517310380936, | |
| "epoch": 0.4014778325123153, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012712878344654979, | |
| "kl": 0.002415290189674124, | |
| "learning_rate": 4.884879206035324e-05, | |
| "loss": -0.01825246773660183, | |
| "num_tokens": 24603996.0, | |
| "reward": 1.29296875, | |
| "reward_std": 1.1176388263702393, | |
| "rewards/reward_func/mean": 0.14366319444444445, | |
| "rewards/reward_func/std": 0.1814569118950102, | |
| "sampling/importance_sampling_ratio/max": 2.9947588443756104, | |
| "sampling/importance_sampling_ratio/mean": 0.9565131068229675, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.88363265991211, | |
| "sampling/sampling_logp_difference/mean": 0.18692165613174438, | |
| "step": 163, | |
| "step_time": 127.92860764102079 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3682.0, | |
| "completions/mean_length": 1054.828125, | |
| "completions/mean_terminated_length": 940.3167114257812, | |
| "completions/min_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7391787171363831, | |
| "epoch": 0.4039408866995074, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009065769445882707, | |
| "kl": 0.0027781289536505938, | |
| "learning_rate": 4.883419570130327e-05, | |
| "loss": -0.10061228275299072, | |
| "num_tokens": 24766177.0, | |
| "reward": 1.01171875, | |
| "reward_std": 0.7746347188949585, | |
| "rewards/reward_func/mean": 0.11241319444444445, | |
| "rewards/reward_func/std": 0.1506018704838223, | |
| "sampling/importance_sampling_ratio/max": 2.998229742050171, | |
| "sampling/importance_sampling_ratio/mean": 0.9475909471511841, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.99682903289795, | |
| "sampling/sampling_logp_difference/mean": 0.20837771892547607, | |
| "step": 164, | |
| "step_time": 125.48444975796156 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3837.0, | |
| "completions/max_terminated_length": 3837.0, | |
| "completions/mean_length": 570.28125, | |
| "completions/mean_terminated_length": 571.3809814453125, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7563356757164001, | |
| "epoch": 0.4064039408866995, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02071127784172916, | |
| "kl": 0.004994380171410739, | |
| "learning_rate": 4.881950959804874e-05, | |
| "loss": 0.019023362547159195, | |
| "num_tokens": 24885267.0, | |
| "reward": 1.1328125, | |
| "reward_std": 0.9728137254714966, | |
| "rewards/reward_func/mean": 0.12586805555555555, | |
| "rewards/reward_func/std": 0.1605023874176873, | |
| "sampling/importance_sampling_ratio/max": 2.9999263286590576, | |
| "sampling/importance_sampling_ratio/mean": 0.9563567638397217, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.745625495910645, | |
| "sampling/sampling_logp_difference/mean": 0.19455267488956451, | |
| "step": 165, | |
| "step_time": 131.7907678987831 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3038.0, | |
| "completions/mean_length": 844.078125, | |
| "completions/mean_terminated_length": 792.4603881835938, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6852337867021561, | |
| "epoch": 0.4088669950738916, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.02057318733791968, | |
| "kl": 0.002958475728519261, | |
| "learning_rate": 4.8804733805888024e-05, | |
| "loss": -0.11288006603717804, | |
| "num_tokens": 25024104.0, | |
| "reward": 1.21484375, | |
| "reward_std": 0.9320859313011169, | |
| "rewards/reward_func/mean": 0.1349826388888889, | |
| "rewards/reward_func/std": 0.1531604164176517, | |
| "sampling/importance_sampling_ratio/max": 2.9926633834838867, | |
| "sampling/importance_sampling_ratio/mean": 0.9570341110229492, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.6238431930542, | |
| "sampling/sampling_logp_difference/mean": 0.19065096974372864, | |
| "step": 166, | |
| "step_time": 130.00207916204818 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3418.0, | |
| "completions/mean_length": 1142.28125, | |
| "completions/mean_terminated_length": 1058.458984375, | |
| "completions/min_length": 237.0, | |
| "completions/min_terminated_length": 237.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7697890400886536, | |
| "epoch": 0.41133004926108374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02092271355309941, | |
| "kl": 0.004163852776400745, | |
| "learning_rate": 4.8789868380457246e-05, | |
| "loss": -0.1456824094057083, | |
| "num_tokens": 25180410.0, | |
| "reward": 1.30078125, | |
| "reward_std": 1.069211721420288, | |
| "rewards/reward_func/mean": 0.14453125, | |
| "rewards/reward_func/std": 0.17918562557962206, | |
| "sampling/importance_sampling_ratio/max": 2.9986627101898193, | |
| "sampling/importance_sampling_ratio/mean": 0.9531635046005249, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.748177528381348, | |
| "sampling/sampling_logp_difference/mean": 0.1931055784225464, | |
| "step": 167, | |
| "step_time": 123.96054100710899 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2924.0, | |
| "completions/mean_length": 1033.203125, | |
| "completions/mean_terminated_length": 886.9661254882812, | |
| "completions/min_length": 98.0, | |
| "completions/min_terminated_length": 118.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6940324157476425, | |
| "epoch": 0.41379310344827586, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00879309156772693, | |
| "kl": 0.003143699432257563, | |
| "learning_rate": 4.8774913377729994e-05, | |
| "loss": 0.009899081662297249, | |
| "num_tokens": 25344935.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.9983788132667542, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.1805433001783159, | |
| "sampling/importance_sampling_ratio/max": 2.9867444038391113, | |
| "sampling/importance_sampling_ratio/mean": 0.9476098418235779, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 21.72199058532715, | |
| "sampling/sampling_logp_difference/mean": 0.20125585794448853, | |
| "step": 168, | |
| "step_time": 133.38061838923022 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3877.0, | |
| "completions/max_terminated_length": 3877.0, | |
| "completions/mean_length": 994.78125, | |
| "completions/mean_terminated_length": 994.78125, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7939935922622681, | |
| "epoch": 0.41625615763546797, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01507058902435564, | |
| "kl": 0.00311278022127226, | |
| "learning_rate": 4.875986885401717e-05, | |
| "loss": 0.033397216349840164, | |
| "num_tokens": 25496649.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.9045379161834717, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.15391069816218483, | |
| "sampling/importance_sampling_ratio/max": 2.9987685680389404, | |
| "sampling/importance_sampling_ratio/mean": 0.9507143497467041, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.1064453125, | |
| "sampling/sampling_logp_difference/mean": 0.21293172240257263, | |
| "step": 169, | |
| "step_time": 109.43428984982893 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 1882.0, | |
| "completions/max_terminated_length": 1882.0, | |
| "completions/mean_length": 777.046875, | |
| "completions/mean_terminated_length": 784.7704467773438, | |
| "completions/min_length": 199.0, | |
| "completions/min_terminated_length": 199.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6964544653892517, | |
| "epoch": 0.4187192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018781297681167743, | |
| "kl": 0.003129941411316395, | |
| "learning_rate": 4.874473486596672e-05, | |
| "loss": -0.03748438134789467, | |
| "num_tokens": 25655852.0, | |
| "reward": 1.1796875, | |
| "reward_std": 0.9692379236221313, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.16774308350351122, | |
| "sampling/importance_sampling_ratio/max": 2.998063564300537, | |
| "sampling/importance_sampling_ratio/mean": 0.9458487629890442, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.899086952209473, | |
| "sampling/sampling_logp_difference/mean": 0.2181737720966339, | |
| "step": 170, | |
| "step_time": 72.11048253579065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3049.0, | |
| "completions/mean_length": 987.90625, | |
| "completions/mean_terminated_length": 896.5409545898438, | |
| "completions/min_length": 222.0, | |
| "completions/min_terminated_length": 222.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7009437531232834, | |
| "epoch": 0.4211822660098522, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0162183247531834, | |
| "kl": 0.003935195156373084, | |
| "learning_rate": 4.8729511470563514e-05, | |
| "loss": 0.026689358055591583, | |
| "num_tokens": 25804854.0, | |
| "reward": 1.34375, | |
| "reward_std": 1.4103500843048096, | |
| "rewards/reward_func/mean": 0.14930555555555555, | |
| "rewards/reward_func/std": 0.20477482179800668, | |
| "sampling/importance_sampling_ratio/max": 2.9969730377197266, | |
| "sampling/importance_sampling_ratio/mean": 0.95501309633255, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.797805786132812, | |
| "sampling/sampling_logp_difference/mean": 0.18509003520011902, | |
| "step": 171, | |
| "step_time": 154.1194182871841 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3533.0, | |
| "completions/max_terminated_length": 3533.0, | |
| "completions/mean_length": 767.1875, | |
| "completions/mean_terminated_length": 738.4762573242188, | |
| "completions/min_length": 142.0, | |
| "completions/min_terminated_length": 142.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6789942979812622, | |
| "epoch": 0.4236453201970443, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02484970387824421, | |
| "kl": 0.0039458731771446764, | |
| "learning_rate": 4.871419872512901e-05, | |
| "loss": -0.0457112193107605, | |
| "num_tokens": 25940050.0, | |
| "reward": 1.359375, | |
| "reward_std": 1.1179230213165283, | |
| "rewards/reward_func/mean": 0.15104166666666666, | |
| "rewards/reward_func/std": 0.1697574125395881, | |
| "sampling/importance_sampling_ratio/max": 2.999643087387085, | |
| "sampling/importance_sampling_ratio/mean": 0.9572591185569763, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.05351734161377, | |
| "sampling/sampling_logp_difference/mean": 0.18504132330417633, | |
| "step": 172, | |
| "step_time": 104.09425508067943 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2393.0, | |
| "completions/mean_length": 739.171875, | |
| "completions/mean_terminated_length": 673.1638793945312, | |
| "completions/min_length": 136.0, | |
| "completions/min_terminated_length": 136.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6542257070541382, | |
| "epoch": 0.42610837438423643, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022307858591524333, | |
| "kl": 0.0056029813713394105, | |
| "learning_rate": 4.869879668732115e-05, | |
| "loss": -0.04914276301860809, | |
| "num_tokens": 26065581.0, | |
| "reward": 1.3828125, | |
| "reward_std": 1.1529541015625, | |
| "rewards/reward_func/mean": 0.15364583333333334, | |
| "rewards/reward_func/std": 0.18794474667972988, | |
| "sampling/importance_sampling_ratio/max": 2.9967164993286133, | |
| "sampling/importance_sampling_ratio/mean": 0.9573915004730225, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.686840057373047, | |
| "sampling/sampling_logp_difference/mean": 0.1825268715620041, | |
| "step": 173, | |
| "step_time": 119.28080543805845 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1752.0, | |
| "completions/mean_length": 713.21875, | |
| "completions/mean_terminated_length": 601.1802978515625, | |
| "completions/min_length": 90.0, | |
| "completions/min_terminated_length": 90.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6988493353128433, | |
| "epoch": 0.42857142857142855, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.018969040626113362, | |
| "kl": 0.004114088660571724, | |
| "learning_rate": 4.868330541513405e-05, | |
| "loss": -0.03230910003185272, | |
| "num_tokens": 26194555.0, | |
| "reward": 1.28125, | |
| "reward_std": 1.0249855518341064, | |
| "rewards/reward_func/mean": 0.1423611111111111, | |
| "rewards/reward_func/std": 0.16953420970175, | |
| "sampling/importance_sampling_ratio/max": 2.997673511505127, | |
| "sampling/importance_sampling_ratio/mean": 0.954637885093689, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.62595272064209, | |
| "sampling/sampling_logp_difference/mean": 0.19148147106170654, | |
| "step": 174, | |
| "step_time": 124.16180285089649 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4022.0, | |
| "completions/max_terminated_length": 4022.0, | |
| "completions/mean_length": 1148.6875, | |
| "completions/mean_terminated_length": 1197.5423583984375, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 352.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6463498473167419, | |
| "epoch": 0.43103448275862066, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015462004127244699, | |
| "kl": 0.003992202051449567, | |
| "learning_rate": 4.866772496689787e-05, | |
| "loss": -0.02261742576956749, | |
| "num_tokens": 26363159.0, | |
| "reward": 1.3203125, | |
| "reward_std": 1.0796353816986084, | |
| "rewards/reward_func/mean": 0.1467013888888889, | |
| "rewards/reward_func/std": 0.19009446766641405, | |
| "sampling/importance_sampling_ratio/max": 2.9993505477905273, | |
| "sampling/importance_sampling_ratio/mean": 0.9478614926338196, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.764888763427734, | |
| "sampling/sampling_logp_difference/mean": 0.1974748969078064, | |
| "step": 175, | |
| "step_time": 127.3127924175933 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4002.0, | |
| "completions/mean_length": 1293.46875, | |
| "completions/mean_terminated_length": 1174.0, | |
| "completions/min_length": 8.0, | |
| "completions/min_terminated_length": 287.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6410723924636841, | |
| "epoch": 0.43349753694581283, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015568149632948742, | |
| "kl": 0.005730267730541527, | |
| "learning_rate": 4.865205540127851e-05, | |
| "loss": 0.02572108432650566, | |
| "num_tokens": 26549781.0, | |
| "reward": 1.625, | |
| "reward_std": 1.3153549432754517, | |
| "rewards/reward_func/mean": 0.18055555555555555, | |
| "rewards/reward_func/std": 0.20540823373529646, | |
| "sampling/importance_sampling_ratio/max": 2.9966816902160645, | |
| "sampling/importance_sampling_ratio/mean": 0.9409160017967224, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.302165985107422, | |
| "sampling/sampling_logp_difference/mean": 0.20651187002658844, | |
| "step": 176, | |
| "step_time": 149.82990049594082 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2861.0, | |
| "completions/mean_length": 1170.84375, | |
| "completions/mean_terminated_length": 1078.704833984375, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7377604842185974, | |
| "epoch": 0.43596059113300495, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022361029088014617, | |
| "kl": 0.004049715644214302, | |
| "learning_rate": 4.863629677727745e-05, | |
| "loss": -0.07867275178432465, | |
| "num_tokens": 26716651.0, | |
| "reward": 1.78515625, | |
| "reward_std": 1.599768877029419, | |
| "rewards/reward_func/mean": 0.19835069444444445, | |
| "rewards/reward_func/std": 0.23245189090569815, | |
| "sampling/importance_sampling_ratio/max": 2.999277114868164, | |
| "sampling/importance_sampling_ratio/mean": 0.9434698224067688, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.033159255981445, | |
| "sampling/sampling_logp_difference/mean": 0.20777705311775208, | |
| "step": 177, | |
| "step_time": 124.1001110309735 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3152.0, | |
| "completions/max_terminated_length": 3152.0, | |
| "completions/mean_length": 830.140625, | |
| "completions/mean_terminated_length": 833.5238647460938, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6503562480211258, | |
| "epoch": 0.43842364532019706, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023577480739962978, | |
| "kl": 0.006311332923360169, | |
| "learning_rate": 4.862044915423149e-05, | |
| "loss": -0.0761241614818573, | |
| "num_tokens": 26848820.0, | |
| "reward": 1.609375, | |
| "reward_std": 1.490628719329834, | |
| "rewards/reward_func/mean": 0.17881944444444445, | |
| "rewards/reward_func/std": 0.24757508436838785, | |
| "sampling/importance_sampling_ratio/max": 2.9963910579681396, | |
| "sampling/importance_sampling_ratio/mean": 0.9584037065505981, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.9031982421875, | |
| "sampling/sampling_logp_difference/mean": 0.16852861642837524, | |
| "step": 178, | |
| "step_time": 88.4958158947993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 3069.0, | |
| "completions/max_terminated_length": 3069.0, | |
| "completions/mean_length": 822.015625, | |
| "completions/mean_terminated_length": 837.4745483398438, | |
| "completions/min_length": 19.0, | |
| "completions/min_terminated_length": 227.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7814570516347885, | |
| "epoch": 0.4408866995073892, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029061251701778403, | |
| "kl": 0.0067477618576958776, | |
| "learning_rate": 4.860451259181259e-05, | |
| "loss": -0.05121511220932007, | |
| "num_tokens": 26985429.0, | |
| "reward": 1.609375, | |
| "reward_std": 1.5228908061981201, | |
| "rewards/reward_func/mean": 0.17881944444444445, | |
| "rewards/reward_func/std": 0.22272776729530758, | |
| "sampling/importance_sampling_ratio/max": 2.9979407787323, | |
| "sampling/importance_sampling_ratio/mean": 0.9471230506896973, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.142129898071289, | |
| "sampling/sampling_logp_difference/mean": 0.2147315889596939, | |
| "step": 179, | |
| "step_time": 145.50826694909483 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3252.0, | |
| "completions/mean_length": 1040.078125, | |
| "completions/mean_terminated_length": 991.5714721679688, | |
| "completions/min_length": 80.0, | |
| "completions/min_terminated_length": 80.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6426051408052444, | |
| "epoch": 0.4433497536945813, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02264846469626092, | |
| "kl": 0.008284276816993952, | |
| "learning_rate": 4.8588487150027514e-05, | |
| "loss": -0.011647295206785202, | |
| "num_tokens": 27128874.0, | |
| "reward": 1.59765625, | |
| "reward_std": 1.513574242591858, | |
| "rewards/reward_func/mean": 0.1775173611111111, | |
| "rewards/reward_func/std": 0.2384296092722151, | |
| "sampling/importance_sampling_ratio/max": 2.9980435371398926, | |
| "sampling/importance_sampling_ratio/mean": 0.954474687576294, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.294371604919434, | |
| "sampling/sampling_logp_difference/mean": 0.1824147254228592, | |
| "step": 180, | |
| "step_time": 139.90463946410455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3485.0, | |
| "completions/mean_length": 946.453125, | |
| "completions/mean_terminated_length": 816.4385986328125, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6580641269683838, | |
| "epoch": 0.4458128078817734, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01917106050891827, | |
| "kl": 0.00607175484765321, | |
| "learning_rate": 4.8572372889217776e-05, | |
| "loss": 0.13771361112594604, | |
| "num_tokens": 27269127.0, | |
| "reward": 1.734375, | |
| "reward_std": 1.7643263339996338, | |
| "rewards/reward_func/mean": 0.19270833333333334, | |
| "rewards/reward_func/std": 0.2612364457713233, | |
| "sampling/importance_sampling_ratio/max": 2.999809741973877, | |
| "sampling/importance_sampling_ratio/mean": 0.9558506608009338, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.685833930969238, | |
| "sampling/sampling_logp_difference/mean": 0.18209989368915558, | |
| "step": 181, | |
| "step_time": 122.68162017525174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3650.0, | |
| "completions/mean_length": 1262.5625, | |
| "completions/mean_terminated_length": 1193.482177734375, | |
| "completions/min_length": 137.0, | |
| "completions/min_terminated_length": 137.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6180784702301025, | |
| "epoch": 0.4482758620689655, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014420698262174875, | |
| "kl": 0.004996940493583679, | |
| "learning_rate": 4.855616987005926e-05, | |
| "loss": 0.07041345536708832, | |
| "num_tokens": 27439019.0, | |
| "reward": 1.34765625, | |
| "reward_std": 1.241332769393921, | |
| "rewards/reward_func/mean": 0.14973958333333334, | |
| "rewards/reward_func/std": 0.1866938124100367, | |
| "sampling/importance_sampling_ratio/max": 2.9979941844940186, | |
| "sampling/importance_sampling_ratio/mean": 0.9477935433387756, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.707728385925293, | |
| "sampling/sampling_logp_difference/mean": 0.1886201798915863, | |
| "step": 182, | |
| "step_time": 124.8088491272647 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3871.0, | |
| "completions/mean_length": 1215.671875, | |
| "completions/mean_terminated_length": 1141.0509033203125, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.757892832159996, | |
| "epoch": 0.45073891625615764, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023065960992914185, | |
| "kl": 0.0071741442661732435, | |
| "learning_rate": 4.853987815356211e-05, | |
| "loss": -0.025667782872915268, | |
| "num_tokens": 27599814.0, | |
| "reward": 1.859375, | |
| "reward_std": 1.7388391494750977, | |
| "rewards/reward_func/mean": 0.2065972222222222, | |
| "rewards/reward_func/std": 0.2955647044711643, | |
| "sampling/importance_sampling_ratio/max": 2.9989538192749023, | |
| "sampling/importance_sampling_ratio/mean": 0.9460509419441223, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.060905456542969, | |
| "sampling/sampling_logp_difference/mean": 0.2087656557559967, | |
| "step": 183, | |
| "step_time": 126.53920693183318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3071.0, | |
| "completions/max_terminated_length": 3071.0, | |
| "completions/mean_length": 872.046875, | |
| "completions/mean_terminated_length": 890.5322265625, | |
| "completions/min_length": 44.0, | |
| "completions/min_terminated_length": 164.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7055705040693283, | |
| "epoch": 0.45320197044334976, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028719236256871318, | |
| "kl": 0.006613429985009134, | |
| "learning_rate": 4.8523497801070394e-05, | |
| "loss": -0.09070068597793579, | |
| "num_tokens": 27741817.0, | |
| "reward": 1.37890625, | |
| "reward_std": 1.2809596061706543, | |
| "rewards/reward_func/mean": 0.15321180555555555, | |
| "rewards/reward_func/std": 0.19664881295628017, | |
| "sampling/importance_sampling_ratio/max": 2.988349676132202, | |
| "sampling/importance_sampling_ratio/mean": 0.9532193541526794, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.999823570251465, | |
| "sampling/sampling_logp_difference/mean": 0.19390341639518738, | |
| "step": 184, | |
| "step_time": 95.91285739769228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3533.0, | |
| "completions/mean_length": 1271.171875, | |
| "completions/mean_terminated_length": 1111.0, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6977435797452927, | |
| "epoch": 0.45566502463054187, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016889524252555597, | |
| "kl": 0.007070304825901985, | |
| "learning_rate": 4.8507028874261965e-05, | |
| "loss": -0.0027643460780382156, | |
| "num_tokens": 27911140.0, | |
| "reward": 1.4140625, | |
| "reward_std": 1.3064391613006592, | |
| "rewards/reward_func/mean": 0.15711805555555555, | |
| "rewards/reward_func/std": 0.18846042454242706, | |
| "sampling/importance_sampling_ratio/max": 2.9996609687805176, | |
| "sampling/importance_sampling_ratio/mean": 0.9465343952178955, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.93674087524414, | |
| "sampling/sampling_logp_difference/mean": 0.19495807588100433, | |
| "step": 185, | |
| "step_time": 189.83484322600998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3173.0, | |
| "completions/mean_length": 1272.578125, | |
| "completions/mean_terminated_length": 1022.8245849609375, | |
| "completions/min_length": 434.0, | |
| "completions/min_terminated_length": 434.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.647504448890686, | |
| "epoch": 0.458128078817734, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017992669360486515, | |
| "kl": 0.006825581775046885, | |
| "learning_rate": 4.8490471435148174e-05, | |
| "loss": -0.038769882172346115, | |
| "num_tokens": 28083673.0, | |
| "reward": 1.40234375, | |
| "reward_std": 1.4055002927780151, | |
| "rewards/reward_func/mean": 0.1558159722222222, | |
| "rewards/reward_func/std": 0.22071651286549038, | |
| "sampling/importance_sampling_ratio/max": 2.999582290649414, | |
| "sampling/importance_sampling_ratio/mean": 0.9500458240509033, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.749305725097656, | |
| "sampling/sampling_logp_difference/mean": 0.18807470798492432, | |
| "step": 186, | |
| "step_time": 131.8277764460072 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3862.0, | |
| "completions/mean_length": 1178.859375, | |
| "completions/mean_terminated_length": 997.6551513671875, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6585134118795395, | |
| "epoch": 0.4605911330049261, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018530615661706835, | |
| "kl": 0.007103452226147056, | |
| "learning_rate": 4.8473825546073656e-05, | |
| "loss": 0.06069903075695038, | |
| "num_tokens": 28230928.0, | |
| "reward": 1.61328125, | |
| "reward_std": 1.4302082061767578, | |
| "rewards/reward_func/mean": 0.1792534722222222, | |
| "rewards/reward_func/std": 0.2170476433303621, | |
| "sampling/importance_sampling_ratio/max": 2.997567653656006, | |
| "sampling/importance_sampling_ratio/mean": 0.9533196687698364, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.04468822479248, | |
| "sampling/sampling_logp_difference/mean": 0.17806966602802277, | |
| "step": 187, | |
| "step_time": 121.11668449593708 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4063.0, | |
| "completions/mean_length": 870.734375, | |
| "completions/mean_terminated_length": 752.4035034179688, | |
| "completions/min_length": 72.0, | |
| "completions/min_terminated_length": 72.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7276755422353745, | |
| "epoch": 0.4630541871921182, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017478474676854153, | |
| "kl": 0.007267383160069585, | |
| "learning_rate": 4.845709126971609e-05, | |
| "loss": -0.03535791486501694, | |
| "num_tokens": 28369791.0, | |
| "reward": 1.40234375, | |
| "reward_std": 1.1874151229858398, | |
| "rewards/reward_func/mean": 0.1558159722222222, | |
| "rewards/reward_func/std": 0.22231183614995745, | |
| "sampling/importance_sampling_ratio/max": 2.9939093589782715, | |
| "sampling/importance_sampling_ratio/mean": 0.9565584659576416, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.437378883361816, | |
| "sampling/sampling_logp_difference/mean": 0.18134385347366333, | |
| "step": 188, | |
| "step_time": 131.01593620865606 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3208.0, | |
| "completions/mean_length": 1342.59375, | |
| "completions/mean_terminated_length": 1030.732177734375, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0366933643817902, | |
| "epoch": 0.46551724137931033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01591276787266667, | |
| "kl": 0.006096535362303257, | |
| "learning_rate": 4.844026866908595e-05, | |
| "loss": -0.040912821888923645, | |
| "num_tokens": 28552389.0, | |
| "reward": 1.4609375, | |
| "reward_std": 1.260840892791748, | |
| "rewards/reward_func/mean": 0.1623263888888889, | |
| "rewards/reward_func/std": 0.2152951161066691, | |
| "sampling/importance_sampling_ratio/max": 2.9983325004577637, | |
| "sampling/importance_sampling_ratio/mean": 0.939391016960144, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.413434028625488, | |
| "sampling/sampling_logp_difference/mean": 0.2270013391971588, | |
| "step": 189, | |
| "step_time": 135.0051975690294 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3871.0, | |
| "completions/mean_length": 1380.671875, | |
| "completions/mean_terminated_length": 1252.2203369140625, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7188487350940704, | |
| "epoch": 0.46798029556650245, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01357299206191558, | |
| "kl": 0.005054897745139897, | |
| "learning_rate": 4.8423357807526325e-05, | |
| "loss": -0.09534484893083572, | |
| "num_tokens": 28729328.0, | |
| "reward": 1.35546875, | |
| "reward_std": 1.248653769493103, | |
| "rewards/reward_func/mean": 0.1506076388888889, | |
| "rewards/reward_func/std": 0.20661381714873844, | |
| "sampling/importance_sampling_ratio/max": 2.997480630874634, | |
| "sampling/importance_sampling_ratio/mean": 0.9424310326576233, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.062469482421875, | |
| "sampling/sampling_logp_difference/mean": 0.20740850269794464, | |
| "step": 190, | |
| "step_time": 130.3345901852008 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3946.0, | |
| "completions/mean_length": 1181.046875, | |
| "completions/mean_terminated_length": 1046.8333740234375, | |
| "completions/min_length": 249.0, | |
| "completions/min_terminated_length": 249.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.9121982902288437, | |
| "epoch": 0.47044334975369456, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01961347417559075, | |
| "kl": 0.006588997668586671, | |
| "learning_rate": 4.840635874871259e-05, | |
| "loss": -0.0807374119758606, | |
| "num_tokens": 28899123.0, | |
| "reward": 1.3671875, | |
| "reward_std": 1.382263422012329, | |
| "rewards/reward_func/mean": 0.1519097222222222, | |
| "rewards/reward_func/std": 0.20543800791104636, | |
| "sampling/importance_sampling_ratio/max": 2.998202323913574, | |
| "sampling/importance_sampling_ratio/mean": 0.9454882144927979, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.819480895996094, | |
| "sampling/sampling_logp_difference/mean": 0.21896135807037354, | |
| "step": 191, | |
| "step_time": 128.1972416790668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3309.0, | |
| "completions/mean_length": 920.1875, | |
| "completions/mean_terminated_length": 869.3933715820312, | |
| "completions/min_length": 132.0, | |
| "completions/min_terminated_length": 132.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7038512676954269, | |
| "epoch": 0.4729064039408867, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027061634852367587, | |
| "kl": 0.00604074785951525, | |
| "learning_rate": 4.838927155665225e-05, | |
| "loss": -0.05432787165045738, | |
| "num_tokens": 29032895.0, | |
| "reward": 1.53515625, | |
| "reward_std": 1.5058531761169434, | |
| "rewards/reward_func/mean": 0.17057291666666666, | |
| "rewards/reward_func/std": 0.2294796390665902, | |
| "sampling/importance_sampling_ratio/max": 2.9994328022003174, | |
| "sampling/importance_sampling_ratio/mean": 0.9593228101730347, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.059528350830078, | |
| "sampling/sampling_logp_difference/mean": 0.17926844954490662, | |
| "step": 192, | |
| "step_time": 115.40231793024577 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3525.0, | |
| "completions/mean_length": 781.296875, | |
| "completions/mean_terminated_length": 706.6271362304688, | |
| "completions/min_length": 5.0, | |
| "completions/min_terminated_length": 170.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7098062336444855, | |
| "epoch": 0.4753694581280788, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.021032851847854764, | |
| "kl": 0.006787009071558714, | |
| "learning_rate": 4.837209629568462e-05, | |
| "loss": -0.049884945154190063, | |
| "num_tokens": 29161282.0, | |
| "reward": 1.4765625, | |
| "reward_std": 1.2374118566513062, | |
| "rewards/reward_func/mean": 0.1640625, | |
| "rewards/reward_func/std": 0.21430290904310015, | |
| "sampling/importance_sampling_ratio/max": 2.995041608810425, | |
| "sampling/importance_sampling_ratio/mean": 0.9568045735359192, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.902270317077637, | |
| "sampling/sampling_logp_difference/mean": 0.1803169846534729, | |
| "step": 193, | |
| "step_time": 116.49772782274522 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3673.0, | |
| "completions/mean_length": 1280.890625, | |
| "completions/mean_terminated_length": 1118.8035888671875, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 285.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6611451655626297, | |
| "epoch": 0.47783251231527096, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.011058275247570002, | |
| "kl": 0.004933293093927205, | |
| "learning_rate": 4.8354833030480674e-05, | |
| "loss": -0.033044856041669846, | |
| "num_tokens": 29337771.0, | |
| "reward": 1.2734375, | |
| "reward_std": 1.1723660230636597, | |
| "rewards/reward_func/mean": 0.14149305555555555, | |
| "rewards/reward_func/std": 0.23454364968670738, | |
| "sampling/importance_sampling_ratio/max": 2.9890265464782715, | |
| "sampling/importance_sampling_ratio/mean": 0.9440937042236328, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.013520240783691, | |
| "sampling/sampling_logp_difference/mean": 0.1958637237548828, | |
| "step": 194, | |
| "step_time": 130.9736714749597 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3675.0, | |
| "completions/mean_length": 1018.359375, | |
| "completions/mean_terminated_length": 917.458984375, | |
| "completions/min_length": 159.0, | |
| "completions/min_terminated_length": 159.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7075535953044891, | |
| "epoch": 0.4802955665024631, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02364851796391656, | |
| "kl": 0.005875613307580352, | |
| "learning_rate": 4.833748182604273e-05, | |
| "loss": -0.031076285988092422, | |
| "num_tokens": 29488930.0, | |
| "reward": 1.44921875, | |
| "reward_std": 1.387429118156433, | |
| "rewards/reward_func/mean": 0.16102430555555555, | |
| "rewards/reward_func/std": 0.2225587632921007, | |
| "sampling/importance_sampling_ratio/max": 2.9986376762390137, | |
| "sampling/importance_sampling_ratio/mean": 0.9525994062423706, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.249788284301758, | |
| "sampling/sampling_logp_difference/mean": 0.19980862736701965, | |
| "step": 195, | |
| "step_time": 137.75460224575363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2393.0, | |
| "completions/mean_length": 947.40625, | |
| "completions/mean_terminated_length": 851.4500732421875, | |
| "completions/min_length": 106.0, | |
| "completions/min_terminated_length": 106.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8509411960840225, | |
| "epoch": 0.4827586206896552, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02013512783074024, | |
| "kl": 0.0069408094277605414, | |
| "learning_rate": 4.832004274770422e-05, | |
| "loss": -0.05644526705145836, | |
| "num_tokens": 29629100.0, | |
| "reward": 1.24609375, | |
| "reward_std": 1.2813467979431152, | |
| "rewards/reward_func/mean": 0.1384548611111111, | |
| "rewards/reward_func/std": 0.21960993111133575, | |
| "sampling/importance_sampling_ratio/max": 2.996084451675415, | |
| "sampling/importance_sampling_ratio/mean": 0.9489051103591919, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.714995384216309, | |
| "sampling/sampling_logp_difference/mean": 0.21050521731376648, | |
| "step": 196, | |
| "step_time": 125.77825205796398 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3440.0, | |
| "completions/mean_length": 1200.953125, | |
| "completions/mean_terminated_length": 737.6481323242188, | |
| "completions/min_length": 219.0, | |
| "completions/min_terminated_length": 219.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7732942551374435, | |
| "epoch": 0.4852216748768473, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012400100349164389, | |
| "kl": 0.005751760327257216, | |
| "learning_rate": 4.8302515861129474e-05, | |
| "loss": -0.024515990167856216, | |
| "num_tokens": 29791833.0, | |
| "reward": 1.14453125, | |
| "reward_std": 1.1741297245025635, | |
| "rewards/reward_func/mean": 0.1271701388888889, | |
| "rewards/reward_func/std": 0.1953661491473516, | |
| "sampling/importance_sampling_ratio/max": 2.997990846633911, | |
| "sampling/importance_sampling_ratio/mean": 0.9584062099456787, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.944055557250977, | |
| "sampling/sampling_logp_difference/mean": 0.19072258472442627, | |
| "step": 197, | |
| "step_time": 126.20407959399745 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4087.0, | |
| "completions/mean_length": 1286.90625, | |
| "completions/mean_terminated_length": 1082.9830322265625, | |
| "completions/min_length": 62.0, | |
| "completions/min_terminated_length": 62.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7938503175973892, | |
| "epoch": 0.4876847290640394, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01640850667993589, | |
| "kl": 0.004297417588531971, | |
| "learning_rate": 4.828490123231342e-05, | |
| "loss": -0.06255729496479034, | |
| "num_tokens": 29961379.0, | |
| "reward": 1.2421875, | |
| "reward_std": 1.15467369556427, | |
| "rewards/reward_func/mean": 0.13802083333333334, | |
| "rewards/reward_func/std": 0.19956799844900766, | |
| "sampling/importance_sampling_ratio/max": 2.997150421142578, | |
| "sampling/importance_sampling_ratio/mean": 0.9441516995429993, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.244465827941895, | |
| "sampling/sampling_logp_difference/mean": 0.21875452995300293, | |
| "step": 198, | |
| "step_time": 122.48823585105129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1872.0, | |
| "completions/mean_length": 913.578125, | |
| "completions/mean_terminated_length": 778.440673828125, | |
| "completions/min_length": 250.0, | |
| "completions/min_terminated_length": 250.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7322440892457962, | |
| "epoch": 0.49014778325123154, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018865350006102854, | |
| "kl": 0.004258246102835983, | |
| "learning_rate": 4.8267198927581415e-05, | |
| "loss": 0.0023453934118151665, | |
| "num_tokens": 30104040.0, | |
| "reward": 1.2734375, | |
| "reward_std": 0.937235414981842, | |
| "rewards/reward_func/mean": 0.14149305555555555, | |
| "rewards/reward_func/std": 0.16726858417193094, | |
| "sampling/importance_sampling_ratio/max": 2.9988765716552734, | |
| "sampling/importance_sampling_ratio/mean": 0.9504024982452393, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.14719009399414, | |
| "sampling/sampling_logp_difference/mean": 0.2065386325120926, | |
| "step": 199, | |
| "step_time": 119.87218592292629 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3342.0, | |
| "completions/mean_length": 1433.75, | |
| "completions/mean_terminated_length": 1164.9454345703125, | |
| "completions/min_length": 117.0, | |
| "completions/min_terminated_length": 117.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6855715811252594, | |
| "epoch": 0.49261083743842365, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.010089758857000182, | |
| "kl": 0.004258447384927422, | |
| "learning_rate": 4.824940901358889e-05, | |
| "loss": -0.054729118943214417, | |
| "num_tokens": 30266920.0, | |
| "reward": 1.203125, | |
| "reward_std": 1.0195266008377075, | |
| "rewards/reward_func/mean": 0.13368055555555555, | |
| "rewards/reward_func/std": 0.1727140380276574, | |
| "sampling/importance_sampling_ratio/max": 2.9999141693115234, | |
| "sampling/importance_sampling_ratio/mean": 0.950796365737915, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.130544662475586, | |
| "sampling/sampling_logp_difference/mean": 0.19564782083034515, | |
| "step": 200, | |
| "step_time": 124.14298599702306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3757.0, | |
| "completions/mean_length": 1323.84375, | |
| "completions/mean_terminated_length": 1124.913818359375, | |
| "completions/min_length": 325.0, | |
| "completions/min_terminated_length": 325.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7940724641084671, | |
| "epoch": 0.49507389162561577, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013795859181479811, | |
| "kl": 0.004771111474838108, | |
| "learning_rate": 4.82315315573212e-05, | |
| "loss": -0.009237069636583328, | |
| "num_tokens": 30443342.0, | |
| "reward": 1.48828125, | |
| "reward_std": 1.2347720861434937, | |
| "rewards/reward_func/mean": 0.16536458333333334, | |
| "rewards/reward_func/std": 0.22848578625255161, | |
| "sampling/importance_sampling_ratio/max": 2.9994006156921387, | |
| "sampling/importance_sampling_ratio/mean": 0.9410449266433716, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.181767463684082, | |
| "sampling/sampling_logp_difference/mean": 0.22467756271362305, | |
| "step": 201, | |
| "step_time": 135.13719806890003 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2821.0, | |
| "completions/mean_length": 786.0625, | |
| "completions/mean_terminated_length": 716.9508056640625, | |
| "completions/min_length": 142.0, | |
| "completions/min_terminated_length": 142.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6506723165512085, | |
| "epoch": 0.4975369458128079, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02002007843057624, | |
| "kl": 0.0043604791280813515, | |
| "learning_rate": 4.8213566626093316e-05, | |
| "loss": -0.04014727473258972, | |
| "num_tokens": 30573922.0, | |
| "reward": 1.4140625, | |
| "reward_std": 1.193749189376831, | |
| "rewards/reward_func/mean": 0.15711805555555555, | |
| "rewards/reward_func/std": 0.18327190147505867, | |
| "sampling/importance_sampling_ratio/max": 2.996601104736328, | |
| "sampling/importance_sampling_ratio/mean": 0.9635285139083862, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.511571884155273, | |
| "sampling/sampling_logp_difference/mean": 0.15732041001319885, | |
| "step": 202, | |
| "step_time": 115.08086889213882 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2581.0, | |
| "completions/mean_length": 925.65625, | |
| "completions/mean_terminated_length": 805.4667358398438, | |
| "completions/min_length": 112.0, | |
| "completions/min_terminated_length": 112.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8921978324651718, | |
| "epoch": 0.5, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016980813259468056, | |
| "kl": 0.005670565296895802, | |
| "learning_rate": 4.819551428754957e-05, | |
| "loss": -0.04834799841046333, | |
| "num_tokens": 30718876.0, | |
| "reward": 1.2265625, | |
| "reward_std": 1.034828782081604, | |
| "rewards/reward_func/mean": 0.1362847222222222, | |
| "rewards/reward_func/std": 0.18432046307457817, | |
| "sampling/importance_sampling_ratio/max": 2.9978744983673096, | |
| "sampling/importance_sampling_ratio/mean": 0.951756477355957, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.001660346984863, | |
| "sampling/sampling_logp_difference/mean": 0.20985287427902222, | |
| "step": 203, | |
| "step_time": 125.05054689198732 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3314.0, | |
| "completions/mean_length": 1142.5, | |
| "completions/mean_terminated_length": 1064.34423828125, | |
| "completions/min_length": 3.0, | |
| "completions/min_terminated_length": 319.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6664412915706635, | |
| "epoch": 0.5024630541871922, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0167733571502946, | |
| "kl": 0.0050976480124518275, | |
| "learning_rate": 4.8177374609663415e-05, | |
| "loss": 0.05545557290315628, | |
| "num_tokens": 30873740.0, | |
| "reward": 1.41015625, | |
| "reward_std": 1.5264644622802734, | |
| "rewards/reward_func/mean": 0.1566840277777778, | |
| "rewards/reward_func/std": 0.2437412308322059, | |
| "sampling/importance_sampling_ratio/max": 2.992414951324463, | |
| "sampling/importance_sampling_ratio/mean": 0.9548521637916565, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.351396560668945, | |
| "sampling/sampling_logp_difference/mean": 0.18453365564346313, | |
| "step": 204, | |
| "step_time": 135.3645177448634 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3267.0, | |
| "completions/mean_length": 814.234375, | |
| "completions/mean_terminated_length": 762.36669921875, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.699128270149231, | |
| "epoch": 0.5049261083743842, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02369324206135809, | |
| "kl": 0.005623727338388562, | |
| "learning_rate": 4.815914766073719e-05, | |
| "loss": 0.06875115633010864, | |
| "num_tokens": 31009723.0, | |
| "reward": 1.51953125, | |
| "reward_std": 1.43462336063385, | |
| "rewards/reward_func/mean": 0.16883680555555555, | |
| "rewards/reward_func/std": 0.19979824622472128, | |
| "sampling/importance_sampling_ratio/max": 2.9969117641448975, | |
| "sampling/importance_sampling_ratio/mean": 0.9596514105796814, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.337469100952148, | |
| "sampling/sampling_logp_difference/mean": 0.18011973798274994, | |
| "step": 205, | |
| "step_time": 116.27848253119737 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2744.0, | |
| "completions/mean_length": 927.265625, | |
| "completions/mean_terminated_length": 865.6557006835938, | |
| "completions/min_length": 242.0, | |
| "completions/min_terminated_length": 242.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6605761498212814, | |
| "epoch": 0.5073891625615764, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017280983168777832, | |
| "kl": 0.005327857914380729, | |
| "learning_rate": 4.8140833509401815e-05, | |
| "loss": 0.0009279539808630943, | |
| "num_tokens": 31147436.0, | |
| "reward": 1.265625, | |
| "reward_std": 1.1364060640335083, | |
| "rewards/reward_func/mean": 0.140625, | |
| "rewards/reward_func/std": 0.16583361559444004, | |
| "sampling/importance_sampling_ratio/max": 2.998948097229004, | |
| "sampling/importance_sampling_ratio/mean": 0.9557917714118958, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.98939323425293, | |
| "sampling/sampling_logp_difference/mean": 0.18955284357070923, | |
| "step": 206, | |
| "step_time": 134.5779933303129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 3640.0, | |
| "completions/max_terminated_length": 3640.0, | |
| "completions/mean_length": 967.21875, | |
| "completions/mean_terminated_length": 956.2542114257812, | |
| "completions/min_length": 121.0, | |
| "completions/min_terminated_length": 121.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.70358906686306, | |
| "epoch": 0.5098522167487685, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02323335844168167, | |
| "kl": 0.00514031108468771, | |
| "learning_rate": 4.812243222461658e-05, | |
| "loss": 0.046197690069675446, | |
| "num_tokens": 31295338.0, | |
| "reward": 1.75, | |
| "reward_std": 1.6097421646118164, | |
| "rewards/reward_func/mean": 0.19444444444444445, | |
| "rewards/reward_func/std": 0.2402564717663659, | |
| "sampling/importance_sampling_ratio/max": 2.9992141723632812, | |
| "sampling/importance_sampling_ratio/mean": 0.9542554616928101, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.864341735839844, | |
| "sampling/sampling_logp_difference/mean": 0.1839657723903656, | |
| "step": 207, | |
| "step_time": 107.93793587083928 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3272.0, | |
| "completions/mean_length": 1129.546875, | |
| "completions/mean_terminated_length": 1040.2130126953125, | |
| "completions/min_length": 301.0, | |
| "completions/min_terminated_length": 301.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6552405655384064, | |
| "epoch": 0.5123152709359606, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02923078335322443, | |
| "kl": 0.004710226668976247, | |
| "learning_rate": 4.8103943875668844e-05, | |
| "loss": -0.08094117045402527, | |
| "num_tokens": 31452973.0, | |
| "reward": 2.140625, | |
| "reward_std": 1.937451958656311, | |
| "rewards/reward_func/mean": 0.2378472222222222, | |
| "rewards/reward_func/std": 0.27044207023249733, | |
| "sampling/importance_sampling_ratio/max": 2.994187593460083, | |
| "sampling/importance_sampling_ratio/mean": 0.9509314298629761, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.53609848022461, | |
| "sampling/sampling_logp_difference/mean": 0.18808647990226746, | |
| "step": 208, | |
| "step_time": 135.48419915605336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2688.0, | |
| "completions/mean_length": 996.5625, | |
| "completions/mean_terminated_length": 811.385986328125, | |
| "completions/min_length": 108.0, | |
| "completions/min_terminated_length": 124.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.662027508020401, | |
| "epoch": 0.5147783251231527, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02570822604626865, | |
| "kl": 0.008059444488026202, | |
| "learning_rate": 4.8085368532173804e-05, | |
| "loss": -0.10004392266273499, | |
| "num_tokens": 31600961.0, | |
| "reward": 2.04296875, | |
| "reward_std": 1.8801145553588867, | |
| "rewards/reward_func/mean": 0.2269965277777778, | |
| "rewards/reward_func/std": 0.26916251911057365, | |
| "sampling/importance_sampling_ratio/max": 2.9979264736175537, | |
| "sampling/importance_sampling_ratio/mean": 0.9550250172615051, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.761881828308105, | |
| "sampling/sampling_logp_difference/mean": 0.18191197514533997, | |
| "step": 209, | |
| "step_time": 124.38241540174931 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2389.0, | |
| "completions/mean_length": 986.203125, | |
| "completions/mean_terminated_length": 874.8851928710938, | |
| "completions/min_length": 180.0, | |
| "completions/min_terminated_length": 180.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6463757306337357, | |
| "epoch": 0.5172413793103449, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022735434662613345, | |
| "kl": 0.008034229511395097, | |
| "learning_rate": 4.806670626407422e-05, | |
| "loss": -0.09111690521240234, | |
| "num_tokens": 31747678.0, | |
| "reward": 1.80078125, | |
| "reward_std": 1.6863008737564087, | |
| "rewards/reward_func/mean": 0.20008680555555555, | |
| "rewards/reward_func/std": 0.27082228660583496, | |
| "sampling/importance_sampling_ratio/max": 2.9977269172668457, | |
| "sampling/importance_sampling_ratio/mean": 0.950595498085022, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.483198165893555, | |
| "sampling/sampling_logp_difference/mean": 0.1938043236732483, | |
| "step": 210, | |
| "step_time": 162.62786612519994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2066.0, | |
| "completions/mean_length": 1204.53125, | |
| "completions/mean_terminated_length": 807.370361328125, | |
| "completions/min_length": 244.0, | |
| "completions/min_terminated_length": 244.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6006270796060562, | |
| "epoch": 0.5197044334975369, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018144455499452674, | |
| "kl": 0.008116140263155103, | |
| "learning_rate": 4.804795714164015e-05, | |
| "loss": -0.05340362340211868, | |
| "num_tokens": 31909312.0, | |
| "reward": 2.09375, | |
| "reward_std": 1.914595127105713, | |
| "rewards/reward_func/mean": 0.2326388888888889, | |
| "rewards/reward_func/std": 0.305544869767295, | |
| "sampling/importance_sampling_ratio/max": 2.997669219970703, | |
| "sampling/importance_sampling_ratio/mean": 0.9609960317611694, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.310054779052734, | |
| "sampling/sampling_logp_difference/mean": 0.156742125749588, | |
| "step": 211, | |
| "step_time": 120.653579573147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3924.0, | |
| "completions/mean_length": 1445.078125, | |
| "completions/mean_terminated_length": 1236.310302734375, | |
| "completions/min_length": 255.0, | |
| "completions/min_terminated_length": 255.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7738412618637085, | |
| "epoch": 0.5221674876847291, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02789054476062182, | |
| "kl": 0.00692247343249619, | |
| "learning_rate": 4.8029121235468696e-05, | |
| "loss": -0.20333515107631683, | |
| "num_tokens": 32088485.0, | |
| "reward": 2.5859375, | |
| "reward_std": 2.116741180419922, | |
| "rewards/reward_func/mean": 0.2873263888888889, | |
| "rewards/reward_func/std": 0.316247637073199, | |
| "sampling/importance_sampling_ratio/max": 2.9986884593963623, | |
| "sampling/importance_sampling_ratio/mean": 0.9414281249046326, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.835151672363281, | |
| "sampling/sampling_logp_difference/mean": 0.21052196621894836, | |
| "step": 212, | |
| "step_time": 136.5137119400315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3337.0, | |
| "completions/mean_length": 1149.40625, | |
| "completions/mean_terminated_length": 1064.2950439453125, | |
| "completions/min_length": 291.0, | |
| "completions/min_terminated_length": 291.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8650064170360565, | |
| "epoch": 0.5246305418719212, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027477140042687382, | |
| "kl": 0.008720705634914339, | |
| "learning_rate": 4.8010198616483736e-05, | |
| "loss": 0.21267648041248322, | |
| "num_tokens": 32249679.0, | |
| "reward": 2.20703125, | |
| "reward_std": 1.8620877265930176, | |
| "rewards/reward_func/mean": 0.24522569444444445, | |
| "rewards/reward_func/std": 0.25635351406203377, | |
| "sampling/importance_sampling_ratio/max": 2.995835781097412, | |
| "sampling/importance_sampling_ratio/mean": 0.9504793882369995, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.059192657470703, | |
| "sampling/sampling_logp_difference/mean": 0.1958104521036148, | |
| "step": 213, | |
| "step_time": 123.98496635304764 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3253.0, | |
| "completions/mean_length": 995.390625, | |
| "completions/mean_terminated_length": 842.901611328125, | |
| "completions/min_length": 224.0, | |
| "completions/min_terminated_length": 224.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6862529069185257, | |
| "epoch": 0.5270935960591133, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027503322526612032, | |
| "kl": 0.008734274189919233, | |
| "learning_rate": 4.799118935593563e-05, | |
| "loss": 0.00571461021900177, | |
| "num_tokens": 32397592.0, | |
| "reward": 2.296875, | |
| "reward_std": 2.0537784099578857, | |
| "rewards/reward_func/mean": 0.2552083333333333, | |
| "rewards/reward_func/std": 0.30399200485812294, | |
| "sampling/importance_sampling_ratio/max": 2.970259666442871, | |
| "sampling/importance_sampling_ratio/mean": 0.9581179618835449, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.819830894470215, | |
| "sampling/sampling_logp_difference/mean": 0.1745845526456833, | |
| "step": 214, | |
| "step_time": 125.05398750072345 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3835.0, | |
| "completions/mean_length": 1262.625, | |
| "completions/mean_terminated_length": 1118.7166748046875, | |
| "completions/min_length": 158.0, | |
| "completions/min_terminated_length": 158.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6201090812683105, | |
| "epoch": 0.5295566502463054, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029608888003022468, | |
| "kl": 0.01200737664476037, | |
| "learning_rate": 4.797209352540101e-05, | |
| "loss": -0.18912369012832642, | |
| "num_tokens": 32558048.0, | |
| "reward": 2.640625, | |
| "reward_std": 2.0236082077026367, | |
| "rewards/reward_func/mean": 0.2934027777777778, | |
| "rewards/reward_func/std": 0.2782379339138667, | |
| "sampling/importance_sampling_ratio/max": 2.9898691177368164, | |
| "sampling/importance_sampling_ratio/mean": 0.9521766901016235, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.811450958251953, | |
| "sampling/sampling_logp_difference/mean": 0.18017134070396423, | |
| "step": 215, | |
| "step_time": 119.14505596109666 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3035.0, | |
| "completions/mean_length": 1009.796875, | |
| "completions/mean_terminated_length": 858.016357421875, | |
| "completions/min_length": 180.0, | |
| "completions/min_terminated_length": 180.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7858224660158157, | |
| "epoch": 0.5320197044334976, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04291691770310537, | |
| "kl": 0.011967223603278399, | |
| "learning_rate": 4.7952911196782426e-05, | |
| "loss": -0.01381763070821762, | |
| "num_tokens": 32710515.0, | |
| "reward": 2.515625, | |
| "reward_std": 2.1830310821533203, | |
| "rewards/reward_func/mean": 0.2795138888888889, | |
| "rewards/reward_func/std": 0.31012119187249076, | |
| "sampling/importance_sampling_ratio/max": 2.9967217445373535, | |
| "sampling/importance_sampling_ratio/mean": 0.9497541189193726, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.55856704711914, | |
| "sampling/sampling_logp_difference/mean": 0.2038884311914444, | |
| "step": 216, | |
| "step_time": 132.75489768222906 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4016.0, | |
| "completions/mean_length": 1193.171875, | |
| "completions/mean_terminated_length": 1125.550048828125, | |
| "completions/min_length": 19.0, | |
| "completions/min_terminated_length": 395.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6148668229579926, | |
| "epoch": 0.5344827586206896, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03435292652045712, | |
| "kl": 0.010951020522043109, | |
| "learning_rate": 4.793364244230818e-05, | |
| "loss": 0.11087347567081451, | |
| "num_tokens": 32875118.0, | |
| "reward": 2.84765625, | |
| "reward_std": 2.1037652492523193, | |
| "rewards/reward_func/mean": 0.31640625, | |
| "rewards/reward_func/std": 0.30274029903941685, | |
| "sampling/importance_sampling_ratio/max": 2.9930214881896973, | |
| "sampling/importance_sampling_ratio/mean": 0.95209139585495, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.35776710510254, | |
| "sampling/sampling_logp_difference/mean": 0.1722993552684784, | |
| "step": 217, | |
| "step_time": 130.21765533811413 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2928.0, | |
| "completions/mean_length": 1055.609375, | |
| "completions/mean_terminated_length": 1007.4500732421875, | |
| "completions/min_length": 136.0, | |
| "completions/min_terminated_length": 136.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5769052356481552, | |
| "epoch": 0.5369458128078818, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04173595623774424, | |
| "kl": 0.010210154112428427, | |
| "learning_rate": 4.791428733453195e-05, | |
| "loss": -0.30730634927749634, | |
| "num_tokens": 33032517.0, | |
| "reward": 2.87890625, | |
| "reward_std": 1.97101628780365, | |
| "rewards/reward_func/mean": 0.3198784722222222, | |
| "rewards/reward_func/std": 0.29506540298461914, | |
| "sampling/importance_sampling_ratio/max": 2.997896432876587, | |
| "sampling/importance_sampling_ratio/mean": 0.9560102820396423, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.76587200164795, | |
| "sampling/sampling_logp_difference/mean": 0.1718832403421402, | |
| "step": 218, | |
| "step_time": 116.42301863082685 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2737.0, | |
| "completions/mean_length": 1503.078125, | |
| "completions/mean_terminated_length": 1176.2037353515625, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6850746124982834, | |
| "epoch": 0.5394088669950738, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02444807124741549, | |
| "kl": 0.010135965887457132, | |
| "learning_rate": 4.78948459463326e-05, | |
| "loss": -0.14834490418434143, | |
| "num_tokens": 33224890.0, | |
| "reward": 2.6640625, | |
| "reward_std": 2.139582872390747, | |
| "rewards/reward_func/mean": 0.2960069444444444, | |
| "rewards/reward_func/std": 0.31274378465281594, | |
| "sampling/importance_sampling_ratio/max": 2.9970359802246094, | |
| "sampling/importance_sampling_ratio/mean": 0.9442711472511292, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.665057182312012, | |
| "sampling/sampling_logp_difference/mean": 0.2032284438610077, | |
| "step": 219, | |
| "step_time": 145.91501643997617 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2792.0, | |
| "completions/mean_length": 1046.734375, | |
| "completions/mean_terminated_length": 836.5084838867188, | |
| "completions/min_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7115042209625244, | |
| "epoch": 0.541871921182266, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028863511910483478, | |
| "kl": 0.013005719054490328, | |
| "learning_rate": 4.7875318350913846e-05, | |
| "loss": -0.00012151524424552917, | |
| "num_tokens": 33371753.0, | |
| "reward": 2.609375, | |
| "reward_std": 1.992483377456665, | |
| "rewards/reward_func/mean": 0.2899305555555556, | |
| "rewards/reward_func/std": 0.29294103052881026, | |
| "sampling/importance_sampling_ratio/max": 2.9933483600616455, | |
| "sampling/importance_sampling_ratio/mean": 0.9574570655822754, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.14802074432373, | |
| "sampling/sampling_logp_difference/mean": 0.19140395522117615, | |
| "step": 220, | |
| "step_time": 116.08205214515328 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3865.0, | |
| "completions/mean_length": 1232.171875, | |
| "completions/mean_terminated_length": 1030.5535888671875, | |
| "completions/min_length": 172.0, | |
| "completions/min_terminated_length": 172.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8280133903026581, | |
| "epoch": 0.5443349753694581, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026199806644726372, | |
| "kl": 0.01261559035629034, | |
| "learning_rate": 4.785570462180402e-05, | |
| "loss": 0.019907645881175995, | |
| "num_tokens": 33533060.0, | |
| "reward": 2.4609375, | |
| "reward_std": 1.963060975074768, | |
| "rewards/reward_func/mean": 0.2734375, | |
| "rewards/reward_func/std": 0.29242918226453996, | |
| "sampling/importance_sampling_ratio/max": 2.998601198196411, | |
| "sampling/importance_sampling_ratio/mean": 0.9539086222648621, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.437432289123535, | |
| "sampling/sampling_logp_difference/mean": 0.18835866451263428, | |
| "step": 221, | |
| "step_time": 124.80898142675869 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3916.0, | |
| "completions/mean_length": 1169.75, | |
| "completions/mean_terminated_length": 1083.3792724609375, | |
| "completions/min_length": 76.0, | |
| "completions/min_terminated_length": 76.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6079298108816147, | |
| "epoch": 0.5467980295566502, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03152604195230071, | |
| "kl": 0.009750789031386375, | |
| "learning_rate": 4.7836004832855776e-05, | |
| "loss": -0.014914639294147491, | |
| "num_tokens": 33693908.0, | |
| "reward": 2.609375, | |
| "reward_std": 2.088742971420288, | |
| "rewards/reward_func/mean": 0.2899305555555556, | |
| "rewards/reward_func/std": 0.29560703535874683, | |
| "sampling/importance_sampling_ratio/max": 2.99847412109375, | |
| "sampling/importance_sampling_ratio/mean": 0.9530090093612671, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.749363899230957, | |
| "sampling/sampling_logp_difference/mean": 0.17468664050102234, | |
| "step": 222, | |
| "step_time": 124.77503907005303 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3584.0, | |
| "completions/mean_length": 1138.390625, | |
| "completions/mean_terminated_length": 942.2203369140625, | |
| "completions/min_length": 48.0, | |
| "completions/min_terminated_length": 48.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7041943818330765, | |
| "epoch": 0.5492610837438424, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04273914892197363, | |
| "kl": 0.014788009924814105, | |
| "learning_rate": 4.781621905824579e-05, | |
| "loss": -0.1250435709953308, | |
| "num_tokens": 33847053.0, | |
| "reward": 2.29296875, | |
| "reward_std": 1.9667649269104004, | |
| "rewards/reward_func/mean": 0.2547743055555556, | |
| "rewards/reward_func/std": 0.3062853713830312, | |
| "sampling/importance_sampling_ratio/max": 2.9978320598602295, | |
| "sampling/importance_sampling_ratio/mean": 0.9523090124130249, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.453618049621582, | |
| "sampling/sampling_logp_difference/mean": 0.19299328327178955, | |
| "step": 223, | |
| "step_time": 127.9603762368206 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4026.0, | |
| "completions/mean_length": 1403.65625, | |
| "completions/mean_terminated_length": 1173.8302001953125, | |
| "completions/min_length": 314.0, | |
| "completions/min_terminated_length": 314.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.682645320892334, | |
| "epoch": 0.5517241379310345, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025751043642452183, | |
| "kl": 0.010668770410120487, | |
| "learning_rate": 4.779634737247455e-05, | |
| "loss": -0.0730874314904213, | |
| "num_tokens": 34034903.0, | |
| "reward": 2.39453125, | |
| "reward_std": 2.0147297382354736, | |
| "rewards/reward_func/mean": 0.2660590277777778, | |
| "rewards/reward_func/std": 0.29088784919844735, | |
| "sampling/importance_sampling_ratio/max": 2.996720314025879, | |
| "sampling/importance_sampling_ratio/mean": 0.9444730281829834, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.812371253967285, | |
| "sampling/sampling_logp_difference/mean": 0.1935410052537918, | |
| "step": 224, | |
| "step_time": 138.18764766515233 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2829.0, | |
| "completions/mean_length": 1212.671875, | |
| "completions/mean_terminated_length": 929.30908203125, | |
| "completions/min_length": 264.0, | |
| "completions/min_terminated_length": 264.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6753417253494263, | |
| "epoch": 0.5541871921182266, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02639378345648046, | |
| "kl": 0.011769623961299658, | |
| "learning_rate": 4.777638985036599e-05, | |
| "loss": 0.045241162180900574, | |
| "num_tokens": 34184738.0, | |
| "reward": 2.78125, | |
| "reward_std": 2.071873188018799, | |
| "rewards/reward_func/mean": 0.3090277777777778, | |
| "rewards/reward_func/std": 0.31061913735336727, | |
| "sampling/importance_sampling_ratio/max": 2.9998955726623535, | |
| "sampling/importance_sampling_ratio/mean": 0.9576824307441711, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.645200729370117, | |
| "sampling/sampling_logp_difference/mean": 0.1704041212797165, | |
| "step": 225, | |
| "step_time": 113.9789727050811 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2964.0, | |
| "completions/mean_length": 1230.234375, | |
| "completions/mean_terminated_length": 908.1154174804688, | |
| "completions/min_length": 111.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7707178592681885, | |
| "epoch": 0.5566502463054187, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02290402359133147, | |
| "kl": 0.012144361389800906, | |
| "learning_rate": 4.7756346567067255e-05, | |
| "loss": -0.03128755837678909, | |
| "num_tokens": 34340529.0, | |
| "reward": 2.12890625, | |
| "reward_std": 1.7978118658065796, | |
| "rewards/reward_func/mean": 0.2365451388888889, | |
| "rewards/reward_func/std": 0.2605728440814548, | |
| "sampling/importance_sampling_ratio/max": 2.9998936653137207, | |
| "sampling/importance_sampling_ratio/mean": 0.9504464268684387, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.862293243408203, | |
| "sampling/sampling_logp_difference/mean": 0.1982928216457367, | |
| "step": 226, | |
| "step_time": 133.34942565392703 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3998.0, | |
| "completions/mean_length": 1224.78125, | |
| "completions/mean_terminated_length": 885.7169799804688, | |
| "completions/min_length": 124.0, | |
| "completions/min_terminated_length": 182.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7116331607103348, | |
| "epoch": 0.5591133004926109, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025649588255769486, | |
| "kl": 0.014744432177394629, | |
| "learning_rate": 4.773621759804844e-05, | |
| "loss": 0.043492548167705536, | |
| "num_tokens": 34497827.0, | |
| "reward": 2.24609375, | |
| "reward_std": 2.065155029296875, | |
| "rewards/reward_func/mean": 0.2495659722222222, | |
| "rewards/reward_func/std": 0.2995048099093967, | |
| "sampling/importance_sampling_ratio/max": 2.9920711517333984, | |
| "sampling/importance_sampling_ratio/mean": 0.9514751434326172, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.58505630493164, | |
| "sampling/sampling_logp_difference/mean": 0.18859557807445526, | |
| "step": 227, | |
| "step_time": 121.2616064096801 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3420.0, | |
| "completions/mean_length": 1085.0625, | |
| "completions/mean_terminated_length": 894.586181640625, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7214779406785965, | |
| "epoch": 0.5615763546798029, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026726141471262007, | |
| "kl": 0.014612508239224553, | |
| "learning_rate": 4.771600301910224e-05, | |
| "loss": 0.21741780638694763, | |
| "num_tokens": 34648711.0, | |
| "reward": 2.16015625, | |
| "reward_std": 1.8335320949554443, | |
| "rewards/reward_func/mean": 0.2400173611111111, | |
| "rewards/reward_func/std": 0.24923836025926802, | |
| "sampling/importance_sampling_ratio/max": 2.992070436477661, | |
| "sampling/importance_sampling_ratio/mean": 0.9507383108139038, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.363275527954102, | |
| "sampling/sampling_logp_difference/mean": 0.1869376301765442, | |
| "step": 228, | |
| "step_time": 130.2360584451817 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3812.0, | |
| "completions/mean_length": 1164.859375, | |
| "completions/mean_terminated_length": 960.5689697265625, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0671870857477188, | |
| "epoch": 0.5640394088669951, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03701890008995095, | |
| "kl": 0.0166468839161098, | |
| "learning_rate": 4.769570290634373e-05, | |
| "loss": -0.009628377854824066, | |
| "num_tokens": 34806910.0, | |
| "reward": 2.1484375, | |
| "reward_std": 1.821236491203308, | |
| "rewards/reward_func/mean": 0.2387152777777778, | |
| "rewards/reward_func/std": 0.24402027163240644, | |
| "sampling/importance_sampling_ratio/max": 2.9982752799987793, | |
| "sampling/importance_sampling_ratio/mean": 0.9430174827575684, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.500931739807129, | |
| "sampling/sampling_logp_difference/mean": 0.21461455523967743, | |
| "step": 229, | |
| "step_time": 124.32920650811866 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4018.0, | |
| "completions/mean_length": 1480.171875, | |
| "completions/mean_terminated_length": 1200.4259033203125, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.9094728976488113, | |
| "epoch": 0.5665024630541872, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02566864350514655, | |
| "kl": 0.014607452787458897, | |
| "learning_rate": 4.767531733621004e-05, | |
| "loss": -0.08295504748821259, | |
| "num_tokens": 34995209.0, | |
| "reward": 2.64453125, | |
| "reward_std": 2.0225930213928223, | |
| "rewards/reward_func/mean": 0.2938368055555556, | |
| "rewards/reward_func/std": 0.31203501257631516, | |
| "sampling/importance_sampling_ratio/max": 2.9992480278015137, | |
| "sampling/importance_sampling_ratio/mean": 0.9394354820251465, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.749757766723633, | |
| "sampling/sampling_logp_difference/mean": 0.2041437029838562, | |
| "step": 230, | |
| "step_time": 138.9344523921609 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3585.0, | |
| "completions/mean_length": 1350.859375, | |
| "completions/mean_terminated_length": 1118.0179443359375, | |
| "completions/min_length": 25.0, | |
| "completions/min_terminated_length": 175.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7921342998743057, | |
| "epoch": 0.5689655172413793, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02868100953759027, | |
| "kl": 0.01775623788125813, | |
| "learning_rate": 4.765484638546005e-05, | |
| "loss": 0.0232635997235775, | |
| "num_tokens": 35163824.0, | |
| "reward": 2.20703125, | |
| "reward_std": 1.9047532081604004, | |
| "rewards/reward_func/mean": 0.24522569444444445, | |
| "rewards/reward_func/std": 0.24406374990940094, | |
| "sampling/importance_sampling_ratio/max": 2.998385429382324, | |
| "sampling/importance_sampling_ratio/mean": 0.9466791152954102, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.723865509033203, | |
| "sampling/sampling_logp_difference/mean": 0.18429672718048096, | |
| "step": 231, | |
| "step_time": 120.73535452294163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3728.0, | |
| "completions/mean_length": 1476.53125, | |
| "completions/mean_terminated_length": 1130.7037353515625, | |
| "completions/min_length": 5.0, | |
| "completions/min_terminated_length": 206.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.9682776778936386, | |
| "epoch": 0.5714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03712338663506699, | |
| "kl": 0.019314537523314357, | |
| "learning_rate": 4.7634290131174184e-05, | |
| "loss": -0.14410394430160522, | |
| "num_tokens": 35353410.0, | |
| "reward": 2.4140625, | |
| "reward_std": 1.8463921546936035, | |
| "rewards/reward_func/mean": 0.2682291666666667, | |
| "rewards/reward_func/std": 0.28911194536421037, | |
| "sampling/importance_sampling_ratio/max": 2.9995596408843994, | |
| "sampling/importance_sampling_ratio/mean": 0.9306811094284058, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.025599479675293, | |
| "sampling/sampling_logp_difference/mean": 0.23162716627120972, | |
| "step": 232, | |
| "step_time": 133.49474174529314 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3336.0, | |
| "completions/mean_length": 1594.9375, | |
| "completions/mean_terminated_length": 1036.5870361328125, | |
| "completions/min_length": 151.0, | |
| "completions/min_terminated_length": 151.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7518228590488434, | |
| "epoch": 0.5738916256157636, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02178047416680768, | |
| "kl": 0.013470216421410441, | |
| "learning_rate": 4.761364865075402e-05, | |
| "loss": -0.1526617705821991, | |
| "num_tokens": 35532574.0, | |
| "reward": 2.54296875, | |
| "reward_std": 2.0180509090423584, | |
| "rewards/reward_func/mean": 0.2825520833333333, | |
| "rewards/reward_func/std": 0.30297022809584934, | |
| "sampling/importance_sampling_ratio/max": 2.984797954559326, | |
| "sampling/importance_sampling_ratio/mean": 0.9513339996337891, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.608736038208008, | |
| "sampling/sampling_logp_difference/mean": 0.17860578000545502, | |
| "step": 233, | |
| "step_time": 122.31598937511444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2172.0, | |
| "completions/mean_length": 1319.5625, | |
| "completions/mean_terminated_length": 922.3077392578125, | |
| "completions/min_length": 230.0, | |
| "completions/min_terminated_length": 230.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0021032392978668, | |
| "epoch": 0.5763546798029556, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.03166418186559143, | |
| "kl": 0.024833133444190025, | |
| "learning_rate": 4.7592922021922056e-05, | |
| "loss": -0.15460214018821716, | |
| "num_tokens": 35722898.0, | |
| "reward": 2.0625, | |
| "reward_std": 1.8126540184020996, | |
| "rewards/reward_func/mean": 0.22916666666666666, | |
| "rewards/reward_func/std": 0.2917405896716648, | |
| "sampling/importance_sampling_ratio/max": 2.996793031692505, | |
| "sampling/importance_sampling_ratio/mean": 0.9255531430244446, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.874739646911621, | |
| "sampling/sampling_logp_difference/mean": 0.23875398933887482, | |
| "step": 234, | |
| "step_time": 148.48599314992316 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1955.0, | |
| "completions/mean_length": 1231.4375, | |
| "completions/mean_terminated_length": 811.0980834960938, | |
| "completions/min_length": 310.0, | |
| "completions/min_terminated_length": 310.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7249269187450409, | |
| "epoch": 0.5788177339901478, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.05201097230500609, | |
| "kl": 0.014844065997749567, | |
| "learning_rate": 4.757211032272141e-05, | |
| "loss": -0.2842212915420532, | |
| "num_tokens": 35879470.0, | |
| "reward": 2.28125, | |
| "reward_std": 1.9783049821853638, | |
| "rewards/reward_func/mean": 0.2534722222222222, | |
| "rewards/reward_func/std": 0.2845391564899021, | |
| "sampling/importance_sampling_ratio/max": 2.9987900257110596, | |
| "sampling/importance_sampling_ratio/mean": 0.9521608948707581, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.4357271194458, | |
| "sampling/sampling_logp_difference/mean": 0.1780111938714981, | |
| "step": 235, | |
| "step_time": 122.01890759728849 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3650.0, | |
| "completions/mean_length": 1525.09375, | |
| "completions/mean_terminated_length": 986.375, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8893266320228577, | |
| "epoch": 0.5812807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02471245037776744, | |
| "kl": 0.01480063796043396, | |
| "learning_rate": 4.75512136315155e-05, | |
| "loss": -0.15336401760578156, | |
| "num_tokens": 36060756.0, | |
| "reward": 2.05859375, | |
| "reward_std": 1.8020832538604736, | |
| "rewards/reward_func/mean": 0.2287326388888889, | |
| "rewards/reward_func/std": 0.2694326473606957, | |
| "sampling/importance_sampling_ratio/max": 2.999886989593506, | |
| "sampling/importance_sampling_ratio/mean": 0.9345108270645142, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.334660530090332, | |
| "sampling/sampling_logp_difference/mean": 0.22046810388565063, | |
| "step": 236, | |
| "step_time": 131.76243212609552 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2817.0, | |
| "completions/mean_length": 1175.203125, | |
| "completions/mean_terminated_length": 836.1111450195312, | |
| "completions/min_length": 108.0, | |
| "completions/min_terminated_length": 245.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.784490168094635, | |
| "epoch": 0.583743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02574867287046673, | |
| "kl": 0.013174011837691069, | |
| "learning_rate": 4.7530232026987807e-05, | |
| "loss": -0.061145804822444916, | |
| "num_tokens": 36226257.0, | |
| "reward": 2.12109375, | |
| "reward_std": 1.8121023178100586, | |
| "rewards/reward_func/mean": 0.23567708333333334, | |
| "rewards/reward_func/std": 0.26693976587719387, | |
| "sampling/importance_sampling_ratio/max": 2.9982166290283203, | |
| "sampling/importance_sampling_ratio/mean": 0.9447356462478638, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.428176879882812, | |
| "sampling/sampling_logp_difference/mean": 0.20164385437965393, | |
| "step": 237, | |
| "step_time": 128.0499583010096 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1154.0, | |
| "completions/mean_length": 546.53125, | |
| "completions/mean_terminated_length": 437.35003662109375, | |
| "completions/min_length": 119.0, | |
| "completions/min_terminated_length": 119.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6676979809999466, | |
| "epoch": 0.5862068965517241, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0501059794602117, | |
| "kl": 0.01879991707392037, | |
| "learning_rate": 4.75091655881415e-05, | |
| "loss": 0.018436290323734283, | |
| "num_tokens": 36337075.0, | |
| "reward": 2.546875, | |
| "reward_std": 1.9879971742630005, | |
| "rewards/reward_func/mean": 0.2829861111111111, | |
| "rewards/reward_func/std": 0.29805727965301937, | |
| "sampling/importance_sampling_ratio/max": 2.9939706325531006, | |
| "sampling/importance_sampling_ratio/mean": 0.9689080715179443, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.713134765625, | |
| "sampling/sampling_logp_difference/mean": 0.1616547852754593, | |
| "step": 238, | |
| "step_time": 106.62005894491449 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2370.0, | |
| "completions/mean_length": 1046.140625, | |
| "completions/mean_terminated_length": 902.3500366210938, | |
| "completions/min_length": 259.0, | |
| "completions/min_terminated_length": 259.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7040528655052185, | |
| "epoch": 0.5886699507389163, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.031163817812319954, | |
| "kl": 0.013875756645575166, | |
| "learning_rate": 4.7488014394299205e-05, | |
| "loss": -0.06191133335232735, | |
| "num_tokens": 36488556.0, | |
| "reward": 2.3515625, | |
| "reward_std": 1.8996233940124512, | |
| "rewards/reward_func/mean": 0.2612847222222222, | |
| "rewards/reward_func/std": 0.26455983685122597, | |
| "sampling/importance_sampling_ratio/max": 2.997973680496216, | |
| "sampling/importance_sampling_ratio/mean": 0.9539923667907715, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.80900764465332, | |
| "sampling/sampling_logp_difference/mean": 0.18930600583553314, | |
| "step": 239, | |
| "step_time": 122.02802962902933 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3646.0, | |
| "completions/mean_length": 1437.96875, | |
| "completions/mean_terminated_length": 1307.245849609375, | |
| "completions/min_length": 17.0, | |
| "completions/min_terminated_length": 17.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6821627020835876, | |
| "epoch": 0.5911330049261084, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022492803254900707, | |
| "kl": 0.008037951542064548, | |
| "learning_rate": 4.746677852510267e-05, | |
| "loss": -0.048022910952568054, | |
| "num_tokens": 36680714.0, | |
| "reward": 1.7890625, | |
| "reward_std": 1.6373727321624756, | |
| "rewards/reward_func/mean": 0.1987847222222222, | |
| "rewards/reward_func/std": 0.22592765589555105, | |
| "sampling/importance_sampling_ratio/max": 2.9999842643737793, | |
| "sampling/importance_sampling_ratio/mean": 0.9468995332717896, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.066885948181152, | |
| "sampling/sampling_logp_difference/mean": 0.1964116394519806, | |
| "step": 240, | |
| "step_time": 152.4113609350752 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3714.0, | |
| "completions/mean_length": 1449.25, | |
| "completions/mean_terminated_length": 1312.7857666015625, | |
| "completions/min_length": 30.0, | |
| "completions/min_terminated_length": 420.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.9211445450782776, | |
| "epoch": 0.5935960591133005, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025380578247890306, | |
| "kl": 0.01382395182736218, | |
| "learning_rate": 4.7445458060512484e-05, | |
| "loss": -0.0691765695810318, | |
| "num_tokens": 36867578.0, | |
| "reward": 2.08984375, | |
| "reward_std": 1.7980186939239502, | |
| "rewards/reward_func/mean": 0.2322048611111111, | |
| "rewards/reward_func/std": 0.2627977761957381, | |
| "sampling/importance_sampling_ratio/max": 2.9993677139282227, | |
| "sampling/importance_sampling_ratio/mean": 0.9361419081687927, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.936524391174316, | |
| "sampling/sampling_logp_difference/mean": 0.22960931062698364, | |
| "step": 241, | |
| "step_time": 131.3438694481738 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3204.0, | |
| "completions/mean_length": 1209.828125, | |
| "completions/mean_terminated_length": 985.0, | |
| "completions/min_length": 260.0, | |
| "completions/min_terminated_length": 260.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6995112746953964, | |
| "epoch": 0.5960591133004927, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025394328117354086, | |
| "kl": 0.01263234089128673, | |
| "learning_rate": 4.742405308080775e-05, | |
| "loss": 0.00674794428050518, | |
| "num_tokens": 37031359.0, | |
| "reward": 2.3984375, | |
| "reward_std": 1.963818907737732, | |
| "rewards/reward_func/mean": 0.2664930555555556, | |
| "rewards/reward_func/std": 0.26679257882965934, | |
| "sampling/importance_sampling_ratio/max": 2.996161460876465, | |
| "sampling/importance_sampling_ratio/mean": 0.9531453847885132, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.238762855529785, | |
| "sampling/sampling_logp_difference/mean": 0.19814015924930573, | |
| "step": 242, | |
| "step_time": 123.06549433688633 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3877.0, | |
| "completions/mean_length": 1199.21875, | |
| "completions/mean_terminated_length": 1051.3966064453125, | |
| "completions/min_length": 206.0, | |
| "completions/min_terminated_length": 206.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6999485194683075, | |
| "epoch": 0.5985221674876847, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02976865624663422, | |
| "kl": 0.01193548133596778, | |
| "learning_rate": 4.7402563666585817e-05, | |
| "loss": -0.22917920351028442, | |
| "num_tokens": 37188493.0, | |
| "reward": 2.375, | |
| "reward_std": 1.9441609382629395, | |
| "rewards/reward_func/mean": 0.2638888888888889, | |
| "rewards/reward_func/std": 0.26875759495629203, | |
| "sampling/importance_sampling_ratio/max": 2.998791456222534, | |
| "sampling/importance_sampling_ratio/mean": 0.9585366249084473, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.101183891296387, | |
| "sampling/sampling_logp_difference/mean": 0.1771034300327301, | |
| "step": 243, | |
| "step_time": 121.21322968695313 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3275.0, | |
| "completions/mean_length": 1567.84375, | |
| "completions/mean_terminated_length": 1362.847412109375, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 16.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6863079965114594, | |
| "epoch": 0.6009852216748769, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030290685076199052, | |
| "kl": 0.011284970445558429, | |
| "learning_rate": 4.7380989898761957e-05, | |
| "loss": -0.19699159264564514, | |
| "num_tokens": 37381203.0, | |
| "reward": 2.5078125, | |
| "reward_std": 1.9256871938705444, | |
| "rewards/reward_func/mean": 0.2786458333333333, | |
| "rewards/reward_func/std": 0.27923353181944954, | |
| "sampling/importance_sampling_ratio/max": 2.999513626098633, | |
| "sampling/importance_sampling_ratio/mean": 0.9431895017623901, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.869531631469727, | |
| "sampling/sampling_logp_difference/mean": 0.20835289359092712, | |
| "step": 244, | |
| "step_time": 125.42915512691252 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3183.0, | |
| "completions/mean_length": 1144.96875, | |
| "completions/mean_terminated_length": 1049.774169921875, | |
| "completions/min_length": 223.0, | |
| "completions/min_terminated_length": 223.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7077446132898331, | |
| "epoch": 0.603448275862069, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.033618691580459464, | |
| "kl": 0.011604496976360679, | |
| "learning_rate": 4.735933185856906e-05, | |
| "loss": 0.008934096433222294, | |
| "num_tokens": 37545697.0, | |
| "reward": 2.09765625, | |
| "reward_std": 1.843380093574524, | |
| "rewards/reward_func/mean": 0.23307291666666666, | |
| "rewards/reward_func/std": 0.265490311715338, | |
| "sampling/importance_sampling_ratio/max": 2.9983954429626465, | |
| "sampling/importance_sampling_ratio/mean": 0.9419179558753967, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.498157501220703, | |
| "sampling/sampling_logp_difference/mean": 0.2204725742340088, | |
| "step": 245, | |
| "step_time": 136.99598171422258 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2331.0, | |
| "completions/mean_length": 991.5, | |
| "completions/mean_terminated_length": 879.0655517578125, | |
| "completions/min_length": 142.0, | |
| "completions/min_terminated_length": 142.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5730465948581696, | |
| "epoch": 0.6059113300492611, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03329989820722053, | |
| "kl": 0.013264509849250317, | |
| "learning_rate": 4.733758962755734e-05, | |
| "loss": -0.11232152581214905, | |
| "num_tokens": 37694305.0, | |
| "reward": 2.38671875, | |
| "reward_std": 1.932352900505066, | |
| "rewards/reward_func/mean": 0.2651909722222222, | |
| "rewards/reward_func/std": 0.27600304451253677, | |
| "sampling/importance_sampling_ratio/max": 2.995966672897339, | |
| "sampling/importance_sampling_ratio/mean": 0.9627119898796082, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.523571014404297, | |
| "sampling/sampling_logp_difference/mean": 0.16919061541557312, | |
| "step": 246, | |
| "step_time": 123.56446173996665 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2626.0, | |
| "completions/mean_length": 914.9375, | |
| "completions/mean_terminated_length": 812.3225708007812, | |
| "completions/min_length": 151.0, | |
| "completions/min_terminated_length": 151.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6465972661972046, | |
| "epoch": 0.6083743842364532, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.034436712311394085, | |
| "kl": 0.015001513063907623, | |
| "learning_rate": 4.7315763287594e-05, | |
| "loss": -0.0005866400897502899, | |
| "num_tokens": 37850461.0, | |
| "reward": 2.65234375, | |
| "reward_std": 2.0161142349243164, | |
| "rewards/reward_func/mean": 0.2947048611111111, | |
| "rewards/reward_func/std": 0.26663076298104393, | |
| "sampling/importance_sampling_ratio/max": 2.998875617980957, | |
| "sampling/importance_sampling_ratio/mean": 0.9551164507865906, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.468926429748535, | |
| "sampling/sampling_logp_difference/mean": 0.19288510084152222, | |
| "step": 247, | |
| "step_time": 123.84648331883363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3604.0, | |
| "completions/mean_length": 983.296875, | |
| "completions/mean_terminated_length": 829.7000732421875, | |
| "completions/min_length": 169.0, | |
| "completions/min_terminated_length": 169.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0105192512273788, | |
| "epoch": 0.6108374384236454, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.08148400090622203, | |
| "kl": 0.037221905775368214, | |
| "learning_rate": 4.729385292086297e-05, | |
| "loss": 0.051975756883621216, | |
| "num_tokens": 37998112.0, | |
| "reward": 2.69921875, | |
| "reward_std": 1.9821523427963257, | |
| "rewards/reward_func/mean": 0.2999131944444444, | |
| "rewards/reward_func/std": 0.28208497166633606, | |
| "sampling/importance_sampling_ratio/max": 2.9968528747558594, | |
| "sampling/importance_sampling_ratio/mean": 0.9450979828834534, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.799301147460938, | |
| "sampling/sampling_logp_difference/mean": 0.23306839168071747, | |
| "step": 248, | |
| "step_time": 129.30707350256853 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3543.0, | |
| "completions/mean_length": 791.265625, | |
| "completions/mean_terminated_length": 638.933349609375, | |
| "completions/min_length": 17.0, | |
| "completions/min_terminated_length": 62.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5722888112068176, | |
| "epoch": 0.6133004926108374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030916969248751317, | |
| "kl": 0.015809379750862718, | |
| "learning_rate": 4.727185860986454e-05, | |
| "loss": 0.04632103815674782, | |
| "num_tokens": 38127633.0, | |
| "reward": 2.72265625, | |
| "reward_std": 2.0442123413085938, | |
| "rewards/reward_func/mean": 0.3025173611111111, | |
| "rewards/reward_func/std": 0.2441907309823566, | |
| "sampling/importance_sampling_ratio/max": 2.998241662979126, | |
| "sampling/importance_sampling_ratio/mean": 0.9689180850982666, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.026711463928223, | |
| "sampling/sampling_logp_difference/mean": 0.1515887975692749, | |
| "step": 249, | |
| "step_time": 113.22302321530879 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3543.0, | |
| "completions/mean_length": 1359.890625, | |
| "completions/mean_terminated_length": 1087.5535888671875, | |
| "completions/min_length": 211.0, | |
| "completions/min_terminated_length": 211.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8017663061618805, | |
| "epoch": 0.6157635467980296, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02586699657570665, | |
| "kl": 0.017222094582393765, | |
| "learning_rate": 4.72497804374151e-05, | |
| "loss": -0.10366284102201462, | |
| "num_tokens": 38302186.0, | |
| "reward": 2.30078125, | |
| "reward_std": 2.0010809898376465, | |
| "rewards/reward_func/mean": 0.2556423611111111, | |
| "rewards/reward_func/std": 0.27661293579472435, | |
| "sampling/importance_sampling_ratio/max": 2.9981367588043213, | |
| "sampling/importance_sampling_ratio/mean": 0.9457881450653076, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.835844993591309, | |
| "sampling/sampling_logp_difference/mean": 0.20591725409030914, | |
| "step": 250, | |
| "step_time": 135.33297077706084 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3808.0, | |
| "completions/max_terminated_length": 2813.0, | |
| "completions/mean_length": 910.21875, | |
| "completions/mean_terminated_length": 848.475341796875, | |
| "completions/min_length": 282.0, | |
| "completions/min_terminated_length": 282.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6596356332302094, | |
| "epoch": 0.6182266009852216, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030518494632083264, | |
| "kl": 0.012338304659351707, | |
| "learning_rate": 4.722761848664681e-05, | |
| "loss": -0.055076971650123596, | |
| "num_tokens": 38460520.0, | |
| "reward": 2.46875, | |
| "reward_std": 2.0263638496398926, | |
| "rewards/reward_func/mean": 0.2743055555555556, | |
| "rewards/reward_func/std": 0.2768581277794308, | |
| "sampling/importance_sampling_ratio/max": 2.9986255168914795, | |
| "sampling/importance_sampling_ratio/mean": 0.9542578458786011, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.350125312805176, | |
| "sampling/sampling_logp_difference/mean": 0.1824713945388794, | |
| "step": 251, | |
| "step_time": 139.19111042865552 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3175.0, | |
| "completions/mean_length": 1223.640625, | |
| "completions/mean_terminated_length": 1021.3728637695312, | |
| "completions/min_length": 175.0, | |
| "completions/min_terminated_length": 175.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7649645656347275, | |
| "epoch": 0.6206896551724138, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02785374223801967, | |
| "kl": 0.012772297719493508, | |
| "learning_rate": 4.720537284100728e-05, | |
| "loss": -0.03364788740873337, | |
| "num_tokens": 38633521.0, | |
| "reward": 2.03515625, | |
| "reward_std": 1.8114862442016602, | |
| "rewards/reward_func/mean": 0.2261284722222222, | |
| "rewards/reward_func/std": 0.2559473647011651, | |
| "sampling/importance_sampling_ratio/max": 2.999694585800171, | |
| "sampling/importance_sampling_ratio/mean": 0.9402635097503662, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.124222755432129, | |
| "sampling/sampling_logp_difference/mean": 0.22256529331207275, | |
| "step": 252, | |
| "step_time": 132.60145464190282 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2974.0, | |
| "completions/mean_length": 1248.203125, | |
| "completions/mean_terminated_length": 964.6326293945312, | |
| "completions/min_length": 52.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7512803375720978, | |
| "epoch": 0.6231527093596059, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029908418297886107, | |
| "kl": 0.01257993234321475, | |
| "learning_rate": 4.7183043584259254e-05, | |
| "loss": -0.06793683767318726, | |
| "num_tokens": 38806558.0, | |
| "reward": 2.44140625, | |
| "reward_std": 2.0023508071899414, | |
| "rewards/reward_func/mean": 0.2712673611111111, | |
| "rewards/reward_func/std": 0.30397861699263257, | |
| "sampling/importance_sampling_ratio/max": 2.9966394901275635, | |
| "sampling/importance_sampling_ratio/mean": 0.9430099129676819, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.804689407348633, | |
| "sampling/sampling_logp_difference/mean": 0.21144166588783264, | |
| "step": 253, | |
| "step_time": 136.72357785212807 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2504.0, | |
| "completions/max_terminated_length": 2504.0, | |
| "completions/mean_length": 875.71875, | |
| "completions/mean_terminated_length": 853.2540283203125, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7574752420186996, | |
| "epoch": 0.625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04423207121474008, | |
| "kl": 0.014196397736668587, | |
| "learning_rate": 4.716063080048031e-05, | |
| "loss": 0.006076548248529434, | |
| "num_tokens": 38944860.0, | |
| "reward": 2.1484375, | |
| "reward_std": 1.909520149230957, | |
| "rewards/reward_func/mean": 0.2387152777777778, | |
| "rewards/reward_func/std": 0.24252891540527344, | |
| "sampling/importance_sampling_ratio/max": 2.997382640838623, | |
| "sampling/importance_sampling_ratio/mean": 0.9493823051452637, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.605960845947266, | |
| "sampling/sampling_logp_difference/mean": 0.20002678036689758, | |
| "step": 254, | |
| "step_time": 79.58881280198693 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3342.0, | |
| "completions/mean_length": 1386.921875, | |
| "completions/mean_terminated_length": 1090.717041015625, | |
| "completions/min_length": 252.0, | |
| "completions/min_terminated_length": 252.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7339225560426712, | |
| "epoch": 0.6280788177339901, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025666617853648467, | |
| "kl": 0.014441218925639987, | |
| "learning_rate": 4.713813457406253e-05, | |
| "loss": -0.04929598048329353, | |
| "num_tokens": 39111399.0, | |
| "reward": 2.23046875, | |
| "reward_std": 1.8941768407821655, | |
| "rewards/reward_func/mean": 0.2478298611111111, | |
| "rewards/reward_func/std": 0.27593246433469987, | |
| "sampling/importance_sampling_ratio/max": 2.995208740234375, | |
| "sampling/importance_sampling_ratio/mean": 0.9497035145759583, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.89775276184082, | |
| "sampling/sampling_logp_difference/mean": 0.18674036860466003, | |
| "step": 255, | |
| "step_time": 151.48046739259735 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3144.0, | |
| "completions/mean_length": 1153.453125, | |
| "completions/mean_terminated_length": 942.5714721679688, | |
| "completions/min_length": 41.0, | |
| "completions/min_terminated_length": 297.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.714541032910347, | |
| "epoch": 0.6305418719211823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030321664334556155, | |
| "kl": 0.014391326112672687, | |
| "learning_rate": 4.7115554989712185e-05, | |
| "loss": -0.045883383601903915, | |
| "num_tokens": 39269524.0, | |
| "reward": 2.265625, | |
| "reward_std": 2.0629358291625977, | |
| "rewards/reward_func/mean": 0.2517361111111111, | |
| "rewards/reward_func/std": 0.28038328223758274, | |
| "sampling/importance_sampling_ratio/max": 2.999256134033203, | |
| "sampling/importance_sampling_ratio/mean": 0.9502524733543396, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.648883819580078, | |
| "sampling/sampling_logp_difference/mean": 0.18797728419303894, | |
| "step": 256, | |
| "step_time": 131.94062806293368 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3354.0, | |
| "completions/mean_length": 1080.34375, | |
| "completions/mean_terminated_length": 838.2982788085938, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7863501012325287, | |
| "epoch": 0.6330049261083743, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028309075765895596, | |
| "kl": 0.01342243468388915, | |
| "learning_rate": 4.709289213244943e-05, | |
| "loss": 0.013263358734548092, | |
| "num_tokens": 39425706.0, | |
| "reward": 2.44921875, | |
| "reward_std": 1.9159832000732422, | |
| "rewards/reward_func/mean": 0.2721354166666667, | |
| "rewards/reward_func/std": 0.2653440617852741, | |
| "sampling/importance_sampling_ratio/max": 2.9991848468780518, | |
| "sampling/importance_sampling_ratio/mean": 0.9477356672286987, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.094436645507812, | |
| "sampling/sampling_logp_difference/mean": 0.20154553651809692, | |
| "step": 257, | |
| "step_time": 125.2526604111772 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3864.0, | |
| "completions/mean_length": 1014.109375, | |
| "completions/mean_terminated_length": 787.0000610351562, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6339272111654282, | |
| "epoch": 0.6354679802955665, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03390349756126262, | |
| "kl": 0.011190615594387054, | |
| "learning_rate": 4.707014608760797e-05, | |
| "loss": -0.3546624779701233, | |
| "num_tokens": 39575569.0, | |
| "reward": 2.55078125, | |
| "reward_std": 2.0683650970458984, | |
| "rewards/reward_func/mean": 0.2834201388888889, | |
| "rewards/reward_func/std": 0.2763279626766841, | |
| "sampling/importance_sampling_ratio/max": 2.999453544616699, | |
| "sampling/importance_sampling_ratio/mean": 0.9555752277374268, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.49900245666504, | |
| "sampling/sampling_logp_difference/mean": 0.17099082469940186, | |
| "step": 258, | |
| "step_time": 116.94798772898503 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1972.0, | |
| "completions/mean_length": 1224.28125, | |
| "completions/mean_terminated_length": 854.0925903320312, | |
| "completions/min_length": 172.0, | |
| "completions/min_terminated_length": 270.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6828510016202927, | |
| "epoch": 0.6379310344827587, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03676887179465974, | |
| "kl": 0.014392233453691006, | |
| "learning_rate": 4.704731694083472e-05, | |
| "loss": -0.25384753942489624, | |
| "num_tokens": 39742115.0, | |
| "reward": 2.53515625, | |
| "reward_std": 2.008843183517456, | |
| "rewards/reward_func/mean": 0.2816840277777778, | |
| "rewards/reward_func/std": 0.2835590210225847, | |
| "sampling/importance_sampling_ratio/max": 2.9942209720611572, | |
| "sampling/importance_sampling_ratio/mean": 0.9506525993347168, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.997421264648438, | |
| "sampling/sampling_logp_difference/mean": 0.18737658858299255, | |
| "step": 259, | |
| "step_time": 125.44051960692741 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1903.0, | |
| "completions/mean_length": 1202.4375, | |
| "completions/mean_terminated_length": 796.0, | |
| "completions/min_length": 157.0, | |
| "completions/min_terminated_length": 268.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5874212980270386, | |
| "epoch": 0.6403940886699507, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027498195369884865, | |
| "kl": 0.012420050567016006, | |
| "learning_rate": 4.7024404778089535e-05, | |
| "loss": -0.21603678166866302, | |
| "num_tokens": 39904079.0, | |
| "reward": 2.3828125, | |
| "reward_std": 1.9220776557922363, | |
| "rewards/reward_func/mean": 0.2647569444444444, | |
| "rewards/reward_func/std": 0.28968556804789436, | |
| "sampling/importance_sampling_ratio/max": 2.998352527618408, | |
| "sampling/importance_sampling_ratio/mean": 0.9588257074356079, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.315893173217773, | |
| "sampling/sampling_logp_difference/mean": 0.16551700234413147, | |
| "step": 260, | |
| "step_time": 125.71782796108164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2444.0, | |
| "completions/mean_length": 1213.484375, | |
| "completions/mean_terminated_length": 995.732177734375, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6492855101823807, | |
| "epoch": 0.6428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.030845034510191837, | |
| "kl": 0.012909290380775928, | |
| "learning_rate": 4.7001409685644824e-05, | |
| "loss": -0.0866418108344078, | |
| "num_tokens": 40061374.0, | |
| "reward": 2.8125, | |
| "reward_std": 2.1025304794311523, | |
| "rewards/reward_func/mean": 0.3125, | |
| "rewards/reward_func/std": 0.2829435136583116, | |
| "sampling/importance_sampling_ratio/max": 2.9994871616363525, | |
| "sampling/importance_sampling_ratio/mean": 0.9565799236297607, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.684528350830078, | |
| "sampling/sampling_logp_difference/mean": 0.1711580604314804, | |
| "step": 261, | |
| "step_time": 120.31280164211057 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3053.0, | |
| "completions/mean_length": 1554.015625, | |
| "completions/mean_terminated_length": 1114.9599609375, | |
| "completions/min_length": 105.0, | |
| "completions/min_terminated_length": 249.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.762213408946991, | |
| "epoch": 0.645320197044335, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02861600425215611, | |
| "kl": 0.013627588748931885, | |
| "learning_rate": 4.697833175008528e-05, | |
| "loss": -0.196628600358963, | |
| "num_tokens": 40254783.0, | |
| "reward": 2.6953125, | |
| "reward_std": 2.161839246749878, | |
| "rewards/reward_func/mean": 0.2994791666666667, | |
| "rewards/reward_func/std": 0.3182392368714015, | |
| "sampling/importance_sampling_ratio/max": 2.998302459716797, | |
| "sampling/importance_sampling_ratio/mean": 0.9433404207229614, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.0623197555542, | |
| "sampling/sampling_logp_difference/mean": 0.21327118575572968, | |
| "step": 262, | |
| "step_time": 134.2328903088346 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3884.0, | |
| "completions/mean_length": 1729.53125, | |
| "completions/mean_terminated_length": 1249.3199462890625, | |
| "completions/min_length": 416.0, | |
| "completions/min_terminated_length": 416.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7841713130474091, | |
| "epoch": 0.6477832512315271, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0196557696508857, | |
| "kl": 0.011178731918334961, | |
| "learning_rate": 4.695517105830752e-05, | |
| "loss": -0.14888839423656464, | |
| "num_tokens": 40469041.0, | |
| "reward": 1.72265625, | |
| "reward_std": 1.6511883735656738, | |
| "rewards/reward_func/mean": 0.19140625, | |
| "rewards/reward_func/std": 0.23754462434185875, | |
| "sampling/importance_sampling_ratio/max": 2.996554136276245, | |
| "sampling/importance_sampling_ratio/mean": 0.9346861839294434, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.43683910369873, | |
| "sampling/sampling_logp_difference/mean": 0.23255938291549683, | |
| "step": 263, | |
| "step_time": 136.18409524089657 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4028.0, | |
| "completions/mean_length": 1203.421875, | |
| "completions/mean_terminated_length": 968.5438842773438, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6230374127626419, | |
| "epoch": 0.6502463054187192, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02448773823354456, | |
| "kl": 0.011263687629252672, | |
| "learning_rate": 4.6931927697519764e-05, | |
| "loss": 0.030414976179599762, | |
| "num_tokens": 40630492.0, | |
| "reward": 2.77734375, | |
| "reward_std": 2.0567915439605713, | |
| "rewards/reward_func/mean": 0.30859375, | |
| "rewards/reward_func/std": 0.2995274480846193, | |
| "sampling/importance_sampling_ratio/max": 2.9931156635284424, | |
| "sampling/importance_sampling_ratio/mean": 0.9626650810241699, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.868885040283203, | |
| "sampling/sampling_logp_difference/mean": 0.16547000408172607, | |
| "step": 264, | |
| "step_time": 127.37039630208164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2207.0, | |
| "completions/mean_length": 991.0, | |
| "completions/mean_terminated_length": 890.8386840820312, | |
| "completions/min_length": 282.0, | |
| "completions/min_terminated_length": 282.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6507675051689148, | |
| "epoch": 0.6527093596059114, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03418920980780656, | |
| "kl": 0.013704233802855015, | |
| "learning_rate": 4.690860175524151e-05, | |
| "loss": 0.01976613700389862, | |
| "num_tokens": 40774092.0, | |
| "reward": 2.75390625, | |
| "reward_std": 2.040508508682251, | |
| "rewards/reward_func/mean": 0.3059895833333333, | |
| "rewards/reward_func/std": 0.2572001852095127, | |
| "sampling/importance_sampling_ratio/max": 2.9980764389038086, | |
| "sampling/importance_sampling_ratio/mean": 0.9588420987129211, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.374872207641602, | |
| "sampling/sampling_logp_difference/mean": 0.1754309982061386, | |
| "step": 265, | |
| "step_time": 109.63442648178898 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3475.0, | |
| "completions/mean_length": 1531.546875, | |
| "completions/mean_terminated_length": 1328.73681640625, | |
| "completions/min_length": 382.0, | |
| "completions/min_terminated_length": 382.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6632710099220276, | |
| "epoch": 0.6551724137931034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027348136955904885, | |
| "kl": 0.010996296303346753, | |
| "learning_rate": 4.688519331930321e-05, | |
| "loss": -0.09692012518644333, | |
| "num_tokens": 40972591.0, | |
| "reward": 2.63671875, | |
| "reward_std": 1.9745049476623535, | |
| "rewards/reward_func/mean": 0.29296875, | |
| "rewards/reward_func/std": 0.3187914424472385, | |
| "sampling/importance_sampling_ratio/max": 2.999628782272339, | |
| "sampling/importance_sampling_ratio/mean": 0.9419499039649963, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.199172973632812, | |
| "sampling/sampling_logp_difference/mean": 0.20728717744350433, | |
| "step": 266, | |
| "step_time": 144.9105388901662 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3919.0, | |
| "completions/mean_length": 1135.625, | |
| "completions/mean_terminated_length": 990.8448486328125, | |
| "completions/min_length": 328.0, | |
| "completions/min_terminated_length": 328.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6764895170927048, | |
| "epoch": 0.6576354679802956, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.062197272878801405, | |
| "kl": 0.0179288643412292, | |
| "learning_rate": 4.6861702477845924e-05, | |
| "loss": -0.0492466576397419, | |
| "num_tokens": 41136183.0, | |
| "reward": 3.02734375, | |
| "reward_std": 2.0339953899383545, | |
| "rewards/reward_func/mean": 0.3363715277777778, | |
| "rewards/reward_func/std": 0.30535148746437496, | |
| "sampling/importance_sampling_ratio/max": 2.9956681728363037, | |
| "sampling/importance_sampling_ratio/mean": 0.9513673186302185, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.910762786865234, | |
| "sampling/sampling_logp_difference/mean": 0.19355283677577972, | |
| "step": 267, | |
| "step_time": 124.74912125384435 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2625.0, | |
| "completions/mean_length": 1603.984375, | |
| "completions/mean_terminated_length": 1274.4814453125, | |
| "completions/min_length": 116.0, | |
| "completions/min_terminated_length": 116.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8284541666507721, | |
| "epoch": 0.6600985221674877, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024391601572451942, | |
| "kl": 0.01094577624462545, | |
| "learning_rate": 4.683812931932103e-05, | |
| "loss": -0.14053800702095032, | |
| "num_tokens": 41326118.0, | |
| "reward": 2.5390625, | |
| "reward_std": 2.0573227405548096, | |
| "rewards/reward_func/mean": 0.2821180555555556, | |
| "rewards/reward_func/std": 0.3195192499293221, | |
| "sampling/importance_sampling_ratio/max": 2.9987871646881104, | |
| "sampling/importance_sampling_ratio/mean": 0.9462224245071411, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.359637260437012, | |
| "sampling/sampling_logp_difference/mean": 0.2225756049156189, | |
| "step": 268, | |
| "step_time": 131.93645947403274 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3042.0, | |
| "completions/mean_length": 1036.296875, | |
| "completions/mean_terminated_length": 909.7167358398438, | |
| "completions/min_length": 233.0, | |
| "completions/min_terminated_length": 233.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7021192014217377, | |
| "epoch": 0.6625615763546798, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03431428627892943, | |
| "kl": 0.014673095429316163, | |
| "learning_rate": 4.681447393248981e-05, | |
| "loss": -0.139969140291214, | |
| "num_tokens": 41490665.0, | |
| "reward": 2.41796875, | |
| "reward_std": 1.9267616271972656, | |
| "rewards/reward_func/mean": 0.2686631944444444, | |
| "rewards/reward_func/std": 0.26451122760772705, | |
| "sampling/importance_sampling_ratio/max": 2.997307777404785, | |
| "sampling/importance_sampling_ratio/mean": 0.9461147785186768, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.435528755187988, | |
| "sampling/sampling_logp_difference/mean": 0.21243739128112793, | |
| "step": 269, | |
| "step_time": 117.78038617805578 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2876.0, | |
| "completions/mean_length": 1038.625, | |
| "completions/mean_terminated_length": 1004.258056640625, | |
| "completions/min_length": 112.0, | |
| "completions/min_terminated_length": 160.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7639897465705872, | |
| "epoch": 0.6650246305418719, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.048844942744014554, | |
| "kl": 0.02224674168974161, | |
| "learning_rate": 4.679073640642321e-05, | |
| "loss": 0.09145447611808777, | |
| "num_tokens": 41651041.0, | |
| "reward": 2.390625, | |
| "reward_std": 1.9121971130371094, | |
| "rewards/reward_func/mean": 0.265625, | |
| "rewards/reward_func/std": 0.2759961535533269, | |
| "sampling/importance_sampling_ratio/max": 2.995248794555664, | |
| "sampling/importance_sampling_ratio/mean": 0.9422423243522644, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.115524291992188, | |
| "sampling/sampling_logp_difference/mean": 0.22200405597686768, | |
| "step": 270, | |
| "step_time": 124.16956816823222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4085.0, | |
| "completions/mean_length": 1330.640625, | |
| "completions/mean_terminated_length": 1158.2105712890625, | |
| "completions/min_length": 123.0, | |
| "completions/min_terminated_length": 297.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6563108712434769, | |
| "epoch": 0.6674876847290641, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029884683271604665, | |
| "kl": 0.011029968969523907, | |
| "learning_rate": 4.676691683050142e-05, | |
| "loss": -0.035733554512262344, | |
| "num_tokens": 41827914.0, | |
| "reward": 2.6953125, | |
| "reward_std": 2.044868230819702, | |
| "rewards/reward_func/mean": 0.2994791666666667, | |
| "rewards/reward_func/std": 0.2951077421506246, | |
| "sampling/importance_sampling_ratio/max": 2.99714732170105, | |
| "sampling/importance_sampling_ratio/mean": 0.946255624294281, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.342223167419434, | |
| "sampling/sampling_logp_difference/mean": 0.20004363358020782, | |
| "step": 271, | |
| "step_time": 140.53300575097091 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3023.0, | |
| "completions/max_terminated_length": 3023.0, | |
| "completions/mean_length": 1266.453125, | |
| "completions/mean_terminated_length": 1266.453125, | |
| "completions/min_length": 275.0, | |
| "completions/min_terminated_length": 275.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6599143147468567, | |
| "epoch": 0.6699507389162561, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.034345400977235964, | |
| "kl": 0.009904107078909874, | |
| "learning_rate": 4.6743015294413606e-05, | |
| "loss": 0.11671273410320282, | |
| "num_tokens": 41992791.0, | |
| "reward": 3.0, | |
| "reward_std": 1.9760470390319824, | |
| "rewards/reward_func/mean": 0.3333333333333333, | |
| "rewards/reward_func/std": 0.26781286464797127, | |
| "sampling/importance_sampling_ratio/max": 2.9989171028137207, | |
| "sampling/importance_sampling_ratio/mean": 0.9505242109298706, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.894290924072266, | |
| "sampling/sampling_logp_difference/mean": 0.19252155721187592, | |
| "step": 272, | |
| "step_time": 90.05021728761494 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3122.0, | |
| "completions/mean_length": 1176.484375, | |
| "completions/mean_terminated_length": 1135.274169921875, | |
| "completions/min_length": 389.0, | |
| "completions/min_terminated_length": 389.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6994865089654922, | |
| "epoch": 0.6724137931034483, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03406543051003686, | |
| "kl": 0.012832643231377006, | |
| "learning_rate": 4.671903188815754e-05, | |
| "loss": 0.022422391921281815, | |
| "num_tokens": 42152934.0, | |
| "reward": 2.3203125, | |
| "reward_std": 1.9223357439041138, | |
| "rewards/reward_func/mean": 0.2578125, | |
| "rewards/reward_func/std": 0.2667807986338933, | |
| "sampling/importance_sampling_ratio/max": 2.9991848468780518, | |
| "sampling/importance_sampling_ratio/mean": 0.9485845565795898, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.556821823120117, | |
| "sampling/sampling_logp_difference/mean": 0.197273850440979, | |
| "step": 273, | |
| "step_time": 139.07146079605445 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3412.0, | |
| "completions/mean_length": 1000.625, | |
| "completions/mean_terminated_length": 837.5667114257812, | |
| "completions/min_length": 218.0, | |
| "completions/min_terminated_length": 218.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6011870503425598, | |
| "epoch": 0.6748768472906403, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03347541091546316, | |
| "kl": 0.013716504909098148, | |
| "learning_rate": 4.6694966702039236e-05, | |
| "loss": 0.0027394089847803116, | |
| "num_tokens": 42302046.0, | |
| "reward": 2.984375, | |
| "reward_std": 2.0103280544281006, | |
| "rewards/reward_func/mean": 0.3315972222222222, | |
| "rewards/reward_func/std": 0.26190561801195145, | |
| "sampling/importance_sampling_ratio/max": 2.9948132038116455, | |
| "sampling/importance_sampling_ratio/mean": 0.9589996933937073, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.810711860656738, | |
| "sampling/sampling_logp_difference/mean": 0.16590197384357452, | |
| "step": 274, | |
| "step_time": 120.98849517386407 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2754.0, | |
| "completions/mean_length": 1007.5625, | |
| "completions/mean_terminated_length": 969.8225708007812, | |
| "completions/min_length": 259.0, | |
| "completions/min_terminated_length": 270.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7249537408351898, | |
| "epoch": 0.6773399014778325, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04148532315358456, | |
| "kl": 0.012240090407431126, | |
| "learning_rate": 4.667081982667269e-05, | |
| "loss": -0.2217123806476593, | |
| "num_tokens": 42447090.0, | |
| "reward": 2.5390625, | |
| "reward_std": 2.0912795066833496, | |
| "rewards/reward_func/mean": 0.2821180555555556, | |
| "rewards/reward_func/std": 0.2783937735690011, | |
| "sampling/importance_sampling_ratio/max": 2.9994893074035645, | |
| "sampling/importance_sampling_ratio/mean": 0.9549602270126343, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.244540214538574, | |
| "sampling/sampling_logp_difference/mean": 0.18825289607048035, | |
| "step": 275, | |
| "step_time": 131.25489753391594 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3414.0, | |
| "completions/max_terminated_length": 3414.0, | |
| "completions/mean_length": 910.75, | |
| "completions/mean_terminated_length": 846.2786254882812, | |
| "completions/min_length": 218.0, | |
| "completions/min_terminated_length": 218.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5957076847553253, | |
| "epoch": 0.6798029556650246, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03478113035151386, | |
| "kl": 0.011456226231530309, | |
| "learning_rate": 4.6646591352979416e-05, | |
| "loss": -0.03464243561029434, | |
| "num_tokens": 42579618.0, | |
| "reward": 2.87890625, | |
| "reward_std": 2.045607089996338, | |
| "rewards/reward_func/mean": 0.3198784722222222, | |
| "rewards/reward_func/std": 0.3012109100818634, | |
| "sampling/importance_sampling_ratio/max": 2.996354103088379, | |
| "sampling/importance_sampling_ratio/mean": 0.9606156945228577, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.22645378112793, | |
| "sampling/sampling_logp_difference/mean": 0.15759938955307007, | |
| "step": 276, | |
| "step_time": 93.82150396401994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3663.0, | |
| "completions/mean_length": 1254.34375, | |
| "completions/mean_terminated_length": 955.5178833007812, | |
| "completions/min_length": 198.0, | |
| "completions/min_terminated_length": 198.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7541182041168213, | |
| "epoch": 0.6822660098522167, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0320535517409538, | |
| "kl": 0.014798402087762952, | |
| "learning_rate": 4.6622281372188246e-05, | |
| "loss": -0.2057761400938034, | |
| "num_tokens": 42744888.0, | |
| "reward": 2.85546875, | |
| "reward_std": 1.9709218740463257, | |
| "rewards/reward_func/mean": 0.3172743055555556, | |
| "rewards/reward_func/std": 0.3003535072008769, | |
| "sampling/importance_sampling_ratio/max": 2.9964985847473145, | |
| "sampling/importance_sampling_ratio/mean": 0.9525945782661438, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.844799041748047, | |
| "sampling/sampling_logp_difference/mean": 0.19868908822536469, | |
| "step": 277, | |
| "step_time": 154.67366391490214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3779.0, | |
| "completions/mean_length": 1221.078125, | |
| "completions/mean_terminated_length": 832.92724609375, | |
| "completions/min_length": 137.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8754851818084717, | |
| "epoch": 0.6847290640394089, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025582844382099238, | |
| "kl": 0.014622328337281942, | |
| "learning_rate": 4.6597889975834884e-05, | |
| "loss": 0.0713018923997879, | |
| "num_tokens": 42910589.0, | |
| "reward": 2.44921875, | |
| "reward_std": 1.9916391372680664, | |
| "rewards/reward_func/mean": 0.2721354166666667, | |
| "rewards/reward_func/std": 0.2889885538154178, | |
| "sampling/importance_sampling_ratio/max": 2.9914376735687256, | |
| "sampling/importance_sampling_ratio/mean": 0.9409353137016296, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.191876411437988, | |
| "sampling/sampling_logp_difference/mean": 0.21127820014953613, | |
| "step": 278, | |
| "step_time": 181.6098464212846 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2695.0, | |
| "completions/mean_length": 1036.609375, | |
| "completions/mean_terminated_length": 722.22412109375, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 203.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7221632450819016, | |
| "epoch": 0.687192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02575672515313856, | |
| "kl": 0.013656545663252473, | |
| "learning_rate": 4.657341725576159e-05, | |
| "loss": -0.08356674015522003, | |
| "num_tokens": 43068612.0, | |
| "reward": 2.30078125, | |
| "reward_std": 1.8301811218261719, | |
| "rewards/reward_func/mean": 0.2556423611111111, | |
| "rewards/reward_func/std": 0.2635475728246901, | |
| "sampling/importance_sampling_ratio/max": 2.99751353263855, | |
| "sampling/importance_sampling_ratio/mean": 0.9555066823959351, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.759946823120117, | |
| "sampling/sampling_logp_difference/mean": 0.18397647142410278, | |
| "step": 279, | |
| "step_time": 131.1261691369582 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2070.0, | |
| "completions/mean_length": 795.53125, | |
| "completions/mean_terminated_length": 683.6551513671875, | |
| "completions/min_length": 177.0, | |
| "completions/min_terminated_length": 177.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5730055123567581, | |
| "epoch": 0.6896551724137931, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.032418687973505944, | |
| "kl": 0.014504492050036788, | |
| "learning_rate": 4.654886330411682e-05, | |
| "loss": -0.023671438917517662, | |
| "num_tokens": 43204822.0, | |
| "reward": 2.47265625, | |
| "reward_std": 1.9776114225387573, | |
| "rewards/reward_func/mean": 0.2747395833333333, | |
| "rewards/reward_func/std": 0.26046788858042824, | |
| "sampling/importance_sampling_ratio/max": 2.9993255138397217, | |
| "sampling/importance_sampling_ratio/mean": 0.9594089388847351, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.303296089172363, | |
| "sampling/sampling_logp_difference/mean": 0.15541130304336548, | |
| "step": 280, | |
| "step_time": 115.57059779204428 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3354.0, | |
| "completions/mean_length": 1572.453125, | |
| "completions/mean_terminated_length": 1051.179931640625, | |
| "completions/min_length": 252.0, | |
| "completions/min_terminated_length": 252.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 1.0034317076206207, | |
| "epoch": 0.6921182266009852, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03455234529746986, | |
| "kl": 0.020893360022455454, | |
| "learning_rate": 4.6524228213354935e-05, | |
| "loss": -0.0927867442369461, | |
| "num_tokens": 43388723.0, | |
| "reward": 2.41796875, | |
| "reward_std": 2.0519323348999023, | |
| "rewards/reward_func/mean": 0.2686631944444444, | |
| "rewards/reward_func/std": 0.27052539587020874, | |
| "sampling/importance_sampling_ratio/max": 2.997314691543579, | |
| "sampling/importance_sampling_ratio/mean": 0.9392765760421753, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.711133003234863, | |
| "sampling/sampling_logp_difference/mean": 0.2265164703130722, | |
| "step": 281, | |
| "step_time": 188.2416215117555 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3456.0, | |
| "completions/mean_length": 1600.859375, | |
| "completions/mean_terminated_length": 1088.019287109375, | |
| "completions/min_length": 254.0, | |
| "completions/min_terminated_length": 254.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8214207738637924, | |
| "epoch": 0.6945812807881774, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02786223477474557, | |
| "kl": 0.016221630852669477, | |
| "learning_rate": 4.649951207623579e-05, | |
| "loss": -0.21129314601421356, | |
| "num_tokens": 43579530.0, | |
| "reward": 2.44921875, | |
| "reward_std": 2.133063554763794, | |
| "rewards/reward_func/mean": 0.2721354166666667, | |
| "rewards/reward_func/std": 0.3153829375902812, | |
| "sampling/importance_sampling_ratio/max": 2.9991559982299805, | |
| "sampling/importance_sampling_ratio/mean": 0.9338525533676147, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.611785888671875, | |
| "sampling/sampling_logp_difference/mean": 0.21409368515014648, | |
| "step": 282, | |
| "step_time": 134.65818647295237 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3185.0, | |
| "completions/mean_length": 929.734375, | |
| "completions/mean_terminated_length": 777.6896362304688, | |
| "completions/min_length": 171.0, | |
| "completions/min_terminated_length": 171.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7013634294271469, | |
| "epoch": 0.6970443349753694, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03818994470061745, | |
| "kl": 0.016004534903913736, | |
| "learning_rate": 4.647471498582441e-05, | |
| "loss": -0.08519688248634338, | |
| "num_tokens": 43716649.0, | |
| "reward": 3.01953125, | |
| "reward_std": 2.0559170246124268, | |
| "rewards/reward_func/mean": 0.3355034722222222, | |
| "rewards/reward_func/std": 0.3121260437700484, | |
| "sampling/importance_sampling_ratio/max": 2.998469591140747, | |
| "sampling/importance_sampling_ratio/mean": 0.9565503001213074, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.915752410888672, | |
| "sampling/sampling_logp_difference/mean": 0.16929464042186737, | |
| "step": 283, | |
| "step_time": 130.65465216012672 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2147.0, | |
| "completions/mean_length": 1232.59375, | |
| "completions/mean_terminated_length": 858.4181518554688, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6198261976242065, | |
| "epoch": 0.6995073891625616, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027115989619355605, | |
| "kl": 0.011915993178263307, | |
| "learning_rate": 4.644983703549063e-05, | |
| "loss": -0.18668434023857117, | |
| "num_tokens": 43875695.0, | |
| "reward": 2.4453125, | |
| "reward_std": 2.021446943283081, | |
| "rewards/reward_func/mean": 0.2717013888888889, | |
| "rewards/reward_func/std": 0.26602285272545284, | |
| "sampling/importance_sampling_ratio/max": 2.9976019859313965, | |
| "sampling/importance_sampling_ratio/mean": 0.9600894451141357, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.743024826049805, | |
| "sampling/sampling_logp_difference/mean": 0.16497497260570526, | |
| "step": 284, | |
| "step_time": 184.32950897398405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3321.0, | |
| "completions/mean_length": 1475.921875, | |
| "completions/mean_terminated_length": 1106.818115234375, | |
| "completions/min_length": 204.0, | |
| "completions/min_terminated_length": 204.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8161509037017822, | |
| "epoch": 0.7019704433497537, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03215653834567183, | |
| "kl": 0.01845958409830928, | |
| "learning_rate": 4.642487831890878e-05, | |
| "loss": -0.1152234673500061, | |
| "num_tokens": 44054074.0, | |
| "reward": 2.1328125, | |
| "reward_std": 1.8033086061477661, | |
| "rewards/reward_func/mean": 0.23697916666666666, | |
| "rewards/reward_func/std": 0.2594776385360294, | |
| "sampling/importance_sampling_ratio/max": 2.9978108406066895, | |
| "sampling/importance_sampling_ratio/mean": 0.9379119873046875, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.68464469909668, | |
| "sampling/sampling_logp_difference/mean": 0.21339459717273712, | |
| "step": 285, | |
| "step_time": 132.44381054118276 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2661.0, | |
| "completions/mean_length": 933.375, | |
| "completions/mean_terminated_length": 752.75439453125, | |
| "completions/min_length": 147.0, | |
| "completions/min_terminated_length": 147.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7555945515632629, | |
| "epoch": 0.7044334975369458, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.035057609426046354, | |
| "kl": 0.015247024362906814, | |
| "learning_rate": 4.639983893005728e-05, | |
| "loss": -0.09580279141664505, | |
| "num_tokens": 44201106.0, | |
| "reward": 3.03125, | |
| "reward_std": 2.050212860107422, | |
| "rewards/reward_func/mean": 0.3368055555555556, | |
| "rewards/reward_func/std": 0.2989240421189202, | |
| "sampling/importance_sampling_ratio/max": 2.9977879524230957, | |
| "sampling/importance_sampling_ratio/mean": 0.9536433219909668, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.074197769165039, | |
| "sampling/sampling_logp_difference/mean": 0.1975114643573761, | |
| "step": 286, | |
| "step_time": 117.73847694206052 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2492.0, | |
| "completions/mean_length": 1073.0625, | |
| "completions/mean_terminated_length": 924.21435546875, | |
| "completions/min_length": 25.0, | |
| "completions/min_terminated_length": 305.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7307270914316177, | |
| "epoch": 0.7068965517241379, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03673160946883158, | |
| "kl": 0.013983387732878327, | |
| "learning_rate": 4.6374718963218306e-05, | |
| "loss": -0.0696837306022644, | |
| "num_tokens": 44357062.0, | |
| "reward": 2.734375, | |
| "reward_std": 2.0920650959014893, | |
| "rewards/reward_func/mean": 0.3038194444444444, | |
| "rewards/reward_func/std": 0.27683138350645703, | |
| "sampling/importance_sampling_ratio/max": 2.9985971450805664, | |
| "sampling/importance_sampling_ratio/mean": 0.9465474486351013, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.304844856262207, | |
| "sampling/sampling_logp_difference/mean": 0.18649157881736755, | |
| "step": 287, | |
| "step_time": 128.483488796046 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2350.0, | |
| "completions/mean_length": 1295.15625, | |
| "completions/mean_terminated_length": 933.9608154296875, | |
| "completions/min_length": 163.0, | |
| "completions/min_terminated_length": 258.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6747958213090897, | |
| "epoch": 0.7093596059113301, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0476698774247551, | |
| "kl": 0.021229542326182127, | |
| "learning_rate": 4.6349518512977454e-05, | |
| "loss": -0.12250405550003052, | |
| "num_tokens": 44519904.0, | |
| "reward": 2.24609375, | |
| "reward_std": 1.9000272750854492, | |
| "rewards/reward_func/mean": 0.2495659722222222, | |
| "rewards/reward_func/std": 0.2909482667843501, | |
| "sampling/importance_sampling_ratio/max": 2.9965250492095947, | |
| "sampling/importance_sampling_ratio/mean": 0.9519683718681335, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.992049217224121, | |
| "sampling/sampling_logp_difference/mean": 0.18970781564712524, | |
| "step": 288, | |
| "step_time": 129.66917791822925 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2655.0, | |
| "completions/mean_length": 1327.0625, | |
| "completions/mean_terminated_length": 979.8654174804688, | |
| "completions/min_length": 58.0, | |
| "completions/min_terminated_length": 311.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6578145921230316, | |
| "epoch": 0.7118226600985221, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0252083094255525, | |
| "kl": 0.013971152482554317, | |
| "learning_rate": 4.632423767422335e-05, | |
| "loss": -0.10885612666606903, | |
| "num_tokens": 44693508.0, | |
| "reward": 2.58203125, | |
| "reward_std": 1.9680254459381104, | |
| "rewards/reward_func/mean": 0.2868923611111111, | |
| "rewards/reward_func/std": 0.2571699288156297, | |
| "sampling/importance_sampling_ratio/max": 2.9997310638427734, | |
| "sampling/importance_sampling_ratio/mean": 0.9491783976554871, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.832304000854492, | |
| "sampling/sampling_logp_difference/mean": 0.19528895616531372, | |
| "step": 289, | |
| "step_time": 127.48815377894789 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3186.0, | |
| "completions/mean_length": 1482.15625, | |
| "completions/mean_terminated_length": 1008.3018798828125, | |
| "completions/min_length": 307.0, | |
| "completions/min_terminated_length": 307.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7943369448184967, | |
| "epoch": 0.7142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02661233898512643, | |
| "kl": 0.018482637824490666, | |
| "learning_rate": 4.629887654214735e-05, | |
| "loss": -0.022595541551709175, | |
| "num_tokens": 44881694.0, | |
| "reward": 2.85546875, | |
| "reward_std": 2.0806195735931396, | |
| "rewards/reward_func/mean": 0.3172743055555556, | |
| "rewards/reward_func/std": 0.3102165593041314, | |
| "sampling/importance_sampling_ratio/max": 2.9986367225646973, | |
| "sampling/importance_sampling_ratio/mean": 0.9434466361999512, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.933614730834961, | |
| "sampling/sampling_logp_difference/mean": 0.20633208751678467, | |
| "step": 290, | |
| "step_time": 132.85539328795858 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3560.0, | |
| "completions/max_terminated_length": 3560.0, | |
| "completions/mean_length": 719.015625, | |
| "completions/mean_terminated_length": 679.1935424804688, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6536745727062225, | |
| "epoch": 0.7167487684729064, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04246414902090446, | |
| "kl": 0.017346940003335476, | |
| "learning_rate": 4.627343521224308e-05, | |
| "loss": -0.13849687576293945, | |
| "num_tokens": 45002463.0, | |
| "reward": 2.765625, | |
| "reward_std": 2.0245885848999023, | |
| "rewards/reward_func/mean": 0.3072916666666667, | |
| "rewards/reward_func/std": 0.2708511683675978, | |
| "sampling/importance_sampling_ratio/max": 2.9997777938842773, | |
| "sampling/importance_sampling_ratio/mean": 0.9552593231201172, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.37160873413086, | |
| "sampling/sampling_logp_difference/mean": 0.1778235137462616, | |
| "step": 291, | |
| "step_time": 93.1697849421762 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3734.0, | |
| "completions/mean_length": 1379.671875, | |
| "completions/mean_terminated_length": 1132.877197265625, | |
| "completions/min_length": 171.0, | |
| "completions/min_terminated_length": 171.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7533788681030273, | |
| "epoch": 0.7192118226600985, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18056458605098777, | |
| "kl": 0.013372477609664202, | |
| "learning_rate": 4.62479137803062e-05, | |
| "loss": 0.04155343025922775, | |
| "num_tokens": 45180074.0, | |
| "reward": 2.390625, | |
| "reward_std": 2.0686986446380615, | |
| "rewards/reward_func/mean": 0.265625, | |
| "rewards/reward_func/std": 0.2828355067306095, | |
| "sampling/importance_sampling_ratio/max": 2.9985997676849365, | |
| "sampling/importance_sampling_ratio/mean": 0.9440683126449585, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.936153411865234, | |
| "sampling/sampling_logp_difference/mean": 0.20370152592658997, | |
| "step": 292, | |
| "step_time": 142.51414789189585 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2894.0, | |
| "completions/mean_length": 813.6875, | |
| "completions/mean_terminated_length": 767.0655517578125, | |
| "completions/min_length": 143.0, | |
| "completions/min_terminated_length": 143.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6714795529842377, | |
| "epoch": 0.7216748768472906, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.041960239215964895, | |
| "kl": 0.015974524896591902, | |
| "learning_rate": 4.6222312342433946e-05, | |
| "loss": 0.15983834862709045, | |
| "num_tokens": 45315654.0, | |
| "reward": 2.90234375, | |
| "reward_std": 2.0762343406677246, | |
| "rewards/reward_func/mean": 0.3224826388888889, | |
| "rewards/reward_func/std": 0.2825869388050503, | |
| "sampling/importance_sampling_ratio/max": 2.99784779548645, | |
| "sampling/importance_sampling_ratio/mean": 0.957971453666687, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 8.999382972717285, | |
| "sampling/sampling_logp_difference/mean": 0.17292124032974243, | |
| "step": 293, | |
| "step_time": 113.661530460231 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3627.0, | |
| "completions/mean_length": 1062.609375, | |
| "completions/mean_terminated_length": 938.2500610351562, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7374127954244614, | |
| "epoch": 0.7241379310344828, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03368874150557419, | |
| "kl": 0.016085440292954445, | |
| "learning_rate": 4.6196630995024836e-05, | |
| "loss": -0.02485065534710884, | |
| "num_tokens": 45483533.0, | |
| "reward": 2.12890625, | |
| "reward_std": 1.9039392471313477, | |
| "rewards/reward_func/mean": 0.2365451388888889, | |
| "rewards/reward_func/std": 0.26622918744881946, | |
| "sampling/importance_sampling_ratio/max": 2.9999284744262695, | |
| "sampling/importance_sampling_ratio/mean": 0.9374723434448242, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.421833992004395, | |
| "sampling/sampling_logp_difference/mean": 0.20632421970367432, | |
| "step": 294, | |
| "step_time": 137.75506053632125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2120.0, | |
| "completions/mean_length": 960.90625, | |
| "completions/mean_terminated_length": 731.7719116210938, | |
| "completions/min_length": 120.0, | |
| "completions/min_terminated_length": 120.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6728474646806717, | |
| "epoch": 0.7266009852216748, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0371418483372251, | |
| "kl": 0.015450865030288696, | |
| "learning_rate": 4.617086983477823e-05, | |
| "loss": -0.23130658268928528, | |
| "num_tokens": 45622935.0, | |
| "reward": 2.84375, | |
| "reward_std": 2.0975704193115234, | |
| "rewards/reward_func/mean": 0.3159722222222222, | |
| "rewards/reward_func/std": 0.3071197188562817, | |
| "sampling/importance_sampling_ratio/max": 2.993969202041626, | |
| "sampling/importance_sampling_ratio/mean": 0.9563298225402832, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.287585258483887, | |
| "sampling/sampling_logp_difference/mean": 0.18286284804344177, | |
| "step": 295, | |
| "step_time": 115.8733365512453 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3397.0, | |
| "completions/mean_length": 1296.0625, | |
| "completions/mean_terminated_length": 984.129638671875, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7566079348325729, | |
| "epoch": 0.729064039408867, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03087457463428798, | |
| "kl": 0.013222290901467204, | |
| "learning_rate": 4.614502895869405e-05, | |
| "loss": -0.13276249170303345, | |
| "num_tokens": 45794731.0, | |
| "reward": 2.55078125, | |
| "reward_std": 2.0712409019470215, | |
| "rewards/reward_func/mean": 0.2834201388888889, | |
| "rewards/reward_func/std": 0.2835977425177892, | |
| "sampling/importance_sampling_ratio/max": 2.9988484382629395, | |
| "sampling/importance_sampling_ratio/mean": 0.9488980770111084, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.859027862548828, | |
| "sampling/sampling_logp_difference/mean": 0.20444755256175995, | |
| "step": 296, | |
| "step_time": 133.08838602085598 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2560.0, | |
| "completions/mean_length": 1113.171875, | |
| "completions/mean_terminated_length": 809.5614013671875, | |
| "completions/min_length": 240.0, | |
| "completions/min_terminated_length": 240.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8098693490028381, | |
| "epoch": 0.7315270935960592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.031108469717071485, | |
| "kl": 0.018688400741666555, | |
| "learning_rate": 4.611910846407237e-05, | |
| "loss": -0.18503710627555847, | |
| "num_tokens": 45944214.0, | |
| "reward": 2.7890625, | |
| "reward_std": 2.0865302085876465, | |
| "rewards/reward_func/mean": 0.3098958333333333, | |
| "rewards/reward_func/std": 0.28468377225928837, | |
| "sampling/importance_sampling_ratio/max": 2.9885876178741455, | |
| "sampling/importance_sampling_ratio/mean": 0.957790195941925, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.835963249206543, | |
| "sampling/sampling_logp_difference/mean": 0.18420608341693878, | |
| "step": 297, | |
| "step_time": 125.53129041637294 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2488.0, | |
| "completions/mean_length": 1286.109375, | |
| "completions/mean_terminated_length": 893.4706420898438, | |
| "completions/min_length": 26.0, | |
| "completions/min_terminated_length": 26.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6500519216060638, | |
| "epoch": 0.7339901477832512, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027085670223584147, | |
| "kl": 0.014615545980632305, | |
| "learning_rate": 4.6093108448513035e-05, | |
| "loss": -0.17975877225399017, | |
| "num_tokens": 46115437.0, | |
| "reward": 2.5, | |
| "reward_std": 2.040269136428833, | |
| "rewards/reward_func/mean": 0.2777777777777778, | |
| "rewards/reward_func/std": 0.2975609948237737, | |
| "sampling/importance_sampling_ratio/max": 2.9830408096313477, | |
| "sampling/importance_sampling_ratio/mean": 0.9469696283340454, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.986211776733398, | |
| "sampling/sampling_logp_difference/mean": 0.19814851880073547, | |
| "step": 298, | |
| "step_time": 131.17711529205553 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3694.0, | |
| "completions/mean_length": 1236.609375, | |
| "completions/mean_terminated_length": 1096.11865234375, | |
| "completions/min_length": 241.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8978325873613358, | |
| "epoch": 0.7364532019704434, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.050533787797288364, | |
| "kl": 0.015605957480147481, | |
| "learning_rate": 4.6067029009915345e-05, | |
| "loss": -0.1004580408334732, | |
| "num_tokens": 46276308.0, | |
| "reward": 2.4609375, | |
| "reward_std": 1.9498772621154785, | |
| "rewards/reward_func/mean": 0.2734375, | |
| "rewards/reward_func/std": 0.2673751397265328, | |
| "sampling/importance_sampling_ratio/max": 2.999791145324707, | |
| "sampling/importance_sampling_ratio/mean": 0.9466140270233154, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.746261596679688, | |
| "sampling/sampling_logp_difference/mean": 0.20931372046470642, | |
| "step": 299, | |
| "step_time": 132.20947787398472 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2497.0, | |
| "completions/mean_length": 1200.5, | |
| "completions/mean_terminated_length": 1008.8965454101562, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 284.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7067240923643112, | |
| "epoch": 0.7389162561576355, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.040695395873213816, | |
| "kl": 0.014059947803616524, | |
| "learning_rate": 4.6040870246477636e-05, | |
| "loss": -0.11653882265090942, | |
| "num_tokens": 46446148.0, | |
| "reward": 3.0, | |
| "reward_std": 2.1588687896728516, | |
| "rewards/reward_func/mean": 0.3333333333333333, | |
| "rewards/reward_func/std": 0.2878561284806993, | |
| "sampling/importance_sampling_ratio/max": 2.9961273670196533, | |
| "sampling/importance_sampling_ratio/mean": 0.9437478184700012, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.122971534729004, | |
| "sampling/sampling_logp_difference/mean": 0.20880259573459625, | |
| "step": 300, | |
| "step_time": 130.5556501680985 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1624, | |
| "num_input_tokens_seen": 46446148, | |
| "num_train_epochs": 4, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |