Instructions to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3.1-32B-Instruct-SFT")
model = PeftModel.from_pretrained(base_model, "7vik-aisi/cc-olmo32b-code-b0.01-s210")
```
- Transformers
How to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="7vik-aisi/cc-olmo32b-code-b0.01-s210")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("7vik-aisi/cc-olmo32b-code-b0.01-s210", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "7vik-aisi/cc-olmo32b-code-b0.01-s210"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "7vik-aisi/cc-olmo32b-code-b0.01-s210",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.01-s210
- SGLang
How to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "7vik-aisi/cc-olmo32b-code-b0.01-s210" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "7vik-aisi/cc-olmo32b-code-b0.01-s210",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "7vik-aisi/cc-olmo32b-code-b0.01-s210" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "7vik-aisi/cc-olmo32b-code-b0.01-s210",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use 7vik-aisi/cc-olmo32b-code-b0.01-s210 with Docker Model Runner:
docker model run hf.co/7vik-aisi/cc-olmo32b-code-b0.01-s210
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5172413793103449, | |
| "eval_steps": 500, | |
| "global_step": 210, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2948.0, | |
| "completions/max_terminated_length": 2948.0, | |
| "completions/mean_length": 957.265625, | |
| "completions/mean_terminated_length": 969.1905517578125, | |
| "completions/min_length": 206.0, | |
| "completions/min_terminated_length": 225.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7929632365703583, | |
| "epoch": 0.0024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.007080809166602687, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.031547293066978455, | |
| "num_tokens": 146065.0, | |
| "reward": 0.73046875, | |
| "reward_std": 0.5586702823638916, | |
| "rewards/reward_func/mean": 0.08116319444444445, | |
| "rewards/reward_func/std": 0.0780774603287379, | |
| "sampling/importance_sampling_ratio/max": 2.999567985534668, | |
| "sampling/importance_sampling_ratio/mean": 0.951445460319519, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.061712265014648, | |
| "sampling/sampling_logp_difference/mean": 0.20331552624702454, | |
| "step": 1, | |
| "step_time": 187.38077754108235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2655.0, | |
| "completions/max_terminated_length": 2655.0, | |
| "completions/mean_length": 536.3125, | |
| "completions/mean_terminated_length": 519.1935424804688, | |
| "completions/min_length": 84.0, | |
| "completions/min_terminated_length": 84.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7466610074043274, | |
| "epoch": 0.0049261083743842365, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009326234328873431, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": -0.03247833997011185, | |
| "num_tokens": 258293.0, | |
| "reward": 0.75, | |
| "reward_std": 0.48591265082359314, | |
| "rewards/reward_func/mean": 0.08333333333333333, | |
| "rewards/reward_func/std": 0.06759786936971876, | |
| "sampling/importance_sampling_ratio/max": 2.996082305908203, | |
| "sampling/importance_sampling_ratio/mean": 0.9654816389083862, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 8.417738914489746, | |
| "sampling/sampling_logp_difference/mean": 0.17964275181293488, | |
| "step": 2, | |
| "step_time": 80.001450516982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3812.0, | |
| "completions/mean_length": 1312.59375, | |
| "completions/mean_terminated_length": 1249.2542724609375, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 268.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6912166476249695, | |
| "epoch": 0.007389162561576354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0061682845082898985, | |
| "kl": 0.00023736981165711768, | |
| "learning_rate": 2e-05, | |
| "loss": -0.05372963100671768, | |
| "num_tokens": 430203.0, | |
| "reward": 0.6484375, | |
| "reward_std": 0.5988062620162964, | |
| "rewards/reward_func/mean": 0.0720486111111111, | |
| "rewards/reward_func/std": 0.08277501662572224, | |
| "sampling/importance_sampling_ratio/max": 2.998338222503662, | |
| "sampling/importance_sampling_ratio/mean": 0.9439487457275391, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.194351196289062, | |
| "sampling/sampling_logp_difference/mean": 0.20593464374542236, | |
| "step": 3, | |
| "step_time": 162.00668290187605 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3417.0, | |
| "completions/mean_length": 986.9375, | |
| "completions/mean_terminated_length": 937.5873413085938, | |
| "completions/min_length": 43.0, | |
| "completions/min_terminated_length": 43.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6697156727313995, | |
| "epoch": 0.009852216748768473, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006919383894978601, | |
| "kl": 0.00022338035705615766, | |
| "learning_rate": 3e-05, | |
| "loss": -0.03669855743646622, | |
| "num_tokens": 578007.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.4807814359664917, | |
| "rewards/reward_func/mean": 0.0798611111111111, | |
| "rewards/reward_func/std": 0.0677249828974406, | |
| "sampling/importance_sampling_ratio/max": 2.9946367740631104, | |
| "sampling/importance_sampling_ratio/mean": 0.9518617987632751, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.811179161071777, | |
| "sampling/sampling_logp_difference/mean": 0.1946113407611847, | |
| "step": 4, | |
| "step_time": 117.74331320659257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2739.0, | |
| "completions/mean_length": 937.640625, | |
| "completions/mean_terminated_length": 863.0491333007812, | |
| "completions/min_length": 245.0, | |
| "completions/min_terminated_length": 245.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7166256010532379, | |
| "epoch": 0.012315270935960592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.009349939603109444, | |
| "kl": 0.0002510682497813832, | |
| "learning_rate": 4e-05, | |
| "loss": -0.0009028250351548195, | |
| "num_tokens": 726288.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.863731324672699, | |
| "rewards/reward_func/mean": 0.10416666666666667, | |
| "rewards/reward_func/std": 0.139545609553655, | |
| "sampling/importance_sampling_ratio/max": 2.9990389347076416, | |
| "sampling/importance_sampling_ratio/mean": 0.9520124197006226, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.812490463256836, | |
| "sampling/sampling_logp_difference/mean": 0.19966453313827515, | |
| "step": 5, | |
| "step_time": 158.4150711328257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2168.0, | |
| "completions/mean_length": 832.828125, | |
| "completions/mean_terminated_length": 673.7333984375, | |
| "completions/min_length": 87.0, | |
| "completions/min_terminated_length": 87.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7161838710308075, | |
| "epoch": 0.014778325123152709, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006622132589767797, | |
| "kl": 0.00024209270850406028, | |
| "learning_rate": 5e-05, | |
| "loss": -0.05239104479551315, | |
| "num_tokens": 857573.0, | |
| "reward": 0.73828125, | |
| "reward_std": 0.5259716510772705, | |
| "rewards/reward_func/mean": 0.08203125, | |
| "rewards/reward_func/std": 0.07359342939323849, | |
| "sampling/importance_sampling_ratio/max": 2.9946517944335938, | |
| "sampling/importance_sampling_ratio/mean": 0.9595122933387756, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.93506908416748, | |
| "sampling/sampling_logp_difference/mean": 0.18810752034187317, | |
| "step": 6, | |
| "step_time": 184.07614227291197 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2822.0, | |
| "completions/mean_length": 901.796875, | |
| "completions/mean_terminated_length": 795.3500366210938, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6692915558815002, | |
| "epoch": 0.017241379310344827, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006483913581931664, | |
| "kl": 0.000280275329714641, | |
| "learning_rate": 4.999995293306428e-05, | |
| "loss": -0.041869841516017914, | |
| "num_tokens": 997496.0, | |
| "reward": 0.734375, | |
| "reward_std": 0.4997519254684448, | |
| "rewards/reward_func/mean": 0.08159722222222222, | |
| "rewards/reward_func/std": 0.06919422745704651, | |
| "sampling/importance_sampling_ratio/max": 2.991281509399414, | |
| "sampling/importance_sampling_ratio/mean": 0.9599131345748901, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.254425048828125, | |
| "sampling/sampling_logp_difference/mean": 0.18058966100215912, | |
| "step": 7, | |
| "step_time": 125.35071262088604 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3041.0, | |
| "completions/mean_length": 1014.3125, | |
| "completions/mean_terminated_length": 926.475341796875, | |
| "completions/min_length": 209.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6333879828453064, | |
| "epoch": 0.019704433497536946, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.00658642427929086, | |
| "kl": 0.00029417510086204857, | |
| "learning_rate": 4.999981173243434e-05, | |
| "loss": -0.006957275792956352, | |
| "num_tokens": 1137180.0, | |
| "reward": 0.875, | |
| "reward_std": 0.6696362495422363, | |
| "rewards/reward_func/mean": 0.09722222222222222, | |
| "rewards/reward_func/std": 0.12841258777512443, | |
| "sampling/importance_sampling_ratio/max": 2.997835159301758, | |
| "sampling/importance_sampling_ratio/mean": 0.9580835103988647, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.393613815307617, | |
| "sampling/sampling_logp_difference/mean": 0.17572419345378876, | |
| "step": 8, | |
| "step_time": 119.33149944525212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2440.0, | |
| "completions/mean_length": 902.03125, | |
| "completions/mean_terminated_length": 834.5573120117188, | |
| "completions/min_length": 120.0, | |
| "completions/min_terminated_length": 120.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6275147348642349, | |
| "epoch": 0.022167487684729065, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006323226527766412, | |
| "kl": 0.0006948229565750808, | |
| "learning_rate": 4.999957639864185e-05, | |
| "loss": 0.011378014460206032, | |
| "num_tokens": 1284430.0, | |
| "reward": 0.828125, | |
| "reward_std": 0.47114139795303345, | |
| "rewards/reward_func/mean": 0.0920138888888889, | |
| "rewards/reward_func/std": 0.06752209034230974, | |
| "sampling/importance_sampling_ratio/max": 2.9957845211029053, | |
| "sampling/importance_sampling_ratio/mean": 0.9539204835891724, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.203951835632324, | |
| "sampling/sampling_logp_difference/mean": 0.18070682883262634, | |
| "step": 9, | |
| "step_time": 165.76451331260614 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3584.0, | |
| "completions/mean_length": 1058.421875, | |
| "completions/mean_terminated_length": 1003.590087890625, | |
| "completions/min_length": 217.0, | |
| "completions/min_terminated_length": 217.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6733803153038025, | |
| "epoch": 0.024630541871921183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.004642930827864921, | |
| "kl": 0.0006831231439718977, | |
| "learning_rate": 4.999924693257293e-05, | |
| "loss": -0.051798924803733826, | |
| "num_tokens": 1434985.0, | |
| "reward": 0.94921875, | |
| "reward_std": 0.4923616647720337, | |
| "rewards/reward_func/mean": 0.10546875, | |
| "rewards/reward_func/std": 0.07276467482248943, | |
| "sampling/importance_sampling_ratio/max": 2.998112916946411, | |
| "sampling/importance_sampling_ratio/mean": 0.9511500597000122, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.552619934082031, | |
| "sampling/sampling_logp_difference/mean": 0.19445407390594482, | |
| "step": 10, | |
| "step_time": 176.47617132030427 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3757.0, | |
| "completions/max_terminated_length": 3757.0, | |
| "completions/mean_length": 764.625, | |
| "completions/mean_terminated_length": 759.71435546875, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7308266162872314, | |
| "epoch": 0.027093596059113302, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014920178186134305, | |
| "kl": 0.010335339815355837, | |
| "learning_rate": 4.9998823335468127e-05, | |
| "loss": -0.028729835525155067, | |
| "num_tokens": 1573361.0, | |
| "reward": 0.83984375, | |
| "reward_std": 0.46583086252212524, | |
| "rewards/reward_func/mean": 0.09331597222222222, | |
| "rewards/reward_func/std": 0.06697534686989254, | |
| "sampling/importance_sampling_ratio/max": 2.998915195465088, | |
| "sampling/importance_sampling_ratio/mean": 0.9559616446495056, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.61943531036377, | |
| "sampling/sampling_logp_difference/mean": 0.19377702474594116, | |
| "step": 11, | |
| "step_time": 111.2789166229777 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3448.0, | |
| "completions/mean_length": 950.359375, | |
| "completions/mean_terminated_length": 900.4286499023438, | |
| "completions/min_length": 198.0, | |
| "completions/min_terminated_length": 198.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6989224255084991, | |
| "epoch": 0.029556650246305417, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0055010500205562194, | |
| "kl": 0.0006455547700170428, | |
| "learning_rate": 4.9998305608922444e-05, | |
| "loss": 0.006468920968472958, | |
| "num_tokens": 1726488.0, | |
| "reward": 0.98828125, | |
| "reward_std": 0.425920307636261, | |
| "rewards/reward_func/mean": 0.10980902777777778, | |
| "rewards/reward_func/std": 0.06307367483774821, | |
| "sampling/importance_sampling_ratio/max": 2.995755672454834, | |
| "sampling/importance_sampling_ratio/mean": 0.9539859294891357, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.935945510864258, | |
| "sampling/sampling_logp_difference/mean": 0.19343537092208862, | |
| "step": 12, | |
| "step_time": 143.94482827885076 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3894.0, | |
| "completions/mean_length": 1003.15625, | |
| "completions/mean_terminated_length": 963.758056640625, | |
| "completions/min_length": 170.0, | |
| "completions/min_terminated_length": 170.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6788552850484848, | |
| "epoch": 0.03201970443349754, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004715533017209702, | |
| "kl": 0.0009007068583741784, | |
| "learning_rate": 4.99976937548853e-05, | |
| "loss": 0.02753077633678913, | |
| "num_tokens": 1874386.0, | |
| "reward": 0.9296875, | |
| "reward_std": 0.31644338369369507, | |
| "rewards/reward_func/mean": 0.1032986111111111, | |
| "rewards/reward_func/std": 0.0473594069480896, | |
| "sampling/importance_sampling_ratio/max": 2.998652696609497, | |
| "sampling/importance_sampling_ratio/mean": 0.9536670446395874, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.655500411987305, | |
| "sampling/sampling_logp_difference/mean": 0.1883814036846161, | |
| "step": 13, | |
| "step_time": 118.49153742892668 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1714.0, | |
| "completions/mean_length": 1036.515625, | |
| "completions/mean_terminated_length": 777.2373046875, | |
| "completions/min_length": 175.0, | |
| "completions/min_terminated_length": 175.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7153487950563431, | |
| "epoch": 0.034482758620689655, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0027371846864771214, | |
| "kl": 0.0010056617320515215, | |
| "learning_rate": 4.999698777566055e-05, | |
| "loss": -0.013990852981805801, | |
| "num_tokens": 2045299.0, | |
| "reward": 1.00390625, | |
| "reward_std": 0.38186100125312805, | |
| "rewards/reward_func/mean": 0.1115451388888889, | |
| "rewards/reward_func/std": 0.05746811628341675, | |
| "sampling/importance_sampling_ratio/max": 2.99708890914917, | |
| "sampling/importance_sampling_ratio/mean": 0.945040225982666, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.899123191833496, | |
| "sampling/sampling_logp_difference/mean": 0.21787062287330627, | |
| "step": 14, | |
| "step_time": 145.6305006260518 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4092.0, | |
| "completions/mean_length": 1146.296875, | |
| "completions/mean_terminated_length": 1105.54833984375, | |
| "completions/min_length": 30.0, | |
| "completions/min_terminated_length": 30.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6484042406082153, | |
| "epoch": 0.03694581280788178, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0035634324528942235, | |
| "kl": 0.0010466112871654332, | |
| "learning_rate": 4.9996187673906445e-05, | |
| "loss": -0.03319694846868515, | |
| "num_tokens": 2212214.0, | |
| "reward": 0.9921875, | |
| "reward_std": 0.46711021661758423, | |
| "rewards/reward_func/mean": 0.11024305555555555, | |
| "rewards/reward_func/std": 0.06887222660912408, | |
| "sampling/importance_sampling_ratio/max": 2.9985616207122803, | |
| "sampling/importance_sampling_ratio/mean": 0.9446097612380981, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.617145538330078, | |
| "sampling/sampling_logp_difference/mean": 0.2042129635810852, | |
| "step": 15, | |
| "step_time": 184.45763081498444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3909.0, | |
| "completions/mean_length": 1403.515625, | |
| "completions/mean_terminated_length": 1360.77783203125, | |
| "completions/min_length": 274.0, | |
| "completions/min_terminated_length": 274.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6306434720754623, | |
| "epoch": 0.03940886699507389, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0020010963111962766, | |
| "kl": 0.0005926049634581432, | |
| "learning_rate": 4.9995293452635664e-05, | |
| "loss": -0.016749005764722824, | |
| "num_tokens": 2384055.0, | |
| "reward": 1.015625, | |
| "reward_std": 0.3615305721759796, | |
| "rewards/reward_func/mean": 0.11284722222222222, | |
| "rewards/reward_func/std": 0.05430084632502662, | |
| "sampling/importance_sampling_ratio/max": 2.9997754096984863, | |
| "sampling/importance_sampling_ratio/mean": 0.953073263168335, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.554804801940918, | |
| "sampling/sampling_logp_difference/mean": 0.18350985646247864, | |
| "step": 16, | |
| "step_time": 128.69480849499814 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3903.0, | |
| "completions/mean_length": 1115.609375, | |
| "completions/mean_terminated_length": 1033.80322265625, | |
| "completions/min_length": 145.0, | |
| "completions/min_terminated_length": 178.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7165143638849258, | |
| "epoch": 0.04187192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002698123496280166, | |
| "kl": 0.0011966502643190324, | |
| "learning_rate": 4.999430511521525e-05, | |
| "loss": -0.01774909719824791, | |
| "num_tokens": 2534526.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.45423969626426697, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.06652174724472894, | |
| "sampling/importance_sampling_ratio/max": 2.9969332218170166, | |
| "sampling/importance_sampling_ratio/mean": 0.945965588092804, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.221734046936035, | |
| "sampling/sampling_logp_difference/mean": 0.207554429769516, | |
| "step": 17, | |
| "step_time": 153.94046954950318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2784.0, | |
| "completions/mean_length": 1181.28125, | |
| "completions/mean_terminated_length": 986.9667358398438, | |
| "completions/min_length": 58.0, | |
| "completions/min_terminated_length": 58.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7423214614391327, | |
| "epoch": 0.04433497536945813, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0023429990820167043, | |
| "kl": 0.0010119961225427687, | |
| "learning_rate": 4.999322266536666e-05, | |
| "loss": -0.009780586697161198, | |
| "num_tokens": 2689040.0, | |
| "reward": 1.078125, | |
| "reward_std": 0.39559829235076904, | |
| "rewards/reward_func/mean": 0.11979166666666667, | |
| "rewards/reward_func/std": 0.05736829009321001, | |
| "sampling/importance_sampling_ratio/max": 2.9975063800811768, | |
| "sampling/importance_sampling_ratio/mean": 0.9511388540267944, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.217132568359375, | |
| "sampling/sampling_logp_difference/mean": 0.20317873358726501, | |
| "step": 18, | |
| "step_time": 148.3335123800207 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4076.0, | |
| "completions/mean_length": 1244.03125, | |
| "completions/mean_terminated_length": 1152.0322265625, | |
| "completions/min_length": 220.0, | |
| "completions/min_terminated_length": 220.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6846257448196411, | |
| "epoch": 0.046798029556650245, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0024025660850242043, | |
| "kl": 0.0009544179702061228, | |
| "learning_rate": 4.9992046107165705e-05, | |
| "loss": 0.0017357771284878254, | |
| "num_tokens": 2861074.0, | |
| "reward": 0.98828125, | |
| "reward_std": 0.24164485931396484, | |
| "rewards/reward_func/mean": 0.10980902777777778, | |
| "rewards/reward_func/std": 0.039622714122136436, | |
| "sampling/importance_sampling_ratio/max": 2.997622013092041, | |
| "sampling/importance_sampling_ratio/mean": 0.947391152381897, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.733199119567871, | |
| "sampling/sampling_logp_difference/mean": 0.2056819498538971, | |
| "step": 19, | |
| "step_time": 136.16764776594937 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3447.0, | |
| "completions/mean_length": 1302.53125, | |
| "completions/mean_terminated_length": 1165.1474609375, | |
| "completions/min_length": 301.0, | |
| "completions/min_terminated_length": 301.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7186716794967651, | |
| "epoch": 0.04926108374384237, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.000971565986448778, | |
| "kl": 0.0007161840621847659, | |
| "learning_rate": 4.999077544504252e-05, | |
| "loss": -0.009501606225967407, | |
| "num_tokens": 3029444.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.2284553349018097, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.035912372171878815, | |
| "sampling/importance_sampling_ratio/max": 2.9968159198760986, | |
| "sampling/importance_sampling_ratio/mean": 0.9446607828140259, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.439801216125488, | |
| "sampling/sampling_logp_difference/mean": 0.21503275632858276, | |
| "step": 20, | |
| "step_time": 122.81685300194658 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2639.0, | |
| "completions/mean_length": 968.984375, | |
| "completions/mean_terminated_length": 865.8196411132812, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7783543467521667, | |
| "epoch": 0.05172413793103448, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0038032007547465566, | |
| "kl": 0.017548598028952256, | |
| "learning_rate": 4.998941068378163e-05, | |
| "loss": 0.0010344665497541428, | |
| "num_tokens": 3175379.0, | |
| "reward": 1.1796875, | |
| "reward_std": 0.46604710817337036, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.06429070068730248, | |
| "sampling/importance_sampling_ratio/max": 2.9981436729431152, | |
| "sampling/importance_sampling_ratio/mean": 0.9482499361038208, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.149746894836426, | |
| "sampling/sampling_logp_difference/mean": 0.20681576430797577, | |
| "step": 21, | |
| "step_time": 130.43352678511292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3854.0, | |
| "completions/mean_length": 975.015625, | |
| "completions/mean_terminated_length": 875.1966552734375, | |
| "completions/min_length": 191.0, | |
| "completions/min_terminated_length": 191.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.750398576259613, | |
| "epoch": 0.054187192118226604, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000523910546696439, | |
| "kl": 0.001042805059114471, | |
| "learning_rate": 4.998795182852183e-05, | |
| "loss": -0.008543891832232475, | |
| "num_tokens": 3323732.0, | |
| "reward": 1.125, | |
| "reward_std": 0.35912150144577026, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.049859102401468486, | |
| "sampling/importance_sampling_ratio/max": 2.9976632595062256, | |
| "sampling/importance_sampling_ratio/mean": 0.956694483757019, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.91028881072998, | |
| "sampling/sampling_logp_difference/mean": 0.19543185830116272, | |
| "step": 22, | |
| "step_time": 125.63274311483838 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3903.0, | |
| "completions/mean_length": 1227.0625, | |
| "completions/mean_terminated_length": 1085.9671630859375, | |
| "completions/min_length": 184.0, | |
| "completions/min_terminated_length": 184.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6305443197488785, | |
| "epoch": 0.05665024630541872, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004696302820715364, | |
| "kl": 0.0008546650788048282, | |
| "learning_rate": 4.998639888475621e-05, | |
| "loss": 0.04576673358678818, | |
| "num_tokens": 3490456.0, | |
| "reward": 1.26171875, | |
| "reward_std": 0.7379046678543091, | |
| "rewards/reward_func/mean": 0.1401909722222222, | |
| "rewards/reward_func/std": 0.11325099567572276, | |
| "sampling/importance_sampling_ratio/max": 2.999181032180786, | |
| "sampling/importance_sampling_ratio/mean": 0.9515511989593506, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.505047798156738, | |
| "sampling/sampling_logp_difference/mean": 0.19298961758613586, | |
| "step": 23, | |
| "step_time": 133.18944385577925 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3573.0, | |
| "completions/mean_length": 1265.4375, | |
| "completions/mean_terminated_length": 1174.1290283203125, | |
| "completions/min_length": 355.0, | |
| "completions/min_terminated_length": 355.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6693638265132904, | |
| "epoch": 0.059113300492610835, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0013212910135926548, | |
| "kl": 0.0009473708487348631, | |
| "learning_rate": 4.998475185833219e-05, | |
| "loss": 0.008236411958932877, | |
| "num_tokens": 3657892.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.2741045653820038, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.039349371360407934, | |
| "sampling/importance_sampling_ratio/max": 2.9955687522888184, | |
| "sampling/importance_sampling_ratio/mean": 0.9495745897293091, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.479448318481445, | |
| "sampling/sampling_logp_difference/mean": 0.1977420151233673, | |
| "step": 24, | |
| "step_time": 128.52692431304604 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2757.0, | |
| "completions/mean_length": 1035.78125, | |
| "completions/mean_terminated_length": 988.5806274414062, | |
| "completions/min_length": 191.0, | |
| "completions/min_terminated_length": 191.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.673497810959816, | |
| "epoch": 0.06157635467980296, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0025486905634899714, | |
| "kl": 0.0008044984133448452, | |
| "learning_rate": 4.9983010755451386e-05, | |
| "loss": -0.005573897622525692, | |
| "num_tokens": 3804278.0, | |
| "reward": 1.1796875, | |
| "reward_std": 0.46604710817337036, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.06429070068730248, | |
| "sampling/importance_sampling_ratio/max": 2.997753620147705, | |
| "sampling/importance_sampling_ratio/mean": 0.9530193209648132, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.126824378967285, | |
| "sampling/sampling_logp_difference/mean": 0.19949191808700562, | |
| "step": 25, | |
| "step_time": 125.30965802492574 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2515.0, | |
| "completions/max_terminated_length": 2515.0, | |
| "completions/mean_length": 997.15625, | |
| "completions/mean_terminated_length": 997.15625, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6242654323577881, | |
| "epoch": 0.06403940886699508, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0006874978805650674, | |
| "kl": 0.0009008395281853154, | |
| "learning_rate": 4.998117558266968e-05, | |
| "loss": 0.003014655550941825, | |
| "num_tokens": 3958688.0, | |
| "reward": 1.04296875, | |
| "reward_std": 0.19697457551956177, | |
| "rewards/reward_func/mean": 0.11588541666666667, | |
| "rewards/reward_func/std": 0.027143559522098966, | |
| "sampling/importance_sampling_ratio/max": 2.9919300079345703, | |
| "sampling/importance_sampling_ratio/mean": 0.9529856443405151, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.685388565063477, | |
| "sampling/sampling_logp_difference/mean": 0.1835816651582718, | |
| "step": 26, | |
| "step_time": 87.82440122170374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4042.0, | |
| "completions/mean_length": 1417.03125, | |
| "completions/mean_terminated_length": 1285.278564453125, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6958920955657959, | |
| "epoch": 0.0665024630541872, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00867741418581629, | |
| "kl": 0.000814533414086327, | |
| "learning_rate": 4.9979246346897136e-05, | |
| "loss": -0.02824692241847515, | |
| "num_tokens": 4136322.0, | |
| "reward": 1.203125, | |
| "reward_std": 0.6375719904899597, | |
| "rewards/reward_func/mean": 0.13368055555555555, | |
| "rewards/reward_func/std": 0.11306073599391514, | |
| "sampling/importance_sampling_ratio/max": 2.999945640563965, | |
| "sampling/importance_sampling_ratio/mean": 0.943295955657959, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 23.382047653198242, | |
| "sampling/sampling_logp_difference/mean": 0.21225795149803162, | |
| "step": 27, | |
| "step_time": 158.52877362072468 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3286.0, | |
| "completions/mean_length": 1063.046875, | |
| "completions/mean_terminated_length": 965.2096557617188, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.73707315325737, | |
| "epoch": 0.06896551724137931, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0006812576623849655, | |
| "kl": 0.0010022705682786182, | |
| "learning_rate": 4.997722305539802e-05, | |
| "loss": -0.0040407185442745686, | |
| "num_tokens": 4287797.0, | |
| "reward": 1.125, | |
| "reward_std": 0.35073620080947876, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.04570846714907222, | |
| "sampling/importance_sampling_ratio/max": 2.9977564811706543, | |
| "sampling/importance_sampling_ratio/mean": 0.9494884014129639, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.191810607910156, | |
| "sampling/sampling_logp_difference/mean": 0.2048904001712799, | |
| "step": 28, | |
| "step_time": 118.83388412673958 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3365.0, | |
| "completions/mean_length": 1107.609375, | |
| "completions/mean_terminated_length": 960.6392822265625, | |
| "completions/min_length": 263.0, | |
| "completions/min_terminated_length": 263.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7131416946649551, | |
| "epoch": 0.07142857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0022908098026776604, | |
| "kl": 0.0011166162439621985, | |
| "learning_rate": 4.997510571579074e-05, | |
| "loss": 0.00837898999452591, | |
| "num_tokens": 4453148.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.3628786504268646, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.05258376399676005, | |
| "sampling/importance_sampling_ratio/max": 2.9968960285186768, | |
| "sampling/importance_sampling_ratio/mean": 0.9544211626052856, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.943284034729004, | |
| "sampling/sampling_logp_difference/mean": 0.1991732120513916, | |
| "step": 29, | |
| "step_time": 143.43660160107538 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2616.0, | |
| "completions/mean_length": 1034.375, | |
| "completions/mean_terminated_length": 985.77783203125, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.729461207985878, | |
| "epoch": 0.07389162561576355, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.005696858334664288, | |
| "kl": 0.0011717368324752897, | |
| "learning_rate": 4.997289433604783e-05, | |
| "loss": 0.02414151094853878, | |
| "num_tokens": 4621172.0, | |
| "reward": 1.2578125, | |
| "reward_std": 0.7439821362495422, | |
| "rewards/reward_func/mean": 0.13975694444444445, | |
| "rewards/reward_func/std": 0.11646586159865062, | |
| "sampling/importance_sampling_ratio/max": 2.9982309341430664, | |
| "sampling/importance_sampling_ratio/mean": 0.9456791877746582, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.812464714050293, | |
| "sampling/sampling_logp_difference/mean": 0.21633002161979675, | |
| "step": 30, | |
| "step_time": 128.25970319425687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3453.0, | |
| "completions/mean_length": 945.78125, | |
| "completions/mean_terminated_length": 844.1612548828125, | |
| "completions/min_length": 241.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7076129615306854, | |
| "epoch": 0.07635467980295567, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0005820405360045273, | |
| "kl": 0.0008675850986037403, | |
| "learning_rate": 4.997058892449591e-05, | |
| "loss": -0.0038843925576657057, | |
| "num_tokens": 4768086.0, | |
| "reward": 1.1171875, | |
| "reward_std": 0.3391420841217041, | |
| "rewards/reward_func/mean": 0.12413194444444445, | |
| "rewards/reward_func/std": 0.04190837426318063, | |
| "sampling/importance_sampling_ratio/max": 2.9953014850616455, | |
| "sampling/importance_sampling_ratio/mean": 0.9568973779678345, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.485434532165527, | |
| "sampling/sampling_logp_difference/mean": 0.18533028662204742, | |
| "step": 31, | |
| "step_time": 115.76905426895246 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4024.0, | |
| "completions/max_terminated_length": 4024.0, | |
| "completions/mean_length": 693.03125, | |
| "completions/mean_terminated_length": 699.3175048828125, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5862796604633331, | |
| "epoch": 0.07881773399014778, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005239994985586592, | |
| "kl": 0.0010869551915675402, | |
| "learning_rate": 4.99681894898157e-05, | |
| "loss": -0.00011015844211215153, | |
| "num_tokens": 4896296.0, | |
| "reward": 1.10546875, | |
| "reward_std": 0.4427652359008789, | |
| "rewards/reward_func/mean": 0.1228298611111111, | |
| "rewards/reward_func/std": 0.07224722537729475, | |
| "sampling/importance_sampling_ratio/max": 2.999147653579712, | |
| "sampling/importance_sampling_ratio/mean": 0.9641219973564148, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.479804992675781, | |
| "sampling/sampling_logp_difference/mean": 0.16228708624839783, | |
| "step": 32, | |
| "step_time": 129.93058712827042 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2573.0, | |
| "completions/mean_length": 1020.71875, | |
| "completions/mean_terminated_length": 864.0167236328125, | |
| "completions/min_length": 207.0, | |
| "completions/min_terminated_length": 207.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.684396505355835, | |
| "epoch": 0.0812807881773399, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0023055066573911117, | |
| "kl": 0.0011950426560360938, | |
| "learning_rate": 4.99656960410419e-05, | |
| "loss": 0.01441088318824768, | |
| "num_tokens": 5051142.0, | |
| "reward": 1.01953125, | |
| "reward_std": 0.20071640610694885, | |
| "rewards/reward_func/mean": 0.11328125, | |
| "rewards/reward_func/std": 0.02990201198392444, | |
| "sampling/importance_sampling_ratio/max": 2.998185396194458, | |
| "sampling/importance_sampling_ratio/mean": 0.9508187770843506, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.62383270263672, | |
| "sampling/sampling_logp_difference/mean": 0.20294925570487976, | |
| "step": 33, | |
| "step_time": 124.96260152198374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3760.0, | |
| "completions/mean_length": 1340.171875, | |
| "completions/mean_terminated_length": 1233.1500244140625, | |
| "completions/min_length": 386.0, | |
| "completions/min_terminated_length": 386.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7469504326581955, | |
| "epoch": 0.08374384236453201, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.005973168634825336, | |
| "kl": 0.0012111573305446655, | |
| "learning_rate": 4.9963108587563226e-05, | |
| "loss": 0.012999728322029114, | |
| "num_tokens": 5232177.0, | |
| "reward": 1.125, | |
| "reward_std": 0.34503278136253357, | |
| "rewards/reward_func/mean": 0.125, | |
| "rewards/reward_func/std": 0.0486740552716785, | |
| "sampling/importance_sampling_ratio/max": 2.9891550540924072, | |
| "sampling/importance_sampling_ratio/mean": 0.9441696405410767, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.990606307983398, | |
| "sampling/sampling_logp_difference/mean": 0.21126891672611237, | |
| "step": 34, | |
| "step_time": 194.91840827674605 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3017.0, | |
| "completions/max_terminated_length": 3017.0, | |
| "completions/mean_length": 721.109375, | |
| "completions/mean_terminated_length": 721.109375, | |
| "completions/min_length": 154.0, | |
| "completions/min_terminated_length": 154.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.574622169137001, | |
| "epoch": 0.08620689655172414, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002111501240666468, | |
| "kl": 0.0016787934000603855, | |
| "learning_rate": 4.996042713912238e-05, | |
| "loss": 0.03297015279531479, | |
| "num_tokens": 5364056.0, | |
| "reward": 1.0390625, | |
| "reward_std": 0.27174752950668335, | |
| "rewards/reward_func/mean": 0.1154513888888889, | |
| "rewards/reward_func/std": 0.038944005138344236, | |
| "sampling/importance_sampling_ratio/max": 2.992124080657959, | |
| "sampling/importance_sampling_ratio/mean": 0.9650247693061829, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.683228492736816, | |
| "sampling/sampling_logp_difference/mean": 0.15935879945755005, | |
| "step": 35, | |
| "step_time": 88.56324595375918 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2589.0, | |
| "completions/mean_length": 758.765625, | |
| "completions/mean_terminated_length": 651.1128540039062, | |
| "completions/min_length": 104.0, | |
| "completions/min_terminated_length": 104.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6567760854959488, | |
| "epoch": 0.08866995073891626, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0008471869813655381, | |
| "kl": 0.0014283077616710216, | |
| "learning_rate": 4.995765170581595e-05, | |
| "loss": -0.005755479913204908, | |
| "num_tokens": 5490809.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.25539806485176086, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.03388542061050733, | |
| "sampling/importance_sampling_ratio/max": 2.9882869720458984, | |
| "sampling/importance_sampling_ratio/mean": 0.9635649919509888, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.370882034301758, | |
| "sampling/sampling_logp_difference/mean": 0.16954657435417175, | |
| "step": 36, | |
| "step_time": 180.74639308801852 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3187.0, | |
| "completions/mean_length": 1229.5, | |
| "completions/mean_terminated_length": 1088.5245361328125, | |
| "completions/min_length": 190.0, | |
| "completions/min_terminated_length": 190.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7335378378629684, | |
| "epoch": 0.09113300492610837, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004487204028022401, | |
| "kl": 0.0041939748043660074, | |
| "learning_rate": 4.995478229809444e-05, | |
| "loss": 0.051113568246364594, | |
| "num_tokens": 5660169.0, | |
| "reward": 1.17578125, | |
| "reward_std": 0.7150309085845947, | |
| "rewards/reward_func/mean": 0.1306423611111111, | |
| "rewards/reward_func/std": 0.10943113929695553, | |
| "sampling/importance_sampling_ratio/max": 2.999530792236328, | |
| "sampling/importance_sampling_ratio/mean": 0.9497649669647217, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.937071800231934, | |
| "sampling/sampling_logp_difference/mean": 0.19850803911685944, | |
| "step": 37, | |
| "step_time": 135.75389188993722 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3290.0, | |
| "completions/mean_length": 1152.984375, | |
| "completions/mean_terminated_length": 956.7833862304688, | |
| "completions/min_length": 304.0, | |
| "completions/min_terminated_length": 304.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7015815377235413, | |
| "epoch": 0.09359605911330049, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.001804205516908683, | |
| "kl": 0.0018779803649522364, | |
| "learning_rate": 4.9951818926762174e-05, | |
| "loss": 0.005440297070890665, | |
| "num_tokens": 5837448.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.329968124628067, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.04896413783232371, | |
| "sampling/importance_sampling_ratio/max": 2.999505043029785, | |
| "sampling/importance_sampling_ratio/mean": 0.9455422163009644, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.106887817382812, | |
| "sampling/sampling_logp_difference/mean": 0.2156333029270172, | |
| "step": 38, | |
| "step_time": 190.32753542577848 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3058.0, | |
| "completions/mean_length": 941.140625, | |
| "completions/mean_terminated_length": 839.3709716796875, | |
| "completions/min_length": 142.0, | |
| "completions/min_terminated_length": 142.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.652764156460762, | |
| "epoch": 0.0960591133004926, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002124967192293937, | |
| "kl": 0.0012375268270261586, | |
| "learning_rate": 4.99487616029773e-05, | |
| "loss": -0.002344203647226095, | |
| "num_tokens": 5985777.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.3068941533565521, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.044668421149253845, | |
| "sampling/importance_sampling_ratio/max": 2.999897003173828, | |
| "sampling/importance_sampling_ratio/mean": 0.9565781354904175, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.855171203613281, | |
| "sampling/sampling_logp_difference/mean": 0.18129438161849976, | |
| "step": 39, | |
| "step_time": 119.2174817638006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3406.0, | |
| "completions/mean_length": 925.71875, | |
| "completions/mean_terminated_length": 815.901611328125, | |
| "completions/min_length": 147.0, | |
| "completions/min_terminated_length": 147.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6335614919662476, | |
| "epoch": 0.09852216748768473, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003501411960275662, | |
| "kl": 0.0029698072466999292, | |
| "learning_rate": 4.994561033825174e-05, | |
| "loss": 0.014414285309612751, | |
| "num_tokens": 6126367.0, | |
| "reward": 1.05859375, | |
| "reward_std": 0.3355349600315094, | |
| "rewards/reward_func/mean": 0.11762152777777778, | |
| "rewards/reward_func/std": 0.04818948441081577, | |
| "sampling/importance_sampling_ratio/max": 2.997518539428711, | |
| "sampling/importance_sampling_ratio/mean": 0.9628660082817078, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.993983268737793, | |
| "sampling/sampling_logp_difference/mean": 0.16173504292964935, | |
| "step": 40, | |
| "step_time": 131.86127198208123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4035.0, | |
| "completions/mean_length": 939.421875, | |
| "completions/mean_terminated_length": 893.1128540039062, | |
| "completions/min_length": 174.0, | |
| "completions/min_terminated_length": 174.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6877728551626205, | |
| "epoch": 0.10098522167487685, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000779252641297956, | |
| "kl": 0.0010590426390990615, | |
| "learning_rate": 4.99423651444511e-05, | |
| "loss": -0.000718248076736927, | |
| "num_tokens": 6272426.0, | |
| "reward": 1.12890625, | |
| "reward_std": 0.3590999245643616, | |
| "rewards/reward_func/mean": 0.1254340277777778, | |
| "rewards/reward_func/std": 0.044849217351939946, | |
| "sampling/importance_sampling_ratio/max": 2.997779369354248, | |
| "sampling/importance_sampling_ratio/mean": 0.9570625424385071, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.874835968017578, | |
| "sampling/sampling_logp_difference/mean": 0.18544772267341614, | |
| "step": 41, | |
| "step_time": 122.42929228907451 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4095.0, | |
| "completions/mean_length": 1331.609375, | |
| "completions/mean_terminated_length": 1097.3389892578125, | |
| "completions/min_length": 305.0, | |
| "completions/min_terminated_length": 305.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8357381373643875, | |
| "epoch": 0.10344827586206896, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000931053055557495, | |
| "kl": 0.0011489106400404125, | |
| "learning_rate": 4.993902603379471e-05, | |
| "loss": -0.019008180126547813, | |
| "num_tokens": 6443425.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.3713446259498596, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.05229175090789795, | |
| "sampling/importance_sampling_ratio/max": 2.9981436729431152, | |
| "sampling/importance_sampling_ratio/mean": 0.9422184228897095, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.15316390991211, | |
| "sampling/sampling_logp_difference/mean": 0.22885316610336304, | |
| "step": 42, | |
| "step_time": 147.6898850449361 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4078.0, | |
| "completions/mean_length": 977.359375, | |
| "completions/mean_terminated_length": 823.9835815429688, | |
| "completions/min_length": 184.0, | |
| "completions/min_terminated_length": 184.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7929796725511551, | |
| "epoch": 0.10591133004926108, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0010182376320781244, | |
| "kl": 0.002078307850752026, | |
| "learning_rate": 4.99355930188555e-05, | |
| "loss": -0.028957121074199677, | |
| "num_tokens": 6585592.0, | |
| "reward": 1.11328125, | |
| "reward_std": 0.4176884591579437, | |
| "rewards/reward_func/mean": 0.12369791666666667, | |
| "rewards/reward_func/std": 0.058884123961130776, | |
| "sampling/importance_sampling_ratio/max": 2.9990317821502686, | |
| "sampling/importance_sampling_ratio/mean": 0.9565147757530212, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.686131477355957, | |
| "sampling/sampling_logp_difference/mean": 0.19290070235729218, | |
| "step": 43, | |
| "step_time": 121.78506018640473 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4060.0, | |
| "completions/mean_length": 1173.28125, | |
| "completions/mean_terminated_length": 1079.0, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 203.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7569215148687363, | |
| "epoch": 0.10837438423645321, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 6.888280513936598e-05, | |
| "kl": 0.0013799294829368591, | |
| "learning_rate": 4.9932066112559975e-05, | |
| "loss": 1.0725540050771087e-05, | |
| "num_tokens": 6764346.0, | |
| "reward": 1.15625, | |
| "reward_std": 0.36596253514289856, | |
| "rewards/reward_func/mean": 0.1284722222222222, | |
| "rewards/reward_func/std": 0.04066250390476651, | |
| "sampling/importance_sampling_ratio/max": 2.994110107421875, | |
| "sampling/importance_sampling_ratio/mean": 0.9449698328971863, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.999063491821289, | |
| "sampling/sampling_logp_difference/mean": 0.22023621201515198, | |
| "step": 44, | |
| "step_time": 191.58390807081014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3576.0, | |
| "completions/mean_length": 766.96875, | |
| "completions/mean_terminated_length": 659.5806274414062, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.769265666604042, | |
| "epoch": 0.11083743842364532, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003624385236832457, | |
| "kl": 0.0033626087242737412, | |
| "learning_rate": 4.992844532818821e-05, | |
| "loss": 0.011339845135807991, | |
| "num_tokens": 6889224.0, | |
| "reward": 1.2109375, | |
| "reward_std": 0.7443154454231262, | |
| "rewards/reward_func/mean": 0.1345486111111111, | |
| "rewards/reward_func/std": 0.11726083523697323, | |
| "sampling/importance_sampling_ratio/max": 2.9944660663604736, | |
| "sampling/importance_sampling_ratio/mean": 0.9590526819229126, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.834656715393066, | |
| "sampling/sampling_logp_difference/mean": 0.18281063437461853, | |
| "step": 45, | |
| "step_time": 118.58491391129792 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3434.0, | |
| "completions/mean_length": 1011.671875, | |
| "completions/mean_terminated_length": 859.9835815429688, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7145812660455704, | |
| "epoch": 0.11330049261083744, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0008810215585302653, | |
| "kl": 0.0015595734002999961, | |
| "learning_rate": 4.9924730679373735e-05, | |
| "loss": 0.010057137347757816, | |
| "num_tokens": 7041267.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.3965180516242981, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.047183099720213145, | |
| "sampling/importance_sampling_ratio/max": 2.9970216751098633, | |
| "sampling/importance_sampling_ratio/mean": 0.95362788438797, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.471074104309082, | |
| "sampling/sampling_logp_difference/mean": 0.19335752725601196, | |
| "step": 46, | |
| "step_time": 132.00769766583107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3317.0, | |
| "completions/max_terminated_length": 3317.0, | |
| "completions/mean_length": 970.09375, | |
| "completions/mean_terminated_length": 970.09375, | |
| "completions/min_length": 105.0, | |
| "completions/min_terminated_length": 105.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7778498083353043, | |
| "epoch": 0.11576354679802955, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0061553118849422455, | |
| "kl": 0.0065233102650381625, | |
| "learning_rate": 4.992092218010351e-05, | |
| "loss": 0.036174844950437546, | |
| "num_tokens": 7188905.0, | |
| "reward": 1.21484375, | |
| "reward_std": 0.7635900974273682, | |
| "rewards/reward_func/mean": 0.1349826388888889, | |
| "rewards/reward_func/std": 0.12284477055072784, | |
| "sampling/importance_sampling_ratio/max": 2.9998605251312256, | |
| "sampling/importance_sampling_ratio/mean": 0.9465634822845459, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 19.34210205078125, | |
| "sampling/sampling_logp_difference/mean": 0.21574868261814117, | |
| "step": 47, | |
| "step_time": 99.85584617522545 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3715.0, | |
| "completions/mean_length": 1071.703125, | |
| "completions/mean_terminated_length": 829.9661254882812, | |
| "completions/min_length": 66.0, | |
| "completions/min_terminated_length": 66.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7317904829978943, | |
| "epoch": 0.11822660098522167, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00015393564669603422, | |
| "kl": 0.0016383329930249602, | |
| "learning_rate": 4.991701984471789e-05, | |
| "loss": -0.0043869917280972, | |
| "num_tokens": 7348502.0, | |
| "reward": 1.15234375, | |
| "reward_std": 0.3689786195755005, | |
| "rewards/reward_func/mean": 0.12803819444444445, | |
| "rewards/reward_func/std": 0.04413472612698873, | |
| "sampling/importance_sampling_ratio/max": 2.9766175746917725, | |
| "sampling/importance_sampling_ratio/mean": 0.949319064617157, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.05463981628418, | |
| "sampling/sampling_logp_difference/mean": 0.2019147127866745, | |
| "step": 48, | |
| "step_time": 164.48299563932233 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2109.0, | |
| "completions/mean_length": 776.296875, | |
| "completions/mean_terminated_length": 723.6032104492188, | |
| "completions/min_length": 86.0, | |
| "completions/min_terminated_length": 86.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6619300991296768, | |
| "epoch": 0.1206896551724138, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016658057286295187, | |
| "kl": 0.001641769689740613, | |
| "learning_rate": 4.9913023687910575e-05, | |
| "loss": 0.005893378518521786, | |
| "num_tokens": 7474137.0, | |
| "reward": 1.12890625, | |
| "reward_std": 0.33627331256866455, | |
| "rewards/reward_func/mean": 0.1254340277777778, | |
| "rewards/reward_func/std": 0.044849217351939946, | |
| "sampling/importance_sampling_ratio/max": 2.999603748321533, | |
| "sampling/importance_sampling_ratio/mean": 0.9636247158050537, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.411972045898438, | |
| "sampling/sampling_logp_difference/mean": 0.17460203170776367, | |
| "step": 49, | |
| "step_time": 131.9607803169638 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3009.0, | |
| "completions/mean_length": 1223.515625, | |
| "completions/mean_terminated_length": 1032.0167236328125, | |
| "completions/min_length": 174.0, | |
| "completions/min_terminated_length": 174.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7410552501678467, | |
| "epoch": 0.12315270935960591, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.002313620014665204, | |
| "kl": 0.0017931896727532148, | |
| "learning_rate": 4.990893372472849e-05, | |
| "loss": -0.016107279807329178, | |
| "num_tokens": 7647882.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.41187721490859985, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.05926263497935401, | |
| "sampling/importance_sampling_ratio/max": 2.994967460632324, | |
| "sampling/importance_sampling_ratio/mean": 0.9434908628463745, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.531850814819336, | |
| "sampling/sampling_logp_difference/mean": 0.22060488164424896, | |
| "step": 50, | |
| "step_time": 151.6255073894281 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2448.0, | |
| "completions/mean_length": 1066.484375, | |
| "completions/mean_terminated_length": 815.8103637695312, | |
| "completions/min_length": 154.0, | |
| "completions/min_terminated_length": 154.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.771567702293396, | |
| "epoch": 0.12561576354679804, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0003253046604313312, | |
| "kl": 0.0013065032253507525, | |
| "learning_rate": 4.99047499705718e-05, | |
| "loss": -0.007344153709709644, | |
| "num_tokens": 7797529.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3119787275791168, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.043059426049391426, | |
| "sampling/importance_sampling_ratio/max": 2.9988787174224854, | |
| "sampling/importance_sampling_ratio/mean": 0.9490600228309631, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.559198379516602, | |
| "sampling/sampling_logp_difference/mean": 0.20981940627098083, | |
| "step": 51, | |
| "step_time": 121.97355275088921 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3566.0, | |
| "completions/mean_length": 1041.515625, | |
| "completions/mean_terminated_length": 942.9838256835938, | |
| "completions/min_length": 23.0, | |
| "completions/min_terminated_length": 23.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.670068770647049, | |
| "epoch": 0.12807881773399016, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.008313600173241441, | |
| "kl": 0.006259864254388958, | |
| "learning_rate": 4.990047244119383e-05, | |
| "loss": -0.0173809751868248, | |
| "num_tokens": 7953674.0, | |
| "reward": 1.1015625, | |
| "reward_std": 0.781378984451294, | |
| "rewards/reward_func/mean": 0.12239583333333333, | |
| "rewards/reward_func/std": 0.10772978928354052, | |
| "sampling/importance_sampling_ratio/max": 2.999814748764038, | |
| "sampling/importance_sampling_ratio/mean": 0.9522294998168945, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.490184783935547, | |
| "sampling/sampling_logp_difference/mean": 0.18839803338050842, | |
| "step": 52, | |
| "step_time": 127.30351705593057 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2632.0, | |
| "completions/mean_length": 903.265625, | |
| "completions/mean_terminated_length": 801.5573120117188, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6703495234251022, | |
| "epoch": 0.13054187192118227, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0007236686160723223, | |
| "kl": 0.001405259157763794, | |
| "learning_rate": 4.9896101152701e-05, | |
| "loss": -0.0052278656512498856, | |
| "num_tokens": 8088731.0, | |
| "reward": 1.16015625, | |
| "reward_std": 0.3892585337162018, | |
| "rewards/reward_func/mean": 0.12890625, | |
| "rewards/reward_func/std": 0.04816830199625757, | |
| "sampling/importance_sampling_ratio/max": 2.995981454849243, | |
| "sampling/importance_sampling_ratio/mean": 0.9509227275848389, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.751556396484375, | |
| "sampling/sampling_logp_difference/mean": 0.1959969401359558, | |
| "step": 53, | |
| "step_time": 127.24764031497762 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3157.0, | |
| "completions/mean_length": 935.578125, | |
| "completions/mean_terminated_length": 780.1475219726562, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7959064245223999, | |
| "epoch": 0.1330049261083744, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0005867268570703819, | |
| "kl": 0.002575396472821012, | |
| "learning_rate": 4.9891636121552745e-05, | |
| "loss": 0.003615929512307048, | |
| "num_tokens": 8238144.0, | |
| "reward": 1.19921875, | |
| "reward_std": 0.40868473052978516, | |
| "rewards/reward_func/mean": 0.1332465277777778, | |
| "rewards/reward_func/std": 0.04852836661868625, | |
| "sampling/importance_sampling_ratio/max": 2.997250556945801, | |
| "sampling/importance_sampling_ratio/mean": 0.9515563249588013, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.81047534942627, | |
| "sampling/sampling_logp_difference/mean": 0.2022194117307663, | |
| "step": 54, | |
| "step_time": 131.37923206575215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4043.0, | |
| "completions/mean_length": 1119.9375, | |
| "completions/mean_terminated_length": 979.2373046875, | |
| "completions/min_length": 157.0, | |
| "completions/min_terminated_length": 157.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7834322899580002, | |
| "epoch": 0.1354679802955665, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.000250132466078681, | |
| "kl": 0.001461994950659573, | |
| "learning_rate": 4.988707736456151e-05, | |
| "loss": -0.0006330913747660816, | |
| "num_tokens": 8393452.0, | |
| "reward": 1.13671875, | |
| "reward_std": 0.3533560335636139, | |
| "rewards/reward_func/mean": 0.12630208333333334, | |
| "rewards/reward_func/std": 0.04240360524919298, | |
| "sampling/importance_sampling_ratio/max": 2.9995980262756348, | |
| "sampling/importance_sampling_ratio/mean": 0.9429638981819153, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.937179565429688, | |
| "sampling/sampling_logp_difference/mean": 0.22264911234378815, | |
| "step": 55, | |
| "step_time": 136.44824583176523 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4059.0, | |
| "completions/mean_length": 1084.0625, | |
| "completions/mean_terminated_length": 1003.0654907226562, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6966681033372879, | |
| "epoch": 0.13793103448275862, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.000776864777284884, | |
| "kl": 0.0010701149585656822, | |
| "learning_rate": 4.9882424898892635e-05, | |
| "loss": -0.0030254418961703777, | |
| "num_tokens": 8565104.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.32635459303855896, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.04690552916791704, | |
| "sampling/importance_sampling_ratio/max": 2.995293140411377, | |
| "sampling/importance_sampling_ratio/mean": 0.9448047876358032, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.740140914916992, | |
| "sampling/sampling_logp_difference/mean": 0.21466103196144104, | |
| "step": 56, | |
| "step_time": 169.7448099392932 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3503.0, | |
| "completions/mean_length": 1082.09375, | |
| "completions/mean_terminated_length": 984.8709106445312, | |
| "completions/min_length": 230.0, | |
| "completions/min_terminated_length": 230.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7558799386024475, | |
| "epoch": 0.14039408866995073, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0006483312619414802, | |
| "kl": 0.0011281462502665818, | |
| "learning_rate": 4.987767874206428e-05, | |
| "loss": 0.005442922003567219, | |
| "num_tokens": 8719014.0, | |
| "reward": 1.16796875, | |
| "reward_std": 0.38331958651542664, | |
| "rewards/reward_func/mean": 0.12977430555555555, | |
| "rewards/reward_func/std": 0.04572268989351061, | |
| "sampling/importance_sampling_ratio/max": 2.9972448348999023, | |
| "sampling/importance_sampling_ratio/mean": 0.9498982429504395, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.618826866149902, | |
| "sampling/sampling_logp_difference/mean": 0.21131965517997742, | |
| "step": 57, | |
| "step_time": 175.06015671789646 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2919.0, | |
| "completions/mean_length": 1007.796875, | |
| "completions/mean_terminated_length": 908.1773681640625, | |
| "completions/min_length": 96.0, | |
| "completions/min_terminated_length": 96.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7677361369132996, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008613100011594328, | |
| "kl": 0.0014947211020626128, | |
| "learning_rate": 4.987283891194743e-05, | |
| "loss": 0.11646324396133423, | |
| "num_tokens": 8858233.0, | |
| "reward": 1.19921875, | |
| "reward_std": 0.9302880764007568, | |
| "rewards/reward_func/mean": 0.1332465277777778, | |
| "rewards/reward_func/std": 0.13260910908381143, | |
| "sampling/importance_sampling_ratio/max": 2.9999563694000244, | |
| "sampling/importance_sampling_ratio/mean": 0.9526713490486145, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.495641708374023, | |
| "sampling/sampling_logp_difference/mean": 0.21066075563430786, | |
| "step": 58, | |
| "step_time": 172.44660172308795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3183.0, | |
| "completions/mean_length": 1208.90625, | |
| "completions/mean_terminated_length": 1115.774169921875, | |
| "completions/min_length": 213.0, | |
| "completions/min_terminated_length": 213.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6357712894678116, | |
| "epoch": 0.14532019704433496, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0003580177247054395, | |
| "kl": 0.0012669183051912114, | |
| "learning_rate": 4.986790542676576e-05, | |
| "loss": -0.0009904210455715656, | |
| "num_tokens": 9021795.0, | |
| "reward": 1.08984375, | |
| "reward_std": 0.2966987192630768, | |
| "rewards/reward_func/mean": 0.12109375, | |
| "rewards/reward_func/std": 0.0361149807771047, | |
| "sampling/importance_sampling_ratio/max": 2.997753620147705, | |
| "sampling/importance_sampling_ratio/mean": 0.9547065496444702, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.183061599731445, | |
| "sampling/sampling_logp_difference/mean": 0.18249759078025818, | |
| "step": 59, | |
| "step_time": 127.31623220816255 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1695.0, | |
| "completions/mean_length": 654.28125, | |
| "completions/mean_terminated_length": 543.258056640625, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6762199103832245, | |
| "epoch": 0.1477832512315271, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002547014851253121, | |
| "kl": 0.0022673878411296755, | |
| "learning_rate": 4.986287830509558e-05, | |
| "loss": 0.004557657055556774, | |
| "num_tokens": 9153077.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.2920915186405182, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.04098213298453225, | |
| "sampling/importance_sampling_ratio/max": 2.998140811920166, | |
| "sampling/importance_sampling_ratio/mean": 0.9566666483879089, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.62492847442627, | |
| "sampling/sampling_logp_difference/mean": 0.18817751109600067, | |
| "step": 60, | |
| "step_time": 122.69049222487956 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3154.0, | |
| "completions/mean_length": 1090.25, | |
| "completions/mean_terminated_length": 1042.539794921875, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7119808942079544, | |
| "epoch": 0.15024630541871922, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0029762470152537818, | |
| "kl": 0.001506138069089502, | |
| "learning_rate": 4.985775756586581e-05, | |
| "loss": -0.013122936710715294, | |
| "num_tokens": 9308901.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.31455180048942566, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.04827603366639879, | |
| "sampling/importance_sampling_ratio/max": 2.997075319290161, | |
| "sampling/importance_sampling_ratio/mean": 0.9534555673599243, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.6150484085083, | |
| "sampling/sampling_logp_difference/mean": 0.19639095664024353, | |
| "step": 61, | |
| "step_time": 115.97744632000104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3658.0, | |
| "completions/mean_length": 1041.125, | |
| "completions/mean_terminated_length": 884.4667358398438, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.681881383061409, | |
| "epoch": 0.15270935960591134, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 5.3362222264957966e-05, | |
| "kl": 0.0011868184083141387, | |
| "learning_rate": 4.9852543228357835e-05, | |
| "loss": 8.767043254920281e-06, | |
| "num_tokens": 9467821.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.2130420207977295, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.023671337299876742, | |
| "sampling/importance_sampling_ratio/max": 2.998835802078247, | |
| "sampling/importance_sampling_ratio/mean": 0.950027585029602, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.164039611816406, | |
| "sampling/sampling_logp_difference/mean": 0.1998741328716278, | |
| "step": 62, | |
| "step_time": 128.09420190914534 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3469.0, | |
| "completions/max_terminated_length": 3469.0, | |
| "completions/mean_length": 897.0625, | |
| "completions/mean_terminated_length": 897.0625, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6641940772533417, | |
| "epoch": 0.15517241379310345, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0009879127765257943, | |
| "kl": 0.0019843199115712196, | |
| "learning_rate": 4.9847235312205484e-05, | |
| "loss": 0.004818388726562262, | |
| "num_tokens": 9608273.0, | |
| "reward": 1.06640625, | |
| "reward_std": 0.2790367007255554, | |
| "rewards/reward_func/mean": 0.11848958333333333, | |
| "rewards/reward_func/std": 0.035972247935003705, | |
| "sampling/importance_sampling_ratio/max": 2.998894453048706, | |
| "sampling/importance_sampling_ratio/mean": 0.9591097831726074, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.58657169342041, | |
| "sampling/sampling_logp_difference/mean": 0.172621488571167, | |
| "step": 63, | |
| "step_time": 126.96359702572227 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3346.0, | |
| "completions/mean_length": 889.734375, | |
| "completions/mean_terminated_length": 838.84130859375, | |
| "completions/min_length": 105.0, | |
| "completions/min_terminated_length": 105.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7291716188192368, | |
| "epoch": 0.15763546798029557, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.005158850574939295, | |
| "kl": 0.0012882646697107702, | |
| "learning_rate": 4.984183383739496e-05, | |
| "loss": -0.0065579283982515335, | |
| "num_tokens": 9745936.0, | |
| "reward": 1.140625, | |
| "reward_std": 0.7904125452041626, | |
| "rewards/reward_func/mean": 0.1267361111111111, | |
| "rewards/reward_func/std": 0.12136938919623692, | |
| "sampling/importance_sampling_ratio/max": 2.9999992847442627, | |
| "sampling/importance_sampling_ratio/mean": 0.9599671363830566, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.868587493896484, | |
| "sampling/sampling_logp_difference/mean": 0.18142351508140564, | |
| "step": 64, | |
| "step_time": 145.68370983726345 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3697.0, | |
| "completions/mean_length": 1619.484375, | |
| "completions/mean_terminated_length": 1454.3834228515625, | |
| "completions/min_length": 263.0, | |
| "completions/min_terminated_length": 263.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7143156677484512, | |
| "epoch": 0.16009852216748768, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.005760896054601062, | |
| "kl": 0.0011219466978218406, | |
| "learning_rate": 4.983633882426471e-05, | |
| "loss": 0.05116764456033707, | |
| "num_tokens": 9952287.0, | |
| "reward": 1.359375, | |
| "reward_std": 1.1152576208114624, | |
| "rewards/reward_func/mean": 0.15104166666666666, | |
| "rewards/reward_func/std": 0.1629431736138132, | |
| "sampling/importance_sampling_ratio/max": 2.9972517490386963, | |
| "sampling/importance_sampling_ratio/mean": 0.9454111456871033, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.621063232421875, | |
| "sampling/sampling_logp_difference/mean": 0.21347495913505554, | |
| "step": 65, | |
| "step_time": 186.1511984670069 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3038.0, | |
| "completions/mean_length": 1117.59375, | |
| "completions/mean_terminated_length": 1052.774169921875, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6963524371385574, | |
| "epoch": 0.1625615763546798, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.001653339708279588, | |
| "kl": 0.0012707496352959424, | |
| "learning_rate": 4.983075029350542e-05, | |
| "loss": 0.01102248951792717, | |
| "num_tokens": 10104389.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.33481812477111816, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.045880657931168876, | |
| "sampling/importance_sampling_ratio/max": 2.997091054916382, | |
| "sampling/importance_sampling_ratio/mean": 0.9539859294891357, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.87465476989746, | |
| "sampling/sampling_logp_difference/mean": 0.1939973533153534, | |
| "step": 66, | |
| "step_time": 123.42807900626212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3030.0, | |
| "completions/mean_length": 931.90625, | |
| "completions/mean_terminated_length": 834.1638793945312, | |
| "completions/min_length": 125.0, | |
| "completions/min_terminated_length": 125.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6701111942529678, | |
| "epoch": 0.16502463054187191, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 8.6437440455173e-05, | |
| "kl": 0.001773051859345287, | |
| "learning_rate": 4.9825068266159894e-05, | |
| "loss": 1.2892927770735696e-05, | |
| "num_tokens": 10248559.0, | |
| "reward": 1.078125, | |
| "reward_std": 0.2704896926879883, | |
| "rewards/reward_func/mean": 0.11979166666666667, | |
| "rewards/reward_func/std": 0.030054413610034518, | |
| "sampling/importance_sampling_ratio/max": 2.997563600540161, | |
| "sampling/importance_sampling_ratio/mean": 0.9578502178192139, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.782278060913086, | |
| "sampling/sampling_logp_difference/mean": 0.18533891439437866, | |
| "step": 67, | |
| "step_time": 128.0075939442031 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2079.0, | |
| "completions/mean_length": 931.625, | |
| "completions/mean_terminated_length": 824.7540283203125, | |
| "completions/min_length": 93.0, | |
| "completions/min_terminated_length": 93.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7462280243635178, | |
| "epoch": 0.16748768472906403, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00013239457038290433, | |
| "kl": 0.0021935105323791504, | |
| "learning_rate": 4.981929276362298e-05, | |
| "loss": 1.589039311511442e-05, | |
| "num_tokens": 10389415.0, | |
| "reward": 1.109375, | |
| "reward_std": 0.3145764470100403, | |
| "rewards/reward_func/mean": 0.1232638888888889, | |
| "rewards/reward_func/std": 0.03495293855667114, | |
| "sampling/importance_sampling_ratio/max": 2.9994449615478516, | |
| "sampling/importance_sampling_ratio/mean": 0.9569523334503174, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.203958511352539, | |
| "sampling/sampling_logp_difference/mean": 0.19418185949325562, | |
| "step": 68, | |
| "step_time": 124.90443813405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3161.0, | |
| "completions/max_terminated_length": 3161.0, | |
| "completions/mean_length": 1039.765625, | |
| "completions/mean_terminated_length": 1039.765625, | |
| "completions/min_length": 242.0, | |
| "completions/min_terminated_length": 242.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7052541077136993, | |
| "epoch": 0.16995073891625614, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0007086844449368942, | |
| "kl": 0.002509061130695045, | |
| "learning_rate": 4.981342380764149e-05, | |
| "loss": 0.006688619032502174, | |
| "num_tokens": 10550360.0, | |
| "reward": 1.10546875, | |
| "reward_std": 0.31749480962753296, | |
| "rewards/reward_func/mean": 0.1228298611111111, | |
| "rewards/reward_func/std": 0.038425160778893366, | |
| "sampling/importance_sampling_ratio/max": 2.998634099960327, | |
| "sampling/importance_sampling_ratio/mean": 0.9502047300338745, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.249970436096191, | |
| "sampling/sampling_logp_difference/mean": 0.20042559504508972, | |
| "step": 69, | |
| "step_time": 101.4746579460334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3996.0, | |
| "completions/max_terminated_length": 3996.0, | |
| "completions/mean_length": 928.03125, | |
| "completions/mean_terminated_length": 928.03125, | |
| "completions/min_length": 91.0, | |
| "completions/min_terminated_length": 91.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7934342622756958, | |
| "epoch": 0.1724137931034483, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0009891554642842337, | |
| "kl": 0.0018119575106538832, | |
| "learning_rate": 4.980746142031414e-05, | |
| "loss": 0.007167218253016472, | |
| "num_tokens": 10698890.0, | |
| "reward": 1.1640625, | |
| "reward_std": 0.3651992976665497, | |
| "rewards/reward_func/mean": 0.1293402777777778, | |
| "rewards/reward_func/std": 0.04712180379364225, | |
| "sampling/importance_sampling_ratio/max": 2.998882293701172, | |
| "sampling/importance_sampling_ratio/mean": 0.9477431774139404, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.083467483520508, | |
| "sampling/sampling_logp_difference/mean": 0.2261689305305481, | |
| "step": 70, | |
| "step_time": 171.54467244585976 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3915.0, | |
| "completions/mean_length": 1171.96875, | |
| "completions/mean_terminated_length": 1077.6451416015625, | |
| "completions/min_length": 90.0, | |
| "completions/min_terminated_length": 90.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7315615117549896, | |
| "epoch": 0.1748768472906404, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.00439498054004489, | |
| "kl": 0.0017650375666562468, | |
| "learning_rate": 4.980140562409141e-05, | |
| "loss": 0.009830066934227943, | |
| "num_tokens": 10869880.0, | |
| "reward": 1.17578125, | |
| "reward_std": 0.5130822062492371, | |
| "rewards/reward_func/mean": 0.1306423611111111, | |
| "rewards/reward_func/std": 0.09101471718814638, | |
| "sampling/importance_sampling_ratio/max": 2.9981822967529297, | |
| "sampling/importance_sampling_ratio/mean": 0.9450452923774719, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.777099609375, | |
| "sampling/sampling_logp_difference/mean": 0.22317034006118774, | |
| "step": 71, | |
| "step_time": 131.98447898984887 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2886.0, | |
| "completions/mean_length": 1031.53125, | |
| "completions/mean_terminated_length": 982.888916015625, | |
| "completions/min_length": 67.0, | |
| "completions/min_terminated_length": 67.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7074902355670929, | |
| "epoch": 0.17733990147783252, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002222973231863952, | |
| "kl": 0.0013612315233331174, | |
| "learning_rate": 4.979525644177554e-05, | |
| "loss": -0.005394880194216967, | |
| "num_tokens": 11026666.0, | |
| "reward": 1.15234375, | |
| "reward_std": 0.39994341135025024, | |
| "rewards/reward_func/mean": 0.12803819444444445, | |
| "rewards/reward_func/std": 0.05651323828432295, | |
| "sampling/importance_sampling_ratio/max": 2.9978277683258057, | |
| "sampling/importance_sampling_ratio/mean": 0.9466830492019653, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.806804656982422, | |
| "sampling/sampling_logp_difference/mean": 0.20748589932918549, | |
| "step": 72, | |
| "step_time": 130.5002006436698 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3772.0, | |
| "completions/mean_length": 961.75, | |
| "completions/mean_terminated_length": 902.6451416015625, | |
| "completions/min_length": 137.0, | |
| "completions/min_terminated_length": 137.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.61832594871521, | |
| "epoch": 0.17980295566502463, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.010038654463100299, | |
| "kl": 0.009182059322483838, | |
| "learning_rate": 4.978901389652039e-05, | |
| "loss": -0.05658572167158127, | |
| "num_tokens": 11175818.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.6889752745628357, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.10786960522333781, | |
| "sampling/importance_sampling_ratio/max": 2.996751070022583, | |
| "sampling/importance_sampling_ratio/mean": 0.9549329280853271, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.932666778564453, | |
| "sampling/sampling_logp_difference/mean": 0.17912691831588745, | |
| "step": 73, | |
| "step_time": 125.78719549998641 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3745.0, | |
| "completions/max_terminated_length": 3745.0, | |
| "completions/mean_length": 952.8125, | |
| "completions/mean_terminated_length": 952.8125, | |
| "completions/min_length": 107.0, | |
| "completions/min_terminated_length": 107.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7050873339176178, | |
| "epoch": 0.18226600985221675, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0005601993541825297, | |
| "kl": 0.0018221104983240366, | |
| "learning_rate": 4.978267801183133e-05, | |
| "loss": 7.19567178748548e-05, | |
| "num_tokens": 11323678.0, | |
| "reward": 1.16796875, | |
| "reward_std": 0.3728235363960266, | |
| "rewards/reward_func/mean": 0.12977430555555555, | |
| "rewards/reward_func/std": 0.04572268989351061, | |
| "sampling/importance_sampling_ratio/max": 2.9927818775177, | |
| "sampling/importance_sampling_ratio/mean": 0.952770471572876, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.401138305664062, | |
| "sampling/sampling_logp_difference/mean": 0.1930316686630249, | |
| "step": 74, | |
| "step_time": 110.4740979031194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2041.0, | |
| "completions/max_terminated_length": 2041.0, | |
| "completions/mean_length": 552.109375, | |
| "completions/mean_terminated_length": 538.6032104492188, | |
| "completions/min_length": 118.0, | |
| "completions/min_terminated_length": 118.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7133064270019531, | |
| "epoch": 0.18472906403940886, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00011366120432484819, | |
| "kl": 0.0015789342869538814, | |
| "learning_rate": 4.977624881156524e-05, | |
| "loss": 1.5052077287691645e-05, | |
| "num_tokens": 11453589.0, | |
| "reward": 1.0625, | |
| "reward_std": 0.24397501349449158, | |
| "rewards/reward_func/mean": 0.11805555555555555, | |
| "rewards/reward_func/std": 0.027108336488405865, | |
| "sampling/importance_sampling_ratio/max": 2.990895986557007, | |
| "sampling/importance_sampling_ratio/mean": 0.9565259218215942, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.616744041442871, | |
| "sampling/sampling_logp_difference/mean": 0.1869342029094696, | |
| "step": 75, | |
| "step_time": 76.67042307183146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3307.0, | |
| "completions/mean_length": 1050.421875, | |
| "completions/mean_terminated_length": 952.1773681640625, | |
| "completions/min_length": 209.0, | |
| "completions/min_terminated_length": 209.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6937734186649323, | |
| "epoch": 0.18719211822660098, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0016237223301883853, | |
| "kl": 0.0011773691221605986, | |
| "learning_rate": 4.976972631993033e-05, | |
| "loss": -0.0016344873001798987, | |
| "num_tokens": 11610176.0, | |
| "reward": 1.07421875, | |
| "reward_std": 0.3171039819717407, | |
| "rewards/reward_func/mean": 0.1193576388888889, | |
| "rewards/reward_func/std": 0.044041900171173945, | |
| "sampling/importance_sampling_ratio/max": 2.994739055633545, | |
| "sampling/importance_sampling_ratio/mean": 0.9488315582275391, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.017475128173828, | |
| "sampling/sampling_logp_difference/mean": 0.20064926147460938, | |
| "step": 76, | |
| "step_time": 226.78311094199307 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 755.515625, | |
| "completions/mean_terminated_length": 755.515625, | |
| "completions/min_length": 133.0, | |
| "completions/min_terminated_length": 133.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6844090074300766, | |
| "epoch": 0.1896551724137931, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002431004974290889, | |
| "kl": 0.002143903257092461, | |
| "learning_rate": 4.976311056148609e-05, | |
| "loss": -0.013195082545280457, | |
| "num_tokens": 11738369.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.32082173228263855, | |
| "rewards/reward_func/mean": 0.11631944444444445, | |
| "rewards/reward_func/std": 0.0475527693827947, | |
| "sampling/importance_sampling_ratio/max": 2.9915335178375244, | |
| "sampling/importance_sampling_ratio/mean": 0.9584981799125671, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.93669319152832, | |
| "sampling/sampling_logp_difference/mean": 0.18049074709415436, | |
| "step": 77, | |
| "step_time": 88.75774584687315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3728.0, | |
| "completions/mean_length": 982.546875, | |
| "completions/mean_terminated_length": 882.1128540039062, | |
| "completions/min_length": 157.0, | |
| "completions/min_terminated_length": 157.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8276418894529343, | |
| "epoch": 0.1921182266009852, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01783241687418567, | |
| "kl": 0.0020481182436924428, | |
| "learning_rate": 4.975640156114322e-05, | |
| "loss": -0.13759304583072662, | |
| "num_tokens": 11892212.0, | |
| "reward": 1.13671875, | |
| "reward_std": 0.5891372561454773, | |
| "rewards/reward_func/mean": 0.12630208333333334, | |
| "rewards/reward_func/std": 0.1024610847234726, | |
| "sampling/importance_sampling_ratio/max": 2.99920916557312, | |
| "sampling/importance_sampling_ratio/mean": 0.9454777240753174, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.856114387512207, | |
| "sampling/sampling_logp_difference/mean": 0.22521965205669403, | |
| "step": 78, | |
| "step_time": 155.03567869076505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2309.0, | |
| "completions/mean_length": 661.8125, | |
| "completions/mean_terminated_length": 607.3016357421875, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7546624094247818, | |
| "epoch": 0.19458128078817735, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.012855942487314536, | |
| "kl": 0.0011995464155916125, | |
| "learning_rate": 4.974959934416346e-05, | |
| "loss": -0.006040642969310284, | |
| "num_tokens": 12013368.0, | |
| "reward": 1.203125, | |
| "reward_std": 0.5957438349723816, | |
| "rewards/reward_func/mean": 0.13368055555555555, | |
| "rewards/reward_func/std": 0.09448693858252631, | |
| "sampling/importance_sampling_ratio/max": 2.9967188835144043, | |
| "sampling/importance_sampling_ratio/mean": 0.9574424624443054, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.747173309326172, | |
| "sampling/sampling_logp_difference/mean": 0.1902427226305008, | |
| "step": 79, | |
| "step_time": 108.63824260118417 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3953.0, | |
| "completions/mean_length": 1435.046875, | |
| "completions/mean_terminated_length": 1342.6884765625, | |
| "completions/min_length": 241.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7154666781425476, | |
| "epoch": 0.19704433497536947, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008442218807283417, | |
| "kl": 0.0028500978223746642, | |
| "learning_rate": 4.9742703936159586e-05, | |
| "loss": -0.014930376783013344, | |
| "num_tokens": 12195467.0, | |
| "reward": 1.1640625, | |
| "reward_std": 0.5255736708641052, | |
| "rewards/reward_func/mean": 0.1293402777777778, | |
| "rewards/reward_func/std": 0.09521205723285675, | |
| "sampling/importance_sampling_ratio/max": 2.998208999633789, | |
| "sampling/importance_sampling_ratio/mean": 0.9411571025848389, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.294017791748047, | |
| "sampling/sampling_logp_difference/mean": 0.21197447180747986, | |
| "step": 80, | |
| "step_time": 132.8957766594831 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3296.0, | |
| "completions/mean_length": 1159.109375, | |
| "completions/mean_terminated_length": 1064.3709716796875, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7700304388999939, | |
| "epoch": 0.19950738916256158, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0076037617796647925, | |
| "kl": 0.0030132651154417545, | |
| "learning_rate": 4.973571536309525e-05, | |
| "loss": 0.0006418544799089432, | |
| "num_tokens": 12355842.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.6550205945968628, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.1200390938255522, | |
| "sampling/importance_sampling_ratio/max": 2.9922046661376953, | |
| "sampling/importance_sampling_ratio/mean": 0.9423102736473083, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.634392738342285, | |
| "sampling/sampling_logp_difference/mean": 0.22503802180290222, | |
| "step": 81, | |
| "step_time": 165.3830566899851 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2805.0, | |
| "completions/mean_length": 1027.625, | |
| "completions/mean_terminated_length": 981.0967407226562, | |
| "completions/min_length": 304.0, | |
| "completions/min_terminated_length": 304.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6725837737321854, | |
| "epoch": 0.2019704433497537, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009061744134370203, | |
| "kl": 0.0016485043161083013, | |
| "learning_rate": 4.9728633651284914e-05, | |
| "loss": 0.013996231369674206, | |
| "num_tokens": 12524666.0, | |
| "reward": 1.18359375, | |
| "reward_std": 0.5794808268547058, | |
| "rewards/reward_func/mean": 0.13151041666666666, | |
| "rewards/reward_func/std": 0.10200054198503494, | |
| "sampling/importance_sampling_ratio/max": 2.993680953979492, | |
| "sampling/importance_sampling_ratio/mean": 0.9446038007736206, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 20.980178833007812, | |
| "sampling/sampling_logp_difference/mean": 0.21005243062973022, | |
| "step": 82, | |
| "step_time": 165.60002181283198 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2151.0, | |
| "completions/max_terminated_length": 2151.0, | |
| "completions/mean_length": 755.8125, | |
| "completions/mean_terminated_length": 755.8125, | |
| "completions/min_length": 178.0, | |
| "completions/min_terminated_length": 178.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7532096058130264, | |
| "epoch": 0.2044334975369458, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.012740595538592081, | |
| "kl": 0.0015318515652325004, | |
| "learning_rate": 4.972145882739374e-05, | |
| "loss": -0.03307109698653221, | |
| "num_tokens": 12656094.0, | |
| "reward": 1.171875, | |
| "reward_std": 0.605652928352356, | |
| "rewards/reward_func/mean": 0.13020833333333334, | |
| "rewards/reward_func/std": 0.1064814825852712, | |
| "sampling/importance_sampling_ratio/max": 2.997760772705078, | |
| "sampling/importance_sampling_ratio/mean": 0.952104389667511, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.445816040039062, | |
| "sampling/sampling_logp_difference/mean": 0.20742540061473846, | |
| "step": 83, | |
| "step_time": 73.10571676073596 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3916.0, | |
| "completions/mean_length": 1155.390625, | |
| "completions/mean_terminated_length": 1096.6773681640625, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7081952691078186, | |
| "epoch": 0.20689655172413793, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.001375808186987275, | |
| "kl": 0.001374059182126075, | |
| "learning_rate": 4.971419091843748e-05, | |
| "loss": -0.0016404204070568085, | |
| "num_tokens": 12838567.0, | |
| "reward": 1.12109375, | |
| "reward_std": 0.3726571798324585, | |
| "rewards/reward_func/mean": 0.12456597222222222, | |
| "rewards/reward_func/std": 0.05033052464326223, | |
| "sampling/importance_sampling_ratio/max": 2.9999070167541504, | |
| "sampling/importance_sampling_ratio/mean": 0.9335456490516663, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.315726280212402, | |
| "sampling/sampling_logp_difference/mean": 0.24390606582164764, | |
| "step": 84, | |
| "step_time": 163.8807848696597 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2070.0, | |
| "completions/max_terminated_length": 2070.0, | |
| "completions/mean_length": 734.703125, | |
| "completions/mean_terminated_length": 734.703125, | |
| "completions/min_length": 187.0, | |
| "completions/min_terminated_length": 187.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6912446320056915, | |
| "epoch": 0.20935960591133004, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.001448416309828472, | |
| "kl": 0.0035722781904041767, | |
| "learning_rate": 4.970682995178238e-05, | |
| "loss": -0.003674007486552, | |
| "num_tokens": 12973476.0, | |
| "reward": 1.109375, | |
| "reward_std": 0.3329611122608185, | |
| "rewards/reward_func/mean": 0.1232638888888889, | |
| "rewards/reward_func/std": 0.04381412226292822, | |
| "sampling/importance_sampling_ratio/max": 2.993730306625366, | |
| "sampling/importance_sampling_ratio/mean": 0.9494987726211548, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.641794204711914, | |
| "sampling/sampling_logp_difference/mean": 0.2067471742630005, | |
| "step": 85, | |
| "step_time": 77.86746055702679 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2574.0, | |
| "completions/mean_length": 1015.203125, | |
| "completions/mean_terminated_length": 867.36669921875, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7194895893335342, | |
| "epoch": 0.21182266009852216, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.003522987529729015, | |
| "kl": 0.0016897321911528707, | |
| "learning_rate": 4.9699375955145114e-05, | |
| "loss": 0.055119533091783524, | |
| "num_tokens": 13116593.0, | |
| "reward": 1.15234375, | |
| "reward_std": 0.6675844788551331, | |
| "rewards/reward_func/mean": 0.12803819444444445, | |
| "rewards/reward_func/std": 0.09167053633266026, | |
| "sampling/importance_sampling_ratio/max": 2.997987747192383, | |
| "sampling/importance_sampling_ratio/mean": 0.9551426768302917, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.985254287719727, | |
| "sampling/sampling_logp_difference/mean": 0.19374895095825195, | |
| "step": 86, | |
| "step_time": 158.37500746524893 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2987.0, | |
| "completions/max_terminated_length": 2987.0, | |
| "completions/mean_length": 704.09375, | |
| "completions/mean_terminated_length": 704.09375, | |
| "completions/min_length": 111.0, | |
| "completions/min_terminated_length": 111.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6402583420276642, | |
| "epoch": 0.21428571428571427, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002849577272942313, | |
| "kl": 0.002314119366928935, | |
| "learning_rate": 4.96918289565926e-05, | |
| "loss": 0.010025442577898502, | |
| "num_tokens": 13240935.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3592725396156311, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.052181267076068454, | |
| "sampling/importance_sampling_ratio/max": 2.9983925819396973, | |
| "sampling/importance_sampling_ratio/mean": 0.9622253775596619, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.306507110595703, | |
| "sampling/sampling_logp_difference/mean": 0.17209717631340027, | |
| "step": 87, | |
| "step_time": 89.82424217509106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3097.0, | |
| "completions/mean_length": 903.71875, | |
| "completions/mean_terminated_length": 853.0476684570312, | |
| "completions/min_length": 156.0, | |
| "completions/min_terminated_length": 156.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7863613218069077, | |
| "epoch": 0.21674876847290642, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0008406550829518916, | |
| "kl": 0.0019374474068172276, | |
| "learning_rate": 4.968418898454199e-05, | |
| "loss": -0.015768352895975113, | |
| "num_tokens": 13379749.0, | |
| "reward": 1.09375, | |
| "reward_std": 0.322748601436615, | |
| "rewards/reward_func/mean": 0.12152777777777778, | |
| "rewards/reward_func/std": 0.045880657931168876, | |
| "sampling/importance_sampling_ratio/max": 2.9976954460144043, | |
| "sampling/importance_sampling_ratio/mean": 0.950974702835083, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.29272174835205, | |
| "sampling/sampling_logp_difference/mean": 0.2106407880783081, | |
| "step": 88, | |
| "step_time": 148.23201068071648 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3024.0, | |
| "completions/max_terminated_length": 3024.0, | |
| "completions/mean_length": 756.96875, | |
| "completions/mean_terminated_length": 756.96875, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6922882348299026, | |
| "epoch": 0.21921182266009853, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.012908831983821652, | |
| "kl": 0.0029333174461498857, | |
| "learning_rate": 4.967645606776047e-05, | |
| "loss": 0.05130379647016525, | |
| "num_tokens": 13513667.0, | |
| "reward": 1.16015625, | |
| "reward_std": 0.8313872218132019, | |
| "rewards/reward_func/mean": 0.12890625, | |
| "rewards/reward_func/std": 0.11999391516049702, | |
| "sampling/importance_sampling_ratio/max": 2.9972262382507324, | |
| "sampling/importance_sampling_ratio/mean": 0.9526609182357788, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.749960899353027, | |
| "sampling/sampling_logp_difference/mean": 0.19566956162452698, | |
| "step": 89, | |
| "step_time": 90.78295086394064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3175.0, | |
| "completions/max_terminated_length": 3175.0, | |
| "completions/mean_length": 949.28125, | |
| "completions/mean_terminated_length": 956.84130859375, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.666090190410614, | |
| "epoch": 0.22167487684729065, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006511050657427078, | |
| "kl": 0.0016060115303844213, | |
| "learning_rate": 4.966863023536523e-05, | |
| "loss": 0.027448534965515137, | |
| "num_tokens": 13669637.0, | |
| "reward": 1.1796875, | |
| "reward_std": 0.5917232632637024, | |
| "rewards/reward_func/mean": 0.1310763888888889, | |
| "rewards/reward_func/std": 0.09746392981873618, | |
| "sampling/importance_sampling_ratio/max": 2.9971022605895996, | |
| "sampling/importance_sampling_ratio/mean": 0.9492952823638916, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.294812202453613, | |
| "sampling/sampling_logp_difference/mean": 0.2006000131368637, | |
| "step": 90, | |
| "step_time": 104.78784828796051 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2941.0, | |
| "completions/mean_length": 852.40625, | |
| "completions/mean_terminated_length": 815.3278198242188, | |
| "completions/min_length": 171.0, | |
| "completions/min_terminated_length": 171.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6824957877397537, | |
| "epoch": 0.22413793103448276, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013043479712303003, | |
| "kl": 0.0023478574585169554, | |
| "learning_rate": 4.96607115168233e-05, | |
| "loss": -0.04977197200059891, | |
| "num_tokens": 13818799.0, | |
| "reward": 1.03515625, | |
| "reward_std": 0.5453031659126282, | |
| "rewards/reward_func/mean": 0.1150173611111111, | |
| "rewards/reward_func/std": 0.08766606450080872, | |
| "sampling/importance_sampling_ratio/max": 2.9927175045013428, | |
| "sampling/importance_sampling_ratio/mean": 0.9504046440124512, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.90580940246582, | |
| "sampling/sampling_logp_difference/mean": 0.19969666004180908, | |
| "step": 91, | |
| "step_time": 170.82128311530687 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3471.0, | |
| "completions/mean_length": 1190.921875, | |
| "completions/mean_terminated_length": 1062.1334228515625, | |
| "completions/min_length": 38.0, | |
| "completions/min_terminated_length": 38.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.67007115483284, | |
| "epoch": 0.22660098522167488, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00884542043937575, | |
| "kl": 0.002599392697447911, | |
| "learning_rate": 4.965269994195146e-05, | |
| "loss": 0.006632131524384022, | |
| "num_tokens": 13984954.0, | |
| "reward": 1.21875, | |
| "reward_std": 0.6965048909187317, | |
| "rewards/reward_func/mean": 0.13541666666666666, | |
| "rewards/reward_func/std": 0.12860211316082212, | |
| "sampling/importance_sampling_ratio/max": 2.9993481636047363, | |
| "sampling/importance_sampling_ratio/mean": 0.9467147588729858, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.052059173583984, | |
| "sampling/sampling_logp_difference/mean": 0.20239023864269257, | |
| "step": 92, | |
| "step_time": 128.50959110469557 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3876.0, | |
| "completions/max_terminated_length": 3876.0, | |
| "completions/mean_length": 807.453125, | |
| "completions/mean_terminated_length": 785.245849609375, | |
| "completions/min_length": 113.0, | |
| "completions/min_terminated_length": 113.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6888804733753204, | |
| "epoch": 0.229064039408867, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0015433485271994368, | |
| "kl": 0.002188895596191287, | |
| "learning_rate": 4.964459554091615e-05, | |
| "loss": 0.012207363732159138, | |
| "num_tokens": 14128919.0, | |
| "reward": 1.0546875, | |
| "reward_std": 0.2498759627342224, | |
| "rewards/reward_func/mean": 0.1171875, | |
| "rewards/reward_func/std": 0.031979672610759735, | |
| "sampling/importance_sampling_ratio/max": 2.99534010887146, | |
| "sampling/importance_sampling_ratio/mean": 0.9456230998039246, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.117466926574707, | |
| "sampling/sampling_logp_difference/mean": 0.21316124498844147, | |
| "step": 93, | |
| "step_time": 100.48218262591399 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2766.0, | |
| "completions/mean_length": 1112.515625, | |
| "completions/mean_terminated_length": 998.35595703125, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7000087946653366, | |
| "epoch": 0.2315270935960591, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.004334828463500131, | |
| "kl": 0.0017916160868480802, | |
| "learning_rate": 4.9636398344233294e-05, | |
| "loss": 0.029138652607798576, | |
| "num_tokens": 14288936.0, | |
| "reward": 1.21875, | |
| "reward_std": 0.7189332842826843, | |
| "rewards/reward_func/mean": 0.13541666666666666, | |
| "rewards/reward_func/std": 0.10873374260134167, | |
| "sampling/importance_sampling_ratio/max": 2.9988293647766113, | |
| "sampling/importance_sampling_ratio/mean": 0.948699414730072, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.972160339355469, | |
| "sampling/sampling_logp_difference/mean": 0.20441797375679016, | |
| "step": 94, | |
| "step_time": 144.95788948773406 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3095.0, | |
| "completions/max_terminated_length": 3095.0, | |
| "completions/mean_length": 746.5, | |
| "completions/mean_terminated_length": 749.5238647460938, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 183.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8084656894207001, | |
| "epoch": 0.23399014778325122, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.002255399850514679, | |
| "kl": 0.0023641733278054744, | |
| "learning_rate": 4.9628108382768255e-05, | |
| "loss": 0.006529998034238815, | |
| "num_tokens": 14419336.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.33100003004074097, | |
| "rewards/reward_func/mean": 0.12065972222222222, | |
| "rewards/reward_func/std": 0.04311362819539176, | |
| "sampling/importance_sampling_ratio/max": 2.9969942569732666, | |
| "sampling/importance_sampling_ratio/mean": 0.949213981628418, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.631890296936035, | |
| "sampling/sampling_logp_difference/mean": 0.21985289454460144, | |
| "step": 95, | |
| "step_time": 88.58697469602339 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3782.0, | |
| "completions/mean_length": 998.828125, | |
| "completions/mean_terminated_length": 884.8359985351562, | |
| "completions/min_length": 46.0, | |
| "completions/min_terminated_length": 46.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.8362721502780914, | |
| "epoch": 0.23645320197044334, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.006188412376322827, | |
| "kl": 0.005402846087235957, | |
| "learning_rate": 4.9619725687735686e-05, | |
| "loss": 0.012449314817786217, | |
| "num_tokens": 14568589.0, | |
| "reward": 1.25, | |
| "reward_std": 0.8428032994270325, | |
| "rewards/reward_func/mean": 0.1388888888888889, | |
| "rewards/reward_func/std": 0.13194227839509645, | |
| "sampling/importance_sampling_ratio/max": 2.9986398220062256, | |
| "sampling/importance_sampling_ratio/mean": 0.9465553164482117, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.732090950012207, | |
| "sampling/sampling_logp_difference/mean": 0.22394214570522308, | |
| "step": 96, | |
| "step_time": 154.48756418889388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3459.0, | |
| "completions/mean_length": 879.171875, | |
| "completions/mean_terminated_length": 828.1111450195312, | |
| "completions/min_length": 56.0, | |
| "completions/min_terminated_length": 56.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6241284310817719, | |
| "epoch": 0.23891625615763548, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.0066502625663478546, | |
| "kl": 0.002683707105461508, | |
| "learning_rate": 4.96112502906994e-05, | |
| "loss": 0.014375880360603333, | |
| "num_tokens": 14705512.0, | |
| "reward": 1.2109375, | |
| "reward_std": 0.6223654747009277, | |
| "rewards/reward_func/mean": 0.1345486111111111, | |
| "rewards/reward_func/std": 0.10964169187678231, | |
| "sampling/importance_sampling_ratio/max": 2.9906349182128906, | |
| "sampling/importance_sampling_ratio/mean": 0.9618527889251709, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.417328834533691, | |
| "sampling/sampling_logp_difference/mean": 0.16517508029937744, | |
| "step": 97, | |
| "step_time": 124.35440509300679 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2937.0, | |
| "completions/mean_length": 956.59375, | |
| "completions/mean_terminated_length": 906.761962890625, | |
| "completions/min_length": 121.0, | |
| "completions/min_terminated_length": 121.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7140306979417801, | |
| "epoch": 0.2413793103448276, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.00833558507766438, | |
| "kl": 0.0026831042487174273, | |
| "learning_rate": 4.960268222357227e-05, | |
| "loss": -0.014123106375336647, | |
| "num_tokens": 14860574.0, | |
| "reward": 1.20703125, | |
| "reward_std": 0.6879845857620239, | |
| "rewards/reward_func/mean": 0.13411458333333334, | |
| "rewards/reward_func/std": 0.12600696169667774, | |
| "sampling/importance_sampling_ratio/max": 2.998737096786499, | |
| "sampling/importance_sampling_ratio/mean": 0.9459173083305359, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.235814094543457, | |
| "sampling/sampling_logp_difference/mean": 0.21586598455905914, | |
| "step": 98, | |
| "step_time": 125.9828494079411 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2820.0, | |
| "completions/mean_length": 1306.296875, | |
| "completions/mean_terminated_length": 1179.559326171875, | |
| "completions/min_length": 427.0, | |
| "completions/min_terminated_length": 427.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.7497763484716415, | |
| "epoch": 0.2438423645320197, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0007196218706857476, | |
| "kl": 0.0024949743528850377, | |
| "learning_rate": 4.959402151861613e-05, | |
| "loss": 0.00040583667578175664, | |
| "num_tokens": 15036001.0, | |
| "reward": 1.08203125, | |
| "reward_std": 0.3022885024547577, | |
| "rewards/reward_func/mean": 0.12022569444444445, | |
| "rewards/reward_func/std": 0.03856059287985166, | |
| "sampling/importance_sampling_ratio/max": 2.9977340698242188, | |
| "sampling/importance_sampling_ratio/mean": 0.9443312883377075, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.374924659729004, | |
| "sampling/sampling_logp_difference/mean": 0.2221134603023529, | |
| "step": 99, | |
| "step_time": 177.20913607790135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3101.0, | |
| "completions/mean_length": 1241.84375, | |
| "completions/mean_terminated_length": 1127.6064453125, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6876734793186188, | |
| "epoch": 0.24630541871921183, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009038766598917204, | |
| "kl": 0.00271606317255646, | |
| "learning_rate": 4.958526820844158e-05, | |
| "loss": -0.003039947245270014, | |
| "num_tokens": 15212663.0, | |
| "reward": 1.2578125, | |
| "reward_std": 0.7084646224975586, | |
| "rewards/reward_func/mean": 0.13975694444444445, | |
| "rewards/reward_func/std": 0.1226612784796291, | |
| "sampling/importance_sampling_ratio/max": 2.9980616569519043, | |
| "sampling/importance_sampling_ratio/mean": 0.9425716400146484, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.271185874938965, | |
| "sampling/sampling_logp_difference/mean": 0.2198907732963562, | |
| "step": 100, | |
| "step_time": 137.27148599014618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3853.0, | |
| "completions/mean_length": 1085.15625, | |
| "completions/mean_terminated_length": 988.0322265625, | |
| "completions/min_length": 230.0, | |
| "completions/min_terminated_length": 230.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6465490013360977, | |
| "epoch": 0.24876847290640394, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.012525746257424489, | |
| "kl": 0.0029582843417301774, | |
| "learning_rate": 4.957642232600797e-05, | |
| "loss": 0.04876864701509476, | |
| "num_tokens": 15367569.0, | |
| "reward": 1.33984375, | |
| "reward_std": 1.0440374612808228, | |
| "rewards/reward_func/mean": 0.1488715277777778, | |
| "rewards/reward_func/std": 0.1550002164310879, | |
| "sampling/importance_sampling_ratio/max": 2.999211072921753, | |
| "sampling/importance_sampling_ratio/mean": 0.95467609167099, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.06248950958252, | |
| "sampling/sampling_logp_difference/mean": 0.18970772624015808, | |
| "step": 101, | |
| "step_time": 118.69119972735643 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2897.0, | |
| "completions/mean_length": 1154.546875, | |
| "completions/mean_terminated_length": 1059.6612548828125, | |
| "completions/min_length": 219.0, | |
| "completions/min_terminated_length": 219.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6360055506229401, | |
| "epoch": 0.2512315270935961, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012431235826599366, | |
| "kl": 0.006371326482621953, | |
| "learning_rate": 4.956748390462316e-05, | |
| "loss": 0.026809336617588997, | |
| "num_tokens": 15533924.0, | |
| "reward": 1.34375, | |
| "reward_std": 1.1246691942214966, | |
| "rewards/reward_func/mean": 0.14930555555555555, | |
| "rewards/reward_func/std": 0.17304262146353722, | |
| "sampling/importance_sampling_ratio/max": 2.9996252059936523, | |
| "sampling/importance_sampling_ratio/mean": 0.9527342915534973, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.81247615814209, | |
| "sampling/sampling_logp_difference/mean": 0.185723677277565, | |
| "step": 102, | |
| "step_time": 131.5465091553051 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3693.0, | |
| "completions/mean_length": 1222.40625, | |
| "completions/mean_terminated_length": 1181.9193115234375, | |
| "completions/min_length": 71.0, | |
| "completions/min_terminated_length": 71.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7460045963525772, | |
| "epoch": 0.2536945812807882, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.013992385855988483, | |
| "kl": 0.0032767574884928763, | |
| "learning_rate": 4.955845297794348e-05, | |
| "loss": 0.02842138148844242, | |
| "num_tokens": 15713870.0, | |
| "reward": 1.5078125, | |
| "reward_std": 1.1425819396972656, | |
| "rewards/reward_func/mean": 0.1675347222222222, | |
| "rewards/reward_func/std": 0.18107240481509101, | |
| "sampling/importance_sampling_ratio/max": 2.999453544616699, | |
| "sampling/importance_sampling_ratio/mean": 0.9345462322235107, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.628706932067871, | |
| "sampling/sampling_logp_difference/mean": 0.23743605613708496, | |
| "step": 103, | |
| "step_time": 138.19932377617806 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3875.0, | |
| "completions/mean_length": 806.578125, | |
| "completions/mean_terminated_length": 766.51611328125, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 160.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.715620145201683, | |
| "epoch": 0.2561576354679803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015716834928270983, | |
| "kl": 0.0057948356261476874, | |
| "learning_rate": 4.954932957997359e-05, | |
| "loss": -0.037697743624448776, | |
| "num_tokens": 15839715.0, | |
| "reward": 1.37109375, | |
| "reward_std": 1.130381464958191, | |
| "rewards/reward_func/mean": 0.15234375, | |
| "rewards/reward_func/std": 0.1768003437254164, | |
| "sampling/importance_sampling_ratio/max": 2.9976377487182617, | |
| "sampling/importance_sampling_ratio/mean": 0.957233190536499, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.865558624267578, | |
| "sampling/sampling_logp_difference/mean": 0.19012512266635895, | |
| "step": 104, | |
| "step_time": 162.9517569427844 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3698.0, | |
| "completions/mean_length": 1274.375, | |
| "completions/mean_terminated_length": 1135.6064453125, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7952848523855209, | |
| "epoch": 0.25862068965517243, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.014526650968551654, | |
| "kl": 0.005329428589902818, | |
| "learning_rate": 4.954011374506632e-05, | |
| "loss": -0.058310359716415405, | |
| "num_tokens": 16010043.0, | |
| "reward": 1.27734375, | |
| "reward_std": 1.0236462354660034, | |
| "rewards/reward_func/mean": 0.14192708333333334, | |
| "rewards/reward_func/std": 0.1571247395541933, | |
| "sampling/importance_sampling_ratio/max": 2.9957821369171143, | |
| "sampling/importance_sampling_ratio/mean": 0.9390854239463806, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.681705474853516, | |
| "sampling/sampling_logp_difference/mean": 0.2258550226688385, | |
| "step": 105, | |
| "step_time": 196.65707094292156 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3047.0, | |
| "completions/mean_length": 846.390625, | |
| "completions/mean_terminated_length": 797.258056640625, | |
| "completions/min_length": 115.0, | |
| "completions/min_terminated_length": 115.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.749016284942627, | |
| "epoch": 0.26108374384236455, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.016145450162948976, | |
| "kl": 0.00974008662160486, | |
| "learning_rate": 4.953080550792254e-05, | |
| "loss": 0.00276003684848547, | |
| "num_tokens": 16144804.0, | |
| "reward": 1.4921875, | |
| "reward_std": 1.1960324048995972, | |
| "rewards/reward_func/mean": 0.1657986111111111, | |
| "rewards/reward_func/std": 0.18122834712266922, | |
| "sampling/importance_sampling_ratio/max": 2.997323989868164, | |
| "sampling/importance_sampling_ratio/mean": 0.9523142576217651, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.121150016784668, | |
| "sampling/sampling_logp_difference/mean": 0.19664835929870605, | |
| "step": 106, | |
| "step_time": 120.09341975627467 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3935.0, | |
| "completions/mean_length": 902.875, | |
| "completions/mean_terminated_length": 801.6884765625, | |
| "completions/min_length": 162.0, | |
| "completions/min_terminated_length": 162.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6777280122041702, | |
| "epoch": 0.26354679802955666, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02015814534595686, | |
| "kl": 0.0061337974620983005, | |
| "learning_rate": 4.952140490359108e-05, | |
| "loss": 0.02125009149312973, | |
| "num_tokens": 16290140.0, | |
| "reward": 1.64453125, | |
| "reward_std": 1.4643954038619995, | |
| "rewards/reward_func/mean": 0.18272569444444445, | |
| "rewards/reward_func/std": 0.21869104810886913, | |
| "sampling/importance_sampling_ratio/max": 2.9990665912628174, | |
| "sampling/importance_sampling_ratio/mean": 0.9552789926528931, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.122296333312988, | |
| "sampling/sampling_logp_difference/mean": 0.1901204138994217, | |
| "step": 107, | |
| "step_time": 136.87008723593317 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3418.0, | |
| "completions/mean_length": 932.375, | |
| "completions/mean_terminated_length": 864.2333984375, | |
| "completions/min_length": 147.0, | |
| "completions/min_terminated_length": 147.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6734343022108078, | |
| "epoch": 0.2660098522167488, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029815642765056004, | |
| "kl": 0.015761199640110135, | |
| "learning_rate": 4.951191196746855e-05, | |
| "loss": -0.06286308169364929, | |
| "num_tokens": 16442756.0, | |
| "reward": 2.078125, | |
| "reward_std": 1.7877864837646484, | |
| "rewards/reward_func/mean": 0.2309027777777778, | |
| "rewards/reward_func/std": 0.24734222681985962, | |
| "sampling/importance_sampling_ratio/max": 2.997526168823242, | |
| "sampling/importance_sampling_ratio/mean": 0.9572924375534058, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.08864688873291, | |
| "sampling/sampling_logp_difference/mean": 0.18523582816123962, | |
| "step": 108, | |
| "step_time": 119.36969709699042 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3303.0, | |
| "completions/mean_length": 1167.125, | |
| "completions/mean_terminated_length": 1023.7333984375, | |
| "completions/min_length": 110.0, | |
| "completions/min_terminated_length": 110.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.641172468662262, | |
| "epoch": 0.2684729064039409, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026888648266965774, | |
| "kl": 0.02729413635097444, | |
| "learning_rate": 4.950232673529922e-05, | |
| "loss": -0.14320430159568787, | |
| "num_tokens": 16600412.0, | |
| "reward": 2.49609375, | |
| "reward_std": 2.1480391025543213, | |
| "rewards/reward_func/mean": 0.27734375, | |
| "rewards/reward_func/std": 0.32694076084428364, | |
| "sampling/importance_sampling_ratio/max": 2.9970366954803467, | |
| "sampling/importance_sampling_ratio/mean": 0.9530101418495178, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.124444007873535, | |
| "sampling/sampling_logp_difference/mean": 0.18392516672611237, | |
| "step": 109, | |
| "step_time": 185.8794325578492 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3733.0, | |
| "completions/mean_length": 1088.71875, | |
| "completions/mean_terminated_length": 1040.984130859375, | |
| "completions/min_length": 378.0, | |
| "completions/min_terminated_length": 378.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6633197665214539, | |
| "epoch": 0.270935960591133, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03216236684392253, | |
| "kl": 0.016894686967134476, | |
| "learning_rate": 4.9492649243174894e-05, | |
| "loss": 0.16343486309051514, | |
| "num_tokens": 16762842.0, | |
| "reward": 2.7109375, | |
| "reward_std": 2.0257365703582764, | |
| "rewards/reward_func/mean": 0.3012152777777778, | |
| "rewards/reward_func/std": 0.2992282451854812, | |
| "sampling/importance_sampling_ratio/max": 2.999589443206787, | |
| "sampling/importance_sampling_ratio/mean": 0.9499955177307129, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.999334335327148, | |
| "sampling/sampling_logp_difference/mean": 0.19575105607509613, | |
| "step": 110, | |
| "step_time": 123.32209147373214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 3051.0, | |
| "completions/max_terminated_length": 2931.0, | |
| "completions/mean_length": 982.609375, | |
| "completions/mean_terminated_length": 965.1802978515625, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7807110399007797, | |
| "epoch": 0.2733990147783251, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03510656960378165, | |
| "kl": 0.054872382432222366, | |
| "learning_rate": 4.948287952753475e-05, | |
| "loss": 0.18941861391067505, | |
| "num_tokens": 16910865.0, | |
| "reward": 3.14453125, | |
| "reward_std": 2.102910280227661, | |
| "rewards/reward_func/mean": 0.3493923611111111, | |
| "rewards/reward_func/std": 0.30523034267955357, | |
| "sampling/importance_sampling_ratio/max": 2.98622727394104, | |
| "sampling/importance_sampling_ratio/mean": 0.9509456157684326, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.073673248291016, | |
| "sampling/sampling_logp_difference/mean": 0.20016415417194366, | |
| "step": 111, | |
| "step_time": 96.22127129789442 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2583.0, | |
| "completions/mean_length": 949.78125, | |
| "completions/mean_terminated_length": 895.800048828125, | |
| "completions/min_length": 273.0, | |
| "completions/min_terminated_length": 273.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6340015828609467, | |
| "epoch": 0.27586206896551724, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.035053246082576636, | |
| "kl": 0.025362088344991207, | |
| "learning_rate": 4.947301762516526e-05, | |
| "loss": -0.24255138635635376, | |
| "num_tokens": 17052563.0, | |
| "reward": 3.7265625, | |
| "reward_std": 2.2624080181121826, | |
| "rewards/reward_func/mean": 0.4140625, | |
| "rewards/reward_func/std": 0.3382473927405145, | |
| "sampling/importance_sampling_ratio/max": 2.998375415802002, | |
| "sampling/importance_sampling_ratio/mean": 0.9581119418144226, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.95738410949707, | |
| "sampling/sampling_logp_difference/mean": 0.17058373987674713, | |
| "step": 112, | |
| "step_time": 177.4650380751118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3117.0, | |
| "completions/mean_length": 1217.34375, | |
| "completions/mean_terminated_length": 1138.6785888671875, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6510083079338074, | |
| "epoch": 0.27832512315270935, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03581845458385713, | |
| "kl": 0.12716092402115464, | |
| "learning_rate": 4.946306357319997e-05, | |
| "loss": 0.04907531663775444, | |
| "num_tokens": 17218617.0, | |
| "reward": 3.55859375, | |
| "reward_std": 2.1090047359466553, | |
| "rewards/reward_func/mean": 0.3953993055555556, | |
| "rewards/reward_func/std": 0.3486923161480162, | |
| "sampling/importance_sampling_ratio/max": 2.998523473739624, | |
| "sampling/importance_sampling_ratio/mean": 0.951481819152832, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.994354248046875, | |
| "sampling/sampling_logp_difference/mean": 0.19012655317783356, | |
| "step": 113, | |
| "step_time": 136.22616961598396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3032.0, | |
| "completions/mean_length": 1177.359375, | |
| "completions/mean_terminated_length": 1005.586181640625, | |
| "completions/min_length": 237.0, | |
| "completions/min_terminated_length": 276.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5681300014257431, | |
| "epoch": 0.28078817733990147, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028436619055383507, | |
| "kl": 0.030896728625521064, | |
| "learning_rate": 4.9453017409119416e-05, | |
| "loss": 0.01816936582326889, | |
| "num_tokens": 17376608.0, | |
| "reward": 3.27734375, | |
| "reward_std": 2.361697196960449, | |
| "rewards/reward_func/mean": 0.3641493055555556, | |
| "rewards/reward_func/std": 0.34005943934122723, | |
| "sampling/importance_sampling_ratio/max": 2.9866039752960205, | |
| "sampling/importance_sampling_ratio/mean": 0.9625529646873474, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.621533393859863, | |
| "sampling/sampling_logp_difference/mean": 0.1564193069934845, | |
| "step": 114, | |
| "step_time": 129.57314335857518 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3927.0, | |
| "completions/mean_length": 1518.875, | |
| "completions/mean_terminated_length": 1434.8070068359375, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5538199990987778, | |
| "epoch": 0.2832512315270936, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025627192501330182, | |
| "kl": 0.026664719451218843, | |
| "learning_rate": 4.9442879170750976e-05, | |
| "loss": -0.0327904112637043, | |
| "num_tokens": 17559704.0, | |
| "reward": 3.7734375, | |
| "reward_std": 2.075326919555664, | |
| "rewards/reward_func/mean": 0.4192708333333333, | |
| "rewards/reward_func/std": 0.31613584028349984, | |
| "sampling/importance_sampling_ratio/max": 2.999535083770752, | |
| "sampling/importance_sampling_ratio/mean": 0.9549193382263184, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.575541496276855, | |
| "sampling/sampling_logp_difference/mean": 0.16454458236694336, | |
| "step": 115, | |
| "step_time": 127.90012184623629 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3664.0, | |
| "completions/mean_length": 1275.0, | |
| "completions/mean_terminated_length": 1083.375, | |
| "completions/min_length": 250.0, | |
| "completions/min_terminated_length": 250.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6624896675348282, | |
| "epoch": 0.2857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025305169538771953, | |
| "kl": 0.029944519512355328, | |
| "learning_rate": 4.943264889626871e-05, | |
| "loss": -0.010028105229139328, | |
| "num_tokens": 17731512.0, | |
| "reward": 4.19140625, | |
| "reward_std": 1.9678679704666138, | |
| "rewards/reward_func/mean": 0.4657118055555556, | |
| "rewards/reward_func/std": 0.3197719504435857, | |
| "sampling/importance_sampling_ratio/max": 2.9974968433380127, | |
| "sampling/importance_sampling_ratio/mean": 0.9508453607559204, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.936443328857422, | |
| "sampling/sampling_logp_difference/mean": 0.1938401758670807, | |
| "step": 116, | |
| "step_time": 124.77625619992614 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3657.0, | |
| "completions/mean_length": 1349.109375, | |
| "completions/mean_terminated_length": 1051.345458984375, | |
| "completions/min_length": 197.0, | |
| "completions/min_terminated_length": 197.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5349463224411011, | |
| "epoch": 0.2881773399014778, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02015511036915428, | |
| "kl": 0.02311926893889904, | |
| "learning_rate": 4.942232662419324e-05, | |
| "loss": -0.0261261984705925, | |
| "num_tokens": 17893519.0, | |
| "reward": 4.4296875, | |
| "reward_std": 1.7346084117889404, | |
| "rewards/reward_func/mean": 0.4921875, | |
| "rewards/reward_func/std": 0.28419747120804256, | |
| "sampling/importance_sampling_ratio/max": 2.9996516704559326, | |
| "sampling/importance_sampling_ratio/mean": 0.9627872109413147, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.845947265625, | |
| "sampling/sampling_logp_difference/mean": 0.1443958282470703, | |
| "step": 117, | |
| "step_time": 115.21389902406372 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3637.0, | |
| "completions/mean_length": 1419.34375, | |
| "completions/mean_terminated_length": 1145.5926513671875, | |
| "completions/min_length": 195.0, | |
| "completions/min_terminated_length": 195.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5412204563617706, | |
| "epoch": 0.29064039408866993, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.025050176257689093, | |
| "kl": 0.028975401539355516, | |
| "learning_rate": 4.941191239339158e-05, | |
| "loss": -0.17452068626880646, | |
| "num_tokens": 18065269.0, | |
| "reward": 4.2421875, | |
| "reward_std": 2.0397677421569824, | |
| "rewards/reward_func/mean": 0.4713541666666667, | |
| "rewards/reward_func/std": 0.32353421714570785, | |
| "sampling/importance_sampling_ratio/max": 2.9959006309509277, | |
| "sampling/importance_sampling_ratio/mean": 0.9568660259246826, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.495771408081055, | |
| "sampling/sampling_logp_difference/mean": 0.1632460504770279, | |
| "step": 118, | |
| "step_time": 131.64897252176888 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2870.0, | |
| "completions/mean_length": 1496.625, | |
| "completions/mean_terminated_length": 1206.375, | |
| "completions/min_length": 159.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5859149247407913, | |
| "epoch": 0.29310344827586204, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023282759194587912, | |
| "kl": 0.03531341487541795, | |
| "learning_rate": 4.9401406243077e-05, | |
| "loss": -0.054308779537677765, | |
| "num_tokens": 18247453.0, | |
| "reward": 3.76171875, | |
| "reward_std": 2.172807216644287, | |
| "rewards/reward_func/mean": 0.41796875, | |
| "rewards/reward_func/std": 0.3388514237271415, | |
| "sampling/importance_sampling_ratio/max": 2.9988443851470947, | |
| "sampling/importance_sampling_ratio/mean": 0.9568687677383423, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.808798789978027, | |
| "sampling/sampling_logp_difference/mean": 0.17285825312137604, | |
| "step": 119, | |
| "step_time": 186.21395082375966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3288.0, | |
| "completions/mean_length": 1390.578125, | |
| "completions/mean_terminated_length": 1102.26416015625, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5405104458332062, | |
| "epoch": 0.2955665024630542, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018803990743447324, | |
| "kl": 0.06734272092580795, | |
| "learning_rate": 4.939080821280889e-05, | |
| "loss": -0.07871407270431519, | |
| "num_tokens": 18418210.0, | |
| "reward": 4.59375, | |
| "reward_std": 1.9418632984161377, | |
| "rewards/reward_func/mean": 0.5104166666666666, | |
| "rewards/reward_func/std": 0.3218831883536445, | |
| "sampling/importance_sampling_ratio/max": 2.9988901615142822, | |
| "sampling/importance_sampling_ratio/mean": 0.9608282446861267, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.319305419921875, | |
| "sampling/sampling_logp_difference/mean": 0.15571467578411102, | |
| "step": 120, | |
| "step_time": 126.49870767304674 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3132.0, | |
| "completions/mean_length": 1579.703125, | |
| "completions/mean_terminated_length": 1083.734619140625, | |
| "completions/min_length": 208.0, | |
| "completions/min_terminated_length": 208.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5791607052087784, | |
| "epoch": 0.29802955665024633, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.018886959920027147, | |
| "kl": 0.11334666470065713, | |
| "learning_rate": 4.9380118342492596e-05, | |
| "loss": -0.0750376507639885, | |
| "num_tokens": 18600895.0, | |
| "reward": 4.59375, | |
| "reward_std": 1.8953262567520142, | |
| "rewards/reward_func/mean": 0.5104166666666666, | |
| "rewards/reward_func/std": 0.3163795851998859, | |
| "sampling/importance_sampling_ratio/max": 2.9966447353363037, | |
| "sampling/importance_sampling_ratio/mean": 0.9624161720275879, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.319793701171875, | |
| "sampling/sampling_logp_difference/mean": 0.16001760959625244, | |
| "step": 121, | |
| "step_time": 131.29123148694634 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3348.0, | |
| "completions/mean_length": 1523.4375, | |
| "completions/mean_terminated_length": 1047.0369873046875, | |
| "completions/min_length": 285.0, | |
| "completions/min_terminated_length": 285.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5341637283563614, | |
| "epoch": 0.30049261083743845, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01854662654237563, | |
| "kl": 0.03084225719794631, | |
| "learning_rate": 4.936933667237926e-05, | |
| "loss": -0.05206795781850815, | |
| "num_tokens": 18784987.0, | |
| "reward": 4.55078125, | |
| "reward_std": 1.8017736673355103, | |
| "rewards/reward_func/mean": 0.5056423611111112, | |
| "rewards/reward_func/std": 0.3026386151711146, | |
| "sampling/importance_sampling_ratio/max": 2.9998748302459717, | |
| "sampling/importance_sampling_ratio/mean": 0.9602913856506348, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.124850273132324, | |
| "sampling/sampling_logp_difference/mean": 0.1521858274936676, | |
| "step": 122, | |
| "step_time": 178.7429536471609 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3319.0, | |
| "completions/mean_length": 1527.625, | |
| "completions/mean_terminated_length": 1043.1199951171875, | |
| "completions/min_length": 285.0, | |
| "completions/min_terminated_length": 285.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.580025851726532, | |
| "epoch": 0.30295566502463056, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.020308490136383247, | |
| "kl": 0.02926149358972907, | |
| "learning_rate": 4.935846324306571e-05, | |
| "loss": -0.007207506336271763, | |
| "num_tokens": 18971091.0, | |
| "reward": 4.83984375, | |
| "reward_std": 1.9646514654159546, | |
| "rewards/reward_func/mean": 0.5377604166666666, | |
| "rewards/reward_func/std": 0.32252822981940377, | |
| "sampling/importance_sampling_ratio/max": 2.996708869934082, | |
| "sampling/importance_sampling_ratio/mean": 0.9550896883010864, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.216852188110352, | |
| "sampling/sampling_logp_difference/mean": 0.17026376724243164, | |
| "step": 123, | |
| "step_time": 198.48502158699557 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1957.0, | |
| "completions/mean_length": 1257.0625, | |
| "completions/mean_terminated_length": 992.870361328125, | |
| "completions/min_length": 325.0, | |
| "completions/min_terminated_length": 325.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.4820714667439461, | |
| "epoch": 0.3054187192118227, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01958396564544384, | |
| "kl": 0.04494946589693427, | |
| "learning_rate": 4.934749809549427e-05, | |
| "loss": -0.10934413224458694, | |
| "num_tokens": 19134583.0, | |
| "reward": 4.5859375, | |
| "reward_std": 1.928904414176941, | |
| "rewards/reward_func/mean": 0.5095486111111112, | |
| "rewards/reward_func/std": 0.3117486619287067, | |
| "sampling/importance_sampling_ratio/max": 2.9978034496307373, | |
| "sampling/importance_sampling_ratio/mean": 0.9640801548957825, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.056150436401367, | |
| "sampling/sampling_logp_difference/mean": 0.13894838094711304, | |
| "step": 124, | |
| "step_time": 134.90961828804575 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3030.0, | |
| "completions/mean_length": 1880.578125, | |
| "completions/mean_terminated_length": 1210.3863525390625, | |
| "completions/min_length": 275.0, | |
| "completions/min_terminated_length": 275.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5541824847459793, | |
| "epoch": 0.3078817733990148, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016006255366419888, | |
| "kl": 0.031152247916907072, | |
| "learning_rate": 4.9336441270952595e-05, | |
| "loss": -0.15780873596668243, | |
| "num_tokens": 19346876.0, | |
| "reward": 4.2109375, | |
| "reward_std": 1.9901636838912964, | |
| "rewards/reward_func/mean": 0.4678819444444444, | |
| "rewards/reward_func/std": 0.3124983575608995, | |
| "sampling/importance_sampling_ratio/max": 2.992753267288208, | |
| "sampling/importance_sampling_ratio/mean": 0.9540883302688599, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.755999565124512, | |
| "sampling/sampling_logp_difference/mean": 0.17641326785087585, | |
| "step": 125, | |
| "step_time": 134.62373927910812 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4074.0, | |
| "completions/mean_length": 1447.328125, | |
| "completions/mean_terminated_length": 1258.482177734375, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5232695862650871, | |
| "epoch": 0.3103448275862069, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02110460828294288, | |
| "kl": 0.021355477161705494, | |
| "learning_rate": 4.932529281107355e-05, | |
| "loss": 0.04123953729867935, | |
| "num_tokens": 19529921.0, | |
| "reward": 4.65625, | |
| "reward_std": 1.6051133871078491, | |
| "rewards/reward_func/mean": 0.5173611111111112, | |
| "rewards/reward_func/std": 0.2619215887453821, | |
| "sampling/importance_sampling_ratio/max": 2.999826192855835, | |
| "sampling/importance_sampling_ratio/mean": 0.9576446413993835, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.864720344543457, | |
| "sampling/sampling_logp_difference/mean": 0.15989640355110168, | |
| "step": 126, | |
| "step_time": 139.1471421548631 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3559.0, | |
| "completions/mean_length": 1351.46875, | |
| "completions/mean_terminated_length": 1131.0726318359375, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5890444368124008, | |
| "epoch": 0.312807881773399, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.026187013896567975, | |
| "kl": 0.04253423307090998, | |
| "learning_rate": 4.931405275783507e-05, | |
| "loss": 0.029535703361034393, | |
| "num_tokens": 19696367.0, | |
| "reward": 4.34765625, | |
| "reward_std": 1.9167273044586182, | |
| "rewards/reward_func/mean": 0.4830729166666667, | |
| "rewards/reward_func/std": 0.2942189425230026, | |
| "sampling/importance_sampling_ratio/max": 2.9940335750579834, | |
| "sampling/importance_sampling_ratio/mean": 0.9561335444450378, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.746670722961426, | |
| "sampling/sampling_logp_difference/mean": 0.1686076819896698, | |
| "step": 127, | |
| "step_time": 129.97182760294527 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2472.0, | |
| "completions/mean_length": 1517.96875, | |
| "completions/mean_terminated_length": 930.4118041992188, | |
| "completions/min_length": 274.0, | |
| "completions/min_terminated_length": 274.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5890470743179321, | |
| "epoch": 0.31527093596059114, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.019386571014408556, | |
| "kl": 0.02690672129392624, | |
| "learning_rate": 4.930272115355992e-05, | |
| "loss": 0.06074991077184677, | |
| "num_tokens": 19872765.0, | |
| "reward": 4.25390625, | |
| "reward_std": 1.9833093881607056, | |
| "rewards/reward_func/mean": 0.47265625, | |
| "rewards/reward_func/std": 0.3048050221469667, | |
| "sampling/importance_sampling_ratio/max": 2.988401412963867, | |
| "sampling/importance_sampling_ratio/mean": 0.9639754295349121, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.903083801269531, | |
| "sampling/sampling_logp_difference/mean": 0.15789353847503662, | |
| "step": 128, | |
| "step_time": 127.74785562674515 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3943.0, | |
| "completions/mean_length": 1287.140625, | |
| "completions/mean_terminated_length": 947.0000610351562, | |
| "completions/min_length": 249.0, | |
| "completions/min_terminated_length": 249.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5195007175207138, | |
| "epoch": 0.31773399014778325, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.020892925194507105, | |
| "kl": 0.05176311079412699, | |
| "learning_rate": 4.929129804091562e-05, | |
| "loss": 0.012292366474866867, | |
| "num_tokens": 20032838.0, | |
| "reward": 4.8515625, | |
| "reward_std": 1.6842634677886963, | |
| "rewards/reward_func/mean": 0.5390625, | |
| "rewards/reward_func/std": 0.29546265221304363, | |
| "sampling/importance_sampling_ratio/max": 2.999087333679199, | |
| "sampling/importance_sampling_ratio/mean": 0.9689388275146484, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.614640235900879, | |
| "sampling/sampling_logp_difference/mean": 0.1367620825767517, | |
| "step": 129, | |
| "step_time": 121.45446623302996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3191.0, | |
| "completions/mean_length": 1508.109375, | |
| "completions/mean_terminated_length": 1028.870361328125, | |
| "completions/min_length": 109.0, | |
| "completions/min_terminated_length": 109.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6037813127040863, | |
| "epoch": 0.32019704433497537, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01962016974443757, | |
| "kl": 0.022648704703897238, | |
| "learning_rate": 4.927978346291424e-05, | |
| "loss": -0.17073693871498108, | |
| "num_tokens": 20210157.0, | |
| "reward": 4.05859375, | |
| "reward_std": 2.1061806678771973, | |
| "rewards/reward_func/mean": 0.4509548611111111, | |
| "rewards/reward_func/std": 0.32526984645260704, | |
| "sampling/importance_sampling_ratio/max": 2.999075174331665, | |
| "sampling/importance_sampling_ratio/mean": 0.9614124298095703, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.54149055480957, | |
| "sampling/sampling_logp_difference/mean": 0.1666044294834137, | |
| "step": 130, | |
| "step_time": 139.74313421617262 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3556.0, | |
| "completions/mean_length": 1106.921875, | |
| "completions/mean_terminated_length": 907.6500244140625, | |
| "completions/min_length": 164.0, | |
| "completions/min_terminated_length": 164.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5114710703492165, | |
| "epoch": 0.3226600985221675, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022324741609899095, | |
| "kl": 0.04564378131181002, | |
| "learning_rate": 4.9268177462912255e-05, | |
| "loss": -0.07674457132816315, | |
| "num_tokens": 20360984.0, | |
| "reward": 4.375, | |
| "reward_std": 1.8235670328140259, | |
| "rewards/reward_func/mean": 0.4861111111111111, | |
| "rewards/reward_func/std": 0.3013741249839465, | |
| "sampling/importance_sampling_ratio/max": 2.999788284301758, | |
| "sampling/importance_sampling_ratio/mean": 0.9689079523086548, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.985334396362305, | |
| "sampling/sampling_logp_difference/mean": 0.13317884504795074, | |
| "step": 131, | |
| "step_time": 125.56218209001236 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3430.0, | |
| "completions/mean_length": 1374.1875, | |
| "completions/mean_terminated_length": 1139.5357666015625, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5266564786434174, | |
| "epoch": 0.3251231527093596, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016910527382422866, | |
| "kl": 0.025493765715509653, | |
| "learning_rate": 4.9256480084610376e-05, | |
| "loss": -0.04664276912808418, | |
| "num_tokens": 20536708.0, | |
| "reward": 4.65234375, | |
| "reward_std": 1.600233793258667, | |
| "rewards/reward_func/mean": 0.5169270833333334, | |
| "rewards/reward_func/std": 0.25672541227605605, | |
| "sampling/importance_sampling_ratio/max": 2.9952757358551025, | |
| "sampling/importance_sampling_ratio/mean": 0.9601929187774658, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.947929382324219, | |
| "sampling/sampling_logp_difference/mean": 0.15485885739326477, | |
| "step": 132, | |
| "step_time": 150.15074049308896 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3145.0, | |
| "completions/mean_length": 1287.421875, | |
| "completions/mean_terminated_length": 1202.7376708984375, | |
| "completions/min_length": 241.0, | |
| "completions/min_terminated_length": 241.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.57479427754879, | |
| "epoch": 0.3275862068965517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024161045457601445, | |
| "kl": 0.01593266148120165, | |
| "learning_rate": 4.9244691372053376e-05, | |
| "loss": -0.05266115441918373, | |
| "num_tokens": 20702559.0, | |
| "reward": 4.51171875, | |
| "reward_std": 1.6198359727859497, | |
| "rewards/reward_func/mean": 0.5013020833333334, | |
| "rewards/reward_func/std": 0.262553902135955, | |
| "sampling/importance_sampling_ratio/max": 2.999521017074585, | |
| "sampling/importance_sampling_ratio/mean": 0.9583712816238403, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.616655349731445, | |
| "sampling/sampling_logp_difference/mean": 0.16895034909248352, | |
| "step": 133, | |
| "step_time": 159.73296225816011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4016.0, | |
| "completions/mean_length": 1460.59375, | |
| "completions/mean_terminated_length": 1351.4482421875, | |
| "completions/min_length": 537.0, | |
| "completions/min_terminated_length": 537.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6757910698652267, | |
| "epoch": 0.33004926108374383, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02485612182204898, | |
| "kl": 0.030998756643384695, | |
| "learning_rate": 4.9232811369629936e-05, | |
| "loss": 0.010290354490280151, | |
| "num_tokens": 20883813.0, | |
| "reward": 4.35546875, | |
| "reward_std": 1.7169814109802246, | |
| "rewards/reward_func/mean": 0.4839409722222222, | |
| "rewards/reward_func/std": 0.2722629126575258, | |
| "sampling/importance_sampling_ratio/max": 2.9927690029144287, | |
| "sampling/importance_sampling_ratio/mean": 0.9484080672264099, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.797867774963379, | |
| "sampling/sampling_logp_difference/mean": 0.19552484154701233, | |
| "step": 134, | |
| "step_time": 136.03435460082255 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2941.0, | |
| "completions/mean_length": 1210.078125, | |
| "completions/mean_terminated_length": 1092.7930908203125, | |
| "completions/min_length": 288.0, | |
| "completions/min_terminated_length": 288.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5237693637609482, | |
| "epoch": 0.33251231527093594, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.020927439602090963, | |
| "kl": 0.019747328478842974, | |
| "learning_rate": 4.9220840122072495e-05, | |
| "loss": -0.019350498914718628, | |
| "num_tokens": 21043770.0, | |
| "reward": 4.46875, | |
| "reward_std": 1.5587056875228882, | |
| "rewards/reward_func/mean": 0.4965277777777778, | |
| "rewards/reward_func/std": 0.25133796367380357, | |
| "sampling/importance_sampling_ratio/max": 2.9990768432617188, | |
| "sampling/importance_sampling_ratio/mean": 0.964819073677063, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.993998527526855, | |
| "sampling/sampling_logp_difference/mean": 0.14578461647033691, | |
| "step": 135, | |
| "step_time": 129.014442861313 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3859.0, | |
| "completions/mean_length": 1473.328125, | |
| "completions/mean_terminated_length": 1222.0545654296875, | |
| "completions/min_length": 197.0, | |
| "completions/min_terminated_length": 197.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6460306495428085, | |
| "epoch": 0.33497536945812806, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02366790790806787, | |
| "kl": 0.025228526908904314, | |
| "learning_rate": 4.920877767445705e-05, | |
| "loss": 0.01618235744535923, | |
| "num_tokens": 21229279.0, | |
| "reward": 4.203125, | |
| "reward_std": 1.850501298904419, | |
| "rewards/reward_func/mean": 0.4670138888888889, | |
| "rewards/reward_func/std": 0.3125879168510437, | |
| "sampling/importance_sampling_ratio/max": 2.998635768890381, | |
| "sampling/importance_sampling_ratio/mean": 0.9495352506637573, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.83233642578125, | |
| "sampling/sampling_logp_difference/mean": 0.1911965161561966, | |
| "step": 136, | |
| "step_time": 136.6534457411617 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3018.0, | |
| "completions/max_terminated_length": 3018.0, | |
| "completions/mean_length": 1125.46875, | |
| "completions/mean_terminated_length": 1133.04833984375, | |
| "completions/min_length": 404.0, | |
| "completions/min_terminated_length": 404.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6018159687519073, | |
| "epoch": 0.3374384236453202, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028159374909103065, | |
| "kl": 0.018990385811775923, | |
| "learning_rate": 4.919662407220299e-05, | |
| "loss": 0.002560041844844818, | |
| "num_tokens": 21396941.0, | |
| "reward": 4.109375, | |
| "reward_std": 1.7648885250091553, | |
| "rewards/reward_func/mean": 0.4565972222222222, | |
| "rewards/reward_func/std": 0.24216210428211424, | |
| "sampling/importance_sampling_ratio/max": 2.999795436859131, | |
| "sampling/importance_sampling_ratio/mean": 0.9545049667358398, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.2381591796875, | |
| "sampling/sampling_logp_difference/mean": 0.17799274623394012, | |
| "step": 137, | |
| "step_time": 110.39313215529546 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3624.0, | |
| "completions/mean_length": 949.890625, | |
| "completions/mean_terminated_length": 874.6612548828125, | |
| "completions/min_length": 291.0, | |
| "completions/min_terminated_length": 291.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6683694273233414, | |
| "epoch": 0.3399014778325123, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.0325675264957142, | |
| "kl": 0.02215199451893568, | |
| "learning_rate": 4.918437936107293e-05, | |
| "loss": 0.22962699830532074, | |
| "num_tokens": 21544198.0, | |
| "reward": 4.30859375, | |
| "reward_std": 1.822338342666626, | |
| "rewards/reward_func/mean": 0.4787326388888889, | |
| "rewards/reward_func/std": 0.2953062653541565, | |
| "sampling/importance_sampling_ratio/max": 2.999077558517456, | |
| "sampling/importance_sampling_ratio/mean": 0.9570326805114746, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.773735046386719, | |
| "sampling/sampling_logp_difference/mean": 0.17750665545463562, | |
| "step": 138, | |
| "step_time": 136.9202022489626 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3902.0, | |
| "completions/mean_length": 1019.59375, | |
| "completions/mean_terminated_length": 979.274169921875, | |
| "completions/min_length": 263.0, | |
| "completions/min_terminated_length": 263.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5504418164491653, | |
| "epoch": 0.34236453201970446, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02895040422333763, | |
| "kl": 0.022574008908122778, | |
| "learning_rate": 4.9172043587172564e-05, | |
| "loss": -0.1542976051568985, | |
| "num_tokens": 21691068.0, | |
| "reward": 4.20703125, | |
| "reward_std": 1.707566499710083, | |
| "rewards/reward_func/mean": 0.4674479166666667, | |
| "rewards/reward_func/std": 0.2652057492070728, | |
| "sampling/importance_sampling_ratio/max": 2.9986119270324707, | |
| "sampling/importance_sampling_ratio/mean": 0.9594486951828003, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.460792541503906, | |
| "sampling/sampling_logp_difference/mean": 0.1590704321861267, | |
| "step": 139, | |
| "step_time": 189.29539473517798 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3238.0, | |
| "completions/max_terminated_length": 3238.0, | |
| "completions/mean_length": 675.71875, | |
| "completions/mean_terminated_length": 661.1428833007812, | |
| "completions/min_length": 123.0, | |
| "completions/min_terminated_length": 123.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5717846751213074, | |
| "epoch": 0.3448275862068966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.037716087957076136, | |
| "kl": 0.038773443549871445, | |
| "learning_rate": 4.915961679695046e-05, | |
| "loss": 0.031660258769989014, | |
| "num_tokens": 21808954.0, | |
| "reward": 4.61328125, | |
| "reward_std": 1.607848882675171, | |
| "rewards/reward_func/mean": 0.5125868055555556, | |
| "rewards/reward_func/std": 0.25105878214041394, | |
| "sampling/importance_sampling_ratio/max": 2.999619483947754, | |
| "sampling/importance_sampling_ratio/mean": 0.9686312675476074, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.18880558013916, | |
| "sampling/sampling_logp_difference/mean": 0.1539146602153778, | |
| "step": 140, | |
| "step_time": 94.82269705319777 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2489.0, | |
| "completions/mean_length": 1008.359375, | |
| "completions/mean_terminated_length": 935.3225708007812, | |
| "completions/min_length": 210.0, | |
| "completions/min_terminated_length": 210.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5867870151996613, | |
| "epoch": 0.3472906403940887, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02747726707660944, | |
| "kl": 0.032487844582647085, | |
| "learning_rate": 4.914709903719788e-05, | |
| "loss": -0.02273506112396717, | |
| "num_tokens": 21958289.0, | |
| "reward": 4.31640625, | |
| "reward_std": 1.7316166162490845, | |
| "rewards/reward_func/mean": 0.4796006944444444, | |
| "rewards/reward_func/std": 0.24825715935892528, | |
| "sampling/importance_sampling_ratio/max": 2.998828649520874, | |
| "sampling/importance_sampling_ratio/mean": 0.9613453149795532, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.1629056930542, | |
| "sampling/sampling_logp_difference/mean": 0.16322818398475647, | |
| "step": 141, | |
| "step_time": 175.56653777277097 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3728.0, | |
| "completions/max_terminated_length": 3728.0, | |
| "completions/mean_length": 874.546875, | |
| "completions/mean_terminated_length": 874.546875, | |
| "completions/min_length": 227.0, | |
| "completions/min_terminated_length": 227.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6279189586639404, | |
| "epoch": 0.3497536945812808, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02985454353932649, | |
| "kl": 0.024398992769420147, | |
| "learning_rate": 4.913449035504865e-05, | |
| "loss": 0.20730939507484436, | |
| "num_tokens": 22102484.0, | |
| "reward": 4.68359375, | |
| "reward_std": 1.3162914514541626, | |
| "rewards/reward_func/mean": 0.5203993055555556, | |
| "rewards/reward_func/std": 0.20795253912607828, | |
| "sampling/importance_sampling_ratio/max": 2.9962644577026367, | |
| "sampling/importance_sampling_ratio/mean": 0.9599305987358093, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.122313499450684, | |
| "sampling/sampling_logp_difference/mean": 0.17107892036437988, | |
| "step": 142, | |
| "step_time": 107.82716886512935 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2049.0, | |
| "completions/mean_length": 993.640625, | |
| "completions/mean_terminated_length": 786.8167114257812, | |
| "completions/min_length": 300.0, | |
| "completions/min_terminated_length": 300.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5099516436457634, | |
| "epoch": 0.3522167487684729, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009738132356465489, | |
| "kl": 0.03395155072212219, | |
| "learning_rate": 4.912179079797892e-05, | |
| "loss": -0.06845282763242722, | |
| "num_tokens": 22254621.0, | |
| "reward": 4.87890625, | |
| "reward_std": 0.9553443789482117, | |
| "rewards/reward_func/mean": 0.5421006944444444, | |
| "rewards/reward_func/std": 0.14736688633759817, | |
| "sampling/importance_sampling_ratio/max": 2.9960014820098877, | |
| "sampling/importance_sampling_ratio/mean": 0.9686846137046814, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.774383544921875, | |
| "sampling/sampling_logp_difference/mean": 0.14190690219402313, | |
| "step": 143, | |
| "step_time": 185.97113836207427 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3376.0, | |
| "completions/mean_length": 1397.65625, | |
| "completions/mean_terminated_length": 1310.6129150390625, | |
| "completions/min_length": 224.0, | |
| "completions/min_terminated_length": 224.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6476477831602097, | |
| "epoch": 0.35467980295566504, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.017567346260058462, | |
| "kl": 0.014963384717702866, | |
| "learning_rate": 4.910900041380703e-05, | |
| "loss": 0.11150763928890228, | |
| "num_tokens": 22437719.0, | |
| "reward": 4.734375, | |
| "reward_std": 1.2839632034301758, | |
| "rewards/reward_func/mean": 0.5260416666666666, | |
| "rewards/reward_func/std": 0.20239159795973036, | |
| "sampling/importance_sampling_ratio/max": 2.9984025955200195, | |
| "sampling/importance_sampling_ratio/mean": 0.9474148750305176, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.22669792175293, | |
| "sampling/sampling_logp_difference/mean": 0.19336676597595215, | |
| "step": 144, | |
| "step_time": 180.27103169239126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2756.0, | |
| "completions/mean_length": 999.34375, | |
| "completions/mean_terminated_length": 951.2257690429688, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.652740016579628, | |
| "epoch": 0.35714285714285715, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.025811992766747685, | |
| "kl": 0.026440259534865618, | |
| "learning_rate": 4.909611925069332e-05, | |
| "loss": 0.04610571265220642, | |
| "num_tokens": 22588525.0, | |
| "reward": 4.76171875, | |
| "reward_std": 1.4484732151031494, | |
| "rewards/reward_func/mean": 0.5290798611111112, | |
| "rewards/reward_func/std": 0.24869189742538664, | |
| "sampling/importance_sampling_ratio/max": 2.9998068809509277, | |
| "sampling/importance_sampling_ratio/mean": 0.9527676105499268, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.181685447692871, | |
| "sampling/sampling_logp_difference/mean": 0.18670062720775604, | |
| "step": 145, | |
| "step_time": 133.24482462904416 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3955.0, | |
| "completions/mean_length": 1359.0, | |
| "completions/mean_terminated_length": 1125.5, | |
| "completions/min_length": 269.0, | |
| "completions/min_terminated_length": 269.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6540272980928421, | |
| "epoch": 0.35960591133004927, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023050681392453684, | |
| "kl": 0.01712850504554808, | |
| "learning_rate": 4.9083147357139936e-05, | |
| "loss": -0.24461397528648376, | |
| "num_tokens": 22782141.0, | |
| "reward": 4.38671875, | |
| "reward_std": 1.710106372833252, | |
| "rewards/reward_func/mean": 0.4874131944444444, | |
| "rewards/reward_func/std": 0.2449416286415524, | |
| "sampling/importance_sampling_ratio/max": 2.999965190887451, | |
| "sampling/importance_sampling_ratio/mean": 0.9427659511566162, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.624150276184082, | |
| "sampling/sampling_logp_difference/mean": 0.2082119882106781, | |
| "step": 146, | |
| "step_time": 146.17805301607586 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1862.0, | |
| "completions/max_terminated_length": 1862.0, | |
| "completions/mean_length": 784.75, | |
| "completions/mean_terminated_length": 784.75, | |
| "completions/min_length": 261.0, | |
| "completions/min_terminated_length": 261.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5902538895606995, | |
| "epoch": 0.3620689655172414, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027611265073668196, | |
| "kl": 0.04381394945085049, | |
| "learning_rate": 4.9070084781990655e-05, | |
| "loss": -0.012405045330524445, | |
| "num_tokens": 22916269.0, | |
| "reward": 4.7421875, | |
| "reward_std": 1.1503697633743286, | |
| "rewards/reward_func/mean": 0.5269097222222222, | |
| "rewards/reward_func/std": 0.18867847737338808, | |
| "sampling/importance_sampling_ratio/max": 2.998880624771118, | |
| "sampling/importance_sampling_ratio/mean": 0.9579499959945679, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.612027168273926, | |
| "sampling/sampling_logp_difference/mean": 0.16890573501586914, | |
| "step": 147, | |
| "step_time": 68.85666625201702 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2553.0, | |
| "completions/mean_length": 1074.546875, | |
| "completions/mean_terminated_length": 1009.01611328125, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6222652047872543, | |
| "epoch": 0.3645320197044335, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.024617454867958573, | |
| "kl": 0.01417856477200985, | |
| "learning_rate": 4.905693157443072e-05, | |
| "loss": 0.043572280555963516, | |
| "num_tokens": 23073344.0, | |
| "reward": 4.66015625, | |
| "reward_std": 1.435357928276062, | |
| "rewards/reward_func/mean": 0.5177951388888888, | |
| "rewards/reward_func/std": 0.21545444304744402, | |
| "sampling/importance_sampling_ratio/max": 2.99368953704834, | |
| "sampling/importance_sampling_ratio/mean": 0.9554688930511475, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.491532325744629, | |
| "sampling/sampling_logp_difference/mean": 0.17892569303512573, | |
| "step": 148, | |
| "step_time": 164.53390644979663 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2713.0, | |
| "completions/mean_length": 1009.8125, | |
| "completions/mean_terminated_length": 960.825439453125, | |
| "completions/min_length": 149.0, | |
| "completions/min_terminated_length": 149.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5842855423688889, | |
| "epoch": 0.3669950738916256, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.018268203346222877, | |
| "kl": 0.01848753821104765, | |
| "learning_rate": 4.904368778398662e-05, | |
| "loss": -0.11345016956329346, | |
| "num_tokens": 23222356.0, | |
| "reward": 4.76171875, | |
| "reward_std": 1.0469435453414917, | |
| "rewards/reward_func/mean": 0.5290798611111112, | |
| "rewards/reward_func/std": 0.174877953198221, | |
| "sampling/importance_sampling_ratio/max": 2.991849422454834, | |
| "sampling/importance_sampling_ratio/mean": 0.9590041637420654, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.206958770751953, | |
| "sampling/sampling_logp_difference/mean": 0.17348900437355042, | |
| "step": 149, | |
| "step_time": 127.15000204136595 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3131.0, | |
| "completions/mean_length": 1100.765625, | |
| "completions/mean_terminated_length": 1053.2222900390625, | |
| "completions/min_length": 171.0, | |
| "completions/min_terminated_length": 171.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6619395166635513, | |
| "epoch": 0.3694581280788177, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02320337507863541, | |
| "kl": 0.030713028274476528, | |
| "learning_rate": 4.903035346052593e-05, | |
| "loss": -0.050076283514499664, | |
| "num_tokens": 23386565.0, | |
| "reward": 4.77734375, | |
| "reward_std": 1.4740595817565918, | |
| "rewards/reward_func/mean": 0.5308159722222222, | |
| "rewards/reward_func/std": 0.23688736226823595, | |
| "sampling/importance_sampling_ratio/max": 2.9961957931518555, | |
| "sampling/importance_sampling_ratio/mean": 0.9498468637466431, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.355584144592285, | |
| "sampling/sampling_logp_difference/mean": 0.19352048635482788, | |
| "step": 150, | |
| "step_time": 158.25811398518272 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2324.0, | |
| "completions/mean_length": 958.609375, | |
| "completions/mean_terminated_length": 795.5423583984375, | |
| "completions/min_length": 183.0, | |
| "completions/min_terminated_length": 245.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5594561994075775, | |
| "epoch": 0.37192118226600984, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.018923320892404144, | |
| "kl": 0.030594575218856335, | |
| "learning_rate": 4.9016928654257096e-05, | |
| "loss": -0.05903652310371399, | |
| "num_tokens": 23535724.0, | |
| "reward": 4.796875, | |
| "reward_std": 1.4712016582489014, | |
| "rewards/reward_func/mean": 0.5329861111111112, | |
| "rewards/reward_func/std": 0.23497174680233002, | |
| "sampling/importance_sampling_ratio/max": 2.984567403793335, | |
| "sampling/importance_sampling_ratio/mean": 0.9650619626045227, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 20.476213455200195, | |
| "sampling/sampling_logp_difference/mean": 0.15478157997131348, | |
| "step": 151, | |
| "step_time": 181.10872292728163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2004.0, | |
| "completions/max_terminated_length": 2004.0, | |
| "completions/mean_length": 827.765625, | |
| "completions/mean_terminated_length": 827.765625, | |
| "completions/min_length": 204.0, | |
| "completions/min_terminated_length": 204.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6111591905355453, | |
| "epoch": 0.37438423645320196, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.01755727796270937, | |
| "kl": 0.023565078154206276, | |
| "learning_rate": 4.9003413415729295e-05, | |
| "loss": -0.02637684904038906, | |
| "num_tokens": 23670253.0, | |
| "reward": 4.77734375, | |
| "reward_std": 1.104749083518982, | |
| "rewards/reward_func/mean": 0.5308159722222222, | |
| "rewards/reward_func/std": 0.1676331791612837, | |
| "sampling/importance_sampling_ratio/max": 2.992830276489258, | |
| "sampling/importance_sampling_ratio/mean": 0.9616970419883728, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.969971656799316, | |
| "sampling/sampling_logp_difference/mean": 0.16461607813835144, | |
| "step": 152, | |
| "step_time": 69.56748036597855 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2823.0, | |
| "completions/mean_length": 973.4375, | |
| "completions/mean_terminated_length": 923.873046875, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5804478824138641, | |
| "epoch": 0.3768472906403941, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.024461340986450475, | |
| "kl": 0.023421769961714745, | |
| "learning_rate": 4.898980779583218e-05, | |
| "loss": 0.0462716780602932, | |
| "num_tokens": 23821209.0, | |
| "reward": 4.796875, | |
| "reward_std": 1.231107234954834, | |
| "rewards/reward_func/mean": 0.5329861111111112, | |
| "rewards/reward_func/std": 0.20140869998269612, | |
| "sampling/importance_sampling_ratio/max": 2.9916088581085205, | |
| "sampling/importance_sampling_ratio/mean": 0.9576438665390015, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.179545402526855, | |
| "sampling/sampling_logp_difference/mean": 0.17727312445640564, | |
| "step": 153, | |
| "step_time": 154.47874995111488 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3886.0, | |
| "completions/max_terminated_length": 3886.0, | |
| "completions/mean_length": 1236.453125, | |
| "completions/mean_terminated_length": 1254.2064208984375, | |
| "completions/min_length": 118.0, | |
| "completions/min_terminated_length": 308.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6448825746774673, | |
| "epoch": 0.3793103448275862, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.008303242862563802, | |
| "kl": 0.01926612318493426, | |
| "learning_rate": 4.897611184579575e-05, | |
| "loss": -0.08189457654953003, | |
| "num_tokens": 23985638.0, | |
| "reward": 5.0703125, | |
| "reward_std": 0.9422498941421509, | |
| "rewards/reward_func/mean": 0.5633680555555556, | |
| "rewards/reward_func/std": 0.19585710681147045, | |
| "sampling/importance_sampling_ratio/max": 2.9982712268829346, | |
| "sampling/importance_sampling_ratio/mean": 0.9486467838287354, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.374839782714844, | |
| "sampling/sampling_logp_difference/mean": 0.1977824568748474, | |
| "step": 154, | |
| "step_time": 117.2964083738625 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2027.0, | |
| "completions/max_terminated_length": 2027.0, | |
| "completions/mean_length": 865.5625, | |
| "completions/mean_terminated_length": 866.5238647460938, | |
| "completions/min_length": 327.0, | |
| "completions/min_terminated_length": 327.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6071893572807312, | |
| "epoch": 0.3817733990147783, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1193280757900666, | |
| "kl": 0.030153931118547916, | |
| "learning_rate": 4.896232561719011e-05, | |
| "loss": 0.010661143809556961, | |
| "num_tokens": 24124298.0, | |
| "reward": 4.13671875, | |
| "reward_std": 1.7789162397384644, | |
| "rewards/reward_func/mean": 0.4596354166666667, | |
| "rewards/reward_func/std": 0.23638725777467093, | |
| "sampling/importance_sampling_ratio/max": 2.999091625213623, | |
| "sampling/importance_sampling_ratio/mean": 0.9599412679672241, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.112852096557617, | |
| "sampling/sampling_logp_difference/mean": 0.17123615741729736, | |
| "step": 155, | |
| "step_time": 72.2496001359541 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3107.0, | |
| "completions/mean_length": 978.875, | |
| "completions/mean_terminated_length": 930.9677124023438, | |
| "completions/min_length": 261.0, | |
| "completions/min_terminated_length": 261.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6019200086593628, | |
| "epoch": 0.3842364532019704, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.022745034032572956, | |
| "kl": 0.05701204831711948, | |
| "learning_rate": 4.8948449161925304e-05, | |
| "loss": 0.007736865431070328, | |
| "num_tokens": 24272082.0, | |
| "reward": 4.8359375, | |
| "reward_std": 1.287315011024475, | |
| "rewards/reward_func/mean": 0.5373263888888888, | |
| "rewards/reward_func/std": 0.20693102561765248, | |
| "sampling/importance_sampling_ratio/max": 2.99910831451416, | |
| "sampling/importance_sampling_ratio/mean": 0.9618995189666748, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.936193466186523, | |
| "sampling/sampling_logp_difference/mean": 0.1633065938949585, | |
| "step": 156, | |
| "step_time": 192.91116239712574 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3499.0, | |
| "completions/max_terminated_length": 3499.0, | |
| "completions/mean_length": 1035.4375, | |
| "completions/mean_terminated_length": 1035.4375, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5545484572649002, | |
| "epoch": 0.3866995073891626, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.018643941734307494, | |
| "kl": 0.034695918671786785, | |
| "learning_rate": 4.893448253225111e-05, | |
| "loss": 0.025819044560194016, | |
| "num_tokens": 24427182.0, | |
| "reward": 4.6484375, | |
| "reward_std": 1.4466153383255005, | |
| "rewards/reward_func/mean": 0.5164930555555556, | |
| "rewards/reward_func/std": 0.2145352140069008, | |
| "sampling/importance_sampling_ratio/max": 2.991377830505371, | |
| "sampling/importance_sampling_ratio/mean": 0.9611527919769287, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.916698455810547, | |
| "sampling/sampling_logp_difference/mean": 0.1512816846370697, | |
| "step": 157, | |
| "step_time": 133.73708005039953 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2262.0, | |
| "completions/mean_length": 983.921875, | |
| "completions/mean_terminated_length": 883.5322265625, | |
| "completions/min_length": 291.0, | |
| "completions/min_terminated_length": 291.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6081868559122086, | |
| "epoch": 0.3891625615763547, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02139363037812444, | |
| "kl": 0.03456718381494284, | |
| "learning_rate": 4.892042578075685e-05, | |
| "loss": -0.09442838281393051, | |
| "num_tokens": 24570361.0, | |
| "reward": 4.703125, | |
| "reward_std": 1.3648616075515747, | |
| "rewards/reward_func/mean": 0.5225694444444444, | |
| "rewards/reward_func/std": 0.20582874615987143, | |
| "sampling/importance_sampling_ratio/max": 2.9956085681915283, | |
| "sampling/importance_sampling_ratio/mean": 0.9590303897857666, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.812082290649414, | |
| "sampling/sampling_logp_difference/mean": 0.1729988157749176, | |
| "step": 158, | |
| "step_time": 138.27136790496297 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1647.0, | |
| "completions/max_terminated_length": 1647.0, | |
| "completions/mean_length": 768.484375, | |
| "completions/mean_terminated_length": 768.484375, | |
| "completions/min_length": 285.0, | |
| "completions/min_terminated_length": 285.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.6265248358249664, | |
| "epoch": 0.3916256157635468, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.028373820616951973, | |
| "kl": 0.021229174453765154, | |
| "learning_rate": 4.8906278960371176e-05, | |
| "loss": 0.03736027330160141, | |
| "num_tokens": 24708344.0, | |
| "reward": 4.63671875, | |
| "reward_std": 1.4064263105392456, | |
| "rewards/reward_func/mean": 0.5151909722222222, | |
| "rewards/reward_func/std": 0.21763736671871609, | |
| "sampling/importance_sampling_ratio/max": 2.999945640563965, | |
| "sampling/importance_sampling_ratio/mean": 0.96051025390625, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.708161354064941, | |
| "sampling/sampling_logp_difference/mean": 0.17337340116500854, | |
| "step": 159, | |
| "step_time": 66.52018399396911 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4085.0, | |
| "completions/mean_length": 1092.65625, | |
| "completions/mean_terminated_length": 1019.4515991210938, | |
| "completions/min_length": 263.0, | |
| "completions/min_terminated_length": 263.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5845111310482025, | |
| "epoch": 0.39408866995073893, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03354331250442753, | |
| "kl": 0.03103955276310444, | |
| "learning_rate": 4.889204212436189e-05, | |
| "loss": 0.1756535917520523, | |
| "num_tokens": 24860034.0, | |
| "reward": 4.234375, | |
| "reward_std": 1.7158660888671875, | |
| "rewards/reward_func/mean": 0.4704861111111111, | |
| "rewards/reward_func/std": 0.22013813257217407, | |
| "sampling/importance_sampling_ratio/max": 2.9987423419952393, | |
| "sampling/importance_sampling_ratio/mean": 0.9564813375473022, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.36332893371582, | |
| "sampling/sampling_logp_difference/mean": 0.1764547973871231, | |
| "step": 160, | |
| "step_time": 172.21876229112968 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3241.0, | |
| "completions/max_terminated_length": 3241.0, | |
| "completions/mean_length": 936.03125, | |
| "completions/mean_terminated_length": 936.03125, | |
| "completions/min_length": 269.0, | |
| "completions/min_terminated_length": 269.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5562917143106461, | |
| "epoch": 0.39655172413793105, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.07900173065640134, | |
| "kl": 0.21416432037949562, | |
| "learning_rate": 4.8877715326335735e-05, | |
| "loss": 0.14280450344085693, | |
| "num_tokens": 25002900.0, | |
| "reward": 4.67578125, | |
| "reward_std": 1.3120925426483154, | |
| "rewards/reward_func/mean": 0.51953125, | |
| "rewards/reward_func/std": 0.22544356021616194, | |
| "sampling/importance_sampling_ratio/max": 2.9968130588531494, | |
| "sampling/importance_sampling_ratio/mean": 0.9616665840148926, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.51321792602539, | |
| "sampling/sampling_logp_difference/mean": 0.16403846442699432, | |
| "step": 161, | |
| "step_time": 97.09818493202329 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2848.0, | |
| "completions/max_terminated_length": 2848.0, | |
| "completions/mean_length": 846.265625, | |
| "completions/mean_terminated_length": 838.1935424804688, | |
| "completions/min_length": 308.0, | |
| "completions/min_terminated_length": 308.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6348050832748413, | |
| "epoch": 0.39901477832512317, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027732800385176146, | |
| "kl": 0.02809206396341324, | |
| "learning_rate": 4.886329862023818e-05, | |
| "loss": -0.0797884464263916, | |
| "num_tokens": 25146741.0, | |
| "reward": 4.49609375, | |
| "reward_std": 1.6223220825195312, | |
| "rewards/reward_func/mean": 0.4995659722222222, | |
| "rewards/reward_func/std": 0.2447797093126509, | |
| "sampling/importance_sampling_ratio/max": 2.9985299110412598, | |
| "sampling/importance_sampling_ratio/mean": 0.9614369869232178, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.401952743530273, | |
| "sampling/sampling_logp_difference/mean": 0.16790394484996796, | |
| "step": 162, | |
| "step_time": 85.56331390305422 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1662.0, | |
| "completions/mean_length": 719.5, | |
| "completions/mean_terminated_length": 665.90478515625, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5159411802887917, | |
| "epoch": 0.4014778325123153, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.021884903974561352, | |
| "kl": 0.022087908815592527, | |
| "learning_rate": 4.884879206035324e-05, | |
| "loss": -0.0033130012452602386, | |
| "num_tokens": 25268901.0, | |
| "reward": 4.67578125, | |
| "reward_std": 1.1079436540603638, | |
| "rewards/reward_func/mean": 0.51953125, | |
| "rewards/reward_func/std": 0.14711155576838386, | |
| "sampling/importance_sampling_ratio/max": 2.996157169342041, | |
| "sampling/importance_sampling_ratio/mean": 0.965851902961731, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.011970520019531, | |
| "sampling/sampling_logp_difference/mean": 0.14661702513694763, | |
| "step": 163, | |
| "step_time": 143.06184943695553 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1768.0, | |
| "completions/mean_length": 836.859375, | |
| "completions/mean_terminated_length": 785.1270141601562, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5226787328720093, | |
| "epoch": 0.4039408866995074, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03481414814636514, | |
| "kl": 0.027967958711087704, | |
| "learning_rate": 4.883419570130327e-05, | |
| "loss": 0.12579919397830963, | |
| "num_tokens": 25408860.0, | |
| "reward": 4.31640625, | |
| "reward_std": 1.6074246168136597, | |
| "rewards/reward_func/mean": 0.4796006944444444, | |
| "rewards/reward_func/std": 0.23362092218465275, | |
| "sampling/importance_sampling_ratio/max": 2.998420238494873, | |
| "sampling/importance_sampling_ratio/mean": 0.9653995037078857, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 9.86381721496582, | |
| "sampling/sampling_logp_difference/mean": 0.14888456463813782, | |
| "step": 164, | |
| "step_time": 130.35683792899363 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3195.0, | |
| "completions/mean_length": 1075.9375, | |
| "completions/mean_terminated_length": 985.704833984375, | |
| "completions/min_length": 119.0, | |
| "completions/min_terminated_length": 119.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5282468944787979, | |
| "epoch": 0.4064039408866995, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02783431665422358, | |
| "kl": 0.03446731064468622, | |
| "learning_rate": 4.881950959804874e-05, | |
| "loss": 0.045085370540618896, | |
| "num_tokens": 25560280.0, | |
| "reward": 4.265625, | |
| "reward_std": 1.6606289148330688, | |
| "rewards/reward_func/mean": 0.4739583333333333, | |
| "rewards/reward_func/std": 0.24556127190589905, | |
| "sampling/importance_sampling_ratio/max": 2.990581512451172, | |
| "sampling/importance_sampling_ratio/mean": 0.960712194442749, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.613689422607422, | |
| "sampling/sampling_logp_difference/mean": 0.15218126773834229, | |
| "step": 165, | |
| "step_time": 172.98013453022577 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2561.0, | |
| "completions/max_terminated_length": 2561.0, | |
| "completions/mean_length": 729.71875, | |
| "completions/mean_terminated_length": 733.9193115234375, | |
| "completions/min_length": 264.0, | |
| "completions/min_terminated_length": 264.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6080366969108582, | |
| "epoch": 0.4088669950738916, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03693327148531038, | |
| "kl": 0.07251364225521684, | |
| "learning_rate": 4.8804733805888024e-05, | |
| "loss": 0.1411583572626114, | |
| "num_tokens": 25696326.0, | |
| "reward": 4.703125, | |
| "reward_std": 1.3879262208938599, | |
| "rewards/reward_func/mean": 0.5225694444444444, | |
| "rewards/reward_func/std": 0.2221848898463779, | |
| "sampling/importance_sampling_ratio/max": 2.999439239501953, | |
| "sampling/importance_sampling_ratio/mean": 0.9615045785903931, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.027545928955078, | |
| "sampling/sampling_logp_difference/mean": 0.16043534874916077, | |
| "step": 166, | |
| "step_time": 84.72602451802231 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3802.0, | |
| "completions/mean_length": 1253.0625, | |
| "completions/mean_terminated_length": 1168.0655517578125, | |
| "completions/min_length": 184.0, | |
| "completions/min_terminated_length": 184.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5983615964651108, | |
| "epoch": 0.41133004926108374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.08869248734451317, | |
| "kl": 0.30646114982664585, | |
| "learning_rate": 4.8789868380457246e-05, | |
| "loss": -0.03257778659462929, | |
| "num_tokens": 25863642.0, | |
| "reward": 4.52734375, | |
| "reward_std": 1.5418224334716797, | |
| "rewards/reward_func/mean": 0.5030381944444444, | |
| "rewards/reward_func/std": 0.2189425097571479, | |
| "sampling/importance_sampling_ratio/max": 2.995548725128174, | |
| "sampling/importance_sampling_ratio/mean": 0.9559701085090637, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.81066608428955, | |
| "sampling/sampling_logp_difference/mean": 0.17433354258537292, | |
| "step": 167, | |
| "step_time": 190.16439045919105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3095.0, | |
| "completions/mean_length": 1056.328125, | |
| "completions/mean_terminated_length": 959.0, | |
| "completions/min_length": 217.0, | |
| "completions/min_terminated_length": 217.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.608098179101944, | |
| "epoch": 0.41379310344827586, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.024967044301457336, | |
| "kl": 0.04156289668753743, | |
| "learning_rate": 4.8774913377729994e-05, | |
| "loss": -0.038745272904634476, | |
| "num_tokens": 26011023.0, | |
| "reward": 4.55859375, | |
| "reward_std": 1.2946335077285767, | |
| "rewards/reward_func/mean": 0.5065104166666666, | |
| "rewards/reward_func/std": 0.18393640220165253, | |
| "sampling/importance_sampling_ratio/max": 2.99995493888855, | |
| "sampling/importance_sampling_ratio/mean": 0.9574207067489624, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.34084415435791, | |
| "sampling/sampling_logp_difference/mean": 0.1739969253540039, | |
| "step": 168, | |
| "step_time": 121.14545013522729 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2646.0, | |
| "completions/mean_length": 839.109375, | |
| "completions/mean_terminated_length": 734.04833984375, | |
| "completions/min_length": 204.0, | |
| "completions/min_terminated_length": 204.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.624035969376564, | |
| "epoch": 0.41625615763546797, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02459309972915259, | |
| "kl": 0.03127794712781906, | |
| "learning_rate": 4.875986885401717e-05, | |
| "loss": 0.03152251988649368, | |
| "num_tokens": 26148006.0, | |
| "reward": 4.54296875, | |
| "reward_std": 1.3668079376220703, | |
| "rewards/reward_func/mean": 0.5047743055555556, | |
| "rewards/reward_func/std": 0.18499460816383362, | |
| "sampling/importance_sampling_ratio/max": 2.9995522499084473, | |
| "sampling/importance_sampling_ratio/mean": 0.9615331888198853, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.471660614013672, | |
| "sampling/sampling_logp_difference/mean": 0.16513219475746155, | |
| "step": 169, | |
| "step_time": 118.91501420899294 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2252.0, | |
| "completions/mean_length": 809.875, | |
| "completions/mean_terminated_length": 746.01611328125, | |
| "completions/min_length": 163.0, | |
| "completions/min_terminated_length": 163.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.551299437880516, | |
| "epoch": 0.4187192118226601, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02714725701280757, | |
| "kl": 0.02795234275981784, | |
| "learning_rate": 4.874473486596672e-05, | |
| "loss": -0.06360671669244766, | |
| "num_tokens": 26282494.0, | |
| "reward": 4.421875, | |
| "reward_std": 1.571620225906372, | |
| "rewards/reward_func/mean": 0.4913194444444444, | |
| "rewards/reward_func/std": 0.22683406207296583, | |
| "sampling/importance_sampling_ratio/max": 2.9994564056396484, | |
| "sampling/importance_sampling_ratio/mean": 0.9681460857391357, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.212518692016602, | |
| "sampling/sampling_logp_difference/mean": 0.1451174020767212, | |
| "step": 170, | |
| "step_time": 140.71922606788576 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3145.0, | |
| "completions/mean_length": 1584.28125, | |
| "completions/mean_terminated_length": 1173.272705078125, | |
| "completions/min_length": 290.0, | |
| "completions/min_terminated_length": 290.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5730425864458084, | |
| "epoch": 0.4211822660098522, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01904952675086213, | |
| "kl": 0.014946466544643044, | |
| "learning_rate": 4.8729511470563514e-05, | |
| "loss": -0.17807739973068237, | |
| "num_tokens": 26479264.0, | |
| "reward": 4.13671875, | |
| "reward_std": 1.866012692451477, | |
| "rewards/reward_func/mean": 0.4596354166666667, | |
| "rewards/reward_func/std": 0.26322751575046116, | |
| "sampling/importance_sampling_ratio/max": 2.996227502822876, | |
| "sampling/importance_sampling_ratio/mean": 0.9491404294967651, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.776933670043945, | |
| "sampling/sampling_logp_difference/mean": 0.18628865480422974, | |
| "step": 171, | |
| "step_time": 139.96793680964038 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3338.0, | |
| "completions/mean_length": 1098.578125, | |
| "completions/mean_terminated_length": 946.4166870117188, | |
| "completions/min_length": 202.0, | |
| "completions/min_terminated_length": 202.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6739596724510193, | |
| "epoch": 0.4236453201970443, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02536639850029695, | |
| "kl": 0.016601723851636052, | |
| "learning_rate": 4.871419872512901e-05, | |
| "loss": -0.15329566597938538, | |
| "num_tokens": 26640853.0, | |
| "reward": 4.15234375, | |
| "reward_std": 1.831501841545105, | |
| "rewards/reward_func/mean": 0.4613715277777778, | |
| "rewards/reward_func/std": 0.26819422592719394, | |
| "sampling/importance_sampling_ratio/max": 2.99464750289917, | |
| "sampling/importance_sampling_ratio/mean": 0.9525026082992554, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.213711738586426, | |
| "sampling/sampling_logp_difference/mean": 0.18900419771671295, | |
| "step": 172, | |
| "step_time": 119.70939294900745 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3843.0, | |
| "completions/mean_length": 1112.21875, | |
| "completions/mean_terminated_length": 965.475341796875, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6000253409147263, | |
| "epoch": 0.42610837438423643, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03322072539721348, | |
| "kl": 0.01858115242794156, | |
| "learning_rate": 4.869879668732115e-05, | |
| "loss": 0.0820431113243103, | |
| "num_tokens": 26793539.0, | |
| "reward": 3.4765625, | |
| "reward_std": 2.114630937576294, | |
| "rewards/reward_func/mean": 0.3862847222222222, | |
| "rewards/reward_func/std": 0.30766087025403976, | |
| "sampling/importance_sampling_ratio/max": 2.9941298961639404, | |
| "sampling/importance_sampling_ratio/mean": 0.9587726593017578, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.781671524047852, | |
| "sampling/sampling_logp_difference/mean": 0.17097865045070648, | |
| "step": 173, | |
| "step_time": 183.34437879105099 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3845.0, | |
| "completions/mean_length": 1355.65625, | |
| "completions/mean_terminated_length": 1154.2542724609375, | |
| "completions/min_length": 214.0, | |
| "completions/min_terminated_length": 214.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.7394872605800629, | |
| "epoch": 0.42857142857142855, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03241121665359061, | |
| "kl": 0.013445974560454488, | |
| "learning_rate": 4.868330541513405e-05, | |
| "loss": -0.08849971741437912, | |
| "num_tokens": 26969853.0, | |
| "reward": 3.05859375, | |
| "reward_std": 2.153228521347046, | |
| "rewards/reward_func/mean": 0.33984375, | |
| "rewards/reward_func/std": 0.2989349315563838, | |
| "sampling/importance_sampling_ratio/max": 2.9982781410217285, | |
| "sampling/importance_sampling_ratio/mean": 0.944078803062439, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 17.34125518798828, | |
| "sampling/sampling_logp_difference/mean": 0.21590906381607056, | |
| "step": 174, | |
| "step_time": 141.4540407299064 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2938.0, | |
| "completions/mean_length": 888.125, | |
| "completions/mean_terminated_length": 725.7833862304688, | |
| "completions/min_length": 173.0, | |
| "completions/min_terminated_length": 173.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6452028006315231, | |
| "epoch": 0.43103448275862066, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03481784350490518, | |
| "kl": 0.02153546130284667, | |
| "learning_rate": 4.866772496689787e-05, | |
| "loss": 0.023521684110164642, | |
| "num_tokens": 27108069.0, | |
| "reward": 3.59375, | |
| "reward_std": 2.1308858394622803, | |
| "rewards/reward_func/mean": 0.3993055555555556, | |
| "rewards/reward_func/std": 0.3248247545626428, | |
| "sampling/importance_sampling_ratio/max": 2.9982481002807617, | |
| "sampling/importance_sampling_ratio/mean": 0.9610703587532043, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.91135311126709, | |
| "sampling/sampling_logp_difference/mean": 0.16744840145111084, | |
| "step": 175, | |
| "step_time": 127.97521190205589 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 3963.0, | |
| "completions/max_terminated_length": 3963.0, | |
| "completions/mean_length": 789.71875, | |
| "completions/mean_terminated_length": 767.71435546875, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6018838584423065, | |
| "epoch": 0.43349753694581283, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.04159223111511331, | |
| "kl": 0.023647161200642586, | |
| "learning_rate": 4.865205540127851e-05, | |
| "loss": 0.39704209566116333, | |
| "num_tokens": 27246995.0, | |
| "reward": 3.58203125, | |
| "reward_std": 2.0373764038085938, | |
| "rewards/reward_func/mean": 0.3980034722222222, | |
| "rewards/reward_func/std": 0.2889099650912815, | |
| "sampling/importance_sampling_ratio/max": 2.999706268310547, | |
| "sampling/importance_sampling_ratio/mean": 0.9595375061035156, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.16771125793457, | |
| "sampling/sampling_logp_difference/mean": 0.16613054275512695, | |
| "step": 176, | |
| "step_time": 116.67240364779718 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3365.0, | |
| "completions/mean_length": 1200.609375, | |
| "completions/mean_terminated_length": 941.2069091796875, | |
| "completions/min_length": 130.0, | |
| "completions/min_terminated_length": 130.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6288936734199524, | |
| "epoch": 0.43596059113300495, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.027437780371530707, | |
| "kl": 0.018118501640856266, | |
| "learning_rate": 4.863629677727745e-05, | |
| "loss": 0.067866250872612, | |
| "num_tokens": 27423434.0, | |
| "reward": 3.34375, | |
| "reward_std": 2.071873188018799, | |
| "rewards/reward_func/mean": 0.3715277777777778, | |
| "rewards/reward_func/std": 0.26894643902778625, | |
| "sampling/importance_sampling_ratio/max": 2.991316318511963, | |
| "sampling/importance_sampling_ratio/mean": 0.9541721940040588, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.680610656738281, | |
| "sampling/sampling_logp_difference/mean": 0.18493330478668213, | |
| "step": 177, | |
| "step_time": 151.51794349495322 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3449.0, | |
| "completions/mean_length": 1000.40625, | |
| "completions/mean_terminated_length": 895.8359985351562, | |
| "completions/min_length": 150.0, | |
| "completions/min_terminated_length": 150.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6372148543596268, | |
| "epoch": 0.43842364532019706, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.033978302809922124, | |
| "kl": 0.021280715242028236, | |
| "learning_rate": 4.862044915423149e-05, | |
| "loss": -0.1135963499546051, | |
| "num_tokens": 27579684.0, | |
| "reward": 3.44921875, | |
| "reward_std": 2.0669257640838623, | |
| "rewards/reward_func/mean": 0.3832465277777778, | |
| "rewards/reward_func/std": 0.30381787982251907, | |
| "sampling/importance_sampling_ratio/max": 2.997410535812378, | |
| "sampling/importance_sampling_ratio/mean": 0.9527778029441833, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 18.84807777404785, | |
| "sampling/sampling_logp_difference/mean": 0.1820211112499237, | |
| "step": 178, | |
| "step_time": 132.21914479322731 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3490.0, | |
| "completions/mean_length": 940.390625, | |
| "completions/mean_terminated_length": 789.6500244140625, | |
| "completions/min_length": 130.0, | |
| "completions/min_terminated_length": 130.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5805416107177734, | |
| "epoch": 0.4408866995073892, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.03412320001462779, | |
| "kl": 0.0387720656581223, | |
| "learning_rate": 4.860451259181259e-05, | |
| "loss": 0.05906563252210617, | |
| "num_tokens": 27720781.0, | |
| "reward": 3.6875, | |
| "reward_std": 2.0004959106445312, | |
| "rewards/reward_func/mean": 0.4097222222222222, | |
| "rewards/reward_func/std": 0.2693231337600284, | |
| "sampling/importance_sampling_ratio/max": 2.9986228942871094, | |
| "sampling/importance_sampling_ratio/mean": 0.9692578315734863, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.18484878540039, | |
| "sampling/sampling_logp_difference/mean": 0.14448237419128418, | |
| "step": 179, | |
| "step_time": 122.44541043927893 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 3246.0, | |
| "completions/max_terminated_length": 3246.0, | |
| "completions/mean_length": 876.859375, | |
| "completions/mean_terminated_length": 866.3386840820312, | |
| "completions/min_length": 83.0, | |
| "completions/min_terminated_length": 83.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.589184507727623, | |
| "epoch": 0.4433497536945813, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.028710947398096327, | |
| "kl": 0.02539773052558303, | |
| "learning_rate": 4.8588487150027514e-05, | |
| "loss": 0.036127492785453796, | |
| "num_tokens": 27850516.0, | |
| "reward": 4.0859375, | |
| "reward_std": 1.7189528942108154, | |
| "rewards/reward_func/mean": 0.4539930555555556, | |
| "rewards/reward_func/std": 0.24298586448033652, | |
| "sampling/importance_sampling_ratio/max": 2.9983139038085938, | |
| "sampling/importance_sampling_ratio/mean": 0.9629830718040466, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.87133502960205, | |
| "sampling/sampling_logp_difference/mean": 0.15577855706214905, | |
| "step": 180, | |
| "step_time": 88.02008921210654 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3620.0, | |
| "completions/mean_length": 812.703125, | |
| "completions/mean_terminated_length": 748.758056640625, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5205734521150589, | |
| "epoch": 0.4458128078817734, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.035317714533933245, | |
| "kl": 0.024989775381982327, | |
| "learning_rate": 4.8572372889217776e-05, | |
| "loss": 0.3163328468799591, | |
| "num_tokens": 27981937.0, | |
| "reward": 4.06640625, | |
| "reward_std": 1.743037223815918, | |
| "rewards/reward_func/mean": 0.4518229166666667, | |
| "rewards/reward_func/std": 0.24088652142220074, | |
| "sampling/importance_sampling_ratio/max": 2.9916434288024902, | |
| "sampling/importance_sampling_ratio/mean": 0.9683629274368286, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.304101943969727, | |
| "sampling/sampling_logp_difference/mean": 0.1433240920305252, | |
| "step": 181, | |
| "step_time": 122.87319412222132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3806.0, | |
| "completions/mean_length": 997.6875, | |
| "completions/mean_terminated_length": 949.8547973632812, | |
| "completions/min_length": 187.0, | |
| "completions/min_terminated_length": 187.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5740093439817429, | |
| "epoch": 0.4482758620689655, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.031271393477359785, | |
| "kl": 0.026333958376199007, | |
| "learning_rate": 4.855616987005926e-05, | |
| "loss": -0.1590024083852768, | |
| "num_tokens": 28138861.0, | |
| "reward": 3.86328125, | |
| "reward_std": 1.9395270347595215, | |
| "rewards/reward_func/mean": 0.4292534722222222, | |
| "rewards/reward_func/std": 0.28895225500067073, | |
| "sampling/importance_sampling_ratio/max": 2.9844038486480713, | |
| "sampling/importance_sampling_ratio/mean": 0.9601902365684509, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.815610885620117, | |
| "sampling/sampling_logp_difference/mean": 0.15642720460891724, | |
| "step": 182, | |
| "step_time": 142.38116177916527 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3687.0, | |
| "completions/mean_length": 1050.6875, | |
| "completions/mean_terminated_length": 952.4515991210938, | |
| "completions/min_length": 145.0, | |
| "completions/min_terminated_length": 145.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5912422388792038, | |
| "epoch": 0.45073891625615764, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.029375040224375964, | |
| "kl": 0.03247595578432083, | |
| "learning_rate": 4.853987815356211e-05, | |
| "loss": -0.13517621159553528, | |
| "num_tokens": 28287689.0, | |
| "reward": 4.171875, | |
| "reward_std": 1.8229548931121826, | |
| "rewards/reward_func/mean": 0.4635416666666667, | |
| "rewards/reward_func/std": 0.2815826332403554, | |
| "sampling/importance_sampling_ratio/max": 2.9973490238189697, | |
| "sampling/importance_sampling_ratio/mean": 0.9587271213531494, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.327677726745605, | |
| "sampling/sampling_logp_difference/mean": 0.16505397856235504, | |
| "step": 183, | |
| "step_time": 118.48117843503132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2302.0, | |
| "completions/mean_length": 982.359375, | |
| "completions/mean_terminated_length": 829.2294921875, | |
| "completions/min_length": 192.0, | |
| "completions/min_terminated_length": 192.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5499522387981415, | |
| "epoch": 0.45320197044334976, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02496128260966136, | |
| "kl": 0.0238928678445518, | |
| "learning_rate": 4.8523497801070394e-05, | |
| "loss": -0.1412869393825531, | |
| "num_tokens": 28429712.0, | |
| "reward": 4.453125, | |
| "reward_std": 1.779443383216858, | |
| "rewards/reward_func/mean": 0.4947916666666667, | |
| "rewards/reward_func/std": 0.27015094210704166, | |
| "sampling/importance_sampling_ratio/max": 2.9977781772613525, | |
| "sampling/importance_sampling_ratio/mean": 0.9683471918106079, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 15.436448097229004, | |
| "sampling/sampling_logp_difference/mean": 0.14362305402755737, | |
| "step": 184, | |
| "step_time": 124.91741205216385 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4090.0, | |
| "completions/max_terminated_length": 2440.0, | |
| "completions/mean_length": 731.46875, | |
| "completions/mean_terminated_length": 688.51611328125, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 72.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6139041185379028, | |
| "epoch": 0.45566502463054187, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.037142784390176466, | |
| "kl": 0.05982669489458203, | |
| "learning_rate": 4.8507028874261965e-05, | |
| "loss": -0.006140515208244324, | |
| "num_tokens": 28559326.0, | |
| "reward": 3.98828125, | |
| "reward_std": 1.8678725957870483, | |
| "rewards/reward_func/mean": 0.4431423611111111, | |
| "rewards/reward_func/std": 0.28126167671547997, | |
| "sampling/importance_sampling_ratio/max": 2.9906561374664307, | |
| "sampling/importance_sampling_ratio/mean": 0.9640798568725586, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.613927841186523, | |
| "sampling/sampling_logp_difference/mean": 0.1551136076450348, | |
| "step": 185, | |
| "step_time": 159.02927091997117 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2238.0, | |
| "completions/max_terminated_length": 2238.0, | |
| "completions/mean_length": 772.203125, | |
| "completions/mean_terminated_length": 772.203125, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5520921647548676, | |
| "epoch": 0.458128078817734, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.061395920818485855, | |
| "kl": 0.05162614677101374, | |
| "learning_rate": 4.8490471435148174e-05, | |
| "loss": 0.12048451602458954, | |
| "num_tokens": 28685787.0, | |
| "reward": 4.6015625, | |
| "reward_std": 1.458907127380371, | |
| "rewards/reward_func/mean": 0.5112847222222222, | |
| "rewards/reward_func/std": 0.2330812480714586, | |
| "sampling/importance_sampling_ratio/max": 2.998060941696167, | |
| "sampling/importance_sampling_ratio/mean": 0.9678115844726562, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.809170722961426, | |
| "sampling/sampling_logp_difference/mean": 0.14706827700138092, | |
| "step": 186, | |
| "step_time": 78.58753942209296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2988.0, | |
| "completions/max_terminated_length": 2988.0, | |
| "completions/mean_length": 1044.640625, | |
| "completions/mean_terminated_length": 1046.761962890625, | |
| "completions/min_length": 306.0, | |
| "completions/min_terminated_length": 306.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5624367594718933, | |
| "epoch": 0.4605911330049261, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02434360234786606, | |
| "kl": 0.02444734564051032, | |
| "learning_rate": 4.8473825546073656e-05, | |
| "loss": -0.01238684356212616, | |
| "num_tokens": 28844580.0, | |
| "reward": 4.50390625, | |
| "reward_std": 1.618035912513733, | |
| "rewards/reward_func/mean": 0.5004340277777778, | |
| "rewards/reward_func/std": 0.23665551717082658, | |
| "sampling/importance_sampling_ratio/max": 2.9936723709106445, | |
| "sampling/importance_sampling_ratio/mean": 0.9547263979911804, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.060944557189941, | |
| "sampling/sampling_logp_difference/mean": 0.1708623319864273, | |
| "step": 187, | |
| "step_time": 115.63352126302198 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1682.0, | |
| "completions/mean_length": 1027.03125, | |
| "completions/mean_terminated_length": 808.5254516601562, | |
| "completions/min_length": 253.0, | |
| "completions/min_terminated_length": 253.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5580074787139893, | |
| "epoch": 0.4630541871921182, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02214669121529831, | |
| "kl": 0.04178555542603135, | |
| "learning_rate": 4.845709126971609e-05, | |
| "loss": -0.07377077639102936, | |
| "num_tokens": 28988630.0, | |
| "reward": 4.4765625, | |
| "reward_std": 1.6919033527374268, | |
| "rewards/reward_func/mean": 0.4973958333333333, | |
| "rewards/reward_func/std": 0.24915697342819637, | |
| "sampling/importance_sampling_ratio/max": 2.998155355453491, | |
| "sampling/importance_sampling_ratio/mean": 0.964207649230957, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.738546371459961, | |
| "sampling/sampling_logp_difference/mean": 0.14810925722122192, | |
| "step": 188, | |
| "step_time": 144.04732121806592 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 1753.0, | |
| "completions/mean_length": 834.3125, | |
| "completions/mean_terminated_length": 684.5423583984375, | |
| "completions/min_length": 225.0, | |
| "completions/min_terminated_length": 225.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5862187743186951, | |
| "epoch": 0.46551724137931033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02523673185589828, | |
| "kl": 0.04767545498907566, | |
| "learning_rate": 4.844026866908595e-05, | |
| "loss": -0.22603827714920044, | |
| "num_tokens": 29125434.0, | |
| "reward": 4.3984375, | |
| "reward_std": 1.546705961227417, | |
| "rewards/reward_func/mean": 0.4887152777777778, | |
| "rewards/reward_func/std": 0.2399756842189365, | |
| "sampling/importance_sampling_ratio/max": 2.9856364727020264, | |
| "sampling/importance_sampling_ratio/mean": 0.9674413204193115, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.371965408325195, | |
| "sampling/sampling_logp_difference/mean": 0.1515868902206421, | |
| "step": 189, | |
| "step_time": 124.82868728786707 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3335.0, | |
| "completions/mean_length": 972.59375, | |
| "completions/mean_terminated_length": 920.9031982421875, | |
| "completions/min_length": 359.0, | |
| "completions/min_terminated_length": 359.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.546548068523407, | |
| "epoch": 0.46798029556650245, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02276588191427084, | |
| "kl": 0.03256893623620272, | |
| "learning_rate": 4.8423357807526325e-05, | |
| "loss": 0.15206681191921234, | |
| "num_tokens": 29285008.0, | |
| "reward": 4.99609375, | |
| "reward_std": 1.2673338651657104, | |
| "rewards/reward_func/mean": 0.5551215277777778, | |
| "rewards/reward_func/std": 0.24073222113980186, | |
| "sampling/importance_sampling_ratio/max": 2.999685287475586, | |
| "sampling/importance_sampling_ratio/mean": 0.9606513977050781, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.87429428100586, | |
| "sampling/sampling_logp_difference/mean": 0.1558273881673813, | |
| "step": 190, | |
| "step_time": 143.13196605397388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3417.0, | |
| "completions/mean_length": 1032.015625, | |
| "completions/mean_terminated_length": 921.88134765625, | |
| "completions/min_length": 148.0, | |
| "completions/min_terminated_length": 148.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5146888419985771, | |
| "epoch": 0.47044334975369456, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01795531406691693, | |
| "kl": 0.030596476048231125, | |
| "learning_rate": 4.840635874871259e-05, | |
| "loss": -0.15366441011428833, | |
| "num_tokens": 29433393.0, | |
| "reward": 4.66796875, | |
| "reward_std": 1.4584555625915527, | |
| "rewards/reward_func/mean": 0.5186631944444444, | |
| "rewards/reward_func/std": 0.2413034306632148, | |
| "sampling/importance_sampling_ratio/max": 2.996281623840332, | |
| "sampling/importance_sampling_ratio/mean": 0.9648961424827576, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.369453430175781, | |
| "sampling/sampling_logp_difference/mean": 0.14613790810108185, | |
| "step": 191, | |
| "step_time": 170.6787863143254 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2995.0, | |
| "completions/mean_length": 990.515625, | |
| "completions/mean_terminated_length": 794.6551513671875, | |
| "completions/min_length": 157.0, | |
| "completions/min_terminated_length": 157.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5170257911086082, | |
| "epoch": 0.4729064039408867, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.020453909952142232, | |
| "kl": 0.04368177242577076, | |
| "learning_rate": 4.838927155665225e-05, | |
| "loss": 0.026476195082068443, | |
| "num_tokens": 29576562.0, | |
| "reward": 4.89453125, | |
| "reward_std": 1.1992099285125732, | |
| "rewards/reward_func/mean": 0.5438368055555556, | |
| "rewards/reward_func/std": 0.2044545453455713, | |
| "sampling/importance_sampling_ratio/max": 2.9978442192077637, | |
| "sampling/importance_sampling_ratio/mean": 0.9669831395149231, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.18189811706543, | |
| "sampling/sampling_logp_difference/mean": 0.13926595449447632, | |
| "step": 192, | |
| "step_time": 132.4229573700577 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3588.0, | |
| "completions/mean_length": 1411.796875, | |
| "completions/mean_terminated_length": 1298.34423828125, | |
| "completions/min_length": 392.0, | |
| "completions/min_terminated_length": 392.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5414084792137146, | |
| "epoch": 0.4753694581280788, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015964047365254634, | |
| "kl": 0.026161770801991224, | |
| "learning_rate": 4.837209629568462e-05, | |
| "loss": -0.20564614236354828, | |
| "num_tokens": 29759525.0, | |
| "reward": 4.53125, | |
| "reward_std": 1.5297966003417969, | |
| "rewards/reward_func/mean": 0.5034722222222222, | |
| "rewards/reward_func/std": 0.24822904335127938, | |
| "sampling/importance_sampling_ratio/max": 2.9965434074401855, | |
| "sampling/importance_sampling_ratio/mean": 0.9571963548660278, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.28773021697998, | |
| "sampling/sampling_logp_difference/mean": 0.15651743113994598, | |
| "step": 193, | |
| "step_time": 139.51090059312992 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3751.0, | |
| "completions/mean_length": 1201.515625, | |
| "completions/mean_terminated_length": 987.0, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5668457299470901, | |
| "epoch": 0.47783251231527096, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01222374150957719, | |
| "kl": 0.040560389403253794, | |
| "learning_rate": 4.8354833030480674e-05, | |
| "loss": -0.09171874076128006, | |
| "num_tokens": 29924774.0, | |
| "reward": 5.125, | |
| "reward_std": 1.3348206281661987, | |
| "rewards/reward_func/mean": 0.5694444444444444, | |
| "rewards/reward_func/std": 0.22527392663889462, | |
| "sampling/importance_sampling_ratio/max": 2.9956600666046143, | |
| "sampling/importance_sampling_ratio/mean": 0.9569047689437866, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.646681785583496, | |
| "sampling/sampling_logp_difference/mean": 0.16455239057540894, | |
| "step": 194, | |
| "step_time": 127.55075355409645 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2719.0, | |
| "completions/mean_length": 1242.109375, | |
| "completions/mean_terminated_length": 1038.2373046875, | |
| "completions/min_length": 263.0, | |
| "completions/min_terminated_length": 263.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5766540616750717, | |
| "epoch": 0.4802955665024631, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.01770060557750442, | |
| "kl": 0.01768818451091647, | |
| "learning_rate": 4.833748182604273e-05, | |
| "loss": -0.06070397049188614, | |
| "num_tokens": 30095053.0, | |
| "reward": 4.796875, | |
| "reward_std": 1.3771628141403198, | |
| "rewards/reward_func/mean": 0.5329861111111112, | |
| "rewards/reward_func/std": 0.23674807945887247, | |
| "sampling/importance_sampling_ratio/max": 2.9945473670959473, | |
| "sampling/importance_sampling_ratio/mean": 0.9582319259643555, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.335448265075684, | |
| "sampling/sampling_logp_difference/mean": 0.1665378212928772, | |
| "step": 195, | |
| "step_time": 156.3429046079982 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2747.0, | |
| "completions/mean_length": 804.734375, | |
| "completions/mean_terminated_length": 752.4921264648438, | |
| "completions/min_length": 250.0, | |
| "completions/min_terminated_length": 250.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5483453124761581, | |
| "epoch": 0.4827586206896552, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.011853860468393498, | |
| "kl": 0.030422898940742016, | |
| "learning_rate": 4.832004274770422e-05, | |
| "loss": -0.036816734820604324, | |
| "num_tokens": 30231644.0, | |
| "reward": 5.1640625, | |
| "reward_std": 0.9545409083366394, | |
| "rewards/reward_func/mean": 0.5737847222222222, | |
| "rewards/reward_func/std": 0.18036837296353447, | |
| "sampling/importance_sampling_ratio/max": 2.997767925262451, | |
| "sampling/importance_sampling_ratio/mean": 0.9683334231376648, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.523677825927734, | |
| "sampling/sampling_logp_difference/mean": 0.14397111535072327, | |
| "step": 196, | |
| "step_time": 109.79577358718961 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3785.0, | |
| "completions/mean_length": 1498.546875, | |
| "completions/mean_terminated_length": 1466.6719970703125, | |
| "completions/min_length": 392.0, | |
| "completions/min_terminated_length": 392.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.6924940571188927, | |
| "epoch": 0.4852216748768473, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.023579205094821604, | |
| "kl": 0.01952385390177369, | |
| "learning_rate": 4.8302515861129474e-05, | |
| "loss": 0.19029618799686432, | |
| "num_tokens": 30419551.0, | |
| "reward": 4.796875, | |
| "reward_std": 1.268415927886963, | |
| "rewards/reward_func/mean": 0.5329861111111112, | |
| "rewards/reward_func/std": 0.24172814769877327, | |
| "sampling/importance_sampling_ratio/max": 2.9983105659484863, | |
| "sampling/importance_sampling_ratio/mean": 0.945136308670044, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.623981475830078, | |
| "sampling/sampling_logp_difference/mean": 0.1995391845703125, | |
| "step": 197, | |
| "step_time": 176.65033843182027 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3796.0, | |
| "completions/mean_length": 1169.40625, | |
| "completions/mean_terminated_length": 1025.475341796875, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.553646519780159, | |
| "epoch": 0.4876847290640394, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.04398149539440762, | |
| "kl": 0.02133885119110346, | |
| "learning_rate": 4.828490123231342e-05, | |
| "loss": 0.01763659343123436, | |
| "num_tokens": 30582153.0, | |
| "reward": 4.37890625, | |
| "reward_std": 1.8414281606674194, | |
| "rewards/reward_func/mean": 0.4865451388888889, | |
| "rewards/reward_func/std": 0.2796827761663331, | |
| "sampling/importance_sampling_ratio/max": 2.9970648288726807, | |
| "sampling/importance_sampling_ratio/mean": 0.9559179544448853, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.906939506530762, | |
| "sampling/sampling_logp_difference/mean": 0.16657188534736633, | |
| "step": 198, | |
| "step_time": 125.99079136014916 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3215.0, | |
| "completions/mean_length": 952.34375, | |
| "completions/mean_terminated_length": 850.9354858398438, | |
| "completions/min_length": 332.0, | |
| "completions/min_terminated_length": 332.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5982878059148788, | |
| "epoch": 0.49014778325123154, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.026449524534477093, | |
| "kl": 0.02422321867197752, | |
| "learning_rate": 4.8267198927581415e-05, | |
| "loss": 0.08025837689638138, | |
| "num_tokens": 30742703.0, | |
| "reward": 4.86328125, | |
| "reward_std": 1.1750799417495728, | |
| "rewards/reward_func/mean": 0.5403645833333334, | |
| "rewards/reward_func/std": 0.20424510911107063, | |
| "sampling/importance_sampling_ratio/max": 2.996950387954712, | |
| "sampling/importance_sampling_ratio/mean": 0.9554992914199829, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.107158660888672, | |
| "sampling/sampling_logp_difference/mean": 0.17947040498256683, | |
| "step": 199, | |
| "step_time": 128.3923730046954 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 4028.0, | |
| "completions/mean_length": 1333.984375, | |
| "completions/mean_terminated_length": 1193.666748046875, | |
| "completions/min_length": 420.0, | |
| "completions/min_terminated_length": 420.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5008647292852402, | |
| "epoch": 0.49261083743842365, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.012771339469120326, | |
| "kl": 0.02338361693546176, | |
| "learning_rate": 4.824940901358889e-05, | |
| "loss": -0.011300859972834587, | |
| "num_tokens": 30914094.0, | |
| "reward": 4.98046875, | |
| "reward_std": 1.153243064880371, | |
| "rewards/reward_func/mean": 0.5533854166666666, | |
| "rewards/reward_func/std": 0.2083408029543029, | |
| "sampling/importance_sampling_ratio/max": 2.994527578353882, | |
| "sampling/importance_sampling_ratio/mean": 0.9607874155044556, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.63933277130127, | |
| "sampling/sampling_logp_difference/mean": 0.15060916543006897, | |
| "step": 200, | |
| "step_time": 131.234265395673 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2340.0, | |
| "completions/mean_length": 1110.984375, | |
| "completions/mean_terminated_length": 941.4000244140625, | |
| "completions/min_length": 383.0, | |
| "completions/min_terminated_length": 383.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5410940274596214, | |
| "epoch": 0.49507389162561577, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.018820871729217724, | |
| "kl": 0.07351712137460709, | |
| "learning_rate": 4.82315315573212e-05, | |
| "loss": -0.008761988021433353, | |
| "num_tokens": 31075405.0, | |
| "reward": 4.7265625, | |
| "reward_std": 1.286929726600647, | |
| "rewards/reward_func/mean": 0.5251736111111112, | |
| "rewards/reward_func/std": 0.18622204413016638, | |
| "sampling/importance_sampling_ratio/max": 2.9949045181274414, | |
| "sampling/importance_sampling_ratio/mean": 0.9584058523178101, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 16.770557403564453, | |
| "sampling/sampling_logp_difference/mean": 0.16537359356880188, | |
| "step": 201, | |
| "step_time": 137.49633058882318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3727.0, | |
| "completions/mean_length": 1543.71875, | |
| "completions/mean_terminated_length": 1430.36669921875, | |
| "completions/min_length": 299.0, | |
| "completions/min_terminated_length": 299.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.546101376414299, | |
| "epoch": 0.4975369458128079, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016073747890772528, | |
| "kl": 0.033409463707357645, | |
| "learning_rate": 4.8213566626093316e-05, | |
| "loss": 0.05686764791607857, | |
| "num_tokens": 31269403.0, | |
| "reward": 5.0, | |
| "reward_std": 1.2669485807418823, | |
| "rewards/reward_func/mean": 0.5555555555555556, | |
| "rewards/reward_func/std": 0.23293556190199322, | |
| "sampling/importance_sampling_ratio/max": 2.9992666244506836, | |
| "sampling/importance_sampling_ratio/mean": 0.9528164863586426, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.812483787536621, | |
| "sampling/sampling_logp_difference/mean": 0.16451823711395264, | |
| "step": 202, | |
| "step_time": 132.4189603566192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3834.0, | |
| "completions/mean_length": 1231.453125, | |
| "completions/mean_terminated_length": 1121.03271484375, | |
| "completions/min_length": 352.0, | |
| "completions/min_terminated_length": 352.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5824616849422455, | |
| "epoch": 0.5, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.009700723668098444, | |
| "kl": 0.02169888187199831, | |
| "learning_rate": 4.819551428754957e-05, | |
| "loss": -0.026023028418421745, | |
| "num_tokens": 31437672.0, | |
| "reward": 5.09375, | |
| "reward_std": 0.857390820980072, | |
| "rewards/reward_func/mean": 0.5659722222222222, | |
| "rewards/reward_func/std": 0.16799302399158478, | |
| "sampling/importance_sampling_ratio/max": 2.9991044998168945, | |
| "sampling/importance_sampling_ratio/mean": 0.9563360810279846, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 14.749972343444824, | |
| "sampling/sampling_logp_difference/mean": 0.1682780683040619, | |
| "step": 203, | |
| "step_time": 135.12933608028106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2889.0, | |
| "completions/mean_length": 1295.25, | |
| "completions/mean_terminated_length": 1005.5172119140625, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5633620470762253, | |
| "epoch": 0.5024630541871922, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.016436761796537254, | |
| "kl": 0.059325653593987226, | |
| "learning_rate": 4.8177374609663415e-05, | |
| "loss": -0.09959787130355835, | |
| "num_tokens": 31612008.0, | |
| "reward": 4.875, | |
| "reward_std": 1.3901581764221191, | |
| "rewards/reward_func/mean": 0.5416666666666666, | |
| "rewards/reward_func/std": 0.23205215194159085, | |
| "sampling/importance_sampling_ratio/max": 2.9937283992767334, | |
| "sampling/importance_sampling_ratio/mean": 0.9539804458618164, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.698464393615723, | |
| "sampling/sampling_logp_difference/mean": 0.17365781962871552, | |
| "step": 204, | |
| "step_time": 123.92455417569727 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 3117.0, | |
| "completions/mean_length": 1157.078125, | |
| "completions/mean_terminated_length": 1067.3834228515625, | |
| "completions/min_length": 247.0, | |
| "completions/min_terminated_length": 247.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.5557613000273705, | |
| "epoch": 0.5049261083743842, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.014862369706502163, | |
| "kl": 0.05807019583880901, | |
| "learning_rate": 4.815914766073719e-05, | |
| "loss": -0.008188445121049881, | |
| "num_tokens": 31765821.0, | |
| "reward": 5.12109375, | |
| "reward_std": 0.9008287787437439, | |
| "rewards/reward_func/mean": 0.5690104166666666, | |
| "rewards/reward_func/std": 0.16866978506247202, | |
| "sampling/importance_sampling_ratio/max": 2.9971923828125, | |
| "sampling/importance_sampling_ratio/mean": 0.9620099067687988, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.711771011352539, | |
| "sampling/sampling_logp_difference/mean": 0.15512652695178986, | |
| "step": 205, | |
| "step_time": 145.73420074605383 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2752.0, | |
| "completions/mean_length": 1087.1875, | |
| "completions/mean_terminated_length": 939.2130737304688, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5469877645373344, | |
| "epoch": 0.5073891625615764, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.024437213855539253, | |
| "kl": 0.03669538162648678, | |
| "learning_rate": 4.8140833509401815e-05, | |
| "loss": -0.08707018941640854, | |
| "num_tokens": 31912473.0, | |
| "reward": 4.5234375, | |
| "reward_std": 1.881783366203308, | |
| "rewards/reward_func/mean": 0.5026041666666666, | |
| "rewards/reward_func/std": 0.2910439347227414, | |
| "sampling/importance_sampling_ratio/max": 2.984907388687134, | |
| "sampling/importance_sampling_ratio/mean": 0.9634929895401001, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.110798835754395, | |
| "sampling/sampling_logp_difference/mean": 0.14892369508743286, | |
| "step": 206, | |
| "step_time": 118.83005754603073 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2835.0, | |
| "completions/mean_length": 1048.890625, | |
| "completions/mean_terminated_length": 978.3770141601562, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.47129160165786743, | |
| "epoch": 0.5098522167487685, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.01883575281540592, | |
| "kl": 0.02437049988657236, | |
| "learning_rate": 4.812243222461658e-05, | |
| "loss": 0.0016541481018066406, | |
| "num_tokens": 32064114.0, | |
| "reward": 4.9140625, | |
| "reward_std": 1.2513633966445923, | |
| "rewards/reward_func/mean": 0.5460069444444444, | |
| "rewards/reward_func/std": 0.2148944992158148, | |
| "sampling/importance_sampling_ratio/max": 2.994464159011841, | |
| "sampling/importance_sampling_ratio/mean": 0.9673416018486023, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 10.47595500946045, | |
| "sampling/sampling_logp_difference/mean": 0.1353481113910675, | |
| "step": 207, | |
| "step_time": 157.52031526481733 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2719.0, | |
| "completions/mean_length": 1064.859375, | |
| "completions/mean_terminated_length": 788.7413940429688, | |
| "completions/min_length": 346.0, | |
| "completions/min_terminated_length": 346.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.5812835246324539, | |
| "epoch": 0.5123152709359606, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.02172590175321915, | |
| "kl": 0.04220228176563978, | |
| "learning_rate": 4.8103943875668844e-05, | |
| "loss": -0.17685756087303162, | |
| "num_tokens": 32223801.0, | |
| "reward": 4.3125, | |
| "reward_std": 1.8055250644683838, | |
| "rewards/reward_func/mean": 0.4791666666666667, | |
| "rewards/reward_func/std": 0.27989307790994644, | |
| "sampling/importance_sampling_ratio/max": 2.9918086528778076, | |
| "sampling/importance_sampling_ratio/mean": 0.9594242572784424, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 12.874069213867188, | |
| "sampling/sampling_logp_difference/mean": 0.16716912388801575, | |
| "step": 208, | |
| "step_time": 135.27337133488618 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2506.0, | |
| "completions/mean_length": 945.953125, | |
| "completions/mean_terminated_length": 853.8851928710938, | |
| "completions/min_length": 194.0, | |
| "completions/min_terminated_length": 194.0, | |
| "degenerate_groups_filtered": 1.0, | |
| "entropy": 0.568095900118351, | |
| "epoch": 0.5147783251231527, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.02069842032168839, | |
| "kl": 0.04567871009930968, | |
| "learning_rate": 4.8085368532173804e-05, | |
| "loss": -0.054947808384895325, | |
| "num_tokens": 32373222.0, | |
| "reward": 4.88671875, | |
| "reward_std": 1.334768295288086, | |
| "rewards/reward_func/mean": 0.54296875, | |
| "rewards/reward_func/std": 0.22824304468101925, | |
| "sampling/importance_sampling_ratio/max": 2.996091604232788, | |
| "sampling/importance_sampling_ratio/mean": 0.9642556309700012, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 13.462482452392578, | |
| "sampling/sampling_logp_difference/mean": 0.15888792276382446, | |
| "step": 209, | |
| "step_time": 151.47347256494686 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 4096.0, | |
| "completions/max_terminated_length": 2362.0, | |
| "completions/mean_length": 1201.515625, | |
| "completions/mean_terminated_length": 1059.163818359375, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "degenerate_groups_filtered": 0.0, | |
| "entropy": 0.559689849615097, | |
| "epoch": 0.5172413793103449, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.015491091292339451, | |
| "kl": 0.03229631157591939, | |
| "learning_rate": 4.806670626407422e-05, | |
| "loss": -0.10237803310155869, | |
| "num_tokens": 32539367.0, | |
| "reward": 4.7734375, | |
| "reward_std": 1.3770502805709839, | |
| "rewards/reward_func/mean": 0.5303819444444444, | |
| "rewards/reward_func/std": 0.21820614321364296, | |
| "sampling/importance_sampling_ratio/max": 2.9985454082489014, | |
| "sampling/importance_sampling_ratio/mean": 0.9577094912528992, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 11.66981315612793, | |
| "sampling/sampling_logp_difference/mean": 0.1669681966304779, | |
| "step": 210, | |
| "step_time": 128.17530722473748 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1624, | |
| "num_input_tokens_seen": 32539367, | |
| "num_train_epochs": 4, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |