Instructions to use Gege24/gin_rummy_2G with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Gege24/gin_rummy_2G with PEFT:
Base model is not found.
- Transformers
How to use Gege24/gin_rummy_2G with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gege24/gin_rummy_2G")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Gege24/gin_rummy_2G", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gege24/gin_rummy_2G with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gege24/gin_rummy_2G"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_2G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Gege24/gin_rummy_2G
- SGLang
How to use Gege24/gin_rummy_2G with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Gege24/gin_rummy_2G" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_2G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Gege24/gin_rummy_2G" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gege24/gin_rummy_2G",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Gege24/gin_rummy_2G with Docker Model Runner:
docker model run hf.co/Gege24/gin_rummy_2G
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.006, | |
| "eval_steps": 500, | |
| "global_step": 75, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2813.0, | |
| "completions/max_terminated_length": 2813.0, | |
| "completions/mean_length": 2062.46875, | |
| "completions/mean_terminated_length": 2062.46875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.1340037016198039, | |
| "epoch": 8e-05, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.9753277897834778, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0097, | |
| "num_tokens": 78941.0, | |
| "reward": 0.5106250047683716, | |
| "reward_std": 0.16591878235340118, | |
| "rewards/rollout_reward_func/mean": 0.5106250047683716, | |
| "rewards/rollout_reward_func/std": 0.38574549555778503, | |
| "sampling/importance_sampling_ratio/max": 1.89468514919281, | |
| "sampling/importance_sampling_ratio/mean": 0.917938768863678, | |
| "sampling/importance_sampling_ratio/min": 0.26035696268081665, | |
| "sampling/sampling_logp_difference/max": 1.035329818725586, | |
| "sampling/sampling_logp_difference/mean": 0.020964600145816803, | |
| "step": 1, | |
| "step_time": 18.408817325000086 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 2091.09375, | |
| "completions/mean_terminated_length": 2091.09375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.1304742144420743, | |
| "epoch": 0.00016, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.355797290802002, | |
| "kl": 0.0, | |
| "learning_rate": 2.2857142857142855e-07, | |
| "loss": -0.0694, | |
| "num_tokens": 158774.0, | |
| "reward": 0.38593748211860657, | |
| "reward_std": 0.15246255695819855, | |
| "rewards/rollout_reward_func/mean": 0.38593748211860657, | |
| "rewards/rollout_reward_func/std": 0.3016391694545746, | |
| "sampling/importance_sampling_ratio/max": 2.747450828552246, | |
| "sampling/importance_sampling_ratio/mean": 0.995655357837677, | |
| "sampling/importance_sampling_ratio/min": 0.30046260356903076, | |
| "sampling/sampling_logp_difference/max": 1.1795392036437988, | |
| "sampling/sampling_logp_difference/mean": 0.022426610812544823, | |
| "step": 2, | |
| "step_time": 17.007036188999905 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 1881.75, | |
| "completions/mean_terminated_length": 1881.75, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.12374210823327303, | |
| "epoch": 0.00024, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.6671572327613831, | |
| "kl": 0.0014705628045703634, | |
| "learning_rate": 4.571428571428571e-07, | |
| "loss": -0.0235, | |
| "num_tokens": 231260.0, | |
| "reward": 0.4012500047683716, | |
| "reward_std": 0.20683754980564117, | |
| "rewards/rollout_reward_func/mean": 0.4012500047683716, | |
| "rewards/rollout_reward_func/std": 0.33187392354011536, | |
| "sampling/importance_sampling_ratio/max": 1.3709925413131714, | |
| "sampling/importance_sampling_ratio/mean": 0.8955257534980774, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9760880470275879, | |
| "sampling/sampling_logp_difference/mean": 0.020091338083148003, | |
| "step": 3, | |
| "step_time": 16.527492177000227 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 2263.03125, | |
| "completions/mean_terminated_length": 2263.03125, | |
| "completions/min_length": 1569.0, | |
| "completions/min_terminated_length": 1569.0, | |
| "entropy": 0.15253359219059348, | |
| "epoch": 0.00032, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 0.9312232136726379, | |
| "kl": 0.0016055026353569701, | |
| "learning_rate": 6.857142857142857e-07, | |
| "loss": -0.0241, | |
| "num_tokens": 316879.0, | |
| "reward": 0.3787500262260437, | |
| "reward_std": 0.0624999962747097, | |
| "rewards/rollout_reward_func/mean": 0.3787500262260437, | |
| "rewards/rollout_reward_func/std": 0.26268768310546875, | |
| "sampling/importance_sampling_ratio/max": 1.9853609800338745, | |
| "sampling/importance_sampling_ratio/mean": 0.9613277912139893, | |
| "sampling/importance_sampling_ratio/min": 0.4037262201309204, | |
| "sampling/sampling_logp_difference/max": 0.6126779317855835, | |
| "sampling/sampling_logp_difference/mean": 0.02291969209909439, | |
| "step": 4, | |
| "step_time": 18.10109623099993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.003289473708719015, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003289473708719015, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2807.0, | |
| "completions/max_terminated_length": 2807.0, | |
| "completions/mean_length": 2197.1875, | |
| "completions/mean_terminated_length": 2197.1875, | |
| "completions/min_length": 1570.0, | |
| "completions/min_terminated_length": 1570.0, | |
| "entropy": 0.14702600054442883, | |
| "epoch": 0.0004, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 2.2064177989959717, | |
| "kl": 0.002026251120696543, | |
| "learning_rate": 9.142857142857142e-07, | |
| "loss": -0.0169, | |
| "num_tokens": 400370.0, | |
| "reward": 0.4140625, | |
| "reward_std": 0.15217570960521698, | |
| "rewards/rollout_reward_func/mean": 0.4140625, | |
| "rewards/rollout_reward_func/std": 0.33838188648223877, | |
| "sampling/importance_sampling_ratio/max": 2.348391532897949, | |
| "sampling/importance_sampling_ratio/mean": 1.0998704433441162, | |
| "sampling/importance_sampling_ratio/min": 0.6395935416221619, | |
| "sampling/sampling_logp_difference/max": 0.6357507705688477, | |
| "sampling/sampling_logp_difference/mean": 0.020555175840854645, | |
| "step": 5, | |
| "step_time": 16.967482953000058 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009868421126157045, | |
| "clip_ratio/high_mean": 0.004934210563078523, | |
| "clip_ratio/low_mean": 0.0034829721553251147, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008417182718403637, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2449.0, | |
| "completions/max_terminated_length": 2449.0, | |
| "completions/mean_length": 1818.28125, | |
| "completions/mean_terminated_length": 1818.28125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.0911772302351892, | |
| "epoch": 0.00048, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.8283352255821228, | |
| "kl": 0.001795254382159328, | |
| "learning_rate": 1.1428571428571428e-06, | |
| "loss": -0.0209, | |
| "num_tokens": 470216.0, | |
| "reward": 0.5893750190734863, | |
| "reward_std": 0.1562499850988388, | |
| "rewards/rollout_reward_func/mean": 0.5893750190734863, | |
| "rewards/rollout_reward_func/std": 0.4223737418651581, | |
| "sampling/importance_sampling_ratio/max": 1.7877978086471558, | |
| "sampling/importance_sampling_ratio/mean": 0.9894595146179199, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9945626258850098, | |
| "sampling/sampling_logp_difference/mean": 0.01755390875041485, | |
| "step": 6, | |
| "step_time": 15.150656265999714 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009375000139698386, | |
| "clip_ratio/high_mean": 0.0062500000931322575, | |
| "clip_ratio/low_mean": 0.006411405862309039, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012661405955441296, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2791.0, | |
| "completions/max_terminated_length": 2791.0, | |
| "completions/mean_length": 2104.28125, | |
| "completions/mean_terminated_length": 2104.28125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.16090343240648508, | |
| "epoch": 0.00056, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.4131731986999512, | |
| "kl": 0.002872211887734011, | |
| "learning_rate": 1.3714285714285715e-06, | |
| "loss": -0.03, | |
| "num_tokens": 550373.0, | |
| "reward": 0.29500001668930054, | |
| "reward_std": 0.0949999988079071, | |
| "rewards/rollout_reward_func/mean": 0.29500001668930054, | |
| "rewards/rollout_reward_func/std": 0.1687716543674469, | |
| "sampling/importance_sampling_ratio/max": 1.3797976970672607, | |
| "sampling/importance_sampling_ratio/mean": 0.9415616989135742, | |
| "sampling/importance_sampling_ratio/min": 0.29769256711006165, | |
| "sampling/sampling_logp_difference/max": 0.9464168548583984, | |
| "sampling/sampling_logp_difference/mean": 0.02406277321279049, | |
| "step": 7, | |
| "step_time": 16.95326116199999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2814.0, | |
| "completions/max_terminated_length": 2814.0, | |
| "completions/mean_length": 2265.5625, | |
| "completions/mean_terminated_length": 2265.5625, | |
| "completions/min_length": 1569.0, | |
| "completions/min_terminated_length": 1569.0, | |
| "entropy": 0.20542557537555695, | |
| "epoch": 0.00064, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.0486057996749878, | |
| "kl": 0.00249500987411011, | |
| "learning_rate": 1.6e-06, | |
| "loss": 0.0307, | |
| "num_tokens": 636318.0, | |
| "reward": 0.3434374928474426, | |
| "reward_std": 0.08029377460479736, | |
| "rewards/rollout_reward_func/mean": 0.3434374928474426, | |
| "rewards/rollout_reward_func/std": 0.22245851159095764, | |
| "sampling/importance_sampling_ratio/max": 2.239882230758667, | |
| "sampling/importance_sampling_ratio/mean": 0.9191794395446777, | |
| "sampling/importance_sampling_ratio/min": 0.3405879735946655, | |
| "sampling/sampling_logp_difference/max": 1.0898922681808472, | |
| "sampling/sampling_logp_difference/mean": 0.023122236132621765, | |
| "step": 8, | |
| "step_time": 18.007336240000086 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010620915098115802, | |
| "clip_ratio/high_mean": 0.005310457549057901, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0069551944034174085, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2790.0, | |
| "completions/max_terminated_length": 2790.0, | |
| "completions/mean_length": 1646.03125, | |
| "completions/mean_terminated_length": 1646.03125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.13724102126434445, | |
| "epoch": 0.00072, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.481195092201233, | |
| "kl": 0.0015346993204730097, | |
| "learning_rate": 1.8285714285714284e-06, | |
| "loss": -0.0189, | |
| "num_tokens": 701005.0, | |
| "reward": 0.6278125047683716, | |
| "reward_std": 0.30871257185935974, | |
| "rewards/rollout_reward_func/mean": 0.6278125047683716, | |
| "rewards/rollout_reward_func/std": 0.4518972933292389, | |
| "sampling/importance_sampling_ratio/max": 1.9988352060317993, | |
| "sampling/importance_sampling_ratio/mean": 0.9468162059783936, | |
| "sampling/importance_sampling_ratio/min": 0.4476884603500366, | |
| "sampling/sampling_logp_difference/max": 0.773470401763916, | |
| "sampling/sampling_logp_difference/mean": 0.02107076346874237, | |
| "step": 9, | |
| "step_time": 16.103736942000182 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008938953513279557, | |
| "clip_ratio/high_mean": 0.004469476756639779, | |
| "clip_ratio/low_mean": 0.0015243901871144772, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005993866943754256, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 2226.4375, | |
| "completions/mean_terminated_length": 2226.4375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.18680323101580143, | |
| "epoch": 0.0008, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 2.0065183639526367, | |
| "kl": 0.002535051797167398, | |
| "learning_rate": 2.057142857142857e-06, | |
| "loss": 0.0495, | |
| "num_tokens": 785722.0, | |
| "reward": 0.4443749785423279, | |
| "reward_std": 0.08841878175735474, | |
| "rewards/rollout_reward_func/mean": 0.4443749785423279, | |
| "rewards/rollout_reward_func/std": 0.35590803623199463, | |
| "sampling/importance_sampling_ratio/max": 2.790905714035034, | |
| "sampling/importance_sampling_ratio/mean": 0.940368115901947, | |
| "sampling/importance_sampling_ratio/min": 0.22617992758750916, | |
| "sampling/sampling_logp_difference/max": 0.6851506233215332, | |
| "sampling/sampling_logp_difference/mean": 0.025203729048371315, | |
| "step": 10, | |
| "step_time": 17.030096009999966 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0015625000232830644, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2783.0, | |
| "completions/max_terminated_length": 2783.0, | |
| "completions/mean_length": 1966.875, | |
| "completions/mean_terminated_length": 1966.875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.1440325272269547, | |
| "epoch": 0.00088, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.0474733114242554, | |
| "kl": 0.0011928395033464767, | |
| "learning_rate": 2.2857142857142856e-06, | |
| "loss": -0.0121, | |
| "num_tokens": 861366.0, | |
| "reward": 0.4596875011920929, | |
| "reward_std": 0.14279377460479736, | |
| "rewards/rollout_reward_func/mean": 0.4596875011920929, | |
| "rewards/rollout_reward_func/std": 0.38282889127731323, | |
| "sampling/importance_sampling_ratio/max": 1.9087510108947754, | |
| "sampling/importance_sampling_ratio/mean": 0.9341506361961365, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9133691787719727, | |
| "sampling/sampling_logp_difference/mean": 0.02222413383424282, | |
| "step": 11, | |
| "step_time": 16.685985845999994 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007352941203862429, | |
| "clip_ratio/high_mean": 0.0055147059028968215, | |
| "clip_ratio/low_mean": 0.0036764706019312143, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009191176504828036, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 1857.96875, | |
| "completions/mean_terminated_length": 1857.96875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.17092999629676342, | |
| "epoch": 0.00096, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 2.0095906257629395, | |
| "kl": 0.0024750066513661295, | |
| "learning_rate": 2.5142857142857142e-06, | |
| "loss": -0.0228, | |
| "num_tokens": 933262.0, | |
| "reward": 0.48593753576278687, | |
| "reward_std": 0.20529377460479736, | |
| "rewards/rollout_reward_func/mean": 0.48593753576278687, | |
| "rewards/rollout_reward_func/std": 0.38762184977531433, | |
| "sampling/importance_sampling_ratio/max": 1.6060364246368408, | |
| "sampling/importance_sampling_ratio/mean": 0.9451028108596802, | |
| "sampling/importance_sampling_ratio/min": 0.34411877393722534, | |
| "sampling/sampling_logp_difference/max": 1.0912601947784424, | |
| "sampling/sampling_logp_difference/mean": 0.019420120865106583, | |
| "step": 12, | |
| "step_time": 17.06754343600005 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007936508161947131, | |
| "clip_ratio/high_mean": 0.003968254080973566, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0057043652050197124, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2812.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 1741.5, | |
| "completions/mean_terminated_length": 1741.5, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.11786343855783343, | |
| "epoch": 0.00104, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 2.0365312099456787, | |
| "kl": 0.0034534272163000423, | |
| "learning_rate": 2.742857142857143e-06, | |
| "loss": -0.0389, | |
| "num_tokens": 1001046.0, | |
| "reward": 0.6475000381469727, | |
| "reward_std": 0.24292194843292236, | |
| "rewards/rollout_reward_func/mean": 0.6475000381469727, | |
| "rewards/rollout_reward_func/std": 0.4391413629055023, | |
| "sampling/importance_sampling_ratio/max": 2.502153158187866, | |
| "sampling/importance_sampling_ratio/mean": 1.0042061805725098, | |
| "sampling/importance_sampling_ratio/min": 0.4875127375125885, | |
| "sampling/sampling_logp_difference/max": 0.6685242652893066, | |
| "sampling/sampling_logp_difference/mean": 0.01504062581807375, | |
| "step": 13, | |
| "step_time": 16.125135786999863 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011430230224505067, | |
| "clip_ratio/high_mean": 0.0057151151122525334, | |
| "clip_ratio/low_mean": 0.0022321429569274187, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007947258069179952, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2802.0, | |
| "completions/max_terminated_length": 2802.0, | |
| "completions/mean_length": 2010.3125, | |
| "completions/mean_terminated_length": 2010.3125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.1693209670484066, | |
| "epoch": 0.00112, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.2513231039047241, | |
| "kl": 0.002481764371623285, | |
| "learning_rate": 2.9714285714285716e-06, | |
| "loss": 0.0309, | |
| "num_tokens": 1078101.0, | |
| "reward": 0.4753125011920929, | |
| "reward_std": 0.19296419620513916, | |
| "rewards/rollout_reward_func/mean": 0.4753125011920929, | |
| "rewards/rollout_reward_func/std": 0.3761346936225891, | |
| "sampling/importance_sampling_ratio/max": 1.5099362134933472, | |
| "sampling/importance_sampling_ratio/mean": 0.9022589921951294, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6567137241363525, | |
| "sampling/sampling_logp_difference/mean": 0.022564683109521866, | |
| "step": 14, | |
| "step_time": 16.688260055 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0018382353009656072, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003574346425011754, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2775.0, | |
| "completions/max_terminated_length": 2775.0, | |
| "completions/mean_length": 1887.71875, | |
| "completions/mean_terminated_length": 1887.71875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.14878523536026478, | |
| "epoch": 0.0012, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.092544674873352, | |
| "kl": 0.0021166762671782635, | |
| "learning_rate": 3.2e-06, | |
| "loss": -0.014, | |
| "num_tokens": 1150803.0, | |
| "reward": 0.4506249725818634, | |
| "reward_std": 0.2620203495025635, | |
| "rewards/rollout_reward_func/mean": 0.4506249725818634, | |
| "rewards/rollout_reward_func/std": 0.3878471255302429, | |
| "sampling/importance_sampling_ratio/max": 2.092060089111328, | |
| "sampling/importance_sampling_ratio/mean": 1.0061091184616089, | |
| "sampling/importance_sampling_ratio/min": 0.519212543964386, | |
| "sampling/sampling_logp_difference/max": 0.664109468460083, | |
| "sampling/sampling_logp_difference/mean": 0.02180980145931244, | |
| "step": 15, | |
| "step_time": 16.993085070999996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0016891892300918698, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016891892300918698, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2812.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 2229.28125, | |
| "completions/mean_terminated_length": 2229.28125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.15634657256305218, | |
| "epoch": 0.00128, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 0.09010659158229828, | |
| "kl": 0.005270412558274984, | |
| "learning_rate": 3.428571428571428e-06, | |
| "loss": 0.005, | |
| "num_tokens": 1235464.0, | |
| "reward": 0.48374998569488525, | |
| "reward_std": 0.0625, | |
| "rewards/rollout_reward_func/mean": 0.48374998569488525, | |
| "rewards/rollout_reward_func/std": 0.36630454659461975, | |
| "sampling/importance_sampling_ratio/max": 1.890110969543457, | |
| "sampling/importance_sampling_ratio/mean": 0.9257134199142456, | |
| "sampling/importance_sampling_ratio/min": 0.4630853831768036, | |
| "sampling/sampling_logp_difference/max": 0.69629967212677, | |
| "sampling/sampling_logp_difference/mean": 0.020435180515050888, | |
| "step": 16, | |
| "step_time": 17.667978367999467 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.016176471021026373, | |
| "clip_ratio/high_mean": 0.009732972481288016, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.011377709335647523, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2449.0, | |
| "completions/max_terminated_length": 2449.0, | |
| "completions/mean_length": 1750.53125, | |
| "completions/mean_terminated_length": 1750.53125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.13329231040552258, | |
| "epoch": 0.00136, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.2919148206710815, | |
| "kl": 0.0019803230388788506, | |
| "learning_rate": 3.657142857142857e-06, | |
| "loss": 0.0395, | |
| "num_tokens": 1303537.0, | |
| "reward": 0.5606250166893005, | |
| "reward_std": 0.2351399064064026, | |
| "rewards/rollout_reward_func/mean": 0.5606250166893005, | |
| "rewards/rollout_reward_func/std": 0.41600972414016724, | |
| "sampling/importance_sampling_ratio/max": 1.4119406938552856, | |
| "sampling/importance_sampling_ratio/mean": 0.9287126064300537, | |
| "sampling/importance_sampling_ratio/min": 0.3827318847179413, | |
| "sampling/sampling_logp_difference/max": 0.6636209487915039, | |
| "sampling/sampling_logp_difference/mean": 0.017019610852003098, | |
| "step": 17, | |
| "step_time": 15.075951100999873 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0062500000931322575, | |
| "clip_ratio/high_mean": 0.0031250000465661287, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004769736900925636, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 2316.9375, | |
| "completions/mean_terminated_length": 2316.9375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.16856362204998732, | |
| "epoch": 0.00144, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 2.867643356323242, | |
| "kl": 0.005084036383777857, | |
| "learning_rate": 3.885714285714286e-06, | |
| "loss": -0.0049, | |
| "num_tokens": 1391780.0, | |
| "reward": 0.37031251192092896, | |
| "reward_std": 0.10187499970197678, | |
| "rewards/rollout_reward_func/mean": 0.37031251192092896, | |
| "rewards/rollout_reward_func/std": 0.2657124996185303, | |
| "sampling/importance_sampling_ratio/max": 1.9782981872558594, | |
| "sampling/importance_sampling_ratio/mean": 1.0121815204620361, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.48717403411865234, | |
| "sampling/sampling_logp_difference/mean": 0.022180214524269104, | |
| "step": 18, | |
| "step_time": 17.064124244999903 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0062500000931322575, | |
| "clip_ratio/high_mean": 0.0031250000465661287, | |
| "clip_ratio/low_mean": 0.0015243901871144772, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004649390233680606, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2798.0, | |
| "completions/max_terminated_length": 2798.0, | |
| "completions/mean_length": 2216.0, | |
| "completions/mean_terminated_length": 2216.0, | |
| "completions/min_length": 1563.0, | |
| "completions/min_terminated_length": 1563.0, | |
| "entropy": 0.1883529694750905, | |
| "epoch": 0.00152, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 2.2749135494232178, | |
| "kl": 0.005091317143524066, | |
| "learning_rate": 4.114285714285714e-06, | |
| "loss": 0.0455, | |
| "num_tokens": 1475873.0, | |
| "reward": 0.4059374928474426, | |
| "reward_std": 0.00812500063329935, | |
| "rewards/rollout_reward_func/mean": 0.4059374928474426, | |
| "rewards/rollout_reward_func/std": 0.29813244938850403, | |
| "sampling/importance_sampling_ratio/max": 2.81974196434021, | |
| "sampling/importance_sampling_ratio/mean": 1.0345871448516846, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8696746826171875, | |
| "sampling/sampling_logp_difference/mean": 0.02671925723552704, | |
| "step": 19, | |
| "step_time": 17.602816416999985 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011488970601931214, | |
| "clip_ratio/high_mean": 0.005744485300965607, | |
| "clip_ratio/low_mean": 0.0048926768358796835, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01063716213684529, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2819.0, | |
| "completions/max_terminated_length": 2819.0, | |
| "completions/mean_length": 1814.28125, | |
| "completions/mean_terminated_length": 1814.28125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.1282729902304709, | |
| "epoch": 0.0016, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.691245198249817, | |
| "kl": 0.002560590350185521, | |
| "learning_rate": 4.342857142857142e-06, | |
| "loss": -0.0259, | |
| "num_tokens": 1546081.0, | |
| "reward": 0.612500011920929, | |
| "reward_std": 0.18216876685619354, | |
| "rewards/rollout_reward_func/mean": 0.612500011920929, | |
| "rewards/rollout_reward_func/std": 0.43820008635520935, | |
| "sampling/importance_sampling_ratio/max": 1.9052116870880127, | |
| "sampling/importance_sampling_ratio/mean": 1.007737636566162, | |
| "sampling/importance_sampling_ratio/min": 0.5377175211906433, | |
| "sampling/sampling_logp_difference/max": 0.6581223011016846, | |
| "sampling/sampling_logp_difference/mean": 0.01716558076441288, | |
| "step": 20, | |
| "step_time": 16.950590521000322 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010667945956811309, | |
| "clip_ratio/high_mean": 0.005333972978405654, | |
| "clip_ratio/low_mean": 0.004784891498275101, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010118864476680756, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2791.0, | |
| "completions/max_terminated_length": 2791.0, | |
| "completions/mean_length": 1928.25, | |
| "completions/mean_terminated_length": 1928.25, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.1822828585281968, | |
| "epoch": 0.00168, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.4209487438201904, | |
| "kl": 0.0024251511349575594, | |
| "learning_rate": 4.571428571428571e-06, | |
| "loss": -0.038, | |
| "num_tokens": 1620395.0, | |
| "reward": 0.6059374809265137, | |
| "reward_std": 0.20529377460479736, | |
| "rewards/rollout_reward_func/mean": 0.6059374809265137, | |
| "rewards/rollout_reward_func/std": 0.4463822841644287, | |
| "sampling/importance_sampling_ratio/max": 1.6781816482543945, | |
| "sampling/importance_sampling_ratio/mean": 1.0399606227874756, | |
| "sampling/importance_sampling_ratio/min": 0.30244705080986023, | |
| "sampling/sampling_logp_difference/max": 0.7038769721984863, | |
| "sampling/sampling_logp_difference/mean": 0.028661729767918587, | |
| "step": 21, | |
| "step_time": 16.793115096000065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.01126575656235218, | |
| "clip_ratio/high_mean": 0.00563287828117609, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007471113582141697, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2806.0, | |
| "completions/max_terminated_length": 2806.0, | |
| "completions/mean_length": 1895.0, | |
| "completions/mean_terminated_length": 1895.0, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.11727871629409492, | |
| "epoch": 0.00176, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.4466348886489868, | |
| "kl": 0.003775272169150412, | |
| "learning_rate": 4.8e-06, | |
| "loss": 0.0009, | |
| "num_tokens": 1693497.0, | |
| "reward": 0.5303125381469727, | |
| "reward_std": 0.2740437984466553, | |
| "rewards/rollout_reward_func/mean": 0.5303125381469727, | |
| "rewards/rollout_reward_func/std": 0.4203799068927765, | |
| "sampling/importance_sampling_ratio/max": 1.6391761302947998, | |
| "sampling/importance_sampling_ratio/mean": 0.9023667573928833, | |
| "sampling/importance_sampling_ratio/min": 0.30269956588745117, | |
| "sampling/sampling_logp_difference/max": 1.103229284286499, | |
| "sampling/sampling_logp_difference/mean": 0.028024764731526375, | |
| "step": 22, | |
| "step_time": 16.31928637299984 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.006761695956811309, | |
| "clip_ratio/high_mean": 0.0033808479784056544, | |
| "clip_ratio/low_mean": 0.0034829722717404366, | |
| "clip_ratio/low_min": 0.003289473708719015, | |
| "clip_ratio/region_mean": 0.006863820250146091, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 1923.9375, | |
| "completions/mean_terminated_length": 1923.9375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.14631808176636696, | |
| "epoch": 0.00184, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.9489635229110718, | |
| "kl": 0.0022731795979780145, | |
| "learning_rate": 5.0285714285714285e-06, | |
| "loss": -0.0229, | |
| "num_tokens": 1767354.0, | |
| "reward": 0.4168750047683716, | |
| "reward_std": 0.20417675375938416, | |
| "rewards/rollout_reward_func/mean": 0.4168750047683716, | |
| "rewards/rollout_reward_func/std": 0.33288994431495667, | |
| "sampling/importance_sampling_ratio/max": 2.5520823001861572, | |
| "sampling/importance_sampling_ratio/mean": 1.124953269958496, | |
| "sampling/importance_sampling_ratio/min": 0.3814745247364044, | |
| "sampling/sampling_logp_difference/max": 0.9945569038391113, | |
| "sampling/sampling_logp_difference/mean": 0.021298212930560112, | |
| "step": 23, | |
| "step_time": 16.909164390999877 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0062500000931322575, | |
| "clip_ratio/high_mean": 0.0031250000465661287, | |
| "clip_ratio/low_mean": 0.0015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004687500069849193, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2798.0, | |
| "completions/max_terminated_length": 2798.0, | |
| "completions/mean_length": 2009.28125, | |
| "completions/mean_terminated_length": 2009.28125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.18214968033134937, | |
| "epoch": 0.00192, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.610776662826538, | |
| "kl": 0.003005845101142768, | |
| "learning_rate": 5.257142857142857e-06, | |
| "loss": -0.0704, | |
| "num_tokens": 1844719.0, | |
| "reward": 0.4243749976158142, | |
| "reward_std": 0.21341876685619354, | |
| "rewards/rollout_reward_func/mean": 0.4243749976158142, | |
| "rewards/rollout_reward_func/std": 0.362561970949173, | |
| "sampling/importance_sampling_ratio/max": 1.7062076330184937, | |
| "sampling/importance_sampling_ratio/mean": 1.0438251495361328, | |
| "sampling/importance_sampling_ratio/min": 0.3492918312549591, | |
| "sampling/sampling_logp_difference/max": 0.5720778703689575, | |
| "sampling/sampling_logp_difference/mean": 0.023630155250430107, | |
| "step": 24, | |
| "step_time": 17.14499585999988 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0031250000465661287, | |
| "clip_ratio/high_mean": 0.0015625000232830644, | |
| "clip_ratio/low_mean": 0.0031250000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004687500069849193, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2440.0, | |
| "completions/max_terminated_length": 2440.0, | |
| "completions/mean_length": 2006.6875, | |
| "completions/mean_terminated_length": 2006.6875, | |
| "completions/min_length": 1055.0, | |
| "completions/min_terminated_length": 1055.0, | |
| "entropy": 0.11316484399139881, | |
| "epoch": 0.002, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.0315803289413452, | |
| "kl": 0.0030158030276652426, | |
| "learning_rate": 5.485714285714286e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 1921270.0, | |
| "reward": 0.47968751192092896, | |
| "reward_std": 0.07062499970197678, | |
| "rewards/rollout_reward_func/mean": 0.47968751192092896, | |
| "rewards/rollout_reward_func/std": 0.36560583114624023, | |
| "sampling/importance_sampling_ratio/max": 1.54865562915802, | |
| "sampling/importance_sampling_ratio/mean": 0.9767700433731079, | |
| "sampling/importance_sampling_ratio/min": 0.5038349628448486, | |
| "sampling/sampling_logp_difference/max": 0.4957547187805176, | |
| "sampling/sampling_logp_difference/mean": 0.01319466158747673, | |
| "step": 25, | |
| "step_time": 15.574967514000036 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007352941203862429, | |
| "clip_ratio/high_mean": 0.0036764706019312143, | |
| "clip_ratio/low_mean": 0.0015625000232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005238970625214279, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2816.0, | |
| "completions/max_terminated_length": 2816.0, | |
| "completions/mean_length": 2016.375, | |
| "completions/mean_terminated_length": 2016.375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.1327181551605463, | |
| "epoch": 0.00208, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.1481389999389648, | |
| "kl": 0.0036689931839646306, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": -0.004, | |
| "num_tokens": 1998617.0, | |
| "reward": 0.4909375011920929, | |
| "reward_std": 0.12780338525772095, | |
| "rewards/rollout_reward_func/mean": 0.4909375011920929, | |
| "rewards/rollout_reward_func/std": 0.3668818771839142, | |
| "sampling/importance_sampling_ratio/max": 1.5999431610107422, | |
| "sampling/importance_sampling_ratio/mean": 0.9634629487991333, | |
| "sampling/importance_sampling_ratio/min": 0.2564904987812042, | |
| "sampling/sampling_logp_difference/max": 0.6982665061950684, | |
| "sampling/sampling_logp_difference/mean": 0.01972239464521408, | |
| "step": 26, | |
| "step_time": 17.251899020999645 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571827709675, | |
| "clip_ratio/high_mean": 0.004464285913854837, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285913854837, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2810.0, | |
| "completions/max_terminated_length": 2810.0, | |
| "completions/mean_length": 1849.84375, | |
| "completions/mean_terminated_length": 1849.84375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.09636542806401849, | |
| "epoch": 0.00216, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7088967561721802, | |
| "kl": 0.002837517200532602, | |
| "learning_rate": 5.942857142857143e-06, | |
| "loss": -0.0039, | |
| "num_tokens": 2070057.0, | |
| "reward": 0.5687500238418579, | |
| "reward_std": 0.2160891890525818, | |
| "rewards/rollout_reward_func/mean": 0.5687500238418579, | |
| "rewards/rollout_reward_func/std": 0.4140106439590454, | |
| "sampling/importance_sampling_ratio/max": 1.6554824113845825, | |
| "sampling/importance_sampling_ratio/mean": 1.0057581663131714, | |
| "sampling/importance_sampling_ratio/min": 0.13771295547485352, | |
| "sampling/sampling_logp_difference/max": 1.693850040435791, | |
| "sampling/sampling_logp_difference/mean": 0.015657048672437668, | |
| "step": 27, | |
| "step_time": 16.657898435000106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0018382353009656072, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0018382353009656072, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2800.0, | |
| "completions/max_terminated_length": 2800.0, | |
| "completions/mean_length": 2166.28125, | |
| "completions/mean_terminated_length": 2166.28125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.11421543313190341, | |
| "epoch": 0.00224, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.0997257232666016, | |
| "kl": 0.001993668673094362, | |
| "learning_rate": 6.171428571428571e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 2152509.0, | |
| "reward": 0.4090625047683716, | |
| "reward_std": 0.14099711179733276, | |
| "rewards/rollout_reward_func/mean": 0.4090625047683716, | |
| "rewards/rollout_reward_func/std": 0.3153631389141083, | |
| "sampling/importance_sampling_ratio/max": 1.4471888542175293, | |
| "sampling/importance_sampling_ratio/mean": 0.9940881133079529, | |
| "sampling/importance_sampling_ratio/min": 0.36628207564353943, | |
| "sampling/sampling_logp_difference/max": 0.4437229633331299, | |
| "sampling/sampling_logp_difference/mean": 0.01664617471396923, | |
| "step": 28, | |
| "step_time": 16.654660168999953 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.014613970648497343, | |
| "clip_ratio/high_mean": 0.009043096331879497, | |
| "clip_ratio/low_mean": 0.001953125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.010996221215464175, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2427.0, | |
| "completions/max_terminated_length": 2427.0, | |
| "completions/mean_length": 1541.1875, | |
| "completions/mean_terminated_length": 1541.1875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.11991730704903603, | |
| "epoch": 0.00232, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.753915548324585, | |
| "kl": 0.004232257339026546, | |
| "learning_rate": 6.4e-06, | |
| "loss": -0.0561, | |
| "num_tokens": 2213064.0, | |
| "reward": 0.7221875190734863, | |
| "reward_std": 0.28713130950927734, | |
| "rewards/rollout_reward_func/mean": 0.7221875190734863, | |
| "rewards/rollout_reward_func/std": 0.47223374247550964, | |
| "sampling/importance_sampling_ratio/max": 1.8491572141647339, | |
| "sampling/importance_sampling_ratio/mean": 0.9664063453674316, | |
| "sampling/importance_sampling_ratio/min": 0.2533723711967468, | |
| "sampling/sampling_logp_difference/max": 0.7475757598876953, | |
| "sampling/sampling_logp_difference/mean": 0.02090834453701973, | |
| "step": 29, | |
| "step_time": 14.724566217999836 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011101973708719015, | |
| "clip_ratio/high_mean": 0.0055509868543595076, | |
| "clip_ratio/low_mean": 0.0036764706019312143, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009227457456290722, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2795.0, | |
| "completions/max_terminated_length": 2795.0, | |
| "completions/mean_length": 1690.28125, | |
| "completions/mean_terminated_length": 1690.28125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.15091887768357992, | |
| "epoch": 0.0024, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.5955613851547241, | |
| "kl": 0.003297277187812142, | |
| "learning_rate": 6.628571428571428e-06, | |
| "loss": -0.0312, | |
| "num_tokens": 2278894.0, | |
| "reward": 0.5768749713897705, | |
| "reward_std": 0.2879711389541626, | |
| "rewards/rollout_reward_func/mean": 0.5768749713897705, | |
| "rewards/rollout_reward_func/std": 0.4466031789779663, | |
| "sampling/importance_sampling_ratio/max": 2.0268845558166504, | |
| "sampling/importance_sampling_ratio/mean": 0.9738575220108032, | |
| "sampling/importance_sampling_ratio/min": 0.3578207492828369, | |
| "sampling/sampling_logp_difference/max": 0.7220923900604248, | |
| "sampling/sampling_logp_difference/mean": 0.023466479033231735, | |
| "step": 30, | |
| "step_time": 15.794334728999956 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.008928571827709675, | |
| "clip_ratio/high_mean": 0.006302521098405123, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006302521098405123, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2791.0, | |
| "completions/max_terminated_length": 2791.0, | |
| "completions/mean_length": 1802.28125, | |
| "completions/mean_terminated_length": 1802.28125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.1271415469236672, | |
| "epoch": 0.00248, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.5290101170539856, | |
| "kl": 0.007829649756786239, | |
| "learning_rate": 6.857142857142856e-06, | |
| "loss": 0.0219, | |
| "num_tokens": 2348595.0, | |
| "reward": 0.6468750238418579, | |
| "reward_std": 0.15625, | |
| "rewards/rollout_reward_func/mean": 0.6468750238418579, | |
| "rewards/rollout_reward_func/std": 0.4312205910682678, | |
| "sampling/importance_sampling_ratio/max": 1.4284578561782837, | |
| "sampling/importance_sampling_ratio/mean": 1.008836269378662, | |
| "sampling/importance_sampling_ratio/min": 0.5545295476913452, | |
| "sampling/sampling_logp_difference/max": 0.9234024286270142, | |
| "sampling/sampling_logp_difference/mean": 0.020626772195100784, | |
| "step": 31, | |
| "step_time": 16.82306190600002 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.010130719048902392, | |
| "clip_ratio/high_mean": 0.005065359524451196, | |
| "clip_ratio/low_mean": 0.0036210318794474006, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008686391403898597, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2450.0, | |
| "completions/max_terminated_length": 2450.0, | |
| "completions/mean_length": 1986.28125, | |
| "completions/mean_terminated_length": 1986.28125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.13206067122519016, | |
| "epoch": 0.00256, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.3528786897659302, | |
| "kl": 0.008182306781236548, | |
| "learning_rate": 7.085714285714285e-06, | |
| "loss": -0.0853, | |
| "num_tokens": 2424934.0, | |
| "reward": 0.4606249928474426, | |
| "reward_std": 0.15091878175735474, | |
| "rewards/rollout_reward_func/mean": 0.4606249928474426, | |
| "rewards/rollout_reward_func/std": 0.3846149146556854, | |
| "sampling/importance_sampling_ratio/max": 2.71755313873291, | |
| "sampling/importance_sampling_ratio/mean": 1.051027774810791, | |
| "sampling/importance_sampling_ratio/min": 0.38737601041793823, | |
| "sampling/sampling_logp_difference/max": 0.7733535766601562, | |
| "sampling/sampling_logp_difference/mean": 0.020959284156560898, | |
| "step": 32, | |
| "step_time": 15.57071794400008 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0022321429569274187, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0022321429569274187, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2803.0, | |
| "completions/max_terminated_length": 2803.0, | |
| "completions/mean_length": 2106.9375, | |
| "completions/mean_terminated_length": 2106.9375, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.12445190898142755, | |
| "epoch": 0.00264, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.12888294458389282, | |
| "kl": 0.003734915575478226, | |
| "learning_rate": 7.314285714285714e-06, | |
| "loss": -0.0017, | |
| "num_tokens": 2505564.0, | |
| "reward": 0.4637500047683716, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.4637500047683716, | |
| "rewards/rollout_reward_func/std": 0.37349048256874084, | |
| "sampling/importance_sampling_ratio/max": 2.2378056049346924, | |
| "sampling/importance_sampling_ratio/mean": 1.0552072525024414, | |
| "sampling/importance_sampling_ratio/min": 0.3374383747577667, | |
| "sampling/sampling_logp_difference/max": 1.084688663482666, | |
| "sampling/sampling_logp_difference/mean": 0.015957504510879517, | |
| "step": 33, | |
| "step_time": 16.374967034000065 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004464285913854837, | |
| "clip_ratio/high_mean": 0.0022321429569274187, | |
| "clip_ratio/low_mean": 0.0022321429569274187, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285913854837, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2416.0, | |
| "completions/max_terminated_length": 2416.0, | |
| "completions/mean_length": 1479.28125, | |
| "completions/mean_terminated_length": 1479.28125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.07319290563464165, | |
| "epoch": 0.00272, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.5905581712722778, | |
| "kl": 0.02232863545214059, | |
| "learning_rate": 7.542857142857142e-06, | |
| "loss": 0.0285, | |
| "num_tokens": 2564043.0, | |
| "reward": 0.7212499976158142, | |
| "reward_std": 0.31069982051849365, | |
| "rewards/rollout_reward_func/mean": 0.7212499976158142, | |
| "rewards/rollout_reward_func/std": 0.45388466119766235, | |
| "sampling/importance_sampling_ratio/max": 1.5257185697555542, | |
| "sampling/importance_sampling_ratio/mean": 0.9286473989486694, | |
| "sampling/importance_sampling_ratio/min": 0.35652607679367065, | |
| "sampling/sampling_logp_difference/max": 1.0259580612182617, | |
| "sampling/sampling_logp_difference/mean": 0.0132124163210392, | |
| "step": 34, | |
| "step_time": 14.42571913799975 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2806.0, | |
| "completions/max_terminated_length": 2806.0, | |
| "completions/mean_length": 1752.5, | |
| "completions/mean_terminated_length": 1752.5, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.09324552165344357, | |
| "epoch": 0.0028, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.717671811580658, | |
| "kl": 0.004631695612260955, | |
| "learning_rate": 7.771428571428572e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 2632179.0, | |
| "reward": 0.4125000238418579, | |
| "reward_std": 0.26933753490448, | |
| "rewards/rollout_reward_func/mean": 0.4125000238418579, | |
| "rewards/rollout_reward_func/std": 0.36455005407333374, | |
| "sampling/importance_sampling_ratio/max": 1.790269136428833, | |
| "sampling/importance_sampling_ratio/mean": 1.1154439449310303, | |
| "sampling/importance_sampling_ratio/min": 0.7148804068565369, | |
| "sampling/sampling_logp_difference/max": 0.596367359161377, | |
| "sampling/sampling_logp_difference/mean": 0.01612972654402256, | |
| "step": 35, | |
| "step_time": 16.94071342400025 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.003289473708719015, | |
| "clip_ratio/high_mean": 0.0016447368543595076, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2780.0, | |
| "completions/max_terminated_length": 2780.0, | |
| "completions/mean_length": 2210.65625, | |
| "completions/mean_terminated_length": 2210.65625, | |
| "completions/min_length": 1565.0, | |
| "completions/min_terminated_length": 1565.0, | |
| "entropy": 0.13052229024469852, | |
| "epoch": 0.00288, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 1.915448784828186, | |
| "kl": 0.014049735953449272, | |
| "learning_rate": 8e-06, | |
| "loss": 0.0114, | |
| "num_tokens": 2716101.0, | |
| "reward": 0.3787500262260437, | |
| "reward_std": 0.0624999962747097, | |
| "rewards/rollout_reward_func/mean": 0.3787500262260437, | |
| "rewards/rollout_reward_func/std": 0.26268768310546875, | |
| "sampling/importance_sampling_ratio/max": 1.6097471714019775, | |
| "sampling/importance_sampling_ratio/mean": 0.8499077558517456, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 24.667917251586914, | |
| "sampling/sampling_logp_difference/mean": 0.08357222378253937, | |
| "step": 36, | |
| "step_time": 16.647620255999527 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013392857741564512, | |
| "clip_ratio/high_mean": 0.006696428870782256, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006696428870782256, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2787.0, | |
| "completions/max_terminated_length": 2787.0, | |
| "completions/mean_length": 1735.125, | |
| "completions/mean_terminated_length": 1735.125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.08023441676050425, | |
| "epoch": 0.00296, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.0067864656448364, | |
| "kl": 0.014214006034308113, | |
| "learning_rate": 7.999999976246485e-06, | |
| "loss": 0.005, | |
| "num_tokens": 2783558.0, | |
| "reward": 0.5481250286102295, | |
| "reward_std": 0.2473391890525818, | |
| "rewards/rollout_reward_func/mean": 0.5481250286102295, | |
| "rewards/rollout_reward_func/std": 0.41463109850883484, | |
| "sampling/importance_sampling_ratio/max": 1.9839459657669067, | |
| "sampling/importance_sampling_ratio/mean": 1.0556774139404297, | |
| "sampling/importance_sampling_ratio/min": 0.5570288896560669, | |
| "sampling/sampling_logp_difference/max": 0.7343063354492188, | |
| "sampling/sampling_logp_difference/mean": 0.01291065476834774, | |
| "step": 37, | |
| "step_time": 15.852365188000249 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013429548125714064, | |
| "clip_ratio/high_mean": 0.008277274086140096, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008277274086140096, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2790.0, | |
| "completions/max_terminated_length": 2790.0, | |
| "completions/mean_length": 1825.5625, | |
| "completions/mean_terminated_length": 1825.5625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.07684489572420716, | |
| "epoch": 0.00304, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.9191060662269592, | |
| "kl": 0.008681207757035736, | |
| "learning_rate": 7.999999904985944e-06, | |
| "loss": -0.0211, | |
| "num_tokens": 2854054.0, | |
| "reward": 0.6575000286102295, | |
| "reward_std": 0.32216876745224, | |
| "rewards/rollout_reward_func/mean": 0.6575000286102295, | |
| "rewards/rollout_reward_func/std": 0.46569401025772095, | |
| "sampling/importance_sampling_ratio/max": 1.8603460788726807, | |
| "sampling/importance_sampling_ratio/mean": 1.0407439470291138, | |
| "sampling/importance_sampling_ratio/min": 0.5873942971229553, | |
| "sampling/sampling_logp_difference/max": 0.6204257011413574, | |
| "sampling/sampling_logp_difference/mean": 0.011706141754984856, | |
| "step": 38, | |
| "step_time": 16.086839031999943 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2800.0, | |
| "completions/max_terminated_length": 2800.0, | |
| "completions/mean_length": 2006.0625, | |
| "completions/mean_terminated_length": 2006.0625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.09726127330213785, | |
| "epoch": 0.00312, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.442636489868164, | |
| "kl": 0.01900345625472255, | |
| "learning_rate": 7.999999786218377e-06, | |
| "loss": -0.0229, | |
| "num_tokens": 2930928.0, | |
| "reward": 0.42125001549720764, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.42125001549720764, | |
| "rewards/rollout_reward_func/std": 0.3237656354904175, | |
| "sampling/importance_sampling_ratio/max": 1.6864854097366333, | |
| "sampling/importance_sampling_ratio/mean": 0.9331543445587158, | |
| "sampling/importance_sampling_ratio/min": 0.17830577492713928, | |
| "sampling/sampling_logp_difference/max": 0.9498655796051025, | |
| "sampling/sampling_logp_difference/mean": 0.01994011551141739, | |
| "step": 39, | |
| "step_time": 17.24544241999979 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013523391913622618, | |
| "clip_ratio/high_mean": 0.006761695956811309, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008406432811170816, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2440.0, | |
| "completions/max_terminated_length": 2440.0, | |
| "completions/mean_length": 1899.96875, | |
| "completions/mean_terminated_length": 1899.96875, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.09562211390584707, | |
| "epoch": 0.0032, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7564519047737122, | |
| "kl": 0.012280215043574572, | |
| "learning_rate": 7.999999619943787e-06, | |
| "loss": 0.0388, | |
| "num_tokens": 3004071.0, | |
| "reward": 0.53125, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.53125, | |
| "rewards/rollout_reward_func/std": 0.40266650915145874, | |
| "sampling/importance_sampling_ratio/max": 1.7533916234970093, | |
| "sampling/importance_sampling_ratio/mean": 1.012838363647461, | |
| "sampling/importance_sampling_ratio/min": 0.40302401781082153, | |
| "sampling/sampling_logp_difference/max": 0.6195348501205444, | |
| "sampling/sampling_logp_difference/mean": 0.016937807202339172, | |
| "step": 40, | |
| "step_time": 15.278612154999792 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.013322473270818591, | |
| "clip_ratio/high_mean": 0.0066612366354092956, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008305973489768803, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2818.0, | |
| "completions/max_terminated_length": 2818.0, | |
| "completions/mean_length": 2140.78125, | |
| "completions/mean_terminated_length": 2140.78125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.10391843365505338, | |
| "epoch": 0.00328, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.684903621673584, | |
| "kl": 0.0072205948672490194, | |
| "learning_rate": 7.999999406162173e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 3085711.0, | |
| "reward": 0.41718751192092896, | |
| "reward_std": 0.14279377460479736, | |
| "rewards/rollout_reward_func/mean": 0.41718751192092896, | |
| "rewards/rollout_reward_func/std": 0.33007559180259705, | |
| "sampling/importance_sampling_ratio/max": 1.5918241739273071, | |
| "sampling/importance_sampling_ratio/mean": 0.900505006313324, | |
| "sampling/importance_sampling_ratio/min": 0.29988786578178406, | |
| "sampling/sampling_logp_difference/max": 1.036886215209961, | |
| "sampling/sampling_logp_difference/mean": 0.019227981567382812, | |
| "step": 41, | |
| "step_time": 16.941576514999497 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0032051282469183207, | |
| "clip_ratio/high_mean": 0.0016025641234591603, | |
| "clip_ratio/low_mean": 0.0035156250232830644, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005118189146742225, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2780.0, | |
| "completions/max_terminated_length": 2780.0, | |
| "completions/mean_length": 1944.25, | |
| "completions/mean_terminated_length": 1944.25, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.09584560617804527, | |
| "epoch": 0.00336, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 2.3526816368103027, | |
| "kl": 0.027870278747286648, | |
| "learning_rate": 7.999999144873542e-06, | |
| "loss": 0.1103, | |
| "num_tokens": 3160194.0, | |
| "reward": 0.4490624964237213, | |
| "reward_std": 0.20263297855854034, | |
| "rewards/rollout_reward_func/mean": 0.4490624964237213, | |
| "rewards/rollout_reward_func/std": 0.36338335275650024, | |
| "sampling/importance_sampling_ratio/max": 2.6562483310699463, | |
| "sampling/importance_sampling_ratio/mean": 1.036879062652588, | |
| "sampling/importance_sampling_ratio/min": 0.24595271050930023, | |
| "sampling/sampling_logp_difference/max": 0.9235069751739502, | |
| "sampling/sampling_logp_difference/mean": 0.02009188011288643, | |
| "step": 42, | |
| "step_time": 16.650967574000333 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0066964286379516125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0066964286379516125, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2791.0, | |
| "completions/max_terminated_length": 2791.0, | |
| "completions/mean_length": 1675.03125, | |
| "completions/mean_terminated_length": 1675.03125, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.12920551793649793, | |
| "epoch": 0.00344, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.3091895580291748, | |
| "kl": 0.03491167013999075, | |
| "learning_rate": 7.999998836077897e-06, | |
| "loss": 0.0888, | |
| "num_tokens": 3225903.0, | |
| "reward": 0.5174999833106995, | |
| "reward_std": 0.3608438968658447, | |
| "rewards/rollout_reward_func/mean": 0.5174999833106995, | |
| "rewards/rollout_reward_func/std": 0.43675488233566284, | |
| "sampling/importance_sampling_ratio/max": 2.373917579650879, | |
| "sampling/importance_sampling_ratio/mean": 1.0001801252365112, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3759169578552246, | |
| "sampling/sampling_logp_difference/mean": 0.02347693033516407, | |
| "step": 43, | |
| "step_time": 17.192637601000115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2798.0, | |
| "completions/max_terminated_length": 2798.0, | |
| "completions/mean_length": 2095.3125, | |
| "completions/mean_terminated_length": 2095.3125, | |
| "completions/min_length": 1562.0, | |
| "completions/min_terminated_length": 1562.0, | |
| "entropy": 0.09558335272595286, | |
| "epoch": 0.00352, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6341590881347656, | |
| "kl": 0.02431841316865757, | |
| "learning_rate": 7.99999847977524e-06, | |
| "loss": -0.0206, | |
| "num_tokens": 3305945.0, | |
| "reward": 0.5049999952316284, | |
| "reward_std": 0.14433754980564117, | |
| "rewards/rollout_reward_func/mean": 0.5049999952316284, | |
| "rewards/rollout_reward_func/std": 0.3978976607322693, | |
| "sampling/importance_sampling_ratio/max": 2.0796568393707275, | |
| "sampling/importance_sampling_ratio/mean": 0.8876084089279175, | |
| "sampling/importance_sampling_ratio/min": 0.34211352467536926, | |
| "sampling/sampling_logp_difference/max": 1.0857441425323486, | |
| "sampling/sampling_logp_difference/mean": 0.01929028518497944, | |
| "step": 44, | |
| "step_time": 16.607955621999963 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.009783434681594372, | |
| "clip_ratio/high_mean": 0.004891717340797186, | |
| "clip_ratio/low_mean": 0.0034007353242486715, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.008292452665045857, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2816.0, | |
| "completions/max_terminated_length": 2816.0, | |
| "completions/mean_length": 2426.71875, | |
| "completions/mean_terminated_length": 2426.71875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.13120493851602077, | |
| "epoch": 0.0036, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.2010557651519775, | |
| "kl": 0.014120981490123086, | |
| "learning_rate": 7.999998075965583e-06, | |
| "loss": -0.0277, | |
| "num_tokens": 3397449.0, | |
| "reward": 0.3384374976158142, | |
| "reward_std": 0.08029378205537796, | |
| "rewards/rollout_reward_func/mean": 0.3384374976158142, | |
| "rewards/rollout_reward_func/std": 0.21492847800254822, | |
| "sampling/importance_sampling_ratio/max": 2.0331103801727295, | |
| "sampling/importance_sampling_ratio/mean": 1.036205768585205, | |
| "sampling/importance_sampling_ratio/min": 0.44029006361961365, | |
| "sampling/sampling_logp_difference/max": 0.8087775707244873, | |
| "sampling/sampling_logp_difference/mean": 0.021842796355485916, | |
| "step": 45, | |
| "step_time": 17.039232532999677 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.004464285913854837, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004464285913854837, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2820.0, | |
| "completions/max_terminated_length": 2820.0, | |
| "completions/mean_length": 2035.78125, | |
| "completions/mean_terminated_length": 2035.78125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.10772825870662928, | |
| "epoch": 0.00368, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6874250173568726, | |
| "kl": 0.02060257241828367, | |
| "learning_rate": 7.99999762464893e-06, | |
| "loss": 0.0037, | |
| "num_tokens": 3475466.0, | |
| "reward": 0.35374999046325684, | |
| "reward_std": 0.13466878235340118, | |
| "rewards/rollout_reward_func/mean": 0.35374999046325684, | |
| "rewards/rollout_reward_func/std": 0.26349693536758423, | |
| "sampling/importance_sampling_ratio/max": 2.4506301879882812, | |
| "sampling/importance_sampling_ratio/mean": 1.0585436820983887, | |
| "sampling/importance_sampling_ratio/min": 0.20336686074733734, | |
| "sampling/sampling_logp_difference/max": 1.0288989543914795, | |
| "sampling/sampling_logp_difference/mean": 0.019599031656980515, | |
| "step": 46, | |
| "step_time": 17.008759735000012 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.007352941203862429, | |
| "clip_ratio/high_mean": 0.0036764706019312143, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0055147059028968215, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2796.0, | |
| "completions/max_terminated_length": 2796.0, | |
| "completions/mean_length": 1804.875, | |
| "completions/mean_terminated_length": 1804.875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.11447951383888721, | |
| "epoch": 0.00376, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.9577285051345825, | |
| "kl": 0.0146886624279432, | |
| "learning_rate": 7.999997125825284e-06, | |
| "loss": 0.018, | |
| "num_tokens": 3545642.0, | |
| "reward": 0.550000011920929, | |
| "reward_std": 0.25521132349967957, | |
| "rewards/rollout_reward_func/mean": 0.550000011920929, | |
| "rewards/rollout_reward_func/std": 0.4146043360233307, | |
| "sampling/importance_sampling_ratio/max": 2.3037173748016357, | |
| "sampling/importance_sampling_ratio/mean": 1.043008804321289, | |
| "sampling/importance_sampling_ratio/min": 0.5624377727508545, | |
| "sampling/sampling_logp_difference/max": 0.6249582767486572, | |
| "sampling/sampling_logp_difference/mean": 0.017069321125745773, | |
| "step": 47, | |
| "step_time": 17.18442160299992 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011488970601931214, | |
| "clip_ratio/high_mean": 0.005744485300965607, | |
| "clip_ratio/low_mean": 0.003968254080973566, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009712739381939173, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2804.0, | |
| "completions/max_terminated_length": 2804.0, | |
| "completions/mean_length": 1970.9375, | |
| "completions/mean_terminated_length": 1970.9375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.09903696551918983, | |
| "epoch": 0.00384, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.1492125988006592, | |
| "kl": 0.018403344380203635, | |
| "learning_rate": 7.999996579494655e-06, | |
| "loss": 0.0456, | |
| "num_tokens": 3621220.0, | |
| "reward": 0.4325000047683716, | |
| "reward_std": 0.19716876745224, | |
| "rewards/rollout_reward_func/mean": 0.4325000047683716, | |
| "rewards/rollout_reward_func/std": 0.3565334677696228, | |
| "sampling/importance_sampling_ratio/max": 2.369974136352539, | |
| "sampling/importance_sampling_ratio/mean": 1.0478521585464478, | |
| "sampling/importance_sampling_ratio/min": 0.11733747273683548, | |
| "sampling/sampling_logp_difference/max": 1.4917361736297607, | |
| "sampling/sampling_logp_difference/mean": 0.022218499332666397, | |
| "step": 48, | |
| "step_time": 16.41912825899999 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2804.0, | |
| "completions/max_terminated_length": 2804.0, | |
| "completions/mean_length": 2252.875, | |
| "completions/mean_terminated_length": 2252.875, | |
| "completions/min_length": 1565.0, | |
| "completions/min_terminated_length": 1565.0, | |
| "entropy": 0.09296703850850463, | |
| "epoch": 0.00392, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5641649961471558, | |
| "kl": 0.03362530213780701, | |
| "learning_rate": 7.999995985657054e-06, | |
| "loss": -0.0171, | |
| "num_tokens": 3706955.0, | |
| "reward": 0.42624998092651367, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.42624998092651367, | |
| "rewards/rollout_reward_func/std": 0.3314265012741089, | |
| "sampling/importance_sampling_ratio/max": 1.5233134031295776, | |
| "sampling/importance_sampling_ratio/mean": 0.9442777633666992, | |
| "sampling/importance_sampling_ratio/min": 0.2804383933544159, | |
| "sampling/sampling_logp_difference/max": 1.1384481191635132, | |
| "sampling/sampling_logp_difference/mean": 0.01721033826470375, | |
| "step": 49, | |
| "step_time": 16.923561193999603 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2820.0, | |
| "completions/max_terminated_length": 2820.0, | |
| "completions/mean_length": 2463.8125, | |
| "completions/mean_terminated_length": 2463.8125, | |
| "completions/min_length": 2040.0, | |
| "completions/min_terminated_length": 2040.0, | |
| "entropy": 0.09066143818199635, | |
| "epoch": 0.004, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.10026834160089493, | |
| "kl": 0.030927304484066553, | |
| "learning_rate": 7.99999534431249e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 3799818.0, | |
| "reward": 0.30000001192092896, | |
| "reward_std": 0.0, | |
| "rewards/rollout_reward_func/mean": 0.30000001192092896, | |
| "rewards/rollout_reward_func/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.050495147705078, | |
| "sampling/importance_sampling_ratio/mean": 1.0762633085250854, | |
| "sampling/importance_sampling_ratio/min": 0.29474106431007385, | |
| "sampling/sampling_logp_difference/max": 1.1381915807724, | |
| "sampling/sampling_logp_difference/mean": 0.01666702888906002, | |
| "step": 50, | |
| "step_time": 17.35131004799996 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0018382353009656072, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2786.0, | |
| "completions/max_terminated_length": 2786.0, | |
| "completions/mean_length": 1773.84375, | |
| "completions/mean_terminated_length": 1773.84375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.06475613545626402, | |
| "epoch": 0.00408, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 3.30958890914917, | |
| "kl": 0.015178573405137286, | |
| "learning_rate": 7.99999465546097e-06, | |
| "loss": 0.0343, | |
| "num_tokens": 3868368.0, | |
| "reward": 0.49562498927116394, | |
| "reward_std": 0.1848391890525818, | |
| "rewards/rollout_reward_func/mean": 0.49562498927116394, | |
| "rewards/rollout_reward_func/std": 0.4042271077632904, | |
| "sampling/importance_sampling_ratio/max": 2.036992311477661, | |
| "sampling/importance_sampling_ratio/mean": 1.0444798469543457, | |
| "sampling/importance_sampling_ratio/min": 0.6844988465309143, | |
| "sampling/sampling_logp_difference/max": 0.4961543083190918, | |
| "sampling/sampling_logp_difference/mean": 0.009073879569768906, | |
| "step": 51, | |
| "step_time": 17.00100525200014 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2809.0, | |
| "completions/max_terminated_length": 2809.0, | |
| "completions/mean_length": 1786.375, | |
| "completions/mean_terminated_length": 1786.375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.08096808800473809, | |
| "epoch": 0.00416, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.6158044338226318, | |
| "kl": 0.020677090127719566, | |
| "learning_rate": 7.99999391910251e-06, | |
| "loss": -0.0195, | |
| "num_tokens": 3937756.0, | |
| "reward": 0.4912499785423279, | |
| "reward_std": 0.33183753490448, | |
| "rewards/rollout_reward_func/mean": 0.4912499785423279, | |
| "rewards/rollout_reward_func/std": 0.42849886417388916, | |
| "sampling/importance_sampling_ratio/max": 1.5450013875961304, | |
| "sampling/importance_sampling_ratio/mean": 0.9464795589447021, | |
| "sampling/importance_sampling_ratio/min": 0.39816370606422424, | |
| "sampling/sampling_logp_difference/max": 1.0278459787368774, | |
| "sampling/sampling_logp_difference/mean": 0.015709228813648224, | |
| "step": 52, | |
| "step_time": 16.014014809000173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.003289473708719015, | |
| "clip_ratio/high_mean": 0.0016447368543595076, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 1632.4375, | |
| "completions/mean_terminated_length": 1632.4375, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.06091495987493545, | |
| "epoch": 0.00424, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7226957678794861, | |
| "kl": 0.017386693391017616, | |
| "learning_rate": 7.999993135237117e-06, | |
| "loss": 0.0201, | |
| "num_tokens": 4001420.0, | |
| "reward": 0.6793749928474426, | |
| "reward_std": 0.2570079565048218, | |
| "rewards/rollout_reward_func/mean": 0.6793749928474426, | |
| "rewards/rollout_reward_func/std": 0.46435481309890747, | |
| "sampling/importance_sampling_ratio/max": 1.579625129699707, | |
| "sampling/importance_sampling_ratio/mean": 0.9135901927947998, | |
| "sampling/importance_sampling_ratio/min": 0.35018137097358704, | |
| "sampling/sampling_logp_difference/max": 1.1071686744689941, | |
| "sampling/sampling_logp_difference/mean": 0.015183830633759499, | |
| "step": 53, | |
| "step_time": 15.960780017000161 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00390625, | |
| "clip_ratio/high_mean": 0.001953125, | |
| "clip_ratio/low_mean": 0.001953125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00390625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2809.0, | |
| "completions/max_terminated_length": 2809.0, | |
| "completions/mean_length": 1589.6875, | |
| "completions/mean_terminated_length": 1589.6875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.054683255730196834, | |
| "epoch": 0.00432, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.0967762470245361, | |
| "kl": 0.021693169721402228, | |
| "learning_rate": 7.999992303864804e-06, | |
| "loss": -0.0452, | |
| "num_tokens": 4063548.0, | |
| "reward": 0.7262499928474426, | |
| "reward_std": 0.3071783781051636, | |
| "rewards/rollout_reward_func/mean": 0.7262499928474426, | |
| "rewards/rollout_reward_func/std": 0.45173320174217224, | |
| "sampling/importance_sampling_ratio/max": 1.3698084354400635, | |
| "sampling/importance_sampling_ratio/mean": 0.9228720664978027, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.3379874229431152, | |
| "sampling/sampling_logp_difference/mean": 0.013593094423413277, | |
| "step": 54, | |
| "step_time": 16.424249653999823 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0057151151122525334, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0057151151122525334, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2435.0, | |
| "completions/max_terminated_length": 2435.0, | |
| "completions/mean_length": 1791.34375, | |
| "completions/mean_terminated_length": 1791.34375, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.06585042458027601, | |
| "epoch": 0.0044, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.214758038520813, | |
| "kl": 0.024209468625485897, | |
| "learning_rate": 7.999991424985586e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 4132756.0, | |
| "reward": 0.4637500047683716, | |
| "reward_std": 0.19184717535972595, | |
| "rewards/rollout_reward_func/mean": 0.4637500047683716, | |
| "rewards/rollout_reward_func/std": 0.35306718945503235, | |
| "sampling/importance_sampling_ratio/max": 2.3594348430633545, | |
| "sampling/importance_sampling_ratio/mean": 1.098282814025879, | |
| "sampling/importance_sampling_ratio/min": 0.3494420647621155, | |
| "sampling/sampling_logp_difference/max": 1.4460781812667847, | |
| "sampling/sampling_logp_difference/mean": 0.01887095905840397, | |
| "step": 55, | |
| "step_time": 15.803478434999533 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.003289473708719015, | |
| "clip_ratio/high_mean": 0.0016447368543595076, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2794.0, | |
| "completions/max_terminated_length": 2794.0, | |
| "completions/mean_length": 2308.09375, | |
| "completions/mean_terminated_length": 2308.09375, | |
| "completions/min_length": 1568.0, | |
| "completions/min_terminated_length": 1568.0, | |
| "entropy": 0.09696381096728146, | |
| "epoch": 0.00448, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 0.32451269030570984, | |
| "kl": 0.019486179284285754, | |
| "learning_rate": 7.999990498599477e-06, | |
| "loss": -0.0013, | |
| "num_tokens": 4220279.0, | |
| "reward": 0.3631250262260437, | |
| "reward_std": 0.05983918905258179, | |
| "rewards/rollout_reward_func/mean": 0.3631250262260437, | |
| "rewards/rollout_reward_func/std": 0.2257665991783142, | |
| "sampling/importance_sampling_ratio/max": 1.3443750143051147, | |
| "sampling/importance_sampling_ratio/mean": 0.9363906979560852, | |
| "sampling/importance_sampling_ratio/min": 0.3993731737136841, | |
| "sampling/sampling_logp_difference/max": 0.668013334274292, | |
| "sampling/sampling_logp_difference/mean": 0.01326768472790718, | |
| "step": 56, | |
| "step_time": 16.922060316999477 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2460.0, | |
| "completions/max_terminated_length": 2460.0, | |
| "completions/mean_length": 1963.34375, | |
| "completions/mean_terminated_length": 1963.34375, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.04914194135926664, | |
| "epoch": 0.00456, | |
| "frac_reward_zero_std": 0.875, | |
| "grad_norm": 0.5049991607666016, | |
| "kl": 0.02837482520226331, | |
| "learning_rate": 7.99998952470649e-06, | |
| "loss": -0.008, | |
| "num_tokens": 4295471.0, | |
| "reward": 0.48374998569488525, | |
| "reward_std": 0.0624999962747097, | |
| "rewards/rollout_reward_func/mean": 0.48374998569488525, | |
| "rewards/rollout_reward_func/std": 0.3627649247646332, | |
| "sampling/importance_sampling_ratio/max": 1.912903904914856, | |
| "sampling/importance_sampling_ratio/mean": 0.9784045219421387, | |
| "sampling/importance_sampling_ratio/min": 0.4301539659500122, | |
| "sampling/sampling_logp_difference/max": 0.8482755422592163, | |
| "sampling/sampling_logp_difference/mean": 0.010939370840787888, | |
| "step": 57, | |
| "step_time": 15.453092248000303 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0034007353242486715, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034007353242486715, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2787.0, | |
| "completions/max_terminated_length": 2787.0, | |
| "completions/mean_length": 1872.28125, | |
| "completions/mean_terminated_length": 1872.28125, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.06335801596287638, | |
| "epoch": 0.00464, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.6874297261238098, | |
| "kl": 0.03325861978373723, | |
| "learning_rate": 7.999988503306642e-06, | |
| "loss": -0.0215, | |
| "num_tokens": 4367703.0, | |
| "reward": 0.4325000047683716, | |
| "reward_std": 0.19716876745224, | |
| "rewards/rollout_reward_func/mean": 0.4325000047683716, | |
| "rewards/rollout_reward_func/std": 0.3528958261013031, | |
| "sampling/importance_sampling_ratio/max": 1.3908125162124634, | |
| "sampling/importance_sampling_ratio/mean": 0.8833715319633484, | |
| "sampling/importance_sampling_ratio/min": 1.9220989599944005e-07, | |
| "sampling/sampling_logp_difference/max": 13.085790634155273, | |
| "sampling/sampling_logp_difference/mean": 0.04347304627299309, | |
| "step": 58, | |
| "step_time": 16.225704187000247 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004464285913854837, | |
| "clip_ratio/high_mean": 0.0022321429569274187, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0022321429569274187, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2810.0, | |
| "completions/max_terminated_length": 2810.0, | |
| "completions/mean_length": 2212.5, | |
| "completions/mean_terminated_length": 2212.5, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.0681739835999906, | |
| "epoch": 0.00472, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8530667424201965, | |
| "kl": 0.03184444556973176, | |
| "learning_rate": 7.999987434399948e-06, | |
| "loss": 0.0114, | |
| "num_tokens": 4452096.0, | |
| "reward": 0.4006250202655792, | |
| "reward_std": 0.13200798630714417, | |
| "rewards/rollout_reward_func/mean": 0.4006250202655792, | |
| "rewards/rollout_reward_func/std": 0.29461774230003357, | |
| "sampling/importance_sampling_ratio/max": 1.6479169130325317, | |
| "sampling/importance_sampling_ratio/mean": 1.004500150680542, | |
| "sampling/importance_sampling_ratio/min": 0.30276352167129517, | |
| "sampling/sampling_logp_difference/max": 1.1370731592178345, | |
| "sampling/sampling_logp_difference/mean": 0.014959340915083885, | |
| "step": 59, | |
| "step_time": 17.880038248999654 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0031250000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250000465661287, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2438.0, | |
| "completions/max_terminated_length": 2438.0, | |
| "completions/mean_length": 2130.46875, | |
| "completions/mean_terminated_length": 2130.46875, | |
| "completions/min_length": 1567.0, | |
| "completions/min_terminated_length": 1567.0, | |
| "entropy": 0.055047230795025826, | |
| "epoch": 0.0048, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.3532449007034302, | |
| "kl": 0.01998938515316695, | |
| "learning_rate": 7.999986317986426e-06, | |
| "loss": -0.0728, | |
| "num_tokens": 4533067.0, | |
| "reward": 0.3746874928474426, | |
| "reward_std": 0.07062499970197678, | |
| "rewards/rollout_reward_func/mean": 0.3746874928474426, | |
| "rewards/rollout_reward_func/std": 0.26494044065475464, | |
| "sampling/importance_sampling_ratio/max": 1.6606436967849731, | |
| "sampling/importance_sampling_ratio/mean": 0.94105064868927, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8746337890625, | |
| "sampling/sampling_logp_difference/mean": 0.013816887512803078, | |
| "step": 60, | |
| "step_time": 15.591869645000315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0022321429569274187, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0022321429569274187, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2772.0, | |
| "completions/max_terminated_length": 2772.0, | |
| "completions/mean_length": 1693.875, | |
| "completions/mean_terminated_length": 1693.875, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.05530149070546031, | |
| "epoch": 0.00488, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.1544066667556763, | |
| "kl": 0.016453518153866753, | |
| "learning_rate": 7.999985154066091e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 4598963.0, | |
| "reward": 0.5693750381469727, | |
| "reward_std": 0.32917672395706177, | |
| "rewards/rollout_reward_func/mean": 0.5693750381469727, | |
| "rewards/rollout_reward_func/std": 0.4425143301486969, | |
| "sampling/importance_sampling_ratio/max": 2.6382031440734863, | |
| "sampling/importance_sampling_ratio/mean": 1.0790200233459473, | |
| "sampling/importance_sampling_ratio/min": 0.7093254327774048, | |
| "sampling/sampling_logp_difference/max": 0.6202226877212524, | |
| "sampling/sampling_logp_difference/mean": 0.011032961308956146, | |
| "step": 61, | |
| "step_time": 16.082178944999896 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0062500000931322575, | |
| "clip_ratio/high_mean": 0.0031250000465661287, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0031250000465661287, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2806.0, | |
| "completions/max_terminated_length": 2806.0, | |
| "completions/mean_length": 2304.9375, | |
| "completions/mean_terminated_length": 2304.9375, | |
| "completions/min_length": 1559.0, | |
| "completions/min_terminated_length": 1559.0, | |
| "entropy": 0.06317996443249285, | |
| "epoch": 0.00496, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.9210407137870789, | |
| "kl": 0.019177823327481747, | |
| "learning_rate": 7.999983942638965e-06, | |
| "loss": -0.0262, | |
| "num_tokens": 4686091.0, | |
| "reward": 0.3434374928474426, | |
| "reward_std": 0.08029377460479736, | |
| "rewards/rollout_reward_func/mean": 0.3434374928474426, | |
| "rewards/rollout_reward_func/std": 0.22245851159095764, | |
| "sampling/importance_sampling_ratio/max": 1.911454677581787, | |
| "sampling/importance_sampling_ratio/mean": 0.9809565544128418, | |
| "sampling/importance_sampling_ratio/min": 0.25244244933128357, | |
| "sampling/sampling_logp_difference/max": 0.9257916212081909, | |
| "sampling/sampling_logp_difference/mean": 0.013646715320646763, | |
| "step": 62, | |
| "step_time": 16.97750502200006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2421.0, | |
| "completions/max_terminated_length": 2421.0, | |
| "completions/mean_length": 1974.96875, | |
| "completions/mean_terminated_length": 1974.96875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.04369071568362415, | |
| "epoch": 0.00504, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.14146260917186737, | |
| "kl": 0.05203759076539427, | |
| "learning_rate": 7.999982683705066e-06, | |
| "loss": 0.0006, | |
| "num_tokens": 4761655.0, | |
| "reward": 0.2800000011920929, | |
| "reward_std": 0.0, | |
| "rewards/rollout_reward_func/mean": 0.2800000011920929, | |
| "rewards/rollout_reward_func/std": 0.05376172438263893, | |
| "sampling/importance_sampling_ratio/max": 2.2244811058044434, | |
| "sampling/importance_sampling_ratio/mean": 1.0279231071472168, | |
| "sampling/importance_sampling_ratio/min": 0.3840067684650421, | |
| "sampling/sampling_logp_difference/max": 0.9096496105194092, | |
| "sampling/sampling_logp_difference/mean": 0.010698029771447182, | |
| "step": 63, | |
| "step_time": 16.219797216000416 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.015190972248092294, | |
| "clip_ratio/high_mean": 0.007595486124046147, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007595486124046147, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2815.0, | |
| "completions/max_terminated_length": 2815.0, | |
| "completions/mean_length": 1982.6875, | |
| "completions/mean_terminated_length": 1982.6875, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.07828675024211407, | |
| "epoch": 0.00512, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7813690304756165, | |
| "kl": 0.021903582994127646, | |
| "learning_rate": 7.999981377264413e-06, | |
| "loss": 0.0011, | |
| "num_tokens": 4838046.0, | |
| "reward": 0.5843750238418579, | |
| "reward_std": 0.22841876745224, | |
| "rewards/rollout_reward_func/mean": 0.5843750238418579, | |
| "rewards/rollout_reward_func/std": 0.4214792251586914, | |
| "sampling/importance_sampling_ratio/max": 1.827079176902771, | |
| "sampling/importance_sampling_ratio/mean": 1.0283104181289673, | |
| "sampling/importance_sampling_ratio/min": 0.25544387102127075, | |
| "sampling/sampling_logp_difference/max": 1.0380005836486816, | |
| "sampling/sampling_logp_difference/mean": 0.015725988894701004, | |
| "step": 64, | |
| "step_time": 16.373142061000408 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0018382353009656072, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0036764706019312143, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 2009.3125, | |
| "completions/mean_terminated_length": 2009.3125, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.05688254698179662, | |
| "epoch": 0.0052, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.2757700979709625, | |
| "kl": 0.036635934142395854, | |
| "learning_rate": 7.999980023317026e-06, | |
| "loss": 0.0247, | |
| "num_tokens": 4914779.0, | |
| "reward": 0.45250001549720764, | |
| "reward_std": 0.125, | |
| "rewards/rollout_reward_func/mean": 0.45250001549720764, | |
| "rewards/rollout_reward_func/std": 0.3471450209617615, | |
| "sampling/importance_sampling_ratio/max": 1.4275991916656494, | |
| "sampling/importance_sampling_ratio/mean": 0.908840537071228, | |
| "sampling/importance_sampling_ratio/min": 0.23684662580490112, | |
| "sampling/sampling_logp_difference/max": 1.4337669610977173, | |
| "sampling/sampling_logp_difference/mean": 0.017221834510564804, | |
| "step": 65, | |
| "step_time": 16.79740752900011 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0036764706019312143, | |
| "clip_ratio/high_mean": 0.0018382353009656072, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0018382353009656072, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2804.0, | |
| "completions/max_terminated_length": 2804.0, | |
| "completions/mean_length": 2059.75, | |
| "completions/mean_terminated_length": 2059.75, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.07610421534627676, | |
| "epoch": 0.00528, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.4526886940002441, | |
| "kl": 0.027375709789339453, | |
| "learning_rate": 7.999978621862929e-06, | |
| "loss": 0.0203, | |
| "num_tokens": 4993633.0, | |
| "reward": 0.4793750047683716, | |
| "reward_std": 0.13200797140598297, | |
| "rewards/rollout_reward_func/mean": 0.4793750047683716, | |
| "rewards/rollout_reward_func/std": 0.3699514865875244, | |
| "sampling/importance_sampling_ratio/max": 1.815674066543579, | |
| "sampling/importance_sampling_ratio/mean": 1.0289226770401, | |
| "sampling/importance_sampling_ratio/min": 0.5281765460968018, | |
| "sampling/sampling_logp_difference/max": 0.6608150005340576, | |
| "sampling/sampling_logp_difference/mean": 0.01253808755427599, | |
| "step": 66, | |
| "step_time": 16.794041812999467 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011259191203862429, | |
| "clip_ratio/high_mean": 0.005629595601931214, | |
| "clip_ratio/low_mean": 0.001953125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007582720601931214, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2812.0, | |
| "completions/max_terminated_length": 2812.0, | |
| "completions/mean_length": 1777.84375, | |
| "completions/mean_terminated_length": 1777.84375, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.06151273613795638, | |
| "epoch": 0.00536, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.1407759189605713, | |
| "kl": 0.02162244959617965, | |
| "learning_rate": 7.999977172902144e-06, | |
| "loss": -0.0017, | |
| "num_tokens": 5062556.0, | |
| "reward": 0.48000001907348633, | |
| "reward_std": 0.25434714555740356, | |
| "rewards/rollout_reward_func/mean": 0.48000001907348633, | |
| "rewards/rollout_reward_func/std": 0.3861806094646454, | |
| "sampling/importance_sampling_ratio/max": 1.6365498304367065, | |
| "sampling/importance_sampling_ratio/mean": 1.0415589809417725, | |
| "sampling/importance_sampling_ratio/min": 0.4183502197265625, | |
| "sampling/sampling_logp_difference/max": 0.8754826188087463, | |
| "sampling/sampling_logp_difference/mean": 0.014012180268764496, | |
| "step": 67, | |
| "step_time": 17.151742474000002 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0058139534667134285, | |
| "clip_ratio/high_mean": 0.0029069767333567142, | |
| "clip_ratio/low_mean": 0.0031250000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006031976779922843, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2778.0, | |
| "completions/max_terminated_length": 2778.0, | |
| "completions/mean_length": 2272.375, | |
| "completions/mean_terminated_length": 2272.375, | |
| "completions/min_length": 2023.0, | |
| "completions/min_terminated_length": 2023.0, | |
| "entropy": 0.08835586486384273, | |
| "epoch": 0.00544, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.3424510955810547, | |
| "kl": 0.030128376907669008, | |
| "learning_rate": 7.999975676434692e-06, | |
| "loss": -0.0562, | |
| "num_tokens": 5148670.0, | |
| "reward": 0.2918750047683716, | |
| "reward_std": 0.0162500012665987, | |
| "rewards/rollout_reward_func/mean": 0.2918750047683716, | |
| "rewards/rollout_reward_func/std": 0.031971510499715805, | |
| "sampling/importance_sampling_ratio/max": 2.0581729412078857, | |
| "sampling/importance_sampling_ratio/mean": 1.0349149703979492, | |
| "sampling/importance_sampling_ratio/min": 0.4890825152397156, | |
| "sampling/sampling_logp_difference/max": 0.7274646759033203, | |
| "sampling/sampling_logp_difference/mean": 0.019628014415502548, | |
| "step": 68, | |
| "step_time": 16.927781462999747 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0078125, | |
| "clip_ratio/high_mean": 0.00390625, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00390625, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2434.0, | |
| "completions/max_terminated_length": 2434.0, | |
| "completions/mean_length": 1876.4375, | |
| "completions/mean_terminated_length": 1876.4375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.0765807363204658, | |
| "epoch": 0.00552, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.9624868631362915, | |
| "kl": 0.05591569049283862, | |
| "learning_rate": 7.999974132460596e-06, | |
| "loss": 0.0124, | |
| "num_tokens": 5221035.0, | |
| "reward": 0.4793750047683716, | |
| "reward_std": 0.17558756470680237, | |
| "rewards/rollout_reward_func/mean": 0.4793750047683716, | |
| "rewards/rollout_reward_func/std": 0.3699514865875244, | |
| "sampling/importance_sampling_ratio/max": 1.3370881080627441, | |
| "sampling/importance_sampling_ratio/mean": 0.9259651899337769, | |
| "sampling/importance_sampling_ratio/min": 0.3385222852230072, | |
| "sampling/sampling_logp_difference/max": 1.2049891948699951, | |
| "sampling/sampling_logp_difference/mean": 0.014953669160604477, | |
| "step": 69, | |
| "step_time": 15.703423997999835 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069444444961845875, | |
| "clip_ratio/high_mean": 0.0034722222480922937, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034722222480922937, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2437.0, | |
| "completions/max_terminated_length": 2437.0, | |
| "completions/mean_length": 1896.46875, | |
| "completions/mean_terminated_length": 1896.46875, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.05552901164628565, | |
| "epoch": 0.0056, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.2667936384677887, | |
| "kl": 0.014588030888262438, | |
| "learning_rate": 7.999972540979884e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 5294017.0, | |
| "reward": 0.5, | |
| "reward_std": 0.125, | |
| "rewards/rollout_reward_func/mean": 0.5, | |
| "rewards/rollout_reward_func/std": 0.39909571409225464, | |
| "sampling/importance_sampling_ratio/max": 1.607146143913269, | |
| "sampling/importance_sampling_ratio/mean": 0.9701419472694397, | |
| "sampling/importance_sampling_ratio/min": 0.401731014251709, | |
| "sampling/sampling_logp_difference/max": 1.0000518560409546, | |
| "sampling/sampling_logp_difference/mean": 0.010325020179152489, | |
| "step": 70, | |
| "step_time": 15.35116849200017 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0069659443106502295, | |
| "clip_ratio/high_mean": 0.0034829721553251147, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0034829721553251147, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2805.0, | |
| "completions/max_terminated_length": 2805.0, | |
| "completions/mean_length": 2170.65625, | |
| "completions/mean_terminated_length": 2170.65625, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.09610213804990053, | |
| "epoch": 0.00568, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5360183119773865, | |
| "kl": 0.029458090604748577, | |
| "learning_rate": 7.99997090199258e-06, | |
| "loss": 0.0331, | |
| "num_tokens": 5376683.0, | |
| "reward": 0.515625, | |
| "reward_std": 0.12233918905258179, | |
| "rewards/rollout_reward_func/mean": 0.515625, | |
| "rewards/rollout_reward_func/std": 0.3893662095069885, | |
| "sampling/importance_sampling_ratio/max": 2.599726438522339, | |
| "sampling/importance_sampling_ratio/mean": 0.9910818934440613, | |
| "sampling/importance_sampling_ratio/min": 0.38567137718200684, | |
| "sampling/sampling_logp_difference/max": 0.9569785594940186, | |
| "sampling/sampling_logp_difference/mean": 0.02182850055396557, | |
| "step": 71, | |
| "step_time": 18.631503620000103 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0034722222480922937, | |
| "clip_ratio/high_mean": 0.0017361111240461469, | |
| "clip_ratio/low_mean": 0.0018382353009656072, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.003574346425011754, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2434.0, | |
| "completions/max_terminated_length": 2434.0, | |
| "completions/mean_length": 1713.6875, | |
| "completions/mean_terminated_length": 1713.6875, | |
| "completions/min_length": 1053.0, | |
| "completions/min_terminated_length": 1053.0, | |
| "entropy": 0.05361162032932043, | |
| "epoch": 0.00576, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6218546628952026, | |
| "kl": 0.02329419369925745, | |
| "learning_rate": 7.999969215498707e-06, | |
| "loss": -0.0147, | |
| "num_tokens": 5443189.0, | |
| "reward": 0.6181250214576721, | |
| "reward_std": 0.2755875587463379, | |
| "rewards/rollout_reward_func/mean": 0.6181250214576721, | |
| "rewards/rollout_reward_func/std": 0.45625001192092896, | |
| "sampling/importance_sampling_ratio/max": 1.4299272298812866, | |
| "sampling/importance_sampling_ratio/mean": 0.9309602975845337, | |
| "sampling/importance_sampling_ratio/min": 0.39184144139289856, | |
| "sampling/sampling_logp_difference/max": 0.9363220930099487, | |
| "sampling/sampling_logp_difference/mean": 0.011649301275610924, | |
| "step": 72, | |
| "step_time": 15.753308050000896 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011660009622573853, | |
| "clip_ratio/high_mean": 0.005830004811286926, | |
| "clip_ratio/low_mean": 0.0017361111240461469, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007566115935333073, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/max_terminated_length": 2801.0, | |
| "completions/mean_length": 1721.4375, | |
| "completions/mean_terminated_length": 1721.4375, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.06148386397399008, | |
| "epoch": 0.00584, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 1.5323787927627563, | |
| "kl": 0.02491366575122811, | |
| "learning_rate": 7.999967481498294e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 5509968.0, | |
| "reward": 0.5487499833106995, | |
| "reward_std": 0.33183753490448, | |
| "rewards/rollout_reward_func/mean": 0.5487499833106995, | |
| "rewards/rollout_reward_func/std": 0.4589731991291046, | |
| "sampling/importance_sampling_ratio/max": 1.9993343353271484, | |
| "sampling/importance_sampling_ratio/mean": 0.9622111916542053, | |
| "sampling/importance_sampling_ratio/min": 0.34618350863456726, | |
| "sampling/sampling_logp_difference/max": 1.054746150970459, | |
| "sampling/sampling_logp_difference/mean": 0.013135725632309914, | |
| "step": 73, | |
| "step_time": 16.009405589999915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.001953125, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.001953125, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2792.0, | |
| "completions/max_terminated_length": 2792.0, | |
| "completions/mean_length": 1907.75, | |
| "completions/mean_terminated_length": 1907.75, | |
| "completions/min_length": 1052.0, | |
| "completions/min_terminated_length": 1052.0, | |
| "entropy": 0.08447153866291046, | |
| "epoch": 0.00592, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 1.2818049192428589, | |
| "kl": 0.03082829067716375, | |
| "learning_rate": 7.999965699991369e-06, | |
| "loss": 0.0268, | |
| "num_tokens": 5583748.0, | |
| "reward": 0.44875001907348633, | |
| "reward_std": 0.20683756470680237, | |
| "rewards/rollout_reward_func/mean": 0.44875001907348633, | |
| "rewards/rollout_reward_func/std": 0.3857104480266571, | |
| "sampling/importance_sampling_ratio/max": 1.9656041860580444, | |
| "sampling/importance_sampling_ratio/mean": 1.0510060787200928, | |
| "sampling/importance_sampling_ratio/min": 0.4638214409351349, | |
| "sampling/sampling_logp_difference/max": 0.651539146900177, | |
| "sampling/sampling_logp_difference/mean": 0.016923408955335617, | |
| "step": 74, | |
| "step_time": 16.37548145899973 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0016447368543595076, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016447368543595076, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2794.0, | |
| "completions/max_terminated_length": 2794.0, | |
| "completions/mean_length": 1590.5, | |
| "completions/mean_terminated_length": 1590.5, | |
| "completions/min_length": 1054.0, | |
| "completions/min_terminated_length": 1054.0, | |
| "entropy": 0.051554064732044935, | |
| "epoch": 0.006, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.4656570851802826, | |
| "kl": 0.039722154615446925, | |
| "learning_rate": 7.99996387097796e-06, | |
| "loss": -0.0273, | |
| "num_tokens": 5646147.0, | |
| "reward": 0.8512499928474426, | |
| "reward_std": 0.13466876745224, | |
| "rewards/rollout_reward_func/mean": 0.8512499928474426, | |
| "rewards/rollout_reward_func/std": 0.4309011399745941, | |
| "sampling/importance_sampling_ratio/max": 2.459237575531006, | |
| "sampling/importance_sampling_ratio/mean": 1.0666892528533936, | |
| "sampling/importance_sampling_ratio/min": 0.403707355260849, | |
| "sampling/sampling_logp_difference/max": 0.9005355834960938, | |
| "sampling/sampling_logp_difference/mean": 0.012501123361289501, | |
| "step": 75, | |
| "step_time": 16.584246522000285 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_clip_ratio/high_max": 0.0, | |
| "eval_clip_ratio/high_mean": 0.0, | |
| "eval_clip_ratio/low_mean": 0.0, | |
| "eval_clip_ratio/low_min": 0.0, | |
| "eval_clip_ratio/region_mean": 0.0, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 2216.95, | |
| "eval_completions/max_terminated_length": 2216.95, | |
| "eval_completions/mean_length": 1925.65, | |
| "eval_completions/mean_terminated_length": 1925.65, | |
| "eval_completions/min_length": 1634.35, | |
| "eval_completions/min_terminated_length": 1634.35, | |
| "eval_entropy": 0.07575540114194154, | |
| "eval_frac_reward_zero_std": 1.0, | |
| "eval_kl": 0.031034281105894478, | |
| "eval_loss": 4.469734994927421e-05, | |
| "eval_num_tokens": 5646147.0, | |
| "eval_reward": 0.6004999987781048, | |
| "eval_reward_std": 0.0, | |
| "eval_rewards/rollout_reward_func/mean": 0.6004999987781048, | |
| "eval_rewards/rollout_reward_func/std": 0.24536602906882762, | |
| "eval_runtime": 13.987, | |
| "eval_samples_per_second": 0.715, | |
| "eval_sampling/importance_sampling_ratio/max": 1.2220476478338242, | |
| "eval_sampling/importance_sampling_ratio/mean": 1.0154326111078262, | |
| "eval_sampling/importance_sampling_ratio/min": 0.8088175728917122, | |
| "eval_sampling/sampling_logp_difference/max": 0.25984298419207336, | |
| "eval_sampling/sampling_logp_difference/mean": 0.014358489285223186, | |
| "eval_steps_per_second": 0.357, | |
| "step": 75 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 25000, | |
| "num_input_tokens_seen": 5646147, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |